diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-01-28 19:19:54 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-01-28 19:19:54 +1100 |
commit | 9203a18694285cf12becb6a695feac055268f675 (patch) | |
tree | 685bec0b563948ca7602de2d42ca03489317a463 /tools | |
parent | fa7201203eac6d21f8cd1bdbca8136672e82636c (diff) | |
parent | ec4add71968abd7e4cbcab8c73421aae7100b611 (diff) |
Merge remote-tracking branch 'tip/auto-latest'
Diffstat (limited to 'tools')
360 files changed, 32070 insertions, 2676 deletions
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore new file mode 100644 index 000000000000..60dd6dba3a50 --- /dev/null +++ b/tools/kvm/.gitignore @@ -0,0 +1,12 @@ +/lkvm +/vm +*.o +*.d +.cscope +tags +include/common-cmds.h +tests/boot/boot_test.iso +tests/boot/rootfs/ +guest/init +guest/init_stage2 +KVMTOOLS-VERSION-FILE diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git new file mode 100644 index 000000000000..c2ddcb3acbd0 --- /dev/null +++ b/tools/kvm/CREDITS-Git @@ -0,0 +1,30 @@ +Most of the infrastructure that 'perf' uses here has been reused +from the Git project, as of version: + + 66996ec: Sync with 1.6.2.4 + +Here is an (incomplete!) list of main contributors to those files +in util/* and elsewhere: + + Alex Riesen + Christian Couder + Dmitry Potapov + Jeff King + Johannes Schindelin + Johannes Sixt + Junio C Hamano + Linus Torvalds + Matthias Kestenholz + Michal Ostrowski + Miklos Vajna + Petr Baudis + Pierre Habouzit + René Scharfe + Samuel Tardieu + Shawn O. Pearce + Steffen Prohaska + Steve Haslam + +Thanks guys! + +The full history of the files can be found in the upstream Git commits. diff --git a/tools/kvm/Documentation/kernel-debugging.txt b/tools/kvm/Documentation/kernel-debugging.txt new file mode 100644 index 000000000000..98b943829cbb --- /dev/null +++ b/tools/kvm/Documentation/kernel-debugging.txt @@ -0,0 +1,15 @@ +This document explains how to debug a guests' kernel using KGDB. + +1. Run the guest: + 'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1' + +And see which PTY got assigned to ttyS1 (you'll see: +' Info: Assigned terminal 1 to pty /dev/pts/X'). + +2. Run GDB on the host: + 'gdb [vmlinuz]' + +3. Connect to the guest (from within GDB): + 'target remote /dev/pty/X' + +4. Start debugging! (enter 'continue' to continue boot). 
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt new file mode 100644 index 000000000000..efc0a87e68c7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-balloon.txt @@ -0,0 +1,24 @@ +lkvm-balloon(1) +================ + +NAME +---- +lkvm-balloon - Inflate or deflate the virtio balloon + +SYNOPSIS +-------- +[verse] +'lkvm balloon [command] [size] [instance]' + +DESCRIPTION +----------- +The command inflates or deflates the virtio balloon located in the +specified instance. +For a list of running instances see 'lkvm list'. + +Command can be either 'inflate' or 'deflate'. Inflate increases the +size of the balloon, thus decreasing the amount of virtual RAM available +for the guest. Deflation returns previously inflated memory back to the +guest. + +size is specified in Mb. diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt new file mode 100644 index 000000000000..a8eb2c0196f7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-debug.txt @@ -0,0 +1,16 @@ +lkvm-debug(1) +================ + +NAME +---- +lkvm-debug - Print debug information from a running instance + +SYNOPSIS +-------- +[verse] +'lkvm debug [instance]' + +DESCRIPTION +----------- +The command prints debug information from a running instance. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt new file mode 100644 index 000000000000..a245607d4d97 --- /dev/null +++ b/tools/kvm/Documentation/kvm-list.txt @@ -0,0 +1,16 @@ +lkvm-list(1) +================ + +NAME +---- +lkvm-list - Print a list of running instances on the host. + +SYNOPSIS +-------- +[verse] +'lkvm list' + +DESCRIPTION +----------- +This command prints a list of running instances on the host which +belong to the user who currently ran 'lkvm list'. 
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt new file mode 100644 index 000000000000..1ea2a239cc75 --- /dev/null +++ b/tools/kvm/Documentation/kvm-pause.txt @@ -0,0 +1,16 @@ +lkvm-pause(1) +================ + +NAME +---- +lkvm-pause - Pause the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm pause [instance]' + +DESCRIPTION +----------- +The command pauses a virtual machine. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt new file mode 100644 index 000000000000..a36c4df40d76 --- /dev/null +++ b/tools/kvm/Documentation/kvm-resume.txt @@ -0,0 +1,16 @@ +lkvm-resume(1) +================ + +NAME +---- +lkvm-resume - Resume the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm resume [instance]' + +DESCRIPTION +----------- +The command resumes a virtual machine. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt new file mode 100644 index 000000000000..8ddf470145d1 --- /dev/null +++ b/tools/kvm/Documentation/kvm-run.txt @@ -0,0 +1,62 @@ +lkvm-run(1) +================ + +NAME +---- +lkvm-run - Start the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm run' [-k <kernel image> | --kernel <kernel image>] + +DESCRIPTION +----------- +The command starts a virtual machine. + +OPTIONS +------- +-m:: +--mem=:: + Virtual machine memory size in MiB. + +-p:: +--params:: + Additional kernel command line arguments. + +-r:: +--initrd=:: + Initial RAM disk image. + +-k:: +--kernel=:: + The virtual machine kernel. + +--dev=:: + KVM device file. + +-i:: +--image=:: + A disk image file. + +-s:: +--single-step:: + Enable single stepping. + +-g:: +--ioport-debug:: + Enable ioport debugging. + +-c:: +--enable-virtio-console:: + Enable the virtual IO console. + +--cpus:: + The number of virtual CPUs to run. + +--debug:: + Enable debug messages. 
+ +SEE ALSO +-------- +linkkvm: diff --git a/tools/kvm/Documentation/kvm-sandbox.txt b/tools/kvm/Documentation/kvm-sandbox.txt new file mode 100644 index 000000000000..2d7f558e5d52 --- /dev/null +++ b/tools/kvm/Documentation/kvm-sandbox.txt @@ -0,0 +1,16 @@ +lkvm-sandbox(1) +================ + +NAME +---- +lkvm-sandbox - Run a command in a sandboxed guest + +SYNOPSIS +-------- +[verse] +'lkvm sandbox ['lkvm run' arguments] -- [sandboxed command]' + +DESCRIPTION +----------- +The sandboxed command will run in a guest as part of it's init +command. diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt new file mode 100644 index 000000000000..4b6e3318b0a7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-setup.txt @@ -0,0 +1,15 @@ +lkvm-setup(1) +================ + +NAME +---- +lkvm-setup - Setup a new virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm setup <name>' + +DESCRIPTION +----------- +The command setups a virtual machine. diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt new file mode 100644 index 000000000000..101ce7ac12a7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-stat.txt @@ -0,0 +1,19 @@ +lkvm-stat(1) +================ + +NAME +---- +lkvm-stat - Print statistics about a running instance + +SYNOPSIS +-------- +[verse] +'lkvm [command] [-n instance] [-p instance pid] [--all]' + +DESCRIPTION +----------- +The command prints statistics about a running instance. +For a list of running instances see 'lkvm list'. + +Commands: + --memory, -m Display memory statistics diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt new file mode 100644 index 000000000000..6e4bc831e2af --- /dev/null +++ b/tools/kvm/Documentation/kvm-stop.txt @@ -0,0 +1,16 @@ +lkvm-stop(1) +================ + +NAME +---- +lkvm-stop - Stop a running instance + +SYNOPSIS +-------- +[verse] +'lkvm stop [instance]' + +DESCRIPTION +----------- +The command stops a running instance. 
+For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt new file mode 100644 index 000000000000..41003d2b99bb --- /dev/null +++ b/tools/kvm/Documentation/kvm-version.txt @@ -0,0 +1,21 @@ +lkvm-version(1) +================ + +NAME +---- +lkvm-version - Print the version of the kernel tree kvm tools +was built on. + +SYNOPSIS +-------- +[verse] +'lkvm version' + +DESCRIPTION +----------- +The command prints the version of the kernel that was used to build +kvm tools. + +Note that the version is not the version of the kernel which is currently +running on the host, but is the version of the kernel tree from which kvm +tools was built. diff --git a/tools/kvm/Documentation/virtio-console.txt b/tools/kvm/Documentation/virtio-console.txt new file mode 100644 index 000000000000..4a58d567c991 --- /dev/null +++ b/tools/kvm/Documentation/virtio-console.txt @@ -0,0 +1,41 @@ +General +-------- + +virtio-console as the name implies is a console over virtio transport. Here is +a simple head to head comparison of the virtio-console vs regular 8250 console: + +8250 serial console: + + - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs, +which are enabled almost everywhere. + - Doesn't require guest-side changes. + - Compatible with older guests. + +virtio-console: + + - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies), +which got enabled only in recent kernels (but not all of them). + - Much faster. + - Consumes less processing resources. + - Requires guest-side changes. + +Enabling virtio-console +------------------------ + +First, make sure guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. 
Once this +is done, the following has to be done inside guest image: + + - Add the following line to /etc/inittab: + 'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0' + - Add 'hvc0' to /etc/securetty (so you could actually log on) + - Start the guest with '--console virtio' + +Common errors +-------------- + +Q: I don't see anything on the screen! +A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel, also +make sure you've updated /etc/inittab + +Q: It won't accept my username/password, but I enter them correctly! +A: You didn't add 'hvc0' to /etc/securetty diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile new file mode 100644 index 000000000000..0c59faaa0779 --- /dev/null +++ b/tools/kvm/Makefile @@ -0,0 +1,491 @@ +# +# Define WERROR=0 to disable -Werror. +# + +ifeq ($(strip $(V)),) + E = @echo + Q = @ +else + E = @\# + Q = +endif +ifneq ($(I), ) + KINCL_PATH=$(I) +else + KINCL_PATH=../.. +endif +export E Q KINCL_PATH + +include config/utilities.mak +include config/feature-tests.mak + +CC := $(CROSS_COMPILE)gcc +LD := $(CROSS_COMPILE)ld + +FIND := find +CSCOPE := cscope +TAGS := ctags +INSTALL := install + +prefix = $(HOME) +bindir_relative = bin +bindir = $(prefix)/$(bindir_relative) + +DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) +bindir_SQ = $(subst ','\'',$(bindir)) + +PROGRAM := lkvm +PROGRAM_ALIAS := vm + +GUEST_INIT := guest/init + +OBJS += builtin-balloon.o +OBJS += builtin-debug.o +OBJS += builtin-help.o +OBJS += builtin-list.o +OBJS += builtin-stat.o +OBJS += builtin-pause.o +OBJS += builtin-resume.o +OBJS += builtin-run.o +OBJS += builtin-setup.o +OBJS += builtin-stop.o +OBJS += builtin-version.o +OBJS += devices.o +OBJS += disk/core.o +OBJS += framebuffer.o +OBJS += guest_compat.o +OBJS += hw/rtc.o +OBJS += hw/serial.o +OBJS += ioport.o +OBJS += kvm-cpu.o +OBJS += kvm.o +OBJS += main.o +OBJS += mmio.o +OBJS += pci.o +OBJS += term.o +OBJS += virtio/blk.o +OBJS += virtio/scsi.o +OBJS += virtio/console.o +OBJS += virtio/core.o +OBJS += 
virtio/net.o +OBJS += virtio/rng.o +OBJS += virtio/balloon.o +OBJS += virtio/pci.o +OBJS += disk/blk.o +OBJS += disk/qcow.o +OBJS += disk/raw.o +OBJS += ioeventfd.o +OBJS += net/uip/core.o +OBJS += net/uip/arp.o +OBJS += net/uip/icmp.o +OBJS += net/uip/ipv4.o +OBJS += net/uip/tcp.o +OBJS += net/uip/udp.o +OBJS += net/uip/buf.o +OBJS += net/uip/csum.o +OBJS += net/uip/dhcp.o +OBJS += kvm-cmd.o +OBJS += util/init.o +OBJS += util/rbtree.o +OBJS += util/threadpool.o +OBJS += util/parse-options.o +OBJS += util/rbtree-interval.o +OBJS += util/strbuf.o +OBJS += util/read-write.o +OBJS += util/util.o +OBJS += virtio/9p.o +OBJS += virtio/9p-pdu.o +OBJS += hw/vesa.o +OBJS += hw/pci-shmem.o +OBJS += kvm-ipc.o +OBJS += builtin-sandbox.o +OBJS += virtio/mmio.o + +# Translate uname -m into ARCH string +ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \ + -e s/armv7.*/arm/ -e s/aarch64.*/arm64/) + +ifeq ($(ARCH),i386) + ARCH := x86 + DEFINES += -DCONFIG_X86_32 +endif +ifeq ($(ARCH),x86_64) + ARCH := x86 + DEFINES += -DCONFIG_X86_64 +endif + +LIBFDT_SRC = fdt.o fdt_ro.o fdt_wip.o fdt_sw.o fdt_rw.o fdt_strerror.o +LIBFDT_OBJS = $(patsubst %,../../scripts/dtc/libfdt/%,$(LIBFDT_SRC)) + +### Arch-specific stuff + +#x86 +ifeq ($(ARCH),x86) + DEFINES += -DCONFIG_X86 + OBJS += x86/boot.o + OBJS += x86/cpuid.o + OBJS += x86/interrupt.o + OBJS += x86/ioport.o + OBJS += x86/irq.o + OBJS += x86/kvm.o + OBJS += x86/kvm-cpu.o + OBJS += x86/mptable.o + OBJS += hw/i8042.o +# Exclude BIOS object files from header dependencies. + OTHEROBJS += x86/bios.o + OTHEROBJS += x86/bios/bios-rom.o + ARCH_INCLUDE := x86/include +endif +# POWER/ppc: Actually only support ppc64 currently. 
+ifeq ($(ARCH), powerpc) + DEFINES += -DCONFIG_PPC + OBJS += powerpc/boot.o + OBJS += powerpc/ioport.o + OBJS += powerpc/irq.o + OBJS += powerpc/kvm.o + OBJS += powerpc/cpu_info.o + OBJS += powerpc/kvm-cpu.o + OBJS += powerpc/spapr_hcall.o + OBJS += powerpc/spapr_rtas.o + OBJS += powerpc/spapr_hvcons.o + OBJS += powerpc/spapr_pci.o + OBJS += powerpc/xics.o +# We use libfdt, but it's sometimes not packaged 64bit. It's small too, +# so just build it in: + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) + ARCH_INCLUDE := powerpc/include + CFLAGS += -m64 +endif + +# ARM +OBJS_ARM_COMMON := arm/fdt.o arm/gic.o arm/ioport.o arm/irq.o \ + arm/kvm.o arm/kvm-cpu.o +HDRS_ARM_COMMON := arm/include +ifeq ($(ARCH), arm) + DEFINES += -DCONFIG_ARM + OBJS += $(OBJS_ARM_COMMON) + OBJS += arm/aarch32/cortex-a15.o + OBJS += arm/aarch32/kvm-cpu.o + ARCH_INCLUDE := $(HDRS_ARM_COMMON) + ARCH_INCLUDE += -Iarm/aarch32/include + CFLAGS += -march=armv7-a + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) +endif + +# ARM64 +ifeq ($(ARCH), arm64) + DEFINES += -DCONFIG_ARM64 + OBJS += $(OBJS_ARM_COMMON) + OBJS += arm/aarch64/cortex-a57.o + OBJS += arm/aarch64/kvm-cpu.o + ARCH_INCLUDE := $(HDRS_ARM_COMMON) + ARCH_INCLUDE += -Iarm/aarch64/include + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) +endif + +### + +ifeq (,$(ARCH_INCLUDE)) + UNSUPP_ERR = @echo "This architecture is not supported in kvmtool." && exit 1 +else + UNSUPP_ERR = +endif + +### + +# Detect optional features. +# On a given system, some libs may link statically, some may not; so, check +# both and only build those that link! 
+ +FLAGS_BFD := $(CFLAGS) -lbfd +ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_BFD + OBJS_STATOPT += symbol.o + LIBS_STATOPT += -lbfd +endif + +FLAGS_VNCSERVER := $(CFLAGS) -lvncserver +ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER)),y) + OBJS_DYNOPT += ui/vnc.o + CFLAGS_DYNOPT += -DCONFIG_HAS_VNCSERVER + LIBS_DYNOPT += -lvncserver +endif +ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER) -static),y) + OBJS_STATOPT += ui/vnc.o + CFLAGS_STATOPT += -DCONFIG_HAS_VNCSERVER + LIBS_STATOPT += -lvncserver +endif + +FLAGS_SDL := $(CFLAGS) -lSDL +ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL)),y) + OBJS_DYNOPT += ui/sdl.o + CFLAGS_DYNOPT += -DCONFIG_HAS_SDL + LIBS_DYNOPT += -lSDL +endif +ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL) -static), y) + OBJS_STATOPT += ui/sdl.o + CFLAGS_STATOPT += -DCONFIG_HAS_SDL + LIBS_STATOPT += -lSDL +endif + +FLAGS_ZLIB := $(CFLAGS) -lz +ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB)),y) + CFLAGS_DYNOPT += -DCONFIG_HAS_ZLIB + LIBS_DYNOPT += -lz +endif +ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_ZLIB + LIBS_STATOPT += -lz +endif + +FLAGS_AIO := $(CFLAGS) -laio +ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO)),y) + CFLAGS_DYNOPT += -DCONFIG_HAS_AIO + LIBS_DYNOPT += -laio +endif +ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_AIO + LIBS_STATOPT += -laio +endif + +ifeq ($(LTO),1) + FLAGS_LTO := -flto + ifeq ($(call try-cc,$(SOURCE_HELLO),$(FLAGS_LTO)),y) + CFLAGS += $(FLAGS_LTO) + endif +endif + +ifneq ($(call try-build,$(SOURCE_STATIC),-static,),y) +$(error No static libc found. Please install glibc-static package.) 
+endif +### + +LIBS += -lrt +LIBS += -lpthread +LIBS += -lutil + + +DEPS := $(patsubst %.o,%.d,$(OBJS)) + +DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_GNU_SOURCE +DEFINES += -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"' +DEFINES += -DBUILD_ARCH='"$(ARCH)"' + +KVM_INCLUDE := include +CFLAGS += $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -I$(KINCL_PATH)/include/uapi -I$(KINCL_PATH)/include -I$(KINCL_PATH)/arch/$(ARCH)/include/uapi -I$(KINCL_PATH)/arch/$(ARCH)/include/ -O2 -fno-strict-aliasing -g + +WARNINGS += -Wall +WARNINGS += -Wformat=2 +WARNINGS += -Winit-self +WARNINGS += -Wmissing-declarations +WARNINGS += -Wmissing-prototypes +WARNINGS += -Wnested-externs +WARNINGS += -Wno-system-headers +WARNINGS += -Wold-style-definition +WARNINGS += -Wredundant-decls +WARNINGS += -Wsign-compare +WARNINGS += -Wstrict-prototypes +WARNINGS += -Wundef +WARNINGS += -Wvolatile-register-var +WARNINGS += -Wwrite-strings + +CFLAGS += $(WARNINGS) + +# Some targets may use 'external' sources that don't build totally cleanly. +CFLAGS_EASYGOING := $(CFLAGS) + +ifneq ($(WERROR),0) + CFLAGS += -Werror +endif + +all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) $(GUEST_INIT) + +arch_support_check: + $(UNSUPP_ERR) + +KVMTOOLS-VERSION-FILE: + @$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT) +-include $(OUTPUT)KVMTOOLS-VERSION-FILE + +# When building -static all objects are built with appropriate flags, which +# may differ between static & dynamic .o. The objects are separated into +# .o and .static.o. See the %.o: %.c rules below. +# +# $(OTHEROBJS) are things that do not get substituted like this. 
+# +STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT)) +GUEST_OBJS = guest/guest_init.o + +$(PROGRAM)-static: $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT) + $(E) " LINK " $@ + $(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_STATOPT) -o $@ + +$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT) + $(E) " LINK " $@ + $(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_DYNOPT) -o $@ + +$(PROGRAM_ALIAS): $(PROGRAM) + $(E) " LN " $@ + $(Q) ln -f $(PROGRAM) $@ + +$(GUEST_INIT): guest/init.c + $(E) " LINK " $@ + $(Q) $(CC) -static guest/init.c -o $@ + $(Q) $(LD) -r -b binary -o guest/guest_init.o $(GUEST_INIT) + +$(DEPS): + +util/rbtree.d: ../../lib/rbtree.c + $(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $< -o $@ + +%.d: %.c + $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@ + +%.s: %.c + $(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $< + +# The header file common-cmds.h is needed for compilation of builtin-help.c. +builtin-help.d: $(KVM_INCLUDE)/common-cmds.h + +$(OBJS): + +# This rule relaxes the -Werror on libfdt, since for now it still has +# a bunch of warnings. 
:( +../../scripts/dtc/libfdt/%.o: ../../scripts/dtc/libfdt/%.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS_EASYGOING) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS_EASYGOING) $< -o $@ + +util/rbtree.static.o util/rbtree.o: ../../lib/rbtree.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $< -o $@ + +%.static.o: %.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@ + +%.o: %.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@ + + +$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt + +$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt) + $(E) " GEN " $@ + $(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@ + +# +# BIOS assembly weirdness +# +BIOS_CFLAGS += -m32 +BIOS_CFLAGS += -march=i386 +BIOS_CFLAGS += -mregparm=3 + +BIOS_CFLAGS += -fno-stack-protector +BIOS_CFLAGS += -I../../arch/$(ARCH) + +x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h + +x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S + $(E) " CC x86/bios/memcpy.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/memcpy.c -o x86/bios/memcpy.o + $(E) " CC x86/bios/e820.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/e820.c -o x86/bios/e820.o + $(E) " CC x86/bios/int10.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int10.c -o x86/bios/int10.o + $(E) " CC x86/bios/int15.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int15.c -o x86/bios/int15.o + $(E) " CC x86/bios/entry.o" + $(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/entry.S -o x86/bios/entry.o + $(E) " LD " $@ + $(Q) 
$(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o + +x86/bios/bios.bin: x86/bios/bios.bin.elf + $(E) " OBJCOPY " $@ + $(Q) objcopy -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin + +x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o + +x86/bios/bios-rom.h: x86/bios/bios.bin.elf + $(E) " NM " $@ + $(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd .. + +check: all + $(MAKE) -C tests + ./$(PROGRAM) run tests/pit/tick.bin + ./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init" +.PHONY: check + +install: all + $(E) " INSTALL" + $(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' + $(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)' +.PHONY: install + +clean: + $(E) " CLEAN" + $(Q) rm -f x86/bios/*.bin + $(Q) rm -f x86/bios/*.elf + $(Q) rm -f x86/bios/*.o + $(Q) rm -f x86/bios/bios-rom.h + $(Q) rm -f tests/boot/boot_test.iso + $(Q) rm -rf tests/boot/rootfs/ + $(Q) rm -f $(DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_OBJS) + $(Q) rm -f cscope.* + $(Q) rm -f tags + $(Q) rm -f TAGS + $(Q) rm -f $(KVM_INCLUDE)/common-cmds.h + $(Q) rm -f KVMTOOLS-VERSION-FILE +.PHONY: clean + +KVM_DEV ?= /dev/kvm + +$(KVM_DEV): + $(E) " MKNOD " $@ + $(Q) mknod $@ char 10 232 + +devices: $(KVM_DEV) +.PHONY: devices + +TAGS: + $(E) " GEN" $@ + $(Q) $(RM) -f TAGS + $(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a +.PHONY: TAGS + +tags: + $(E) " GEN" $@ + $(Q) $(RM) -f tags + $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a +.PHONY: tags + +cscope: + $(E) " GEN" $@ + $(Q) $(FIND) . 
-name '*.[hcS]' -print > cscope.files + $(Q) $(CSCOPE) -bkqu +.PHONY: cscope + +# Deps +-include $(DEPS) diff --git a/tools/kvm/README b/tools/kvm/README new file mode 100644 index 000000000000..358fa23cbab3 --- /dev/null +++ b/tools/kvm/README @@ -0,0 +1,112 @@ +Native Linux KVM tool +===================== +The goal of this tool is to provide a clean, from-scratch, lightweight +KVM host tool implementation that can boot Linux guest images (just a +hobby, won't be big and professional like QEMU) with no BIOS +dependencies and with only the minimal amount of legacy device +emulation. + +It's great as a learning tool if you want to get your feet wet in +virtualization land: it's only 5 KLOC of clean C code that can already +boot a guest Linux image. + +Right now it can boot a Linux image and provide you output via a serial +console, over the host terminal, i.e. you can use it to boot a guest +Linux image in a terminal or over ssh and log into the guest without +much guest or host side setup work needed. + +1. To try out the tool, clone the git repository: + + git clone git://github.com/penberg/linux-kvm.git + +or alternatively, if you already have a kernel source tree: + + git remote add kvm-tool git://github.com/penberg/linux-kvm.git + git remote update + git checkout -b kvm-tool/master kvm-tool + +2. Compile the tool: + + cd tools/kvm && make + +3. Download a raw userspace image: + + wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2 +linux-0.2.img.bz2 + +4. The guest kernel has to be built with the following configuration: + + - For the default console output: + CONFIG_SERIAL_8250=y + CONFIG_SERIAL_8250_CONSOLE=y + + - For running 32bit images on 64bit hosts: + CONFIG_IA32_EMULATION=y + + - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS). 
+ + - For all virtio devices listed below: + CONFIG_VIRTIO=y + CONFIG_VIRTIO_RING=y + CONFIG_VIRTIO_PCI=y + + - For virtio-blk devices (--disk, -d): + CONFIG_VIRTIO_BLK=y + + - For virtio-net devices ([--network, -n] virtio): + CONFIG_VIRTIO_NET=y + + - For virtio-9p devices (--virtio-9p): + CONFIG_NET_9P=y + CONFIG_NET_9P_VIRTIO=y + CONFIG_9P_FS=y + + - For virtio-balloon device (--balloon): + CONFIG_VIRTIO_BALLOON=y + + - For virtio-console device (--console virtio): + CONFIG_VIRTIO_CONSOLE=y + + - For virtio-rng device (--rng): + CONFIG_HW_RANDOM_VIRTIO=y + + - For vesa device (--sdl or --vnc): + CONFIG_FB_VESA=y + + +5. And finally, launch the hypervisor: + + ./lkvm run --disk linux-0.2.img \ + --kernel ../../arch/x86/boot/bzImage \ +or + + sudo ./lkvm run --disk linux-0.2.img \ + --kernel ../../arch/x86/boot/bzImage \ + --network virtio + +The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He, +Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help +on KVM internals and Ingo Molnar for all-around support and encouragement! 
+ +See the following thread for original discussion for motivation of this +project: + +http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620 + +Build dependencies +===================== +For deb based systems: +32-bit: +sudo apt-get install build-essential +64-bit: +sudo apt-get install build-essential libc6-dev-i386 + +For rpm based systems: +32-bit: +yum install glibc-devel +64-bit: +yum install glibc-devel glibc-static + +On 64-bit Arch Linux make sure the multilib repository is enabled in your +/etc/pacman.conf and run +pacman -Sy lib32-glibc diff --git a/tools/kvm/arm/aarch32/cortex-a15.c b/tools/kvm/arm/aarch32/cortex-a15.c new file mode 100644 index 000000000000..80317474c8bc --- /dev/null +++ b/tools/kvm/arm/aarch32/cortex-a15.c @@ -0,0 +1,94 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/types.h> + +#define CPU_NAME_MAX_LEN 8 +static void generate_cpu_nodes(void *fdt, struct kvm *kvm) +{ + int cpu; + + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (cpu = 0; cpu < kvm->nrcpus; ++cpu) { + char cpu_name[CPU_NAME_MAX_LEN]; + + if (kvm->cpus[cpu]->cpu_type != KVM_ARM_TARGET_CORTEX_A15) { + pr_warning("Ignoring unknown type for CPU %d\n", cpu); + continue; + } + + snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%d", cpu); + + _FDT(fdt_begin_node(fdt, cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15")); + + if (kvm->nrcpus > 1) + _FDT(fdt_property_string(fdt, "enable-method", "psci")); + + _FDT(fdt_property_cell(fdt, "reg", cpu)); + _FDT(fdt_end_node(fdt)); + } + + _FDT(fdt_end_node(fdt)); +} + +static void generate_timer_nodes(void *fdt, struct kvm *kvm) +{ + u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \ + & 
GIC_FDT_IRQ_PPI_CPU_MASK; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(13), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(14), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(11), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(10), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + _FDT(fdt_begin_node(fdt, "timer")); + _FDT(fdt_property_string(fdt, "compatible", "arm,armv7-timer")); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle) +{ + generate_cpu_nodes(fdt, kvm); + gic__generate_fdt_nodes(fdt, gic_phandle); + generate_timer_nodes(fdt, kvm); +} + +static int cortex_a15__vcpu_init(struct kvm_cpu *vcpu) +{ + vcpu->generate_fdt_nodes = generate_fdt_nodes; + return 0; +} + +static struct kvm_arm_target target_cortex_a15 = { + .id = KVM_ARM_TARGET_CORTEX_A15, + .init = cortex_a15__vcpu_init, +}; + +static int cortex_a15__core_init(struct kvm *kvm) +{ + return kvm_cpu__register_kvm_arm_target(&target_cortex_a15); +} +core_init(cortex_a15__core_init); diff --git a/tools/kvm/arm/aarch32/include/kvm/barrier.h b/tools/kvm/arm/aarch32/include/kvm/barrier.h new file mode 100644 index 000000000000..94913a9564d4 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/barrier.h @@ -0,0 +1,10 @@ +#ifndef KVM__KVM_BARRIER_H +#define KVM__KVM_BARRIER_H + +#define dmb() asm volatile ("dmb" : : : "memory") + +#define mb() dmb() +#define rmb() dmb() +#define wmb() dmb() + +#endif /* KVM__KVM_BARRIER_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..1632e3c5e834 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h 
@@ -0,0 +1,13 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#define ARM_GIC_DIST_SIZE 0x1000 +#define ARM_GIC_CPUI_SIZE 0x2000 + +#define ARM_KERN_OFFSET(...) 0x8000 + +#define ARM_MAX_MEMORY(...) ARM_LOMAP_MAX_MEMORY + +#include "arm-common/kvm-arch.h" + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..acf0d2387774 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h @@ -0,0 +1,8 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#define ARM_OPT_ARCH_RUN(...) + +#include "arm-common/kvm-config-arch.h" + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..b9fda07d1e55 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,12 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +#include "kvm/kvm.h" + +#include "arm-common/kvm-cpu-arch.h" + +#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \ + [0] = (!!(cpuid) << KVM_ARM_VCPU_POWER_OFF), \ +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/kvm-cpu.c b/tools/kvm/arm/aarch32/kvm-cpu.c new file mode 100644 index 000000000000..a5287897ee72 --- /dev/null +++ b/tools/kvm/arm/aarch32/kvm-cpu.c @@ -0,0 +1,106 @@ +#include "kvm/kvm-cpu.h" +#include "kvm/kvm.h" + +#include <asm/ptrace.h> + +#define ARM_CORE_REG(x) (KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | \ + KVM_REG_ARM_CORE_REG(x)) + +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u32 data; + + /* Who said future-proofing was a good idea? 
*/ + reg.addr = (u64)(unsigned long)&data; + + /* cpsr = IRQs/FIQs masked */ + data = PSR_I_BIT | PSR_F_BIT | SVC_MODE; + reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (cpsr)"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id != 0) + return; + + /* r0 = 0 */ + data = 0; + reg.id = ARM_CORE_REG(usr_regs.ARM_r0); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r0)"); + + /* r1 = machine type (-1) */ + data = -1; + reg.id = ARM_CORE_REG(usr_regs.ARM_r1); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r1)"); + + /* r2 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM_CORE_REG(usr_regs.ARM_r2); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r2)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + u32 data; + + reg.addr = (u64)(unsigned long)&data; + + printf("*pc:\n"); + reg.id = ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ PC)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); + + printf("*lr (svc):\n"); + reg.id = ARM_CORE_REG(svc_regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ LR_svc)"); + data &= ~0x1; + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + u32 data; + int debug_fd = kvm_cpu__get_debug_fd(); + + reg.addr = (u64)(unsigned long)&data; + dprintf(debug_fd, "\n Registers:\n"); + + reg.id =
ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (pc)"); + dprintf(debug_fd, " PC: 0x%x\n", data); + + reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (cpsr)"); + dprintf(debug_fd, " CPSR: 0x%x\n", data); + + reg.id = ARM_CORE_REG(svc_regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (SP_svc)"); + dprintf(debug_fd, " SP_svc: 0x%x\n", data); + + reg.id = ARM_CORE_REG(svc_regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (LR_svc)"); + dprintf(debug_fd, " LR_svc: 0x%x\n", data); +} diff --git a/tools/kvm/arm/aarch64/cortex-a57.c b/tools/kvm/arm/aarch64/cortex-a57.c new file mode 100644 index 000000000000..4fd11ba1b8c2 --- /dev/null +++ b/tools/kvm/arm/aarch64/cortex-a57.c @@ -0,0 +1,95 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/types.h> + +#define CPU_NAME_MAX_LEN 8 +static void generate_cpu_nodes(void *fdt, struct kvm *kvm) +{ + int cpu; + + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (cpu = 0; cpu < kvm->nrcpus; ++cpu) { + char cpu_name[CPU_NAME_MAX_LEN]; + + if (kvm->cpus[cpu]->cpu_type != KVM_ARM_TARGET_CORTEX_A57) { + pr_warning("Ignoring unknown type for CPU %d\n", cpu); + continue; + } + + snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%d", cpu); + + _FDT(fdt_begin_node(fdt, cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a57")); + + if (kvm->nrcpus > 1) + _FDT(fdt_property_string(fdt, "enable-method", "psci")); + + _FDT(fdt_property_cell(fdt, "reg", cpu)); + _FDT(fdt_end_node(fdt)); + } + + _FDT(fdt_end_node(fdt)); +} + +static void
generate_timer_nodes(void *fdt, struct kvm *kvm) +{ + u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \ + & GIC_FDT_IRQ_PPI_CPU_MASK; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(13), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(14), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(11), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(10), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + _FDT(fdt_begin_node(fdt, "timer")); + _FDT(fdt_property_string(fdt, "compatible", "arm,armv8-timer")); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle) +{ + generate_cpu_nodes(fdt, kvm); + gic__generate_fdt_nodes(fdt, gic_phandle); + generate_timer_nodes(fdt, kvm); +} + + +static int cortex_a57__vcpu_init(struct kvm_cpu *vcpu) +{ + vcpu->generate_fdt_nodes = generate_fdt_nodes; + return 0; +} + +static struct kvm_arm_target target_cortex_a57 = { + .id = KVM_ARM_TARGET_CORTEX_A57, + .init = cortex_a57__vcpu_init, +}; + +static int cortex_a57__core_init(struct kvm *kvm) +{ + return kvm_cpu__register_kvm_arm_target(&target_cortex_a57); +} +core_init(cortex_a57__core_init); diff --git a/tools/kvm/arm/aarch64/include/kvm/barrier.h b/tools/kvm/arm/aarch64/include/kvm/barrier.h new file mode 100644 index 000000000000..97ab252171e1 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/barrier.h @@ -0,0 +1,8 @@ +#ifndef KVM__KVM_BARRIER_H +#define KVM__KVM_BARRIER_H + +#define mb() asm volatile ("dmb ish" : : : "memory") +#define rmb() asm volatile ("dmb ishld" : : : "memory") +#define wmb() asm volatile ("dmb ishst" : : : "memory") + +#endif /* KVM__KVM_BARRIER_H */ diff --git 
a/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..2f08a26306d7 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h @@ -0,0 +1,17 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#define ARM_GIC_DIST_SIZE 0x10000 +#define ARM_GIC_CPUI_SIZE 0x10000 + +#define ARM_KERN_OFFSET(kvm) ((kvm)->cfg.arch.aarch32_guest ? \ + 0x8000 : \ + 0x80000) + +#define ARM_MAX_MEMORY(kvm) ((kvm)->cfg.arch.aarch32_guest ? \ + ARM_LOMAP_MAX_MEMORY : \ + ARM_HIMAP_MAX_MEMORY) + +#include "arm-common/kvm-arch.h" + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..89860ae3166c --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h @@ -0,0 +1,10 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#define ARM_OPT_ARCH_RUN(cfg) \ + OPT_BOOLEAN('\0', "aarch32", &(cfg)->aarch32_guest, \ + "Run AArch32 guest"), + +#include "arm-common/kvm-config-arch.h" + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..d85c583421c5 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,13 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +#include "kvm/kvm.h" + +#include "arm-common/kvm-cpu-arch.h" + +#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \ + [0] = ((!!(cpuid) << KVM_ARM_VCPU_POWER_OFF) | \ + (!!(kvm)->cfg.arch.aarch32_guest << KVM_ARM_VCPU_EL1_32BIT)) \ +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/kvm-cpu.c b/tools/kvm/arm/aarch64/kvm-cpu.c new file mode 100644 index 000000000000..2eb06eab7e4b --- /dev/null +++ b/tools/kvm/arm/aarch64/kvm-cpu.c @@ -0,0 +1,160 @@ +#include "kvm/kvm-cpu.h" +#include "kvm/kvm.h" + +#include 
<asm/ptrace.h> + +#define COMPAT_PSR_F_BIT 0x00000040 +#define COMPAT_PSR_I_BIT 0x00000080 +#define COMPAT_PSR_MODE_SVC 0x00000013 + +#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) + +static void reset_vcpu_aarch32(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u64 data; + + reg.addr = (u64)&data; + + /* pstate = all interrupts masked */ + data = COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT | COMPAT_PSR_MODE_SVC; + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (spsr[EL1])"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id != 0) + return; + + /* r0 = 0 */ + data = 0; + reg.id = ARM64_CORE_REG(regs.regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r0)"); + + /* r1 = machine type (-1) */ + data = -1; + reg.id = ARM64_CORE_REG(regs.regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r1)"); + + /* r2 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM64_CORE_REG(regs.regs[2]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r2)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); +} + +static void reset_vcpu_aarch64(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u64 data; + + reg.addr = (u64)&data; + + /* pstate = all interrupts masked */ + data = PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h; + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (spsr[EL1])"); + + /* x1...x3 = 0 */ + data = 0; + reg.id =
ARM64_CORE_REG(regs.regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x1)"); + + reg.id = ARM64_CORE_REG(regs.regs[2]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x2)"); + + reg.id = ARM64_CORE_REG(regs.regs[3]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x3)"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id == 0) { + /* x0 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM64_CORE_REG(regs.regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x0)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); + } +} + +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + if (vcpu->kvm->cfg.arch.aarch32_guest) + return reset_vcpu_aarch32(vcpu); + else + return reset_vcpu_aarch64(vcpu); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + unsigned long data; + + reg.addr = (u64)&data; + + printf("*pc:\n"); + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ PC)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); + + printf("*lr:\n"); + reg.id = ARM64_CORE_REG(regs.regs[30]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ LR)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + unsigned long data; + int debug_fd = kvm_cpu__get_debug_fd(); + + reg.addr = (u64)&data; + dprintf(debug_fd, "\n Registers:\n"); + + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + 
die("KVM_GET_ONE_REG failed (pc)"); + dprintf(debug_fd, " PC: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (pstate)"); + dprintf(debug_fd, " PSTATE: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(sp_el1); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (sp_el1)"); + dprintf(debug_fd, " SP_EL1: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(regs.regs[30]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (lr)"); + dprintf(debug_fd, " LR: 0x%lx\n", data); +} diff --git a/tools/kvm/arm/fdt.c b/tools/kvm/arm/fdt.c new file mode 100644 index 000000000000..20e03084e518 --- /dev/null +++ b/tools/kvm/arm/fdt.c @@ -0,0 +1,250 @@ +#include "kvm/devices.h" +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/virtio-mmio.h" + +#include "arm-common/gic.h" + +#include <stdbool.h> + +#include <asm/setup.h> +#include <linux/byteorder.h> +#include <linux/kernel.h> +#include <linux/sizes.h> + +static char kern_cmdline[COMMAND_LINE_SIZE]; + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + return false; +} + +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + return 0; +} + +static void dump_fdt(const char *dtb_file, void *fdt) +{ + int count, fd; + + fd = open(dtb_file, O_CREAT | O_TRUNC | O_RDWR, 0666); + if (fd < 0) + die("Failed to write dtb to %s", dtb_file); + + count = write(fd, fdt, FDT_MAX_SIZE); + if (count < 0) + die_perror("Failed to dump dtb"); + + pr_info("Wrote %d bytes to dtb %s\n", count, dtb_file); + close(fd); +} + +#define DEVICE_NAME_MAX_LEN 32 +static void generate_virtio_mmio_node(void *fdt, struct virtio_mmio *vmmio) +{ + char dev_name[DEVICE_NAME_MAX_LEN]; + u64 addr = vmmio->addr; + u64 reg_prop[] = { + cpu_to_fdt64(addr), + cpu_to_fdt64(VIRTIO_MMIO_IO_SIZE) + }; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI), + 
cpu_to_fdt32(vmmio->irq - GIC_SPI_IRQ_BASE), + cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + snprintf(dev_name, DEVICE_NAME_MAX_LEN, "virtio@%llx", addr); + + _FDT(fdt_begin_node(fdt, dev_name)); + _FDT(fdt_property_string(fdt, "compatible", "virtio,mmio")); + _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop))); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static int setup_fdt(struct kvm *kvm) +{ + struct device_header *dev_hdr; + u8 staging_fdt[FDT_MAX_SIZE]; + u32 gic_phandle = fdt__alloc_phandle(); + u64 mem_reg_prop[] = { + cpu_to_fdt64(kvm->arch.memory_guest_start), + cpu_to_fdt64(kvm->ram_size), + }; + void *fdt = staging_fdt; + void *fdt_dest = guest_flat_to_host(kvm, + kvm->arch.dtb_guest_start); + void (*generate_cpu_nodes)(void *, struct kvm *, u32) + = kvm->cpus[0]->generate_fdt_nodes; + + /* Create new tree without a reserve map */ + _FDT(fdt_create(fdt, FDT_MAX_SIZE)); + _FDT(fdt_finish_reservemap(fdt)); + + /* Header */ + _FDT(fdt_begin_node(fdt, "")); + _FDT(fdt_property_cell(fdt, "interrupt-parent", gic_phandle)); + _FDT(fdt_property_string(fdt, "compatible", "linux,dummy-virt")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x2)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x2)); + + /* /chosen */ + _FDT(fdt_begin_node(fdt, "chosen")); + _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline)); + + /* Initrd */ + if (kvm->arch.initrd_size != 0) { + u32 ird_st_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start); + u32 ird_end_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start + + kvm->arch.initrd_size); + + _FDT(fdt_property(fdt, "linux,initrd-start", + &ird_st_prop, sizeof(ird_st_prop))); + _FDT(fdt_property(fdt, "linux,initrd-end", + &ird_end_prop, sizeof(ird_end_prop))); + } + _FDT(fdt_end_node(fdt)); + + /* Memory */ + _FDT(fdt_begin_node(fdt, "memory")); + _FDT(fdt_property_string(fdt, "device_type", "memory")); + _FDT(fdt_property(fdt, "reg", mem_reg_prop, 
sizeof(mem_reg_prop))); + _FDT(fdt_end_node(fdt)); + + /* CPU and peripherals (interrupt controller, timers, etc) */ + if (generate_cpu_nodes) + generate_cpu_nodes(fdt, kvm, gic_phandle); + + /* Virtio MMIO devices */ + dev_hdr = device__first_dev(DEVICE_BUS_MMIO); + while (dev_hdr) { + generate_virtio_mmio_node(fdt, dev_hdr->data); + dev_hdr = device__next_dev(dev_hdr); + } + + /* PSCI firmware */ + _FDT(fdt_begin_node(fdt, "psci")); + _FDT(fdt_property_string(fdt, "compatible", "arm,psci")); + _FDT(fdt_property_string(fdt, "method", "hvc")); + _FDT(fdt_property_cell(fdt, "cpu_suspend", KVM_PSCI_FN_CPU_SUSPEND)); + _FDT(fdt_property_cell(fdt, "cpu_off", KVM_PSCI_FN_CPU_OFF)); + _FDT(fdt_property_cell(fdt, "cpu_on", KVM_PSCI_FN_CPU_ON)); + _FDT(fdt_property_cell(fdt, "migrate", KVM_PSCI_FN_MIGRATE)); + _FDT(fdt_end_node(fdt)); + + /* Finalise. */ + _FDT(fdt_end_node(fdt)); + _FDT(fdt_finish(fdt)); + + _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE)); + _FDT(fdt_pack(fdt_dest)); + + if (kvm->cfg.arch.dump_dtb_filename) + dump_fdt(kvm->cfg.arch.dump_dtb_filename, fdt_dest); + return 0; +} +late_init(setup_fdt); + +static int read_image(int fd, void **pos, void *limit) +{ + int count; + + while (((count = xread(fd, *pos, SZ_64K)) > 0) && *pos <= limit) + *pos += count; + + if (pos < 0) + die_perror("xread"); + + return *pos < limit ? 0 : -ENOMEM; +} + +#define FDT_ALIGN SZ_2M +#define INITRD_ALIGN 4 +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + void *pos, *kernel_end, *limit; + unsigned long guest_addr; + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + /* + * Linux requires the initrd and dtb to be mapped inside lowmem, + * so we can't just place them at the top of memory. 
+ */ + limit = kvm->ram_start + min(kvm->ram_size, (u64)SZ_256M) - 1; + + pos = kvm->ram_start + ARM_KERN_OFFSET(kvm); + kvm->arch.kern_guest_start = host_to_guest_flat(kvm, pos); + if (read_image(fd_kernel, &pos, limit) == -ENOMEM) + die("kernel image too big to contain in guest memory."); + + kernel_end = pos; + pr_info("Loaded kernel to 0x%llx (%llu bytes)", + kvm->arch.kern_guest_start, + host_to_guest_flat(kvm, pos) - kvm->arch.kern_guest_start); + + /* + * Now load backwards from the end of memory so the kernel + * decompressor has plenty of space to work with. First up is + * the device tree blob... + */ + pos = limit; + pos -= (FDT_MAX_SIZE + FDT_ALIGN); + guest_addr = ALIGN(host_to_guest_flat(kvm, pos), FDT_ALIGN); + pos = guest_flat_to_host(kvm, guest_addr); + if (pos < kernel_end) + die("fdt overlaps with kernel image."); + + kvm->arch.dtb_guest_start = guest_addr; + pr_info("Placing fdt at 0x%llx - 0x%llx", + kvm->arch.dtb_guest_start, + host_to_guest_flat(kvm, limit)); + limit = pos; + + /* ... and finally the initrd, if we have one. 
*/ + if (fd_initrd != -1) { + struct stat sb; + unsigned long initrd_start; + + if (lseek(fd_initrd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (fstat(fd_initrd, &sb)) + die_perror("fstat"); + + pos -= (sb.st_size + INITRD_ALIGN); + guest_addr = ALIGN(host_to_guest_flat(kvm, pos), INITRD_ALIGN); + pos = guest_flat_to_host(kvm, guest_addr); + if (pos < kernel_end) + die("initrd overlaps with kernel image."); + + initrd_start = guest_addr; + if (read_image(fd_initrd, &pos, limit) == -ENOMEM) + die("initrd too big to contain in guest memory."); + + kvm->arch.initrd_guest_start = initrd_start; + kvm->arch.initrd_size = host_to_guest_flat(kvm, pos) - initrd_start; + pr_info("Loaded initrd to 0x%llx (%llu bytes)", + kvm->arch.initrd_guest_start, + kvm->arch.initrd_size); + } else { + kvm->arch.initrd_size = 0; + } + + strncpy(kern_cmdline, kernel_cmdline, COMMAND_LINE_SIZE); + kern_cmdline[COMMAND_LINE_SIZE - 1] = '\0'; + + return true; +} + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + /* To b or not to b? That is the zImage. 
*/ + return false; +} diff --git a/tools/kvm/arm/gic.c b/tools/kvm/arm/gic.c new file mode 100644 index 000000000000..3f42c3a11d16 --- /dev/null +++ b/tools/kvm/arm/gic.c @@ -0,0 +1,92 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/virtio.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/kvm.h> + +static int irq_ids; + +int gic__alloc_irqnum(void) +{ + int irq = GIC_SPI_IRQ_BASE + irq_ids++; + + if (irq > GIC_MAX_IRQ) + die("GIC IRQ limit %d reached!", GIC_MAX_IRQ); + + return irq; +} + +int gic__init_irqchip(struct kvm *kvm) +{ + int err; + struct kvm_device_address gic_addr[] = { + [0] = { + .id = (KVM_ARM_DEVICE_VGIC_V2 << KVM_DEVICE_ID_SHIFT) |\ + KVM_VGIC_V2_ADDR_TYPE_DIST, + .addr = ARM_GIC_DIST_BASE, + }, + [1] = { + .id = (KVM_ARM_DEVICE_VGIC_V2 << KVM_DEVICE_ID_SHIFT) |\ + KVM_VGIC_V2_ADDR_TYPE_CPU, + .addr = ARM_GIC_CPUI_BASE, + } + }; + + if (kvm->nrcpus > GIC_MAX_CPUS) { + pr_warning("%d CPUS greater than maximum of %d -- truncating\n", + kvm->nrcpus, GIC_MAX_CPUS); + kvm->nrcpus = GIC_MAX_CPUS; + } + + err = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP); + if (err) + return err; + + err = ioctl(kvm->vm_fd, KVM_SET_DEVICE_ADDRESS, &gic_addr[0]); + if (err) + return err; + + err = ioctl(kvm->vm_fd, KVM_SET_DEVICE_ADDRESS, &gic_addr[1]); + return err; +} + +void gic__generate_fdt_nodes(void *fdt, u32 phandle) +{ + u64 reg_prop[] = { + cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE), + cpu_to_fdt64(ARM_GIC_CPUI_BASE), cpu_to_fdt64(ARM_GIC_CPUI_SIZE), + }; + + _FDT(fdt_begin_node(fdt, "intc")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15-gic")); + _FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS)); + _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0)); + _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop))); + _FDT(fdt_property_cell(fdt, "phandle", phandle)); + _FDT(fdt_end_node(fdt)); +} + +#define KVM_IRQCHIP_IRQ(x) (KVM_ARM_IRQ_TYPE_SPI << 
KVM_ARM_IRQ_TYPE_SHIFT) |\ + ((x) & KVM_ARM_IRQ_NUM_MASK) + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + struct kvm_irq_level irq_level = { + .irq = KVM_IRQCHIP_IRQ(irq), + .level = !!level, + }; + + if (irq < GIC_SPI_IRQ_BASE || irq > GIC_MAX_IRQ) + pr_warning("Ignoring invalid GIC IRQ %d", irq); + else if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0) + pr_warning("Could not KVM_IRQ_LINE for irq %d", irq); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH); + kvm__irq_line(kvm, irq, VIRTIO_IRQ_LOW); +} diff --git a/tools/kvm/arm/include/arm-common/gic.h b/tools/kvm/arm/include/arm-common/gic.h new file mode 100644 index 000000000000..850edc78e427 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/gic.h @@ -0,0 +1,35 @@ +#ifndef ARM_COMMON__GIC_H +#define ARM_COMMON__GIC_H + +#define GIC_SGI_IRQ_BASE 0 +#define GIC_PPI_IRQ_BASE 16 +#define GIC_SPI_IRQ_BASE 32 + +#define GIC_FDT_IRQ_NUM_CELLS 3 + +#define GIC_FDT_IRQ_TYPE_SPI 0 +#define GIC_FDT_IRQ_TYPE_PPI 1 + +#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI 1 +#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO 2 +#define GIC_FDT_IRQ_FLAGS_LEVEL_HI 4 +#define GIC_FDT_IRQ_FLAGS_LEVEL_LO 8 + +#define GIC_FDT_IRQ_PPI_CPU_SHIFT 8 +#define GIC_FDT_IRQ_PPI_CPU_MASK (0xff << GIC_FDT_IRQ_PPI_CPU_SHIFT) + +#define GIC_CPUI_CTLR_EN (1 << 0) +#define GIC_CPUI_PMR_MIN_PRIO 0xff + +#define GIC_CPUI_OFF_PMR 4 + +#define GIC_MAX_CPUS 8 +#define GIC_MAX_IRQ 255 + +struct kvm; + +int gic__alloc_irqnum(void); +int gic__init_irqchip(struct kvm *kvm); +void gic__generate_fdt_nodes(void *fdt, u32 phandle); + +#endif /* ARM_COMMON__GIC_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-arch.h b/tools/kvm/arm/include/arm-common/kvm-arch.h new file mode 100644 index 000000000000..7860e1729ca1 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-arch.h @@ -0,0 +1,57 @@ +#ifndef ARM_COMMON__KVM_ARCH_H +#define ARM_COMMON__KVM_ARCH_H + +#include <stdbool.h> +#include <linux/const.h> 
+#include <linux/types.h> + +#define ARM_MMIO_AREA _AC(0x0000000000000000, UL) +#define ARM_AXI_AREA _AC(0x0000000040000000, UL) +#define ARM_MEMORY_AREA _AC(0x0000000080000000, UL) + +#define ARM_LOMAP_MAX_MEMORY ((1ULL << 32) - ARM_MEMORY_AREA) +#define ARM_HIMAP_MAX_MEMORY ((1ULL << 40) - ARM_MEMORY_AREA) + +#define ARM_GIC_DIST_BASE (ARM_AXI_AREA - ARM_GIC_DIST_SIZE) +#define ARM_GIC_CPUI_BASE (ARM_GIC_DIST_BASE - ARM_GIC_CPUI_SIZE) +#define ARM_GIC_SIZE (ARM_GIC_DIST_SIZE + ARM_GIC_CPUI_SIZE) + +#define ARM_VIRTIO_MMIO_SIZE (ARM_AXI_AREA - ARM_GIC_SIZE) +#define ARM_PCI_MMIO_SIZE (ARM_MEMORY_AREA - ARM_AXI_AREA) + +#define KVM_PCI_MMIO_AREA ARM_AXI_AREA +#define KVM_VIRTIO_MMIO_AREA ARM_MMIO_AREA + +#define VIRTIO_DEFAULT_TRANS VIRTIO_MMIO + +static inline bool arm_addr_in_virtio_mmio_region(u64 phys_addr) +{ + u64 limit = KVM_VIRTIO_MMIO_AREA + ARM_VIRTIO_MMIO_SIZE; + return phys_addr >= KVM_VIRTIO_MMIO_AREA && phys_addr < limit; +} + +static inline bool arm_addr_in_pci_mmio_region(u64 phys_addr) +{ + u64 limit = KVM_PCI_MMIO_AREA + ARM_PCI_MMIO_SIZE; + return phys_addr >= KVM_PCI_MMIO_AREA && phys_addr < limit; +} + +struct kvm_arch { + /* + * We may have to align the guest memory for virtio, so keep the + * original pointers here for munmap. + */ + void *ram_alloc_start; + u64 ram_alloc_size; + + /* + * Guest addresses for memory layout. 
+ */ + u64 memory_guest_start; + u64 kern_guest_start; + u64 initrd_guest_start; + u64 initrd_size; + u64 dtb_guest_start; +}; + +#endif /* ARM_COMMON__KVM_ARCH_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-config-arch.h b/tools/kvm/arm/include/arm-common/kvm-config-arch.h new file mode 100644 index 000000000000..7ac6f6e88550 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-config-arch.h @@ -0,0 +1,17 @@ +#ifndef ARM_COMMON__KVM_CONFIG_ARCH_H +#define ARM_COMMON__KVM_CONFIG_ARCH_H + +#include "kvm/parse-options.h" + +struct kvm_config_arch { + const char *dump_dtb_filename; + bool aarch32_guest; +}; + +#define OPT_ARCH_RUN(pfx, cfg) \ + pfx, \ + ARM_OPT_ARCH_RUN(cfg) \ + OPT_STRING('\0', "dump-dtb", &(cfg)->dump_dtb_filename, \ + ".dtb file", "Dump generated .dtb to specified file"), + +#endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h new file mode 100644 index 000000000000..351fbe68e5a9 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h @@ -0,0 +1,46 @@ +#ifndef ARM_COMMON__KVM_CPU_ARCH_H +#define ARM_COMMON__KVM_CPU_ARCH_H + +#include <linux/kvm.h> +#include <pthread.h> +#include <stdbool.h> + +struct kvm; + +struct kvm_cpu { + pthread_t thread; + + unsigned long cpu_id; + unsigned long cpu_type; + + struct kvm *kvm; + int vcpu_fd; + struct kvm_run *kvm_run; + + u8 is_running; + u8 paused; + u8 needs_nmi; + + struct kvm_coalesced_mmio_ring *ring; + + void (*generate_fdt_nodes)(void *fdt, struct kvm* kvm, + u32 gic_phandle); +}; + +struct kvm_arm_target { + u32 id; + int (*init)(struct kvm_cpu *vcpu); +}; + +int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target); + +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, + int direction, int size, u32 count) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, + u8 is_write); + +#endif /* 
ARM_COMMON__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/ioport.c b/tools/kvm/arm/ioport.c new file mode 100644 index 000000000000..3c03fa05dd05 --- /dev/null +++ b/tools/kvm/arm/ioport.c @@ -0,0 +1,5 @@ +#include "kvm/ioport.h" + +void ioport__setup_arch(struct kvm *kvm) +{ +} diff --git a/tools/kvm/arm/irq.c b/tools/kvm/arm/irq.c new file mode 100644 index 000000000000..e173e04f3668 --- /dev/null +++ b/tools/kvm/arm/irq.c @@ -0,0 +1,17 @@ +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + *line = gic__alloc_irqnum(); + return 0; +} + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + die(__FUNCTION__); + return 0; +} diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c new file mode 100644 index 000000000000..7a0eff45d4ca --- /dev/null +++ b/tools/kvm/arm/kvm-cpu.c @@ -0,0 +1,109 @@ +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static struct kvm_arm_target *kvm_arm_targets[KVM_ARM_NUM_TARGETS]; +int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target) +{ + unsigned int i = 0; + + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) { + if (!kvm_arm_targets[i]) { + kvm_arm_targets[i] = target; + return 0; + } + } + + return -ENOSPC; +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int coalesced_offset, mmap_size, err = -1; + unsigned int i; + struct kvm_vcpu_init vcpu_init = { + .features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id) + }; + + vcpu = calloc(1, sizeof(struct kvm_cpu)); + if (!vcpu) + return NULL; + + vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + 
die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, + vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + /* Find an appropriate target CPU type. */ + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) { + vcpu_init.target = kvm_arm_targets[i]->id; + err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init); + if (!err) + break; + } + + if (err || kvm_arm_targets[i]->init(vcpu)) + die("Unable to initialise ARM vcpu"); + + coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, + KVM_CAP_COALESCED_MMIO); + if (coalesced_offset) + vcpu->ring = (void *)vcpu->kvm_run + + (coalesced_offset * PAGE_SIZE); + + /* Populate the vcpu structure. */ + vcpu->kvm = kvm; + vcpu->cpu_id = cpu_id; + vcpu->cpu_type = vcpu_init.target; + vcpu->is_running = true; + return vcpu; +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + free(vcpu); +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, + u8 is_write) +{ + if (arm_addr_in_virtio_mmio_region(phys_addr)) + return kvm__emulate_mmio(kvm, phys_addr, data, len, is_write); + else if (arm_addr_in_pci_mmio_region(phys_addr)) + die("PCI emulation not supported on ARM!"); + + return false; +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ +} diff --git a/tools/kvm/arm/kvm.c b/tools/kvm/arm/kvm.c new file mode 100644 index 000000000000..1bcfce3c1f44 --- /dev/null +++ b/tools/kvm/arm/kvm.c @@ -0,0 +1,82 @@ +#include "kvm/kvm.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/virtio-console.h" + +#include "arm-common/gic.h" + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/sizes.h> + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) }, + { DEFINE_KVM_EXT(KVM_CAP_ONE_REG) }, + { DEFINE_KVM_EXT(KVM_CAP_ARM_PSCI) }, + { 0, 0 }, +}; 
+ +bool kvm__arch_cpu_supports_vm(void) +{ + /* The KVM capability check is enough. */ + return true; +} + +void kvm__init_ram(struct kvm *kvm) +{ + int err; + u64 phys_start, phys_size; + void *host_mem; + + phys_start = ARM_MEMORY_AREA; + phys_size = kvm->ram_size; + host_mem = kvm->ram_start; + + err = kvm__register_mem(kvm, phys_start, phys_size, host_mem); + if (err) + die("Failed to register %lld bytes of memory at physical " + "address 0x%llx [err %d]", phys_size, phys_start, err); + + kvm->arch.memory_guest_start = phys_start; +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size); +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + if (term_readable(0)) + virtio_console__inject_interrupt(kvm); +} + +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ +} + +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + /* + * Allocate guest memory. We must align out buffer to 64K to + * correlate with the maximum guest page size for virtio-mmio. + */ + kvm->ram_size = min(ram_size, (u64)ARM_MAX_MEMORY(kvm)); + kvm->arch.ram_alloc_size = kvm->ram_size + SZ_64K; + kvm->arch.ram_alloc_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, + kvm->arch.ram_alloc_size); + + if (kvm->arch.ram_alloc_start == MAP_FAILED) + die("Failed to map %lld bytes for guest memory (%d)", + kvm->arch.ram_alloc_size, errno); + + kvm->ram_start = (void *)ALIGN((unsigned long)kvm->arch.ram_alloc_start, + SZ_64K); + + madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size, + MADV_MERGEABLE); + + /* Initialise the virtual GIC. 
*/ + if (gic__init_irqchip(kvm)) + die("Failed to initialise virtual GIC"); +} diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c new file mode 100644 index 000000000000..d158acec63a1 --- /dev/null +++ b/tools/kvm/builtin-balloon.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-balloon.h> +#include <kvm/parse-options.h> +#include <kvm/kvm.h> +#include <kvm/kvm-ipc.h> + +static const char *instance_name; +static u64 inflate; +static u64 deflate; + +static const char * const balloon_usage[] = { + "lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]", + NULL +}; + +static const struct option balloon_options[] = { + OPT_GROUP("Instance options:"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_GROUP("Balloon options:"), + OPT_U64('i', "inflate", &inflate, "Amount to inflate (in MB)"), + OPT_U64('d', "deflate", &deflate, "Amount to deflate (in MB)"), + OPT_END(), +}; + +void kvm_balloon_help(void) +{ + usage_with_options(balloon_usage, balloon_options); +} + +static void parse_balloon_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, balloon_options, balloon_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_balloon_help(); + } +} + +int kvm_cmd_balloon(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + int amount; + + parse_balloon_options(argc, argv); + + if (inflate == 0 && deflate == 0) + kvm_balloon_help(); + + if (instance_name == NULL) + kvm_balloon_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + if (inflate) + amount = inflate; + else if (deflate) + amount = -deflate; + else + kvm_balloon_help(); + + r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON, + sizeof(amount), (u8 *)&amount); + + close(instance); + + if (r < 0) + return -1; + + return 0; 
+} diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c new file mode 100644 index 000000000000..4ae51d200374 --- /dev/null +++ b/tools/kvm/builtin-debug.c @@ -0,0 +1,110 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-debug.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> +#include <kvm/read-write.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#define BUFFER_SIZE 100 + +static bool all; +static int nmi = -1; +static bool dump; +static const char *instance_name; +static const char *sysrq; + +static const char * const debug_usage[] = { + "lkvm debug [--all] [-n name] [-d] [-m vcpu]", + NULL +}; + +static const struct option debug_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"), + OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"), + OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"), + OPT_GROUP("Instance options:"), + OPT_BOOLEAN('a', "all", &all, "Debug all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_debug_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, debug_options, debug_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_debug_help(); + } +} + +void kvm_debug_help(void) +{ + usage_with_options(debug_usage, debug_options); +} + +static int do_debug(const char *name, int sock) +{ + char buff[BUFFER_SIZE]; + struct debug_cmd_params cmd = {.dbg_type = 0}; + int r; + + if (dump) + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP; + + if (nmi != -1) { + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI; + cmd.cpu = nmi; + } + + if (sysrq) { + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ; + cmd.sysrq = sysrq[0]; + } + + r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd); + if (r < 0) + return r; + + if (!dump) + return 0; + + do { + r = xread(sock, buff, 
BUFFER_SIZE); + if (r < 0) + return 0; + printf("%.*s", r, buff); + } while (r > 0); + + return 0; +} + +int kvm_cmd_debug(int argc, const char **argv, const char *prefix) +{ + parse_debug_options(argc, argv); + int instance; + int r; + + if (all) + return kvm__enumerate_instances(do_debug); + + if (instance_name == NULL) + kvm_debug_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_debug(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c new file mode 100644 index 000000000000..5970fb7484f6 --- /dev/null +++ b/tools/kvm/builtin-help.c @@ -0,0 +1,63 @@ +#include <stdio.h> +#include <string.h> + +/* user defined headers */ +#include <common-cmds.h> + +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-help.h> +#include <kvm/kvm.h> + + +const char kvm_usage_string[] = + "lkvm COMMAND [ARGS]"; + +const char kvm_more_info_string[] = + "See 'lkvm help COMMAND' for more information on a specific command."; + + +static void list_common_cmds_help(void) +{ + unsigned int i, longest = 0; + + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + if (longest < strlen(common_cmds[i].name)) + longest = strlen(common_cmds[i].name); + } + + puts(" The most commonly used lkvm commands are:"); + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + printf(" %-*s ", longest, common_cmds[i].name); + puts(common_cmds[i].help); + } +} + +static void kvm_help(void) +{ + printf("\n To start a simple non-privileged shell run '%s run'\n\n" + "usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string); + list_common_cmds_help(); + printf("\n %s\n\n", kvm_more_info_string); +} + + +static void help_cmd(const char *cmd) +{ + struct cmd_struct *p; + p = kvm_get_command(kvm_commands, cmd); + if (!p) + kvm_help(); + else if (p->help) + p->help(); +} + +int kvm_cmd_help(int argc, const char **argv, const char *prefix) +{ + if (!argv 
|| !*argv) { + kvm_help(); + return 0; + } + help_cmd(argv[0]); + return 0; +} diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c new file mode 100644 index 000000000000..9299f17b6f00 --- /dev/null +++ b/tools/kvm/builtin-list.c @@ -0,0 +1,149 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <dirent.h> +#include <stdio.h> +#include <string.h> +#include <signal.h> +#include <fcntl.h> + +static bool run; +static bool rootfs; + +static const char * const list_usage[] = { + "lkvm list", + NULL +}; + +static const struct option list_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('i', "run", &run, "List running instances"), + OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"), + OPT_END() +}; + +#define KVM_INSTANCE_RUNNING "running" +#define KVM_INSTANCE_PAUSED "paused" +#define KVM_INSTANCE_SHUTOFF "shut off" + +void kvm_list_help(void) +{ + usage_with_options(list_usage, list_options); +} + +static pid_t get_pid(int sock) +{ + pid_t pid; + int r; + + r = kvm_ipc__send(sock, KVM_IPC_PID); + if (r < 0) + return r; + + r = read(sock, &pid, sizeof(pid)); + if (r < 0) + return r; + + return pid; +} + +int get_vmstate(int sock) +{ + int vmstate; + int r; + + r = kvm_ipc__send(sock, KVM_IPC_VMSTATE); + if (r < 0) + return r; + + r = read(sock, &vmstate, sizeof(vmstate)); + if (r < 0) + return r; + + return vmstate; + +} + +static int print_guest(const char *name, int sock) +{ + pid_t pid; + int vmstate; + + pid = get_pid(sock); + vmstate = get_vmstate(sock); + + if ((int)pid < 0 || vmstate < 0) + return -1; + + if (vmstate == KVM_VMSTATE_PAUSED) + printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED); + else + printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING); + + return 0; +} + +static int kvm_list_running_instances(void) +{ + return kvm__enumerate_instances(print_guest); +} + +static int 
kvm_list_rootfs(void) +{ + DIR *dir; + struct dirent *dirent; + + dir = opendir(kvm__get_dir()); + if (dir == NULL) + return -1; + + while ((dirent = readdir(dir))) { + if (dirent->d_type == DT_DIR && + strcmp(dirent->d_name, ".") && + strcmp(dirent->d_name, "..")) + printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF); + } + + return 0; +} + +static void parse_setup_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, list_options, list_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_list_help(); + } +} + +int kvm_cmd_list(int argc, const char **argv, const char *prefix) +{ + int r; + + parse_setup_options(argc, argv); + + if (!run && !rootfs) + run = rootfs = true; + + printf("%6s %-20s %s\n", "PID", "NAME", "STATE"); + printf("------------------------------------\n"); + + if (run) { + r = kvm_list_running_instances(); + if (r < 0) + perror("Error listing instances"); + } + + if (rootfs) { + r = kvm_list_rootfs(); + if (r < 0) + perror("Error listing rootfs"); + } + + return 0; +} diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c new file mode 100644 index 000000000000..c08595a304d1 --- /dev/null +++ b/tools/kvm/builtin-pause.c @@ -0,0 +1,88 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-pause.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const pause_usage[] = { + "lkvm pause [--all] [-n name]", + NULL +}; + +static const struct option pause_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Pause all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_pause_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, 
argv, pause_options, pause_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_pause_help(); + } +} + +void kvm_pause_help(void) +{ + usage_with_options(pause_usage, pause_options); +} + +static int do_pause(const char *name, int sock) +{ + int r; + int vmstate; + + vmstate = get_vmstate(sock); + if (vmstate < 0) + return vmstate; + if (vmstate == KVM_VMSTATE_PAUSED) { + printf("Guest %s is already paused.\n", name); + return 0; + } + + r = kvm_ipc__send(sock, KVM_IPC_PAUSE); + if (r) + return r; + + printf("Guest %s paused\n", name); + + return 0; +} + +int kvm_cmd_pause(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_pause_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_pause); + + if (instance_name == NULL) + kvm_pause_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_pause(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c new file mode 100644 index 000000000000..0e954b405ee8 --- /dev/null +++ b/tools/kvm/builtin-resume.c @@ -0,0 +1,88 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-resume.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const resume_usage[] = { + "lkvm resume [--all] [-n name]", + NULL +}; + +static const struct option resume_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Resume all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_resume_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, resume_options, resume_usage, + 
PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_resume_help(); + } +} + +void kvm_resume_help(void) +{ + usage_with_options(resume_usage, resume_options); +} + +static int do_resume(const char *name, int sock) +{ + int r; + int vmstate; + + vmstate = get_vmstate(sock); + if (vmstate < 0) + return vmstate; + if (vmstate == KVM_VMSTATE_RUNNING) { + printf("Guest %s is still running.\n", name); + return 0; + } + + r = kvm_ipc__send(sock, KVM_IPC_RESUME); + if (r) + return r; + + printf("Guest %s resumed\n", name); + + return 0; +} + +int kvm_cmd_resume(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_resume_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_resume); + + if (instance_name == NULL) + kvm_resume_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_resume(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c new file mode 100644 index 000000000000..d0b876a78819 --- /dev/null +++ b/tools/kvm/builtin-run.c @@ -0,0 +1,702 @@ +#include "kvm/builtin-run.h" + +#include "kvm/builtin-setup.h" +#include "kvm/virtio-balloon.h" +#include "kvm/virtio-console.h" +#include "kvm/parse-options.h" +#include "kvm/8250-serial.h" +#include "kvm/framebuffer.h" +#include "kvm/disk-image.h" +#include "kvm/threadpool.h" +#include "kvm/virtio-scsi.h" +#include "kvm/virtio-blk.h" +#include "kvm/virtio-net.h" +#include "kvm/virtio-rng.h" +#include "kvm/ioeventfd.h" +#include "kvm/virtio-9p.h" +#include "kvm/barrier.h" +#include "kvm/kvm-cpu.h" +#include "kvm/ioport.h" +#include "kvm/symbol.h" +#include "kvm/i8042.h" +#include "kvm/mutex.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/strbuf.h" +#include "kvm/vesa.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/rtc.h" +#include "kvm/sdl.h" +#include "kvm/vnc.h" 
+#include "kvm/guest_compat.h" +#include "kvm/pci-shmem.h" +#include "kvm/kvm-ipc.h" +#include "kvm/builtin-debug.h" + +#include <linux/types.h> +#include <linux/err.h> + +#include <sys/utsname.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <termios.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> +#include <stdio.h> + +#define MB_SHIFT (20) +#define KB_SHIFT (10) +#define GB_SHIFT (30) + +__thread struct kvm_cpu *current_kvm_cpu; + +static int kvm_run_wrapper; + +bool do_debug_print = false; + +extern char _binary_guest_init_start; +extern char _binary_guest_init_size; + +static const char * const run_usage[] = { + "lkvm run [<options>] [<kernel image>]", + NULL +}; + +enum { + KVM_RUN_DEFAULT, + KVM_RUN_SANDBOX, +}; + +static int img_name_parser(const struct option *opt, const char *arg, int unset) +{ + char path[PATH_MAX]; + struct stat st; + + snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg); + + if ((stat(arg, &st) == 0 && S_ISDIR(st.st_mode)) || + (stat(path, &st) == 0 && S_ISDIR(st.st_mode))) + return virtio_9p_img_name_parser(opt, arg, unset); + return disk_img_name_parser(opt, arg, unset); +} + +void kvm_run_set_wrapper_sandbox(void) +{ + kvm_run_wrapper = KVM_RUN_SANDBOX; +} + +#ifndef OPT_ARCH_RUN +#define OPT_ARCH_RUN(...) 
+#endif + +#define BUILD_OPTIONS(name, cfg, kvm) \ + struct option name[] = { \ + OPT_GROUP("Basic options:"), \ + OPT_STRING('\0', "name", &(cfg)->guest_name, "guest name", \ + "A name for the guest"), \ + OPT_INTEGER('c', "cpus", &(cfg)->nrcpus, "Number of CPUs"), \ + OPT_U64('m', "mem", &(cfg)->ram_size, "Virtual machine memory" \ + " size in MiB."), \ + OPT_CALLBACK('\0', "shmem", NULL, \ + "[pci:]<addr>:<size>[:handle=<handle>][:create]", \ + "Share host shmem with guest via pci device", \ + shmem_parser, NULL), \ + OPT_CALLBACK('d', "disk", kvm, "image or rootfs_dir", "Disk " \ + " image or rootfs directory", img_name_parser, \ + kvm), \ + OPT_BOOLEAN('\0', "balloon", &(cfg)->balloon, "Enable virtio" \ + " balloon"), \ + OPT_BOOLEAN('\0', "vnc", &(cfg)->vnc, "Enable VNC framebuffer"),\ + OPT_BOOLEAN('\0', "sdl", &(cfg)->sdl, "Enable SDL framebuffer"),\ + OPT_BOOLEAN('\0', "rng", &(cfg)->virtio_rng, "Enable virtio" \ + " Random Number Generator"), \ + OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name", \ + "Enable virtio 9p to share files between host and" \ + " guest", virtio_9p_rootdir_parser, kvm), \ + OPT_STRING('\0', "console", &(cfg)->console, "serial, virtio or"\ + " hv", "Console to use"), \ + OPT_STRING('\0', "dev", &(cfg)->dev, "device_file", \ + "KVM device file"), \ + OPT_CALLBACK('\0', "tty", NULL, "tty id", \ + "Remap guest TTY into a pty on the host", \ + tty_parser, NULL), \ + OPT_STRING('\0', "sandbox", &(cfg)->sandbox, "script", \ + "Run this script when booting into custom" \ + " rootfs"), \ + OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path", \ + "Hugetlbfs path"), \ + \ + OPT_GROUP("Kernel options:"), \ + OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel", \ + "Kernel to boot in virtual machine"), \ + OPT_STRING('i', "initrd", &(cfg)->initrd_filename, "initrd", \ + "Initial RAM disk image"), \ + OPT_STRING('p', "params", &(cfg)->kernel_cmdline, "params", \ + "Kernel command line arguments"), \ + OPT_STRING('f', 
"firmware", &(cfg)->firmware_filename, "firmware",\ + "Firmware image to boot in virtual machine"), \ + \ + OPT_GROUP("Networking options:"), \ + OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params", \ + "Create a new guest NIC", \ + netdev_parser, NULL, kvm), \ + OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \ + " DHCP in rootfs mode"), \ + \ + OPT_GROUP("Debug options:"), \ + OPT_BOOLEAN('\0', "debug", &do_debug_print, \ + "Enable debug messages"), \ + OPT_BOOLEAN('\0', "debug-single-step", &(cfg)->single_step, \ + "Enable single stepping"), \ + OPT_BOOLEAN('\0', "debug-ioport", &(cfg)->ioport_debug, \ + "Enable ioport debugging"), \ + OPT_BOOLEAN('\0', "debug-mmio", &(cfg)->mmio_debug, \ + "Enable MMIO debugging"), \ + OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay, \ + "Delay IO by millisecond"), \ + \ + OPT_ARCH(RUN, cfg) \ + OPT_END() \ + }; + +static void handle_sigalrm(int sig, siginfo_t *si, void *uc) +{ + struct kvm *kvm = si->si_value.sival_ptr; + + kvm__arch_periodic_poll(kvm); +} + +static void *kvm_cpu_thread(void *arg) +{ + char name[16]; + + current_kvm_cpu = arg; + + sprintf(name, "kvm-vcpu-%lu", current_kvm_cpu->cpu_id); + kvm__set_thread_name(name); + + if (kvm_cpu__start(current_kvm_cpu)) + goto panic_kvm; + + return (void *) (intptr_t) 0; + +panic_kvm: + fprintf(stderr, "KVM exit reason: %u (\"%s\")\n", + current_kvm_cpu->kvm_run->exit_reason, + kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]); + if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN) + fprintf(stderr, "KVM exit code: 0x%Lu\n", + current_kvm_cpu->kvm_run->hw.hardware_exit_reason); + + kvm_cpu__set_debug_fd(STDOUT_FILENO); + kvm_cpu__show_registers(current_kvm_cpu); + kvm_cpu__show_code(current_kvm_cpu); + kvm_cpu__show_page_tables(current_kvm_cpu); + + return (void *) (intptr_t) 1; +} + +static char kernel[PATH_MAX]; + +static const char *host_kernels[] = { + "/boot/vmlinuz", + "/boot/bzImage", + NULL +}; + +static const char 
*default_kernels[] = { + "./bzImage", + "arch/" BUILD_ARCH "/boot/bzImage", + "../../arch/" BUILD_ARCH "/boot/bzImage", + NULL +}; + +static const char *default_vmlinux[] = { + "vmlinux", + "../../../vmlinux", + "../../vmlinux", + NULL +}; + +static void kernel_usage_with_options(void) +{ + const char **k; + struct utsname uts; + + fprintf(stderr, "Fatal: could not find default kernel image in:\n"); + k = &default_kernels[0]; + while (*k) { + fprintf(stderr, "\t%s\n", *k); + k++; + } + + if (uname(&uts) < 0) + return; + + k = &host_kernels[0]; + while (*k) { + if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0) + return; + fprintf(stderr, "\t%s\n", kernel); + k++; + } + fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n", + KVM_BINARY_NAME); +} + +static u64 host_ram_size(void) +{ + long page_size; + long nr_pages; + + nr_pages = sysconf(_SC_PHYS_PAGES); + if (nr_pages < 0) { + pr_warning("sysconf(_SC_PHYS_PAGES) failed"); + return 0; + } + + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size < 0) { + pr_warning("sysconf(_SC_PAGE_SIZE) failed"); + return 0; + } + + return (nr_pages * page_size) >> MB_SHIFT; +} + +/* + * If user didn't specify how much memory it wants to allocate for the guest, + * avoid filling the whole host RAM. 
+ */ +#define RAM_SIZE_RATIO 0.8 + +static u64 get_ram_size(int nr_cpus) +{ + u64 available; + u64 ram_size; + + ram_size = 64 * (nr_cpus + 3); + + available = host_ram_size() * RAM_SIZE_RATIO; + if (!available) + available = MIN_RAM_SIZE_MB; + + if (ram_size > available) + ram_size = available; + + return ram_size; +} + +static const char *find_kernel(void) +{ + const char **k; + struct stat st; + struct utsname uts; + + k = &default_kernels[0]; + while (*k) { + if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) { + k++; + continue; + } + strncpy(kernel, *k, PATH_MAX); + return kernel; + } + + if (uname(&uts) < 0) + return NULL; + + k = &host_kernels[0]; + while (*k) { + if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0) + return NULL; + + if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) { + k++; + continue; + } + return kernel; + + } + return NULL; +} + +static const char *find_vmlinux(void) +{ + const char **vmlinux; + + vmlinux = &default_vmlinux[0]; + while (*vmlinux) { + struct stat st; + + if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) { + vmlinux++; + continue; + } + return *vmlinux; + } + return NULL; +} + +void kvm_run_help(void) +{ + struct kvm *kvm = NULL; + + BUILD_OPTIONS(options, &kvm->cfg, kvm); + usage_with_options(run_usage, options); +} + +static int kvm_setup_guest_init(struct kvm *kvm) +{ + const char *rootfs = kvm->cfg.custom_rootfs_name; + char tmp[PATH_MAX]; + size_t size; + int fd, ret; + char *data; + + /* Setup /virt/init */ + size = (size_t)&_binary_guest_init_size; + data = (char *)&_binary_guest_init_start; + snprintf(tmp, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), rootfs); + remove(tmp); + fd = open(tmp, O_CREAT | O_WRONLY, 0755); + if (fd < 0) + die("Fail to setup %s", tmp); + ret = xwrite(fd, data, size); + if (ret < 0) + die("Fail to setup %s", tmp); + close(fd); + + return 0; +} + +static int kvm_run_set_sandbox(struct kvm *kvm) +{ + const char *guestfs_name = kvm->cfg.custom_rootfs_name; + char path[PATH_MAX], 
script[PATH_MAX], *tmp; + + snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name); + + remove(path); + + if (kvm->cfg.sandbox == NULL) + return 0; + + tmp = realpath(kvm->cfg.sandbox, NULL); + if (tmp == NULL) + return -ENOMEM; + + snprintf(script, PATH_MAX, "/host/%s", tmp); + free(tmp); + + return symlink(script, path); +} + +static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg) +{ + const char *single_quote; + + if (!*arg) { /* zero length string */ + if (write(fd, "''", 2) <= 0) + die("Failed writing sandbox script"); + return; + } + + while (*arg) { + single_quote = strchrnul(arg, '\''); + + /* write non-single-quote string as #('string') */ + if (arg != single_quote) { + if (write(fd, "'", 1) <= 0 || + write(fd, arg, single_quote - arg) <= 0 || + write(fd, "'", 1) <= 0) + die("Failed writing sandbox script"); + } + + /* write single quote as #("'") */ + if (*single_quote) { + if (write(fd, "\"'\"", 3) <= 0) + die("Failed writing sandbox script"); + } else + break; + + arg = single_quote + 1; + } +} + +static void resolve_program(const char *src, char *dst, size_t len) +{ + struct stat st; + int err; + + err = stat(src, &st); + + if (!err && S_ISREG(st.st_mode)) { + char resolved_path[PATH_MAX]; + + if (!realpath(src, resolved_path)) + die("Unable to resolve program %s: %s\n", src, strerror(errno)); + + snprintf(dst, len, "/host%s", resolved_path); + } else + strncpy(dst, src, len); +} + +static void kvm_run_write_sandbox_cmd(struct kvm *kvm, const char **argv, int argc) +{ + const char script_hdr[] = "#! 
/bin/bash\n\n"; + char program[PATH_MAX]; + int fd; + + remove(kvm->cfg.sandbox); + + fd = open(kvm->cfg.sandbox, O_RDWR | O_CREAT, 0777); + if (fd < 0) + die("Failed creating sandbox script"); + + if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0) + die("Failed writing sandbox script"); + + resolve_program(argv[0], program, PATH_MAX); + kvm_write_sandbox_cmd_exactly(fd, program); + + argv++; + argc--; + + while (argc) { + if (write(fd, " ", 1) <= 0) + die("Failed writing sandbox script"); + + kvm_write_sandbox_cmd_exactly(fd, argv[0]); + argv++; + argc--; + } + if (write(fd, "\n", 1) <= 0) + die("Failed writing sandbox script"); + + close(fd); +} + +static struct kvm *kvm_cmd_run_init(int argc, const char **argv) +{ + static char real_cmdline[2048], default_name[20]; + unsigned int nr_online_cpus; + struct sigaction sa; + struct kvm *kvm = kvm__new(); + + if (IS_ERR(kvm)) + return kvm; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = handle_sigalrm; + sigemptyset(&sa.sa_mask); + sigaction(SIGALRM, &sa, NULL); + + nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN); + kvm->cfg.custom_rootfs_name = "default"; + + while (argc != 0) { + BUILD_OPTIONS(options, &kvm->cfg, kvm); + argc = parse_options(argc, argv, options, run_usage, + PARSE_OPT_STOP_AT_NON_OPTION | + PARSE_OPT_KEEP_DASHDASH); + if (argc != 0) { + /* Cusrom options, should have been handled elsewhere */ + if (strcmp(argv[0], "--") == 0) { + if (kvm_run_wrapper == KVM_RUN_SANDBOX) { + kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME; + kvm_run_write_sandbox_cmd(kvm, argv+1, argc-1); + break; + } + } + + if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kvm->cfg.kernel_filename) || + (kvm_run_wrapper == KVM_RUN_SANDBOX && kvm->cfg.sandbox)) { + fprintf(stderr, "Cannot handle parameter: " + "%s\n", argv[0]); + usage_with_options(run_usage, options); + free(kvm); + return ERR_PTR(-EINVAL); + } + if (kvm_run_wrapper == KVM_RUN_SANDBOX) { + /* + * first unhandled parameter is treated as + * sandbox command + */ + 
kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME; + kvm_run_write_sandbox_cmd(kvm, argv, argc); + } else { + /* + * first unhandled parameter is treated as a kernel + * image + */ + kvm->cfg.kernel_filename = argv[0]; + } + argv++; + argc--; + } + + } + + kvm->nr_disks = kvm->cfg.image_count; + + if (!kvm->cfg.kernel_filename) + kvm->cfg.kernel_filename = find_kernel(); + + if (!kvm->cfg.kernel_filename) { + kernel_usage_with_options(); + return ERR_PTR(-EINVAL); + } + + kvm->cfg.vmlinux_filename = find_vmlinux(); + kvm->vmlinux = kvm->cfg.vmlinux_filename; + + if (kvm->cfg.nrcpus == 0) + kvm->cfg.nrcpus = nr_online_cpus; + + if (!kvm->cfg.ram_size) + kvm->cfg.ram_size = get_ram_size(kvm->cfg.nrcpus); + + if (kvm->cfg.ram_size < MIN_RAM_SIZE_MB) + die("Not enough memory specified: %lluMB (min %lluMB)", kvm->cfg.ram_size, MIN_RAM_SIZE_MB); + + if (kvm->cfg.ram_size > host_ram_size()) + pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB", kvm->cfg.ram_size, host_ram_size()); + + kvm->cfg.ram_size <<= MB_SHIFT; + + if (!kvm->cfg.dev) + kvm->cfg.dev = DEFAULT_KVM_DEV; + + if (!kvm->cfg.console) + kvm->cfg.console = DEFAULT_CONSOLE; + + if (!strncmp(kvm->cfg.console, "virtio", 6)) + kvm->cfg.active_console = CONSOLE_VIRTIO; + else if (!strncmp(kvm->cfg.console, "serial", 6)) + kvm->cfg.active_console = CONSOLE_8250; + else if (!strncmp(kvm->cfg.console, "hv", 2)) + kvm->cfg.active_console = CONSOLE_HV; + else + pr_warning("No console!"); + + if (!kvm->cfg.host_ip) + kvm->cfg.host_ip = DEFAULT_HOST_ADDR; + + if (!kvm->cfg.guest_ip) + kvm->cfg.guest_ip = DEFAULT_GUEST_ADDR; + + if (!kvm->cfg.guest_mac) + kvm->cfg.guest_mac = DEFAULT_GUEST_MAC; + + if (!kvm->cfg.host_mac) + kvm->cfg.host_mac = DEFAULT_HOST_MAC; + + if (!kvm->cfg.script) + kvm->cfg.script = DEFAULT_SCRIPT; + + if (!kvm->cfg.network) + kvm->cfg.network = DEFAULT_NETWORK; + + memset(real_cmdline, 0, sizeof(real_cmdline)); + kvm__arch_set_cmdline(real_cmdline, kvm->cfg.vnc || kvm->cfg.sdl); 
+ + if (strlen(real_cmdline) > 0) + strcat(real_cmdline, " "); + + if (kvm->cfg.kernel_cmdline) + strlcat(real_cmdline, kvm->cfg.kernel_cmdline, sizeof(real_cmdline)); + + if (!kvm->cfg.guest_name) { + if (kvm->cfg.custom_rootfs) { + kvm->cfg.guest_name = kvm->cfg.custom_rootfs_name; + } else { + sprintf(default_name, "guest-%u", getpid()); + kvm->cfg.guest_name = default_name; + } + } + + if (!kvm->cfg.using_rootfs && !kvm->cfg.disk_image[0].filename && !kvm->cfg.initrd_filename) { + char tmp[PATH_MAX]; + + kvm_setup_create_new(kvm->cfg.custom_rootfs_name); + kvm_setup_resolv(kvm->cfg.custom_rootfs_name); + + snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default"); + if (virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + if (virtio_9p__register(kvm, "/", "hostfs") < 0) + die("Unable to initialize virtio 9p"); + kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1; + } + + if (kvm->cfg.using_rootfs) { + strcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p"); + if (kvm->cfg.custom_rootfs) { + kvm_run_set_sandbox(kvm); + + strcat(real_cmdline, " init=/virt/init"); + + if (!kvm->cfg.no_dhcp) + strcat(real_cmdline, " ip=dhcp"); + if (kvm_setup_guest_init(kvm)) + die("Failed to setup init for guest."); + } + } else if (!strstr(real_cmdline, "root=")) { + strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline)); + } + + kvm->cfg.real_cmdline = real_cmdline; + + printf(" # %s run -k %s -m %Lu -c %d --name %s\n", KVM_BINARY_NAME, + kvm->cfg.kernel_filename, kvm->cfg.ram_size / 1024 / 1024, kvm->cfg.nrcpus, kvm->cfg.guest_name); + + if (init_list__init(kvm) < 0) + die ("Initialisation failed"); + + return kvm; +} + +static int kvm_cmd_run_work(struct kvm *kvm) +{ + int i; + void *ret = NULL; + + for (i = 0; i < kvm->nrcpus; i++) { + if (pthread_create(&kvm->cpus[i]->thread, NULL, kvm_cpu_thread, kvm->cpus[i]) != 0) + die("unable to create KVM VCPU thread"); + } + + /* Only VCPU #0 
is going to exit by itself when shutting down */ + return pthread_join(kvm->cpus[0]->thread, &ret); +} + +static void kvm_cmd_run_exit(struct kvm *kvm, int guest_ret) +{ + compat__print_all_messages(); + + init_list__exit(kvm); + + if (guest_ret == 0) + printf("\n # KVM session ended normally.\n"); +} + +int kvm_cmd_run(int argc, const char **argv, const char *prefix) +{ + int ret = -EFAULT; + struct kvm *kvm; + + kvm = kvm_cmd_run_init(argc, argv); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + + ret = kvm_cmd_run_work(kvm); + kvm_cmd_run_exit(kvm, ret); + + return ret; +} diff --git a/tools/kvm/builtin-sandbox.c b/tools/kvm/builtin-sandbox.c new file mode 100644 index 000000000000..433f5361e8a8 --- /dev/null +++ b/tools/kvm/builtin-sandbox.c @@ -0,0 +1,9 @@ +#include "kvm/builtin-sandbox.h" +#include "kvm/builtin-run.h" + +int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix) +{ + kvm_run_set_wrapper_sandbox(); + + return kvm_cmd_run(argc, argv, prefix); +} diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c new file mode 100644 index 000000000000..8b45c5645ad4 --- /dev/null +++ b/tools/kvm/builtin-setup.c @@ -0,0 +1,258 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-setup.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/read-write.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <limits.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/mman.h> +#include <fcntl.h> + +extern char _binary_guest_init_start; +extern char _binary_guest_init_size; + +static const char *instance_name; + +static const char * const setup_usage[] = { + "lkvm setup [name]", + NULL +}; + +static const struct option setup_options[] = { + OPT_END() +}; + +static void parse_setup_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, setup_options, setup_usage, + PARSE_OPT_STOP_AT_NON_OPTION); 
+ if (argc != 0 && instance_name) + kvm_setup_help(); + else + instance_name = argv[0]; + argv++; + argc--; + } +} + +void kvm_setup_help(void) +{ + printf("\n%s setup creates a new rootfs under %s.\n" + "This can be used later by the '-d' parameter of '%s run'.\n", + KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME); + usage_with_options(setup_usage, setup_options); +} + +static int copy_file(const char *from, const char *to) +{ + int in_fd, out_fd; + void *src, *dst; + struct stat st; + int err = -1; + + in_fd = open(from, O_RDONLY); + if (in_fd < 0) + return err; + + if (fstat(in_fd, &st) < 0) + goto error_close_in; + + out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)); + if (out_fd < 0) + goto error_close_in; + + src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0); + if (src == MAP_FAILED) + goto error_close_out; + + if (ftruncate(out_fd, st.st_size) < 0) + goto error_munmap_src; + + dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0); + if (dst == MAP_FAILED) + goto error_munmap_src; + + memcpy(dst, src, st.st_size); + + if (fsync(out_fd) < 0) + goto error_munmap_dst; + + err = 0; + +error_munmap_dst: + munmap(dst, st.st_size); +error_munmap_src: + munmap(src, st.st_size); +error_close_out: + close(out_fd); +error_close_in: + close(in_fd); + + return err; +} + +static const char *guestfs_dirs[] = { + "/dev", + "/etc", + "/home", + "/host", + "/proc", + "/root", + "/sys", + "/tmp", + "/var", + "/var/lib", + "/virt", + "/virt/home", +}; + +static const char *guestfs_symlinks[] = { + "/bin", + "/lib", + "/lib64", + "/sbin", + "/usr", + "/etc/ld.so.conf", +}; + +static int copy_init(const char *guestfs_name) +{ + char path[PATH_MAX]; + size_t size; + int fd, ret; + char *data; + + size = (size_t)&_binary_guest_init_size; + data = (char *)&_binary_guest_init_start; + snprintf(path, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), guestfs_name); + remove(path); + fd = open(path, O_CREAT | O_WRONLY, 
0755); + if (fd < 0) + die("Fail to setup %s", path); + ret = xwrite(fd, data, size); + if (ret < 0) + die("Fail to setup %s", path); + close(fd); + + return 0; +} + +static int copy_passwd(const char *guestfs_name) +{ + char path[PATH_MAX]; + FILE *file; + int ret; + + snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name); + + file = fopen(path, "w"); + if (!file) + return -1; + + ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n"); + if (ret > 0) + ret = 0; + + fclose(file); + + return ret; +} + +static int make_guestfs_symlink(const char *guestfs_name, const char *path) +{ + char target[PATH_MAX]; + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path); + + snprintf(target, PATH_MAX, "/host%s", path); + + return symlink(target, name); +} + +static int make_dir(const char *dir) +{ + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir); + + return mkdir(name, 0777); +} + +static void make_guestfs_dir(const char *guestfs_name, const char *dir) +{ + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir); + + make_dir(name); +} + +void kvm_setup_resolv(const char *guestfs_name) +{ + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name); + + copy_file("/etc/resolv.conf", path); +} + +static int do_setup(const char *guestfs_name) +{ + unsigned int i; + int ret; + + ret = make_dir(guestfs_name); + if (ret < 0) + return ret; + + for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++) + make_guestfs_dir(guestfs_name, guestfs_dirs[i]); + + for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++) { + make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]); + } + + ret = copy_init(guestfs_name); + if (ret < 0) + return ret; + + return copy_passwd(guestfs_name); +} + +int kvm_setup_create_new(const char *guestfs_name) +{ + return do_setup(guestfs_name); +} + +int kvm_cmd_setup(int argc, const char **argv, const char *prefix) +{ + 
int r; + + parse_setup_options(argc, argv); + + if (instance_name == NULL) + kvm_setup_help(); + + r = do_setup(instance_name); + if (r == 0) + printf("A new rootfs '%s' has been created in '%s%s'.\n\n" + "You can now start it by running the following command:\n\n" + " %s run -d %s\n", + instance_name, kvm__get_dir(), instance_name, + KVM_BINARY_NAME,instance_name); + else + printf("Unable to create rootfs in %s%s: %s\n", + kvm__get_dir(), instance_name, strerror(errno)); + + return r; +} diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c new file mode 100644 index 000000000000..ffd72e80ba16 --- /dev/null +++ b/tools/kvm/builtin-stat.c @@ -0,0 +1,127 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-stat.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <sys/select.h> +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#include <linux/virtio_balloon.h> + +static bool mem; +static bool all; +static const char *instance_name; + +static const char * const stat_usage[] = { + "lkvm stat [command] [--all] [-n name]", + NULL +}; + +static const struct option stat_options[] = { + OPT_GROUP("Commands options:"), + OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"), + OPT_GROUP("Instance options:"), + OPT_BOOLEAN('a', "all", &all, "All instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_stat_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, stat_options, stat_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_stat_help(); + } +} + +void kvm_stat_help(void) +{ + usage_with_options(stat_usage, stat_options); +} + +static int do_memstat(const char *name, int sock) +{ + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + fd_set fdset; + struct timeval t = { .tv_sec = 1 }; + int r; + u8 i; + + FD_ZERO(&fdset); + FD_SET(sock, &fdset); + r = 
kvm_ipc__send(sock, KVM_IPC_STAT); + if (r < 0) + return r; + + r = select(1, &fdset, NULL, NULL, &t); + if (r < 0) { + pr_err("Could not retrieve mem stats from %s", name); + return r; + } + r = read(sock, &stats, sizeof(stats)); + if (r < 0) + return r; + + printf("\n\n\t*** Guest memory statistics ***\n\n"); + for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { + switch (stats[i].tag) { + case VIRTIO_BALLOON_S_SWAP_IN: + printf("The amount of memory that has been swapped in (in bytes):"); + break; + case VIRTIO_BALLOON_S_SWAP_OUT: + printf("The amount of memory that has been swapped out to disk (in bytes):"); + break; + case VIRTIO_BALLOON_S_MAJFLT: + printf("The number of major page faults that have occurred:"); + break; + case VIRTIO_BALLOON_S_MINFLT: + printf("The number of minor page faults that have occurred:"); + break; + case VIRTIO_BALLOON_S_MEMFREE: + printf("The amount of memory not being used for any purpose (in bytes):"); + break; + case VIRTIO_BALLOON_S_MEMTOT: + printf("The total amount of memory available (in bytes):"); + break; + } + printf("%llu\n", stats[i].val); + } + printf("\n"); + + return 0; +} + +int kvm_cmd_stat(int argc, const char **argv, const char *prefix) +{ + int instance; + int r = 0; + + parse_stat_options(argc, argv); + + if (!mem) + usage_with_options(stat_usage, stat_options); + + if (mem && all) + return kvm__enumerate_instances(do_memstat); + + if (instance_name == NULL) + kvm_stat_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + if (mem) + r = do_memstat(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c new file mode 100644 index 000000000000..6067630568df --- /dev/null +++ b/tools/kvm/builtin-stop.c @@ -0,0 +1,70 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-stop.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + 
+#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const stop_usage[] = { + "lkvm stop [--all] [-n name]", + NULL +}; + +static const struct option stop_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Stop all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_stop_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, stop_options, stop_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_stop_help(); + } +} + +void kvm_stop_help(void) +{ + usage_with_options(stop_usage, stop_options); +} + +static int do_stop(const char *name, int sock) +{ + return kvm_ipc__send(sock, KVM_IPC_STOP); +} + +int kvm_cmd_stop(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_stop_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_stop); + + if (instance_name == NULL) + kvm_stop_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_stop(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c new file mode 100644 index 000000000000..b8bb8597b97d --- /dev/null +++ b/tools/kvm/builtin-version.c @@ -0,0 +1,15 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-version.h> +#include <kvm/kvm.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +int kvm_cmd_version(int argc, const char **argv, const char *prefix) +{ + printf("kvm tool %s\n", KVMTOOLS_VERSION); + + return 0; +} diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h new file mode 100644 index 000000000000..d93e48010b61 --- /dev/null +++ b/tools/kvm/code16gcc.h @@ -0,0 +1,15 @@ +/* + * code16gcc.h + * + * This file is -include'd when 
compiling 16-bit C code. + * Note: this asm() needs to be emitted before gcc emits any code. + * Depending on gcc version, this requires -fno-unit-at-a-time or + * -fno-toplevel-reorder. + * + * Hopefully gcc will eventually have a real -m16 option so we can + * drop this hack long term. + */ + +#ifndef __ASSEMBLY__ +asm(".code16gcc"); +#endif diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt new file mode 100644 index 000000000000..d93597dc551d --- /dev/null +++ b/tools/kvm/command-list.txt @@ -0,0 +1,15 @@ +# +# List of known perf commands. +# command name category [deprecated] [common] +# +lkvm-run mainporcelain common +lkvm-setup mainporcelain common +lkvm-pause common +lkvm-resume common +lkvm-version common +lkvm-list common +lkvm-debug common +lkvm-balloon common +lkvm-stop common +lkvm-stat common +lkvm-sandbox common diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak new file mode 100644 index 000000000000..4a81f562903b --- /dev/null +++ b/tools/kvm/config/feature-tests.mak @@ -0,0 +1,177 @@ +define SOURCE_HELLO +#include <stdio.h> +int main(void) +{ + return puts(\"hi\"); +} +endef + +ifndef NO_DWARF +define SOURCE_DWARF +#include <dwarf.h> +#include <elfutils/libdw.h> +#include <elfutils/version.h> +#ifndef _ELFUTILS_PREREQ +#error +#endif + +int main(void) +{ + Dwarf *dbg = dwarf_begin(0, DWARF_C_READ); + return (long)dbg; +} +endef +endif + +define SOURCE_LIBELF +#include <libelf.h> + +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ, 0); + return (long)elf; +} +endef + +define SOURCE_GLIBC +#include <gnu/libc-version.h> + +int main(void) +{ + const char *version = gnu_get_libc_version(); + return (long)version; +} +endef + +define SOURCE_ELF_MMAP +#include <libelf.h> +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0); + return (long)elf; +} +endef + +ifndef NO_NEWT +define SOURCE_NEWT +#include <newt.h> + +int main(void) +{ + newtInit(); + newtCls(); + return newtFinished(); +} 
+endef +endif + +ifndef NO_LIBPERL +define SOURCE_PERL_EMBED +#include <EXTERN.h> +#include <perl.h> + +int main(void) +{ +perl_alloc(); +return 0; +} +endef +endif + +ifndef NO_LIBPYTHON +define SOURCE_PYTHON_VERSION +#include <Python.h> +#if PY_VERSION_HEX >= 0x03000000 + #error +#endif +int main(void){} +endef +define SOURCE_PYTHON_EMBED +#include <Python.h> +int main(void) +{ + Py_Initialize(); + return 0; +} +endef +endif + +define SOURCE_BFD +#include <bfd.h> + +int main(void) +{ + bfd_demangle(0, 0, 0); + return 0; +} +endef + +define SOURCE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); + +int main(void) +{ + cplus_demangle(0, 0); + return 0; +} +endef + +define SOURCE_STRLCPY +#include <stdlib.h> +extern size_t strlcpy(char *dest, const char *src, size_t size); + +int main(void) +{ + strlcpy(NULL, NULL, 0); + return 0; +} +endef + +define SOURCE_VNCSERVER +#include <rfb/rfb.h> + +int main(void) +{ + rfbIsActive((void *)0); + return 0; +} +endef + +define SOURCE_SDL +#include <SDL/SDL.h> + +int main(void) +{ + SDL_Init(SDL_INIT_VIDEO); + return 0; +} +endef + +define SOURCE_ZLIB +#include <zlib.h> + +int main(void) +{ + inflateInit2(NULL, 0); + return 0; +} +endef + +define SOURCE_AIO +#include <libaio.h> + +int main(void) +{ + io_setup(0, NULL); + return 0; +} +endef + +define SOURCE_STATIC +#include <stdlib.h> + +int main(void) +{ + return 0; +} +endef diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak new file mode 100644 index 000000000000..a70963b33b0f --- /dev/null +++ b/tools/kvm/config/utilities.mak @@ -0,0 +1,196 @@ +# This allows us to work with the newline character: +define newline + + +endef +newline := $(newline) + +# nl-escape +# +# Usage: escape = $(call nl-escape[,escape]) +# +# This is used as the common way to specify +# what should replace a newline when escaping +# newlines; the default is a bizarre string. 
+# +nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n) + +# escape-nl +# +# Usage: escaped-text = $(call escape-nl,text[,escape]) +# +# GNU make's $(shell ...) function converts to a +# single space each newline character in the output +# produced during the expansion; this may not be +# desirable. +# +# The only solution is to change each newline into +# something that won't be converted, so that the +# information can be recovered later with +# $(call unescape-nl...) +# +escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1)) + +# unescape-nl +# +# Usage: text = $(call unescape-nl,escaped-text[,escape]) +# +# See escape-nl. +# +unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1)) + +# shell-escape-nl +# +# Usage: $(shell some-command | $(call shell-escape-nl[,escape])) +# +# Use this to escape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as a string constant +# in an `awk' program that is delimited by shell +# single-quotes, so be wary of the characters +# that are chosen. +# +define shell-escape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}' +endef + +# shell-unescape-nl +# +# Usage: $(shell some-command | $(call shell-unescape-nl[,escape])) +# +# Use this to unescape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as an extended regular +# expression constant in an `awk' program that is +# delimited by shell single-quotes, so be wary +# of the characters that are chosen. +# +# (The bash shell has a bug where `{gsub(...),...}' is +# misinterpreted as a brace expansion; this can be +# overcome by putting a space between `{' and `gsub'). 
+# +define shell-unescape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }' +endef + +# escape-for-shell-sq +# +# Usage: embeddable-text = $(call escape-for-shell-sq,text) +# +# This function produces text that is suitable for +# embedding in a shell string that is delimited by +# single-quotes. +# +escape-for-shell-sq = $(subst ','\'',$(1)) + +# shell-sq +# +# Usage: single-quoted-and-escaped-text = $(call shell-sq,text) +# +shell-sq = '$(escape-for-shell-sq)' + +# shell-wordify +# +# Usage: wordified-text = $(call shell-wordify,text) +# +# For instance: +# +# |define text +# |hello +# |world +# |endef +# | +# |target: +# | echo $(call shell-wordify,$(text)) +# +# At least GNU make gets confused by expanding a newline +# within the context of a command line of a makefile rule +# (this is in constrast to a `$(shell ...)' function call, +# which can handle it just fine). +# +# This function avoids the problem by producing a string +# that works as a shell word, regardless of whether or +# not it contains a newline. +# +# If the text to be wordified contains a newline, then +# an intrictate shell command substitution is constructed +# to render the text as a single line; when the shell +# processes the resulting escaped text, it transforms +# it into the original unescaped text. +# +# If the text does not contain a newline, then this function +# produces the same results as the `$(shell-sq)' function. +# +shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq)) +define _sw-esc-nl +"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))" +endef + +# is-absolute +# +# Usage: bool-value = $(call is-absolute,path) +# +is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y) + +# lookup +# +# Usage: absolute-executable-path-or-empty = $(call lookup,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). 
+# +lookup = $(call unescape-nl,$(shell sh -c $(_l-sh))) +_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,)) + +# is-executable +# +# Usage: bool-value = $(call is-executable,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). +# +is-executable = $(call _is-executable-helper,$(shell-sq)) +_is-executable-helper = $(shell sh -c $(_is-executable-sh)) +_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y) + +# get-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable,path) +# +# The goal is to get an absolute path for an executable; +# the `command -v' is defined by POSIX, but it's not +# necessarily very portable, so it's only used if +# relative path resolution is requested, as determined +# by the presence of a leading `/'. +# +get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup))) +_ge-abspath = $(if $(is-executable),$(1)) + +# get-supplied-or-default-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default) +# +define get-executable-or-default +$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) +endef +_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) +_gea_warn = $(warning The path '$(1)' is not executable.) 
+_gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) + +# try-cc +# Usage: option = $(call try-cc, source-to-build, cc-options) +try-cc = $(shell sh -c \ + 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ + echo "$(1)" | \ + $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \ + rm -f "$$TMP"') + +# try-build +# Usage: option = $(call try-build, source-to-build, cc-options, link-options) +try-build = $(shell sh -c \ + 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ + echo "$(1)" | \ + $(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y; \ + rm -f "$$TMP"') diff --git a/tools/kvm/devices.c b/tools/kvm/devices.c new file mode 100644 index 000000000000..9f1941d8f7c4 --- /dev/null +++ b/tools/kvm/devices.c @@ -0,0 +1,86 @@ +#include "kvm/devices.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <linux/rbtree.h> + +struct device_bus { + struct rb_root root; + int dev_num; +}; + +static struct device_bus device_trees[DEVICE_BUS_MAX] = { + [0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 }, +}; + +int device__register(struct device_header *dev) +{ + struct device_bus *bus; + struct rb_node **node, *parent = NULL; + + if (dev->bus_type >= DEVICE_BUS_MAX) { + pr_warning("Ignoring device registration on unknown bus %d\n", + dev->bus_type); + return -EINVAL; + } + + bus = &device_trees[dev->bus_type]; + dev->dev_num = bus->dev_num++; + + node = &bus->root.rb_node; + while (*node) { + int num = rb_entry(*node, struct device_header, node)->dev_num; + int result = dev->dev_num - num; + + if (result < 0) + node = &((*node)->rb_left); + else if (result > 0) + node = &((*node)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&dev->node, parent, node); + rb_insert_color(&dev->node, &bus->root); + return 0; +} + +struct device_header *device__find_dev(enum device_bus_type bus_type, u8 dev_num) +{ + struct rb_node *node; + + if (bus_type >= DEVICE_BUS_MAX) + return ERR_PTR(-EINVAL); + + node = device_trees[bus_type].root.rb_node; + while (node) { + struct device_header 
*dev = rb_entry(node, struct device_header, + node); + if (dev_num < dev->dev_num) { + node = node->rb_left; + } else if (dev_num > dev->dev_num) { + node = node->rb_right; + } else { + return dev; + } + } + + return NULL; +} + +struct device_header *device__first_dev(enum device_bus_type bus_type) +{ + struct rb_node *node; + + if (bus_type >= DEVICE_BUS_MAX) + return NULL; + + node = rb_first(&device_trees[bus_type].root); + return node ? rb_entry(node, struct device_header, node) : NULL; +} + +struct device_header *device__next_dev(struct device_header *dev) +{ + struct rb_node *node = rb_next(&dev->node); + return node ? rb_entry(node, struct device_header, node) : NULL; +} diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c new file mode 100644 index 000000000000..37581d33136b --- /dev/null +++ b/tools/kvm/disk/blk.c @@ -0,0 +1,76 @@ +#include "kvm/disk-image.h" + +#include <linux/err.h> +#include <mntent.h> + +/* + * raw image and blk dev are similar, so reuse raw image ops. + */ +static struct disk_image_operations blk_dev_ops = { + .read = raw_image__read, + .write = raw_image__write, +}; + +static bool is_mounted(struct stat *st) +{ + struct stat st_buf; + struct mntent *mnt; + FILE *f; + + f = setmntent("/proc/mounts", "r"); + if (!f) + return false; + + while ((mnt = getmntent(f)) != NULL) { + if (stat(mnt->mnt_fsname, &st_buf) == 0 && + S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) { + fclose(f); + return true; + } + } + + fclose(f); + return false; +} + +struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st) +{ + struct disk_image *disk; + int fd, r; + u64 size; + + if (!S_ISBLK(st->st_mode)) + return ERR_PTR(-EINVAL); + + if (is_mounted(st)) { + pr_err("Block device %s is already mounted! Unmount before use.", + filename); + return ERR_PTR(-EINVAL); + } + + /* + * Be careful! We are opening host block device! + * Open it readonly since we do not want to break user's data on disk. 
+ */ + fd = open(filename, flags); + if (fd < 0) + return ERR_PTR(fd); + + if (ioctl(fd, BLKGETSIZE64, &size) < 0) { + r = -errno; + close(fd); + return ERR_PTR(r); + } + + /* + * FIXME: This will not work on 32-bit host because we can not + * mmap large disk. There is not enough virtual address space + * in 32-bit host. However, this works on 64-bit host. + */ + disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + return disk; +} diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c new file mode 100644 index 000000000000..4e9bda01c6d0 --- /dev/null +++ b/tools/kvm/disk/core.c @@ -0,0 +1,356 @@ +#include "kvm/disk-image.h" +#include "kvm/qcow.h" +#include "kvm/virtio-blk.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <sys/eventfd.h> +#include <sys/poll.h> + +#define AIO_MAX 256 + +int debug_iodelay; + +static int disk_image__close(struct disk_image *disk); + +int disk_img_name_parser(const struct option *opt, const char *arg, int unset) +{ + const char *cur; + char *sep; + struct kvm *kvm = opt->ptr; + + if (kvm->cfg.image_count >= MAX_DISK_IMAGES) + die("Currently only 4 images are supported"); + + kvm->cfg.disk_image[kvm->cfg.image_count].filename = arg; + cur = arg; + + if (strncmp(arg, "scsi:", 5) == 0) { + sep = strstr(arg, ":"); + if (sep) + kvm->cfg.disk_image[kvm->cfg.image_count].wwpn = sep + 1; + sep = strstr(sep + 1, ":"); + if (sep) { + *sep = 0; + kvm->cfg.disk_image[kvm->cfg.image_count].tpgt = sep + 1; + } + cur = sep + 1; + } + + do { + sep = strstr(cur, ","); + if (sep) { + if (strncmp(sep + 1, "ro", 2) == 0) + kvm->cfg.disk_image[kvm->cfg.image_count].readonly = true; + else if (strncmp(sep + 1, "direct", 6) == 0) + kvm->cfg.disk_image[kvm->cfg.image_count].direct = true; + *sep = 0; + cur = sep + 1; + } + } while (sep); + + kvm->cfg.image_count++; + + return 0; +} + +#ifdef CONFIG_HAS_AIO +static void *disk_image__thread(void 
*param) +{ + struct disk_image *disk = param; + struct io_event event[AIO_MAX]; + struct timespec notime = {0}; + int nr, i; + u64 dummy; + + kvm__set_thread_name("disk-image-io"); + + while (read(disk->evt, &dummy, sizeof(dummy)) > 0) { + nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, ¬ime); + for (i = 0; i < nr; i++) + disk->disk_req_cb(event[i].data, event[i].res); + } + + return NULL; +} +#endif + +struct disk_image *disk_image__new(int fd, u64 size, + struct disk_image_operations *ops, + int use_mmap) +{ + struct disk_image *disk; + int r; + + disk = malloc(sizeof *disk); + if (!disk) + return ERR_PTR(-ENOMEM); + + *disk = (struct disk_image) { + .fd = fd, + .size = size, + .ops = ops, + }; + + if (use_mmap == DISK_IMAGE_MMAP) { + /* + * The write to disk image will be discarded + */ + disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0); + if (disk->priv == MAP_FAILED) { + r = -errno; + free(disk); + return ERR_PTR(r); + } + } + +#ifdef CONFIG_HAS_AIO + { + pthread_t thread; + + disk->evt = eventfd(0, 0); + io_setup(AIO_MAX, &disk->ctx); + r = pthread_create(&thread, NULL, disk_image__thread, disk); + if (r) { + r = -errno; + free(disk); + return ERR_PTR(r); + } + } +#endif + return disk; +} + +static struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct) +{ + struct disk_image *disk; + struct stat st; + int fd, flags; + + if (readonly) + flags = O_RDONLY; + else + flags = O_RDWR; + if (direct) + flags |= O_DIRECT; + + if (stat(filename, &st) < 0) + return ERR_PTR(-errno); + + /* blk device ?*/ + disk = blkdev__probe(filename, flags, &st); + if (!IS_ERR_OR_NULL(disk)) + return disk; + + fd = open(filename, flags); + if (fd < 0) + return ERR_PTR(fd); + + /* qcow image ?*/ + disk = qcow_probe(fd, true); + if (!IS_ERR_OR_NULL(disk)) { + pr_warning("Forcing read-only support for QCOW"); + return disk; + } + + /* raw image ?*/ + disk = raw_image__probe(fd, &st, readonly); + if (!IS_ERR_OR_NULL(disk)) 
+ return disk; + + if (close(fd) < 0) + pr_warning("close() failed"); + + return ERR_PTR(-ENOSYS); +} + +static struct disk_image **disk_image__open_all(struct kvm *kvm) +{ + struct disk_image **disks; + const char *filename; + const char *wwpn; + const char *tpgt; + bool readonly; + bool direct; + void *err; + int i; + struct disk_image_params *params = (struct disk_image_params *)&kvm->cfg.disk_image; + int count = kvm->cfg.image_count; + + if (!count) + return ERR_PTR(-EINVAL); + if (count > MAX_DISK_IMAGES) + return ERR_PTR(-ENOSPC); + + disks = calloc(count, sizeof(*disks)); + if (!disks) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + filename = params[i].filename; + readonly = params[i].readonly; + direct = params[i].direct; + wwpn = params[i].wwpn; + tpgt = params[i].tpgt; + + if (wwpn) { + disks[i] = malloc(sizeof(struct disk_image)); + if (!disks[i]) + return ERR_PTR(-ENOMEM); + disks[i]->wwpn = wwpn; + disks[i]->tpgt = tpgt; + continue; + } + + if (!filename) + continue; + + disks[i] = disk_image__open(filename, readonly, direct); + if (IS_ERR_OR_NULL(disks[i])) { + pr_err("Loading disk image '%s' failed", filename); + err = disks[i]; + goto error; + } + disks[i]->debug_iodelay = kvm->cfg.debug_iodelay; + } + + return disks; +error: + for (i = 0; i < count; i++) + if (!IS_ERR_OR_NULL(disks[i])) + disk_image__close(disks[i]); + + free(disks); + return err; +} + +int disk_image__flush(struct disk_image *disk) +{ + if (disk->ops->flush) + return disk->ops->flush(disk); + + return fsync(disk->fd); +} + +static int disk_image__close(struct disk_image *disk) +{ + /* If there was no disk image then there's nothing to do: */ + if (!disk) + return 0; + + if (disk->ops->close) + return disk->ops->close(disk); + + if (close(disk->fd) < 0) + pr_warning("close() failed"); + + free(disk); + + return 0; +} + +static int disk_image__close_all(struct disk_image **disks, int count) +{ + while (count) + disk_image__close(disks[--count]); + + free(disks); + + 
return 0; +} + +/* + * Fill iov with disk data, starting from sector 'sector'. + * Return amount of bytes read. + */ +ssize_t disk_image__read(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t total = 0; + + if (debug_iodelay) + msleep(debug_iodelay); + + if (disk->ops->read) { + total = disk->ops->read(disk, sector, iov, iovcount, param); + if (total < 0) { + pr_info("disk_image__read error: total=%ld\n", (long)total); + return total; + } + } + + if (!disk->async && disk->disk_req_cb) + disk->disk_req_cb(param, total); + + return total; +} + +/* + * Write iov to disk, starting from sector 'sector'. + * Return amount of bytes written. + */ +ssize_t disk_image__write(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t total = 0; + + if (debug_iodelay) + msleep(debug_iodelay); + + if (disk->ops->write) { + /* + * Try writev based operation first + */ + + total = disk->ops->write(disk, sector, iov, iovcount, param); + if (total < 0) { + pr_info("disk_image__write error: total=%ld\n", (long)total); + return total; + } + } else { + /* Do nothing */ + } + + if (!disk->async && disk->disk_req_cb) + disk->disk_req_cb(param, total); + + return total; +} + +ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len) +{ + struct stat st; + int r; + + r = fstat(disk->fd, &st); + if (r) + return r; + + *len = snprintf(buffer, *len, "%llu%llu%llu", + (u64)st.st_dev, (u64)st.st_rdev, (u64)st.st_ino); + return *len; +} + +void disk_image__set_callback(struct disk_image *disk, + void (*disk_req_cb)(void *param, long len)) +{ + disk->disk_req_cb = disk_req_cb; +} + +int disk_image__init(struct kvm *kvm) +{ + if (kvm->cfg.image_count) { + kvm->disks = disk_image__open_all(kvm); + if (IS_ERR(kvm->disks)) + return PTR_ERR(kvm->disks); + } + + return 0; +} +dev_base_init(disk_image__init); + +int disk_image__exit(struct kvm *kvm) +{ + return 
disk_image__close_all(kvm->disks, kvm->nr_disks); +} +dev_base_exit(disk_image__exit); diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c new file mode 100644 index 000000000000..64a25509899e --- /dev/null +++ b/tools/kvm/disk/qcow.c @@ -0,0 +1,1527 @@ +#include "kvm/qcow.h" + +#include "kvm/disk-image.h" +#include "kvm/read-write.h" +#include "kvm/mutex.h" +#include "kvm/util.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#ifdef CONFIG_HAS_ZLIB +#include <zlib.h> +#endif + +#include <linux/err.h> +#include <linux/byteorder.h> +#include <linux/kernel.h> +#include <linux/types.h> + +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append); +static int qcow_write_refcount_table(struct qcow *q); +static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref); +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size); + +static inline int qcow_pwrite_sync(int fd, + void *buf, size_t count, off_t offset) +{ + if (pwrite_in_full(fd, buf, count, offset) < 0) + return -1; + + return fdatasync(fd); +} + +static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new) +{ + struct rb_node **link = &(root->rb_node), *parent = NULL; + u64 offset = new->offset; + + /* search the tree */ + while (*link) { + struct qcow_l2_table *t; + + t = rb_entry(*link, struct qcow_l2_table, node); + if (!t) + goto error; + + parent = *link; + + if (t->offset > offset) + link = &(*link)->rb_left; + else if (t->offset < offset) + link = &(*link)->rb_right; + else + goto out; + } + + /* add new node */ + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); +out: + return 0; +error: + return -1; +} + +static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset) +{ + struct rb_node *link = root->rb_node; + + while (link) { + struct qcow_l2_table *t; + + t = 
rb_entry(link, struct qcow_l2_table, node); + if (!t) + goto out; + + if (t->offset > offset) + link = link->rb_left; + else if (t->offset < offset) + link = link->rb_right; + else + return t; + } +out: + return NULL; +} + +static void l1_table_free_cache(struct qcow_l1_table *l1t) +{ + struct rb_root *r = &l1t->root; + struct list_head *pos, *n; + struct qcow_l2_table *t; + + list_for_each_safe(pos, n, &l1t->lru_list) { + /* Remove cache table from the list and RB tree */ + list_del(pos); + t = list_entry(pos, struct qcow_l2_table, list); + rb_erase(&t->node, r); + + /* Free the cached node */ + free(t); + } +} + +static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c) +{ + struct qcow_header *header = q->header; + u64 size; + + if (!c->dirty) + return 0; + + size = 1 << header->l2_bits; + + if (qcow_pwrite_sync(q->fd, c->table, + size * sizeof(u64), c->offset) < 0) + return -1; + + c->dirty = 0; + + return 0; +} + +static int cache_table(struct qcow *q, struct qcow_l2_table *c) +{ + struct qcow_l1_table *l1t = &q->table; + struct rb_root *r = &l1t->root; + struct qcow_l2_table *lru; + + if (l1t->nr_cached == MAX_CACHE_NODES) { + /* + * The node at the head of the list is least recently used + * node. Remove it from the list and replaced with a new node. 
+ */ + lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list); + + /* Remove the node from the cache */ + rb_erase(&lru->node, r); + list_del_init(&lru->list); + l1t->nr_cached--; + + /* Free the LRUed node */ + free(lru); + } + + /* Add new node in RB Tree: Helps in searching faster */ + if (l2_table_insert(r, c) < 0) + goto error; + + /* Add in LRU replacement list */ + list_add_tail(&c->list, &l1t->lru_list); + l1t->nr_cached++; + + return 0; +error: + return -1; +} + +static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset) +{ + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + + l2t = l2_table_lookup(&l1t->root, offset); + if (!l2t) + return NULL; + + /* Update the LRU state, by moving the searched node to list tail */ + list_move_tail(&l2t->list, &l1t->lru_list); + + return l2t; +} + +/* Allocates a new node for caching L2 table */ +static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + struct qcow_l2_table *c; + u64 l2t_sz; + u64 size; + + l2t_sz = 1 << header->l2_bits; + size = sizeof(*c) + l2t_sz * sizeof(u64); + c = calloc(1, size); + if (!c) + goto out; + + c->offset = offset; + RB_CLEAR_NODE(&c->node); + INIT_LIST_HEAD(&c->list); +out: + return c; +} + +static inline u64 get_l1_index(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return offset >> (header->l2_bits + header->cluster_bits); +} + +static inline u64 get_l2_index(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1); +} + +static inline u64 get_cluster_offset(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return offset & ((1 << header->cluster_bits)-1); +} + +static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + struct qcow_l2_table *l2t; + u64 size; + + size = 1 
<< header->l2_bits; + + /* search an entry for offset in cache */ + l2t = l2_table_search(q, offset); + if (l2t) + return l2t; + + /* allocate new node for caching l2 table */ + l2t = new_cache_table(q, offset); + if (!l2t) + goto error; + + /* table not cached: read from the disk */ + if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0) + goto error; + + /* cache the table */ + if (cache_table(q, l2t) < 0) + goto error; + + return l2t; +error: + free(l2t); + return NULL; +} + +static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size, + const u8 *buf, int buf_size) +{ +#ifdef CONFIG_HAS_ZLIB + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (u8 *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + + inflateEnd(strm); + return 0; +#else + return -1; +#endif +} + +static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, + void *dst, u32 dst_len) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 clust_offset; + u64 clust_start; + u64 l2t_offset; + size_t length; + u64 l2t_size; + u64 l1_idx; + u64 l2_idx; + int coffset; + int csize; + + l1_idx = get_l1_index(q, offset); + if (l1_idx >= l1t->table_size) + return -1; + + clust_offset = get_cluster_offset(q, offset); + if (clust_offset >= q->cluster_size) + return -1; + + length = q->cluster_size - clust_offset; + if (length > dst_len) + length = dst_len; + + mutex_lock(&q->mutex); + + l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); + if (!l2t_offset) + goto zero_cluster; + + l2t_size = 1 << header->l2_bits; + + /* read and cache level 2 table */ + l2t = 
qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto out_error; + + l2_idx = get_l2_index(q, offset); + if (l2_idx >= l2t_size) + goto out_error; + + clust_start = be64_to_cpu(l2t->table[l2_idx]); + if (clust_start & QCOW1_OFLAG_COMPRESSED) { + coffset = clust_start & q->cluster_offset_mask; + csize = clust_start >> (63 - q->header->cluster_bits); + csize &= (q->cluster_size - 1); + + if (pread_in_full(q->fd, q->cluster_data, csize, + coffset) < 0) + goto out_error; + + if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, + q->cluster_data, csize) < 0) + goto out_error; + + memcpy(dst, q->cluster_cache + clust_offset, length); + mutex_unlock(&q->mutex); + } else { + if (!clust_start) + goto zero_cluster; + + mutex_unlock(&q->mutex); + + if (pread_in_full(q->fd, dst, length, + clust_start + clust_offset) < 0) + return -1; + } + + return length; + +zero_cluster: + mutex_unlock(&q->mutex); + memset(dst, 0, length); + return length; + +out_error: + mutex_unlock(&q->mutex); + length = -1; + return -1; +} + +static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset, + void *dst, u32 dst_len) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 clust_offset; + u64 clust_start; + u64 l2t_offset; + size_t length; + u64 l2t_size; + u64 l1_idx; + u64 l2_idx; + int coffset; + int sector_offset; + int nb_csectors; + int csize; + + l1_idx = get_l1_index(q, offset); + if (l1_idx >= l1t->table_size) + return -1; + + clust_offset = get_cluster_offset(q, offset); + if (clust_offset >= q->cluster_size) + return -1; + + length = q->cluster_size - clust_offset; + if (length > dst_len) + length = dst_len; + + mutex_lock(&q->mutex); + + l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); + + l2t_offset &= ~QCOW2_OFLAG_COPIED; + if (!l2t_offset) + goto zero_cluster; + + l2t_size = 1 << header->l2_bits; + + /* read and cache level 2 table */ + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto out_error; 
+ + l2_idx = get_l2_index(q, offset); + if (l2_idx >= l2t_size) + goto out_error; + + clust_start = be64_to_cpu(l2t->table[l2_idx]); + if (clust_start & QCOW2_OFLAG_COMPRESSED) { + coffset = clust_start & q->cluster_offset_mask; + nb_csectors = ((clust_start >> q->csize_shift) + & q->csize_mask) + 1; + sector_offset = coffset & (SECTOR_SIZE - 1); + csize = nb_csectors * SECTOR_SIZE - sector_offset; + + if (pread_in_full(q->fd, q->cluster_data, + nb_csectors * SECTOR_SIZE, + coffset & ~(SECTOR_SIZE - 1)) < 0) { + goto out_error; + } + + if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, + q->cluster_data + sector_offset, + csize) < 0) { + goto out_error; + } + + memcpy(dst, q->cluster_cache + clust_offset, length); + mutex_unlock(&q->mutex); + } else { + clust_start &= QCOW2_OFFSET_MASK; + if (!clust_start) + goto zero_cluster; + + mutex_unlock(&q->mutex); + + if (pread_in_full(q->fd, dst, length, + clust_start + clust_offset) < 0) + return -1; + } + + return length; + +zero_cluster: + mutex_unlock(&q->mutex); + memset(dst, 0, length); + return length; + +out_error: + mutex_unlock(&q->mutex); + length = -1; + return -1; +} + +static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector, + void *dst, u32 dst_len) +{ + struct qcow *q = disk->priv; + struct qcow_header *header = q->header; + u32 nr_read; + u64 offset; + char *buf; + u32 nr; + + buf = dst; + nr_read = 0; + + while (nr_read < dst_len) { + offset = sector << SECTOR_SHIFT; + if (offset >= header->size) + return -1; + + if (q->version == QCOW1_VERSION) + nr = qcow1_read_cluster(q, offset, buf, + dst_len - nr_read); + else + nr = qcow2_read_cluster(q, offset, buf, + dst_len - nr_read); + + if (nr <= 0) + return -1; + + nr_read += nr; + buf += nr; + sector += (nr >> SECTOR_SHIFT); + } + + return dst_len; +} + +static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t nr, total = 0; + + while (iovcount--) { + 
nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len); + if (nr != (ssize_t)iov->iov_len) { + pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); + return -1; + } + + sector += iov->iov_len >> SECTOR_SHIFT; + total += nr; + iov++; + } + + return total; +} + +static void refcount_table_free_cache(struct qcow_refcount_table *rft) +{ + struct rb_root *r = &rft->root; + struct list_head *pos, *n; + struct qcow_refcount_block *t; + + list_for_each_safe(pos, n, &rft->lru_list) { + list_del(pos); + t = list_entry(pos, struct qcow_refcount_block, list); + rb_erase(&t->node, r); + + free(t); + } +} + +static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new) +{ + struct rb_node **link = &(root->rb_node), *parent = NULL; + u64 offset = new->offset; + + /* search the tree */ + while (*link) { + struct qcow_refcount_block *t; + + t = rb_entry(*link, struct qcow_refcount_block, node); + if (!t) + goto error; + + parent = *link; + + if (t->offset > offset) + link = &(*link)->rb_left; + else if (t->offset < offset) + link = &(*link)->rb_right; + else + goto out; + } + + /* add new node */ + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); +out: + return 0; +error: + return -1; +} + +static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb) +{ + if (!rfb->dirty) + return 0; + + if (qcow_pwrite_sync(q->fd, rfb->entries, + rfb->size * sizeof(u16), rfb->offset) < 0) + return -1; + + rfb->dirty = 0; + + return 0; +} + +static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c) +{ + struct qcow_refcount_table *rft = &q->refcount_table; + struct rb_root *r = &rft->root; + struct qcow_refcount_block *lru; + + if (rft->nr_cached == MAX_CACHE_NODES) { + lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list); + + rb_erase(&lru->node, r); + list_del_init(&lru->list); + rft->nr_cached--; + + free(lru); + } + + if 
(refcount_block_insert(r, c) < 0) + goto error; + + list_add_tail(&c->list, &rft->lru_list); + rft->nr_cached++; + + return 0; +error: + return -1; +} + +static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset) +{ + struct qcow_refcount_block *rfb; + + rfb = malloc(sizeof *rfb + q->cluster_size); + if (!rfb) + return NULL; + + rfb->offset = rfb_offset; + rfb->size = q->cluster_size / sizeof(u16); + RB_CLEAR_NODE(&rfb->node); + INIT_LIST_HEAD(&rfb->list); + + return rfb; +} + +static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset) +{ + struct rb_node *link = root->rb_node; + + while (link) { + struct qcow_refcount_block *t; + + t = rb_entry(link, struct qcow_refcount_block, node); + if (!t) + goto out; + + if (t->offset > offset) + link = link->rb_left; + else if (t->offset < offset) + link = link->rb_right; + else + return t; + } +out: + return NULL; +} + +static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset) +{ + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + + rfb = refcount_block_lookup(&rft->root, offset); + if (!rfb) + return NULL; + + /* Update the LRU state, by moving the searched node to list tail */ + list_move_tail(&rfb->list, &rft->lru_list); + + return rfb; +} + +static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q, + u64 clust_idx) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + u64 new_block_offset; + u64 rft_idx; + + rft_idx = clust_idx >> (header->cluster_bits - + QCOW_REFCOUNT_BLOCK_SHIFT); + + if (rft_idx >= rft->rf_size) { + pr_warning("Don't support grow refcount block table"); + return NULL; + } + + new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0); + if (new_block_offset < 0) + return NULL; + + rfb = new_refcount_block(q, new_block_offset); + if (!rfb) + return NULL; + + 
memset(rfb->entries, 0x00, q->cluster_size); + rfb->dirty = 1; + + /* write refcount block */ + if (write_refcount_block(q, rfb) < 0) + goto free_rfb; + + if (cache_refcount_block(q, rfb) < 0) + goto free_rfb; + + rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset); + if (update_cluster_refcount(q, new_block_offset >> + header->cluster_bits, 1) < 0) + goto recover_rft; + + if (qcow_write_refcount_table(q) < 0) + goto recover_rft; + + return rfb; + +recover_rft: + rft->rf_table[rft_idx] = 0; +free_rfb: + free(rfb); + return NULL; +} + +static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + u64 rfb_offset; + u64 rft_idx; + + rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT); + if (rft_idx >= rft->rf_size) + return ERR_PTR(-ENOSPC); + + rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]); + if (!rfb_offset) + return ERR_PTR(-ENOSPC); + + rfb = refcount_block_search(q, rfb_offset); + if (rfb) + return rfb; + + rfb = new_refcount_block(q, rfb_offset); + if (!rfb) + return NULL; + + if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0) + goto error_free_rfb; + + if (cache_refcount_block(q, rfb) < 0) + goto error_free_rfb; + + return rfb; + +error_free_rfb: + free(rfb); + + return NULL; +} + +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx) +{ + struct qcow_refcount_block *rfb = NULL; + struct qcow_header *header = q->header; + u64 rfb_idx; + + rfb = qcow_read_refcount_block(q, clust_idx); + if (PTR_ERR(rfb) == -ENOSPC) + return 0; + else if (IS_ERR_OR_NULL(rfb)) { + pr_warning("Error while reading refcount table"); + return -1; + } + + rfb_idx = clust_idx & (((1ULL << + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); + + if (rfb_idx >= rfb->size) { + pr_warning("L1: refcount block index out of bounds"); + return -1; + } + + 
return be16_to_cpu(rfb->entries[rfb_idx]); +} + +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append) +{ + struct qcow_refcount_block *rfb = NULL; + struct qcow_header *header = q->header; + u16 refcount; + u64 rfb_idx; + + rfb = qcow_read_refcount_block(q, clust_idx); + if (PTR_ERR(rfb) == -ENOSPC) { + rfb = qcow_grow_refcount_block(q, clust_idx); + if (!rfb) { + pr_warning("error while growing refcount table"); + return -1; + } + } else if (IS_ERR_OR_NULL(rfb)) { + pr_warning("error while reading refcount table"); + return -1; + } + + rfb_idx = clust_idx & (((1ULL << + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); + if (rfb_idx >= rfb->size) { + pr_warning("refcount block index out of bounds"); + return -1; + } + + refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append; + rfb->entries[rfb_idx] = cpu_to_be16(refcount); + rfb->dirty = 1; + + /* write refcount block */ + if (write_refcount_block(q, rfb) < 0) { + pr_warning("refcount block index out of bounds"); + return -1; + } + + /* update free_clust_idx since refcount becomes zero */ + if (!refcount && clust_idx < q->free_clust_idx) + q->free_clust_idx = clust_idx; + + return 0; +} + +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size) +{ + struct qcow_header *header = q->header; + u64 start, end, offset; + + start = clust_start & ~(q->cluster_size - 1); + end = (clust_start + size - 1) & ~(q->cluster_size - 1); + for (offset = start; offset <= end; offset += q->cluster_size) + update_cluster_refcount(q, offset >> header->cluster_bits, -1); +} + +/* + * Allocate clusters according to the size. Find a postion that + * can satisfy the size. free_clust_idx is initialized to zero and + * Record last position. 
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+	struct qcow_header *header = q->header;
+	/*
+	 * NOTE(review): clust_refcount is u16, so the 'clust_refcount < 0'
+	 * error check below is always false; qcow_get_refcount()'s -1 error
+	 * return is truncated to 0xffff and mistaken for "cluster in use".
+	 * Likewise 'return -1' from this u64-returning function produces
+	 * ~0ULL, which callers test with '< 0' (also always false).
+	 * TODO: report errors via a signed type or an out-parameter.
+	 */
+	u16 clust_refcount;
+	u32 clust_idx = 0, i;
+	u64 clust_num;
+
+	/* Number of whole clusters needed to cover 'size' bytes. */
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	/* Scan forward from free_clust_idx for clust_num consecutive free clusters. */
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		if (clust_refcount < 0)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	clust_idx++;
+
+	/* Optionally bump the refcount of every cluster in the new run. */
+	if (update_ref)
+		for (i = 0; i < clust_num; i++)
+			if (update_cluster_refcount(q,
+				clust_idx - clust_num + i, 1))
+				return -1;
+
+	/* Byte offset of the first cluster of the allocated run. */
+	return (clust_idx - clust_num) << header->cluster_bits;
+}
+
+/* Write the in-memory L1 table to its on-disk location (synced via qcow_pwrite_sync). */
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get l2 table. If the table has been copied, read table directly.
+ * If the table exists, allocate a new cluster and copy the table
+ * to the new cluster.
+ */ +static int get_cluster_table(struct qcow *q, u64 offset, + struct qcow_l2_table **result_l2t, u64 *result_l2_index) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 l1t_idx; + u64 l2t_offset; + u64 l2t_idx; + u64 l2t_size; + u64 l2t_new_offset; + + l2t_size = 1 << header->l2_bits; + + l1t_idx = get_l1_index(q, offset); + if (l1t_idx >= l1t->table_size) + return -1; + + l2t_idx = get_l2_index(q, offset); + if (l2t_idx >= l2t_size) + return -1; + + l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]); + if (l2t_offset & QCOW2_OFLAG_COPIED) { + l2t_offset &= ~QCOW2_OFLAG_COPIED; + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto error; + } else { + l2t_new_offset = qcow_alloc_clusters(q, + l2t_size*sizeof(u64), 1); + + if (l2t_new_offset < 0) + goto error; + + l2t = new_cache_table(q, l2t_new_offset); + if (!l2t) + goto free_cluster; + + if (l2t_offset) { + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto free_cache; + } else + memset(l2t->table, 0x00, l2t_size * sizeof(u64)); + + /* write l2 table */ + l2t->dirty = 1; + if (qcow_l2_cache_write(q, l2t) < 0) + goto free_cache; + + /* cache l2 table */ + if (cache_table(q, l2t)) + goto free_cache; + + /* update the l1 talble */ + l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset + | QCOW2_OFLAG_COPIED); + if (qcow_write_l1_table(q)) { + pr_warning("Update l1 table error"); + goto free_cache; + } + + /* free old cluster */ + qcow_free_clusters(q, l2t_offset, q->cluster_size); + } + + *result_l2t = l2t; + *result_l2_index = l2t_idx; + + return 0; + +free_cache: + free(l2t); + +free_cluster: + qcow_free_clusters(q, l2t_new_offset, q->cluster_size); + +error: + return -1; +} + +/* + * If the cluster has been copied, write data directly. If not, + * read the original data and write it to the new cluster with + * modification. 
+ */ +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, + void *buf, u32 src_len) +{ + struct qcow_l2_table *l2t; + u64 clust_new_start; + u64 clust_start; + u64 clust_flags; + u64 clust_off; + u64 l2t_idx; + u64 len; + + l2t = NULL; + + clust_off = get_cluster_offset(q, offset); + if (clust_off >= q->cluster_size) + return -1; + + len = q->cluster_size - clust_off; + if (len > src_len) + len = src_len; + + mutex_lock(&q->mutex); + + if (get_cluster_table(q, offset, &l2t, &l2t_idx)) { + pr_warning("Get l2 table error"); + goto error; + } + + clust_start = be64_to_cpu(l2t->table[l2t_idx]); + clust_flags = clust_start & QCOW2_OFLAGS_MASK; + + clust_start &= QCOW2_OFFSET_MASK; + if (!(clust_flags & QCOW2_OFLAG_COPIED)) { + clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1); + if (clust_new_start < 0) { + pr_warning("Cluster alloc error"); + goto error; + } + + offset &= ~(q->cluster_size - 1); + + /* if clust_start is not zero, read the original data*/ + if (clust_start) { + mutex_unlock(&q->mutex); + if (qcow2_read_cluster(q, offset, q->copy_buff, + q->cluster_size) < 0) { + pr_warning("Read copy cluster error"); + qcow_free_clusters(q, clust_new_start, + q->cluster_size); + return -1; + } + mutex_lock(&q->mutex); + } else + memset(q->copy_buff, 0x00, q->cluster_size); + + memcpy(q->copy_buff + clust_off, buf, len); + + /* Write actual data */ + if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size, + clust_new_start) < 0) + goto free_cluster; + + /* update l2 table*/ + l2t->table[l2t_idx] = cpu_to_be64(clust_new_start + | QCOW2_OFLAG_COPIED); + l2t->dirty = 1; + + if (qcow_l2_cache_write(q, l2t)) + goto free_cluster; + + /* free old cluster*/ + if (clust_flags & QCOW2_OFLAG_COMPRESSED) { + int size; + size = ((clust_start >> q->csize_shift) & + q->csize_mask) + 1; + size *= 512; + clust_start &= q->cluster_offset_mask; + clust_start &= ~511; + + qcow_free_clusters(q, clust_start, size); + } else if (clust_start) + qcow_free_clusters(q, 
clust_start, q->cluster_size); + + } else { + /* Write actual data */ + if (pwrite_in_full(q->fd, buf, len, + clust_start + clust_off) < 0) + goto error; + } + mutex_unlock(&q->mutex); + return len; + +free_cluster: + qcow_free_clusters(q, clust_new_start, q->cluster_size); + +error: + mutex_unlock(&q->mutex); + return -1; +} + +static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len) +{ + struct qcow *q = disk->priv; + struct qcow_header *header = q->header; + u32 nr_written; + char *buf; + u64 offset; + ssize_t nr; + + buf = src; + nr_written = 0; + offset = sector << SECTOR_SHIFT; + + while (nr_written < src_len) { + if (offset >= header->size) + return -1; + + nr = qcow_write_cluster(q, offset, buf, src_len - nr_written); + if (nr < 0) + return -1; + + nr_written += nr; + buf += nr; + offset += nr; + } + + return nr_written; +} + +static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t nr, total = 0; + + while (iovcount--) { + nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len); + if (nr != (ssize_t)iov->iov_len) { + pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); + return -1; + } + + sector += iov->iov_len >> SECTOR_SHIFT; + iov++; + total += nr; + } + + return total; +} + +static int qcow_disk_flush(struct disk_image *disk) +{ + struct qcow *q = disk->priv; + struct qcow_refcount_table *rft; + struct list_head *pos, *n; + struct qcow_l1_table *l1t; + + l1t = &q->table; + rft = &q->refcount_table; + + mutex_lock(&q->mutex); + + list_for_each_safe(pos, n, &rft->lru_list) { + struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list); + + if (write_refcount_block(q, c) < 0) + goto error_unlock; + } + + list_for_each_safe(pos, n, &l1t->lru_list) { + struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list); + + if (qcow_l2_cache_write(q, c) < 0) 
+ goto error_unlock; + } + + if (qcow_write_l1_table < 0) + goto error_unlock; + + mutex_unlock(&q->mutex); + + return fsync(disk->fd); + +error_unlock: + mutex_unlock(&q->mutex); + return -1; +} + +static int qcow_disk_close(struct disk_image *disk) +{ + struct qcow *q; + + if (!disk) + return 0; + + q = disk->priv; + + refcount_table_free_cache(&q->refcount_table); + l1_table_free_cache(&q->table); + free(q->copy_buff); + free(q->cluster_data); + free(q->cluster_cache); + free(q->refcount_table.rf_table); + free(q->table.l1_table); + free(q->header); + free(q); + + return 0; +} + +static struct disk_image_operations qcow_disk_readonly_ops = { + .read = qcow_read_sector, + .close = qcow_disk_close, +}; + +static struct disk_image_operations qcow_disk_ops = { + .read = qcow_read_sector, + .write = qcow_write_sector, + .flush = qcow_disk_flush, + .close = qcow_disk_close, +}; + +static int qcow_read_refcount_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + + rft->rf_size = (header->refcount_table_size * q->cluster_size) + / sizeof(u64); + + rft->rf_table = calloc(rft->rf_size, sizeof(u64)); + if (!rft->rf_table) + return -1; + + rft->root = RB_ROOT; + INIT_LIST_HEAD(&rft->lru_list); + + return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset); +} + +static int qcow_write_refcount_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + + return qcow_pwrite_sync(q->fd, rft->rf_table, + rft->rf_size * sizeof(u64), header->refcount_table_offset); +} + +static int qcow_read_l1_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *table = &q->table; + + table->table_size = header->l1_size; + + table->l1_table = calloc(table->table_size, sizeof(u64)); + if (!table->l1_table) + return -1; + + return pread_in_full(q->fd, table->l1_table, sizeof(u64) * 
table->table_size, header->l1_table_offset); +} + +static void *qcow2_read_header(int fd) +{ + struct qcow2_header_disk f_header; + struct qcow_header *header; + + header = malloc(sizeof(struct qcow_header)); + if (!header) + return NULL; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) { + free(header); + return NULL; + } + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + be64_to_cpus(&f_header.backing_file_offset); + be32_to_cpus(&f_header.backing_file_size); + be32_to_cpus(&f_header.cluster_bits); + be64_to_cpus(&f_header.size); + be32_to_cpus(&f_header.crypt_method); + be32_to_cpus(&f_header.l1_size); + be64_to_cpus(&f_header.l1_table_offset); + be64_to_cpus(&f_header.refcount_table_offset); + be32_to_cpus(&f_header.refcount_table_clusters); + be32_to_cpus(&f_header.nb_snapshots); + be64_to_cpus(&f_header.snapshots_offset); + + *header = (struct qcow_header) { + .size = f_header.size, + .l1_table_offset = f_header.l1_table_offset, + .l1_size = f_header.l1_size, + .cluster_bits = f_header.cluster_bits, + .l2_bits = f_header.cluster_bits - 3, + .refcount_table_offset = f_header.refcount_table_offset, + .refcount_table_size = f_header.refcount_table_clusters, + }; + + return header; +} + +static struct disk_image *qcow2_probe(int fd, bool readonly) +{ + struct disk_image *disk_image; + struct qcow_l1_table *l1t; + struct qcow_header *h; + struct qcow *q; + + q = calloc(1, sizeof(struct qcow)); + if (!q) + return NULL; + + mutex_init(&q->mutex); + q->fd = fd; + + l1t = &q->table; + + l1t->root = RB_ROOT; + INIT_LIST_HEAD(&l1t->lru_list); + + h = q->header = qcow2_read_header(fd); + if (!h) + goto free_qcow; + + q->version = QCOW2_VERSION; + q->csize_shift = (62 - (q->header->cluster_bits - 8)); + q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1; + q->cluster_offset_mask = (1LL << q->csize_shift) - 1; + q->cluster_size = 1 << q->header->cluster_bits; + + q->copy_buff = malloc(q->cluster_size); + if 
(!q->copy_buff) { + pr_warning("copy buff malloc error"); + goto free_header; + } + + q->cluster_data = malloc(q->cluster_size); + if (!q->cluster_data) { + pr_warning("cluster data malloc error"); + goto free_copy_buff; + } + + q->cluster_cache = malloc(q->cluster_size); + if (!q->cluster_cache) { + pr_warning("cluster cache malloc error"); + goto free_cluster_data; + } + + if (qcow_read_l1_table(q) < 0) + goto free_cluster_cache; + + if (qcow_read_refcount_table(q) < 0) + goto free_l1_table; + + /* + * Do not use mmap use read/write instead + */ + if (readonly) + disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); + else + disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); + + if (IS_ERR_OR_NULL(disk_image)) + goto free_refcount_table; + + disk_image->async = 0; + disk_image->priv = q; + + return disk_image; + +free_refcount_table: + if (q->refcount_table.rf_table) + free(q->refcount_table.rf_table); +free_l1_table: + if (q->table.l1_table) + free(q->table.l1_table); +free_cluster_cache: + if (q->cluster_cache) + free(q->cluster_cache); +free_cluster_data: + if (q->cluster_data) + free(q->cluster_data); +free_copy_buff: + if (q->copy_buff) + free(q->copy_buff); +free_header: + if (q->header) + free(q->header); +free_qcow: + free(q); + + return NULL; +} + +static bool qcow2_check_image(int fd) +{ + struct qcow2_header_disk f_header; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) + return false; + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + + if (f_header.magic != QCOW_MAGIC) + return false; + + if (f_header.version != QCOW2_VERSION) + return false; + + return true; +} + +static void *qcow1_read_header(int fd) +{ + struct qcow1_header_disk f_header; + struct qcow_header *header; + + header = malloc(sizeof(struct qcow_header)); + if (!header) + return NULL; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) { + 
free(header); + return NULL; + } + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + be64_to_cpus(&f_header.backing_file_offset); + be32_to_cpus(&f_header.backing_file_size); + be32_to_cpus(&f_header.mtime); + be64_to_cpus(&f_header.size); + be32_to_cpus(&f_header.crypt_method); + be64_to_cpus(&f_header.l1_table_offset); + + *header = (struct qcow_header) { + .size = f_header.size, + .l1_table_offset = f_header.l1_table_offset, + .l1_size = f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)), + .cluster_bits = f_header.cluster_bits, + .l2_bits = f_header.l2_bits, + }; + + return header; +} + +static struct disk_image *qcow1_probe(int fd, bool readonly) +{ + struct disk_image *disk_image; + struct qcow_l1_table *l1t; + struct qcow_header *h; + struct qcow *q; + + q = calloc(1, sizeof(struct qcow)); + if (!q) + return NULL; + + mutex_init(&q->mutex); + q->fd = fd; + + l1t = &q->table; + + l1t->root = RB_ROOT; + INIT_LIST_HEAD(&l1t->lru_list); + + h = q->header = qcow1_read_header(fd); + if (!h) + goto free_qcow; + + q->version = QCOW1_VERSION; + q->cluster_size = 1 << q->header->cluster_bits; + q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1; + q->free_clust_idx = 0; + + q->cluster_data = malloc(q->cluster_size); + if (!q->cluster_data) { + pr_warning("cluster data malloc error"); + goto free_header; + } + + q->cluster_cache = malloc(q->cluster_size); + if (!q->cluster_cache) { + pr_warning("cluster cache malloc error"); + goto free_cluster_data; + } + + if (qcow_read_l1_table(q) < 0) + goto free_cluster_cache; + + /* + * Do not use mmap use read/write instead + */ + if (readonly) + disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); + else + disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); + + if (!disk_image) + goto free_l1_table; + + disk_image->async = 1; + disk_image->priv = q; + + return disk_image; + +free_l1_table: + if 
(q->table.l1_table) + free(q->table.l1_table); +free_cluster_cache: + if (q->cluster_cache) + free(q->cluster_cache); +free_cluster_data: + if (q->cluster_data) + free(q->cluster_data); +free_header: + if (q->header) + free(q->header); +free_qcow: + free(q); + + return NULL; +} + +static bool qcow1_check_image(int fd) +{ + struct qcow1_header_disk f_header; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) + return false; + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + + if (f_header.magic != QCOW_MAGIC) + return false; + + if (f_header.version != QCOW1_VERSION) + return false; + + return true; +} + +struct disk_image *qcow_probe(int fd, bool readonly) +{ + if (qcow1_check_image(fd)) + return qcow1_probe(fd, readonly); + + if (qcow2_check_image(fd)) + return qcow2_probe(fd, readonly); + + return NULL; +} diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c new file mode 100644 index 000000000000..93b2b4e8db1f --- /dev/null +++ b/tools/kvm/disk/raw.c @@ -0,0 +1,141 @@ +#include "kvm/disk-image.h" + +#include <linux/err.h> + +#ifdef CONFIG_HAS_AIO +#include <libaio.h> +#endif + +ssize_t raw_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + +#ifdef CONFIG_HAS_AIO + struct iocb iocb; + + return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, offset, + disk->evt, param); +#else + return preadv_in_full(disk->fd, iov, iovcount, offset); +#endif +} + +ssize_t raw_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + +#ifdef CONFIG_HAS_AIO + struct iocb iocb; + + return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, offset, + disk->evt, param); +#else + return pwritev_in_full(disk->fd, iov, iovcount, offset); +#endif +} + +ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov, + 
int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + ssize_t total = 0; + + while (iovcount--) { + memcpy(iov->iov_base, disk->priv + offset, iov->iov_len); + + sector += iov->iov_len >> SECTOR_SHIFT; + offset += iov->iov_len; + total += iov->iov_len; + iov++; + } + + return total; +} + +ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + ssize_t total = 0; + + while (iovcount--) { + memcpy(disk->priv + offset, iov->iov_base, iov->iov_len); + + sector += iov->iov_len >> SECTOR_SHIFT; + offset += iov->iov_len; + total += iov->iov_len; + iov++; + } + + return total; +} + +int raw_image__close(struct disk_image *disk) +{ + int ret = 0; + + if (disk->priv != MAP_FAILED) + ret = munmap(disk->priv, disk->size); + + close(disk->evt); + +#ifdef CONFIG_HAS_VIRTIO + io_destroy(disk->ctx); +#endif + + return ret; +} + +/* + * multiple buffer based disk image operations + */ +static struct disk_image_operations raw_image_regular_ops = { + .read = raw_image__read, + .write = raw_image__write, +}; + +struct disk_image_operations ro_ops = { + .read = raw_image__read_mmap, + .write = raw_image__write_mmap, + .close = raw_image__close, +}; + +struct disk_image_operations ro_ops_nowrite = { + .read = raw_image__read, +}; + +struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly) +{ + struct disk_image *disk; + + if (readonly) { + /* + * Use mmap's MAP_PRIVATE to implement non-persistent write + * FIXME: This does not work on 32-bit host. 
+ */ + struct disk_image *disk; + + disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP); + if (IS_ERR_OR_NULL(disk)) { + disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + } + + return disk; + } else { + /* + * Use read/write instead of mmap + */ + disk = disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + return disk; + } +} diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c new file mode 100644 index 000000000000..fb8f51dd1de7 --- /dev/null +++ b/tools/kvm/framebuffer.c @@ -0,0 +1,80 @@ +#include "kvm/framebuffer.h" +#include "kvm/kvm.h" + +#include <linux/kernel.h> +#include <linux/list.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <errno.h> + +static LIST_HEAD(framebuffers); + +struct framebuffer *fb__register(struct framebuffer *fb) +{ + INIT_LIST_HEAD(&fb->node); + list_add(&fb->node, &framebuffers); + + return fb; +} + +int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops) +{ + if (fb->nr_targets >= FB_MAX_TARGETS) + return -ENOSPC; + + fb->targets[fb->nr_targets++] = ops; + + return 0; +} + +static int start_targets(struct framebuffer *fb) +{ + unsigned long i; + + for (i = 0; i < fb->nr_targets; i++) { + struct fb_target_operations *ops = fb->targets[i]; + int err = 0; + + if (ops->start) + err = ops->start(fb); + + if (err) + return err; + } + + return 0; +} + +int fb__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + list_for_each_entry(fb, &framebuffers, node) { + int err; + + err = start_targets(fb); + if (err) + return err; + } + + return 0; +} +firmware_init(fb__init); + +int fb__exit(struct kvm *kvm) +{ + struct framebuffer *fb; + + list_for_each_entry(fb, &framebuffers, node) { + u32 i; + + for (i = 0; i < fb->nr_targets; i++) + if (fb->targets[i]->stop) + 
fb->targets[i]->stop(fb); + + munmap(fb->mem, fb->mem_size); + } + + return 0; +} +firmware_exit(fb__exit); diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c new file mode 100644 index 000000000000..8c49a0323451 --- /dev/null +++ b/tools/kvm/guest/init.c @@ -0,0 +1,76 @@ +/* + * This is a simple init for shared rootfs guests. This part should be limited + * to doing mounts and running stage 2 of the init process. + */ +#include <sys/mount.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> +#include <linux/reboot.h> + +static int run_process(char *filename) +{ + char *new_argv[] = { filename, NULL }; + char *new_env[] = { "TERM=linux", "DISPLAY=192.168.33.1:0", + "HOME=/virt/home", NULL }; + + return execve(filename, new_argv, new_env); +} + +static int run_process_sandbox(char *filename) +{ + char *new_argv[] = { filename, "/virt/sandbox.sh", NULL }; + char *new_env[] = { "TERM=linux", "HOME=/virt/home", NULL }; + + return execve(filename, new_argv, new_env); +} + +static void do_mounts(void) +{ + mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L"); + mount("", "/sys", "sysfs", 0, NULL); + mount("proc", "/proc", "proc", 0, NULL); + mount("devtmpfs", "/dev", "devtmpfs", 0, NULL); + mkdir("/dev/pts", 0755); + mount("devpts", "/dev/pts", "devpts", 0, NULL); +} + +int main(int argc, char *argv[]) +{ + pid_t child; + int status; + + puts("Mounting..."); + + do_mounts(); + + /* get session leader */ + setsid(); + + /* set controlling terminal */ + ioctl(0, TIOCSCTTY, 1); + + child = fork(); + if (child < 0) { + printf("Fatal: fork() failed with %d\n", child); + return 0; + } else if (child == 0) { + if (access("/virt/sandbox.sh", R_OK) == 0) + run_process_sandbox("/bin/sh"); + else + run_process("/bin/sh"); + } else { + pid_t corpse; + + do { + corpse = waitpid(-1, &status, 0); + } while (corpse != child); + } + + reboot(LINUX_REBOOT_CMD_RESTART); + + printf("Init failed: %s\n", strerror(errno)); + + 
return 0; +} diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c new file mode 100644 index 000000000000..fd4704b20b16 --- /dev/null +++ b/tools/kvm/guest_compat.c @@ -0,0 +1,99 @@ +#include "kvm/guest_compat.h" + +#include "kvm/mutex.h" + +#include <linux/kernel.h> +#include <linux/list.h> + +struct compat_message { + int id; + char *title; + char *desc; + + struct list_head list; +}; + +static int id; +static DEFINE_MUTEX(compat_mtx); +static LIST_HEAD(messages); + +static void compat__free(struct compat_message *msg) +{ + free(msg->title); + free(msg->desc); + free(msg); +} + +int compat__add_message(const char *title, const char *desc) +{ + struct compat_message *msg; + int msg_id; + + msg = malloc(sizeof(*msg)); + if (msg == NULL) + goto cleanup; + + msg->title = strdup(title); + msg->desc = strdup(desc); + + if (msg->title == NULL || msg->desc == NULL) + goto cleanup; + + mutex_lock(&compat_mtx); + + msg->id = msg_id = id++; + list_add_tail(&msg->list, &messages); + + mutex_unlock(&compat_mtx); + + return msg_id; + +cleanup: + if (msg) + compat__free(msg); + + return -ENOMEM; +} + +int compat__remove_message(int id) +{ + struct compat_message *pos, *n; + + mutex_lock(&compat_mtx); + + list_for_each_entry_safe(pos, n, &messages, list) { + if (pos->id == id) { + list_del(&pos->list); + compat__free(pos); + + mutex_unlock(&compat_mtx); + + return 0; + } + } + + mutex_unlock(&compat_mtx); + + return -ENOENT; +} + +int compat__print_all_messages(void) +{ + mutex_lock(&compat_mtx); + + while (!list_empty(&messages)) { + struct compat_message *msg; + + msg = list_first_entry(&messages, struct compat_message, list); + + printf("\n # KVM compatibility warning.\n\t%s\n\t%s\n", + msg->title, msg->desc); + + list_del(&msg->list); + compat__free(msg); + } + + mutex_unlock(&compat_mtx); + + return 0; +} diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c new file mode 100644 index 000000000000..90357326e171 --- /dev/null +++ b/tools/kvm/hw/i8042.c @@ -0,0 
+1,355 @@ +#include "kvm/read-write.h" +#include "kvm/ioport.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/kvm.h" +#include "kvm/i8042.h" +#include "kvm/kvm-cpu.h" + +#include <stdint.h> + +/* + * IRQs + */ +#define KBD_IRQ 1 +#define AUX_IRQ 12 + +/* + * Registers + */ +#define I8042_DATA_REG 0x60 +#define I8042_COMMAND_REG 0x64 + +/* + * Commands + */ +#define I8042_CMD_CTL_RCTR 0x20 +#define I8042_CMD_CTL_WCTR 0x60 +#define I8042_CMD_AUX_LOOP 0xD3 +#define I8042_CMD_AUX_SEND 0xD4 +#define I8042_CMD_AUX_TEST 0xA9 +#define I8042_CMD_AUX_DISABLE 0xA7 +#define I8042_CMD_AUX_ENABLE 0xA8 +#define I8042_CMD_SYSTEM_RESET 0xFE + +#define RESPONSE_ACK 0xFA + +#define MODE_DISABLE_AUX 0x20 + +#define AUX_ENABLE_REPORTING 0x20 +#define AUX_SCALING_FLAG 0x10 +#define AUX_DEFAULT_RESOLUTION 0x2 +#define AUX_DEFAULT_SAMPLE 100 + +/* + * Status register bits + */ +#define I8042_STR_AUXDATA 0x20 +#define I8042_STR_KEYLOCK 0x10 +#define I8042_STR_CMDDAT 0x08 +#define I8042_STR_MUXERR 0x04 +#define I8042_STR_OBF 0x01 + +#define KBD_MODE_KBD_INT 0x01 +#define KBD_MODE_SYS 0x02 + +#define QUEUE_SIZE 128 + +/* + * This represents the current state of the PS/2 keyboard system, + * including the AUX device (the mouse) + */ +struct kbd_state { + struct kvm *kvm; + + char kq[QUEUE_SIZE]; /* Keyboard queue */ + int kread, kwrite; /* Indexes into the queue */ + int kcount; /* number of elements in queue */ + + char mq[QUEUE_SIZE]; + int mread, mwrite; + int mcount; + + u8 mstatus; /* Mouse status byte */ + u8 mres; /* Current mouse resolution */ + u8 msample; /* Current mouse samples/second */ + + u8 mode; /* i8042 mode register */ + u8 status; /* i8042 status register */ + /* + * Some commands (on port 0x64) have arguments; + * we store the command here while we wait for the argument + */ + u32 write_cmd; +}; + +static struct kbd_state state; + +/* + * If there are packets to be read, set the appropriate IRQs high + */ +static void 
kbd_update_irq(void) +{ + u8 klevel = 0; + u8 mlevel = 0; + + /* First, clear the kbd and aux output buffer full bits */ + state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA); + + if (state.kcount > 0) { + state.status |= I8042_STR_OBF; + klevel = 1; + } + + /* Keyboard has higher priority than mouse */ + if (klevel == 0 && state.mcount != 0) { + state.status |= I8042_STR_OBF | I8042_STR_AUXDATA; + mlevel = 1; + } + + kvm__irq_line(state.kvm, KBD_IRQ, klevel); + kvm__irq_line(state.kvm, AUX_IRQ, mlevel); +} + +/* + * Add a byte to the mouse queue, then set IRQs + */ +void mouse_queue(u8 c) +{ + if (state.mcount >= QUEUE_SIZE) + return; + + state.mq[state.mwrite++ % QUEUE_SIZE] = c; + + state.mcount++; + kbd_update_irq(); +} + +/* + * Add a byte to the keyboard queue, then set IRQs + */ +void kbd_queue(u8 c) +{ + if (state.kcount >= QUEUE_SIZE) + return; + + state.kq[state.kwrite++ % QUEUE_SIZE] = c; + + state.kcount++; + kbd_update_irq(); +} + +static void kbd_write_command(struct kvm *kvm, u8 val) +{ + switch (val) { + case I8042_CMD_CTL_RCTR: + kbd_queue(state.mode); + break; + case I8042_CMD_CTL_WCTR: + case I8042_CMD_AUX_SEND: + case I8042_CMD_AUX_LOOP: + state.write_cmd = val; + break; + case I8042_CMD_AUX_TEST: + /* 0 means we're a normal PS/2 mouse */ + mouse_queue(0); + break; + case I8042_CMD_AUX_DISABLE: + state.mode |= MODE_DISABLE_AUX; + break; + case I8042_CMD_AUX_ENABLE: + state.mode &= ~MODE_DISABLE_AUX; + break; + case I8042_CMD_SYSTEM_RESET: + kvm_cpu__reboot(kvm); + break; + default: + break; + } +} + +/* + * Called when the OS reads from port 0x60 (PS/2 data) + */ +static u32 kbd_read_data(void) +{ + u32 ret; + int i; + + if (state.kcount != 0) { + /* Keyboard data gets read first */ + ret = state.kq[state.kread++ % QUEUE_SIZE]; + state.kcount--; + kvm__irq_line(state.kvm, KBD_IRQ, 0); + kbd_update_irq(); + } else if (state.mcount > 0) { + /* Followed by the mouse */ + ret = state.mq[state.mread++ % QUEUE_SIZE]; + state.mcount--; + 
kvm__irq_line(state.kvm, AUX_IRQ, 0); + kbd_update_irq(); + } else { + i = state.kread - 1; + if (i < 0) + i = QUEUE_SIZE; + ret = state.kq[i]; + } + return ret; +} + +/* + * Called when the OS read from port 0x64, the command port + */ +static u32 kbd_read_status(void) +{ + return (u32)state.status; +} + +/* + * Called when the OS writes to port 0x60 (data port) + * Things written here are generally arguments to commands previously + * written to port 0x64 and stored in state.write_cmd + */ +static void kbd_write_data(u32 val) +{ + switch (state.write_cmd) { + case I8042_CMD_CTL_WCTR: + state.mode = val; + kbd_update_irq(); + break; + case I8042_CMD_AUX_LOOP: + mouse_queue(val); + mouse_queue(RESPONSE_ACK); + break; + case I8042_CMD_AUX_SEND: + /* The OS wants to send a command to the mouse */ + mouse_queue(RESPONSE_ACK); + switch (val) { + case 0xe6: + /* set scaling = 1:1 */ + state.mstatus &= ~AUX_SCALING_FLAG; + break; + case 0xe8: + /* set resolution */ + state.mres = val; + break; + case 0xe9: + /* Report mouse status/config */ + mouse_queue(state.mstatus); + mouse_queue(state.mres); + mouse_queue(state.msample); + break; + case 0xf2: + /* send ID */ + mouse_queue(0); /* normal mouse */ + break; + case 0xf3: + /* set sample rate */ + state.msample = val; + break; + case 0xf4: + /* enable reporting */ + state.mstatus |= AUX_ENABLE_REPORTING; + break; + case 0xf5: + state.mstatus &= ~AUX_ENABLE_REPORTING; + break; + case 0xf6: + /* set defaults, just fall through to reset */ + case 0xff: + /* reset */ + state.mstatus = 0x0; + state.mres = AUX_DEFAULT_RESOLUTION; + state.msample = AUX_DEFAULT_SAMPLE; + break; + default: + break; + } + break; + case 0: + /* Just send the ID */ + kbd_queue(RESPONSE_ACK); + kbd_queue(0xab); + kbd_queue(0x41); + kbd_update_irq(); + break; + default: + /* Yeah whatever */ + break; + } + state.write_cmd = 0; +} + +static void kbd_reset(void) +{ + state = (struct kbd_state) { + .status = I8042_STR_MUXERR | I8042_STR_CMDDAT | 
I8042_STR_KEYLOCK, /* 0x1c */ + .mode = KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */ + .mres = AUX_DEFAULT_RESOLUTION, + .msample = AUX_DEFAULT_SAMPLE, + }; +} + +/* + * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64) + */ +static bool kbd_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (port) { + case I8042_COMMAND_REG: { + u8 value = kbd_read_status(); + ioport__write8(data, value); + break; + } + case I8042_DATA_REG: { + u32 value = kbd_read_data(); + ioport__write32(data, value); + break; + } + default: + return false; + } + + return true; +} + +static bool kbd_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (port) { + case I8042_COMMAND_REG: { + u8 value = ioport__read8(data); + kbd_write_command(kvm, value); + break; + } + case I8042_DATA_REG: { + u32 value = ioport__read32(data); + kbd_write_data(value); + break; + } + default: + return false; + } + + return true; +} + +static struct ioport_operations kbd_ops = { + .io_in = kbd_in, + .io_out = kbd_out, +}; + +int kbd__init(struct kvm *kvm) +{ +#ifndef CONFIG_X86 + return 0; +#endif + + kbd_reset(); + state.kvm = kvm; + ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL); + ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL); + + return 0; +} +dev_init(kbd__init); diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c new file mode 100644 index 000000000000..ec3f7711b986 --- /dev/null +++ b/tools/kvm/hw/pci-shmem.c @@ -0,0 +1,411 @@ +#include "kvm/devices.h" +#include "kvm/pci-shmem.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/util.h" +#include "kvm/ioport.h" +#include "kvm/ioeventfd.h" + +#include <linux/kvm.h> +#include <linux/byteorder.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <sys/mman.h> + +#define MB_SHIFT (20) +#define KB_SHIFT (10) +#define GB_SHIFT (30) + +static struct pci_device_header 
pci_shmem_pci_device = { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(0x1110), + .header_type = PCI_HEADER_TYPE_NORMAL, + .class[2] = 0xFF, /* misc pci device */ + .status = cpu_to_le16(PCI_STATUS_CAP_LIST), + .capabilities = (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device, + .msix.cap = PCI_CAP_ID_MSIX, + .msix.ctrl = cpu_to_le16(1), + .msix.table_offset = cpu_to_le32(1), /* Use BAR 1 */ + .msix.pba_offset = cpu_to_le32(0x1001), /* Use BAR 1 */ +}; + +static struct device_header pci_shmem_device = { + .bus_type = DEVICE_BUS_PCI, + .data = &pci_shmem_pci_device, +}; + +/* registers for the Inter-VM shared memory device */ +enum ivshmem_registers { + INTRMASK = 0, + INTRSTATUS = 4, + IVPOSITION = 8, + DOORBELL = 12, +}; + +static struct shmem_info *shmem_region; +static u16 ivshmem_registers; +static int local_fd; +static u32 local_id; +static u64 msix_block; +static u64 msix_pba; +static struct msix_table msix_table[2]; + +int pci_shmem__register_mem(struct shmem_info *si) +{ + if (shmem_region == NULL) { + shmem_region = si; + } else { + pr_warning("only single shmem currently avail. 
ignoring.\n"); + free(si); + } + return 0; +} + +static bool shmem_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u16 offset = port - ivshmem_registers; + + switch (offset) { + case INTRMASK: + break; + case INTRSTATUS: + break; + case IVPOSITION: + ioport__write32(data, local_id); + break; + case DOORBELL: + break; + }; + + return true; +} + +static bool shmem_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u16 offset = port - ivshmem_registers; + + switch (offset) { + case INTRMASK: + break; + case INTRSTATUS: + break; + case IVPOSITION: + break; + case DOORBELL: + break; + }; + + return true; +} + +static struct ioport_operations shmem_pci__io_ops = { + .io_in = shmem_pci__io_in, + .io_out = shmem_pci__io_out, +}; + +static void callback_mmio_msix(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) +{ + void *mem; + + if (addr - msix_block < 0x1000) + mem = &msix_table; + else + mem = &msix_pba; + + if (is_write) + memcpy(mem + addr - msix_block, data, len); + else + memcpy(data, mem + addr - msix_block, len); +} + +/* + * Return an irqfd which can be used by other guests to signal this guest + * whenever they need to poke it + */ +int pci_shmem__get_local_irqfd(struct kvm *kvm) +{ + int fd, gsi, r; + struct kvm_irqfd irqfd; + + if (local_fd == 0) { + fd = eventfd(0, 0); + if (fd < 0) + return fd; + + if (pci_shmem_pci_device.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE)) { + gsi = irq__add_msix_route(kvm, &msix_table[0].msg); + } else { + gsi = pci_shmem_pci_device.irq_line; + } + + irqfd = (struct kvm_irqfd) { + .fd = fd, + .gsi = gsi, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd); + if (r < 0) + return r; + + local_fd = fd; + } + + return local_fd; +} + +/* + * Connect a new client to ivshmem by adding the appropriate datamatch + * to the DOORBELL + */ +int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd) +{ + struct kvm_ioeventfd ioevent; + + ioevent = (struct 
kvm_ioeventfd) { + .addr = ivshmem_registers + DOORBELL, + .len = sizeof(u32), + .datamatch = id, + .fd = fd, + .flags = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent); +} + +/* + * Remove a client connected to ivshmem by removing the appropriate datamatch + * from the DOORBELL + */ +int pci_shmem__remove_client(struct kvm *kvm, u32 id) +{ + struct kvm_ioeventfd ioevent; + + ioevent = (struct kvm_ioeventfd) { + .addr = ivshmem_registers + DOORBELL, + .len = sizeof(u32), + .datamatch = id, + .flags = KVM_IOEVENTFD_FLAG_PIO + | KVM_IOEVENTFD_FLAG_DATAMATCH + | KVM_IOEVENTFD_FLAG_DEASSIGN, + }; + + return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent); +} + +static void *setup_shmem(const char *key, size_t len, int creating) +{ + int fd; + int rtn; + void *mem; + int flag = O_RDWR; + + if (creating) + flag |= O_CREAT; + + fd = shm_open(key, flag, S_IRUSR | S_IWUSR); + if (fd < 0) { + pr_warning("Failed to open shared memory file %s\n", key); + return NULL; + } + + if (creating) { + rtn = ftruncate(fd, (off_t) len); + if (rtn < 0) + pr_warning("Can't ftruncate(fd,%zu)\n", len); + } + mem = mmap(NULL, len, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0); + if (mem == MAP_FAILED) { + pr_warning("Failed to mmap shared memory file"); + mem = NULL; + } + close(fd); + + return mem; +} + +int shmem_parser(const struct option *opt, const char *arg, int unset) +{ + const u64 default_size = SHMEM_DEFAULT_SIZE; + const u64 default_phys_addr = SHMEM_DEFAULT_ADDR; + const char *default_handle = SHMEM_DEFAULT_HANDLE; + struct shmem_info *si = malloc(sizeof(struct shmem_info)); + u64 phys_addr; + u64 size; + char *handle = NULL; + int create = 0; + const char *p = arg; + char *next; + int base = 10; + int verbose = 0; + + const int skip_pci = strlen("pci:"); + if (verbose) + pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset); + /* parse out optional addr family */ + if (strcasestr(p, "pci:")) { + p += 
skip_pci; + } else if (strcasestr(p, "mem:")) { + die("I can't add to E820 map yet.\n"); + } + /* parse out physical addr */ + base = 10; + if (strcasestr(p, "0x")) + base = 16; + phys_addr = strtoll(p, &next, base); + if (next == p && phys_addr == 0) { + pr_info("shmem: no physical addr specified, using default."); + phys_addr = default_phys_addr; + } + if (*next != ':' && *next != '\0') + die("shmem: unexpected chars after phys addr.\n"); + if (*next == '\0') + p = next; + else + p = next + 1; + /* parse out size */ + base = 10; + if (strcasestr(p, "0x")) + base = 16; + size = strtoll(p, &next, base); + if (next == p && size == 0) { + pr_info("shmem: no size specified, using default."); + size = default_size; + } + /* look for [KMGkmg][Bb]* uses base 2. */ + int skip_B = 0; + if (strspn(next, "KMGkmg")) { /* might have a prefix */ + if (*(next + 1) == 'B' || *(next + 1) == 'b') + skip_B = 1; + switch (*next) { + case 'K': + case 'k': + size = size << KB_SHIFT; + break; + case 'M': + case 'm': + size = size << MB_SHIFT; + break; + case 'G': + case 'g': + size = size << GB_SHIFT; + break; + default: + die("shmem: bug in detecting size prefix."); + break; + } + next += 1 + skip_B; + } + if (*next != ':' && *next != '\0') { + die("shmem: unexpected chars after phys size. <%c><%c>\n", + *next, *p); + } + if (*next == '\0') + p = next; + else + p = next + 1; + /* parse out optional shmem handle */ + const int skip_handle = strlen("handle="); + next = strcasestr(p, "handle="); + if (*p && next) { + if (p != next) + die("unexpected chars before handle\n"); + p += skip_handle; + next = strchrnul(p, ':'); + if (next - p) { + handle = malloc(next - p + 1); + strncpy(handle, p, next - p); + handle[next - p] = '\0'; /* just in case. */ + } + if (*next == '\0') + p = next; + else + p = next + 1; + } + /* parse optional create flag to see if we should create shm seg. 
*/ + if (*p && strcasestr(p, "create")) { + create = 1; + p += strlen("create"); + } + if (*p != '\0') + die("shmem: unexpected trailing chars\n"); + if (handle == NULL) { + handle = malloc(strlen(default_handle) + 1); + strcpy(handle, default_handle); + } + if (verbose) { + pr_info("shmem: phys_addr = %llx", phys_addr); + pr_info("shmem: size = %llx", size); + pr_info("shmem: handle = %s", handle); + pr_info("shmem: create = %d", create); + } + + si->phys_addr = phys_addr; + si->size = size; + si->handle = handle; + si->create = create; + pci_shmem__register_mem(si); /* ownership of si, etc. passed on. */ + return 0; +} + +int pci_shmem__init(struct kvm *kvm) +{ + u8 line, pin; + char *mem; + int r; + + if (shmem_region == NULL) + return 0; + + /* Register good old INTx */ + r = irq__register_device(PCI_DEVICE_ID_PCI_SHMEM, &pin, &line); + if (r < 0) + return r; + + pci_shmem_pci_device.irq_pin = pin; + pci_shmem_pci_device.irq_line = line; + + /* Register MMIO space for MSI-X */ + r = ioport__register(kvm, IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL); + if (r < 0) + return r; + ivshmem_registers = (u16)r; + + msix_block = pci_get_io_space_block(0x1010); + kvm__register_mmio(kvm, msix_block, 0x1010, false, callback_mmio_msix, NULL); + + /* + * This registers 3 BARs: + * + * 0 - ivshmem registers + * 1 - MSI-X MMIO space + * 2 - Shared memory block + */ + pci_shmem_pci_device.bar[0] = cpu_to_le32(ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO); + pci_shmem_pci_device.bar_size[0] = shmem_region->size; + pci_shmem_pci_device.bar[1] = cpu_to_le32(msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY); + pci_shmem_pci_device.bar_size[1] = 0x1010; + pci_shmem_pci_device.bar[2] = cpu_to_le32(shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY); + pci_shmem_pci_device.bar_size[2] = shmem_region->size; + + device__register(&pci_shmem_device); + + /* Open shared memory and plug it into the guest */ + mem = setup_shmem(shmem_region->handle, shmem_region->size, + 
shmem_region->create); + if (mem == NULL) + return -EINVAL; + + kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size, + mem); + return 0; +} +dev_init(pci_shmem__init); + +int pci_shmem__exit(struct kvm *kvm) +{ + return 0; +} +dev_exit(pci_shmem__exit); diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c new file mode 100644 index 000000000000..5232bd791873 --- /dev/null +++ b/tools/kvm/hw/rtc.c @@ -0,0 +1,155 @@ +#include "kvm/rtc.h" + +#include "kvm/ioport.h" +#include "kvm/kvm.h" + +#include <time.h> + +/* + * MC146818 RTC registers + */ +#define RTC_SECONDS 0x00 +#define RTC_SECONDS_ALARM 0x01 +#define RTC_MINUTES 0x02 +#define RTC_MINUTES_ALARM 0x03 +#define RTC_HOURS 0x04 +#define RTC_HOURS_ALARM 0x05 +#define RTC_DAY_OF_WEEK 0x06 +#define RTC_DAY_OF_MONTH 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 + +#define RTC_REG_A 0x0A +#define RTC_REG_B 0x0B +#define RTC_REG_C 0x0C +#define RTC_REG_D 0x0D + +struct rtc_device { + u8 cmos_idx; + u8 cmos_data[128]; +}; + +static struct rtc_device rtc; + +static inline unsigned char bin2bcd(unsigned val) +{ + return ((val / 10) << 4) + val % 10; +} + +static bool cmos_ram_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + struct tm *tm; + time_t ti; + + time(&ti); + + tm = gmtime(&ti); + + switch (rtc.cmos_idx) { + case RTC_SECONDS: + ioport__write8(data, bin2bcd(tm->tm_sec)); + break; + case RTC_MINUTES: + ioport__write8(data, bin2bcd(tm->tm_min)); + break; + case RTC_HOURS: + ioport__write8(data, bin2bcd(tm->tm_hour)); + break; + case RTC_DAY_OF_WEEK: + ioport__write8(data, bin2bcd(tm->tm_wday + 1)); + break; + case RTC_DAY_OF_MONTH: + ioport__write8(data, bin2bcd(tm->tm_mday)); + break; + case RTC_MONTH: + ioport__write8(data, bin2bcd(tm->tm_mon + 1)); + break; + case RTC_YEAR: { + int year; + + year = tm->tm_year + 1900; + + ioport__write8(data, bin2bcd(year % 100)); + + break; + } + case RTC_CENTURY: { + int year; + + year = 
tm->tm_year + 1900; + + ioport__write8(data, bin2bcd(year / 100)); + + break; + } + default: + ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]); + break; + } + + return true; +} + +static bool cmos_ram_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (rtc.cmos_idx) { + case RTC_REG_C: + case RTC_REG_D: + /* Read-only */ + break; + default: + rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data); + break; + } + + return true; +} + +static struct ioport_operations cmos_ram_data_ioport_ops = { + .io_out = cmos_ram_data_out, + .io_in = cmos_ram_data_in, +}; + +static bool cmos_ram_index_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u8 value = ioport__read8(data); + + kvm->nmi_disabled = value & (1UL << 7); + rtc.cmos_idx = value & ~(1UL << 7); + + return true; +} + +static struct ioport_operations cmos_ram_index_ioport_ops = { + .io_out = cmos_ram_index_out, +}; + +int rtc__init(struct kvm *kvm) +{ + int r = 0; + + /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */ + r = ioport__register(kvm, 0x0070, &cmos_ram_index_ioport_ops, 1, NULL); + if (r < 0) + return r; + + r = ioport__register(kvm, 0x0071, &cmos_ram_data_ioport_ops, 1, NULL); + if (r < 0) { + ioport__unregister(kvm, 0x0071); + return r; + } + + return r; +} +dev_init(rtc__init); + +int rtc__exit(struct kvm *kvm) +{ + /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */ + ioport__unregister(kvm, 0x0070); + ioport__unregister(kvm, 0x0071); + + return 0; +} +dev_exit(rtc__exit); diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c new file mode 100644 index 000000000000..53b684ab9dc3 --- /dev/null +++ b/tools/kvm/hw/serial.c @@ -0,0 +1,452 @@ +#include "kvm/8250-serial.h" + +#include "kvm/read-write.h" +#include "kvm/ioport.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/kvm.h" + +#include <linux/types.h> +#include <linux/serial_reg.h> + +#include <pthread.h> + +/* + * This fakes a 
U6_16550A. The fifo len needs to be 64 as the kernel + * expects that for autodetection. + */ +#define FIFO_LEN 64 +#define FIFO_MASK (FIFO_LEN - 1) + +#define UART_IIR_TYPE_BITS 0xc0 + +struct serial8250_device { + struct mutex mutex; + u8 id; + + u16 iobase; + u8 irq; + u8 irq_state; + int txcnt; + int rxcnt; + int rxdone; + char txbuf[FIFO_LEN]; + char rxbuf[FIFO_LEN]; + + u8 dll; + u8 dlm; + u8 iir; + u8 ier; + u8 fcr; + u8 lcr; + u8 mcr; + u8 lsr; + u8 msr; + u8 scr; +}; + +#define SERIAL_REGS_SETTING \ + .iir = UART_IIR_NO_INT, \ + .lsr = UART_LSR_TEMT | UART_LSR_THRE, \ + .msr = UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \ + .mcr = UART_MCR_OUT2, + +static struct serial8250_device devices[] = { + /* ttyS0 */ + [0] = { + .mutex = MUTEX_INITIALIZER, + + .id = 0, + .iobase = 0x3f8, + .irq = 4, + + SERIAL_REGS_SETTING + }, + /* ttyS1 */ + [1] = { + .mutex = MUTEX_INITIALIZER, + + .id = 1, + .iobase = 0x2f8, + .irq = 3, + + SERIAL_REGS_SETTING + }, + /* ttyS2 */ + [2] = { + .mutex = MUTEX_INITIALIZER, + + .id = 2, + .iobase = 0x3e8, + .irq = 4, + + SERIAL_REGS_SETTING + }, + /* ttyS3 */ + [3] = { + .mutex = MUTEX_INITIALIZER, + + .id = 3, + .iobase = 0x2e8, + .irq = 3, + + SERIAL_REGS_SETTING + }, +}; + +static void serial8250_flush_tx(struct kvm *kvm, struct serial8250_device *dev) +{ + dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE; + + if (dev->txcnt) { + if (kvm->cfg.active_console == CONSOLE_8250) + term_putc(dev->txbuf, dev->txcnt, dev->id); + dev->txcnt = 0; + } +} + +static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev) +{ + u8 iir = 0; + + /* Handle clear rx */ + if (dev->lcr & UART_FCR_CLEAR_RCVR) { + dev->lcr &= ~UART_FCR_CLEAR_RCVR; + dev->rxcnt = dev->rxdone = 0; + dev->lsr &= ~UART_LSR_DR; + } + + /* Handle clear tx */ + if (dev->lcr & UART_FCR_CLEAR_XMIT) { + dev->lcr &= ~UART_FCR_CLEAR_XMIT; + dev->txcnt = 0; + dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE; + } + + /* Data ready and rcv interrupt enabled ? 
*/ + if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR)) + iir |= UART_IIR_RDI; + + /* Transmitter empty and interrupt enabled ? */ + if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT)) + iir |= UART_IIR_THRI; + + /* Now update the irq line, if necessary */ + if (!iir) { + dev->iir = UART_IIR_NO_INT; + if (dev->irq_state) + kvm__irq_line(kvm, dev->irq, 0); + } else { + dev->iir = iir; + if (!dev->irq_state) + kvm__irq_line(kvm, dev->irq, 1); + } + dev->irq_state = iir; + + /* + * If the kernel disabled the tx interrupt, we know that there + * is nothing more to transmit, so we can reset our tx logic + * here. + */ + if (!(dev->ier & UART_IER_THRI)) + serial8250_flush_tx(kvm, dev); +} + +#define SYSRQ_PENDING_NONE 0 + +static int sysrq_pending; + +static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev) +{ + dev->lsr |= UART_LSR_DR | UART_LSR_BI; + dev->rxbuf[dev->rxcnt++] = sysrq_pending; + sysrq_pending = SYSRQ_PENDING_NONE; +} + +static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev, + bool handle_sysrq) +{ + int c; + + /* + * If the guest transmitted a full fifo, we clear the + * TEMT/THRE bits to let the kernel escape from the 8250 + * interrupt handler. We come here only once a ms, so that + * should give the kernel the desired pause. That also flushes + * the tx fifo to the terminal. 
+ */ + serial8250_flush_tx(kvm, dev); + + if (dev->mcr & UART_MCR_LOOP) + return; + + if ((dev->lsr & UART_LSR_DR) || dev->rxcnt) + return; + + if (handle_sysrq && sysrq_pending) { + serial8250__sysrq(kvm, dev); + return; + } + + if (kvm->cfg.active_console != CONSOLE_8250) + return; + + while (term_readable(dev->id) && + dev->rxcnt < FIFO_LEN) { + + c = term_getc(kvm, dev->id); + + if (c < 0) + break; + dev->rxbuf[dev->rxcnt++] = c; + dev->lsr |= UART_LSR_DR; + } +} + +void serial8250__update_consoles(struct kvm *kvm) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + mutex_lock(&dev->mutex); + + /* Restrict sysrq injection to the first port */ + serial8250__receive(kvm, dev, i == 0); + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + } +} + +void serial8250__inject_sysrq(struct kvm *kvm, char sysrq) +{ + sysrq_pending = sysrq; +} + +static struct serial8250_device *find_device(u16 port) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + if (dev->iobase == (port & ~0x7)) + return dev; + } + return NULL; +} + +static bool serial8250_out(struct ioport *ioport, struct kvm *kvm, u16 port, + void *data, int size) +{ + struct serial8250_device *dev; + u16 offset; + bool ret = true; + char *addr = data; + + dev = find_device(port); + if (!dev) + return false; + + mutex_lock(&dev->mutex); + + offset = port - dev->iobase; + + switch (offset) { + case UART_TX: + if (dev->lcr & UART_LCR_DLAB) { + dev->dll = ioport__read8(data); + break; + } + + /* Loopback mode */ + if (dev->mcr & UART_MCR_LOOP) { + if (dev->rxcnt < FIFO_LEN) { + dev->rxbuf[dev->rxcnt++] = *addr; + dev->lsr |= UART_LSR_DR; + } + break; + } + + if (dev->txcnt < FIFO_LEN) { + dev->txbuf[dev->txcnt++] = *addr; + dev->lsr &= ~UART_LSR_TEMT; + if (dev->txcnt == FIFO_LEN / 2) + dev->lsr &= ~UART_LSR_THRE; + } else { + /* Should never happpen */ + dev->lsr &= 
~(UART_LSR_TEMT | UART_LSR_THRE); + } + break; + case UART_IER: + if (!(dev->lcr & UART_LCR_DLAB)) + dev->ier = ioport__read8(data) & 0x0f; + else + dev->dlm = ioport__read8(data); + break; + case UART_FCR: + dev->fcr = ioport__read8(data); + break; + case UART_LCR: + dev->lcr = ioport__read8(data); + break; + case UART_MCR: + dev->mcr = ioport__read8(data); + break; + case UART_LSR: + /* Factory test */ + break; + case UART_MSR: + /* Not used */ + break; + case UART_SCR: + dev->scr = ioport__read8(data); + break; + default: + ret = false; + break; + } + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + + return ret; +} + +static void serial8250_rx(struct serial8250_device *dev, void *data) +{ + if (dev->rxdone == dev->rxcnt) + return; + + /* Break issued ? */ + if (dev->lsr & UART_LSR_BI) { + dev->lsr &= ~UART_LSR_BI; + ioport__write8(data, 0); + return; + } + + ioport__write8(data, dev->rxbuf[dev->rxdone++]); + if (dev->rxcnt == dev->rxdone) { + dev->lsr &= ~UART_LSR_DR; + dev->rxcnt = dev->rxdone = 0; + } +} + +static bool serial8250_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + struct serial8250_device *dev; + u16 offset; + bool ret = true; + + dev = find_device(port); + if (!dev) + return false; + + mutex_lock(&dev->mutex); + + offset = port - dev->iobase; + + switch (offset) { + case UART_RX: + if (dev->lcr & UART_LCR_DLAB) + ioport__write8(data, dev->dll); + else + serial8250_rx(dev, data); + break; + case UART_IER: + if (dev->lcr & UART_LCR_DLAB) + ioport__write8(data, dev->dlm); + else + ioport__write8(data, dev->ier); + break; + case UART_IIR: + ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS); + break; + case UART_LCR: + ioport__write8(data, dev->lcr); + break; + case UART_MCR: + ioport__write8(data, dev->mcr); + break; + case UART_LSR: + ioport__write8(data, dev->lsr); + break; + case UART_MSR: + ioport__write8(data, dev->msr); + break; + case UART_SCR: + ioport__write8(data, dev->scr); + break; + 
default: + ret = false; + break; + } + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + + return ret; +} + +static struct ioport_operations serial8250_ops = { + .io_in = serial8250_in, + .io_out = serial8250_out, +}; + +static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev) +{ + int r; + + r = ioport__register(kvm, dev->iobase, &serial8250_ops, 8, NULL); + kvm__irq_line(kvm, dev->irq, 0); + + return r; +} + +int serial8250__init(struct kvm *kvm) +{ + unsigned int i, j; + int r = 0; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + r = serial8250__device_init(kvm, dev); + if (r < 0) + goto cleanup; + } + + return r; +cleanup: + for (j = 0; j <= i; j++) { + struct serial8250_device *dev = &devices[j]; + + ioport__unregister(kvm, dev->iobase); + } + + return r; +} +dev_init(serial8250__init); + +int serial8250__exit(struct kvm *kvm) +{ + unsigned int i; + int r; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + r = ioport__unregister(kvm, dev->iobase); + if (r < 0) + return r; + } + + return 0; +} +dev_exit(serial8250__exit); diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c new file mode 100644 index 000000000000..33a675f633a0 --- /dev/null +++ b/tools/kvm/hw/vesa.c @@ -0,0 +1,95 @@ +#include "kvm/vesa.h" + +#include "kvm/devices.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/framebuffer.h" +#include "kvm/kvm-cpu.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" + +#include <linux/byteorder.h> +#include <sys/mman.h> +#include <linux/err.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <inttypes.h> +#include <unistd.h> + +static bool vesa_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static bool vesa_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int 
size) +{ + return true; +} + +static struct ioport_operations vesa_io_ops = { + .io_in = vesa_pci_io_in, + .io_out = vesa_pci_io_out, +}; + +static struct pci_device_header vesa_pci_device = { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(PCI_DEVICE_ID_VESA), + .header_type = PCI_HEADER_TYPE_NORMAL, + .revision_id = 0, + .class[2] = 0x03, + .subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET), + .subsys_id = cpu_to_le16(PCI_SUBSYSTEM_ID_VESA), + .bar[1] = cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY), + .bar_size[1] = VESA_MEM_SIZE, +}; + +static struct device_header vesa_device = { + .bus_type = DEVICE_BUS_PCI, + .data = &vesa_pci_device, +}; + +static struct framebuffer vesafb; + +struct framebuffer *vesa__init(struct kvm *kvm) +{ + u16 vesa_base_addr; + u8 line, pin; + char *mem; + int r; + + if (!kvm->cfg.vnc && !kvm->cfg.sdl) + return NULL; + + r = irq__register_device(PCI_DEVICE_ID_VESA, &pin, &line); + if (r < 0) + return ERR_PTR(r); + + r = ioport__register(kvm, IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL); + if (r < 0) + return ERR_PTR(r); + + vesa_pci_device.irq_pin = pin; + vesa_pci_device.irq_line = line; + vesa_base_addr = (u16)r; + vesa_pci_device.bar[0] = cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO); + device__register(&vesa_device); + + mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0); + if (mem == MAP_FAILED) + ERR_PTR(-errno); + + kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem); + + vesafb = (struct framebuffer) { + .width = VESA_WIDTH, + .height = VESA_HEIGHT, + .depth = VESA_BPP, + .mem = mem, + .mem_addr = VESA_MEM_ADDR, + .mem_size = VESA_MEM_SIZE, + .kvm = kvm, + }; + return fb__register(&vesafb); +} diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h new file mode 100644 index 000000000000..1a439777bb45 --- /dev/null +++ b/tools/kvm/include/asm/hweight.h @@ -0,0 +1,8 @@ +#ifndef _KVM_ASM_HWEIGHT_H_ 
+#define _KVM_ASM_HWEIGHT_H_ + +#include <linux/types.h> +unsigned int hweight32(unsigned int w); +unsigned long hweight64(__u64 w); + +#endif /* _KVM_ASM_HWEIGHT_H_ */ diff --git a/tools/kvm/include/bios/memcpy.h b/tools/kvm/include/bios/memcpy.h new file mode 100644 index 000000000000..e0210449e80f --- /dev/null +++ b/tools/kvm/include/bios/memcpy.h @@ -0,0 +1,9 @@ +#ifndef KVM_BIOS_MEMCPY_H +#define KVM_BIOS_MEMCPY_H + +#include <linux/types.h> +#include <stddef.h> + +void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len); + +#endif /* KVM_BIOS_MEMCPY_H */ diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h new file mode 100644 index 000000000000..e9545517351c --- /dev/null +++ b/tools/kvm/include/kvm/8250-serial.h @@ -0,0 +1,11 @@ +#ifndef KVM__8250_SERIAL_H +#define KVM__8250_SERIAL_H + +struct kvm; + +int serial8250__init(struct kvm *kvm); +int serial8250__exit(struct kvm *kvm); +void serial8250__update_consoles(struct kvm *kvm); +void serial8250__inject_sysrq(struct kvm *kvm, char sysrq); + +#endif /* KVM__8250_SERIAL_H */ diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h new file mode 100644 index 000000000000..212999709f1b --- /dev/null +++ b/tools/kvm/include/kvm/apic.h @@ -0,0 +1,17 @@ +#ifndef KVM_APIC_H_ +#define KVM_APIC_H_ + +#include <asm/apicdef.h> + +/* + * APIC, IOAPIC stuff + */ +#define APIC_BASE_ADDR_STEP 0x00400000 +#define IOAPIC_BASE_ADDR_STEP 0x00100000 + +#define APIC_ADDR(apic) (APIC_DEFAULT_PHYS_BASE + apic * APIC_BASE_ADDR_STEP) +#define IOAPIC_ADDR(ioapic) (IO_APIC_DEFAULT_PHYS_BASE + ioapic * IOAPIC_BASE_ADDR_STEP) + +#define KVM_APIC_VERSION 0x14 /* xAPIC */ + +#endif /* KVM_APIC_H_ */ diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h new file mode 100644 index 000000000000..29f72e0e8e0d --- /dev/null +++ b/tools/kvm/include/kvm/brlock.h @@ -0,0 +1,41 @@ +#ifndef KVM__BRLOCK_H +#define KVM__BRLOCK_H + +#include "kvm/kvm.h" 
+#include "kvm/barrier.h" + +/* + * brlock is a lock which is very cheap for reads, but very expensive + * for writes. + * This lock will be used when updates are very rare and reads are common. + * This lock is currently implemented by stopping the guest while + * performing the updates. We assume that the only threads whichread from + * the locked data are VCPU threads, and the only writer isn't a VCPU thread. + */ + +#ifndef barrier +#define barrier() __asm__ __volatile__("": : :"memory") +#endif + +#ifdef KVM_BRLOCK_DEBUG + +#include "kvm/rwsem.h" + +DECLARE_RWSEM(brlock_sem); + +#define br_read_lock(kvm) down_read(&brlock_sem); +#define br_read_unlock(kvm) up_read(&brlock_sem); + +#define br_write_lock(kvm) down_write(&brlock_sem); +#define br_write_unlock(kvm) up_write(&brlock_sem); + +#else + +#define br_read_lock(kvm) barrier() +#define br_read_unlock(kvm) barrier() + +#define br_write_lock(kvm) kvm__pause(kvm) +#define br_write_unlock(kvm) kvm__continue(kvm) +#endif + +#endif diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h new file mode 100644 index 000000000000..77ee65605070 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-balloon.h @@ -0,0 +1,9 @@ +#ifndef KVM__BALLOON_H +#define KVM__BALLOON_H + +#include <kvm/util.h> + +int kvm_cmd_balloon(int argc, const char **argv, const char *prefix); +void kvm_balloon_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h new file mode 100644 index 000000000000..efa0268402b3 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-debug.h @@ -0,0 +1,20 @@ +#ifndef KVM__DEBUG_H +#define KVM__DEBUG_H + +#include <kvm/util.h> +#include <linux/types.h> + +#define KVM_DEBUG_CMD_TYPE_DUMP (1 << 0) +#define KVM_DEBUG_CMD_TYPE_NMI (1 << 1) +#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2) + +struct debug_cmd_params { + u32 dbg_type; + u32 cpu; + char sysrq; +}; + +int kvm_cmd_debug(int argc, const char **argv, const char 
*prefix); +void kvm_debug_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h new file mode 100644 index 000000000000..2946743b689b --- /dev/null +++ b/tools/kvm/include/kvm/builtin-help.h @@ -0,0 +1,6 @@ +#ifndef __KVM_HELP_H__ +#define __KVM_HELP_H__ + +int kvm_cmd_help(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h new file mode 100644 index 000000000000..47029caa25e6 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-list.h @@ -0,0 +1,10 @@ +#ifndef KVM__LIST_H +#define KVM__LIST_H + +#include <kvm/util.h> + +int kvm_cmd_list(int argc, const char **argv, const char *prefix); +void kvm_list_help(void) NORETURN; +int get_vmstate(int sock); + +#endif diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h new file mode 100644 index 000000000000..84aaee320fbc --- /dev/null +++ b/tools/kvm/include/kvm/builtin-pause.h @@ -0,0 +1,9 @@ +#ifndef KVM__PAUSE_H +#define KVM__PAUSE_H + +#include <kvm/util.h> + +int kvm_cmd_pause(int argc, const char **argv, const char *prefix); +void kvm_pause_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h new file mode 100644 index 000000000000..7de999b2c304 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-resume.h @@ -0,0 +1,9 @@ +#ifndef KVM__RESUME_H +#define KVM__RESUME_H + +#include <kvm/util.h> + +int kvm_cmd_resume(int argc, const char **argv, const char *prefix); +void kvm_resume_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h new file mode 100644 index 000000000000..91521a58a59c --- /dev/null +++ b/tools/kvm/include/kvm/builtin-run.h @@ -0,0 +1,11 @@ +#ifndef __KVM_RUN_H__ +#define __KVM_RUN_H__ + +#include <kvm/util.h> + +int kvm_cmd_run(int argc, const char **argv, const char 
*prefix); +void kvm_run_help(void) NORETURN; + +void kvm_run_set_wrapper_sandbox(void); + +#endif diff --git a/tools/kvm/include/kvm/builtin-sandbox.h b/tools/kvm/include/kvm/builtin-sandbox.h new file mode 100644 index 000000000000..98cd6bee3f32 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-sandbox.h @@ -0,0 +1,6 @@ +#ifndef KVM__SANDBOX_H +#define KVM__SANDBOX_H + +int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h new file mode 100644 index 000000000000..4a8d7ee39425 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-setup.h @@ -0,0 +1,11 @@ +#ifndef KVM__SETUP_H +#define KVM__SETUP_H + +#include <kvm/util.h> + +int kvm_cmd_setup(int argc, const char **argv, const char *prefix); +void kvm_setup_help(void) NORETURN; +int kvm_setup_create_new(const char *guestfs_name); +void kvm_setup_resolv(const char *guestfs_name); + +#endif diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h new file mode 100644 index 000000000000..4fecb37901dd --- /dev/null +++ b/tools/kvm/include/kvm/builtin-stat.h @@ -0,0 +1,9 @@ +#ifndef KVM__STAT_H +#define KVM__STAT_H + +#include <kvm/util.h> + +int kvm_cmd_stat(int argc, const char **argv, const char *prefix); +void kvm_stat_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h new file mode 100644 index 000000000000..b26b2750a0ca --- /dev/null +++ b/tools/kvm/include/kvm/builtin-stop.h @@ -0,0 +1,9 @@ +#ifndef KVM__STOP_H +#define KVM__STOP_H + +#include <kvm/util.h> + +int kvm_cmd_stop(int argc, const char **argv, const char *prefix); +void kvm_stop_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h new file mode 100644 index 000000000000..83cac4d8c71e --- /dev/null +++ b/tools/kvm/include/kvm/builtin-version.h @@ -0,0 +1,6 @@ +#ifndef 
KVM__VERSION_H +#define KVM__VERSION_H + +int kvm_cmd_version(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h new file mode 100644 index 000000000000..2013a8351704 --- /dev/null +++ b/tools/kvm/include/kvm/compiler.h @@ -0,0 +1,10 @@ +#ifndef KVM_COMPILER_H_ +#define KVM_COMPILER_H_ + +#ifndef __compiletime_error +# define __compiletime_error(message) +#endif + +#define notrace __attribute__((no_instrument_function)) + +#endif /* KVM_COMPILER_H_ */ diff --git a/tools/kvm/include/kvm/devices.h b/tools/kvm/include/kvm/devices.h new file mode 100644 index 000000000000..c5de3de2737e --- /dev/null +++ b/tools/kvm/include/kvm/devices.h @@ -0,0 +1,27 @@ +#ifndef KVM__DEVICES_H +#define KVM__DEVICES_H + +#include <linux/rbtree.h> +#include <linux/types.h> + +enum device_bus_type { + DEVICE_BUS_PCI, + DEVICE_BUS_MMIO, + DEVICE_BUS_MAX, +}; + +struct device_header { + enum device_bus_type bus_type; + void *data; + int dev_num; + struct rb_node node; +}; + +int device__register(struct device_header *dev); +struct device_header *device__find_dev(enum device_bus_type bus_type, + u8 dev_num); + +struct device_header *device__first_dev(enum device_bus_type bus_type); +struct device_header *device__next_dev(struct device_header *dev); + +#endif /* KVM__DEVICES_H */ diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h new file mode 100644 index 000000000000..b72805242d4d --- /dev/null +++ b/tools/kvm/include/kvm/disk-image.h @@ -0,0 +1,96 @@ +#ifndef KVM__DISK_IMAGE_H +#define KVM__DISK_IMAGE_H + +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/parse-options.h" + +#include <linux/types.h> +#include <linux/fs.h> /* for BLKGETSIZE64 */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <sys/uio.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include 
<unistd.h> +#include <fcntl.h> + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1UL << SECTOR_SHIFT) + +enum { + DISK_IMAGE_REGULAR, + DISK_IMAGE_MMAP, +}; + +#define MAX_DISK_IMAGES 4 + +struct disk_image; + +struct disk_image_operations { + ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); + ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); + int (*flush)(struct disk_image *disk); + int (*close)(struct disk_image *disk); +}; + +struct disk_image_params { + const char *filename; + /* + * wwpn == World Wide Port Number + * tpgt == Target Portal Group Tag + */ + const char *wwpn; + const char *tpgt; + bool readonly; + bool direct; +}; + +struct disk_image { + int fd; + u64 size; + struct disk_image_operations *ops; + void *priv; + void *disk_req_cb_param; + void (*disk_req_cb)(void *param, long len); + bool async; + int evt; +#ifdef CONFIG_HAS_AIO + io_context_t ctx; +#endif + const char *wwpn; + const char *tpgt; + int debug_iodelay; +}; + +int disk_img_name_parser(const struct option *opt, const char *arg, int unset); +int disk_image__init(struct kvm *kvm); +int disk_image__exit(struct kvm *kvm); +struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap); +int disk_image__flush(struct disk_image *disk); +ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); +ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); +ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len); + +struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly); +struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st); + +ssize_t raw_image__read(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t 
raw_image__write(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +int raw_image__close(struct disk_image *disk); +void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len)); +#endif /* KVM__DISK_IMAGE_H */ diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h new file mode 100644 index 000000000000..15f62cc660ef --- /dev/null +++ b/tools/kvm/include/kvm/e820.h @@ -0,0 +1,13 @@ +#ifndef KVM_E820_H +#define KVM_E820_H + +#include <linux/types.h> +#include <kvm/bios.h> + +#define SMAP 0x534d4150 /* ASCII "SMAP" */ + +struct biosregs; + +extern bioscall void e820_query_map(struct biosregs *regs); + +#endif /* KVM_E820_H */ diff --git a/tools/kvm/include/kvm/fdt.h b/tools/kvm/include/kvm/fdt.h new file mode 100644 index 000000000000..19f95ac24f0f --- /dev/null +++ b/tools/kvm/include/kvm/fdt.h @@ -0,0 +1,26 @@ +#ifndef KVM__FDT_H +#define KVM__FDT_H + +#include "libfdt.h" + +#include <linux/types.h> + +#define FDT_MAX_SIZE 0x10000 + +/* Helper for the various bits of code that generate FDT nodes */ +#define _FDT(exp) \ + do { \ + int ret = (exp); \ + if (ret < 0) { \ + die("Error creating device tree: %s: %s\n", \ + #exp, fdt_strerror(ret)); \ + } \ + } while (0) + +static inline u32 fdt__alloc_phandle(void) +{ + static u32 phandle = 0; + return ++phandle; +} + +#endif /* KVM__FDT_H */ diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h new file mode 100644 index 000000000000..e3200e5b16de --- /dev/null +++ b/tools/kvm/include/kvm/framebuffer.h @@ -0,0 +1,36 @@ +#ifndef KVM__FRAMEBUFFER_H +#define KVM__FRAMEBUFFER_H + +#include <linux/types.h> +#include <linux/list.h> + +struct framebuffer; + 
+struct fb_target_operations { + int (*start)(struct framebuffer *fb); + int (*stop)(struct framebuffer *fb); +}; + +#define FB_MAX_TARGETS 2 + +struct framebuffer { + struct list_head node; + + u32 width; + u32 height; + u8 depth; + char *mem; + u64 mem_addr; + u64 mem_size; + struct kvm *kvm; + + unsigned long nr_targets; + struct fb_target_operations *targets[FB_MAX_TARGETS]; +}; + +struct framebuffer *fb__register(struct framebuffer *fb); +int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops); +int fb__init(struct kvm *kvm); +int fb__exit(struct kvm *kvm); + +#endif /* KVM__FRAMEBUFFER_H */ diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h new file mode 100644 index 000000000000..ae7abbdb8be5 --- /dev/null +++ b/tools/kvm/include/kvm/guest_compat.h @@ -0,0 +1,9 @@ +#ifndef KVM__GUEST_COMPAT_H +#define KVM__GUEST_COMPAT_H + +int compat__print_all_messages(void); +int compat__remove_message(int id); +int compat__add_message(const char *title, const char *description); + + +#endif
\ No newline at end of file diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h new file mode 100644 index 000000000000..3b4ab688b840 --- /dev/null +++ b/tools/kvm/include/kvm/i8042.h @@ -0,0 +1,12 @@ +#ifndef KVM__PCKBD_H +#define KVM__PCKBD_H + +#include <linux/types.h> + +struct kvm; + +void mouse_queue(u8 c); +void kbd_queue(u8 c); +int kbd__init(struct kvm *kvm); + +#endif diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h new file mode 100644 index 000000000000..d71fa4066eb9 --- /dev/null +++ b/tools/kvm/include/kvm/ioeventfd.h @@ -0,0 +1,28 @@ +#ifndef KVM__IOEVENTFD_H +#define KVM__IOEVENTFD_H + +#include <linux/types.h> +#include <linux/list.h> +#include <sys/eventfd.h> +#include "kvm/util.h" + +struct kvm; + +struct ioevent { + u64 io_addr; + u8 io_len; + void (*fn)(struct kvm *kvm, void *ptr); + struct kvm *fn_kvm; + void *fn_ptr; + int fd; + u64 datamatch; + + struct list_head list; +}; + +int ioeventfd__init(struct kvm *kvm); +int ioeventfd__exit(struct kvm *kvm); +int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace); +int ioeventfd__del_event(u64 addr, u64 datamatch); + +#endif diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h new file mode 100644 index 000000000000..6660acb66b96 --- /dev/null +++ b/tools/kvm/include/kvm/ioport.h @@ -0,0 +1,70 @@ +#ifndef KVM__IOPORT_H +#define KVM__IOPORT_H + +#include "kvm/rbtree-interval.h" + +#include <stdbool.h> +#include <limits.h> +#include <asm/types.h> +#include <linux/types.h> +#include <linux/byteorder.h> + +/* some ports we reserve for own use */ +#define IOPORT_DBG 0xe0 +#define IOPORT_START 0x6200 +#define IOPORT_SIZE 0x400 + +#define IOPORT_EMPTY USHRT_MAX + +struct kvm; + +struct ioport { + struct rb_int_node node; + struct ioport_operations *ops; + void *priv; +}; + +struct ioport_operations { + bool (*io_in)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size); + bool 
(*io_out)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size); +}; + +void ioport__setup_arch(struct kvm *kvm); + +int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, + int count, void *param); +int ioport__unregister(struct kvm *kvm, u16 port); +int ioport__init(struct kvm *kvm); +int ioport__exit(struct kvm *kvm); + +static inline u8 ioport__read8(u8 *data) +{ + return *data; +} +/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */ +static inline u16 ioport__read16(u16 *data) +{ + return le16_to_cpu(*data); +} + +static inline u32 ioport__read32(u32 *data) +{ + return le32_to_cpu(*data); +} + +static inline void ioport__write8(u8 *data, u8 value) +{ + *data = value; +} + +static inline void ioport__write16(u16 *data, u16 value) +{ + *data = cpu_to_le16(value); +} + +static inline void ioport__write32(u32 *data, u32 value) +{ + *data = cpu_to_le32(value); +} + +#endif /* KVM__IOPORT_H */ diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h new file mode 100644 index 000000000000..5c1274b98610 --- /dev/null +++ b/tools/kvm/include/kvm/irq.h @@ -0,0 +1,33 @@ +#ifndef KVM__IRQ_H +#define KVM__IRQ_H + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> + +#include "kvm/msi.h" + +struct kvm; + +struct irq_line { + u8 line; + struct list_head node; +}; + +struct pci_dev { + struct rb_node node; + u32 id; + u8 pin; + struct list_head lines; +}; + +int irq__register_device(u32 dev, u8 *pin, u8 *line); + +struct rb_node *irq__get_pci_tree(void); + +int irq__init(struct kvm *kvm); +int irq__exit(struct kvm *kvm); +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg); + +#endif diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h new file mode 100644 index 000000000000..0a73bce077b9 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-cmd.h @@ -0,0 +1,17 @@ +#ifndef __KVM_CMD_H__ +#define __KVM_CMD_H__ + +struct 
cmd_struct { + const char *cmd; + int (*fn)(int, const char **, const char *); + void (*help)(void); + int option; +}; + +extern struct cmd_struct kvm_commands[]; +struct cmd_struct *kvm_get_command(struct cmd_struct *command, + const char *cmd); + +int handle_command(struct cmd_struct *command, int argc, const char **argv); + +#endif diff --git a/tools/kvm/include/kvm/kvm-config.h b/tools/kvm/include/kvm/kvm-config.h new file mode 100644 index 000000000000..c66f48144bd2 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-config.h @@ -0,0 +1,61 @@ +#ifndef KVM_CONFIG_H_ +#define KVM_CONFIG_H_ + +#include "kvm/disk-image.h" +#include "kvm/kvm-config-arch.h" + +#define DEFAULT_KVM_DEV "/dev/kvm" +#define DEFAULT_CONSOLE "serial" +#define DEFAULT_NETWORK "user" +#define DEFAULT_HOST_ADDR "192.168.33.1" +#define DEFAULT_GUEST_ADDR "192.168.33.15" +#define DEFAULT_GUEST_MAC "02:15:15:15:15:15" +#define DEFAULT_HOST_MAC "02:01:01:01:01:01" +#define DEFAULT_SCRIPT "none" +#define DEFAULT_SANDBOX_FILENAME "guest/sandbox.sh" + +#define MIN_RAM_SIZE_MB (64ULL) +#define MIN_RAM_SIZE_BYTE (MIN_RAM_SIZE_MB << MB_SHIFT) + +struct kvm_config { + struct kvm_config_arch arch; + struct disk_image_params disk_image[MAX_DISK_IMAGES]; + u64 ram_size; + u8 image_count; + u8 num_net_devices; + bool virtio_rng; + int active_console; + int debug_iodelay; + int nrcpus; + const char *kernel_cmdline; + const char *kernel_filename; + const char *vmlinux_filename; + const char *initrd_filename; + const char *firmware_filename; + const char *console; + const char *dev; + const char *network; + const char *host_ip; + const char *guest_ip; + const char *guest_mac; + const char *host_mac; + const char *script; + const char *guest_name; + const char *sandbox; + const char *hugetlbfs_path; + const char *custom_rootfs_name; + const char *real_cmdline; + struct virtio_net_params *net_params; + bool single_step; + bool vnc; + bool sdl; + bool balloon; + bool using_rootfs; + bool custom_rootfs; + bool no_net; 
+ bool no_dhcp; + bool ioport_debug; + bool mmio_debug; +}; + +#endif diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h new file mode 100644 index 000000000000..0ece28c32d4b --- /dev/null +++ b/tools/kvm/include/kvm/kvm-cpu.h @@ -0,0 +1,26 @@ +#ifndef KVM__KVM_CPU_H +#define KVM__KVM_CPU_H + +#include "kvm/kvm-cpu-arch.h" +#include <stdbool.h> + +int kvm_cpu__init(struct kvm *kvm); +int kvm_cpu__exit(struct kvm *kvm); +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id); +void kvm_cpu__delete(struct kvm_cpu *vcpu); +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu); +void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu); +void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu); +void kvm_cpu__run(struct kvm_cpu *vcpu); +void kvm_cpu__reboot(struct kvm *kvm); +int kvm_cpu__start(struct kvm_cpu *cpu); +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu); + +int kvm_cpu__get_debug_fd(void); +void kvm_cpu__set_debug_fd(int fd); +void kvm_cpu__show_code(struct kvm_cpu *vcpu); +void kvm_cpu__show_registers(struct kvm_cpu *vcpu); +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu); +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu); + +#endif /* KVM__KVM_CPU_H */ diff --git a/tools/kvm/include/kvm/kvm-ipc.h b/tools/kvm/include/kvm/kvm-ipc.h new file mode 100644 index 000000000000..5494da4c52a7 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-ipc.h @@ -0,0 +1,26 @@ +#ifndef KVM__IPC_H_ +#define KVM__IPC_H_ + +#include <linux/types.h> +#include "kvm/kvm.h" + +enum { + KVM_IPC_BALLOON = 1, + KVM_IPC_DEBUG = 2, + KVM_IPC_STAT = 3, + KVM_IPC_PAUSE = 4, + KVM_IPC_RESUME = 5, + KVM_IPC_STOP = 6, + KVM_IPC_PID = 7, + KVM_IPC_VMSTATE = 8, +}; + +int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, + int fd, u32 type, u32 len, u8 *msg)); +int kvm_ipc__init(struct kvm *kvm); +int kvm_ipc__exit(struct kvm *kvm); + +int kvm_ipc__send(int fd, u32 type); +int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg); + +#endif diff --git 
a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h new file mode 100644 index 000000000000..acb08182c07b --- /dev/null +++ b/tools/kvm/include/kvm/kvm.h @@ -0,0 +1,133 @@ +#ifndef KVM__KVM_H +#define KVM__KVM_H + +#include "kvm/kvm-arch.h" +#include "kvm/kvm-config.h" +#include "kvm/util-init.h" +#include "kvm/kvm.h" + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> +#include <signal.h> +#include <sys/prctl.h> + +#define SIGKVMEXIT (SIGRTMIN + 0) +#define SIGKVMPAUSE (SIGRTMIN + 1) + +#define KVM_PID_FILE_PATH "/.lkvm/" +#define HOME_DIR getenv("HOME") +#define KVM_BINARY_NAME "lkvm" + +#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE)) + +#define DEFINE_KVM_EXT(ext) \ + .name = #ext, \ + .code = ext + +enum { + KVM_VMSTATE_RUNNING, + KVM_VMSTATE_PAUSED, +}; + +struct kvm_ext { + const char *name; + int code; +}; + +struct kvm_mem_bank { + struct list_head list; + u64 guest_phys_addr; + void *host_addr; + u64 size; +}; + +struct kvm { + struct kvm_arch arch; + struct kvm_config cfg; + int sys_fd; /* For system ioctls(), i.e. 
/dev/kvm */ + int vm_fd; /* For VM ioctls() */ + timer_t timerid; /* Posix timer for interrupts */ + + int nrcpus; /* Number of cpus to run */ + struct kvm_cpu **cpus; + + u32 mem_slots; /* for KVM_SET_USER_MEMORY_REGION */ + u64 ram_size; + void *ram_start; + u64 ram_pagesize; + struct list_head mem_banks; + + bool nmi_disabled; + + const char *vmlinux; + struct disk_image **disks; + int nr_disks; + + int vm_state; +}; + +void kvm__set_dir(const char *fmt, ...); +const char *kvm__get_dir(void); + +int kvm__init(struct kvm *kvm); +struct kvm *kvm__new(void); +int kvm__recommended_cpus(struct kvm *kvm); +int kvm__max_cpus(struct kvm *kvm); +void kvm__init_ram(struct kvm *kvm); +int kvm__exit(struct kvm *kvm); +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename); +bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename, + const char *initrd_filename, const char *kernel_cmdline); +int kvm_timer__init(struct kvm *kvm); +int kvm_timer__exit(struct kvm *kvm); +void kvm__irq_line(struct kvm *kvm, int irq, int level); +void kvm__irq_trigger(struct kvm *kvm, int irq); +bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count); +bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write); +int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr); +int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce, + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), + void *ptr); +bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr); +void kvm__pause(struct kvm *kvm); +void kvm__continue(struct kvm *kvm); +void kvm__notify_paused(void); +int kvm__get_sock_by_instance(const char *name); +int kvm__enumerate_instances(int (*callback)(const char *name, int pid)); +void kvm__remove_socket(const char *name); + +void kvm__arch_set_cmdline(char *cmdline, bool video); +void kvm__arch_init(struct kvm *kvm, const char 
*hugetlbfs_path, u64 ram_size); +void kvm__arch_delete_ram(struct kvm *kvm); +int kvm__arch_setup_firmware(struct kvm *kvm); +int kvm__arch_free_firmware(struct kvm *kvm); +bool kvm__arch_cpu_supports_vm(void); +void kvm__arch_periodic_poll(struct kvm *kvm); + +void *guest_flat_to_host(struct kvm *kvm, u64 offset); +u64 host_to_guest_flat(struct kvm *kvm, void *ptr); + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline); +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline); + +/* + * Debugging + */ +void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size); + +extern const char *kvm_exit_reasons[]; + +static inline bool host_ptr_in_ram(struct kvm *kvm, void *p) +{ + return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size); +} + +bool kvm__supports_extension(struct kvm *kvm, unsigned int extension); + +static inline void kvm__set_thread_name(const char *name) +{ + prctl(PR_SET_NAME, name); +} + +#endif /* KVM__KVM_H */ diff --git a/tools/kvm/include/kvm/msi.h b/tools/kvm/include/kvm/msi.h new file mode 100644 index 000000000000..885eb5b95ed7 --- /dev/null +++ b/tools/kvm/include/kvm/msi.h @@ -0,0 +1,10 @@ +#ifndef LKVM_MSI_H +#define LKVM_MSI_H + +struct msi_msg { + u32 address_lo; /* low 32 bits of msi message address */ + u32 address_hi; /* high 32 bits of msi message address */ + u32 data; /* 16 bits of msi message data */ +}; + +#endif /* LKVM_MSI_H */ diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h new file mode 100644 index 000000000000..a90584b9db87 --- /dev/null +++ b/tools/kvm/include/kvm/mutex.h @@ -0,0 +1,39 @@ +#ifndef KVM__MUTEX_H +#define KVM__MUTEX_H + +#include <pthread.h> + +#include "kvm/util.h" + +/* + * Kernel-alike mutex API - to make it easier for kernel developers + * to write user-space code! 
:-) + */ + +struct mutex { + pthread_mutex_t mutex; +}; +#define MUTEX_INITIALIZER (struct mutex) { .mutex = PTHREAD_MUTEX_INITIALIZER } + +#define DEFINE_MUTEX(mtx) struct mutex mtx = MUTEX_INITIALIZER + +static inline void mutex_init(struct mutex *lock) +{ + if (pthread_mutex_init(&lock->mutex, NULL) != 0) + die("unexpected pthread_mutex_init() failure!"); +} + +static inline void mutex_lock(struct mutex *lock) +{ + if (pthread_mutex_lock(&lock->mutex) != 0) + die("unexpected pthread_mutex_lock() failure!"); + +} + +static inline void mutex_unlock(struct mutex *lock) +{ + if (pthread_mutex_unlock(&lock->mutex) != 0) + die("unexpected pthread_mutex_unlock() failure!"); +} + +#endif /* KVM__MUTEX_H */ diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h new file mode 100644 index 000000000000..09a5fca71117 --- /dev/null +++ b/tools/kvm/include/kvm/parse-options.h @@ -0,0 +1,221 @@ +#ifndef __PARSE_OPTIONS_H__ +#define __PARSE_OPTIONS_H__ + +#include <inttypes.h> +#include <kvm/util.h> + +enum parse_opt_type { + /* special types */ + OPTION_END, + OPTION_ARGUMENT, + OPTION_GROUP, + /* options with no arguments */ + OPTION_BIT, + OPTION_BOOLEAN, + OPTION_INCR, + OPTION_SET_UINT, + OPTION_SET_PTR, + /* options with arguments (usually) */ + OPTION_STRING, + OPTION_INTEGER, + OPTION_LONG, + OPTION_CALLBACK, + OPTION_U64, + OPTION_UINTEGER, +}; + +enum parse_opt_flags { + PARSE_OPT_KEEP_DASHDASH = 1, + PARSE_OPT_STOP_AT_NON_OPTION = 2, + PARSE_OPT_KEEP_ARGV0 = 4, + PARSE_OPT_KEEP_UNKNOWN = 8, + PARSE_OPT_NO_INTERNAL_HELP = 16, +}; + +enum parse_opt_option_flags { + PARSE_OPT_OPTARG = 1, + PARSE_OPT_NOARG = 2, + PARSE_OPT_NONEG = 4, + PARSE_OPT_HIDDEN = 8, + PARSE_OPT_LASTARG_DEFAULT = 16, +}; + +struct option; +typedef int parse_opt_cb(const struct option *, const char *arg, int unset); +/* + * `type`:: + * holds the type of the option, you must have an OPTION_END last in your + * array. 
+ * + * `short_name`:: + * the character to use as a short option name, '\0' if none. + * + * `long_name`:: + * the long option name, without the leading dashes, NULL if none. + * + * `value`:: + * stores pointers to the values to be filled. + * + * `argh`:: + * token to explain the kind of argument this option wants. Keep it + * homogenous across the repository. + * + * `help`:: + * the short help associated to what the option does. + * Must never be NULL (except for OPTION_END). + * OPTION_GROUP uses this pointer to store the group header. + * + * `flags`:: + * mask of parse_opt_option_flags. + * PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs) + * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs + * PARSE_OPT_NONEG: says that this option cannot be negated + * PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in + * the long one. + * + * `callback`:: + * pointer to the callback to use for OPTION_CALLBACK. + * + * `defval`:: + * default value to fill (*->value) with for PARSE_OPT_OPTARG. + * OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in + * the value when met. + * CALLBACKS can use it like they want. 
+ */ +struct option { + enum parse_opt_type type; + int short_name; + const char *long_name; + void *value; + const char *argh; + const char *help; + void *ptr; + + int flags; + parse_opt_cb *callback; + intptr_t defval; +}; + +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#define check_vtype(v, type) \ + (BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v) + +#define OPT_INTEGER(s, l, v, h) \ +{ \ + .type = OPTION_INTEGER, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, int *), \ + .help = (h) \ +} + +#define OPT_U64(s, l, v, h) \ +{ \ + .type = OPTION_U64, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, u64 *), \ + .help = (h) \ +} + +#define OPT_STRING(s, l, v, a, h) \ +{ \ + .type = OPTION_STRING, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, const char **), (a), \ + .help = (h) \ +} + +#define OPT_BOOLEAN(s, l, v, h) \ +{ \ + .type = OPTION_BOOLEAN, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, bool *), \ + .help = (h) \ +} + +#define OPT_INCR(s, l, v, h) \ +{ \ + .type = OPTION_INCR, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, int *), \ + .help = (h) \ +} + +#define OPT_GROUP(h) \ +{ \ + .type = OPTION_GROUP, \ + .help = (h) \ +} + +#define OPT_CALLBACK(s, l, v, a, h, f, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), \ + (a), \ + .help = (h), \ + .callback = (f), \ + .ptr = (p), \ +} + +#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), \ + (a), \ + .help = (h), \ + .callback = (f), \ + .flags = PARSE_OPT_NOARG, \ + .ptr = (p), \ +} + +#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), (a), \ + .help = (h), \ + .callback = (f), \ + .defval = 
(intptr_t)d, \ + .flags = PARSE_OPT_LASTARG_DEFAULT, \ + .ptr = (p) \ +} + +#define OPT_END() { .type = OPTION_END } + +#define OPT_ARCH(cmd, cfg) \ + OPT_ARCH_##cmd(OPT_GROUP("Arch-specific options:"), &(cfg)->arch) + +enum { + PARSE_OPT_HELP = -1, + PARSE_OPT_DONE, + PARSE_OPT_UNKNOWN, +}; + +/* + * It's okay for the caller to consume argv/argc in the usual way. + * Other fields of that structure are private to parse-options and should not + * be modified in any way. + **/ +struct parse_opt_ctx_t { + const char **argv; + const char **out; + int argc, cpidx; + const char *opt; + int flags; +}; + +/* global functions */ +void usage_with_options(const char * const *usagestr, + const struct option *opts) NORETURN; +int parse_options(int argc, const char **argv, const struct option *options, + const char * const usagestr[], int flags); +#endif diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h new file mode 100644 index 000000000000..6cff2b85bfd3 --- /dev/null +++ b/tools/kvm/include/kvm/pci-shmem.h @@ -0,0 +1,32 @@ +#ifndef KVM__PCI_SHMEM_H +#define KVM__PCI_SHMEM_H + +#include <linux/types.h> +#include <linux/list.h> + +#include "kvm/parse-options.h" + +#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT) +#define SHMEM_DEFAULT_ADDR (0xc8000000) +#define SHMEM_DEFAULT_HANDLE "/kvm_shmem" + +struct kvm; +struct shmem_info; + +struct shmem_info { + u64 phys_addr; + u64 size; + char *handle; + int create; +}; + +int pci_shmem__init(struct kvm *kvm); +int pci_shmem__exit(struct kvm *kvm); +int pci_shmem__register_mem(struct shmem_info *si); +int shmem_parser(const struct option *opt, const char *arg, int unset); + +int pci_shmem__get_local_irqfd(struct kvm *kvm); +int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd); +int pci_shmem__remove_client(struct kvm *kvm, u32 id); + +#endif diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h new file mode 100644 index 000000000000..3da381175c8d --- /dev/null +++ 
b/tools/kvm/include/kvm/pci.h @@ -0,0 +1,93 @@ +#ifndef KVM__PCI_H +#define KVM__PCI_H + +#include <linux/types.h> +#include <linux/kvm.h> +#include <linux/pci_regs.h> +#include <endian.h> + +#include "kvm/kvm.h" +#include "kvm/msi.h" + +/* + * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1. + * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for + * details. + */ +#define PCI_CONFIG_ADDRESS 0xcf8 +#define PCI_CONFIG_DATA 0xcfc +#define PCI_CONFIG_BUS_FORWARD 0xcfa +#define PCI_IO_SIZE 0x100 + +union pci_config_address { + struct { +#if __BYTE_ORDER == __LITTLE_ENDIAN + unsigned reg_offset : 2; /* 1 .. 0 */ + unsigned register_number : 6; /* 7 .. 2 */ + unsigned function_number : 3; /* 10 .. 8 */ + unsigned device_number : 5; /* 15 .. 11 */ + unsigned bus_number : 8; /* 23 .. 16 */ + unsigned reserved : 7; /* 30 .. 24 */ + unsigned enable_bit : 1; /* 31 */ +#else + unsigned enable_bit : 1; /* 31 */ + unsigned reserved : 7; /* 30 .. 24 */ + unsigned bus_number : 8; /* 23 .. 16 */ + unsigned device_number : 5; /* 15 .. 11 */ + unsigned function_number : 3; /* 10 .. 8 */ + unsigned register_number : 6; /* 7 .. 2 */ + unsigned reg_offset : 2; /* 1 .. 
0 */ +#endif + }; + u32 w; +}; + +struct msix_table { + struct msi_msg msg; + u32 ctrl; +}; + +struct msix_cap { + u8 cap; + u8 next; + u16 ctrl; + u32 table_offset; + u32 pba_offset; +}; + +struct pci_device_header { + u16 vendor_id; + u16 device_id; + u16 command; + u16 status; + u8 revision_id; + u8 class[3]; + u8 cacheline_size; + u8 latency_timer; + u8 header_type; + u8 bist; + u32 bar[6]; + u32 card_bus; + u16 subsys_vendor_id; + u16 subsys_id; + u32 exp_rom_bar; + u8 capabilities; + u8 reserved1[3]; + u32 reserved2; + u8 irq_line; + u8 irq_pin; + u8 min_gnt; + u8 max_lat; + struct msix_cap msix; + u8 empty[136]; /* Rest of PCI config space */ + u32 bar_size[6]; +} __attribute__((packed)); + +int pci__init(struct kvm *kvm); +int pci__exit(struct kvm *kvm); +struct pci_device_header *pci__find_dev(u8 dev_num); +u32 pci_get_io_space_block(u32 size); +void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size); +void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size); + +#endif /* KVM__PCI_H */ diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h new file mode 100644 index 000000000000..f8492462ddaa --- /dev/null +++ b/tools/kvm/include/kvm/qcow.h @@ -0,0 +1,133 @@ +#ifndef KVM__QCOW_H +#define KVM__QCOW_H + +#include "kvm/mutex.h" + +#include <linux/types.h> +#include <stdbool.h> +#include <linux/rbtree.h> +#include <linux/list.h> + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW1_VERSION 1 +#define QCOW2_VERSION 2 + +#define QCOW1_OFLAG_COMPRESSED (1ULL << 63) + +#define QCOW2_OFLAG_COPIED (1ULL << 63) +#define QCOW2_OFLAG_COMPRESSED (1ULL << 62) + +#define QCOW2_OFLAGS_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED) + +#define QCOW2_OFFSET_MASK (~QCOW2_OFLAGS_MASK) + +#define MAX_CACHE_NODES 32 + +struct qcow_l2_table { + u64 offset; + struct rb_node node; + struct list_head list; + u8 dirty; + u64 table[]; +}; + +struct qcow_l1_table { + 
u32 table_size; + u64 *l1_table; + + /* Level2 caching data structures */ + struct rb_root root; + struct list_head lru_list; + int nr_cached; +}; + +#define QCOW_REFCOUNT_BLOCK_SHIFT 1 + +struct qcow_refcount_block { + u64 offset; + struct rb_node node; + struct list_head list; + u64 size; + u8 dirty; + u16 entries[]; +}; + +struct qcow_refcount_table { + u32 rf_size; + u64 *rf_table; + + /* Refcount block caching data structures */ + struct rb_root root; + struct list_head lru_list; + int nr_cached; +}; + +struct qcow_header { + u64 size; /* in bytes */ + u64 l1_table_offset; + u32 l1_size; + u8 cluster_bits; + u8 l2_bits; + u64 refcount_table_offset; + u32 refcount_table_size; +}; + +struct qcow { + struct mutex mutex; + struct qcow_header *header; + struct qcow_l1_table table; + struct qcow_refcount_table refcount_table; + int fd; + int csize_shift; + int csize_mask; + u32 version; + u64 cluster_size; + u64 cluster_offset_mask; + u64 free_clust_idx; + void *cluster_cache; + void *cluster_data; + void *copy_buff; +}; + +struct qcow1_header_disk { + u32 magic; + u32 version; + + u64 backing_file_offset; + u32 backing_file_size; + u32 mtime; + + u64 size; /* in bytes */ + + u8 cluster_bits; + u8 l2_bits; + u32 crypt_method; + + u64 l1_table_offset; +}; + +struct qcow2_header_disk { + u32 magic; + u32 version; + + u64 backing_file_offset; + u32 backing_file_size; + + u32 cluster_bits; + u64 size; /* in bytes */ + u32 crypt_method; + + u32 l1_size; + u64 l1_table_offset; + + u64 refcount_table_offset; + u32 refcount_table_clusters; + + u32 nb_snapshots; + u64 snapshots_offset; +}; + +struct disk_image *qcow_probe(int fd, bool readonly); + +#endif /* KVM__QCOW_H */ diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h new file mode 100644 index 000000000000..730eb5e8551d --- /dev/null +++ b/tools/kvm/include/kvm/rbtree-interval.h @@ -0,0 +1,30 @@ +#ifndef KVM__INTERVAL_RBTREE_H +#define KVM__INTERVAL_RBTREE_H + +#include 
<linux/rbtree.h> +#include <linux/types.h> + +#define RB_INT_INIT(l, h) \ + (struct rb_int_node){.low = l, .high = h} +#define rb_int(n) rb_entry(n, struct rb_int_node, node) + +struct rb_int_node { + struct rb_node node; + u64 low; + u64 high; +}; + +/* Return the rb_int_node interval in which 'point' is located. */ +struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point); + +/* Return the rb_int_node in which start:len is located. */ +struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high); + +int rb_int_insert(struct rb_root *root, struct rb_int_node *data); + +static inline void rb_int_erase(struct rb_root *root, struct rb_int_node *node) +{ + rb_erase(&node->node, root); +} + +#endif diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h new file mode 100644 index 000000000000..67571f9671c7 --- /dev/null +++ b/tools/kvm/include/kvm/read-write.h @@ -0,0 +1,43 @@ +#ifndef KVM_READ_WRITE_H +#define KVM_READ_WRITE_H + +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> + +#ifdef CONFIG_HAS_AIO +#include <libaio.h> +#endif + +ssize_t xread(int fd, void *buf, size_t count); +ssize_t xwrite(int fd, const void *buf, size_t count); + +ssize_t read_in_full(int fd, void *buf, size_t count); +ssize_t write_in_full(int fd, const void *buf, size_t count); + +ssize_t xpread(int fd, void *buf, size_t count, off_t offset); +ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset); + +ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset); + +ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt); +ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt); + +ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt); +ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt); + +ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset); +ssize_t 
xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset); + +ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset); +ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset); + +#ifdef CONFIG_HAS_AIO +int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param); +int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param); +#endif + +#endif /* KVM_READ_WRITE_H */ diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h new file mode 100644 index 000000000000..6aa929913c6a --- /dev/null +++ b/tools/kvm/include/kvm/rtc.h @@ -0,0 +1,9 @@ +#ifndef KVM__RTC_H +#define KVM__RTC_H + +struct kvm; + +int rtc__init(struct kvm *kvm); +int rtc__exit(struct kvm *kvm); + +#endif /* KVM__RTC_H */ diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h new file mode 100644 index 000000000000..75a22f835d20 --- /dev/null +++ b/tools/kvm/include/kvm/rwsem.h @@ -0,0 +1,39 @@ +#ifndef KVM__RWSEM_H +#define KVM__RWSEM_H + +#include <pthread.h> + +#include "kvm/util.h" + +/* + * Kernel-alike rwsem API - to make it easier for kernel developers + * to write user-space code! 
:-) + */ + +#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER + +static inline void down_read(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_rdlock(rwsem) != 0) + die("unexpected pthread_rwlock_rdlock() failure!"); +} + +static inline void down_write(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_wrlock(rwsem) != 0) + die("unexpected pthread_rwlock_wrlock() failure!"); +} + +static inline void up_read(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_unlock(rwsem) != 0) + die("unexpected pthread_rwlock_unlock() failure!"); +} + +static inline void up_write(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_unlock(rwsem) != 0) + die("unexpected pthread_rwlock_unlock() failure!"); +} + +#endif /* KVM__RWSEM_H */ diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h new file mode 100644 index 000000000000..2f0c213e3dba --- /dev/null +++ b/tools/kvm/include/kvm/sdl.h @@ -0,0 +1,28 @@ +#ifndef KVM__SDL_H +#define KVM__SDL_H + +#include "kvm/util.h" + +struct framebuffer; + +#ifdef CONFIG_HAS_SDL +int sdl__init(struct kvm *kvm); +int sdl__exit(struct kvm *kvm); +#else +static inline int sdl__init(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + die("SDL support not compiled in. (install the SDL-dev[el] package)"); + + return 0; +} +static inline int sdl__exit(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + die("SDL support not compiled in. 
(install the SDL-dev[el] package)"); + + return 0; +} +#endif + +#endif /* KVM__SDL_H */ diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h new file mode 100644 index 000000000000..9387a820f137 --- /dev/null +++ b/tools/kvm/include/kvm/segment.h @@ -0,0 +1,21 @@ +#ifndef KVM_SEGMENT_H +#define KVM_SEGMENT_H + +#include <linux/types.h> + +static inline u32 segment_to_flat(u16 selector, u16 offset) +{ + return ((u32)selector << 4) + (u32) offset; +} + +static inline u16 flat_to_seg16(u32 address) +{ + return address >> 4; +} + +static inline u16 flat_to_off16(u32 address, u32 segment) +{ + return address - (segment << 4); +} + +#endif /* KVM_SEGMENT_H */ diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h new file mode 100644 index 000000000000..2beefbc3f3fd --- /dev/null +++ b/tools/kvm/include/kvm/strbuf.h @@ -0,0 +1,20 @@ +#ifndef __STRBUF_H__ +#define __STRBUF_H__ + +#include <sys/types.h> +#include <string.h> + +int prefixcmp(const char *str, const char *prefix); + +extern size_t strlcat(char *dest, const char *src, size_t count); +extern size_t strlcpy(char *dest, const char *src, size_t size); + +/* some inline functions */ + +static inline const char *skip_prefix(const char *str, const char *prefix) +{ + size_t len = strlen(prefix); + return strncmp(str, prefix, len) ? 
NULL : str + len; +} + +#endif diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h new file mode 100644 index 000000000000..725bbaf8fa23 --- /dev/null +++ b/tools/kvm/include/kvm/symbol.h @@ -0,0 +1,30 @@ +#ifndef KVM__SYMBOL_H +#define KVM__SYMBOL_H + +#include <stddef.h> +#include <string.h> + +struct kvm; + +#define SYMBOL_DEFAULT_UNKNOWN "<unknown>" + +#ifdef CONFIG_HAS_BFD + +int symbol_init(struct kvm *kvm); +int symbol_exit(struct kvm *kvm); +char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size); + +#else + +static inline int symbol_init(struct kvm *kvm) { return 0; } +static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size) +{ + char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size); + sym[size - 1] = '\0'; + return s; +} +static inline int symbol_exit(struct kvm *kvm) { return 0; } + +#endif + +#endif /* KVM__SYMBOL_H */ diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h new file mode 100644 index 000000000000..5f6345719656 --- /dev/null +++ b/tools/kvm/include/kvm/term.h @@ -0,0 +1,24 @@ +#ifndef KVM__TERM_H +#define KVM__TERM_H + +#include "kvm/kvm.h" + +#include <sys/uio.h> +#include <stdbool.h> + +#define CONSOLE_8250 1 +#define CONSOLE_VIRTIO 2 +#define CONSOLE_HV 3 + +int term_putc_iov(struct iovec *iov, int iovcnt, int term); +int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term); +int term_putc(char *addr, int cnt, int term); +int term_getc(struct kvm *kvm, int term); + +bool term_readable(int term); +void term_set_tty(int term); +int term_init(struct kvm *kvm); +int term_exit(struct kvm *kvm); +int tty_parser(const struct option *opt, const char *arg, int unset); + +#endif /* KVM__TERM_H */ diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h new file mode 100644 index 000000000000..bacb2434e6f1 --- /dev/null +++ b/tools/kvm/include/kvm/threadpool.h @@ -0,0 +1,38 @@ +#ifndef 
KVM__THREADPOOL_H +#define KVM__THREADPOOL_H + +#include "kvm/mutex.h" + +#include <linux/list.h> + +struct kvm; + +typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data); + +struct thread_pool__job { + kvm_thread_callback_fn_t callback; + struct kvm *kvm; + void *data; + + int signalcount; + struct mutex mutex; + + struct list_head queue; +}; + +static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data) +{ + *job = (struct thread_pool__job) { + .kvm = kvm, + .callback = callback, + .data = data, + .mutex = MUTEX_INITIALIZER, + }; +} + +int thread_pool__init(struct kvm *kvm); +int thread_pool__exit(struct kvm *kvm); + +void thread_pool__do_job(struct thread_pool__job *job); + +#endif diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h new file mode 100644 index 000000000000..0cbc5fbc8549 --- /dev/null +++ b/tools/kvm/include/kvm/types.h @@ -0,0 +1,7 @@ +#ifndef KVM_TYPES_H +#define KVM_TYPES_H + +/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complains */ +#define __be16 u16 + +#endif /* KVM_TYPES_H */ diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h new file mode 100644 index 000000000000..ac248d2b7757 --- /dev/null +++ b/tools/kvm/include/kvm/uip.h @@ -0,0 +1,360 @@ +#ifndef KVM__UIP_H +#define KVM__UIP_H + +#include "linux/types.h" +#include "kvm/mutex.h" + +#include <netinet/in.h> +#include <sys/uio.h> + +#define UIP_BUF_STATUS_FREE 0 +#define UIP_BUF_STATUS_INUSE 1 +#define UIP_BUF_STATUS_USED 2 + +#define UIP_ETH_P_IP 0X0800 +#define UIP_ETH_P_ARP 0X0806 + +#define UIP_IP_VER_4 0X40 +#define UIP_IP_HDR_LEN 0X05 +#define UIP_IP_TTL 0X40 +#define UIP_IP_P_UDP 0X11 +#define UIP_IP_P_TCP 0X06 +#define UIP_IP_P_ICMP 0X01 + +#define UIP_TCP_HDR_LEN 0x50 +#define UIP_TCP_WIN_SIZE 14600 +#define UIP_TCP_FLAG_FIN 1 +#define UIP_TCP_FLAG_SYN 2 +#define UIP_TCP_FLAG_RST 4 +#define UIP_TCP_FLAG_PSH 8 +#define 
UIP_TCP_FLAG_ACK 16 +#define UIP_TCP_FLAG_URG 32 + +#define UIP_BOOTP_VENDOR_SPECIFIC_LEN 64 +#define UIP_BOOTP_MAX_PAYLOAD_LEN 300 +#define UIP_DHCP_VENDOR_SPECIFIC_LEN 312 +#define UIP_DHCP_PORT_SERVER 67 +#define UIP_DHCP_PORT_CLIENT 68 +#define UIP_DHCP_MACPAD_LEN 10 +#define UIP_DHCP_HOSTNAME_LEN 64 +#define UIP_DHCP_FILENAME_LEN 128 +#define UIP_DHCP_MAGIC_COOKIE 0x63825363 +#define UIP_DHCP_MAGIC_COOKIE_LEN 4 +#define UIP_DHCP_LEASE_TIME 0x00003840 +#define UIP_DHCP_MAX_PAYLOAD_LEN (UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN + UIP_DHCP_VENDOR_SPECIFIC_LEN) +#define UIP_DHCP_OPTION_LEN (UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN) +#define UIP_DHCP_DISCOVER 1 +#define UIP_DHCP_OFFER 2 +#define UIP_DHCP_REQUEST 3 +#define UIP_DHCP_ACK 5 +#define UIP_DHCP_MAX_DNS_SERVER_NR 3 +#define UIP_DHCP_MAX_DOMAIN_NAME_LEN 256 +#define UIP_DHCP_TAG_MSG_TYPE 53 +#define UIP_DHCP_TAG_MSG_TYPE_LEN 1 +#define UIP_DHCP_TAG_SERVER_ID 54 +#define UIP_DHCP_TAG_SERVER_ID_LEN 4 +#define UIP_DHCP_TAG_LEASE_TIME 51 +#define UIP_DHCP_TAG_LEASE_TIME_LEN 4 +#define UIP_DHCP_TAG_SUBMASK 1 +#define UIP_DHCP_TAG_SUBMASK_LEN 4 +#define UIP_DHCP_TAG_ROUTER 3 +#define UIP_DHCP_TAG_ROUTER_LEN 4 +#define UIP_DHCP_TAG_ROOT 17 +#define UIP_DHCP_TAG_ROOT_LEN 4 +#define UIP_DHCP_TAG_DNS_SERVER 6 +#define UIP_DHCP_TAG_DNS_SERVER_LEN 4 +#define UIP_DHCP_TAG_DOMAIN_NAME 15 +#define UIP_DHCP_TAG_END 255 + +/* + * IP package maxium len == 64 KBytes + * IP header == 20 Bytes + * TCP header == 20 Bytes + * UDP header == 8 Bytes + */ +#define UIP_MAX_TCP_PAYLOAD (64*1024 - 20 - 20 - 1) +#define UIP_MAX_UDP_PAYLOAD (64*1024 - 20 - 8 - 1) + +struct uip_eth_addr { + u8 addr[6]; +}; + +struct uip_eth { + struct uip_eth_addr dst; + struct uip_eth_addr src; + u16 type; +} __attribute__((packed)); + +struct uip_arp { + struct uip_eth eth; + u16 hwtype; + u16 proto; + u8 hwlen; + u8 protolen; + u16 op; + struct uip_eth_addr smac; + u32 sip; + struct uip_eth_addr dmac; + u32 dip; +} 
__attribute__((packed)); + +struct uip_ip { + struct uip_eth eth; + u8 vhl; + u8 tos; + /* + * len = IP hdr + IP payload + */ + u16 len; + u16 id; + u16 flgfrag; + u8 ttl; + u8 proto; + u16 csum; + u32 sip; + u32 dip; +} __attribute__((packed)); + +struct uip_icmp { + struct uip_ip ip; + u8 type; + u8 code; + u16 csum; + u16 id; + u16 seq; +} __attribute__((packed)); + +struct uip_udp { + /* + * FIXME: IP Options (IP hdr len > 20 bytes) are not supported + */ + struct uip_ip ip; + u16 sport; + u16 dport; + /* + * len = UDP hdr + UDP payload + */ + u16 len; + u16 csum; + u8 payload[0]; +} __attribute__((packed)); + +struct uip_tcp { + /* + * FIXME: IP Options (IP hdr len > 20 bytes) are not supported + */ + struct uip_ip ip; + u16 sport; + u16 dport; + u32 seq; + u32 ack; + u8 off; + u8 flg; + u16 win; + u16 csum; + u16 urgent; +} __attribute__((packed)); + +struct uip_pseudo_hdr { + u32 sip; + u32 dip; + u8 zero; + u8 proto; + u16 len; +} __attribute__((packed)); + +struct uip_dhcp { + struct uip_udp udp; + u8 msg_type; + u8 hardware_type; + u8 hardware_len; + u8 hops; + u32 id; + u16 time; + u16 flg; + u32 client_ip; + u32 your_ip; + u32 server_ip; + u32 agent_ip; + struct uip_eth_addr client_mac; + u8 pad[UIP_DHCP_MACPAD_LEN]; + u8 server_hostname[UIP_DHCP_HOSTNAME_LEN]; + u8 boot_filename[UIP_DHCP_FILENAME_LEN]; + u32 magic_cookie; + u8 option[UIP_DHCP_OPTION_LEN]; +} __attribute__((packed)); + +struct uip_info { + struct list_head udp_socket_head; + struct list_head tcp_socket_head; + struct mutex udp_socket_lock; + struct mutex tcp_socket_lock; + struct uip_eth_addr guest_mac; + struct uip_eth_addr host_mac; + pthread_cond_t buf_free_cond; + pthread_cond_t buf_used_cond; + struct list_head buf_head; + struct mutex buf_lock; + pthread_t udp_thread; + int udp_epollfd; + int buf_free_nr; + int buf_used_nr; + u32 guest_ip; + u32 guest_netmask; + u32 host_ip; + u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR]; + char *domain_name; + u32 buf_nr; +}; + +struct uip_buf { + 
struct list_head list; + struct uip_info *info; + int vnet_len; + int eth_len; + int status; + char *vnet; + char *eth; + int id; +}; + +struct uip_udp_socket { + struct sockaddr_in addr; + struct list_head list; + struct mutex *lock; + u32 dport, sport; + u32 dip, sip; + int fd; +}; + +struct uip_tcp_socket { + struct sockaddr_in addr; + struct list_head list; + struct uip_info *info; + pthread_cond_t cond; + struct mutex *lock; + pthread_t thread; + u32 dport, sport; + u32 guest_acked; + u16 window_size; + /* + * Initial Sequence Number + */ + u32 isn_server; + u32 isn_guest; + u32 ack_server; + u32 seq_server; + int write_done; + int read_done; + u32 dip, sip; + u8 *payload; + int fd; +}; + +struct uip_tx_arg { + struct virtio_net_hdr *vnet; + struct uip_info *info; + struct uip_eth *eth; + int vnet_len; + int eth_len; +}; + +static inline u16 uip_ip_hdrlen(struct uip_ip *ip) +{ + return (ip->vhl & 0x0f) * 4; +} + +static inline u16 uip_ip_len(struct uip_ip *ip) +{ + return htons(ip->len); +} + +static inline u16 uip_udp_hdrlen(struct uip_udp *udp) +{ + return 8; +} + +static inline u16 uip_udp_len(struct uip_udp *udp) +{ + return ntohs(udp->len); +} + +static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp) +{ + return (tcp->off >> 4) * 4; +} + +static inline u16 uip_tcp_len(struct uip_tcp *tcp) +{ + struct uip_ip *ip; + + ip = &tcp->ip; + + return uip_ip_len(ip) - uip_ip_hdrlen(ip); +} + +static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp) +{ + return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp); +} + +static inline u8 *uip_tcp_payload(struct uip_tcp *tcp) +{ + return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp); +} + +static inline bool uip_tcp_is_syn(struct uip_tcp *tcp) +{ + return (tcp->flg & UIP_TCP_FLAG_SYN) != 0; +} + +static inline bool uip_tcp_is_fin(struct uip_tcp *tcp) +{ + return (tcp->flg & UIP_TCP_FLAG_FIN) != 0; +} + +static inline u32 uip_tcp_isn(struct uip_tcp *tcp) +{ + return ntohl(tcp->seq); +} + +static inline u32 uip_tcp_isn_alloc(void) +{ + /* 
+ * FIXME: should increase every 4ms + */ + return 10000000; +} + +static inline u16 uip_eth_hdrlen(struct uip_eth *eth) +{ + return sizeof(*eth); +} + +int uip_tx(struct iovec *iov, u16 out, struct uip_info *info); +int uip_rx(struct iovec *iov, u16 in, struct uip_info *info); +int uip_init(struct uip_info *info); + +int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4(struct uip_tx_arg *arg); +int uip_tx_do_arp(struct uip_tx_arg *arg); + +u16 uip_csum_icmp(struct uip_icmp *icmp); +u16 uip_csum_udp(struct uip_udp *udp); +u16 uip_csum_tcp(struct uip_tcp *tcp); +u16 uip_csum_ip(struct uip_ip *ip); + +struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf); +struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf); +struct uip_buf *uip_buf_get_used(struct uip_info *info); +struct uip_buf *uip_buf_get_free(struct uip_info *info); +struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg); + +int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len); +bool uip_udp_is_dhcp(struct uip_udp *udp); + +int uip_dhcp_get_dns(struct uip_info *info); +#endif /* KVM__UIP_H */ diff --git a/tools/kvm/include/kvm/util-init.h b/tools/kvm/include/kvm/util-init.h new file mode 100644 index 000000000000..13d4f04df678 --- /dev/null +++ b/tools/kvm/include/kvm/util-init.h @@ -0,0 +1,51 @@ +#ifndef KVM__UTIL_INIT_H +#define KVM__UTIL_INIT_H + +struct kvm; + +struct init_item { + struct hlist_node n; + const char *fn_name; + int (*init)(struct kvm *); +}; + +int init_list__init(struct kvm *kvm); +int init_list__exit(struct kvm *kvm); + +int init_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name); +int exit_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const 
char *name); + +#define __init_list_add(cb, l) \ +static void __attribute__ ((constructor)) __init__##cb(void) \ +{ \ + static char name[] = #cb; \ + static struct init_item t; \ + init_list_add(&t, cb, l, name); \ +} + +#define __exit_list_add(cb, l) \ +static void __attribute__ ((constructor)) __init__##cb(void) \ +{ \ + static char name[] = #cb; \ + static struct init_item t; \ + exit_list_add(&t, cb, l, name); \ +} + +#define core_init(cb) __init_list_add(cb, 0) +#define base_init(cb) __init_list_add(cb, 2) +#define dev_base_init(cb) __init_list_add(cb, 4) +#define dev_init(cb) __init_list_add(cb, 5) +#define virtio_dev_init(cb) __init_list_add(cb, 6) +#define firmware_init(cb) __init_list_add(cb, 7) +#define late_init(cb) __init_list_add(cb, 9) + +#define core_exit(cb) __exit_list_add(cb, 0) +#define base_exit(cb) __exit_list_add(cb, 2) +#define dev_base_exit(cb) __exit_list_add(cb, 4) +#define dev_exit(cb) __exit_list_add(cb, 5) +#define virtio_dev_exit(cb) __exit_list_add(cb, 6) +#define firmware_exit(cb) __exit_list_add(cb, 7) +#define late_exit(cb) __exit_list_add(cb, 9) +#endif diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h new file mode 100644 index 000000000000..0df9f0dfdb43 --- /dev/null +++ b/tools/kvm/include/kvm/util.h @@ -0,0 +1,97 @@ +#include <linux/stringify.h> + +#ifndef KVM__UTIL_H +#define KVM__UTIL_H + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * Some bits are stolen from perf tool :) + */ + +#include <unistd.h> +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <stdbool.h> +#include <signal.h> +#include <errno.h> +#include <limits.h> +#include <sys/param.h> +#include <sys/types.h> +#include <linux/types.h> + +#ifdef __GNUC__ +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#ifndef __attribute__ +#define __attribute__(x) +#endif +#endif + +extern bool do_debug_print; + +#define PROT_RW (PROT_READ|PROT_WRITE) 
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE) + +extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2))); +extern void die_perror(const char *s) NORETURN; +extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN); + +#define pr_debug(fmt, ...) \ + do { \ + if (do_debug_print) \ + pr_info("(%s) %s:%d: " fmt, __FILE__, \ + __func__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#ifndef BUG_ON_HANDLER +# define BUG_ON_HANDLER(condition) \ + do { \ + if ((condition)) { \ + pr_err("BUG at %s:%d", __FILE__, __LINE__); \ + raise(SIGABRT); \ + } \ + } while (0) +#endif + +#define BUG_ON(condition) BUG_ON_HANDLER((condition)) + +#define DIE_IF(cnd) \ +do { \ + if (cnd) \ + die(" at (" __FILE__ ":" __stringify(__LINE__) "): " \ + __stringify(cnd) "\n"); \ +} while (0) + +#define WARN_ON(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (__ret_warn_on) \ + pr_warning("(%s) %s:%d: failed condition: %s", \ + __FILE__, __func__, __LINE__, \ + __stringify(condition)); \ + __ret_warn_on; \ +}) + +#define MSECS_TO_USECS(s) ((s) * 1000) + +/* Millisecond sleep */ +static inline void msleep(unsigned int msecs) +{ + usleep(MSECS_TO_USECS(msecs)); +} + +struct kvm; +void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size); +void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size); + +#endif /* KVM__UTIL_H */ diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h new file mode 100644 index 000000000000..ac041d9d34e3 --- /dev/null +++ b/tools/kvm/include/kvm/vesa.h @@ -0,0 +1,18 @@ +#ifndef KVM__VESA_H 
+#define KVM__VESA_H + +#include <linux/types.h> + +#define VESA_WIDTH 640 +#define VESA_HEIGHT 480 + +#define VESA_MEM_ADDR 0xd0000000 +#define VESA_MEM_SIZE (4*VESA_WIDTH*VESA_HEIGHT) +#define VESA_BPP 32 + +struct kvm; +struct biosregs; + +struct framebuffer *vesa__init(struct kvm *self); + +#endif diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h new file mode 100644 index 000000000000..19ffe505a74c --- /dev/null +++ b/tools/kvm/include/kvm/virtio-9p.h @@ -0,0 +1,76 @@ +#ifndef KVM__VIRTIO_9P_H +#define KVM__VIRTIO_9P_H +#include "kvm/virtio.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/parse-options.h" + +#include <sys/types.h> +#include <dirent.h> +#include <linux/list.h> +#include <linux/rbtree.h> + +#define NUM_VIRT_QUEUES 1 +#define VIRTQUEUE_NUM 128 +#define VIRTIO_9P_DEFAULT_TAG "kvm_9p" +#define VIRTIO_9P_HDR_LEN (sizeof(u32)+sizeof(u8)+sizeof(u16)) +#define VIRTIO_9P_VERSION_DOTL "9P2000.L" +#define MAX_TAG_LEN 32 + +struct p9_msg { + u32 size; + u8 cmd; + u16 tag; + u8 msg[0]; +} __attribute__((packed)); + +struct p9_fid { + u32 fid; + u32 uid; + char abs_path[PATH_MAX]; + char *path; + DIR *dir; + int fd; + struct rb_node node; +}; + +struct p9_dev_job { + struct virt_queue *vq; + struct p9_dev *p9dev; + struct thread_pool__job job_id; +}; + +struct p9_dev { + struct list_head list; + struct virtio_device vdev; + struct rb_root fids; + + struct virtio_9p_config *config; + u32 features; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct p9_dev_job jobs[NUM_VIRT_QUEUES]; + char root_dir[PATH_MAX]; +}; + +struct p9_pdu { + u32 queue_head; + size_t read_offset; + size_t write_offset; + u16 out_iov_cnt; + u16 in_iov_cnt; + struct iovec in_iov[VIRTQUEUE_NUM]; + struct iovec out_iov[VIRTQUEUE_NUM]; +}; + +struct kvm; + +int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset); +int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int 
unset); +int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name); +int virtio_9p__init(struct kvm *kvm); +int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...); +int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...); + +#endif diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h new file mode 100644 index 000000000000..844a1bab7e41 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-balloon.h @@ -0,0 +1,9 @@ +#ifndef KVM__BLN_VIRTIO_H +#define KVM__BLN_VIRTIO_H + +struct kvm; + +int virtio_bln__init(struct kvm *kvm); +int virtio_bln__exit(struct kvm *kvm); + +#endif /* KVM__BLN_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h new file mode 100644 index 000000000000..12e59b6b21fa --- /dev/null +++ b/tools/kvm/include/kvm/virtio-blk.h @@ -0,0 +1,12 @@ +#ifndef KVM__BLK_VIRTIO_H +#define KVM__BLK_VIRTIO_H + +#include "kvm/disk-image.h" + +struct kvm; + +int virtio_blk__init(struct kvm *kvm); +int virtio_blk__exit(struct kvm *kvm); +void virtio_blk_complete(void *param, long len); + +#endif /* KVM__BLK_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h new file mode 100644 index 000000000000..89809208786b --- /dev/null +++ b/tools/kvm/include/kvm/virtio-console.h @@ -0,0 +1,10 @@ +#ifndef KVM__CONSOLE_VIRTIO_H +#define KVM__CONSOLE_VIRTIO_H + +struct kvm; + +int virtio_console__init(struct kvm *kvm); +void virtio_console__inject_interrupt(struct kvm *kvm); +int virtio_console__exit(struct kvm *kvm); + +#endif /* KVM__CONSOLE_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-mmio.h b/tools/kvm/include/kvm/virtio-mmio.h new file mode 100644 index 000000000000..983c8fc1eed9 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-mmio.h @@ -0,0 +1,59 @@ +#ifndef KVM__VIRTIO_MMIO_H +#define KVM__VIRTIO_MMIO_H + +#include <linux/types.h> +#include <linux/virtio_mmio.h> + +#define 
VIRTIO_MMIO_MAX_VQ 3 +#define VIRTIO_MMIO_MAX_CONFIG 1 +#define VIRTIO_MMIO_IO_SIZE 0x200 + +struct kvm; + +struct virtio_mmio_ioevent_param { + struct virtio_device *vdev; + u32 vq; +}; + +struct virtio_mmio_hdr { + char magic[4]; + u32 version; + u32 device_id; + u32 vendor_id; + u32 host_features; + u32 host_features_sel; + u32 reserved_1[2]; + u32 guest_features; + u32 guest_features_sel; + u32 guest_page_size; + u32 reserved_2; + u32 queue_sel; + u32 queue_num_max; + u32 queue_num; + u32 queue_align; + u32 queue_pfn; + u32 reserved_3[3]; + u32 queue_notify; + u32 reserved_4[3]; + u32 interrupt_state; + u32 interrupt_ack; + u32 reserved_5[2]; + u32 status; +} __attribute__((packed)); + +struct virtio_mmio { + u32 addr; + void *dev; + struct kvm *kvm; + u8 irq; + struct virtio_mmio_hdr hdr; + struct device_header dev_hdr; + struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ]; +}; + +int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq); +int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev); +int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev); +int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); +#endif diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h new file mode 100644 index 000000000000..db43d9874796 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-net.h @@ -0,0 +1,30 @@ +#ifndef KVM__VIRTIO_NET_H +#define KVM__VIRTIO_NET_H + +#include "kvm/parse-options.h" + +struct kvm; + +struct virtio_net_params { + const char *guest_ip; + const char *host_ip; + const char *script; + const char *trans; + char guest_mac[6]; + char host_mac[6]; + struct kvm *kvm; + int mode; + int vhost; + int fd; +}; + +int virtio_net__init(struct kvm *kvm); +int virtio_net__exit(struct kvm *kvm); +int netdev_parser(const struct option *opt, const char *arg, int unset); + +enum { + NET_MODE_USER, + NET_MODE_TAP 
+}; + +#endif /* KVM__VIRTIO_NET_H */ diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h new file mode 100644 index 000000000000..48ae018e43e3 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-pci-dev.h @@ -0,0 +1,38 @@ +#ifndef VIRTIO_PCI_DEV_H_ +#define VIRTIO_PCI_DEV_H_ + +#include <linux/virtio_ids.h> + +/* + * Virtio PCI device constants and resources + * they do use (such as irqs and pins). + */ + +#define PCI_DEVICE_ID_VIRTIO_NET 0x1000 +#define PCI_DEVICE_ID_VIRTIO_BLK 0x1001 +#define PCI_DEVICE_ID_VIRTIO_CONSOLE 0x1003 +#define PCI_DEVICE_ID_VIRTIO_RNG 0x1004 +#define PCI_DEVICE_ID_VIRTIO_BLN 0x1005 +#define PCI_DEVICE_ID_VIRTIO_SCSI 0x1008 +#define PCI_DEVICE_ID_VIRTIO_9P 0x1009 +#define PCI_DEVICE_ID_VESA 0x2000 +#define PCI_DEVICE_ID_PCI_SHMEM 0x0001 + +#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4 +#define PCI_VENDOR_ID_PCI_SHMEM 0x0001 +#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET 0x1af4 + +#define PCI_SUBSYSTEM_ID_VESA 0x0004 +#define PCI_SUBSYSTEM_ID_PCI_SHMEM 0x0001 + +#define PCI_CLASS_BLK 0x018000 +#define PCI_CLASS_NET 0x020000 +#define PCI_CLASS_CONSOLE 0x078000 +/* + * 0xFF Device does not fit in any defined classes + */ +#define PCI_CLASS_RNG 0xff0000 +#define PCI_CLASS_BLN 0xff0000 +#define PCI_CLASS_9P 0xff0000 + +#endif /* VIRTIO_PCI_DEV_H_ */ diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h new file mode 100644 index 000000000000..6d9a55868df3 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-pci.h @@ -0,0 +1,51 @@ +#ifndef KVM__VIRTIO_PCI_H +#define KVM__VIRTIO_PCI_H + +#include "kvm/devices.h" +#include "kvm/pci.h" + +#include <linux/types.h> + +#define VIRTIO_PCI_MAX_VQ 3 +#define VIRTIO_PCI_MAX_CONFIG 1 + +struct kvm; + +struct virtio_pci_ioevent_param { + struct virtio_device *vdev; + u32 vq; +}; + +#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0) + +struct virtio_pci { + struct pci_device_header pci_hdr; + struct device_header dev_hdr; + void *dev; + + u16 
base_addr; + u8 status; + u8 isr; + u32 features; + + /* MSI-X */ + u16 config_vector; + u32 config_gsi; + u32 vq_vector[VIRTIO_PCI_MAX_VQ]; + u32 gsis[VIRTIO_PCI_MAX_VQ]; + u32 msix_io_block; + u64 msix_pba; + struct msix_table msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG]; + + /* virtio queue */ + u16 queue_selector; + struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ]; +}; + +int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq); +int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev); +int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev); +int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); + +#endif diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h new file mode 100644 index 000000000000..b585b372cd49 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-rng.h @@ -0,0 +1,9 @@ +#ifndef KVM__RNG_VIRTIO_H +#define KVM__RNG_VIRTIO_H + +struct kvm; + +int virtio_rng__init(struct kvm *kvm); +int virtio_rng__exit(struct kvm *kvm); + +#endif /* KVM__RNG_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-scsi.h b/tools/kvm/include/kvm/virtio-scsi.h new file mode 100644 index 000000000000..a780d7eee790 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-scsi.h @@ -0,0 +1,26 @@ +#ifndef KVM__SCSI_VIRTIO_H +#define KVM__SCSI_VIRTIO_H + +#include "kvm/disk-image.h" + +struct kvm; + +int virtio_scsi_init(struct kvm *kvm); +int virtio_scsi_exit(struct kvm *kvm); + +/*----------------------------------------------------*/ +/* TODO: Remove this when tcm_vhost goes upstream */ +#define TRANSPORT_IQN_LEN 224 +#define VHOST_SCSI_ABI_VERSION 0 +struct vhost_scsi_target { + int abi_version; + unsigned char vhost_wwpn[TRANSPORT_IQN_LEN]; + unsigned short vhost_tpgt; +}; +/* VHOST_SCSI specific defines */ +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define 
VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, struct vhost_scsi_target) +/*----------------------------------------------------*/ + +#endif /* KVM__SCSI_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h new file mode 100644 index 000000000000..924279b1ba03 --- /dev/null +++ b/tools/kvm/include/kvm/virtio.h @@ -0,0 +1,92 @@ +#ifndef KVM__VIRTIO_H +#define KVM__VIRTIO_H + +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> + +#include <linux/types.h> +#include <sys/uio.h> + +#include "kvm/kvm.h" + +#define VIRTIO_IRQ_LOW 0 +#define VIRTIO_IRQ_HIGH 1 + +#define VIRTIO_PCI_O_CONFIG 0 +#define VIRTIO_PCI_O_MSIX 1 + +struct virt_queue { + struct vring vring; + u32 pfn; + /* The last_avail_idx field is an index to ->ring of struct vring_avail. + It's where we assume the next request index is at. */ + u16 last_avail_idx; + u16 last_used_signalled; +}; + +static inline u16 virt_queue__pop(struct virt_queue *queue) +{ + return queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num]; +} + +static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx) +{ + return &queue->vring.desc[desc_ndx]; +} + +static inline bool virt_queue__available(struct virt_queue *vq) +{ + if (!vq->vring.avail) + return 0; + + vring_avail_event(&vq->vring) = vq->last_avail_idx; + return vq->vring.avail->idx != vq->last_avail_idx; +} + +struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len); + +bool virtio_queue__should_signal(struct virt_queue *vq); +u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], + u16 *out, u16 *in, struct kvm *kvm); +u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], + u16 *out, u16 *in, u16 head, struct kvm *kvm); +u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue, + struct iovec in_iov[], struct 
iovec out_iov[], + u16 *in, u16 *out); +int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off); + +enum virtio_trans { + VIRTIO_PCI, + VIRTIO_MMIO, +}; + +struct virtio_device { + bool use_vhost; + void *virtio; + struct virtio_ops *ops; +}; + +struct virtio_ops { + u8 *(*get_config)(struct kvm *kvm, void *dev); + u32 (*get_host_features)(struct kvm *kvm, void *dev); + void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features); + int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 page_size, + u32 align, u32 pfn); + int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size); + void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi); + void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd); + int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid); + int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev); + int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); + int (*exit)(struct kvm *kvm, struct virtio_device *vdev); +}; + +int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + struct virtio_ops *ops, enum virtio_trans trans, + int device_id, int subsys_id, int class); +int virtio_compat_add_message(const char *device, const char *config); +#endif /* KVM__VIRTIO_H */ diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h new file mode 100644 index 000000000000..c2934a45f6dc --- /dev/null +++ b/tools/kvm/include/kvm/vnc.h @@ -0,0 +1,22 @@ +#ifndef KVM__VNC_H +#define KVM__VNC_H + +#include "kvm/kvm.h" + +struct framebuffer; + +#ifdef CONFIG_HAS_VNCSERVER +int vnc__init(struct kvm *kvm); +int vnc__exit(struct kvm *kvm); +#else +static inline int vnc__init(struct kvm *kvm) +{ + return 0; +} +static inline int 
vnc__exit(struct kvm *kvm) +{ + return 0; +} +#endif + +#endif /* KVM__VNC_H */ diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h new file mode 100644 index 000000000000..56448b71ebbf --- /dev/null +++ b/tools/kvm/include/linux/bitops.h @@ -0,0 +1,33 @@ +#ifndef _KVM_LINUX_BITOPS_H_ +#define _KVM_LINUX_BITOPS_H_ + +#include <linux/kernel.h> +#include <linux/compiler.h> +#include <asm/hweight.h> + +#define BITS_PER_LONG __WORDSIZE +#define BITS_PER_BYTE 8 +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) + +static inline void set_bit(int nr, unsigned long *addr) +{ + addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); +} + +static inline void clear_bit(int nr, unsigned long *addr) +{ + addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); +} + +static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} + +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); +} + +#endif diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h new file mode 100644 index 000000000000..c490de8a89f4 --- /dev/null +++ b/tools/kvm/include/linux/byteorder.h @@ -0,0 +1,7 @@ +#ifndef __BYTE_ORDER_H__ +#define __BYTE_ORDER_H__ + +#include <asm/byteorder.h> +#include <linux/byteorder/generic.h> + +#endif diff --git a/tools/kvm/include/linux/compiler.h b/tools/kvm/include/linux/compiler.h new file mode 100644 index 000000000000..898420b81aec --- /dev/null +++ b/tools/kvm/include/linux/compiler.h @@ -0,0 +1,20 @@ +#ifndef _PERF_LINUX_COMPILER_H_ +#define _PERF_LINUX_COMPILER_H_ + +#ifndef __always_inline +#define __always_inline inline +#endif +#define __user + +#ifndef __attribute_const__ +#define __attribute_const__ +#endif + +#define __used __attribute__((__unused__)) +#define __packed __attribute__((packed)) +#define __iomem +#define __force +#define __must_check +#define unlikely + +#endif diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h new file mode 100644 index 000000000000..1e9abe9a4d0c --- /dev/null +++ b/tools/kvm/include/linux/kernel.h @@ -0,0 +1,41 @@ + +#ifndef KVM__LINUX_KERNEL_H_ +#define KVM__LINUX_KERNEL_H_ + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define true 1 + +#endif diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h new file mode 100644 index 000000000000..0e4c6a3986f5 --- /dev/null +++ b/tools/kvm/include/linux/module.h @@ -0,0 +1,6 @@ +#ifndef KVM__LINUX_MODULE_H +#define KVM__LINUX_MODULE_H + +#define EXPORT_SYMBOL(name) + +#endif diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h new file mode 100644 index 000000000000..62f67889c52f --- /dev/null +++ b/tools/kvm/include/linux/prefetch.h @@ -0,0 +1,6 @@ +#ifndef KVM__LINUX_PREFETCH_H +#define KVM__LINUX_PREFETCH_H + +static inline void prefetch(void *a __attribute__((unused))) { } + +#endif diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h new file mode 100644 index 000000000000..39da8088d942 --- /dev/null +++ b/tools/kvm/include/linux/stddef.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_STDDEF_H +#define _LINUX_STDDEF_H + +#undef NULL +#define NULL ((void *)0) + +#undef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) + +#endif diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h new file mode 100644 index 000000000000..5e20f10f8830 --- /dev/null +++ b/tools/kvm/include/linux/types.h @@ -0,0 +1,51 @@ +#ifndef LINUX_TYPES_H +#define LINUX_TYPES_H + +#include <kvm/compiler.h> +#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#include <asm/types.h> + +typedef __u64 u64; +typedef __s64 s64; + +typedef __u32 u32; +typedef __s32 s32; + 
+typedef __u16 u16; +typedef __s16 s16; + +typedef __u8 u8; +typedef __s8 s8; + +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#ifdef __CHECK_ENDIAN__ +#define __bitwise __bitwise__ +#else +#define __bitwise +#endif + + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +struct list_head { + struct list_head *next, *prev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#endif /* LINUX_TYPES_H */ diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c new file mode 100644 index 000000000000..ff665d410ba5 --- /dev/null +++ b/tools/kvm/ioeventfd.c @@ -0,0 +1,218 @@ +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <pthread.h> +#include <unistd.h> +#include <stdio.h> +#include <signal.h> + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/types.h> + +#include "kvm/ioeventfd.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#define IOEVENTFD_MAX_EVENTS 20 + +static struct epoll_event events[IOEVENTFD_MAX_EVENTS]; +static int epoll_fd, epoll_stop_fd; +static LIST_HEAD(used_ioevents); +static bool ioeventfd_avail; + +static void *ioeventfd__thread(void *param) +{ + u64 tmp = 1; + + kvm__set_thread_name("ioeventfd-worker"); + + for (;;) { + int nfds, i; + + nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1); + for (i = 0; i < nfds; i++) { + struct ioevent *ioevent; + + if (events[i].data.fd == epoll_stop_fd) + goto done; + + ioevent = events[i].data.ptr; + + if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0) + die("Failed reading event"); + + ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr); + } + } + +done: + tmp = write(epoll_stop_fd, &tmp, sizeof(tmp)); + + return NULL; +} + +static int ioeventfd__start(void) +{ + pthread_t thread; + + if (!ioeventfd_avail) + 
return -ENOSYS; + + return pthread_create(&thread, NULL, ioeventfd__thread, NULL); +} + +int ioeventfd__init(struct kvm *kvm) +{ + struct epoll_event epoll_event = {.events = EPOLLIN}; + int r; + + ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD); + if (!ioeventfd_avail) + return 1; /* Not fatal, but let caller determine no-go. */ + + epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS); + if (epoll_fd < 0) + return -errno; + + epoll_stop_fd = eventfd(0, 0); + epoll_event.data.fd = epoll_stop_fd; + + r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event); + if (r < 0) + goto cleanup; + + r = ioeventfd__start(); + if (r < 0) + goto cleanup; + + r = 0; + + return r; + +cleanup: + close(epoll_stop_fd); + close(epoll_fd); + + return r; +} +base_init(ioeventfd__init); + +int ioeventfd__exit(struct kvm *kvm) +{ + u64 tmp = 1; + int r; + + if (!ioeventfd_avail) + return 0; + + r = write(epoll_stop_fd, &tmp, sizeof(tmp)); + if (r < 0) + return r; + + r = read(epoll_stop_fd, &tmp, sizeof(tmp)); + if (r < 0) + return r; + + close(epoll_fd); + close(epoll_stop_fd); + + return 0; +} +base_exit(ioeventfd__exit); + +int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace) +{ + struct kvm_ioeventfd kvm_ioevent; + struct epoll_event epoll_event; + struct ioevent *new_ioevent; + int event, r; + + if (!ioeventfd_avail) + return -ENOSYS; + + new_ioevent = malloc(sizeof(*new_ioevent)); + if (new_ioevent == NULL) + return -ENOMEM; + + *new_ioevent = *ioevent; + event = new_ioevent->fd; + + kvm_ioevent = (struct kvm_ioeventfd) { + .addr = ioevent->io_addr, + .len = ioevent->io_len, + .datamatch = ioevent->datamatch, + .fd = event, + .flags = KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + if (is_pio) + kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO; + + r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent); + if (r) { + r = -errno; + goto cleanup; + } + + if (!poll_in_userspace) + return 0; + + epoll_event = (struct epoll_event) { + 
.events = EPOLLIN, + .data.ptr = new_ioevent, + }; + + r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event); + if (r) { + r = -errno; + goto cleanup; + } + + list_add_tail(&new_ioevent->list, &used_ioevents); + + return 0; + +cleanup: + free(new_ioevent); + return r; +} + +int ioeventfd__del_event(u64 addr, u64 datamatch) +{ + struct kvm_ioeventfd kvm_ioevent; + struct ioevent *ioevent; + u8 found = 0; + + if (!ioeventfd_avail) + return -ENOSYS; + + list_for_each_entry(ioevent, &used_ioevents, list) { + if (ioevent->io_addr == addr) { + found = 1; + break; + } + } + + if (found == 0 || ioevent == NULL) + return -ENOENT; + + kvm_ioevent = (struct kvm_ioeventfd) { + .addr = ioevent->io_addr, + .len = ioevent->io_len, + .datamatch = ioevent->datamatch, + .flags = KVM_IOEVENTFD_FLAG_PIO + | KVM_IOEVENTFD_FLAG_DEASSIGN + | KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent); + + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL); + + list_del(&ioevent->list); + + close(ioevent->fd); + free(ioevent); + + return 0; +} diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c new file mode 100644 index 000000000000..a4f15827acb4 --- /dev/null +++ b/tools/kvm/ioport.c @@ -0,0 +1,198 @@ +#include "kvm/ioport.h" + +#include "kvm/kvm.h" +#include "kvm/util.h" +#include "kvm/brlock.h" +#include "kvm/rbtree-interval.h" +#include "kvm/mutex.h" + +#include <linux/kvm.h> /* for KVM_EXIT_* */ +#include <linux/types.h> + +#include <stdbool.h> +#include <limits.h> +#include <stdlib.h> +#include <stdio.h> + +#define ioport_node(n) rb_entry(n, struct ioport, node) + +DEFINE_MUTEX(ioport_mutex); + +static u16 free_io_port_idx; /* protected by ioport_mutex */ + +static struct rb_root ioport_tree = RB_ROOT; + +static u16 ioport__find_free_port(void) +{ + u16 free_port; + + mutex_lock(&ioport_mutex); + free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE; + free_io_port_idx++; + mutex_unlock(&ioport_mutex); + + return free_port; +} + 
+static struct ioport *ioport_search(struct rb_root *root, u64 addr) +{ + struct rb_int_node *node; + + node = rb_int_search_single(root, addr); + if (node == NULL) + return NULL; + + return ioport_node(node); +} + +static int ioport_insert(struct rb_root *root, struct ioport *data) +{ + return rb_int_insert(root, &data->node); +} + +static void ioport_remove(struct rb_root *root, struct ioport *data) +{ + rb_int_erase(root, &data->node); +} + +int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, int count, void *param) +{ + struct ioport *entry; + int r; + + br_write_lock(kvm); + if (port == IOPORT_EMPTY) + port = ioport__find_free_port(); + + entry = ioport_search(&ioport_tree, port); + if (entry) { + pr_warning("ioport re-registered: %x", port); + rb_int_erase(&ioport_tree, &entry->node); + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) + return -ENOMEM; + + *entry = (struct ioport) { + .node = RB_INT_INIT(port, port + count), + .ops = ops, + .priv = param, + }; + + r = ioport_insert(&ioport_tree, entry); + if (r < 0) { + free(entry); + br_write_unlock(kvm); + return r; + } + br_write_unlock(kvm); + + return port; +} + +int ioport__unregister(struct kvm *kvm, u16 port) +{ + struct ioport *entry; + int r; + + br_write_lock(kvm); + + r = -ENOENT; + entry = ioport_search(&ioport_tree, port); + if (!entry) + goto done; + + ioport_remove(&ioport_tree, entry); + + free(entry); + + r = 0; + +done: + br_write_unlock(kvm); + + return r; +} + +static void ioport__unregister_all(void) +{ + struct ioport *entry; + struct rb_node *rb; + struct rb_int_node *rb_node; + + rb = rb_first(&ioport_tree); + while (rb) { + rb_node = rb_int(rb); + entry = ioport_node(rb_node); + ioport_remove(&ioport_tree, entry); + free(entry); + rb = rb_first(&ioport_tree); + } +} + +static const char *to_direction(int direction) +{ + if (direction == KVM_EXIT_IO_IN) + return "IN"; + else + return "OUT"; +} + +static void ioport_error(u16 port, void *data, int 
direction, int size, u32 count) +{ + fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count); +} + +bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + struct ioport_operations *ops; + bool ret = false; + struct ioport *entry; + void *ptr = data; + + br_read_lock(); + entry = ioport_search(&ioport_tree, port); + if (!entry) + goto error; + + ops = entry->ops; + + while (count--) { + if (direction == KVM_EXIT_IO_IN && ops->io_in) + ret = ops->io_in(entry, kvm, port, ptr, size); + else if (ops->io_out) + ret = ops->io_out(entry, kvm, port, ptr, size); + + ptr += size; + } + + br_read_unlock(); + + if (!ret) + goto error; + + return true; +error: + br_read_unlock(); + + if (kvm->cfg.ioport_debug) + ioport_error(port, data, direction, size, count); + + return !kvm->cfg.ioport_debug; +} + +int ioport__init(struct kvm *kvm) +{ + ioport__setup_arch(kvm); + + return 0; +} +dev_base_init(ioport__init); + +int ioport__exit(struct kvm *kvm) +{ + ioport__unregister_all(); + return 0; +} +dev_base_exit(ioport__exit); diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c new file mode 100644 index 000000000000..2520b08847e8 --- /dev/null +++ b/tools/kvm/kvm-cmd.c @@ -0,0 +1,91 @@ +#include <stdio.h> +#include <string.h> +#include <errno.h> + +/* user defined header files */ +#include "kvm/builtin-debug.h" +#include "kvm/builtin-pause.h" +#include "kvm/builtin-resume.h" +#include "kvm/builtin-balloon.h" +#include "kvm/builtin-list.h" +#include "kvm/builtin-version.h" +#include "kvm/builtin-setup.h" +#include "kvm/builtin-stop.h" +#include "kvm/builtin-stat.h" +#include "kvm/builtin-help.h" +#include "kvm/builtin-sandbox.h" +#include "kvm/kvm-cmd.h" +#include "kvm/builtin-run.h" +#include "kvm/util.h" + +struct cmd_struct kvm_commands[] = { + { "pause", kvm_cmd_pause, kvm_pause_help, 0 }, + { "resume", kvm_cmd_resume, kvm_resume_help, 0 }, + { "debug", kvm_cmd_debug, kvm_debug_help, 
0 }, + { "balloon", kvm_cmd_balloon, kvm_balloon_help, 0 }, + { "list", kvm_cmd_list, kvm_list_help, 0 }, + { "version", kvm_cmd_version, NULL, 0 }, + { "--version", kvm_cmd_version, NULL, 0 }, + { "stop", kvm_cmd_stop, kvm_stop_help, 0 }, + { "stat", kvm_cmd_stat, kvm_stat_help, 0 }, + { "help", kvm_cmd_help, NULL, 0 }, + { "setup", kvm_cmd_setup, kvm_setup_help, 0 }, + { "run", kvm_cmd_run, kvm_run_help, 0 }, + { "sandbox", kvm_cmd_sandbox, kvm_run_help, 0 }, + { NULL, NULL, NULL, 0 }, +}; + +/* + * kvm_get_command: Searches the command in an array of the commands and + * returns a pointer to cmd_struct if a match is found. + * + * Input parameters: + * command: Array of possible commands. The last entry in the array must be + * NULL. + * cmd: A string command to search in the array + * + * Return Value: + * NULL: If the cmd is not matched with any of the command in the command array + * p: Pointer to cmd_struct of the matching command + */ +struct cmd_struct *kvm_get_command(struct cmd_struct *command, + const char *cmd) +{ + struct cmd_struct *p = command; + + while (p->cmd) { + if (!strcmp(p->cmd, cmd)) + return p; + p++; + } + return NULL; +} + +int handle_command(struct cmd_struct *command, int argc, const char **argv) +{ + struct cmd_struct *p; + const char *prefix = NULL; + int ret = 0; + + if (!argv || !*argv) { + p = kvm_get_command(command, "help"); + BUG_ON(!p); + return p->fn(argc, argv, prefix); + } + + p = kvm_get_command(command, argv[0]); + if (!p) { + p = kvm_get_command(command, "help"); + BUG_ON(!p); + p->fn(0, NULL, prefix); + return EINVAL; + } + + ret = p->fn(argc - 1, &argv[1], prefix); + if (ret < 0) { + if (errno == EPERM) + die("Permission error - are you root?"); + } + + return ret; +} diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c new file mode 100644 index 000000000000..be05c4988dcd --- /dev/null +++ b/tools/kvm/kvm-cpu.c @@ -0,0 +1,242 @@ +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include 
"kvm/kvm.h" + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +extern __thread struct kvm_cpu *current_kvm_cpu; + +void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu) +{ + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP, + }; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0) + pr_warning("KVM_SET_GUEST_DEBUG failed"); +} + +void kvm_cpu__run(struct kvm_cpu *vcpu) +{ + int err; + + if (!vcpu->is_running) + return; + + err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0); + if (err < 0 && (errno != EINTR && errno != EAGAIN)) + die_perror("KVM_RUN failed"); +} + +static void kvm_cpu_signal_handler(int signum) +{ + if (signum == SIGKVMEXIT) { + if (current_kvm_cpu && current_kvm_cpu->is_running) { + current_kvm_cpu->is_running = false; + kvm__continue(current_kvm_cpu->kvm); + } + } else if (signum == SIGKVMPAUSE) { + current_kvm_cpu->paused = 1; + } +} + +static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu) +{ + if (cpu->ring) { + while (cpu->ring->first != cpu->ring->last) { + struct kvm_coalesced_mmio *m; + m = &cpu->ring->coalesced_mmio[cpu->ring->first]; + kvm_cpu__emulate_mmio(cpu->kvm, + m->phys_addr, + m->data, + m->len, + 1); + cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX; + } + } +} + +void kvm_cpu__reboot(struct kvm *kvm) +{ + int i; + + /* The kvm->cpus array contains a null pointer in the last location */ + for (i = 0; ; i++) { + if (kvm->cpus[i]) + pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT); + else + break; + } +} + +int kvm_cpu__start(struct kvm_cpu *cpu) +{ + sigset_t sigset; + + sigemptyset(&sigset); + sigaddset(&sigset, SIGALRM); + + pthread_sigmask(SIG_BLOCK, &sigset, NULL); + + signal(SIGKVMEXIT, kvm_cpu_signal_handler); + signal(SIGKVMPAUSE, kvm_cpu_signal_handler); + + kvm_cpu__reset_vcpu(cpu); + + if (cpu->kvm->cfg.single_step) + kvm_cpu__enable_singlestep(cpu); + 
+ while (cpu->is_running) { + if (cpu->paused) { + kvm__notify_paused(); + cpu->paused = 0; + } + + if (cpu->needs_nmi) { + kvm_cpu__arch_nmi(cpu); + cpu->needs_nmi = 0; + } + + kvm_cpu__run(cpu); + + switch (cpu->kvm_run->exit_reason) { + case KVM_EXIT_UNKNOWN: + break; + case KVM_EXIT_DEBUG: + kvm_cpu__show_registers(cpu); + kvm_cpu__show_code(cpu); + break; + case KVM_EXIT_IO: { + bool ret; + + ret = kvm_cpu__emulate_io(cpu->kvm, + cpu->kvm_run->io.port, + (u8 *)cpu->kvm_run + + cpu->kvm_run->io.data_offset, + cpu->kvm_run->io.direction, + cpu->kvm_run->io.size, + cpu->kvm_run->io.count); + + if (!ret) + goto panic_kvm; + break; + } + case KVM_EXIT_MMIO: { + bool ret; + + /* + * If we had MMIO exit, coalesced ring should be processed + * *before* processing the exit itself + */ + kvm_cpu__handle_coalesced_mmio(cpu); + + ret = kvm_cpu__emulate_mmio(cpu->kvm, + cpu->kvm_run->mmio.phys_addr, + cpu->kvm_run->mmio.data, + cpu->kvm_run->mmio.len, + cpu->kvm_run->mmio.is_write); + + if (!ret) + goto panic_kvm; + break; + } + case KVM_EXIT_INTR: + if (cpu->is_running) + break; + goto exit_kvm; + case KVM_EXIT_SHUTDOWN: + goto exit_kvm; + default: { + bool ret; + + ret = kvm_cpu__handle_exit(cpu); + if (!ret) + goto panic_kvm; + break; + } + } + kvm_cpu__handle_coalesced_mmio(cpu); + } + +exit_kvm: + return 0; + +panic_kvm: + return 1; +} + +int kvm_cpu__init(struct kvm *kvm) +{ + int max_cpus, recommended_cpus, i; + + max_cpus = kvm__max_cpus(kvm); + recommended_cpus = kvm__recommended_cpus(kvm); + + if (kvm->cfg.nrcpus > max_cpus) { + printf(" # Limit the number of CPUs to %d\n", max_cpus); + kvm->cfg.nrcpus = max_cpus; + } else if (kvm->cfg.nrcpus > recommended_cpus) { + printf(" # Warning: The maximum recommended amount of VCPUs" + " is %d\n", recommended_cpus); + } + + kvm->nrcpus = kvm->cfg.nrcpus; + + /* Alloc one pointer too many, so array ends up 0-terminated */ + kvm->cpus = calloc(kvm->nrcpus + 1, sizeof(void *)); + if (!kvm->cpus) { + pr_warning("Couldn't 
allocate array for %d CPUs", kvm->nrcpus); + return -ENOMEM; + } + + for (i = 0; i < kvm->nrcpus; i++) { + kvm->cpus[i] = kvm_cpu__arch_init(kvm, i); + if (!kvm->cpus[i]) { + pr_warning("unable to initialize KVM VCPU"); + goto fail_alloc; + } + } + + return 0; + +fail_alloc: + for (i = 0; i < kvm->nrcpus; i++) + free(kvm->cpus[i]); + return -ENOMEM; +} +base_init(kvm_cpu__init); + +int kvm_cpu__exit(struct kvm *kvm) +{ + int i, r; + void *ret = NULL; + + kvm_cpu__delete(kvm->cpus[0]); + kvm->cpus[0] = NULL; + + for (i = 1; i < kvm->nrcpus; i++) { + if (kvm->cpus[i]->is_running) { + pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT); + if (pthread_join(kvm->cpus[i]->thread, &ret) != 0) + die("pthread_join"); + kvm_cpu__delete(kvm->cpus[i]); + } + if (ret == NULL) + r = 0; + } + + free(kvm->cpus); + + kvm->nrcpus = 0; + + return r; +} +late_exit(kvm_cpu__exit); diff --git a/tools/kvm/kvm-ipc.c b/tools/kvm/kvm-ipc.c new file mode 100644 index 000000000000..bdcc0d1f6b73 --- /dev/null +++ b/tools/kvm/kvm-ipc.c @@ -0,0 +1,500 @@ +#include <sys/epoll.h> +#include <sys/un.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/eventfd.h> +#include <dirent.h> + +#include "kvm/kvm-ipc.h" +#include "kvm/rwsem.h" +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/builtin-debug.h" +#include "kvm/strbuf.h" +#include "kvm/kvm-cpu.h" +#include "kvm/8250-serial.h" + +struct kvm_ipc_head { + u32 type; + u32 len; +}; + +#define KVM_IPC_MAX_MSGS 16 + +#define KVM_SOCK_SUFFIX ".sock" +#define KVM_SOCK_SUFFIX_LEN ((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1) + +extern __thread struct kvm_cpu *current_kvm_cpu; +static void (*msgs[KVM_IPC_MAX_MSGS])(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg); +static DECLARE_RWSEM(msgs_rwlock); +static int epoll_fd, server_fd, stop_fd; +static pthread_t thread; + +static int kvm__create_socket(struct kvm *kvm) +{ + char full_name[PATH_MAX]; + unsigned int s; + struct sockaddr_un local; + int len, r; + + /* 
This usually 108 bytes long */ + BUILD_BUG_ON(sizeof(local.sun_path) < 32); + + snprintf(full_name, sizeof(full_name), "%s/%s%s", + kvm__get_dir(), kvm->cfg.guest_name, KVM_SOCK_SUFFIX); + if (access(full_name, F_OK) == 0) { + pr_err("Socket file %s already exist", full_name); + return -EEXIST; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) + return s; + local.sun_family = AF_UNIX; + strlcpy(local.sun_path, full_name, sizeof(local.sun_path)); + len = strlen(local.sun_path) + sizeof(local.sun_family); + r = bind(s, (struct sockaddr *)&local, len); + if (r < 0) + goto fail; + + r = listen(s, 5); + if (r < 0) + goto fail; + + return s; + +fail: + close(s); + return r; +} + +void kvm__remove_socket(const char *name) +{ + char full_name[PATH_MAX]; + + snprintf(full_name, sizeof(full_name), "%s/%s%s", + kvm__get_dir(), name, KVM_SOCK_SUFFIX); + unlink(full_name); +} + +int kvm__get_sock_by_instance(const char *name) +{ + int s, len, r; + char sock_file[PATH_MAX]; + struct sockaddr_un local; + + snprintf(sock_file, sizeof(sock_file), "%s/%s%s", + kvm__get_dir(), name, KVM_SOCK_SUFFIX); + s = socket(AF_UNIX, SOCK_STREAM, 0); + + local.sun_family = AF_UNIX; + strlcpy(local.sun_path, sock_file, sizeof(local.sun_path)); + len = strlen(local.sun_path) + sizeof(local.sun_family); + + r = connect(s, &local, len); + if (r < 0 && errno == ECONNREFUSED) { + /* Tell the user clean ghost socket file */ + pr_err("\"%s\" could be a ghost socket file, please remove it", + sock_file); + return r; + } else if (r < 0) { + return r; + } + + return s; +} + +int kvm__enumerate_instances(int (*callback)(const char *name, int fd)) +{ + int sock; + DIR *dir; + struct dirent entry, *result; + int ret = 0; + + dir = opendir(kvm__get_dir()); + if (!dir) + return -errno; + + for (;;) { + readdir_r(dir, &entry, &result); + if (result == NULL) + break; + if (entry.d_type == DT_SOCK) { + ssize_t name_len = strlen(entry.d_name); + char *p; + + if (name_len <= KVM_SOCK_SUFFIX_LEN) + continue; 
+ + p = &entry.d_name[name_len - KVM_SOCK_SUFFIX_LEN]; + if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN)) + continue; + + *p = 0; + sock = kvm__get_sock_by_instance(entry.d_name); + if (sock < 0) + continue; + ret = callback(entry.d_name, sock); + close(sock); + if (ret < 0) + break; + } + } + + closedir(dir); + + return ret; +} + +int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)) +{ + if (type >= KVM_IPC_MAX_MSGS) + return -ENOSPC; + + down_write(&msgs_rwlock); + msgs[type] = cb; + up_write(&msgs_rwlock); + + return 0; +} + +int kvm_ipc__send(int fd, u32 type) +{ + struct kvm_ipc_head head = {.type = type, .len = 0,}; + + if (write_in_full(fd, &head, sizeof(head)) < 0) + return -1; + + return 0; +} + +int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg) +{ + struct kvm_ipc_head head = {.type = type, .len = len,}; + + if (write_in_full(fd, &head, sizeof(head)) < 0) + return -1; + + if (write_in_full(fd, msg, len) < 0) + return -1; + + return 0; +} + +static int kvm_ipc__handle(struct kvm *kvm, int fd, u32 type, u32 len, u8 *data) +{ + void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg); + + if (type >= KVM_IPC_MAX_MSGS) + return -ENOSPC; + + down_read(&msgs_rwlock); + cb = msgs[type]; + up_read(&msgs_rwlock); + + if (cb == NULL) { + pr_warning("No device handles type %u\n", type); + return -ENODEV; + } + + cb(kvm, fd, type, len, data); + + return 0; +} + +static int kvm_ipc__new_conn(int fd) +{ + int client; + struct epoll_event ev; + + client = accept(fd, NULL, NULL); + if (client < 0) + return -1; + + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.fd = client; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) { + close(client); + return -1; + } + + return client; +} + +static void kvm_ipc__close_conn(int fd) +{ + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL); + close(fd); +} + +static int kvm_ipc__receive(struct kvm *kvm, int fd) +{ + struct kvm_ipc_head head; + u8 *msg = NULL; + u32 
n; + + n = read(fd, &head, sizeof(head)); + if (n != sizeof(head)) + goto done; + + msg = malloc(head.len); + if (msg == NULL) + goto done; + + n = read_in_full(fd, msg, head.len); + if (n != head.len) + goto done; + + kvm_ipc__handle(kvm, fd, head.type, head.len, msg); + + return 0; + +done: + free(msg); + return -1; +} + +static void *kvm_ipc__thread(void *param) +{ + struct epoll_event event; + struct kvm *kvm = param; + + kvm__set_thread_name("kvm-ipc"); + + for (;;) { + int nfds; + + nfds = epoll_wait(epoll_fd, &event, 1, -1); + if (nfds > 0) { + int fd = event.data.fd; + + if (fd == stop_fd && event.events & EPOLLIN) { + break; + } else if (fd == server_fd) { + int client, r; + + client = kvm_ipc__new_conn(fd); + /* + * Handle multiple IPC cmd at a time + */ + do { + r = kvm_ipc__receive(kvm, client); + } while (r == 0); + + } else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) { + kvm_ipc__close_conn(fd); + } else { + kvm_ipc__receive(kvm, fd); + } + } + } + + return NULL; +} + +static void kvm__pid(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + pid_t pid = getpid(); + int r = 0; + + if (type == KVM_IPC_PID) + r = write(fd, &pid, sizeof(pid)); + + if (r < 0) + pr_warning("Failed sending PID"); +} + +static void handle_stop(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + if (WARN_ON(type != KVM_IPC_STOP || len)) + return; + + kvm_cpu__reboot(kvm); +} + +/* Pause/resume the guest using SIGUSR2 */ +static int is_paused; + +static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + if (WARN_ON(len)) + return; + + if (type == KVM_IPC_RESUME && is_paused) { + kvm->vm_state = KVM_VMSTATE_RUNNING; + kvm__continue(kvm); + } else if (type == KVM_IPC_PAUSE && !is_paused) { + kvm->vm_state = KVM_VMSTATE_PAUSED; + ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL); + kvm__pause(kvm); + } else { + return; + } + + is_paused = !is_paused; +} + +static void handle_vmstate(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int r = 
0; + + if (type == KVM_IPC_VMSTATE) + r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state)); + + if (r < 0) + pr_warning("Failed sending VMSTATE"); +} + +/* + * Serialize debug printout so that the output of multiple vcpus does not + * get mixed up: + */ +static int printout_done; + +static void handle_sigusr1(int sig) +{ + struct kvm_cpu *cpu = current_kvm_cpu; + int fd = kvm_cpu__get_debug_fd(); + + if (!cpu || cpu->needs_nmi) + return; + + dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id); + kvm_cpu__show_registers(cpu); + kvm_cpu__show_code(cpu); + kvm_cpu__show_page_tables(cpu); + fflush(stdout); + printout_done = 1; +} + +static void handle_debug(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int i; + struct debug_cmd_params *params; + u32 dbg_type; + u32 vcpu; + + if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params))) + return; + + params = (void *)msg; + dbg_type = params->dbg_type; + vcpu = params->cpu; + + if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ) + serial8250__inject_sysrq(kvm, params->sysrq); + + if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) { + if ((int)vcpu >= kvm->nrcpus) + return; + + kvm->cpus[vcpu]->needs_nmi = 1; + pthread_kill(kvm->cpus[vcpu]->thread, SIGUSR1); + } + + if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP)) + return; + + for (i = 0; i < kvm->nrcpus; i++) { + struct kvm_cpu *cpu = kvm->cpus[i]; + + if (!cpu) + continue; + + printout_done = 0; + + kvm_cpu__set_debug_fd(fd); + pthread_kill(cpu->thread, SIGUSR1); + /* + * Wait for the vCPU to dump state before signalling + * the next thread. 
Since this is debug code it does + * not matter that we are burning CPU time a bit: + */ + while (!printout_done) + sleep(0); + } + + close(fd); + + serial8250__inject_sysrq(kvm, 'p'); +} + +int kvm_ipc__init(struct kvm *kvm) +{ + int ret; + int sock = kvm__create_socket(kvm); + struct epoll_event ev = {0}; + + server_fd = sock; + + epoll_fd = epoll_create(KVM_IPC_MAX_MSGS); + if (epoll_fd < 0) { + ret = epoll_fd; + goto err; + } + + ev.events = EPOLLIN | EPOLLET; + ev.data.fd = sock; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) { + pr_err("Failed starting IPC thread"); + ret = -EFAULT; + goto err_epoll; + } + + stop_fd = eventfd(0, 0); + if (stop_fd < 0) { + ret = stop_fd; + goto err_epoll; + } + + ev.events = EPOLLIN | EPOLLET; + ev.data.fd = stop_fd; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) { + pr_err("Failed adding stop event to epoll"); + ret = -EFAULT; + goto err_stop; + } + + if (pthread_create(&thread, NULL, kvm_ipc__thread, kvm) != 0) { + pr_err("Failed starting IPC thread"); + ret = -EFAULT; + goto err_stop; + } + + kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid); + kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug); + kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause); + kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause); + kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop); + kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate); + signal(SIGUSR1, handle_sigusr1); + + return 0; + +err_stop: + close(stop_fd); +err_epoll: + close(epoll_fd); +err: + return ret; +} +base_init(kvm_ipc__init); + +int kvm_ipc__exit(struct kvm *kvm) +{ + u64 val = 1; + int ret; + + ret = write(stop_fd, &val, sizeof(val)); + if (ret < 0) + return ret; + + close(server_fd); + close(epoll_fd); + + kvm__remove_socket(kvm->cfg.guest_name); + + return ret; +} +base_exit(kvm_ipc__exit); diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c new file mode 100644 index 000000000000..a6b3c2346ad4 --- /dev/null +++ b/tools/kvm/kvm.c @@ -0,0 
+1,512 @@ +#include "kvm/kvm.h" +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/strbuf.h" +#include "kvm/mutex.h" +#include "kvm/kvm-cpu.h" +#include "kvm/kvm-ipc.h" + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/list.h> +#include <linux/err.h> + +#include <sys/un.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <limits.h> +#include <signal.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> +#include <time.h> +#include <sys/eventfd.h> +#include <asm/unistd.h> +#include <dirent.h> + +#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason + +const char *kvm_exit_reasons[] = { + DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR), +#ifdef CONFIG_PPC64 + DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL), +#endif +}; + +static int pause_event; +static DEFINE_MUTEX(pause_lock); +extern struct kvm_ext kvm_req_ext[]; + +static char kvm_dir[PATH_MAX]; + +static int set_dir(const char *fmt, va_list args) +{ + char tmp[PATH_MAX]; + + vsnprintf(tmp, sizeof(tmp), fmt, args); + + mkdir(tmp, 0777); + + if (!realpath(tmp, kvm_dir)) + 
return -errno; + + strcat(kvm_dir, "/"); + + return 0; +} + +void kvm__set_dir(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + set_dir(fmt, args); + va_end(args); +} + +const char *kvm__get_dir(void) +{ + return kvm_dir; +} + +bool kvm__supports_extension(struct kvm *kvm, unsigned int extension) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension); + if (ret < 0) + return false; + + return ret; +} + +static int kvm__check_extensions(struct kvm *kvm) +{ + int i; + + for (i = 0; ; i++) { + if (!kvm_req_ext[i].name) + break; + if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) { + pr_err("Unsuppored KVM extension detected: %s", + kvm_req_ext[i].name); + return -i; + } + } + + return 0; +} + +struct kvm *kvm__new(void) +{ + struct kvm *kvm = calloc(1, sizeof(*kvm)); + if (!kvm) + return ERR_PTR(-ENOMEM); + + kvm->sys_fd = -1; + kvm->vm_fd = -1; + + return kvm; +} + +int kvm__exit(struct kvm *kvm) +{ + struct kvm_mem_bank *bank, *tmp; + + kvm__arch_delete_ram(kvm); + + list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) { + list_del(&bank->list); + free(bank); + } + + free(kvm); + return 0; +} +core_exit(kvm__exit); + +/* + * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping + * memory regions to it. Therefore, be careful if you use this function for + * registering memory regions for emulating hardware. 
+ */ +int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr) +{ + struct kvm_userspace_memory_region mem; + struct kvm_mem_bank *bank; + int ret; + + bank = malloc(sizeof(*bank)); + if (!bank) + return -ENOMEM; + + INIT_LIST_HEAD(&bank->list); + bank->guest_phys_addr = guest_phys; + bank->host_addr = userspace_addr; + bank->size = size; + + mem = (struct kvm_userspace_memory_region) { + .slot = kvm->mem_slots++, + .guest_phys_addr = guest_phys, + .memory_size = size, + .userspace_addr = (unsigned long)userspace_addr, + }; + + ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem); + if (ret < 0) + return -errno; + + list_add(&bank->list, &kvm->mem_banks); + return 0; +} + +void *guest_flat_to_host(struct kvm *kvm, u64 offset) +{ + struct kvm_mem_bank *bank; + + list_for_each_entry(bank, &kvm->mem_banks, list) { + u64 bank_start = bank->guest_phys_addr; + u64 bank_end = bank_start + bank->size; + + if (offset >= bank_start && offset < bank_end) + return bank->host_addr + (offset - bank_start); + } + + pr_warning("unable to translate guest address 0x%llx to host", + (unsigned long long)offset); + return NULL; +} + +u64 host_to_guest_flat(struct kvm *kvm, void *ptr) +{ + struct kvm_mem_bank *bank; + + list_for_each_entry(bank, &kvm->mem_banks, list) { + void *bank_start = bank->host_addr; + void *bank_end = bank_start + bank->size; + + if (ptr >= bank_start && ptr < bank_end) + return bank->guest_phys_addr + (ptr - bank_start); + } + + pr_warning("unable to translate host address %p to guest", ptr); + return 0; +} + +int kvm__recommended_cpus(struct kvm *kvm) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS); + if (ret <= 0) + /* + * api.txt states that if KVM_CAP_NR_VCPUS does not exist, + * assume 4. + */ + return 4; + + return ret; +} + +/* + * The following hack should be removed once 'x86: Raise the hard + * VCPU count limit' makes it's way into the mainline. 
+ */ +#ifndef KVM_CAP_MAX_VCPUS +#define KVM_CAP_MAX_VCPUS 66 +#endif + +int kvm__max_cpus(struct kvm *kvm) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS); + if (ret <= 0) + ret = kvm__recommended_cpus(kvm); + + return ret; +} + +int kvm__init(struct kvm *kvm) +{ + int ret; + + if (!kvm__arch_cpu_supports_vm()) { + pr_err("Your CPU does not support hardware virtualization"); + ret = -ENOSYS; + goto err; + } + + kvm->sys_fd = open(kvm->cfg.dev, O_RDWR); + if (kvm->sys_fd < 0) { + if (errno == ENOENT) + pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM " + "enabled and that the KVM modules are loaded.", kvm->cfg.dev); + else if (errno == ENODEV) + pr_err("'%s' KVM driver not available.\n # (If the KVM " + "module is loaded then 'dmesg' may offer further clues " + "about the failure.)", kvm->cfg.dev); + else + pr_err("Could not open %s: ", kvm->cfg.dev); + + ret = -errno; + goto err_free; + } + + ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0); + if (ret != KVM_API_VERSION) { + pr_err("KVM_API_VERSION ioctl"); + ret = -errno; + goto err_sys_fd; + } + + kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0); + if (kvm->vm_fd < 0) { + ret = kvm->vm_fd; + goto err_sys_fd; + } + + if (kvm__check_extensions(kvm)) { + pr_err("A required KVM extension is not supported by OS"); + ret = -ENOSYS; + goto err_vm_fd; + } + + kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size); + + INIT_LIST_HEAD(&kvm->mem_banks); + kvm__init_ram(kvm); + + if (!kvm->cfg.firmware_filename) { + if (!kvm__load_kernel(kvm, kvm->cfg.kernel_filename, + kvm->cfg.initrd_filename, kvm->cfg.real_cmdline)) + die("unable to load kernel %s", kvm->cfg.kernel_filename); + } + + if (kvm->cfg.firmware_filename) { + if (!kvm__load_firmware(kvm, kvm->cfg.firmware_filename)) + die("unable to load firmware image %s: %s", kvm->cfg.firmware_filename, strerror(errno)); + } else { + ret = kvm__arch_setup_firmware(kvm); + if (ret < 0) + 
die("kvm__arch_setup_firmware() failed with error %d\n", ret); + } + + return 0; + +err_vm_fd: + close(kvm->vm_fd); +err_sys_fd: + close(kvm->sys_fd); +err_free: + free(kvm); +err: + return ret; +} +core_init(kvm__init); + +/* RFC 1952 */ +#define GZIP_ID1 0x1f +#define GZIP_ID2 0x8b +#define CPIO_MAGIC "0707" +/* initrd may be gzipped, or a plain cpio */ +static bool initrd_check(int fd) +{ + unsigned char id[4]; + + if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0) + return false; + + if (lseek(fd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) || + !memcmp(id, CPIO_MAGIC, 4); +} + +bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename, + const char *initrd_filename, const char *kernel_cmdline) +{ + bool ret; + int fd_kernel = -1, fd_initrd = -1; + + fd_kernel = open(kernel_filename, O_RDONLY); + if (fd_kernel < 0) + die("Unable to open kernel %s", kernel_filename); + + if (initrd_filename) { + fd_initrd = open(initrd_filename, O_RDONLY); + if (fd_initrd < 0) + die("Unable to open initrd %s", initrd_filename); + + if (!initrd_check(fd_initrd)) + die("%s is not an initrd", initrd_filename); + } + + ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline); + + if (ret) + goto found_kernel; + + pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename); + + ret = load_flat_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline); + + if (ret) + goto found_kernel; + + if (initrd_filename) + close(fd_initrd); + close(fd_kernel); + + die("%s is not a valid bzImage or flat binary", kernel_filename); + +found_kernel: + if (initrd_filename) + close(fd_initrd); + close(fd_kernel); + + return ret; +} + +#define TIMER_INTERVAL_NS 1000000 /* 1 msec */ + +/* + * This function sets up a timer that's used to inject interrupts from the + * userspace hypervisor into the guest at periodical intervals. Please note + * that clock interrupt, for example, is not handled here. 
+ */ +int kvm_timer__init(struct kvm *kvm) +{ + struct itimerspec its; + struct sigevent sev; + int r; + + memset(&sev, 0, sizeof(struct sigevent)); + sev.sigev_value.sival_int = 0; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = SIGALRM; + sev.sigev_value.sival_ptr = kvm; + sev._sigev_un._tid = syscall(__NR_gettid); + + r = timer_create(CLOCK_REALTIME, &sev, &kvm->timerid); + if (r < 0) + return r; + + its.it_value.tv_sec = TIMER_INTERVAL_NS / 1000000000; + its.it_value.tv_nsec = TIMER_INTERVAL_NS % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + r = timer_settime(kvm->timerid, 0, &its, NULL); + if (r < 0) { + timer_delete(kvm->timerid); + return r; + } + + return 0; +} +firmware_init(kvm_timer__init); + +int kvm_timer__exit(struct kvm *kvm) +{ + if (kvm->timerid) + if (timer_delete(kvm->timerid) < 0) + die("timer_delete()"); + + kvm->timerid = 0; + + return 0; +} +firmware_exit(kvm_timer__exit); + +void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size) +{ + unsigned char *p; + unsigned long n; + + size &= ~7; /* mod 8 */ + if (!size) + return; + + p = guest_flat_to_host(kvm, addr); + + for (n = 0; n < size; n += 8) { + if (!host_ptr_in_ram(kvm, p + n)) + break; + + printf(" 0x%08lx: %02x %02x %02x %02x %02x %02x %02x %02x\n", + addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3], + p[n + 4], p[n + 5], p[n + 6], p[n + 7]); + } +} + +void kvm__pause(struct kvm *kvm) +{ + int i, paused_vcpus = 0; + + /* Check if the guest is running */ + if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0) + return; + + mutex_lock(&pause_lock); + + pause_event = eventfd(0, 0); + if (pause_event < 0) + die("Failed creating pause notification event"); + for (i = 0; i < kvm->nrcpus; i++) + pthread_kill(kvm->cpus[i]->thread, SIGKVMPAUSE); + + while (paused_vcpus < kvm->nrcpus) { + u64 cur_read; + + if (read(pause_event, &cur_read, sizeof(cur_read)) < 0) + die("Failed reading pause event"); + 
paused_vcpus += cur_read; + } + close(pause_event); +} + +void kvm__continue(struct kvm *kvm) +{ + /* Check if the guest is running */ + if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0) + return; + + mutex_unlock(&pause_lock); +} + +void kvm__notify_paused(void) +{ + u64 p = 1; + + if (write(pause_event, &p, sizeof(p)) < 0) + die("Failed notifying of paused VCPU."); + + mutex_lock(&pause_lock); + mutex_unlock(&pause_lock); +} diff --git a/tools/kvm/main.c b/tools/kvm/main.c new file mode 100644 index 000000000000..05bc82c8c6fa --- /dev/null +++ b/tools/kvm/main.c @@ -0,0 +1,19 @@ +#include "kvm/kvm.h" + +#include <stdlib.h> +#include <stdio.h> + +/* user defined header files */ +#include <kvm/kvm-cmd.h> + +static int handle_kvm_command(int argc, char **argv) +{ + return handle_command(kvm_commands, argc, (const char **) &argv[0]); +} + +int main(int argc, char *argv[]) +{ + kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH); + + return handle_kvm_command(argc - 1, &argv[1]); +} diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c new file mode 100644 index 000000000000..5d65d280391b --- /dev/null +++ b/tools/kvm/mmio.c @@ -0,0 +1,139 @@ +#include "kvm/kvm.h" +#include "kvm/rbtree-interval.h" +#include "kvm/brlock.h" + +#include <stdio.h> +#include <stdlib.h> + +#include <sys/ioctl.h> +#include <linux/kvm.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/err.h> +#include <errno.h> + +#define mmio_node(n) rb_entry(n, struct mmio_mapping, node) + +struct mmio_mapping { + struct rb_int_node node; + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr); + void *ptr; +}; + +static struct rb_root mmio_tree = RB_ROOT; + +static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len) +{ + struct rb_int_node *node; + + node = rb_int_search_range(root, addr, addr + len); + if (node == NULL) + return NULL; + + return mmio_node(node); +} + +/* Find lowest match, Check for overlap */ +static struct mmio_mapping 
*mmio_search_single(struct rb_root *root, u64 addr) +{ + struct rb_int_node *node; + + node = rb_int_search_single(root, addr); + if (node == NULL) + return NULL; + + return mmio_node(node); +} + +static int mmio_insert(struct rb_root *root, struct mmio_mapping *data) +{ + return rb_int_insert(root, &data->node); +} + +static const char *to_direction(u8 is_write) +{ + if (is_write) + return "write"; + + return "read"; +} + +int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce, + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), + void *ptr) +{ + struct mmio_mapping *mmio; + struct kvm_coalesced_mmio_zone zone; + int ret; + + mmio = malloc(sizeof(*mmio)); + if (mmio == NULL) + return -ENOMEM; + + *mmio = (struct mmio_mapping) { + .node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len), + .mmio_fn = mmio_fn, + .ptr = ptr, + }; + + if (coalesce) { + zone = (struct kvm_coalesced_mmio_zone) { + .addr = phys_addr, + .size = phys_addr_len, + }; + ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone); + if (ret < 0) { + free(mmio); + return -errno; + } + } + br_write_lock(kvm); + ret = mmio_insert(&mmio_tree, mmio); + br_write_unlock(kvm); + + return ret; +} + +bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr) +{ + struct mmio_mapping *mmio; + struct kvm_coalesced_mmio_zone zone; + + br_write_lock(kvm); + mmio = mmio_search_single(&mmio_tree, phys_addr); + if (mmio == NULL) { + br_write_unlock(kvm); + return false; + } + + zone = (struct kvm_coalesced_mmio_zone) { + .addr = phys_addr, + .size = 1, + }; + ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone); + + rb_int_erase(&mmio_tree, &mmio->node); + br_write_unlock(kvm); + + free(mmio); + return true; +} + +bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + struct mmio_mapping *mmio; + + br_read_lock(); + mmio = mmio_search(&mmio_tree, phys_addr, len); + + if (mmio) + mmio->mmio_fn(phys_addr, data, len, 
is_write, mmio->ptr); + else { + if (kvm->cfg.mmio_debug) + fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n", + to_direction(is_write), phys_addr, len); + } + br_read_unlock(); + + return true; +} diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c new file mode 100644 index 000000000000..98423da6cb19 --- /dev/null +++ b/tools/kvm/net/uip/arp.c @@ -0,0 +1,30 @@ +#include "kvm/uip.h" + +int uip_tx_do_arp(struct uip_tx_arg *arg) +{ + struct uip_arp *arp, *arp2; + struct uip_info *info; + struct uip_buf *buf; + + info = arg->info; + buf = uip_buf_clone(arg); + + arp = (struct uip_arp *)(arg->eth); + arp2 = (struct uip_arp *)(buf->eth); + + /* + * ARP replay code: 2 + */ + arp2->op = htons(0x2); + arp2->dmac = arp->smac; + arp2->dip = arp->sip; + + if (arp->dip == htonl(info->host_ip)) { + arp2->smac = info->host_mac; + arp2->sip = htonl(info->host_ip); + + uip_buf_set_used(info, buf); + } + + return 0; +} diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c new file mode 100644 index 000000000000..f29ad41cb8fc --- /dev/null +++ b/tools/kvm/net/uip/buf.c @@ -0,0 +1,114 @@ +#include "kvm/uip.h" + +#include <linux/kernel.h> +#include <linux/list.h> + +struct uip_buf *uip_buf_get_used(struct uip_info *info) +{ + struct uip_buf *buf; + bool found = false; + + mutex_lock(&info->buf_lock); + + while (!(info->buf_used_nr > 0)) + pthread_cond_wait(&info->buf_used_cond, &info->buf_lock.mutex); + + list_for_each_entry(buf, &info->buf_head, list) { + if (buf->status == UIP_BUF_STATUS_USED) { + /* + * Set status to INUSE immediately to prevent + * someone from using this buf until we free it + */ + buf->status = UIP_BUF_STATUS_INUSE; + info->buf_used_nr--; + found = true; + break; + } + } + + mutex_unlock(&info->buf_lock); + + return found ? 
buf : NULL; +} + +struct uip_buf *uip_buf_get_free(struct uip_info *info) +{ + struct uip_buf *buf; + bool found = false; + + mutex_lock(&info->buf_lock); + + while (!(info->buf_free_nr > 0)) + pthread_cond_wait(&info->buf_free_cond, &info->buf_lock.mutex); + + list_for_each_entry(buf, &info->buf_head, list) { + if (buf->status == UIP_BUF_STATUS_FREE) { + /* + * Set status to INUSE immediately to prevent + * someone from using this buf until we free it + */ + buf->status = UIP_BUF_STATUS_INUSE; + info->buf_free_nr--; + found = true; + break; + } + } + + mutex_unlock(&info->buf_lock); + + return found ? buf : NULL; +} + +struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf) +{ + mutex_lock(&info->buf_lock); + + buf->status = UIP_BUF_STATUS_USED; + info->buf_used_nr++; + pthread_cond_signal(&info->buf_used_cond); + + mutex_unlock(&info->buf_lock); + + return buf; +} + +struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf) +{ + mutex_lock(&info->buf_lock); + + buf->status = UIP_BUF_STATUS_FREE; + info->buf_free_nr++; + pthread_cond_signal(&info->buf_free_cond); + + mutex_unlock(&info->buf_lock); + + return buf; +} + +struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg) +{ + struct uip_buf *buf; + struct uip_eth *eth2; + struct uip_info *info; + + info = arg->info; + + /* + * Get buffer from device to guest + */ + buf = uip_buf_get_free(info); + + /* + * Clone buffer + */ + memcpy(buf->vnet, arg->vnet, arg->vnet_len); + memcpy(buf->eth, arg->eth, arg->eth_len); + buf->vnet_len = arg->vnet_len; + buf->eth_len = arg->eth_len; + + eth2 = (struct uip_eth *)buf->eth; + eth2->src = info->host_mac; + eth2->dst = arg->eth->src; + + return buf; +} diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c new file mode 100644 index 000000000000..4e5bb82e48b1 --- /dev/null +++ b/tools/kvm/net/uip/core.c @@ -0,0 +1,190 @@ +#include "kvm/mutex.h" +#include "kvm/uip.h" + +#include <linux/virtio_net.h> +#include <linux/kernel.h> 
+#include <linux/list.h> + +int uip_tx(struct iovec *iov, u16 out, struct uip_info *info) +{ + struct virtio_net_hdr *vnet; + struct uip_tx_arg arg; + int eth_len, vnet_len; + struct uip_eth *eth; + u8 *buf = NULL; + u16 proto; + int i; + + /* + * Buffer from guest to device + */ + vnet_len = iov[0].iov_len; + vnet = iov[0].iov_base; + + eth_len = iov[1].iov_len; + eth = iov[1].iov_base; + + /* + * In case, ethernet frame is in more than one iov entry. + * Copy iov buffer into one linear buffer. + */ + if (out > 2) { + eth_len = 0; + for (i = 1; i < out; i++) + eth_len += iov[i].iov_len; + + buf = malloc(eth_len); + if (!buf) + return -1; + + eth = (struct uip_eth *)buf; + for (i = 1; i < out; i++) { + memcpy(buf, iov[i].iov_base, iov[i].iov_len); + buf += iov[i].iov_len; + } + } + + memset(&arg, 0, sizeof(arg)); + + arg.vnet_len = vnet_len; + arg.eth_len = eth_len; + arg.info = info; + arg.vnet = vnet; + arg.eth = eth; + + /* + * Check package type + */ + proto = ntohs(eth->type); + + switch (proto) { + case UIP_ETH_P_ARP: + uip_tx_do_arp(&arg); + break; + case UIP_ETH_P_IP: + uip_tx_do_ipv4(&arg); + break; + default: + break; + } + + if (out > 2 && buf) + free(eth); + + return vnet_len + eth_len; +} + +int uip_rx(struct iovec *iov, u16 in, struct uip_info *info) +{ + struct virtio_net_hdr *vnet; + struct uip_eth *eth; + struct uip_buf *buf; + int vnet_len; + int eth_len; + char *p; + int len; + int cnt; + int i; + + /* + * Sleep until there is a buffer for guest + */ + buf = uip_buf_get_used(info); + + /* + * Fill device to guest buffer, vnet hdr fisrt + */ + vnet_len = iov[0].iov_len; + vnet = iov[0].iov_base; + if (buf->vnet_len > vnet_len) { + len = -1; + goto out; + } + memcpy(vnet, buf->vnet, buf->vnet_len); + + /* + * Then, the real eth data + * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides + */ + cnt = buf->eth_len; + p = buf->eth; + for (i = 1; i < in; i++) { + eth_len = iov[i].iov_len; + eth = iov[i].iov_base; + if (cnt 
> eth_len) { + memcpy(eth, p, eth_len); + cnt -= eth_len; + p += eth_len; + } else { + memcpy(eth, p, cnt); + cnt -= cnt; + break; + } + } + + if (cnt) { + pr_warning("uip_rx error"); + len = -1; + goto out; + } + + len = buf->vnet_len + buf->eth_len; + +out: + uip_buf_set_free(info, buf); + return len; +} + +int uip_init(struct uip_info *info) +{ + struct list_head *udp_socket_head; + struct list_head *tcp_socket_head; + struct list_head *buf_head; + struct uip_buf *buf; + int buf_nr; + int i; + + udp_socket_head = &info->udp_socket_head; + tcp_socket_head = &info->tcp_socket_head; + buf_head = &info->buf_head; + buf_nr = info->buf_nr; + + INIT_LIST_HEAD(udp_socket_head); + INIT_LIST_HEAD(tcp_socket_head); + INIT_LIST_HEAD(buf_head); + + mutex_init(&info->udp_socket_lock); + mutex_init(&info->tcp_socket_lock); + mutex_init(&info->buf_lock); + + pthread_cond_init(&info->buf_used_cond, NULL); + pthread_cond_init(&info->buf_free_cond, NULL); + + + for (i = 0; i < buf_nr; i++) { + buf = malloc(sizeof(*buf)); + memset(buf, 0, sizeof(*buf)); + + buf->status = UIP_BUF_STATUS_FREE; + buf->info = info; + buf->id = i; + list_add_tail(&buf->list, buf_head); + } + + list_for_each_entry(buf, buf_head, list) { + buf->vnet = malloc(sizeof(struct virtio_net_hdr)); + buf->vnet_len = sizeof(struct virtio_net_hdr); + buf->eth = malloc(1024*64 + sizeof(struct uip_pseudo_hdr)); + buf->eth_len = 1024*64 + sizeof(struct uip_pseudo_hdr); + + memset(buf->vnet, 0, buf->vnet_len); + memset(buf->eth, 0, buf->eth_len); + } + + info->buf_free_nr = buf_nr; + info->buf_used_nr = 0; + + uip_dhcp_get_dns(info); + + return 0; +} diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c new file mode 100644 index 000000000000..7ca8badaaeee --- /dev/null +++ b/tools/kvm/net/uip/csum.c @@ -0,0 +1,92 @@ +#include "kvm/uip.h" + +static u16 uip_csum(u16 csum, u8 *addr, u16 count) +{ + long sum = csum; + + while (count > 1) { + sum += *(u16 *)addr; + addr += 2; + count -= 2; + } + + if (count > 0) 
+ sum += *(unsigned char *)addr; + + while (sum>>16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +u16 uip_csum_ip(struct uip_ip *ip) +{ + return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip)); +} + +u16 uip_csum_icmp(struct uip_icmp *icmp) +{ + struct uip_ip *ip; + + ip = &icmp->ip; + return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */ +} + +u16 uip_csum_udp(struct uip_udp *udp) +{ + struct uip_pseudo_hdr hdr; + struct uip_ip *ip; + int udp_len; + u8 *pad; + + ip = &udp->ip; + + hdr.sip = ip->sip; + hdr.dip = ip->dip; + hdr.zero = 0; + hdr.proto = ip->proto; + hdr.len = udp->len; + + udp_len = uip_udp_len(udp); + + if (udp_len % 2) { + pad = (u8 *)&udp->sport + udp_len; + *pad = 0; + memcpy((u8 *)&udp->sport + udp_len + 1, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&udp->sport, udp_len + 1 + sizeof(hdr)); + } else { + memcpy((u8 *)&udp->sport + udp_len, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&udp->sport, udp_len + sizeof(hdr)); + } + +} + +u16 uip_csum_tcp(struct uip_tcp *tcp) +{ + struct uip_pseudo_hdr hdr; + struct uip_ip *ip; + u16 tcp_len; + u8 *pad; + + ip = &tcp->ip; + tcp_len = ntohs(ip->len) - uip_ip_hdrlen(ip); + + hdr.sip = ip->sip; + hdr.dip = ip->dip; + hdr.zero = 0; + hdr.proto = ip->proto; + hdr.len = htons(tcp_len); + + if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20) + pr_warning("tcp_len(%d) is too large", tcp_len); + + if (tcp_len % 2) { + pad = (u8 *)&tcp->sport + tcp_len; + *pad = 0; + memcpy((u8 *)&tcp->sport + tcp_len + 1, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&tcp->sport, tcp_len + 1 + sizeof(hdr)); + } else { + memcpy((u8 *)&tcp->sport + tcp_len, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&tcp->sport, tcp_len + sizeof(hdr)); + } +} diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c new file mode 100644 index 000000000000..b17d35239321 --- /dev/null +++ b/tools/kvm/net/uip/dhcp.c @@ -0,0 +1,202 @@ +#include "kvm/uip.h" + +#include <arpa/inet.h> 
+ +#define EMPTY_ADDR "0.0.0.0" + +static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp) +{ + return (dhcp->option[2] == UIP_DHCP_DISCOVER && + dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN && + dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE); +} + +static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp) +{ + return (dhcp->option[2] == UIP_DHCP_REQUEST && + dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN && + dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE); +} + +bool uip_udp_is_dhcp(struct uip_udp *udp) +{ + struct uip_dhcp *dhcp; + + if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT || + ntohs(udp->dport) != UIP_DHCP_PORT_SERVER) + return false; + + dhcp = (struct uip_dhcp *)udp; + + if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE) + return false; + + return true; +} + +int uip_dhcp_get_dns(struct uip_info *info) +{ + char key[256], val[256]; + struct in_addr addr; + int ret = -1; + int n = 0; + FILE *fp; + u32 ip; + + fp = fopen("/etc/resolv.conf", "r"); + if (!fp) + return ret; + + while (!feof(fp)) { + if (fscanf(fp, "%s %s\n", key, val) != 2) + continue; + if (strncmp("domain", key, 6) == 0) + info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN); + else if (strncmp("nameserver", key, 10) == 0) { + if (!inet_aton(val, &addr)) + continue; + ip = ntohl(addr.s_addr); + if (n < UIP_DHCP_MAX_DNS_SERVER_NR) + info->dns_ip[n++] = ip; + ret = 0; + } + } + + fclose(fp); + return ret; +} + +static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i) +{ + u8 domain_name_len; + u32 *addr; + int n; + + if (info->domain_name) { + domain_name_len = strlen(info->domain_name); + opt[i++] = UIP_DHCP_TAG_DOMAIN_NAME; + opt[i++] = domain_name_len; + memcpy(&opt[i], info->domain_name, domain_name_len); + i += domain_name_len; + } + + for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) { + if (info->dns_ip[n] == 0) + continue; + opt[i++] = UIP_DHCP_TAG_DNS_SERVER; + opt[i++] = UIP_DHCP_TAG_DNS_SERVER_LEN; + addr = (u32 *)&opt[i]; + *addr = 
htonl(info->dns_ip[n]); + i += UIP_DHCP_TAG_DNS_SERVER_LEN; + } + + return i; +} +static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type) +{ + int i = 0; + u32 *addr; + u8 *opt; + + opt = dhcp->option; + + opt[i++] = UIP_DHCP_TAG_MSG_TYPE; + opt[i++] = UIP_DHCP_TAG_MSG_TYPE_LEN; + opt[i++] = reply_msg_type; + + opt[i++] = UIP_DHCP_TAG_SERVER_ID; + opt[i++] = UIP_DHCP_TAG_SERVER_ID_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->host_ip); + i += UIP_DHCP_TAG_SERVER_ID_LEN; + + opt[i++] = UIP_DHCP_TAG_LEASE_TIME; + opt[i++] = UIP_DHCP_TAG_LEASE_TIME_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(UIP_DHCP_LEASE_TIME); + i += UIP_DHCP_TAG_LEASE_TIME_LEN; + + opt[i++] = UIP_DHCP_TAG_SUBMASK; + opt[i++] = UIP_DHCP_TAG_SUBMASK_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->guest_netmask); + i += UIP_DHCP_TAG_SUBMASK_LEN; + + opt[i++] = UIP_DHCP_TAG_ROUTER; + opt[i++] = UIP_DHCP_TAG_ROUTER_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->host_ip); + i += UIP_DHCP_TAG_ROUTER_LEN; + + opt[i++] = UIP_DHCP_TAG_ROOT; + opt[i++] = strlen(EMPTY_ADDR); + addr = (u32 *)&opt[i]; + strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR)); + i += strlen(EMPTY_ADDR); + + i = uip_dhcp_fill_option_name_and_server(info, opt, i); + + opt[i++] = UIP_DHCP_TAG_END; + + return 0; +} + +static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type) +{ + struct uip_dhcp *dhcp; + + dhcp = (struct uip_dhcp *)buf->eth; + + dhcp->msg_type = 2; + dhcp->client_ip = 0; + dhcp->your_ip = htonl(info->guest_ip); + dhcp->server_ip = htonl(info->host_ip); + dhcp->agent_ip = 0; + + uip_dhcp_fill_option(info, dhcp, reply_msg_type); + + sk->sip = htonl(info->guest_ip); + sk->dip = htonl(info->host_ip); + sk->sport = htons(UIP_DHCP_PORT_CLIENT); + sk->dport = htons(UIP_DHCP_PORT_SERVER); + + return 0; +} + +int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg) +{ + struct uip_udp_socket sk; + struct uip_dhcp 
*dhcp; + struct uip_info *info; + struct uip_buf *buf; + u8 reply_msg_type; + + dhcp = (struct uip_dhcp *)arg->eth; + + if (uip_dhcp_is_discovery(dhcp)) + reply_msg_type = UIP_DHCP_OFFER; + else if (uip_dhcp_is_request(dhcp)) + reply_msg_type = UIP_DHCP_ACK; + else + return -1; + + buf = uip_buf_clone(arg); + info = arg->info; + + /* + * Cook DHCP pkg + */ + uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type); + + /* + * Cook UDP pkg + */ + uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN); + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + + return 0; +} diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c new file mode 100644 index 000000000000..233297caf44b --- /dev/null +++ b/tools/kvm/net/uip/icmp.c @@ -0,0 +1,29 @@ +#include "kvm/uip.h" + +int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg) +{ + struct uip_ip *ip, *ip2; + struct uip_icmp *icmp2; + struct uip_buf *buf; + + buf = uip_buf_clone(arg); + + icmp2 = (struct uip_icmp *)(buf->eth); + ip2 = (struct uip_ip *)(buf->eth); + ip = (struct uip_ip *)(arg->eth); + + ip2->sip = ip->dip; + ip2->dip = ip->sip; + ip2->csum = 0; + /* + * ICMP reply: 0 + */ + icmp2->type = 0; + icmp2->csum = 0; + ip2->csum = uip_csum_ip(ip2); + icmp2->csum = uip_csum_icmp(icmp2); + + uip_buf_set_used(arg->info, buf); + + return 0; +} diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c new file mode 100644 index 000000000000..58373fd022e0 --- /dev/null +++ b/tools/kvm/net/uip/ipv4.c @@ -0,0 +1,29 @@ +#include "kvm/uip.h" + +int uip_tx_do_ipv4(struct uip_tx_arg *arg) +{ + struct uip_ip *ip; + + ip = (struct uip_ip *)(arg->eth); + + if (uip_ip_hdrlen(ip) != 20) { + pr_warning("IP header length is not 20 bytes"); + return -1; + } + + switch (ip->proto) { + case UIP_IP_P_ICMP: + uip_tx_do_ipv4_icmp(arg); + break; + case UIP_IP_P_TCP: + uip_tx_do_ipv4_tcp(arg); + break; + case UIP_IP_P_UDP: + uip_tx_do_ipv4_udp(arg); + break; + default: + break; + } + + return 0; +} 
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c new file mode 100644 index 000000000000..9044f40ba2d0 --- /dev/null +++ b/tools/kvm/net/uip/tcp.c @@ -0,0 +1,348 @@ +#include "kvm/uip.h" + +#include <kvm/kvm.h> +#include <linux/virtio_net.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <arpa/inet.h> + +static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how) +{ + shutdown(sk->fd, how); + + if (sk->write_done && sk->read_done) { + shutdown(sk->fd, SHUT_RDWR); + close(sk->fd); + + mutex_lock(sk->lock); + list_del(&sk->list); + mutex_unlock(sk->lock); + + free(sk); + } + + return 0; +} + +static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct mutex *sk_lock; + struct uip_tcp_socket *sk; + + sk_head = &arg->info->tcp_socket_head; + sk_lock = &arg->info->tcp_socket_lock; + + mutex_lock(sk_lock); + list_for_each_entry(sk, sk_head, list) { + if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) { + mutex_unlock(sk_lock); + return sk; + } + } + mutex_unlock(sk_lock); + + return NULL; +} + +static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct uip_tcp_socket *sk; + struct mutex *sk_lock; + struct uip_tcp *tcp; + struct uip_ip *ip; + int ret; + + tcp = (struct uip_tcp *)arg->eth; + ip = (struct uip_ip *)arg->eth; + + sk_head = &arg->info->tcp_socket_head; + sk_lock = &arg->info->tcp_socket_lock; + + sk = malloc(sizeof(*sk)); + memset(sk, 0, sizeof(*sk)); + + sk->lock = sk_lock; + sk->info = arg->info; + + sk->fd = socket(AF_INET, SOCK_STREAM, 0); + sk->addr.sin_family = AF_INET; + sk->addr.sin_port = dport; + sk->addr.sin_addr.s_addr = dip; + + pthread_cond_init(&sk->cond, NULL); + + if (ntohl(dip) == arg->info->host_ip) + sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + ret = connect(sk->fd, (struct 
sockaddr *)&sk->addr, sizeof(sk->addr)); + if (ret) { + free(sk); + return NULL; + } + + sk->sip = ip->sip; + sk->dip = ip->dip; + sk->sport = tcp->sport; + sk->dport = tcp->dport; + + mutex_lock(sk_lock); + list_add_tail(&sk->list, sk_head); + mutex_unlock(sk_lock); + + return sk; +} + +static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len) +{ + struct uip_info *info; + struct uip_eth *eth2; + struct uip_tcp *tcp2; + struct uip_buf *buf; + struct uip_ip *ip2; + + info = sk->info; + + /* + * Get free buffer to send data to guest + */ + buf = uip_buf_get_free(info); + + /* + * Cook a ethernet frame + */ + tcp2 = (struct uip_tcp *)buf->eth; + eth2 = (struct uip_eth *)buf->eth; + ip2 = (struct uip_ip *)buf->eth; + + eth2->src = info->host_mac; + eth2->dst = info->guest_mac; + eth2->type = htons(UIP_ETH_P_IP); + + ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN; + ip2->tos = 0; + ip2->id = 0; + ip2->flgfrag = 0; + ip2->ttl = UIP_IP_TTL; + ip2->proto = UIP_IP_P_TCP; + ip2->csum = 0; + ip2->sip = sk->dip; + ip2->dip = sk->sip; + + tcp2->sport = sk->dport; + tcp2->dport = sk->sport; + tcp2->seq = htonl(sk->seq_server); + tcp2->ack = htonl(sk->ack_server); + /* + * Diable TCP options, tcp hdr len equals 20 bytes + */ + tcp2->off = UIP_TCP_HDR_LEN; + tcp2->flg = flag; + tcp2->win = htons(UIP_TCP_WIN_SIZE); + tcp2->csum = 0; + tcp2->urgent = 0; + + if (payload_len > 0) + memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len); + + ip2->len = htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2)); + ip2->csum = uip_csum_ip(ip2); + tcp2->csum = uip_csum_tcp(tcp2); + + /* + * virtio_net_hdr + */ + buf->vnet_len = sizeof(struct virtio_net_hdr); + memset(buf->vnet, 0, buf->vnet_len); + + buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth); + + /* + * Increase server seq + */ + sk->seq_server += payload_len; + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + + return 0; +} + +static void 
*uip_tcp_socket_thread(void *p) +{ + struct uip_tcp_socket *sk; + int len, left, ret; + u8 *payload, *pos; + + kvm__set_thread_name("uip-tcp"); + + sk = p; + + payload = malloc(UIP_MAX_TCP_PAYLOAD); + if (!payload) + goto out; + + while (1) { + pos = payload; + + ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD); + + if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD) + goto out; + + left = ret; + + while (left > 0) { + mutex_lock(sk->lock); + while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0) + pthread_cond_wait(&sk->cond, &sk->lock->mutex); + mutex_unlock(sk->lock); + + sk->payload = pos; + if (len > left) + len = left; + if (len > UIP_MAX_TCP_PAYLOAD) + len = UIP_MAX_TCP_PAYLOAD; + left -= len; + pos += len; + + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len); + } + } + +out: + /* + * Close server to guest TCP connection + */ + uip_tcp_socket_close(sk, SHUT_RD); + + uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0); + sk->seq_server += 1; + + sk->read_done = 1; + + free(payload); + pthread_exit(NULL); + + return NULL; +} + +static int uip_tcp_socket_receive(struct uip_tcp_socket *sk) +{ + if (sk->thread == 0) + return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk); + + return 0; +} + +static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp) +{ + int len; + int ret; + u8 *payload; + + if (sk->write_done) + return 0; + + payload = uip_tcp_payload(tcp); + len = uip_tcp_payloadlen(tcp); + + ret = write(sk->fd, payload, len); + if (ret != len) + pr_warning("tcp send error"); + + return ret; +} + +int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg) +{ + struct uip_tcp_socket *sk; + struct uip_tcp *tcp; + struct uip_ip *ip; + int ret; + + tcp = (struct uip_tcp *)arg->eth; + ip = (struct uip_ip *)arg->eth; + + /* + * Guest is trying to start a TCP session, let's fake SYN-ACK to guest + */ + if (uip_tcp_is_syn(tcp)) { + sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport); + if (!sk) + 
return -1; + + sk->window_size = ntohs(tcp->win); + + /* + * Setup ISN number + */ + sk->isn_guest = uip_tcp_isn(tcp); + sk->isn_server = uip_tcp_isn_alloc(); + + sk->seq_server = sk->isn_server; + sk->ack_server = sk->isn_guest + 1; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0); + sk->seq_server += 1; + + /* + * Start receive thread for data from remote to guest + */ + uip_tcp_socket_receive(sk); + + goto out; + } + + /* + * Find socket we have allocated + */ + sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport); + if (!sk) + return -1; + + mutex_lock(sk->lock); + sk->window_size = ntohs(tcp->win); + sk->guest_acked = ntohl(tcp->ack); + pthread_cond_signal(&sk->cond); + mutex_unlock(sk->lock); + + if (uip_tcp_is_fin(tcp)) { + if (sk->write_done) + goto out; + + sk->write_done = 1; + sk->ack_server += 1; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0); + + /* + * Close guest to server TCP connection + */ + uip_tcp_socket_close(sk, SHUT_WR); + + goto out; + } + + /* + * Ignore guest to server frames with zero tcp payload + */ + if (uip_tcp_payloadlen(tcp) == 0) + goto out; + + /* + * Sent out TCP data to remote host + */ + ret = uip_tcp_socket_send(sk, tcp); + if (ret < 0) + return -1; + /* + * Send ACK to guest imediately + */ + sk->ack_server += ret; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0); + +out: + return 0; +} diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c new file mode 100644 index 000000000000..31c417cd5ca9 --- /dev/null +++ b/tools/kvm/net/uip/udp.c @@ -0,0 +1,239 @@ +#include "kvm/uip.h" + +#include <kvm/kvm.h> +#include <linux/virtio_net.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <sys/socket.h> +#include <sys/epoll.h> +#include <fcntl.h> + +#define UIP_UDP_MAX_EVENTS 1000 + +static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct uip_udp_socket *sk; + struct mutex *sk_lock; 
+ struct epoll_event ev; + int flags; + int ret; + + sk_head = &arg->info->udp_socket_head; + sk_lock = &arg->info->udp_socket_lock; + + /* + * Find existing sk + */ + mutex_lock(sk_lock); + list_for_each_entry(sk, sk_head, list) { + if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) { + mutex_unlock(sk_lock); + return sk; + } + } + mutex_unlock(sk_lock); + + /* + * Allocate new one + */ + sk = malloc(sizeof(*sk)); + memset(sk, 0, sizeof(*sk)); + + sk->lock = sk_lock; + + sk->fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sk->fd < 0) + goto out; + + /* + * Set non-blocking + */ + flags = fcntl(sk->fd, F_GETFL, 0); + flags |= O_NONBLOCK; + fcntl(sk->fd, F_SETFL, flags); + + /* + * Add sk->fd to epoll_wait + */ + ev.events = EPOLLIN; + ev.data.fd = sk->fd; + ev.data.ptr = sk; + if (arg->info->udp_epollfd <= 0) + arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS); + ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev); + if (ret == -1) + pr_warning("epoll_ctl error"); + + sk->addr.sin_family = AF_INET; + sk->addr.sin_addr.s_addr = dip; + sk->addr.sin_port = dport; + + sk->sip = sip; + sk->dip = dip; + sk->sport = sport; + sk->dport = dport; + + mutex_lock(sk_lock); + list_add_tail(&sk->list, sk_head); + mutex_unlock(sk_lock); + + return sk; + +out: + free(sk); + return NULL; +} + +static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp) +{ + int len; + int ret; + + len = ntohs(udp->len) - uip_udp_hdrlen(udp); + + ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr)); + if (ret != len) + return -1; + + return 0; +} + +int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len) +{ + struct uip_eth *eth2; + struct uip_udp *udp2; + struct uip_ip *ip2; + + /* + * Cook a ethernet frame + */ + udp2 = (struct uip_udp *)(buf->eth); + eth2 = (struct uip_eth *)buf->eth; + ip2 = (struct uip_ip *)(buf->eth); + + 
eth2->src = info->host_mac; + eth2->dst = info->guest_mac; + eth2->type = htons(UIP_ETH_P_IP); + + ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN; + ip2->tos = 0; + ip2->id = 0; + ip2->flgfrag = 0; + ip2->ttl = UIP_IP_TTL; + ip2->proto = UIP_IP_P_UDP; + ip2->csum = 0; + + ip2->sip = sk->dip; + ip2->dip = sk->sip; + udp2->sport = sk->dport; + udp2->dport = sk->sport; + + udp2->len = htons(payload_len + uip_udp_hdrlen(udp2)); + udp2->csum = 0; + + if (payload) + memcpy(udp2->payload, payload, payload_len); + + ip2->len = udp2->len + htons(uip_ip_hdrlen(ip2)); + ip2->csum = uip_csum_ip(ip2); + udp2->csum = uip_csum_udp(udp2); + + /* + * virtio_net_hdr + */ + buf->vnet_len = sizeof(struct virtio_net_hdr); + memset(buf->vnet, 0, buf->vnet_len); + + buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth); + + return 0; +} + +static void *uip_udp_socket_thread(void *p) +{ + struct epoll_event events[UIP_UDP_MAX_EVENTS]; + struct uip_udp_socket *sk; + struct uip_info *info; + struct uip_buf *buf; + int payload_len; + u8 *payload; + int nfds; + int i; + + kvm__set_thread_name("uip-udp"); + + info = p; + + do { + payload = malloc(UIP_MAX_UDP_PAYLOAD); + } while (!payload); + + while (1) { + nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1); + + if (nfds == -1) + continue; + + for (i = 0; i < nfds; i++) { + + sk = events[i].data.ptr; + payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL); + if (payload_len < 0) + continue; + + /* + * Get free buffer to send data to guest + */ + buf = uip_buf_get_free(info); + + uip_udp_make_pkg(info, sk, buf, payload, payload_len); + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + } + } + + free(payload); + pthread_exit(NULL); + return NULL; +} + +int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg) +{ + struct uip_udp_socket *sk; + struct uip_info *info; + struct uip_udp *udp; + struct uip_ip *ip; + int ret; + + udp = (struct uip_udp *)(arg->eth); + ip = (struct uip_ip 
*)(arg->eth); + info = arg->info; + + if (uip_udp_is_dhcp(udp)) { + uip_tx_do_ipv4_udp_dhcp(arg); + return 0; + } + + /* + * Find socket we have allocated before, otherwise allocate one + */ + sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport); + if (!sk) + return -1; + + /* + * Send out UDP data to remote host + */ + ret = uip_udp_socket_send(sk, udp); + if (ret) + return -1; + + if (!info->udp_thread) + pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info); + + return 0; +} diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c new file mode 100644 index 000000000000..8d3732d35842 --- /dev/null +++ b/tools/kvm/pci.c @@ -0,0 +1,200 @@ +#include "kvm/devices.h" +#include "kvm/pci.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <assert.h> + +#define PCI_BAR_OFFSET(b) (offsetof(struct pci_device_header, bar[b])) + +static union pci_config_address pci_config_address; + +/* This is within our PCI gap - in an unused area. + * Note this is a PCI *bus address*, is used to assign BARs etc.! + * (That's why it can still 32bit even with 64bit guests-- 64bit + * PCI isn't currently supported.) 
+ */ +static u32 io_space_blocks = KVM_PCI_MMIO_AREA; + +u32 pci_get_io_space_block(u32 size) +{ + u32 block = io_space_blocks; + io_space_blocks += size; + + return block; +} + +static void *pci_config_address_ptr(u16 port) +{ + unsigned long offset; + void *base; + + offset = port - PCI_CONFIG_ADDRESS; + base = &pci_config_address; + + return base + offset; +} + +static bool pci_config_address_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + void *p = pci_config_address_ptr(port); + + memcpy(p, data, size); + + return true; +} + +static bool pci_config_address_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + void *p = pci_config_address_ptr(port); + + memcpy(data, p, size); + + return true; +} + +static struct ioport_operations pci_config_address_ops = { + .io_in = pci_config_address_in, + .io_out = pci_config_address_out, +}; + +static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number) +{ + if (pci_config_address.bus_number != bus_number) + return false; + + if (pci_config_address.function_number != function_number) + return false; + + return !IS_ERR_OR_NULL(device__find_dev(DEVICE_BUS_PCI, device_number)); +} + +static bool pci_config_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * If someone accesses PCI configuration space offsets that are not + * aligned to 4 bytes, it uses ioports to signify that. + */ + pci_config_address.reg_offset = port - PCI_CONFIG_DATA; + + pci__config_wr(kvm, pci_config_address, data, size); + + return true; +} + +static bool pci_config_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * If someone accesses PCI configuration space offsets that are not + * aligned to 4 bytes, it uses ioports to signify that. 
+ */ + pci_config_address.reg_offset = port - PCI_CONFIG_DATA; + + pci__config_rd(kvm, pci_config_address, data, size); + + return true; +} + +static struct ioport_operations pci_config_data_ops = { + .io_in = pci_config_data_in, + .io_out = pci_config_data_out, +}; + +void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size) +{ + u8 dev_num; + + dev_num = addr.device_number; + + if (pci_device_exists(0, dev_num, 0)) { + unsigned long offset; + + offset = addr.w & 0xff; + if (offset < sizeof(struct pci_device_header)) { + void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data; + struct pci_device_header *hdr = p; + u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32)); + u32 sz = PCI_IO_SIZE; + + if (bar < 6 && hdr->bar_size[bar]) + sz = hdr->bar_size[bar]; + + /* + * If the kernel masks the BAR it would expect to find the + * size of the BAR there next time it reads from it. + * When the kernel got the size it would write the address + * back. + */ + if (*(u32 *)(p + offset)) { + /* See if kernel tries to mask one of the BARs */ + if ((offset >= PCI_BAR_OFFSET(0)) && + (offset <= PCI_BAR_OFFSET(6)) && + (ioport__read32(data) == 0xFFFFFFFF)) + memcpy(p + offset, &sz, sizeof(sz)); + else + memcpy(p + offset, data, size); + } + } + } +} + +void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size) +{ + u8 dev_num; + + dev_num = addr.device_number; + + if (pci_device_exists(0, dev_num, 0)) { + unsigned long offset; + + offset = addr.w & 0xff; + if (offset < sizeof(struct pci_device_header)) { + void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data; + + memcpy(data, p + offset, size); + } else { + memset(data, 0x00, size); + } + } else { + memset(data, 0xff, size); + } +} + +struct pci_device_header *pci__find_dev(u8 dev_num) +{ + struct device_header *hdr = device__find_dev(DEVICE_BUS_PCI, dev_num); + + if (IS_ERR_OR_NULL(hdr)) + return NULL; + + return hdr->data; +} + +int pci__init(struct kvm 
*kvm) +{ + int r; + + r = ioport__register(kvm, PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL); + if (r < 0) + return r; + + r = ioport__register(kvm, PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL); + if (r < 0) { + ioport__unregister(kvm, PCI_CONFIG_DATA); + return r; + } + + return 0; +} +dev_base_init(pci__init); + +int pci__exit(struct kvm *kvm) +{ + ioport__unregister(kvm, PCI_CONFIG_DATA); + ioport__unregister(kvm, PCI_CONFIG_ADDRESS); + + return 0; +} +dev_base_exit(pci__exit); diff --git a/tools/kvm/powerpc/boot.c b/tools/kvm/powerpc/boot.c new file mode 100644 index 000000000000..2557fc077e42 --- /dev/null +++ b/tools/kvm/powerpc/boot.c @@ -0,0 +1,8 @@ +#include "kvm/kvm.h" + +#include <stdbool.h> + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + return false; +} diff --git a/tools/kvm/powerpc/cpu_info.c b/tools/kvm/powerpc/cpu_info.c new file mode 100644 index 000000000000..11ca14e23b8a --- /dev/null +++ b/tools/kvm/powerpc/cpu_info.c @@ -0,0 +1,195 @@ +/* + * PPC CPU identification + * + * This is a very simple "host CPU info" struct to get us going. + * For the little host information we need, I don't want to grub about + * parsing stuff in /proc/device-tree so just match host PVR to differentiate + * PPC970 and POWER7 (which is all that's currently supported). + * + * Qemu does something similar but this is MUCH simpler! + * + * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <kvm/kvm.h> +#include <sys/ioctl.h> + +#include "cpu_info.h" +#include "kvm/util.h" + +/* POWER7 */ + +static struct cpu_info cpu_power7_info = { + .name = "POWER7", + .tb_freq = 512000000, + .d_bsize = 128, + .i_bsize = 128, + .flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX, + .mmu_info = { + .flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS, + .slb_size = 32, + }, +}; + +/* PPC970/G5 */ + +static struct cpu_info cpu_970_info = { + .name = "G5", + .tb_freq = 33333333, + .d_bsize = 128, + .i_bsize = 128, + .flags = CPUINFO_FLAG_VMX, +}; + +/* This is a default catchall for 'no match' on PVR: */ +static struct cpu_info cpu_dummy_info = { .name = "unknown" }; + +static struct pvr_info host_pvr_info[] = { + { 0xffffffff, 0x0f000003, &cpu_power7_info }, + { 0xffff0000, 0x003f0000, &cpu_power7_info }, + { 0xffff0000, 0x004a0000, &cpu_power7_info }, + { 0xffff0000, 0x00390000, &cpu_970_info }, + { 0xffff0000, 0x003c0000, &cpu_970_info }, + { 0xffff0000, 0x00440000, &cpu_970_info }, + { 0xffff0000, 0x00450000, &cpu_970_info }, +}; + +/* If we can't query the kernel for supported page sizes assume 4K and 16M */ +static struct kvm_ppc_one_seg_page_size fallback_sps[] = { + [0] = { + .page_shift = 12, + .slb_enc = 0, + .enc = { + [0] = { + .page_shift = 12, + .pte_enc = 0, + }, + }, + }, + [1] = { + .page_shift = 24, + .slb_enc = 0x100, + .enc = { + [0] = { + .page_shift = 24, + .pte_enc = 0, + }, + }, + }, +}; + + +static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info) +{ + static struct kvm_ppc_smmu_info *mmu_info; + struct kvm_ppc_one_seg_page_size *sps; + int i, j, k, valid; + + if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) { + memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps)); + } else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) { + die_perror("KVM_PPC_GET_SMMU_INFO failed"); + } + + mmu_info = &cpu_info->mmu_info; + + if (!(mmu_info->flags & 
KVM_PPC_PAGE_SIZES_REAL)) + /* Guest pages are not restricted by the backing page size */ + return; + + /* Filter based on backing page size */ + + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &mmu_info->sps[i]; + + if (!sps->page_shift) + break; + + if (kvm->ram_pagesize < (1ul << sps->page_shift)) { + /* Mark the whole segment size invalid */ + sps->page_shift = 0; + continue; + } + + /* Check each page size for the segment */ + for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (!sps->enc[j].page_shift) + break; + + if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift)) + sps->enc[j].page_shift = 0; + else + valid++; + } + + if (!valid) { + /* Mark the whole segment size invalid */ + sps->page_shift = 0; + continue; + } + + /* Mark any trailing entries invalid if we broke out early */ + for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) + sps->enc[k].page_shift = 0; + + /* Collapse holes */ + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (sps->enc[j].page_shift) + continue; + + for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) { + if (sps->enc[k].page_shift) { + sps->enc[j] = sps->enc[k]; + sps->enc[k].page_shift = 0; + break; + } + } + } + } + + /* Mark any trailing entries invalid if we broke out early */ + for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) + mmu_info->sps[j].page_shift = 0; + + /* Collapse holes */ + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + if (mmu_info->sps[i].page_shift) + continue; + + for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (mmu_info->sps[j].page_shift) { + mmu_info->sps[i] = mmu_info->sps[j]; + mmu_info->sps[j].page_shift = 0; + break; + } + } + } +} + +struct cpu_info *find_cpu_info(struct kvm *kvm) +{ + struct cpu_info *info; + unsigned int i; + u32 pvr = kvm->arch.pvr; + + for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) { + if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) { + info = host_pvr_info[i].cpu_info; + break; + } + } + + /* Didn't find 
anything? Rut-ro. */ + if (!info) { + pr_warning("Host CPU unsupported by kvmtool\n"); + info = &cpu_dummy_info; + } + + setup_mmu_info(kvm, info); + + return info; +} diff --git a/tools/kvm/powerpc/cpu_info.h b/tools/kvm/powerpc/cpu_info.h new file mode 100644 index 000000000000..f61707a8075d --- /dev/null +++ b/tools/kvm/powerpc/cpu_info.h @@ -0,0 +1,42 @@ +/* + * PPC CPU identification + * + * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef CPU_INFO_H +#define CPU_INFO_H + +#include <kvm/kvm.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kvm.h> + +struct cpu_info { + const char *name; + u32 tb_freq; /* timebase frequency */ + u32 d_bsize; /* d-cache block size */ + u32 i_bsize; /* i-cache block size */ + u32 flags; + struct kvm_ppc_smmu_info mmu_info; +}; + +struct pvr_info { + u32 pvr_mask; + u32 pvr; + struct cpu_info *cpu_info; +}; + +/* Misc capabilities/CPU properties */ +#define CPUINFO_FLAG_DFP 0x00000001 +#define CPUINFO_FLAG_VMX 0x00000002 +#define CPUINFO_FLAG_VSX 0x00000004 + +struct cpu_info *find_cpu_info(struct kvm *kvm); + +#endif diff --git a/tools/kvm/powerpc/include/kvm/barrier.h b/tools/kvm/powerpc/include/kvm/barrier.h new file mode 100644 index 000000000000..dd5115acaff6 --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/barrier.h @@ -0,0 +1,6 @@ +#ifndef _KVM_BARRIER_H_ +#define _KVM_BARRIER_H_ + +#include <asm/barrier.h> + +#endif /* _KVM_BARRIER_H_ */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-arch.h b/tools/kvm/powerpc/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..d93e1429e0ba --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/kvm-arch.h @@ -0,0 +1,59 @@ +/* + * PPC64 architecture-specific definitions + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> + +/* + * MMIO lives after RAM, but it'd be nice if it didn't constantly move. + * Choose a suitably high address, e.g. 63T... This limits RAM size. + */ +#define PPC_MMIO_START 0x3F0000000000UL +#define PPC_MMIO_SIZE 0x010000000000UL + +#define KERNEL_LOAD_ADDR 0x0000000000000000 +#define KERNEL_START_ADDR 0x0000000000000000 +#define KERNEL_SECONDARY_START_ADDR 0x0000000000000060 +#define INITRD_LOAD_ADDR 0x0000000002800000 + +#define RTAS_MAX_SIZE 0x10000 + +#define TIMEBASE_FREQ 512000000ULL + +#define KVM_MMIO_START PPC_MMIO_START + +/* + * This is the address that pci_get_io_space_block() starts allocating + * from. Note that this is a PCI bus address. + */ +#define KVM_PCI_MMIO_AREA 0x1000000 +#define KVM_VIRTIO_MMIO_AREA 0x2000000 + +#define VIRTIO_DEFAULT_TRANS VIRTIO_PCI + +struct spapr_phb; + +struct kvm_arch { + u64 sdr1; + u32 pvr; + unsigned long rtas_gra; + unsigned long rtas_size; + unsigned long fdt_gra; + unsigned long initrd_gra; + unsigned long initrd_size; + struct icp_state *icp; + struct spapr_phb *phb; +}; + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-config-arch.h b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..60f61de0296f --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h @@ -0,0 +1,7 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +struct kvm_config_arch { +}; + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..7520c049a948 --- /dev/null +++ 
b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,76 @@ +/* + * PPC64 cpu-specific definitions + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +/* Architecture-specific kvm_cpu definitions. */ + +#include <linux/kvm.h> /* for struct kvm_regs */ +#include <stdbool.h> +#include <pthread.h> + +#define MSR_SF (1UL<<63) +#define MSR_HV (1UL<<60) +#define MSR_VEC (1UL<<25) +#define MSR_VSX (1UL<<23) +#define MSR_POW (1UL<<18) +#define MSR_EE (1UL<<15) +#define MSR_PR (1UL<<14) +#define MSR_FP (1UL<<13) +#define MSR_ME (1UL<<12) +#define MSR_FE0 (1UL<<11) +#define MSR_SE (1UL<<10) +#define MSR_BE (1UL<<9) +#define MSR_FE1 (1UL<<8) +#define MSR_IR (1UL<<5) +#define MSR_DR (1UL<<4) +#define MSR_PMM (1UL<<2) +#define MSR_RI (1UL<<1) +#define MSR_LE (1UL<<0) + +#define POWER7_EXT_IRQ 0 + +struct kvm; + +struct kvm_cpu { + pthread_t thread; /* VCPU thread */ + + unsigned long cpu_id; + + struct kvm *kvm; /* parent KVM */ + int vcpu_fd; /* For VCPU ioctls() */ + struct kvm_run *kvm_run; + + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + u8 is_running; + u8 paused; + u8 needs_nmi; + /* + * Although PPC KVM doesn't yet support coalesced MMIO, generic code + * needs this in our kvm_cpu: + */ + struct kvm_coalesced_mmio_ring *ring; +}; + +void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level); + +/* This is never actually called on PPC. 
*/ +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write); + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/powerpc/ioport.c b/tools/kvm/powerpc/ioport.c new file mode 100644 index 000000000000..264fb7e2d57d --- /dev/null +++ b/tools/kvm/powerpc/ioport.c @@ -0,0 +1,18 @@ +/* + * PPC64 ioport platform setup. There isn't any! :-) + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/ioport.h" + +#include <stdlib.h> + +void ioport__setup_arch(struct kvm *kvm) +{ + /* PPC has no legacy ioports to set up */ +} diff --git a/tools/kvm/powerpc/irq.c b/tools/kvm/powerpc/irq.c new file mode 100644 index 000000000000..ae9da507fb82 --- /dev/null +++ b/tools/kvm/powerpc/irq.c @@ -0,0 +1,50 @@ +/* + * PPC64 IRQ routines + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/devices.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> +#include <sys/ioctl.h> + +#include <stddef.h> +#include <stdlib.h> + +#include "kvm/pci.h" + +#include "xics.h" +#include "spapr_pci.h" + +/* + * FIXME: The code in this file assumes an SPAPR guest, using XICS. Make + * generic & cope with multiple PPC platform types. + */ + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + *pin = 1; + /* + * Have I said how nasty I find this? 
Line should be dontcare... PHB + * should determine which CPU/XICS IRQ to fire. + */ + *line = xics_alloc_irqnum(); + return 0; +} + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + die(__FUNCTION__); + return 0; +} diff --git a/tools/kvm/powerpc/kvm-cpu.c b/tools/kvm/powerpc/kvm-cpu.c new file mode 100644 index 000000000000..8fce121705c8 --- /dev/null +++ b/tools/kvm/powerpc/kvm-cpu.c @@ -0,0 +1,290 @@ +/* + * PPC64 processor support + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include "spapr.h" +#include "spapr_pci.h" +#include "xics.h" + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <assert.h> + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm) +{ + struct kvm_cpu *vcpu; + + vcpu = calloc(1, sizeof *vcpu); + if (!vcpu) + return NULL; + + vcpu->kvm = kvm; + + return vcpu; +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + free(vcpu); +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int mmap_size; + struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR }; + + vcpu = kvm_cpu__new(kvm); + if (!vcpu) + return NULL; + + vcpu->cpu_id = cpu_id; + + vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + die_perror("KVM_GET_VCPU_MMAP_SIZE 
ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0) + die("unable to enable PAPR capability"); + + /* + * We start all CPUs, directing non-primary threads into the kernel's + * secondary start point. When we come to support SLOF, we will start + * only one and SLOF will RTAS call us to ask for others to be + * started. (FIXME: make more generic & interface with whichever + * firmware a platform may be using.) + */ + vcpu->is_running = true; + + return vcpu; +} + +static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu) +{ + /* Don't have to do anything, there's no expected FPU state. */ +} + +static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu) +{ + /* + * FIXME: This assumes PPC64 and Linux guest. It doesn't use the + * OpenFirmware entry method, but instead the "embedded" entry which + * passes the FDT address directly. + */ + struct kvm_regs *r = &vcpu->regs; + + if (vcpu->cpu_id == 0) { + r->pc = KERNEL_START_ADDR; + r->gpr[3] = vcpu->kvm->arch.fdt_gra; + r->gpr[5] = 0; + } else { + r->pc = KERNEL_SECONDARY_START_ADDR; + r->gpr[3] = vcpu->cpu_id; + } + r->msr = 0x8000000000001000UL; /* 64bit, non-HV, ME */ + + if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0) + die_perror("KVM_SET_REGS failed"); +} + +static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu) +{ + /* + * Some sregs setup to initialise SDR1/PVR/HIOR on PPC64 SPAPR + * platforms using PR KVM. (Technically, this is all ignored on + * SPAPR HV KVM.) Different setup is required for non-PV non-SPAPR + * platforms! (FIXME.) 
+ */ + struct kvm_sregs sregs; + struct kvm_one_reg reg = {}; + u64 value; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_SREGS failed"); + + sregs.u.s.sdr1 = vcpu->kvm->arch.sdr1; + sregs.pvr = vcpu->kvm->arch.pvr; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0) + die("KVM_SET_SREGS failed"); + + reg.id = KVM_REG_PPC_HIOR; + value = 0; + reg.addr = (u64)&value; + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, ®) < 0) + die("KVM_SET_ONE_REG failed"); +} + +/** + * kvm_cpu__reset_vcpu - reset virtual CPU to a known state + */ +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + kvm_cpu__setup_regs(vcpu); + kvm_cpu__setup_sregs(vcpu); + kvm_cpu__setup_fpu(vcpu); +} + +/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */ +void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level) +{ + unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET; + + /* FIXME: POWER-specific */ + if (pin != POWER7_EXT_IRQ) + return; + if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0) + pr_warning("Could not KVM_INTERRUPT."); +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + bool ret = true; + struct kvm_run *run = vcpu->kvm_run; + switch(run->exit_reason) { + case KVM_EXIT_PAPR_HCALL: + run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr, + (target_ulong*)run->papr_hcall.args); + break; + default: + ret = false; + } + return ret; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + /* + * FIXME: This function will need to be split in order to support + * various PowerPC platforms/PHB types, etc. It currently assumes SPAPR + * PPC64 guest. + */ + bool ret = false; + + if ((phys_addr >= SPAPR_PCI_WIN_START) && + (phys_addr < SPAPR_PCI_WIN_END)) { + ret = spapr_phb_mmio(kvm, phys_addr, data, len, is_write); + } else { + pr_warning("MMIO %s unknown address %llx (size %d)!\n", + is_write ? 
"write to" : "read from", + phys_addr, len); + } + return ret; +} + +#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "") + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + int r; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, ®s) < 0) + die("KVM_GET_REGS failed"); + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_SREGS failed"); + + dprintf(debug_fd, "\n Registers:\n"); + dprintf(debug_fd, " NIP: %016llx MSR: %016llx " + "( %s%s%s%s%s%s%s%s%s%s%s%s)\n", + regs.pc, regs.msr, + CONDSTR_BIT(regs.msr, SF), + CONDSTR_BIT(regs.msr, HV), /* ! */ + CONDSTR_BIT(regs.msr, VEC), + CONDSTR_BIT(regs.msr, VSX), + CONDSTR_BIT(regs.msr, EE), + CONDSTR_BIT(regs.msr, PR), + CONDSTR_BIT(regs.msr, FP), + CONDSTR_BIT(regs.msr, ME), + CONDSTR_BIT(regs.msr, IR), + CONDSTR_BIT(regs.msr, DR), + CONDSTR_BIT(regs.msr, RI), + CONDSTR_BIT(regs.msr, LE)); + dprintf(debug_fd, " CTR: %016llx LR: %016llx CR: %08llx\n", + regs.ctr, regs.lr, regs.cr); + dprintf(debug_fd, " SRR0: %016llx SRR1: %016llx XER: %016llx\n", + regs.srr0, regs.srr1, regs.xer); + dprintf(debug_fd, " SPRG0: %016llx SPRG1: %016llx\n", + regs.sprg0, regs.sprg1); + dprintf(debug_fd, " SPRG2: %016llx SPRG3: %016llx\n", + regs.sprg2, regs.sprg3); + dprintf(debug_fd, " SPRG4: %016llx SPRG5: %016llx\n", + regs.sprg4, regs.sprg5); + dprintf(debug_fd, " SPRG6: %016llx SPRG7: %016llx\n", + regs.sprg6, regs.sprg7); + dprintf(debug_fd, " GPRs:\n "); + for (r = 0; r < 32; r++) { + dprintf(debug_fd, "%016llx ", regs.gpr[r]); + if ((r & 3) == 3) + dprintf(debug_fd, "\n "); + } + dprintf(debug_fd, "\n"); + + /* FIXME: Assumes SLB-based (book3s) guest */ + for (r = 0; r < 32; r++) { + dprintf(debug_fd, " SLB%02d %016llx %016llx\n", r, + sregs.u.s.ppc64.slb[r].slbe, + sregs.u.s.ppc64.slb[r].slbv); + } + dprintf(debug_fd, "----------\n"); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0) + 
die("KVM_GET_REGS failed"); + + /* FIXME: Dump/disassemble some code...! */ + + dprintf(debug_fd, "\n Stack:\n"); + dprintf(debug_fd, " ------\n"); + /* Only works in real mode: */ + kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32); +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ + /* Does nothing yet */ +} diff --git a/tools/kvm/powerpc/kvm.c b/tools/kvm/powerpc/kvm.c new file mode 100644 index 000000000000..dc9f89d55500 --- /dev/null +++ b/tools/kvm/powerpc/kvm.c @@ -0,0 +1,529 @@ +/* + * PPC64 (SPAPR) platform support + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM + * Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/util.h" +#include "cpu_info.h" + +#include "spapr.h" +#include "spapr_hvcons.h" +#include "spapr_pci.h" + +#include <linux/kvm.h> + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> +#include <asm/unistd.h> +#include <errno.h> + +#include <linux/byteorder.h> + +#define HPT_ORDER 24 + +#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/" + +#define PHANDLE_XICP 0x00001111 + +static char kern_cmdline[2048]; + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) }, + { DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) }, + { 0, 0 } +}; + +static uint32_t mfpvr(void) +{ + uint32_t r; + asm volatile ("mfpvr %0" : "=r"(r)); + return r; +} + +bool kvm__arch_cpu_supports_vm(void) +{ + return true; +} + +void kvm__init_ram(struct kvm *kvm) +{ + u64 phys_start, phys_size; + void *host_mem; + + phys_start = 0; + phys_size = kvm->ram_size; + host_mem = 
kvm->ram_start; + + /* + * We put MMIO at PPC_MMIO_START, high up. Make sure that this doesn't + * crash into the end of RAM -- on PPC64 at least, this is so high + * (63TB!) that this is unlikely. + */ + if (phys_size >= PPC_MMIO_START) + die("Too much memory (%lld, what a nice problem): " + "overlaps MMIO!\n", + phys_size); + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); +} + +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ + /* We don't need anything unusual in here. */ +} + +/* Architecture-specific KVM init */ +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + int cap_ppc_rma; + unsigned long hpt; + + kvm->ram_size = ram_size; + + /* Map "default" hugetblfs path to the standard 16M mount point */ + if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default")) + hugetlbfs_path = HUGETLBFS_PATH; + + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size); + + if (kvm->ram_start == MAP_FAILED) + die("Couldn't map %lld bytes for RAM (%d)\n", + kvm->ram_size, errno); + + /* FDT goes at top of memory, RTAS just below */ + kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE; + /* FIXME: Not all PPC systems have RTAS */ + kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE; + madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE); + + /* FIXME: SPAPR-PR specific; allocate a guest HPT. */ + if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER))) + die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER)); + + kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18); + + kvm->arch.pvr = mfpvr(); + + /* FIXME: This is book3s-specific */ + cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA); + if (cap_ppc_rma == 2) + die("Need contiguous RMA allocation on this hardware, " + "which is not yet supported."); + + /* Do these before FDT setup, IRQ setup, etc. 
*/ + /* FIXME: SPAPR-specific */ + hypercall_init(); + register_core_rtas(); + /* Now that hypercalls are initialised, register a couple for the console: */ + spapr_hvcons_init(); + spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID, + SPAPR_PCI_MEM_WIN_ADDR, + SPAPR_PCI_MEM_WIN_SIZE, + SPAPR_PCI_IO_WIN_ADDR, + SPAPR_PCI_IO_WIN_SIZE); +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->ram_start, kvm->ram_size); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, 1); + kvm__irq_line(kvm, irq, 0); +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + /* FIXME: Should register callbacks to platform-specific polls */ + spapr_hvcons_poll(kvm); +} + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline) +{ + void *p; + void *k_start; + void *i_start; + int nr; + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start); + + if (fd_initrd != -1) { + if (lseek(fd_initrd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (p-k_start > INITRD_LOAD_ADDR) + die("Kernel overlaps initrd!"); + + /* Round up kernel size to 8byte alignment, and load initrd right after. 
*/ + i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR); + + while (((nr = read(fd_initrd, p, 65536)) > 0) && + p < (kvm->ram_start + kvm->ram_size)) + p += nr; + + if (p >= (kvm->ram_start + kvm->ram_size)) + die("initrd too big to contain in guest RAM.\n"); + + pr_info("Loaded initrd to 0x%x (%ld bytes)", + INITRD_LOAD_ADDR, p-i_start); + kvm->arch.initrd_gra = INITRD_LOAD_ADDR; + kvm->arch.initrd_size = p-i_start; + } else { + kvm->arch.initrd_size = 0; + } + strncpy(kern_cmdline, kernel_cmdline, 2048); + kern_cmdline[2047] = '\0'; + + return true; +} + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + /* We don't support bzImages. */ + return false; +} + +struct fdt_prop { + void *value; + int size; +}; + +static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop) +{ + struct kvm_ppc_one_seg_page_size *sps; + int i, j, size; + u32 *p; + + for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &info->sps[i]; + + if (sps->page_shift == 0) + break; + + /* page shift, slb enc & count */ + size += 3; + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (info->sps[i].enc[j].page_shift == 0) + break; + + /* page shift & pte enc */ + size += 2; + } + } + + if (!size) { + prop->value = NULL; + prop->size = 0; + return; + } + + /* Convert size to bytes */ + prop->size = size * sizeof(u32); + + prop->value = malloc(prop->size); + if (!prop->value) + die_perror("malloc failed"); + + p = (u32 *)prop->value; + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &info->sps[i]; + + if (sps->page_shift == 0) + break; + + *p++ = sps->page_shift; + *p++ = sps->slb_enc; + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) + if (!info->sps[i].enc[j].page_shift) + break; + + *p++ = j; /* count of enc */ + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (!info->sps[i].enc[j].page_shift) + break; + + *p++ = info->sps[i].enc[j].page_shift; + *p++ = 
info->sps[i].enc[j].pte_enc; + } + } +} + +#define SMT_THREADS 4 + +/* + * Set up the FDT for the kernel: This function is currently fairly SPAPR-heavy, + * and whilst most PPC targets will require CPU/memory nodes, others like RTAS + * should eventually be added separately. + */ +static int setup_fdt(struct kvm *kvm) +{ + uint64_t mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) }; + int smp_cpus = kvm->nrcpus; + uint32_t int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)}; + char hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0" + "hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0" + "hcall-splpar\0hcall-bulk"; + int i, j; + char cpu_name[30]; + u8 staging_fdt[FDT_MAX_SIZE]; + struct cpu_info *cpu_info = find_cpu_info(kvm); + struct fdt_prop segment_page_sizes; + u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff}; + + /* Generate an appropriate DT at kvm->arch.fdt_gra */ + void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra); + void *fdt = staging_fdt; + + _FDT(fdt_create(fdt, FDT_MAX_SIZE)); + _FDT(fdt_finish_reservemap(fdt)); + + _FDT(fdt_begin_node(fdt, "")); + + _FDT(fdt_property_string(fdt, "device_type", "chrp")); + _FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x2)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x2)); + + /* RTAS */ + _FDT(fdt_begin_node(fdt, "rtas")); + /* This is what the kernel uses to switch 'We're an LPAR'! 
*/ + _FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm, + sizeof(hypertas_prop_kvm))); + _FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra)); + _FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra)); + _FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size)); + /* Now add properties for all RTAS tokens: */ + if (spapr_rtas_fdt_setup(kvm, fdt)) + die("Couldn't create RTAS FDT properties\n"); + + _FDT(fdt_end_node(fdt)); + + /* /chosen */ + _FDT(fdt_begin_node(fdt, "chosen")); + /* cmdline */ + _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline)); + /* Initrd */ + if (kvm->arch.initrd_size != 0) { + uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra); + uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra + + kvm->arch.initrd_size); + _FDT(fdt_property(fdt, "linux,initrd-start", + &ird_st_prop, sizeof(ird_st_prop))); + _FDT(fdt_property(fdt, "linux,initrd-end", + &ird_end_prop, sizeof(ird_end_prop))); + } + + /* + * stdout-path: This is assuming we're using the HV console. Also, the + * address is hardwired until we do a VIO bus. + */ + _FDT(fdt_property_string(fdt, "linux,stdout-path", + "/vdevice/vty@30000000")); + _FDT(fdt_end_node(fdt)); + + /* + * Memory: We don't alloc. a separate RMA yet. If we ever need to + * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and + * another RMAsize->endOfMem. 
+ */ + _FDT(fdt_begin_node(fdt, "memory@0")); + _FDT(fdt_property_string(fdt, "device_type", "memory")); + _FDT(fdt_property(fdt, "reg", mem_reg_property, + sizeof(mem_reg_property))); + _FDT(fdt_end_node(fdt)); + + generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes); + + /* CPUs */ + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (i = 0; i < smp_cpus; i += SMT_THREADS) { + int32_t pft_size_prop[] = { 0, HPT_ORDER }; + uint32_t servers_prop[SMT_THREADS]; + uint32_t gservers_prop[SMT_THREADS * 2]; + int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS : + smp_cpus - i; + + sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i); + _FDT(fdt_begin_node(fdt, cpu_name)); + sprintf(cpu_name, "PowerPC,%s", cpu_info->name); + _FDT(fdt_property_string(fdt, "name", cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + + _FDT(fdt_property_cell(fdt, "reg", i)); + _FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr)); + + _FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize)); + _FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize)); + + _FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq)); + /* Lies, but safeish lies! */ + _FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200)); + + if (cpu_info->mmu_info.slb_size) + _FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size)); + + /* + * HPT size is hardwired; KVM currently fixes it at 16MB but the + * moment that changes we'll need to read it out of the kernel. 
+ */ + _FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop, + sizeof(pft_size_prop))); + + _FDT(fdt_property_string(fdt, "status", "okay")); + _FDT(fdt_property(fdt, "64-bit", NULL, 0)); + /* A server for each thread in this core */ + for (j = 0; j < SMT_THREADS; j++) { + servers_prop[j] = cpu_to_be32(i+j); + /* + * Hack borrowed from QEMU, direct the group queues back + * to cpu 0: + */ + gservers_prop[j*2] = cpu_to_be32(i+j); + gservers_prop[j*2 + 1] = 0; + } + _FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s", + servers_prop, threads * sizeof(uint32_t))); + _FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s", + gservers_prop, + threads * 2 * sizeof(uint32_t))); + + if (segment_page_sizes.value) + _FDT(fdt_property(fdt, "ibm,segment-page-sizes", + segment_page_sizes.value, + segment_page_sizes.size)); + + if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS) + _FDT(fdt_property(fdt, "ibm,processor-segment-sizes", + segment_sizes_1T, sizeof(segment_sizes_1T))); + + /* VSX / DFP options: */ + if (cpu_info->flags & CPUINFO_FLAG_VMX) + _FDT(fdt_property_cell(fdt, "ibm,vmx", + (cpu_info->flags & + CPUINFO_FLAG_VSX) ? 
2 : 1)); + if (cpu_info->flags & CPUINFO_FLAG_DFP) + _FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1)); + _FDT(fdt_end_node(fdt)); + } + _FDT(fdt_end_node(fdt)); + + /* IRQ controller */ + _FDT(fdt_begin_node(fdt, "interrupt-controller@0")); + + _FDT(fdt_property_string(fdt, "device_type", + "PowerPC-External-Interrupt-Presentation")); + _FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")); + _FDT(fdt_property_cell(fdt, "reg", 0)); + _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0)); + _FDT(fdt_property(fdt, "ibm,interrupt-server-ranges", + int_server_ranges_prop, + sizeof(int_server_ranges_prop))); + _FDT(fdt_property_cell(fdt, "#interrupt-cells", 2)); + _FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)); + _FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP)); + _FDT(fdt_end_node(fdt)); + + /* + * VIO: See comment in linux,stdout-path; we don't yet represent a VIO + * bus/address allocation so addresses are hardwired here. + */ + _FDT(fdt_begin_node(fdt, "vdevice")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + _FDT(fdt_property_string(fdt, "device_type", "vdevice")); + _FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice")); + _FDT(fdt_begin_node(fdt, "vty@30000000")); + _FDT(fdt_property_string(fdt, "name", "vty")); + _FDT(fdt_property_string(fdt, "device_type", "serial")); + _FDT(fdt_property_string(fdt, "compatible", "hvterm1")); + _FDT(fdt_property_cell(fdt, "reg", 0x30000000)); + _FDT(fdt_end_node(fdt)); + _FDT(fdt_end_node(fdt)); + + /* Finalise: */ + _FDT(fdt_end_node(fdt)); /* Root node */ + _FDT(fdt_finish(fdt)); + + _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE)); + + /* PCI */ + if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest)) + die("Fail populating PCI device nodes"); + + _FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size)); + _FDT(fdt_pack(fdt_dest)); + + free(segment_page_sizes.value); + + return 0; +} +firmware_init(setup_fdt); + 
+/** + * kvm__arch_setup_firmware + */ +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + /* + * Set up RTAS stub. All it is is a single hypercall: + * 0: 7c 64 1b 78 mr r4,r3 + * 4: 3c 60 00 00 lis r3,0 + * 8: 60 63 f0 00 ori r3,r3,61440 + * c: 44 00 00 22 sc 1 + * 10: 4e 80 00 20 blr + */ + uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra); + + rtas[0] = 0x7c641b78; + rtas[1] = 0x3c600000; + rtas[2] = 0x6063f000; + rtas[3] = 0x44000022; + rtas[4] = 0x4e800020; + kvm->arch.rtas_size = 20; + + pr_info("Set up %ld bytes of RTAS at 0x%lx\n", + kvm->arch.rtas_size, kvm->arch.rtas_gra); + + /* Load SLOF */ + + return 0; +} + +int kvm__arch_free_firmware(struct kvm *kvm) +{ + return 0; +} diff --git a/tools/kvm/powerpc/spapr.h b/tools/kvm/powerpc/spapr.h new file mode 100644 index 000000000000..0537f881c0e4 --- /dev/null +++ b/tools/kvm/powerpc/spapr.h @@ -0,0 +1,93 @@ +/* + * SPAPR definitions and declarations + * + * Borrowed heavily from QEMU's spapr.h, + * Copyright (c) 2010 David Gibson, IBM Corporation. + * + * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#if !defined(__HW_SPAPR_H__) +#define __HW_SPAPR_H__ + +#include <inttypes.h> + +/* We need some of the H_ hcall defs, but they're __KERNEL__ only. */ +#define __KERNEL__ +#include <asm/hvcall.h> +#undef __KERNEL__ + +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +typedef unsigned long target_ulong; +typedef uintptr_t target_phys_addr_t; + +/* + * The hcalls above are standardized in PAPR and implemented by pHyp + * as well. + * + * We also need some hcalls which are specific to qemu / KVM-on-POWER. + * So far we just need one for H_RTAS, but in future we'll need more + * for extensions like virtio. 
We put those into the 0xf000-0xfffc + * range which is reserved by PAPR for "platform-specific" hcalls. + */ +#define KVMPPC_HCALL_BASE 0xf000 +#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0) +#define KVMPPC_HCALL_MAX KVMPPC_H_RTAS + +#define DEBUG_SPAPR_HCALLS + +#ifdef DEBUG_SPAPR_HCALLS +#define hcall_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define hcall_dprintf(fmt, ...) \ + do { } while (0) +#endif + +typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu, + target_ulong opcode, + target_ulong *args); + +void hypercall_init(void); +void register_core_rtas(void); + +void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn); +target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode, + target_ulong *args); + +int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt); + +static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n) +{ + return *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)); +} + +static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val) +{ + *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = val; +} + +typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets); +void spapr_rtas_register(const char *name, spapr_rtas_fn fn); +target_ulong spapr_rtas_call(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets); + +#define SPAPR_PCI_BUID 0x800000020000001ULL +#define SPAPR_PCI_MEM_WIN_ADDR (KVM_MMIO_START + 0xA0000000) +#define SPAPR_PCI_MEM_WIN_SIZE 0x20000000 +#define SPAPR_PCI_IO_WIN_ADDR (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE) +#define SPAPR_PCI_IO_WIN_SIZE 0x2000000 + +#define SPAPR_PCI_WIN_START SPAPR_PCI_MEM_WIN_ADDR +#define SPAPR_PCI_WIN_END (SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE) + +#endif /* !defined (__HW_SPAPR_H__) */ diff --git a/tools/kvm/powerpc/spapr_hcall.c 
b/tools/kvm/powerpc/spapr_hcall.c new file mode 100644 index 000000000000..ff1d63ac37f1 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hcall.c @@ -0,0 +1,134 @@ +/* + * SPAPR hypercalls + * + * Borrowed heavily from QEMU's spapr_hcall.c, + * Copyright (c) 2010 David Gibson, IBM Corporation. + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +#include <stdio.h> +#include <assert.h> + +static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1]; +static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - + KVMPPC_HCALL_BASE + 1]; + +static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* FIXME: Implement this for -PR. (-HV does this in kernel.) */ + return H_HARDWARE; +} + +static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + target_ulong rtas_r3 = args[0]; + /* + * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just + * reference guest RAM directly. + */ + uint32_t token, nargs, nret; + + token = rtas_ld(vcpu->kvm, rtas_r3, 0); + nargs = rtas_ld(vcpu->kvm, rtas_r3, 1); + nret = rtas_ld(vcpu->kvm, rtas_r3, 2); + + return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12, + nret, rtas_r3 + 12 + 4*nargs); +} + +static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* SLOF will require these, though kernel doesn't. */ + die(__PRETTY_FUNCTION__); + return H_PARAMETER; +} + +static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* SLOF will require these, though kernel doesn't. 
*/ + die(__PRETTY_FUNCTION__); + return H_PARAMETER; +} + +static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* KVM will trap this in the kernel. Die if it misses. */ + die(__PRETTY_FUNCTION__); + return H_SUCCESS; +} + +static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* KVM will trap this in the kernel. Die if it misses. */ + die(__PRETTY_FUNCTION__); + return H_SUCCESS; +} + +void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn) +{ + spapr_hcall_fn *slot; + + if (opcode <= MAX_HCALL_OPCODE) { + assert((opcode & 0x3) == 0); + + slot = &papr_hypercall_table[opcode / 4]; + } else { + assert((opcode >= KVMPPC_HCALL_BASE) && + (opcode <= KVMPPC_HCALL_MAX)); + + slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE]; + } + + assert(!(*slot) || (fn == *slot)); + *slot = fn; +} + +target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode, + target_ulong *args) +{ + if ((opcode <= MAX_HCALL_OPCODE) + && ((opcode & 0x3) == 0)) { + spapr_hcall_fn fn = papr_hypercall_table[opcode / 4]; + + if (fn) { + return fn(vcpu, opcode, args); + } + } else if ((opcode >= KVMPPC_HCALL_BASE) && + (opcode <= KVMPPC_HCALL_MAX)) { + spapr_hcall_fn fn = kvmppc_hypercall_table[opcode - + KVMPPC_HCALL_BASE]; + + if (fn) { + return fn(vcpu, opcode, args); + } + } + + hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode); + return H_FUNCTION; +} + +void hypercall_init(void) +{ + /* hcall-dabr */ + spapr_register_hypercall(H_SET_DABR, h_set_dabr); + + spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load); + spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store); + spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load); + spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store); + spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi); + spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf); + + /* KVM-PPC specific hcalls */ + 
spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas); +} diff --git a/tools/kvm/powerpc/spapr_hvcons.c b/tools/kvm/powerpc/spapr_hvcons.c new file mode 100644 index 000000000000..0bdf75ba3689 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hvcons.c @@ -0,0 +1,108 @@ +/* + * SPAPR HV console + * + * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson, + * IBM Corporation. + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/term.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" +#include "spapr.h" +#include "spapr_hvcons.h" + +#include <stdio.h> +#include <sys/uio.h> +#include <errno.h> + +#include <linux/byteorder.h> + +union hv_chario { + struct { + uint64_t char0_7; + uint64_t char8_15; + } a; + uint8_t buf[16]; +}; + +static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args) +{ + /* To do: Read register from args[0], and check it. */ + unsigned long len = args[1]; + union hv_chario data; + struct iovec iov; + + if (len > 16) { + return H_PARAMETER; + } + data.a.char0_7 = cpu_to_be64(args[2]); + data.a.char8_15 = cpu_to_be64(args[3]); + + iov.iov_base = data.buf; + iov.iov_len = len; + do { + int ret; + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV) + ret = term_putc_iov(&iov, 1, 0); + else + ret = 0; + if (ret < 0) { + die("term_putc_iov error %d!\n", errno); + } + iov.iov_base += ret; + iov.iov_len -= ret; + } while (iov.iov_len > 0); + + return H_SUCCESS; +} + + +static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args) +{ + /* To do: Read register from args[0], and check it. 
*/ + unsigned long *len = args + 0; + unsigned long *char0_7 = args + 1; + unsigned long *char8_15 = args + 2; + union hv_chario data; + struct iovec iov; + + if (vcpu->kvm->cfg.active_console != CONSOLE_HV) + return H_SUCCESS; + + if (term_readable(0)) { + iov.iov_base = data.buf; + iov.iov_len = 16; + + *len = term_getc_iov(vcpu->kvm, &iov, 1, 0); + *char0_7 = be64_to_cpu(data.a.char0_7); + *char8_15 = be64_to_cpu(data.a.char8_15); + } else { + *len = 0; + } + + return H_SUCCESS; +} + +void spapr_hvcons_poll(struct kvm *kvm) +{ + if (term_readable(0)) { + /* + * We can inject an IRQ to guest here if we want. The guest + * will happily poll, though, so not required. + */ + } +} + +void spapr_hvcons_init(void) +{ + spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char); + spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char); +} diff --git a/tools/kvm/powerpc/spapr_hvcons.h b/tools/kvm/powerpc/spapr_hvcons.h new file mode 100644 index 000000000000..d3e4414a2951 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hvcons.h @@ -0,0 +1,19 @@ +/* + * SPAPR HV console + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef spapr_hvcons_H +#define spapr_hvcons_H + +#include "kvm/kvm.h" + +void spapr_hvcons_init(void); +void spapr_hvcons_poll(struct kvm *kvm); + +#endif diff --git a/tools/kvm/powerpc/spapr_pci.c b/tools/kvm/powerpc/spapr_pci.c new file mode 100644 index 000000000000..ed4b9ab52a7c --- /dev/null +++ b/tools/kvm/powerpc/spapr_pci.c @@ -0,0 +1,427 @@ +/* + * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes + * for enumerated devices. + * + * Borrowed heavily from QEMU's spapr_pci.c, + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation. + * Copyright (c) 2011 David Gibson, IBM Corporation. 
+ * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "spapr_pci.h" +#include "kvm/devices.h" +#include "kvm/fdt.h" +#include "kvm/util.h" +#include "kvm/pci.h" + +#include <linux/pci_regs.h> +#include <linux/byteorder.h> + + +/* #define DEBUG_PHB yes */ +#ifdef DEBUG_PHB +#define phb_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define phb_dprintf(fmt, ...) \ + do { } while (0) +#endif + +static const uint32_t bars[] = { + PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1, + PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3, + PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5 + /*, PCI_ROM_ADDRESS*/ +}; + +#define PCI_NUM_REGIONS 7 + +/* Macros to operate with address in OF binding to PCI */ +#define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p)) +#define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */ +#define b_p(x) b_x((x), 30, 1) /* 1 if prefetchable */ +#define b_t(x) b_x((x), 29, 1) /* 1 if the address is aliased */ +#define b_ss(x) b_x((x), 24, 2) /* the space code */ +#define b_bbbbbbbb(x) b_x((x), 16, 8) /* bus number */ +#define b_ddddd(x) b_x((x), 11, 5) /* device number */ +#define b_fff(x) b_x((x), 8, 3) /* function number */ +#define b_rrrrrrrr(x) b_x((x), 0, 8) /* register number */ + +#define SS_M64 3 +#define SS_M32 2 +#define SS_IO 1 +#define SS_CONFIG 0 + + +static struct spapr_phb phb; + + +static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint32_t val = 0; + uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2); + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = 
pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 3); + + if (buid != phb.buid || !dev || (size > 4)) { + phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n", + buid, addr.w, size); + + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + pci__config_rd(vcpu->kvm, addr, &val, size); + /* It appears this wants a byteswapped result... */ + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val>>16); + break; + case 1: + val = val >> 24; + break; + } + phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, val); +} + +static void rtas_read_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint32_t val; + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 1); + + if (!dev || (size > 4)) { + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + pci__config_rd(vcpu->kvm, addr, &val, size); + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val>>16); /* We're yuck-endian. 
*/ + break; + case 1: + val = val >> 24; + break; + } + phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val); + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, val); +} + +static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2); + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 3); + uint32_t val = rtas_ld(vcpu->kvm, args, 4); + + if (buid != phb.buid || !dev || (size > 4)) { + phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n", + buid, addr.w, size, val); + + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val) << 16; + break; + case 1: + val = val >> 24; + break; + } + pci__config_wr(vcpu->kvm, addr, &val, size); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +static void rtas_write_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 1); + uint32_t val = rtas_ld(vcpu->kvm, args, 2); + + if (!dev || (size > 4)) { + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + + phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + switch (size) { + case 4: + 
val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val) << 16; + break; + case 1: + val = val >> 24; + break; + } + pci__config_wr(vcpu->kvm, addr, &val, size); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +void spapr_create_phb(struct kvm *kvm, + const char *busname, uint64_t buid, + uint64_t mem_win_addr, uint64_t mem_win_size, + uint64_t io_win_addr, uint64_t io_win_size) +{ + /* + * Since kvmtool doesn't really have any concept of buses etc., + * there's nothing to register here. Just register RTAS. + */ + spapr_rtas_register("read-pci-config", rtas_read_pci_config); + spapr_rtas_register("write-pci-config", rtas_write_pci_config); + spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config); + spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config); + + phb.buid = buid; + phb.mem_addr = mem_win_addr; + phb.mem_size = mem_win_size; + phb.io_addr = io_win_addr; + phb.io_size = io_win_size; + + kvm->arch.phb = &phb; +} + +static uint32_t bar_to_ss(unsigned long bar) +{ + if ((bar & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_IO) + return SS_IO; + else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) + return SS_M64; + else + return SS_M32; +} + +static unsigned long bar_to_addr(unsigned long bar) +{ + if ((bar & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_IO) + return bar & PCI_BASE_ADDRESS_IO_MASK; + else + return bar & PCI_BASE_ADDRESS_MEM_MASK; +} + +int spapr_populate_pci_devices(struct kvm *kvm, + uint32_t xics_phandle, + void *fdt) +{ + int bus_off, node_off = 0, devid, fn, i, n, devices; + struct device_header *dev_hdr; + char nodename[256]; + struct { + uint32_t hi; + uint64_t addr; + uint64_t size; + } __attribute__((packed)) reg[PCI_NUM_REGIONS + 1], + assigned_addresses[PCI_NUM_REGIONS]; + uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) }; + struct { + uint32_t hi; + uint64_t child; + uint64_t parent; + uint64_t size; + } __attribute__((packed)) ranges[] = { + { + cpu_to_be32(b_ss(1)), cpu_to_be64(0), + 
cpu_to_be64(phb.io_addr), + cpu_to_be64(phb.io_size), + }, + { + cpu_to_be32(b_ss(2)), cpu_to_be64(0), + cpu_to_be64(phb.mem_addr), + cpu_to_be64(phb.mem_size), + }, + }; + uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 }; + uint32_t interrupt_map_mask[] = { + cpu_to_be32(b_ddddd(-1)|b_fff(-1)), 0x0, 0x0, 0x0}; + uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7]; + + /* Start populating the FDT */ + sprintf(nodename, "pci@%" PRIx64, phb.buid); + bus_off = fdt_add_subnode(fdt, 0, nodename); + if (bus_off < 0) { + die("error making bus subnode, %s\n", fdt_strerror(bus_off)); + return bus_off; + } + + /* Write PHB properties */ + _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci")); + _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB")); + _FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3)); + _FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2)); + _FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1)); + _FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0)); + _FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range))); + _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges))); + _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg))); + _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask", + &interrupt_map_mask, sizeof(interrupt_map_mask))); + + /* Populate PCI devices and allocate IRQs */ + devices = 0; + dev_hdr = device__first_dev(DEVICE_BUS_PCI); + while (dev_hdr) { + uint32_t *irqmap = interrupt_map[devices]; + struct pci_device_header *hdr = dev_hdr->data; + + if (!hdr) + continue; + + devid = dev_hdr->dev_num; + fn = 0; /* kvmtool doesn't yet do multifunction devices */ + + sprintf(nodename, "pci@%u,%u", devid, fn); + + /* Allocate interrupt from the map */ + if (devid > SPAPR_PCI_NUM_LSI) { + die("Unexpected behaviour in spapr_populate_pci_devices," + "wrong devid %u\n", devid); + } + irqmap[0] = cpu_to_be32(b_ddddd(devid)|b_fff(fn)); + irqmap[1] = 0; + irqmap[2] = 0; + 
irqmap[3] = 0; + irqmap[4] = cpu_to_be32(xics_phandle); + /* + * This is nasty; the PCI devs are set up such that their own + * header's irq_line indicates the direct XICS IRQ number to + * use. There REALLY needs to be a hierarchical system in place + * to 'raise' an IRQ on the bridge which indexes/looks up which + * XICS IRQ to fire. + */ + irqmap[5] = cpu_to_be32(hdr->irq_line); + irqmap[6] = cpu_to_be32(0x8); + + /* Add node to FDT */ + node_off = fdt_add_subnode(fdt, bus_off, nodename); + if (node_off < 0) { + die("error making node subnode, %s\n", fdt_strerror(bus_off)); + return node_off; + } + + _FDT(fdt_setprop_cell(fdt, node_off, "vendor-id", + le16_to_cpu(hdr->vendor_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "device-id", + le16_to_cpu(hdr->device_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "revision-id", + hdr->revision_id)); + _FDT(fdt_setprop_cell(fdt, node_off, "class-code", + hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16))); + _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id", + le16_to_cpu(hdr->subsys_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id", + le16_to_cpu(hdr->subsys_vendor_id))); + + /* Config space region comes first */ + reg[0].hi = cpu_to_be32( + b_n(0) | + b_p(0) | + b_t(0) | + b_ss(SS_CONFIG) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn)); + reg[0].addr = 0; + reg[0].size = 0; + + n = 0; + /* Six BARs, no ROM supported, addresses are 32bit */ + for (i = 0; i < 6; ++i) { + if (0 == hdr->bar[i]) { + continue; + } + + reg[n+1].hi = cpu_to_be32( + b_n(0) | + b_p(0) | + b_t(0) | + b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn) | + b_rrrrrrrr(bars[i])); + reg[n+1].addr = 0; + reg[n+1].size = cpu_to_be64(hdr->bar_size[i]); + + assigned_addresses[n].hi = cpu_to_be32( + b_n(1) | + b_p(0) | + b_t(0) | + b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn) | + b_rrrrrrrr(bars[i])); + + /* + * Writing zeroes to 
assigned_addresses causes the guest kernel to + * reassign BARs + */ + assigned_addresses[n].addr = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i]))); + assigned_addresses[n].size = reg[n+1].size; + + ++n; + } + _FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1))); + _FDT(fdt_setprop(fdt, node_off, "assigned-addresses", + assigned_addresses, + sizeof(assigned_addresses[0])*(n))); + _FDT(fdt_setprop_cell(fdt, node_off, "interrupts", + hdr->irq_pin)); + + /* We don't set ibm,dma-window property as we don't have an IOMMU. */ + + ++devices; + dev_hdr = device__next_dev(dev_hdr); + } + + /* Write interrupt map */ + _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, + devices * sizeof(interrupt_map[0]))); + + return 0; +} diff --git a/tools/kvm/powerpc/spapr_pci.h b/tools/kvm/powerpc/spapr_pci.h new file mode 100644 index 000000000000..48b221c5dc73 --- /dev/null +++ b/tools/kvm/powerpc/spapr_pci.h @@ -0,0 +1,57 @@ +/* + * SPAPR PHB definitions + * + * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef SPAPR_PCI_H +#define SPAPR_PCI_H + +#include "kvm/kvm.h" +#include "spapr.h" +#include <inttypes.h> + +/* With XICS, we can easily accomodate 1 IRQ per PCI device. 
*/ + +#define SPAPR_PCI_NUM_LSI 256 + +struct spapr_phb { + uint64_t buid; + uint64_t mem_addr; + uint64_t mem_size; + uint64_t io_addr; + uint64_t io_size; +}; + +void spapr_create_phb(struct kvm *kvm, + const char *busname, uint64_t buid, + uint64_t mem_win_addr, uint64_t mem_win_size, + uint64_t io_win_addr, uint64_t io_win_size); + +int spapr_populate_pci_devices(struct kvm *kvm, + uint32_t xics_phandle, + void *fdt); + +static inline bool spapr_phb_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) && + (phys_addr < SPAPR_PCI_IO_WIN_ADDR + + SPAPR_PCI_IO_WIN_SIZE)) { + return kvm__emulate_io(kvm, phys_addr - SPAPR_PCI_IO_WIN_ADDR, + data, is_write ? KVM_EXIT_IO_OUT : + KVM_EXIT_IO_IN, + len, 1); + } else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) && + (phys_addr < SPAPR_PCI_MEM_WIN_ADDR + + SPAPR_PCI_MEM_WIN_SIZE)) { + return kvm__emulate_mmio(kvm, phys_addr - SPAPR_PCI_MEM_WIN_ADDR, + data, len, is_write); + } + return false; +} + +#endif diff --git a/tools/kvm/powerpc/spapr_rtas.c b/tools/kvm/powerpc/spapr_rtas.c new file mode 100644 index 000000000000..c81d82b3857c --- /dev/null +++ b/tools/kvm/powerpc/spapr_rtas.c @@ -0,0 +1,233 @@ +/* + * SPAPR base RTAS calls + * + * Borrowed heavily from QEMU's spapr_rtas.c + * Copyright (c) 2010-2011 David Gibson, IBM Corporation. + * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "libfdt.h" + +#include "spapr.h" + +#include <stdio.h> +#include <assert.h> + +#define TOKEN_BASE 0x2000 +#define TOKEN_MAX 0x100 + +#define RTAS_CONSOLE + +static struct rtas_call { + const char *name; + spapr_rtas_fn fn; +} rtas_table[TOKEN_MAX]; + +struct rtas_call *rtas_next = rtas_table; + + +static void rtas_display_character(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + char c = rtas_ld(vcpu->kvm, args, 0); + term_putc(&c, 1, 0); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +#ifdef RTAS_CONSOLE +static void rtas_put_term_char(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + char c = rtas_ld(vcpu->kvm, args, 0); + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV) + term_putc(&c, 1, 0); + + rtas_st(vcpu->kvm, rets, 0, 0); +} + +static void rtas_get_term_char(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + int c; + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV && term_readable(0) && + (c = term_getc(vcpu->kvm, 0)) >= 0) { + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, c); + } else { + rtas_st(vcpu->kvm, rets, 0, -2); + } +} +#endif + +static void rtas_get_time_of_day(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct tm tm; + time_t tnow; + + if (nret != 8) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + tnow = time(NULL); + /* Guest time is currently not offset in any way. 
*/ + gmtime_r(&tnow, &tm); + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ + rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900); + rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1); + rtas_st(vcpu->kvm, rets, 3, tm.tm_mday); + rtas_st(vcpu->kvm, rets, 4, tm.tm_hour); + rtas_st(vcpu->kvm, rets, 5, tm.tm_min); + rtas_st(vcpu->kvm, rets, 6, tm.tm_sec); + rtas_st(vcpu->kvm, rets, 7, 0); +} + +static void rtas_set_time_of_day(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + pr_warning("%s called; TOD set ignored.\n", __FUNCTION__); +} + +static void rtas_power_off(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + if (nargs != 2 || nret != 1) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + kvm_cpu__reboot(vcpu->kvm); +} + +static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + if (nargs != 1 || nret != 2) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* + * Can read id = rtas_ld(vcpu->kvm, args, 0), but + * we currently start all CPUs. So just return true. 
+ */ + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, 2); +} + +static void rtas_start_cpu(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + die(__FUNCTION__); +} + +target_ulong spapr_rtas_call(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + if ((token >= TOKEN_BASE) + && ((token - TOKEN_BASE) < TOKEN_MAX)) { + struct rtas_call *call = rtas_table + (token - TOKEN_BASE); + + if (call->fn) { + call->fn(vcpu, token, nargs, args, nret, rets); + return H_SUCCESS; + } + } + + /* + * HACK: Some Linux early debug code uses RTAS display-character, + * but assumes the token value is 0xa (which it is on some real + * machines) without looking it up in the device tree. This + * special case makes this work + */ + if (token == 0xa) { + rtas_display_character(vcpu, 0xa, nargs, args, nret, rets); + return H_SUCCESS; + } + + hcall_dprintf("Unknown RTAS token 0x%x\n", token); + rtas_st(vcpu->kvm, rets, 0, -3); + return H_PARAMETER; +} + +void spapr_rtas_register(const char *name, spapr_rtas_fn fn) +{ + assert(rtas_next < (rtas_table + TOKEN_MAX)); + + rtas_next->name = name; + rtas_next->fn = fn; + + rtas_next++; +} + +/* + * This is called from the context of an open /rtas node, in order to add + * properties for the rtas call tokens. 
+ */ +int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt) +{ + int ret; + int i; + + for (i = 0; i < TOKEN_MAX; i++) { + struct rtas_call *call = &rtas_table[i]; + + if (!call->fn) { + continue; + } + + ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE); + + if (ret < 0) { + pr_warning("Couldn't add rtas token for %s: %s\n", + call->name, fdt_strerror(ret)); + return ret; + } + + } + return 0; +} + +void register_core_rtas(void) +{ + spapr_rtas_register("display-character", rtas_display_character); + spapr_rtas_register("get-time-of-day", rtas_get_time_of_day); + spapr_rtas_register("set-time-of-day", rtas_set_time_of_day); + spapr_rtas_register("power-off", rtas_power_off); + spapr_rtas_register("query-cpu-stopped-state", + rtas_query_cpu_stopped_state); + spapr_rtas_register("start-cpu", rtas_start_cpu); +#ifdef RTAS_CONSOLE + /* These are unused: We do console I/O via hcalls, not rtas. */ + spapr_rtas_register("put-term-char", rtas_put_term_char); + spapr_rtas_register("get-term-char", rtas_get_term_char); +#endif +} diff --git a/tools/kvm/powerpc/xics.c b/tools/kvm/powerpc/xics.c new file mode 100644 index 000000000000..d4b5caae8af7 --- /dev/null +++ b/tools/kvm/powerpc/xics.c @@ -0,0 +1,522 @@ +/* + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics + * + * Borrowed heavily from QEMU's xics.c, + * Copyright (c) 2010,2011 David Gibson, IBM Corporation. + * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "xics.h" +#include "kvm/util.h" + +#include <stdio.h> +#include <malloc.h> + +#define XICS_NUM_IRQS 1024 + + +/* #define DEBUG_XICS yes */ +#ifdef DEBUG_XICS +#define xics_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define xics_dprintf(fmt, ...) 
\ + do { } while (0) +#endif + +/* + * ICP: Presentation layer + */ + +struct icp_server_state { + uint32_t xirr; + uint8_t pending_priority; + uint8_t mfrr; + struct kvm_cpu *cpu; +}; + +#define XICS_IRQ_OFFSET 16 +#define XISR_MASK 0x00ffffff +#define CPPR_MASK 0xff000000 + +#define XISR(ss) (((ss)->xirr) & XISR_MASK) +#define CPPR(ss) (((ss)->xirr) >> 24) + +struct ics_state; + +struct icp_state { + unsigned long nr_servers; + struct icp_server_state *ss; + struct ics_state *ics; +}; + +static void ics_reject(struct ics_state *ics, int nr); +static void ics_resend(struct ics_state *ics); +static void ics_eoi(struct ics_state *ics, int nr); + +static inline void cpu_irq_raise(struct kvm_cpu *vcpu) +{ + xics_dprintf("INT1[%p]\n", vcpu); + kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1); +} + +static inline void cpu_irq_lower(struct kvm_cpu *vcpu) +{ + xics_dprintf("INT0[%p]\n", vcpu); + kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0); +} + +static void icp_check_ipi(struct icp_state *icp, int server) +{ + struct icp_server_state *ss = icp->ss + server; + + if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) { + return; + } + + if (XISR(ss)) { + ics_reject(icp->ics, XISR(ss)); + } + + ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI; + ss->pending_priority = ss->mfrr; + cpu_irq_raise(ss->cpu); +} + +static void icp_resend(struct icp_state *icp, int server) +{ + struct icp_server_state *ss = icp->ss + server; + + if (ss->mfrr < CPPR(ss)) { + icp_check_ipi(icp, server); + } + ics_resend(icp->ics); +} + +static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr) +{ + struct icp_server_state *ss = icp->ss + server; + uint8_t old_cppr; + uint32_t old_xisr; + + old_cppr = CPPR(ss); + ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24); + + if (cppr < old_cppr) { + if (XISR(ss) && (cppr <= ss->pending_priority)) { + old_xisr = XISR(ss); + ss->xirr &= ~XISR_MASK; /* Clear XISR */ + cpu_irq_lower(ss->cpu); + ics_reject(icp->ics, old_xisr); + } + } else { + if (!XISR(ss)) { + 
icp_resend(icp, server); + } + } +} + +static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr) +{ + struct icp_server_state *ss = icp->ss + nr; + + ss->mfrr = mfrr; + if (mfrr < CPPR(ss)) { + icp_check_ipi(icp, nr); + } +} + +static uint32_t icp_accept(struct icp_server_state *ss) +{ + uint32_t xirr; + + cpu_irq_lower(ss->cpu); + xirr = ss->xirr; + ss->xirr = ss->pending_priority << 24; + return xirr; +} + +static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr) +{ + struct icp_server_state *ss = icp->ss + server; + + ics_eoi(icp->ics, xirr & XISR_MASK); + /* Send EOI -> ICS */ + ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK); + if (!XISR(ss)) { + icp_resend(icp, server); + } +} + +static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority) +{ + struct icp_server_state *ss = icp->ss + server; + xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority); + if ((priority >= CPPR(ss)) + || (XISR(ss) && (ss->pending_priority <= priority))) { + xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n", + nr, CPPR(ss), XISR(ss), ss->pending_priority, priority); + ics_reject(icp->ics, nr); + } else { + if (XISR(ss)) { + xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n", + nr, CPPR(ss), XISR(ss), ss->pending_priority, priority); + ics_reject(icp->ics, XISR(ss)); + } + ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK); + ss->pending_priority = priority; + cpu_irq_raise(ss->cpu); + } +} + +/* + * ICS: Source layer + */ + +struct ics_irq_state { + int server; + uint8_t priority; + uint8_t saved_priority; + int rejected:1; + int masked_pending:1; +}; + +struct ics_state { + unsigned int nr_irqs; + unsigned int offset; + struct ics_irq_state *irqs; + struct icp_state *icp; +}; + +static int ics_valid_irq(struct ics_state *ics, uint32_t nr) +{ + return (nr >= ics->offset) + && (nr < (ics->offset + ics->nr_irqs)); +} + +static void ics_set_irq_msi(struct ics_state 
*ics, int srcno, int val) +{ + struct ics_irq_state *irq = ics->irqs + srcno; + + if (val) { + if (irq->priority == 0xff) { + xics_dprintf(" irq pri ff, masked pending\n"); + irq->masked_pending = 1; + } else { + icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority); + } + } +} + +static void ics_reject_msi(struct ics_state *ics, int nr) +{ + struct ics_irq_state *irq = ics->irqs + nr - ics->offset; + + irq->rejected = 1; +} + +static void ics_resend_msi(struct ics_state *ics) +{ + unsigned int i; + + for (i = 0; i < ics->nr_irqs; i++) { + struct ics_irq_state *irq = ics->irqs + i; + + /* FIXME: filter by server#? */ + if (irq->rejected) { + irq->rejected = 0; + if (irq->priority != 0xff) { + icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority); + } + } + } +} + +static void ics_write_xive_msi(struct ics_state *ics, int nr, int server, + uint8_t priority) +{ + struct ics_irq_state *irq = ics->irqs + nr - ics->offset; + + irq->server = server; + irq->priority = priority; + xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority); + + if (!irq->masked_pending || (priority == 0xff)) { + return; + } + + irq->masked_pending = 0; + icp_irq(ics->icp, server, nr, priority); +} + +static void ics_reject(struct ics_state *ics, int nr) +{ + ics_reject_msi(ics, nr); +} + +static void ics_resend(struct ics_state *ics) +{ + ics_resend_msi(ics); +} + +static void ics_eoi(struct ics_state *ics, int nr) +{ +} + +/* + * Exported functions + */ + +static int allocated_irqnum = XICS_IRQ_OFFSET; + +/* + * xics_alloc_irqnum(): This is hacky. The problem boils down to the PCI device + * code which just calls kvm__irq_line( .. pcidev->pci_hdr.irq_line ..) at will. + * Each PCI device's IRQ line is allocated by irq__register_device() (which + * allocates an IRQ AND allocates a.. PCI device num..). 
+ * + * In future I'd like to at least mimic some kind of 'upstream IRQ controller' + * whereby PCI devices let their PHB know when they want to IRQ, and that + * percolates up. + * + * For now, allocate a REAL xics irq number and (via irq__register_device) push + * that into the config space. 8 bits only though! + */ +int xics_alloc_irqnum(void) +{ + int irq = allocated_irqnum++; + + if (irq > 255) + die("Huge numbers of IRQs aren't supported with the daft kvmtool IRQ system."); + + return irq; +} + +static target_ulong h_cppr(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong cppr = args[0]; + + xics_dprintf("h_cppr(%lx)\n", cppr); + icp_set_cppr(vcpu->kvm->arch.icp, vcpu->cpu_id, cppr); + return H_SUCCESS; +} + +static target_ulong h_ipi(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong server = args[0]; + target_ulong mfrr = args[1]; + + xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr); + if (server >= vcpu->kvm->arch.icp->nr_servers) { + return H_PARAMETER; + } + + icp_set_mfrr(vcpu->kvm->arch.icp, server, mfrr); + return H_SUCCESS; +} + +static target_ulong h_xirr(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + uint32_t xirr = icp_accept(vcpu->kvm->arch.icp->ss + vcpu->cpu_id); + + xics_dprintf("h_xirr() = %x\n", xirr); + args[0] = xirr; + return H_SUCCESS; +} + +static target_ulong h_eoi(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong xirr = args[0]; + + xics_dprintf("h_eoi(%lx)\n", xirr); + icp_eoi(vcpu->kvm->arch.icp, vcpu->cpu_id, xirr); + return H_SUCCESS; +} + +static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr, server, priority; + + if ((nargs != 3) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + server = rtas_ld(vcpu->kvm, 
args, 1); + priority = rtas_ld(vcpu->kvm, args, 2); + + xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority); + if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers) + || (priority > 0xff)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + ics_write_xive_msi(ics, nr, server, priority); + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 3)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ + rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server); + rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority); +} + +static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. */ + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. 
*/ + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static int xics_init(struct kvm *kvm) +{ + int max_server_num; + unsigned int i; + struct icp_state *icp; + struct ics_state *ics; + int j; + + max_server_num = kvm->nrcpus; + + icp = malloc(sizeof(*icp)); + icp->nr_servers = max_server_num + 1; + icp->ss = malloc(icp->nr_servers * sizeof(struct icp_server_state)); + + for (i = 0; i < icp->nr_servers; i++) { + icp->ss[i].xirr = 0; + icp->ss[i].pending_priority = 0; + icp->ss[i].cpu = 0; + icp->ss[i].mfrr = 0xff; + } + + /* + * icp->ss[env->cpu_index].cpu is set by CPUs calling in to + * xics_cpu_register(). + */ + + ics = malloc(sizeof(*ics)); + ics->nr_irqs = XICS_NUM_IRQS; + ics->offset = XICS_IRQ_OFFSET; + ics->irqs = malloc(ics->nr_irqs * sizeof(struct ics_irq_state)); + + icp->ics = ics; + ics->icp = icp; + + for (i = 0; i < ics->nr_irqs; i++) { + ics->irqs[i].server = 0; + ics->irqs[i].priority = 0xff; + ics->irqs[i].saved_priority = 0xff; + ics->irqs[i].rejected = 0; + ics->irqs[i].masked_pending = 0; + } + + spapr_register_hypercall(H_CPPR, h_cppr); + spapr_register_hypercall(H_IPI, h_ipi); + spapr_register_hypercall(H_XIRR, h_xirr); + spapr_register_hypercall(H_EOI, h_eoi); + + spapr_rtas_register("ibm,set-xive", rtas_set_xive); + spapr_rtas_register("ibm,get-xive", rtas_get_xive); + spapr_rtas_register("ibm,int-off", rtas_int_off); + spapr_rtas_register("ibm,int-on", rtas_int_on); + + for (j = 0; j < kvm->nrcpus; j++) { + struct kvm_cpu *vcpu = kvm->cpus[j]; + + if (vcpu->cpu_id >= icp->nr_servers) + die("Invalid server number for cpuid %ld\n", vcpu->cpu_id); + + icp->ss[vcpu->cpu_id].cpu = vcpu; + } + + kvm->arch.icp = icp; + + return 0; +} +base_init(xics_init); + + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + /* + * Route event to ICS, which routes to ICP, which eventually does a + * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1) + */ + xics_dprintf("Raising IRQ %d -> %d\n", irq, level); + ics_set_irq_msi(kvm->arch.icp->ics, irq - 
kvm->arch.icp->ics->offset, level); +} diff --git a/tools/kvm/powerpc/xics.h b/tools/kvm/powerpc/xics.h new file mode 100644 index 000000000000..d5bc6f92fa82 --- /dev/null +++ b/tools/kvm/powerpc/xics.h @@ -0,0 +1,18 @@ +/* + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef XICS_H +#define XICS_H + +#define XICS_IPI 0x2 + +int xics_alloc_irqnum(void); + +#endif diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c new file mode 100644 index 000000000000..07dd9d541065 --- /dev/null +++ b/tools/kvm/symbol.c @@ -0,0 +1,133 @@ +#include "kvm/symbol.h" + +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <bfd.h> + +static bfd *abfd; + +int symbol_init(struct kvm *kvm) +{ + int ret = 0; + + if (!kvm->vmlinux) + return 0; + + bfd_init(); + + abfd = bfd_openr(kvm->vmlinux, NULL); + if (abfd == NULL) { + bfd_error_type err = bfd_get_error(); + + switch (err) { + case bfd_error_no_memory: + ret = -ENOMEM; + break; + case bfd_error_invalid_target: + ret = -EINVAL; + break; + default: + ret = -EFAULT; + break; + } + } + + return ret; +} +late_init(symbol_init); + +static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name) +{ + int i, ret; + + ret = -ENOENT; + + for (i = 0; i < nr_symbols; i++) { + asymbol *symbol = symbols[i]; + + if (!strcmp(bfd_asymbol_name(symbol), symbol_name)) + return symbol; + } + + return ERR_PTR(ret); +} + +char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size) +{ + const char *filename; + bfd_vma sym_offset; + bfd_vma sym_start; + asection *section; + unsigned int line; + const char *func; + long symtab_size; + asymbol *symbol; + asymbol **syms; + int 
nr_syms, ret; + + ret = -ENOENT; + if (!abfd) + goto not_found; + + if (!bfd_check_format(abfd, bfd_object)) + goto not_found; + + symtab_size = bfd_get_symtab_upper_bound(abfd); + if (!symtab_size) + goto not_found; + + ret = -ENOMEM; + syms = malloc(symtab_size); + if (!syms) + goto not_found; + + nr_syms = bfd_canonicalize_symtab(abfd, syms); + + ret = -ENOENT; + section = bfd_get_section_by_name(abfd, ".debug_aranges"); + if (!section) + goto not_found; + + if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line)) + goto not_found; + + if (!func) + goto not_found; + + symbol = lookup(syms, nr_syms, func); + if (IS_ERR(symbol)) + goto not_found; + + sym_start = bfd_asymbol_value(symbol); + + sym_offset = addr - sym_start; + + snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line); + + sym[size - 1] = '\0'; + + free(syms); + + return sym; + +not_found: + return ERR_PTR(ret); +} + +int symbol_exit(struct kvm *kvm) +{ + bfd_boolean ret = TRUE; + + if (abfd) + ret = bfd_close(abfd); + + if (ret == TRUE) + return 0; + + return -EFAULT; +} +late_exit(symbol_exit); diff --git a/tools/kvm/term.c b/tools/kvm/term.c new file mode 100644 index 000000000000..4413450f57d2 --- /dev/null +++ b/tools/kvm/term.c @@ -0,0 +1,171 @@ +#include <poll.h> +#include <stdbool.h> +#include <termios.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/uio.h> +#include <signal.h> +#include <pty.h> +#include <utmp.h> + +#include "kvm/read-write.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +#define TERM_FD_IN 0 +#define TERM_FD_OUT 1 + +static struct termios orig_term; + +int term_escape_char = 0x01; /* ctrl-a is used for escape */ +bool term_got_escape = false; + +int term_fds[4][2]; + +int term_getc(struct kvm *kvm, int term) +{ + unsigned char c; + + if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0) + return -1; + + if (term_got_escape) { + term_got_escape = false; + if (c 
== 'x') + kvm_cpu__reboot(kvm); + if (c == term_escape_char) + return c; + } + + if (c == term_escape_char) { + term_got_escape = true; + return -1; + } + + return c; +} + +int term_putc(char *addr, int cnt, int term) +{ + int ret; + + while (cnt--) { + ret = write(term_fds[term][TERM_FD_OUT], addr++, 1); + if (ret < 0) + return 0; + } + + return cnt; +} + +int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term) +{ + int c; + + c = term_getc(kvm, term); + + if (c < 0) + return 0; + + *((char *)iov[TERM_FD_IN].iov_base) = (char)c; + + return sizeof(char); +} + +int term_putc_iov(struct iovec *iov, int iovcnt, int term) +{ + return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt); +} + +bool term_readable(int term) +{ + struct pollfd pollfd = (struct pollfd) { + .fd = term_fds[term][TERM_FD_IN], + .events = POLLIN, + .revents = 0, + }; + + return poll(&pollfd, 1, 0) > 0; +} + +static void term_cleanup(void) +{ + int i; + + for (i = 0; i < 4; i++) + tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term); +} + +static void term_sig_cleanup(int sig) +{ + term_cleanup(); + signal(sig, SIG_DFL); + raise(sig); +} + +void term_set_tty(int term) +{ + struct termios orig_term; + int master, slave; + char new_pty[PATH_MAX]; + + if (tcgetattr(STDIN_FILENO, &orig_term) < 0) + die("unable to save initial standard input settings"); + + orig_term.c_lflag &= ~(ICANON | ECHO | ISIG); + + if (openpty(&master, &slave, new_pty, &orig_term, NULL) < 0) + return; + + close(slave); + + pr_info("Assigned terminal %d to pty %s\n", term, new_pty); + + term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master; +} + +int tty_parser(const struct option *opt, const char *arg, int unset) +{ + int tty = atoi(arg); + + term_set_tty(tty); + + return 0; +} + +int term_init(struct kvm *kvm) +{ + struct termios term; + int i, r; + + r = tcgetattr(STDIN_FILENO, &orig_term); + if (r < 0) { + pr_warning("unable to save initial standard input settings"); + return r; + } + + + 
term = orig_term; + term.c_lflag &= ~(ICANON | ECHO | ISIG); + tcsetattr(STDIN_FILENO, TCSANOW, &term); + + for (i = 0; i < 4; i++) + if (term_fds[i][TERM_FD_IN] == 0) { + term_fds[i][TERM_FD_IN] = STDIN_FILENO; + term_fds[i][TERM_FD_OUT] = STDOUT_FILENO; + } + + signal(SIGTERM, term_sig_cleanup); + atexit(term_cleanup); + + return 0; +} +dev_init(term_init); + +int term_exit(struct kvm *kvm) +{ + return 0; +} +dev_exit(term_exit); diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile new file mode 100644 index 000000000000..cad14ecbae1b --- /dev/null +++ b/tools/kvm/tests/Makefile @@ -0,0 +1,19 @@ +all: kernel pit boot + +kernel: + $(MAKE) -C kernel +.PHONY: kernel + +pit: + $(MAKE) -C pit +.PHONY: pit + +boot: + $(MAKE) -C boot +.PHONY: boot + +clean: + $(MAKE) -C kernel clean + $(MAKE) -C pit clean + $(MAKE) -C boot clean +.PHONY: clean diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile new file mode 100644 index 000000000000..40cba6847ccd --- /dev/null +++ b/tools/kvm/tests/boot/Makefile @@ -0,0 +1,13 @@ +NAME := init + +OBJ := $(NAME).o + +all: $(.o) + rm -rf rootfs + mkdir rootfs + gcc -static init.c -o rootfs/init + mkisofs rootfs > boot_test.iso + +clean: + rm -rf rootfs boot_test.iso +.PHONY: clean diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c new file mode 100644 index 000000000000..094f8ba37317 --- /dev/null +++ b/tools/kvm/tests/boot/init.c @@ -0,0 +1,11 @@ +#include <linux/reboot.h> +#include <unistd.h> + +int main(int argc, char *argv[]) +{ + puts("hello, KVM guest!\r"); + + reboot(LINUX_REBOOT_CMD_RESTART); + + return 0; +} diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore new file mode 100644 index 000000000000..d0cd209e5078 --- /dev/null +++ b/tools/kvm/tests/kernel/.gitignore @@ -0,0 +1,2 @@ +kernel.bin +kernel.elf diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile new file mode 100644 index 000000000000..c7dd8da33332 --- 
/dev/null +++ b/tools/kvm/tests/kernel/Makefile @@ -0,0 +1,20 @@ +NAME := kernel + +BIN := $(NAME).bin +ELF := $(NAME).elf +OBJ := $(NAME).o + +all: $(BIN) + +$(BIN): $(ELF) + objcopy -O binary $< $@ + +$(ELF): $(OBJ) + ld -Ttext=0x00 -nostdlib -static $< -o $@ + +%.o: %.S + gcc -nostdinc -c $< -o $@ + +clean: + rm -f $(BIN) $(ELF) $(OBJ) +.PHONY: clean diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README new file mode 100644 index 000000000000..2923777e6d65 --- /dev/null +++ b/tools/kvm/tests/kernel/README @@ -0,0 +1,16 @@ +Compiling +--------- + +You can simply type: + + $Â make + +to build a 16-bit binary that uses the i8086 instruction set. + +Disassembling +------------- + +Use the "-m i8086" command line option with objdump to make sure it knows we're +dealing with i8086 instruction set: + + $ objdump -d -m i8086 i8086.elf diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S new file mode 100644 index 000000000000..2824b64da657 --- /dev/null +++ b/tools/kvm/tests/kernel/kernel.S @@ -0,0 +1,8 @@ + .code16gcc + .text + .globl _start + .type _start, @function +_start: + # "This is probably the largest possible kernel that is bug free." 
-- Avi Kivity + 1: + jmp 1b diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore new file mode 100644 index 000000000000..43f0aa8d37f1 --- /dev/null +++ b/tools/kvm/tests/pit/.gitignore @@ -0,0 +1,2 @@ +*.bin +*.elf diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile new file mode 100644 index 000000000000..2fae9b2aec2f --- /dev/null +++ b/tools/kvm/tests/pit/Makefile @@ -0,0 +1,20 @@ +NAME := tick + +BIN := $(NAME).bin +ELF := $(NAME).elf +OBJ := $(NAME).o + +all: $(BIN) + +$(BIN): $(ELF) + objcopy -O binary $< $@ + +$(ELF): $(OBJ) + ld -Ttext=0x00 -nostdlib -static $< -o $@ + +%.o: %.S + gcc -nostdinc -c $< -o $@ + +clean: + rm -f $(BIN) $(ELF) $(OBJ) +.PHONY: clean diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README new file mode 100644 index 000000000000..2923777e6d65 --- /dev/null +++ b/tools/kvm/tests/pit/README @@ -0,0 +1,16 @@ +Compiling +--------- + +You can simply type: + + $Â make + +to build a 16-bit binary that uses the i8086 instruction set. 
+ +Disassembling +------------- + +Use the "-m i8086" command line option with objdump to make sure it knows we're +dealing with i8086 instruction set: + + $ objdump -d -m i8086 i8086.elf diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S new file mode 100644 index 000000000000..635dc8dd46bc --- /dev/null +++ b/tools/kvm/tests/pit/tick.S @@ -0,0 +1,101 @@ +#define IO_PIC 0x20 +#define IRQ_OFFSET 32 +#define IO_PIT 0x40 +#define TIMER_FREQ 1193182 +#define TIMER_DIV(x) ((TIMER_FREQ+(x)/2)/(x)) + +#define TEST_COUNT 0x0200 + + .code16gcc + .text + .globl _start + .type _start, @function +_start: +/* + * fill up noop handlers + */ + xorw %ax, %ax + xorw %di, %di + movw %ax, %es + movw $256, %cx +fill_noop_idt: + movw $noop_handler, %es:(%di) + movw %cs, %es:2(%di) + add $4, %di + loop fill_noop_idt + +set_idt: + movw $timer_isr, %es:(IRQ_OFFSET*4) + movw %cs, %es:(IRQ_OFFSET*4+2) + +set_pic: + # ICW1 + mov $0x11, %al + mov $(IO_PIC), %dx + out %al,%dx + # ICW2 + mov $(IRQ_OFFSET), %al + mov $(IO_PIC+1), %dx + out %al, %dx + # ICW3 + mov $0x00, %al + mov $(IO_PIC+1), %dx + out %al, %dx + # ICW4 + mov $0x3, %al + mov $(IO_PIC+1), %dx + out %al, %dx + +set_pit: + # set 8254 mode + mov $(IO_PIT+3), %dx + mov $0x34, %al + outb %al, %dx + # set 8254 freq 1KHz + mov $(IO_PIT), %dx + movb $(TIMER_DIV(1000) % 256), %al + outb %al, %dx + movb $(TIMER_DIV(1000) / 256), %al + outb %al, %dx + +enable_irq0: + mov $0xfe, %al + mov $(IO_PIC+1), %dx + out %al, %dx + sti +loop: + 1: + jmp 1b + +test_ok: + mov $0x3f8,%dx + cs lea msg2, %si + mov $(msg2_end-msg2), %cx + cs rep/outsb + + /* Reboot by using the i8042 reboot line */ + mov $0xfe, %al + outb %al, $0x64 + +timer_isr: + cli + pushaw + pushfw + mov $0x3f8,%dx + mov $0x2e, %al # . 
+ out %al,%dx + decw count + jz test_ok + popfw + popaw + iretw + +noop_handler: + iretw + +count: + .word TEST_COUNT + +msg2: + .asciz "\nTest OK\n" +msg2_end: diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c new file mode 100644 index 000000000000..9994490022b6 --- /dev/null +++ b/tools/kvm/ui/sdl.c @@ -0,0 +1,323 @@ +#include "kvm/sdl.h" + +#include "kvm/framebuffer.h" +#include "kvm/i8042.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/vesa.h" + +#include <SDL/SDL.h> +#include <pthread.h> +#include <signal.h> +#include <linux/err.h> + +#define FRAME_RATE 25 + +#define SCANCODE_UNKNOWN 0 +#define SCANCODE_NORMAL 1 +#define SCANCODE_ESCAPED 2 +#define SCANCODE_KEY_PAUSE 3 +#define SCANCODE_KEY_PRNTSCRN 4 + +struct set2_scancode { + u8 code; + u8 type; +}; + +#define DEFINE_SC(_code) {\ + .code = _code,\ + .type = SCANCODE_NORMAL,\ +} + +/* escaped scancodes */ +#define DEFINE_ESC(_code) {\ + .code = _code,\ + .type = SCANCODE_ESCAPED,\ +} + +static const struct set2_scancode const keymap[256] = { + [9] = DEFINE_SC(0x76), /* <esc> */ + [10] = DEFINE_SC(0x16), /* 1 */ + [11] = DEFINE_SC(0x1e), /* 2 */ + [12] = DEFINE_SC(0x26), /* 3 */ + [13] = DEFINE_SC(0x25), /* 4 */ + [14] = DEFINE_SC(0x2e), /* 5 */ + [15] = DEFINE_SC(0x36), /* 6 */ + [16] = DEFINE_SC(0x3d), /* 7 */ + [17] = DEFINE_SC(0x3e), /* 8 */ + [18] = DEFINE_SC(0x46), /* 9 */ + [19] = DEFINE_SC(0x45), /* 9 */ + [20] = DEFINE_SC(0x4e), /* - */ + [21] = DEFINE_SC(0x55), /* + */ + [22] = DEFINE_SC(0x66), /* <backspace> */ + [23] = DEFINE_SC(0x0d), /* <tab> */ + [24] = DEFINE_SC(0x15), /* q */ + [25] = DEFINE_SC(0x1d), /* w */ + [26] = DEFINE_SC(0x24), /* e */ + [27] = DEFINE_SC(0x2d), /* r */ + [28] = DEFINE_SC(0x2c), /* t */ + [29] = DEFINE_SC(0x35), /* y */ + [30] = DEFINE_SC(0x3c), /* u */ + [31] = DEFINE_SC(0x43), /* i */ + [32] = DEFINE_SC(0x44), /* o */ + [33] = DEFINE_SC(0x4d), /* p */ + [34] = DEFINE_SC(0x54), /* [ */ + [35] = DEFINE_SC(0x5b), /* ] */ + 
[36] = DEFINE_SC(0x5a), /* <enter> */ + [37] = DEFINE_SC(0x14), /* <left ctrl> */ + [38] = DEFINE_SC(0x1c), /* a */ + [39] = DEFINE_SC(0x1b), /* s */ + [40] = DEFINE_SC(0x23), /* d */ + [41] = DEFINE_SC(0x2b), /* f */ + [42] = DEFINE_SC(0x34), /* g */ + [43] = DEFINE_SC(0x33), /* h */ + [44] = DEFINE_SC(0x3b), /* j */ + [45] = DEFINE_SC(0x42), /* k */ + [46] = DEFINE_SC(0x4b), /* l */ + [47] = DEFINE_SC(0x4c), /* ; */ + [48] = DEFINE_SC(0x52), /* ' */ + [49] = DEFINE_SC(0x0e), /* ` */ + [50] = DEFINE_SC(0x12), /* <left shift> */ + [51] = DEFINE_SC(0x5d), /* \ */ + [52] = DEFINE_SC(0x1a), /* z */ + [53] = DEFINE_SC(0x22), /* x */ + [54] = DEFINE_SC(0x21), /* c */ + [55] = DEFINE_SC(0x2a), /* v */ + [56] = DEFINE_SC(0x32), /* b */ + [57] = DEFINE_SC(0x31), /* n */ + [58] = DEFINE_SC(0x3a), /* m */ + [59] = DEFINE_SC(0x41), /* < */ + [60] = DEFINE_SC(0x49), /* > */ + [61] = DEFINE_SC(0x4a), /* / */ + [62] = DEFINE_SC(0x59), /* <right shift> */ + [63] = DEFINE_SC(0x7c), /* keypad * */ + [64] = DEFINE_SC(0x11), /* <left alt> */ + [65] = DEFINE_SC(0x29), /* <space> */ + + [67] = DEFINE_SC(0x05), /* <F1> */ + [68] = DEFINE_SC(0x06), /* <F2> */ + [69] = DEFINE_SC(0x04), /* <F3> */ + [70] = DEFINE_SC(0x0c), /* <F4> */ + [71] = DEFINE_SC(0x03), /* <F5> */ + [72] = DEFINE_SC(0x0b), /* <F6> */ + [73] = DEFINE_SC(0x83), /* <F7> */ + [74] = DEFINE_SC(0x0a), /* <F8> */ + [75] = DEFINE_SC(0x01), /* <F9> */ + [76] = DEFINE_SC(0x09), /* <F10> */ + + [79] = DEFINE_SC(0x6c), /* keypad 7 */ + [80] = DEFINE_SC(0x75), /* keypad 8 */ + [81] = DEFINE_SC(0x7d), /* keypad 9 */ + [82] = DEFINE_SC(0x7b), /* keypad - */ + [83] = DEFINE_SC(0x6b), /* keypad 4 */ + [84] = DEFINE_SC(0x73), /* keypad 5 */ + [85] = DEFINE_SC(0x74), /* keypad 6 */ + [86] = DEFINE_SC(0x79), /* keypad + */ + [87] = DEFINE_SC(0x69), /* keypad 1 */ + [88] = DEFINE_SC(0x72), /* keypad 2 */ + [89] = DEFINE_SC(0x7a), /* keypad 3 */ + [90] = DEFINE_SC(0x70), /* keypad 0 */ + [91] = DEFINE_SC(0x71), /* keypad . 
*/ + + [94] = DEFINE_SC(0x61), /* <INT 1> */ + [95] = DEFINE_SC(0x78), /* <F11> */ + [96] = DEFINE_SC(0x07), /* <F12> */ + + [104] = DEFINE_ESC(0x5a), /* keypad <enter> */ + [105] = DEFINE_ESC(0x14), /* <right ctrl> */ + [106] = DEFINE_ESC(0x4a), /* keypad / */ + [108] = DEFINE_ESC(0x11), /* <right alt> */ + [110] = DEFINE_ESC(0x6c), /* <home> */ + [111] = DEFINE_ESC(0x75), /* <up> */ + [112] = DEFINE_ESC(0x7d), /* <pag up> */ + [113] = DEFINE_ESC(0x6b), /* <left> */ + [114] = DEFINE_ESC(0x74), /* <right> */ + [115] = DEFINE_ESC(0x69), /* <end> */ + [116] = DEFINE_ESC(0x72), /* <down> */ + [117] = DEFINE_ESC(0x7a), /* <pag down> */ + [118] = DEFINE_ESC(0x70), /* <ins> */ + [119] = DEFINE_ESC(0x71), /* <delete> */ +}; +static bool running, done; + +static const struct set2_scancode *to_code(u8 scancode) +{ + return &keymap[scancode]; +} + +static void key_press(const struct set2_scancode *sc) +{ + switch (sc->type) { + case SCANCODE_ESCAPED: + kbd_queue(0xe0); + /* fallthrough */ + case SCANCODE_NORMAL: + kbd_queue(sc->code); + break; + case SCANCODE_KEY_PAUSE: + kbd_queue(0xe1); + kbd_queue(0x14); + kbd_queue(0x77); + kbd_queue(0xe1); + kbd_queue(0xf0); + kbd_queue(0x14); + kbd_queue(0x77); + break; + case SCANCODE_KEY_PRNTSCRN: + kbd_queue(0xe0); + kbd_queue(0x12); + kbd_queue(0xe0); + kbd_queue(0x7c); + break; + } +} + +static void key_release(const struct set2_scancode *sc) +{ + switch (sc->type) { + case SCANCODE_ESCAPED: + kbd_queue(0xe0); + /* fallthrough */ + case SCANCODE_NORMAL: + kbd_queue(0xf0); + kbd_queue(sc->code); + break; + case SCANCODE_KEY_PAUSE: + /* nothing to do */ + break; + case SCANCODE_KEY_PRNTSCRN: + kbd_queue(0xe0); + kbd_queue(0xf0); + kbd_queue(0x7c); + kbd_queue(0xe0); + kbd_queue(0xf0); + kbd_queue(0x12); + break; + } +} + +static void *sdl__thread(void *p) +{ + Uint32 rmask, gmask, bmask, amask; + struct framebuffer *fb = p; + SDL_Surface *guest_screen; + SDL_Surface *screen; + SDL_Event ev; + Uint32 flags; + + 
kvm__set_thread_name("kvm-sdl-worker"); + + if (SDL_Init(SDL_INIT_VIDEO) != 0) + die("Unable to initialize SDL"); + + rmask = 0x000000ff; + gmask = 0x0000ff00; + bmask = 0x00ff0000; + amask = 0x00000000; + + guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask); + if (!guest_screen) + die("Unable to create SDL RBG surface"); + + flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF; + + SDL_WM_SetCaption("KVM tool", "KVM tool"); + + screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags); + if (!screen) + die("Unable to set SDL video mode"); + + SDL_EnableKeyRepeat(200, 50); + + while (running) { + SDL_BlitSurface(guest_screen, NULL, screen, NULL); + SDL_Flip(screen); + + while (SDL_PollEvent(&ev)) { + switch (ev.type) { + case SDL_KEYDOWN: { + const struct set2_scancode *sc = to_code(ev.key.keysym.scancode); + if (sc->type == SCANCODE_UNKNOWN) { + pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode); + break; + } + key_press(sc); + break; + } + case SDL_KEYUP: { + const struct set2_scancode *sc = to_code(ev.key.keysym.scancode); + if (sc->type == SCANCODE_UNKNOWN) + break; + key_release(sc); + break; + } + case SDL_QUIT: + goto exit; + } + } + + SDL_Delay(1000 / FRAME_RATE); + } + + if (running == false && done == false) { + done = true; + return NULL; + } +exit: + kvm_cpu__reboot(fb->kvm); + + return NULL; +} + +static int sdl__start(struct framebuffer *fb) +{ + pthread_t thread; + + running = true; + + if (pthread_create(&thread, NULL, sdl__thread, fb) != 0) + return -1; + + return 0; +} + +static int sdl__stop(struct framebuffer *fb) +{ + running = false; + while (done == false) + sleep(0); + + return 0; +} + +static struct fb_target_operations sdl_ops = { + .start = sdl__start, + .stop = sdl__stop, +}; + +int sdl__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + if (!kvm->cfg.sdl) + return 0; + + fb = vesa__init(kvm); + if 
(IS_ERR(fb)) { + pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb)); + return PTR_ERR(fb); + } + + return fb__attach(fb, &sdl_ops); +} +dev_init(sdl__init); + +int sdl__exit(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + return sdl__stop(NULL); + + return 0; +} +dev_exit(sdl__exit); diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c new file mode 100644 index 000000000000..12e4bd53fe0d --- /dev/null +++ b/tools/kvm/ui/vnc.c @@ -0,0 +1,250 @@ +#include "kvm/vnc.h" + +#include "kvm/framebuffer.h" +#include "kvm/i8042.h" +#include "kvm/vesa.h" + +#include <linux/types.h> +#include <rfb/keysym.h> +#include <rfb/rfb.h> +#include <pthread.h> +#include <linux/err.h> + +#define VESA_QUEUE_SIZE 128 +#define VESA_IRQ 14 + +/* + * This "6000" value is pretty much the result of experimentation + * It seems that around this value, things update pretty smoothly + */ +#define VESA_UPDATE_TIME 6000 + +/* + * We can map the letters and numbers without a fuss, + * but the other characters not so much. + */ +static char letters[26] = { + 0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */ + 0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */ + 0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */ + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */ + 0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */ + 0x1a, +}; + +static rfbScreenInfoPtr server; +static char num[10] = { + 0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46, +}; + +/* + * This is called when the VNC server receives a key event + * The reason this function is such a beast is that we have + * to convert from ASCII characters (which is what VNC gets) + * to PC keyboard scancodes, which is what Linux expects to + * get from its keyboard. ASCII and the scancode set don't + * really seem to mesh in any good way beyond some basics with + * the letters and numbers. 
+ */ +static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl) +{ + char tosend = 0; + + if (key >= 0x41 && key <= 0x5a) + key += 0x20; /* convert to lowercase */ + + if (key >= 0x61 && key <= 0x7a) /* a-z */ + tosend = letters[key - 0x61]; + + if (key >= 0x30 && key <= 0x39) + tosend = num[key - 0x30]; + + switch (key) { + case XK_Insert: kbd_queue(0xe0); tosend = 0x70; break; + case XK_Delete: kbd_queue(0xe0); tosend = 0x71; break; + case XK_Up: kbd_queue(0xe0); tosend = 0x75; break; + case XK_Down: kbd_queue(0xe0); tosend = 0x72; break; + case XK_Left: kbd_queue(0xe0); tosend = 0x6b; break; + case XK_Right: kbd_queue(0xe0); tosend = 0x74; break; + case XK_Page_Up: kbd_queue(0xe0); tosend = 0x7d; break; + case XK_Page_Down: kbd_queue(0xe0); tosend = 0x7a; break; + case XK_Home: kbd_queue(0xe0); tosend = 0x6c; break; + case XK_BackSpace: tosend = 0x66; break; + case XK_Tab: tosend = 0x0d; break; + case XK_Return: tosend = 0x5a; break; + case XK_Escape: tosend = 0x76; break; + case XK_End: tosend = 0x69; break; + case XK_Shift_L: tosend = 0x12; break; + case XK_Shift_R: tosend = 0x59; break; + case XK_Control_R: kbd_queue(0xe0); + case XK_Control_L: tosend = 0x14; break; + case XK_Alt_R: kbd_queue(0xe0); + case XK_Alt_L: tosend = 0x11; break; + case XK_quoteleft: tosend = 0x0e; break; + case XK_minus: tosend = 0x4e; break; + case XK_equal: tosend = 0x55; break; + case XK_bracketleft: tosend = 0x54; break; + case XK_bracketright: tosend = 0x5b; break; + case XK_backslash: tosend = 0x5d; break; + case XK_Caps_Lock: tosend = 0x58; break; + case XK_semicolon: tosend = 0x4c; break; + case XK_quoteright: tosend = 0x52; break; + case XK_comma: tosend = 0x41; break; + case XK_period: tosend = 0x49; break; + case XK_slash: tosend = 0x4a; break; + case XK_space: tosend = 0x29; break; + + /* + * This is where I handle the shifted characters. 
+ * They don't really map nicely the way A-Z maps to a-z, + * so I'm doing it manually + */ + case XK_exclam: tosend = 0x16; break; + case XK_quotedbl: tosend = 0x52; break; + case XK_numbersign: tosend = 0x26; break; + case XK_dollar: tosend = 0x25; break; + case XK_percent: tosend = 0x2e; break; + case XK_ampersand: tosend = 0x3d; break; + case XK_parenleft: tosend = 0x46; break; + case XK_parenright: tosend = 0x45; break; + case XK_asterisk: tosend = 0x3e; break; + case XK_plus: tosend = 0x55; break; + case XK_colon: tosend = 0x4c; break; + case XK_less: tosend = 0x41; break; + case XK_greater: tosend = 0x49; break; + case XK_question: tosend = 0x4a; break; + case XK_at: tosend = 0x1e; break; + case XK_asciicircum: tosend = 0x36; break; + case XK_underscore: tosend = 0x4e; break; + case XK_braceleft: tosend = 0x54; break; + case XK_braceright: tosend = 0x5b; break; + case XK_bar: tosend = 0x5d; break; + case XK_asciitilde: tosend = 0x0e; break; + default: break; + } + + /* + * If this is a "key up" event (the user has released the key, we + * need to send 0xf0 first. + */ + if (!down && tosend != 0x0) + kbd_queue(0xf0); + + if (tosend) + kbd_queue(tosend); +} + +/* The previous X and Y coordinates of the mouse */ +static int xlast, ylast = -1; + +/* + * This function is called by the VNC server whenever a mouse event occurs. 
+ */ +static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl) +{ + int dx, dy; + char b1 = 0x8; + + /* The VNC mask and the PS/2 button encoding are the same */ + b1 |= buttonMask; + + if (xlast >= 0 && ylast >= 0) { + /* The PS/2 mouse sends deltas, not absolutes */ + dx = x - xlast; + dy = ylast - y; + + /* Set overflow bits if needed */ + if (dy > 255) + b1 |= 0x80; + if (dx > 255) + b1 |= 0x40; + + /* Set negative bits if needed */ + if (dy < 0) + b1 |= 0x20; + if (dx < 0) + b1 |= 0x10; + + mouse_queue(b1); + mouse_queue(dx); + mouse_queue(dy); + } + + xlast = x; + ylast = y; + rfbDefaultPtrAddEvent(buttonMask, x, y, cl); +} + +static void *vnc__thread(void *p) +{ + struct framebuffer *fb = p; + /* + * Make a fake argc and argv because the getscreen function + * seems to want it. + */ + char argv[1][1] = {{0}}; + int argc = 1; + + kvm__set_thread_name("kvm-vnc-worker"); + + server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4); + server->frameBuffer = fb->mem; + server->alwaysShared = TRUE; + server->kbdAddEvent = kbd_handle_key; + server->ptrAddEvent = kbd_handle_ptr; + rfbInitServer(server); + + while (rfbIsActive(server)) { + rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height); + rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME); + } + return NULL; +} + +static int vnc__start(struct framebuffer *fb) +{ + pthread_t thread; + + if (pthread_create(&thread, NULL, vnc__thread, fb) != 0) + return -1; + + return 0; +} + +static int vnc__stop(struct framebuffer *fb) +{ + rfbShutdownServer(server, TRUE); + + return 0; +} + +static struct fb_target_operations vnc_ops = { + .start = vnc__start, + .stop = vnc__stop, +}; + +int vnc__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + if (!kvm->cfg.vnc) + return 0; + + fb = vesa__init(kvm); + if (IS_ERR(fb)) { + pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb)); + return PTR_ERR(fb); + } + + return fb__attach(fb, &vnc_ops); +} 
+dev_init(vnc__init); + +int vnc__exit(struct kvm *kvm) +{ + if (kvm->cfg.vnc) + return vnc__stop(NULL); + + return 0; +} +dev_exit(vnc__exit); diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN new file mode 100755 index 000000000000..1af9d6c26f2a --- /dev/null +++ b/tools/kvm/util/KVMTOOLS-VERSION-GEN @@ -0,0 +1,40 @@ +#!/bin/sh + +if [ $# -eq 1 ] ; then + OUTPUT=$1 +fi + +GVF=${OUTPUT}KVMTOOLS-VERSION-FILE + +LF=' +' + +# First check if there is a .git to get the version from git describe +# otherwise try to get the version from the kernel makefile +if test -d ../../.git -o -f ../../.git && + VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && + case "$VN" in + *$LF*) (exit 1) ;; + v[0-9]*) + git update-index -q --refresh + test -z "$(git diff-index --name-only HEAD --)" || + VN="$VN-dirty" ;; + esac +then + VN=$(echo "$VN" | sed -e 's/-/./g'); +else + VN=$(MAKEFLAGS= make -sC ../.. kernelversion) +fi + +VN=$(expr "$VN" : v*'\(.*\)') + +if test -r $GVF +then + VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF) +else + VC=unset +fi +test "$VN" = "$VC" || { + echo >&2 "KVMTOOLS_VERSION = $VN" + echo "KVMTOOLS_VERSION = $VN" >$GVF +} diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh new file mode 100755 index 000000000000..c8be0bd07b4a --- /dev/null +++ b/tools/kvm/util/generate-cmdlist.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +echo "/* Automatically generated by $0 */ +struct cmdname_help +{ + char name[16]; + char help[80]; +}; + +static struct cmdname_help common_cmds[] = {" + +sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt | +while read cmd +do + # TODO following sed command should be fixed + sed -n '/^NAME/,/^lkvm-'"$cmd"'/ { + /NAME/d + /--/d + s/.*kvm-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + p + }' "Documentation/kvm-$cmd.txt" +done +echo "};" diff --git a/tools/kvm/util/init.c b/tools/kvm/util/init.c new file mode 100644 index 000000000000..33a595268021 --- /dev/null +++ 
b/tools/kvm/util/init.c @@ -0,0 +1,69 @@ +#include <linux/list.h> +#include <linux/kernel.h> + +#include "kvm/kvm.h" +#include "kvm/util-init.h" + +#define PRIORITY_LISTS 10 + +static struct hlist_head init_lists[PRIORITY_LISTS]; +static struct hlist_head exit_lists[PRIORITY_LISTS]; + +int init_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name) +{ + t->init = init; + t->fn_name = name; + hlist_add_head(&t->n, &init_lists[priority]); + + return 0; +} + +int exit_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name) +{ + t->init = init; + t->fn_name = name; + hlist_add_head(&t->n, &exit_lists[priority]); + + return 0; +} + +int init_list__init(struct kvm *kvm) +{ + unsigned int i; + int r = 0; + struct hlist_node *n; + struct init_item *t; + + for (i = 0; i < ARRAY_SIZE(init_lists); i++) + hlist_for_each_entry(t, n, &init_lists[i], n) { + r = t->init(kvm); + if (r < 0) { + pr_warning("Failed init: %s\n", t->fn_name); + goto fail; + } + } + +fail: + return r; +} + +int init_list__exit(struct kvm *kvm) +{ + int i; + int r = 0; + struct hlist_node *n; + struct init_item *t; + + for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--) + hlist_for_each_entry(t, n, &exit_lists[i], n) { + r = t->init(kvm); + if (r < 0) { + pr_warning("%s failed.\n", t->fn_name); + goto fail; + } + } +fail: + return r; +} diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0 new file mode 100755 index 000000000000..a91c37f7ee00 --- /dev/null +++ b/tools/kvm/util/kvm-ifup-vbr0 @@ -0,0 +1,6 @@ +#!/bin/sh +switch=vbr0 +/sbin/ifconfig $1 0.0.0.0 up +/usr/sbin/brctl addif ${switch} $1 +/usr/sbin/brctl setfd ${switch} 0 +/usr/sbin/brctl stp ${switch} off diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c new file mode 100644 index 000000000000..9a1bbee6c271 --- /dev/null +++ b/tools/kvm/util/parse-options.c @@ -0,0 +1,577 @@ +#include <stdlib.h> +#include <stdio.h> +#include 
<string.h> +#include <unistd.h> + +#include <stdbool.h> + +/* user defined includes */ +#include <linux/types.h> +#include <kvm/util.h> +#include <kvm/parse-options.h> +#include <kvm/strbuf.h> + +#define OPT_SHORT 1 +#define OPT_UNSET 2 + +static int opterror(const struct option *opt, const char *reason, int flags) +{ + if (flags & OPT_SHORT) + return pr_err("switch `%c' %s", opt->short_name, reason); + if (flags & OPT_UNSET) + return pr_err("option `no-%s' %s", opt->long_name, reason); + return pr_err("option `%s' %s", opt->long_name, reason); +} + +static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, + int flags, const char **arg) +{ + if (p->opt) { + *arg = p->opt; + p->opt = NULL; + } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 || + **(p->argv + 1) == '-')) { + *arg = (const char *)opt->defval; + } else if (p->argc > 1) { + p->argc--; + *arg = *++p->argv; + } else + return opterror(opt, "requires a value", flags); + return 0; +} + +static int readnum(const struct option *opt, int flags, + const char *str, char **end) +{ + switch (opt->type) { + case OPTION_INTEGER: + *(int *)opt->value = strtol(str, end, 0); + break; + case OPTION_UINTEGER: + *(unsigned int *)opt->value = strtol(str, end, 0); + break; + case OPTION_LONG: + *(long *)opt->value = strtol(str, end, 0); + break; + case OPTION_U64: + *(u64 *)opt->value = strtoull(str, end, 0); + break; + default: + return opterror(opt, "invalid numeric conversion", flags); + } + + return 0; +} + +static int get_value(struct parse_opt_ctx_t *p, + const struct option *opt, int flags) +{ + const char *s, *arg = NULL; + const int unset = flags & OPT_UNSET; + + if (unset && p->opt) + return opterror(opt, "takes no value", flags); + if (unset && (opt->flags & PARSE_OPT_NONEG)) + return opterror(opt, "isn't available", flags); + + if (!(flags & OPT_SHORT) && p->opt) { + switch (opt->type) { + case OPTION_CALLBACK: + if (!(opt->flags & PARSE_OPT_NOARG)) + break; + /* FALLTHROUGH */ + 
case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_BIT: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + return opterror(opt, "takes no value", flags); + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + case OPTION_STRING: + case OPTION_INTEGER: + case OPTION_UINTEGER: + case OPTION_LONG: + case OPTION_U64: + default: + break; + } + } + + switch (opt->type) { + case OPTION_BIT: + if (unset) + *(int *)opt->value &= ~opt->defval; + else + *(int *)opt->value |= opt->defval; + return 0; + + case OPTION_BOOLEAN: + *(bool *)opt->value = unset ? false : true; + return 0; + + case OPTION_INCR: + *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; + return 0; + + case OPTION_SET_UINT: + *(unsigned int *)opt->value = unset ? 0 : opt->defval; + return 0; + + case OPTION_SET_PTR: + *(void **)opt->value = unset ? NULL : (void *)opt->defval; + return 0; + + case OPTION_STRING: + if (unset) + *(const char **)opt->value = NULL; + else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + *(const char **)opt->value = (const char *)opt->defval; + else + return get_arg(p, opt, flags, + (const char **)opt->value); + return 0; + + case OPTION_CALLBACK: + if (unset) + return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; + if (opt->flags & PARSE_OPT_NOARG) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (get_arg(p, opt, flags, &arg)) + return -1; + return (*opt->callback)(opt, arg, 0) ? 
(-1) : 0; + + case OPTION_INTEGER: + if (unset) { + *(int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_UINTEGER: + if (unset) { + *(unsigned int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(unsigned int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_LONG: + if (unset) { + *(long *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(long *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_U64: + if (unset) { + *(u64 *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(u64 *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + default: + die("should not happen, someone must be hit on the forehead"); + } +} + +#define USAGE_OPTS_WIDTH 24 +#define USAGE_GAP 2 + +static int usage_with_options_internal(const char * const *usagestr, + const struct option *opts, int full) +{ + if (!usagestr) + return PARSE_OPT_HELP; + + fprintf(stderr, "\n usage: %s\n", *usagestr++); + while (*usagestr && **usagestr) + fprintf(stderr, " or: %s\n", *usagestr++); + while (*usagestr) { + fprintf(stderr, "%s%s\n", + **usagestr ? 
" " : "", + *usagestr); + usagestr++; + } + + if (opts->type != OPTION_GROUP) + fputc('\n', stderr); + + for (; opts->type != OPTION_END; opts++) { + size_t pos; + int pad; + + if (opts->type == OPTION_GROUP) { + fputc('\n', stderr); + if (*opts->help) + fprintf(stderr, "%s\n", opts->help); + continue; + } + if (!full && (opts->flags & PARSE_OPT_HIDDEN)) + continue; + + pos = fprintf(stderr, " "); + if (opts->short_name) + pos += fprintf(stderr, "-%c", opts->short_name); + else + pos += fprintf(stderr, " "); + + if (opts->long_name && opts->short_name) + pos += fprintf(stderr, ", "); + if (opts->long_name) + pos += fprintf(stderr, "--%s", opts->long_name); + + switch (opts->type) { + case OPTION_ARGUMENT: + break; + case OPTION_LONG: + case OPTION_U64: + case OPTION_INTEGER: + case OPTION_UINTEGER: + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=<n>]"); + else + pos += fprintf(stderr, "[<n>]"); + else + pos += fprintf(stderr, " <n>"); + break; + case OPTION_CALLBACK: + if (opts->flags & PARSE_OPT_NOARG) + break; + /* FALLTHROUGH */ + case OPTION_STRING: + if (opts->argh) { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=<%s>]", opts->argh); + else + pos += fprintf(stderr, "[<%s>]", opts->argh); + else + pos += fprintf(stderr, " <%s>", opts->argh); + } else { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=...]"); + else + pos += fprintf(stderr, "[...]"); + else + pos += fprintf(stderr, " ..."); + } + break; + default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */ + case OPTION_END: + case OPTION_GROUP: + case OPTION_BIT: + case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + break; + } + if (pos <= USAGE_OPTS_WIDTH) + pad = USAGE_OPTS_WIDTH - pos; + else { + fputc('\n', stderr); + pad = USAGE_OPTS_WIDTH; + } + fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); + } + fputc('\n', stderr); + + return 
PARSE_OPT_HELP; +} + +void usage_with_options(const char * const *usagestr, + const struct option *opts) +{ + usage_with_options_internal(usagestr, opts, 0); + exit(129); +} + +static void check_typos(const char *arg, const struct option *options) +{ + if (strlen(arg) < 3) + return; + + if (!prefixcmp(arg, "no-")) { + pr_err("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + + for (; options->type != OPTION_END; options++) { + if (!options->long_name) + continue; + if (!prefixcmp(options->long_name, arg)) { + pr_err("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + } +} + +static int parse_options_usage(const char * const *usagestr, + const struct option *opts) +{ + return usage_with_options_internal(usagestr, opts, 0); +} + +static int parse_short_opt(struct parse_opt_ctx_t *p, + const struct option *options) +{ + for (; options->type != OPTION_END; options++) { + if (options->short_name == *p->opt) { + p->opt = p->opt[1] ? p->opt + 1 : NULL; + return get_value(p, options, OPT_SHORT); + } + } + return -2; +} + +static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, + const struct option *options) +{ + const char *arg_end = strchr(arg, '='); + const struct option *abbrev_option = NULL, *ambiguous_option = NULL; + int abbrev_flags = 0, ambiguous_flags = 0; + + if (!arg_end) + arg_end = arg + strlen(arg); + + for (; options->type != OPTION_END; options++) { + const char *rest; + int flags = 0; + + if (!options->long_name) + continue; + + rest = skip_prefix(arg, options->long_name); + if (options->type == OPTION_ARGUMENT) { + if (!rest) + continue; + if (*rest == '=') + return opterror(options, "takes no value", + flags); + if (*rest) + continue; + p->out[p->cpidx++] = arg - 2; + return 0; + } + if (!rest) { + /* abbreviated? */ + if (!strncmp(options->long_name, arg, arg_end - arg)) { +is_abbreviated: + if (abbrev_option) { + /* + * If this is abbreviated, it is + * ambiguous. 
So when there is no + * exact match later, we need to + * error out. + */ + ambiguous_option = abbrev_option; + ambiguous_flags = abbrev_flags; + } + if (!(flags & OPT_UNSET) && *arg_end) + p->opt = arg_end + 1; + abbrev_option = options; + abbrev_flags = flags; + continue; + } + /* negated and abbreviated very much? */ + if (!prefixcmp("no-", arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + /* negated? */ + if (strncmp(arg, "no-", 3)) + continue; + flags |= OPT_UNSET; + rest = skip_prefix(arg + 3, options->long_name); + /* abbreviated and negated? */ + if (!rest && !prefixcmp(options->long_name, arg + 3)) + goto is_abbreviated; + if (!rest) + continue; + } + if (*rest) { + if (*rest != '=') + continue; + p->opt = rest + 1; + } + return get_value(p, options, flags); + } + + if (ambiguous_option) + return pr_err("Ambiguous option: %s " + "(could be --%s%s or --%s%s)", + arg, + (ambiguous_flags & OPT_UNSET) ? "no-" : "", + ambiguous_option->long_name, + (abbrev_flags & OPT_UNSET) ? 
"no-" : "", + abbrev_option->long_name); + if (abbrev_option) + return get_value(p, abbrev_option, abbrev_flags); + return -2; +} + + +static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc, + const char **argv, int flags) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->argc = argc; + ctx->argv = argv; + ctx->out = argv; + ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); + ctx->flags = flags; + if ((flags & PARSE_OPT_KEEP_UNKNOWN) && + (flags & PARSE_OPT_STOP_AT_NON_OPTION)) + die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); +} + +static int parse_options_end(struct parse_opt_ctx_t *ctx) +{ + memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); + ctx->out[ctx->cpidx + ctx->argc] = NULL; + return ctx->cpidx + ctx->argc; +} + + +static int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, const char * const usagestr[]) +{ + int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); + + /* we must reset ->opt, unknown short option leave it dangling */ + ctx->opt = NULL; + + for (; ctx->argc; ctx->argc--, ctx->argv++) { + const char *arg = ctx->argv[0]; + + if (*arg != '-' || !arg[1]) { + if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) + break; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + continue; + } + + if (arg[1] != '-') { + ctx->opt = arg + 1; + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + default: + break; + } + if (ctx->opt) + check_typos(arg + 1, options); + while (ctx->opt) { + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, + options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, + options); + case -2: + /* fake a short option thing to hide + * the fact that we may have + * started to parse aggregated stuff + * + * This is leaky, too 
bad. + */ + ctx->argv[0] = strdup(ctx->opt - 1); + *(char *)ctx->argv[0] = '-'; + goto unknown; + default: + break; + } + } + continue; + } + + if (!arg[2]) { /* "--" */ + if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) { + ctx->argc--; + ctx->argv++; + } + break; + } + + if (internal_help && !strcmp(arg + 2, "help-all")) + return usage_with_options_internal(usagestr, options, + 1); + if (internal_help && !strcmp(arg + 2, "help")) + return parse_options_usage(usagestr, options); + switch (parse_long_opt(ctx, arg + 2, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + default: + break; + } + continue; +unknown: + if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN)) + return PARSE_OPT_UNKNOWN; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + ctx->opt = NULL; + } + return PARSE_OPT_DONE; +} + +int parse_options(int argc, const char **argv, const struct option *options, + const char * const usagestr[], int flags) +{ + struct parse_opt_ctx_t ctx; + + parse_options_start(&ctx, argc, argv, flags); + switch (parse_options_step(&ctx, options, usagestr)) { + case PARSE_OPT_HELP: + exit(129); + case PARSE_OPT_DONE: + break; + default: /* PARSE_OPT_UNKNOWN */ + if (ctx.argv[0][1] == '-') { + pr_err("unknown option `%s'", ctx.argv[0] + 2); + } else { + pr_err("unknown switch `%c'", *ctx.opt); + } + usage_with_options(usagestr, options); + } + + return parse_options_end(&ctx); +} diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c new file mode 100644 index 000000000000..3630a6d80d6e --- /dev/null +++ b/tools/kvm/util/rbtree-interval.c @@ -0,0 +1,58 @@ +#include <kvm/rbtree-interval.h> +#include <stddef.h> +#include <errno.h> + +struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct rb_int_node *cur = rb_int(node); + + if (point < cur->low) + node = node->rb_left; + else if (cur->high <= point) + node = node->rb_right; + else + 
return cur; + } + + return NULL; +} + +struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high) +{ + struct rb_int_node *range; + + range = rb_int_search_single(root, low); + if (range == NULL) + return NULL; + + /* We simply verify that 'high' is smaller than the end of the range where 'low' is located */ + if (range->high < high) + return NULL; + + return range; +} + +int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node) +{ + struct rb_node **node = &root->rb_node, *parent = NULL; + + while (*node) { + struct rb_int_node *cur = rb_int(*node); + + parent = *node; + if (i_node->high <= cur->low) + node = &cur->node.rb_left; + else if (cur->high <= i_node->low) + node = &cur->node.rb_right; + else + return -EEXIST; + } + + rb_link_node(&i_node->node, parent, node); + rb_insert_color(&i_node->node, root); + + return 0; +} diff --git a/tools/kvm/util/read-write.c b/tools/kvm/util/read-write.c new file mode 100644 index 000000000000..44709dfd4353 --- /dev/null +++ b/tools/kvm/util/read-write.c @@ -0,0 +1,354 @@ +#include "kvm/read-write.h" + +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +/* Same as read(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xread(int fd, void *buf, size_t count) +{ + ssize_t nr; + +restart: + nr = read(fd, buf, count); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as write(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xwrite(int fd, const void *buf, size_t count) +{ + ssize_t nr; + +restart: + nr = write(fd, buf, count); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t read_in_full(int fd, void *buf, size_t count) +{ + ssize_t total = 0; + char *p = buf; + + while (count > 0) { + ssize_t nr; + + nr = xread(fd, p, count); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + count -= nr; + total += nr; + p += nr; + } + + return total; +} + +ssize_t write_in_full(int fd, const void *buf, size_t count) +{ + const char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t nr; + + nr = xwrite(fd, p, count); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + count -= nr; + total += nr; + p += nr; + } + + return total; +} + +/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xpread(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t nr; + +restart: + nr = pread(fd, buf, count, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset) +{ + ssize_t nr; + +restart: + nr = pwrite(fd, buf, count, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t total = 0; + char *p = buf; + + while (count > 0) { + ssize_t nr; + + nr = xpread(fd, p, count, offset); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + count -= nr; + total += nr; + p += nr; + offset += nr; + } + + return total; +} + +ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset) +{ + const char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t nr; + + nr = xpwrite(fd, p, count, offset); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + count -= nr; + total += nr; + p += nr; + offset += nr; + } + + return total; +} + +/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t nr; + +restart: + nr = readv(fd, iov, iovcnt); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as writev(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t nr; + +restart: + nr = writev(fd, iov, iovcnt); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt) +{ + size_t size = 0; + while (iovcnt--) + size += (iov++)->iov_len; + + return size; +} + +static inline void shift_iovec(const struct iovec **iov, int *iovcnt, + size_t nr, ssize_t *total, size_t *count, off_t *offset) +{ + while (nr >= (*iov)->iov_len) { + nr -= (*iov)->iov_len; + *total += (*iov)->iov_len; + *count -= (*iov)->iov_len; + if (offset) + *offset += (*iov)->iov_len; + (*iovcnt)--; + (*iov)++; + } +} + +ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xreadv(fd, iov, iovcnt); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL); + } + + return total; +} + +ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xwritev(fd, iov, iovcnt); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL); + } + + return total; +} + +/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t nr; + +restart: + nr = preadv(fd, iov, iovcnt, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t nr; + +restart: + nr = pwritev(fd, iov, iovcnt, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xpreadv(fd, iov, iovcnt, offset); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset); + } + + return total; +} + +ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xpwritev(fd, iov, iovcnt, offset); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset); + } + + return total; +} + +#ifdef CONFIG_HAS_AIO +int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param) +{ + struct iocb *ios[1] = { iocb }; + int ret; + + io_prep_pwritev(iocb, fd, iov, iovcnt, offset); + io_set_eventfd(iocb, ev); + iocb->data = param; + +restart: + ret = io_submit(ctx, 1, ios); + if (ret == -EAGAIN) + goto restart; + return ret; +} + +int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param) +{ + struct iocb *ios[1] = { iocb }; + int ret; + + io_prep_preadv(iocb, fd, iov, iovcnt, offset); + io_set_eventfd(iocb, ev); + iocb->data = param; + +restart: + ret = io_submit(ctx, 1, ios); + if (ret == -EAGAIN) + goto restart; + return ret; +} +#endif diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh new file mode 100755 index 000000000000..49867ddca6a7 --- /dev/null +++ 
b/tools/kvm/util/set_private_br.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Author: Amos Kong <kongjianjun@gmail.com> +# Date: Apr 14, 2011 +# Description: this script is used to create/delete a private bridge, +# launch a dhcp server on the bridge by dnsmasq. +# +# @ ./set_private_br.sh $bridge_name $subnet_prefix +# @ ./set_private_br.sh vbr0 192.168.33 + +brname='vbr0' +subnet='192.168.33' + +add_br() +{ + echo "add new private bridge: $brname" + /usr/sbin/brctl addbr $brname + echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6 + echo 1 > /proc/sys/net/ipv4/ip_forward + /usr/sbin/brctl stp $brname on + /usr/sbin/brctl setfd $brname 0 + ifconfig $brname $subnet.1 + ifconfig $brname up + # Add forward rule, then guest can access public network + iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE + /etc/init.d/dnsmasq stop + /etc/init.d/tftpd-hpa stop 2>/dev/null + dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd +} + +del_br() +{ + echo "cleanup bridge setup" + kill -9 `pgrep dnsmasq|tail -1` + ifconfig $brname down + /usr/sbin/brctl delbr $brname + iptables -t nat -D POSTROUTING -s $subnet.254/24 ! 
-d $subnet.254/24 -j MASQUERADE +} + + +if [ $# = 0 ]; then + del_br 2>/dev/null + exit +fi +if [ $# > 1 ]; then + brname="$1" +fi +if [ $# = 2 ]; then + subnet="$2" +fi +add_br diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c new file mode 100644 index 000000000000..99d6b0c08fb4 --- /dev/null +++ b/tools/kvm/util/strbuf.c @@ -0,0 +1,62 @@ + +/* user defined headers */ +#include <kvm/util.h> +#include <kvm/strbuf.h> + +int prefixcmp(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) { + if (!*prefix) + return 0; + else if (*str != *prefix) + return (unsigned char)*prefix - (unsigned char)*str; + } +} + +/** + * strlcat - Append a length-limited, %NUL-terminated string to another + * @dest: The string to be appended to + * @src: The string to append to it + * @count: The size of the destination buffer. + */ +size_t strlcat(char *dest, const char *src, size_t count) +{ + size_t dsize = strlen(dest); + size_t len = strlen(src); + size_t res = dsize + len; + + DIE_IF(dsize >= count); + + dest += dsize; + count -= dsize; + if (len >= count) + len = count - 1; + + memcpy(dest, src, len); + dest[len] = 0; + + return res; +} + +/** + * strlcpy - Copy a %NUL terminated string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @size: size of destination buffer + * + * Compatible with *BSD: the result is always a valid + * NUL-terminated string that fits in the buffer (unless, + * of course, the buffer size is zero). It does not pad + * out the result like strncpy() does. + */ +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? 
size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} diff --git a/tools/kvm/util/threadpool.c b/tools/kvm/util/threadpool.c new file mode 100644 index 000000000000..e64aa26dada4 --- /dev/null +++ b/tools/kvm/util/threadpool.c @@ -0,0 +1,175 @@ +#include "kvm/threadpool.h" +#include "kvm/mutex.h" +#include "kvm/kvm.h" + +#include <linux/kernel.h> +#include <linux/list.h> +#include <pthread.h> +#include <stdbool.h> + +static DEFINE_MUTEX(job_mutex); +static DEFINE_MUTEX(thread_mutex); +static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER; + +static LIST_HEAD(head); + +static pthread_t *threads; +static long threadcount; +static bool running; + +static struct thread_pool__job *thread_pool__job_pop_locked(void) +{ + struct thread_pool__job *job; + + if (list_empty(&head)) + return NULL; + + job = list_first_entry(&head, struct thread_pool__job, queue); + list_del(&job->queue); + + return job; +} + +static void thread_pool__job_push_locked(struct thread_pool__job *job) +{ + list_add_tail(&job->queue, &head); +} + +static struct thread_pool__job *thread_pool__job_pop(void) +{ + struct thread_pool__job *job; + + mutex_lock(&job_mutex); + job = thread_pool__job_pop_locked(); + mutex_unlock(&job_mutex); + return job; +} + +static void thread_pool__job_push(struct thread_pool__job *job) +{ + mutex_lock(&job_mutex); + thread_pool__job_push_locked(job); + mutex_unlock(&job_mutex); +} + +static void thread_pool__handle_job(struct thread_pool__job *job) +{ + while (job) { + job->callback(job->kvm, job->data); + + mutex_lock(&job->mutex); + + if (--job->signalcount > 0) + /* If the job was signaled again while we were working */ + thread_pool__job_push(job); + + mutex_unlock(&job->mutex); + + job = thread_pool__job_pop(); + } +} + +static void thread_pool__threadfunc_cleanup(void *param) +{ + mutex_unlock(&job_mutex); +} + +static void *thread_pool__threadfunc(void *param) +{ + pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL); + + 
kvm__set_thread_name("threadpool-worker"); + + while (running) { + struct thread_pool__job *curjob = NULL; + + mutex_lock(&job_mutex); + while (running && (curjob = thread_pool__job_pop_locked()) == NULL) + pthread_cond_wait(&job_cond, &job_mutex.mutex); + mutex_unlock(&job_mutex); + + if (running) + thread_pool__handle_job(curjob); + } + + pthread_cleanup_pop(0); + + return NULL; +} + +static int thread_pool__addthread(void) +{ + int res; + void *newthreads; + + mutex_lock(&thread_mutex); + newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t)); + if (newthreads == NULL) { + mutex_unlock(&thread_mutex); + return -1; + } + + threads = newthreads; + + res = pthread_create(threads + threadcount, NULL, + thread_pool__threadfunc, NULL); + + if (res == 0) + threadcount++; + mutex_unlock(&thread_mutex); + + return res; +} + +int thread_pool__init(struct kvm *kvm) +{ + unsigned long i; + unsigned int thread_count = sysconf(_SC_NPROCESSORS_ONLN); + + running = true; + + for (i = 0; i < thread_count; i++) + if (thread_pool__addthread() < 0) + return i; + + return i; +} +late_init(thread_pool__init); + +int thread_pool__exit(struct kvm *kvm) +{ + int i; + void *NUL = NULL; + + running = false; + + for (i = 0; i < threadcount; i++) { + mutex_lock(&job_mutex); + pthread_cond_signal(&job_cond); + mutex_unlock(&job_mutex); + } + + for (i = 0; i < threadcount; i++) { + pthread_join(threads[i], NUL); + } + + return 0; +} +late_exit(thread_pool__exit); + +void thread_pool__do_job(struct thread_pool__job *job) +{ + struct thread_pool__job *jobinfo = job; + + if (jobinfo == NULL || jobinfo->callback == NULL) + return; + + mutex_lock(&jobinfo->mutex); + if (jobinfo->signalcount++ == 0) + thread_pool__job_push(job); + mutex_unlock(&jobinfo->mutex); + + mutex_lock(&job_mutex); + pthread_cond_signal(&job_cond); + mutex_unlock(&job_mutex); +} diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c new file mode 100644 index 000000000000..c11a15a304a5 --- /dev/null +++ 
b/tools/kvm/util/util.c @@ -0,0 +1,133 @@ +/* + * Taken from perf which in turn take it from GIT + */ + +#include "kvm/util.h" + +#include <kvm/kvm.h> +#include <linux/magic.h> /* For HUGETLBFS_MAGIC */ +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/statfs.h> + +static void report(const char *prefix, const char *err, va_list params) +{ + char msg[1024]; + vsnprintf(msg, sizeof(msg), err, params); + fprintf(stderr, " %s%s\n", prefix, msg); +} + +static NORETURN void die_builtin(const char *err, va_list params) +{ + report(" Fatal: ", err, params); + exit(128); +} + +static void error_builtin(const char *err, va_list params) +{ + report(" Error: ", err, params); +} + +static void warn_builtin(const char *warn, va_list params) +{ + report(" Warning: ", warn, params); +} + +static void info_builtin(const char *info, va_list params) +{ + report(" Info: ", info, params); +} + +void die(const char *err, ...) +{ + va_list params; + + va_start(params, err); + die_builtin(err, params); + va_end(params); +} + +int pr_err(const char *err, ...) +{ + va_list params; + + va_start(params, err); + error_builtin(err, params); + va_end(params); + return -1; +} + +void pr_warning(const char *warn, ...) +{ + va_list params; + + va_start(params, warn); + warn_builtin(warn, params); + va_end(params); +} + +void pr_info(const char *info, ...) 
+{ + va_list params; + + va_start(params, info); + info_builtin(info, params); + va_end(params); +} + +void die_perror(const char *s) +{ + perror(s); + exit(1); +} + +void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size) +{ + char mpath[PATH_MAX]; + int fd; + struct statfs sfs; + void *addr; + unsigned long blk_size; + + if (statfs(htlbfs_path, &sfs) < 0) + die("Can't stat %s\n", htlbfs_path); + + if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC) + die("%s is not hugetlbfs!\n", htlbfs_path); + + blk_size = (unsigned long)sfs.f_bsize; + if (sfs.f_bsize == 0 || blk_size > size) { + die("Can't use hugetlbfs pagesize %ld for mem size %lld\n", + blk_size, size); + } + + kvm->ram_pagesize = blk_size; + + snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path); + fd = mkstemp(mpath); + if (fd < 0) + die("Can't open %s for hugetlbfs map\n", mpath); + unlink(mpath); + if (ftruncate(fd, size) < 0) + die("Can't ftruncate for mem mapping size %lld\n", + size); + addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0); + close(fd); + + return addr; +} + +/* This function wraps the decision between hugetlbfs map (if requested) or normal mmap */ +void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size) +{ + if (hugetlbfs_path) + /* + * We don't /need/ to map guest RAM from hugetlbfs, but we do so + * if the user specifies a hugetlbfs path. 
+ */ + return mmap_hugetlbfs(kvm, hugetlbfs_path, size); + else { + kvm->ram_pagesize = getpagesize(); + return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0); + } +} diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c new file mode 100644 index 000000000000..b9ce8ce60f2f --- /dev/null +++ b/tools/kvm/virtio/9p-pdu.c @@ -0,0 +1,287 @@ +#include "kvm/util.h" +#include "kvm/virtio-9p.h" + +#include <endian.h> +#include <stdint.h> + +#include <linux/compiler.h> +#include <net/9p/9p.h> + +static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size) +{ + size_t len; + int i, copied = 0; + u16 iov_cnt = pdu->out_iov_cnt; + size_t offset = pdu->read_offset; + struct iovec *iov = pdu->out_iov; + + for (i = 0; i < iov_cnt && size; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } else { + len = MIN(iov[i].iov_len - offset, size); + memcpy(data, iov[i].iov_base + offset, len); + size -= len; + data += len; + offset = 0; + copied += len; + } + } + pdu->read_offset += copied; +} + +static void virtio_p9_pdu_write(struct p9_pdu *pdu, + const void *data, size_t size) +{ + size_t len; + int i, copied = 0; + u16 iov_cnt = pdu->in_iov_cnt; + size_t offset = pdu->write_offset; + struct iovec *iov = pdu->in_iov; + + for (i = 0; i < iov_cnt && size; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } else { + len = MIN(iov[i].iov_len - offset, size); + memcpy(iov[i].iov_base + offset, data, len); + size -= len; + data += len; + offset = 0; + copied += len; + } + } + pdu->write_offset += copied; +} + +static void virtio_p9_wstat_free(struct p9_wstat *stbuf) +{ + free(stbuf->name); + free(stbuf->uid); + free(stbuf->gid); + free(stbuf->muid); +} + +static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap) +{ + int retval = 0; + const char *ptr; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b': + { + int8_t *val = va_arg(ap, int8_t *); + 
virtio_p9_pdu_read(pdu, val, sizeof(*val)); + } + break; + case 'w': + { + int16_t le_val; + int16_t *val = va_arg(ap, int16_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le16toh(le_val); + } + break; + case 'd': + { + int32_t le_val; + int32_t *val = va_arg(ap, int32_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le32toh(le_val); + } + break; + case 'q': + { + int64_t le_val; + int64_t *val = va_arg(ap, int64_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le64toh(le_val); + } + break; + case 's': + { + int16_t len; + char **str = va_arg(ap, char **); + + virtio_p9_pdu_readf(pdu, "w", &len); + *str = malloc(len + 1); + if (*str == NULL) { + retval = ENOMEM; + break; + } + virtio_p9_pdu_read(pdu, *str, len); + (*str)[len] = 0; + } + break; + case 'Q': + { + struct p9_qid *qid = va_arg(ap, struct p9_qid *); + retval = virtio_p9_pdu_readf(pdu, "bdq", + &qid->type, &qid->version, + &qid->path); + } + break; + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + memset(stbuf, 0, sizeof(struct p9_wstat)); + stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1; + retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss", + &stbuf->size, &stbuf->type, + &stbuf->dev, &stbuf->qid, + &stbuf->mode, &stbuf->atime, + &stbuf->mtime, &stbuf->length, + &stbuf->name, &stbuf->uid, + &stbuf->gid, &stbuf->muid); + if (retval) + virtio_p9_wstat_free(stbuf); + } + break; + case 'I': + { + struct p9_iattr_dotl *p9attr = va_arg(ap, + struct p9_iattr_dotl *); + + retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq", + &p9attr->valid, + &p9attr->mode, + &p9attr->uid, + &p9attr->gid, + &p9attr->size, + &p9attr->atime_sec, + &p9attr->atime_nsec, + &p9attr->mtime_sec, + &p9attr->mtime_nsec); + } + break; + default: + retval = EINVAL; + break; + } + } + return retval; +} + +static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap) +{ + int retval = 0; + const char *ptr; + + for (ptr = fmt; *ptr; ptr++) { + 
switch (*ptr) { + case 'b': + { + int8_t val = va_arg(ap, int); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'w': + { + int16_t val = htole16(va_arg(ap, int)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'd': + { + int32_t val = htole32(va_arg(ap, int32_t)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'q': + { + int64_t val = htole64(va_arg(ap, int64_t)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 's': + { + uint16_t len = 0; + const char *s = va_arg(ap, char *); + if (s) + len = MIN(strlen(s), USHRT_MAX); + virtio_p9_pdu_writef(pdu, "w", len); + virtio_p9_pdu_write(pdu, s, len); + } + break; + case 'Q': + { + struct p9_qid *qid = va_arg(ap, struct p9_qid *); + retval = virtio_p9_pdu_writef(pdu, "bdq", + qid->type, qid->version, + qid->path); + } + break; + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss", + stbuf->size, stbuf->type, + stbuf->dev, &stbuf->qid, + stbuf->mode, stbuf->atime, + stbuf->mtime, stbuf->length, + stbuf->name, stbuf->uid, + stbuf->gid, stbuf->muid); + } + break; + case 'A': + { + struct p9_stat_dotl *stbuf = va_arg(ap, + struct p9_stat_dotl *); + retval = virtio_p9_pdu_writef(pdu, + "qQdddqqqqqqqqqqqqqqq", + stbuf->st_result_mask, + &stbuf->qid, + stbuf->st_mode, + stbuf->st_uid, + stbuf->st_gid, + stbuf->st_nlink, + stbuf->st_rdev, + stbuf->st_size, + stbuf->st_blksize, + stbuf->st_blocks, + stbuf->st_atime_sec, + stbuf->st_atime_nsec, + stbuf->st_mtime_sec, + stbuf->st_mtime_nsec, + stbuf->st_ctime_sec, + stbuf->st_ctime_nsec, + stbuf->st_btime_sec, + stbuf->st_btime_nsec, + stbuf->st_gen, + stbuf->st_data_version); + } + break; + default: + retval = EINVAL; + break; + } + } + return retval; +} + +int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = virtio_p9_decode(pdu, fmt, ap); + va_end(ap); + + return ret; +} + +int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...) +{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = virtio_p9_pdu_encode(pdu, fmt, ap); + va_end(ap); + + return ret; +} diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c new file mode 100644 index 000000000000..60865dd0ca7c --- /dev/null +++ b/tools/kvm/virtio/9p.c @@ -0,0 +1,1441 @@ +#include "kvm/virtio-pci-dev.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/threadpool.h" +#include "kvm/irq.h" +#include "kvm/virtio-9p.h" +#include "kvm/guest_compat.h" +#include "kvm/builtin-setup.h" + +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/vfs.h> + +#include <linux/virtio_ring.h> +#include <linux/virtio_9p.h> +#include <net/9p/9p.h> + +static LIST_HEAD(devs); +static int compat_id = -1; + +static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid); +static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid) +{ + struct rb_node *node = dev->fids.rb_node; + struct p9_fid *pfid = NULL; + + while (node) { + struct p9_fid *cur = rb_entry(node, struct p9_fid, node); + + if (fid < cur->fid) { + node = node->rb_left; + } else if (fid > cur->fid) { + node = node->rb_right; + } else { + return cur; + } + } + + pfid = calloc(sizeof(*pfid), 1); + if (!pfid) + return NULL; + + pfid->fid = fid; + strcpy(pfid->abs_path, dev->root_dir); + pfid->path = pfid->abs_path + strlen(dev->root_dir); + + insert_new_fid(dev, pfid); + + return pfid; +} + +static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid) +{ + struct rb_node **node = &(dev->fids.rb_node), *parent = NULL; + + while (*node) { + int result = fid->fid - rb_entry(*node, struct p9_fid, node)->fid; + + parent = *node; + if (result < 0) + node = &((*node)->rb_left); + 
else if (result > 0) + node = &((*node)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&fid->node, parent, node); + rb_insert_color(&fid->node, &dev->fids); + return 0; +} + +static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid) +{ + struct p9_fid *new; + + new = find_or_create_fid(p9dev, fid); + + return new; +} + +/* Warning: Immediately use value returned from this function */ +static const char *rel_to_abs(struct p9_dev *p9dev, + const char *path, char *abs_path) +{ + sprintf(abs_path, "%s/%s", p9dev->root_dir, path); + + return abs_path; +} + +static void stat2qid(struct stat *st, struct p9_qid *qid) +{ + *qid = (struct p9_qid) { + .path = st->st_ino, + .version = st->st_mtime, + }; + + if (S_ISDIR(st->st_mode)) + qid->type |= P9_QTDIR; +} + +static void close_fid(struct p9_dev *p9dev, u32 fid) +{ + struct p9_fid *pfid = get_fid(p9dev, fid); + + if (pfid->fd > 0) + close(pfid->fd); + + if (pfid->dir) + closedir(pfid->dir); + + rb_erase(&pfid->node, &p9dev->fids); + free(pfid); +} + +static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size) +{ + u8 cmd; + u16 tag; + + pdu->read_offset = sizeof(u32); + virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag); + pdu->write_offset = 0; + /* cmd + 1 is the reply message */ + virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag); +} + +static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt) +{ + int i; + u32 total = 0; + for (i = 0; (i < iov_cnt) && (total < count); i++) { + if (total + iov[i].iov_len > count) { + /* we don't need this iov fully */ + iov[i].iov_len -= ((total + iov[i].iov_len) - count); + i++; + break; + } + total += iov[i].iov_len; + } + return i; +} + +static void virtio_p9_error_reply(struct p9_dev *p9dev, + struct p9_pdu *pdu, int err, u32 *outlen) +{ + u16 tag; + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", err); + *outlen = pdu->write_offset; + + /* read the tag from input */ + pdu->read_offset = sizeof(u32) + sizeof(u8); + 
virtio_p9_pdu_readf(pdu, "w", &tag); + + /* Update the header */ + pdu->write_offset = 0; + virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag); +} + +static void virtio_p9_version(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 msize; + char *version; + virtio_p9_pdu_readf(pdu, "ds", &msize, &version); + /* + * reply with the same msize the client sent us + * Error out if the request is not for 9P2000.L + */ + if (!strcmp(version, VIRTIO_9P_VERSION_DOTL)) + virtio_p9_pdu_writef(pdu, "ds", msize, version); + else + virtio_p9_pdu_writef(pdu, "ds", msize, "unknown"); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(version); + return; +} + +static void virtio_p9_clunk(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid; + + virtio_p9_pdu_readf(pdu, "d", &fid); + close_fid(p9dev, fid); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +} + +/* + * FIXME!! Need to map to protocol independent value. Upstream + * 9p also have the same BUG + */ +static int virtio_p9_openflags(int flags) +{ + flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT); + flags |= O_NOFOLLOW; + return flags; +} + +static bool is_dir(struct p9_fid *fid) +{ + struct stat st; + + stat(fid->abs_path, &st); + + return S_ISDIR(st.st_mode); +} + +static void virtio_p9_open(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid, flags; + struct stat st; + struct p9_qid qid; + struct p9_fid *new_fid; + + + virtio_p9_pdu_readf(pdu, "dd", &fid, &flags); + new_fid = get_fid(p9dev, fid); + + if (lstat(new_fid->abs_path, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + + if (is_dir(new_fid)) { + new_fid->dir = opendir(new_fid->abs_path); + if (!new_fid->dir) + goto err_out; + } else { + new_fid->fd = open(new_fid->abs_path, + virtio_p9_openflags(flags)); + if (new_fid->fd < 0) + goto err_out; + } + /* FIXME!! 
need ot send proper iounit */ + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_create(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int fd, ret; + char *name; + struct stat st; + struct p9_qid qid; + struct p9_fid *dfid; + char full_path[PATH_MAX]; + u32 dfid_val, flags, mode, gid; + + virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val, + &name, &flags, &mode, &gid); + dfid = get_fid(p9dev, dfid_val); + + flags = virtio_p9_openflags(flags); + + sprintf(full_path, "%s/%s", dfid->abs_path, name); + fd = open(full_path, flags | O_CREAT, mode); + if (fd < 0) + goto err_out; + dfid->fd = fd; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + sprintf(dfid->path, "%s/%s", dfid->path, name); + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(name); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_mkdir(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + struct stat st; + struct p9_qid qid; + struct p9_fid *dfid; + char full_path[PATH_MAX]; + u32 dfid_val, mode, gid; + + virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val, + &name, &mode, &gid); + dfid = get_fid(p9dev, dfid_val); + + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = mkdir(full_path, mode); + if (ret < 0) + goto err_out; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(name); + return; +err_out: + free(name); + 
virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_walk(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u8 i; + u16 nwqid; + u16 nwname; + struct p9_qid wqid; + struct p9_fid *new_fid, *old_fid; + u32 fid_val, newfid_val; + + + virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname); + new_fid = get_fid(p9dev, newfid_val); + + nwqid = 0; + if (nwname) { + struct p9_fid *fid = get_fid(p9dev, fid_val); + + strcpy(new_fid->path, fid->path); + /* skip the space for count */ + pdu->write_offset += sizeof(u16); + for (i = 0; i < nwname; i++) { + struct stat st; + char tmp[PATH_MAX] = {0}; + char full_path[PATH_MAX]; + char *str; + + virtio_p9_pdu_readf(pdu, "s", &str); + + /* Format the new path we're 'walk'ing into */ + sprintf(tmp, "%s/%s", new_fid->path, str); + + free(str); + + if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0) + goto err_out; + + stat2qid(&st, &wqid); + strcpy(new_fid->path, tmp); + new_fid->uid = fid->uid; + nwqid++; + virtio_p9_pdu_writef(pdu, "Q", &wqid); + } + } else { + /* + * update write_offset so our outlen get correct value + */ + pdu->write_offset += sizeof(u16); + old_fid = get_fid(p9dev, fid_val); + strcpy(new_fid->path, old_fid->path); + new_fid->uid = old_fid->uid; + } + *outlen = pdu->write_offset; + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", nwqid); + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_attach(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + char *uname; + char *aname; + struct stat st; + struct p9_qid qid; + struct p9_fid *fid; + u32 fid_val, afid, uid; + + virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid, + &uname, &aname, &uid); + + free(uname); + free(aname); + + if (lstat(p9dev->root_dir, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + + fid = get_fid(p9dev, fid_val); + fid->uid = uid; + strcpy(fid->path, 
"/"); + + virtio_p9_pdu_writef(pdu, "Q", &qid); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_fill_stat(struct p9_dev *p9dev, + struct stat *st, struct p9_stat_dotl *statl) +{ + memset(statl, 0, sizeof(*statl)); + statl->st_mode = st->st_mode; + statl->st_nlink = st->st_nlink; + statl->st_uid = st->st_uid; + statl->st_gid = st->st_gid; + statl->st_rdev = st->st_rdev; + statl->st_size = st->st_size; + statl->st_blksize = st->st_blksize; + statl->st_blocks = st->st_blocks; + statl->st_atime_sec = st->st_atime; + statl->st_atime_nsec = st->st_atim.tv_nsec; + statl->st_mtime_sec = st->st_mtime; + statl->st_mtime_nsec = st->st_mtim.tv_nsec; + statl->st_ctime_sec = st->st_ctime; + statl->st_ctime_nsec = st->st_ctim.tv_nsec; + /* Currently we only support BASIC fields in stat */ + statl->st_result_mask = P9_STATS_BASIC; + stat2qid(st, &statl->qid); +} + +static void virtio_p9_read(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u64 offset; + u32 fid_val; + u16 iov_cnt; + void *iov_base; + size_t iov_len; + u32 count, rcount; + struct p9_fid *fid; + + + rcount = 0; + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + iov_base = pdu->in_iov[0].iov_base; + iov_len = pdu->in_iov[0].iov_len; + iov_cnt = pdu->in_iov_cnt; + pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32); + pdu->in_iov[0].iov_len -= VIRTIO_9P_HDR_LEN + sizeof(u32); + pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov, + count, + pdu->in_iov_cnt); + rcount = preadv(fid->fd, pdu->in_iov, + pdu->in_iov_cnt, offset); + if (rcount > count) + rcount = count; + /* + * Update the iov_base back, so that rest of + * pdu_writef works correctly. 
+ */ + pdu->in_iov[0].iov_base = iov_base; + pdu->in_iov[0].iov_len = iov_len; + pdu->in_iov_cnt = iov_cnt; + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", rcount); + *outlen = pdu->write_offset + rcount; + virtio_p9_set_reply_header(pdu, *outlen); + return; +} + +static int virtio_p9_dentry_size(struct dirent *dent) +{ + /* + * Size of each dirent: + * qid(13) + offset(8) + type(1) + name_len(2) + name + */ + return 24 + strlen(dent->d_name); +} + +static void virtio_p9_readdir(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid_val; + u32 count, rcount; + struct stat st; + struct p9_fid *fid; + struct dirent *dent; + char full_path[PATH_MAX]; + u64 offset, old_offset; + + rcount = 0; + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + if (!is_dir(fid)) { + errno = EINVAL; + goto err_out; + } + + /* Move the offset specified */ + seekdir(fid->dir, offset); + + old_offset = offset; + /* If reading a dir, fill the buffer with p9_stat entries */ + dent = readdir(fid->dir); + + /* Skip the space for writing count */ + pdu->write_offset += sizeof(u32); + while (dent) { + u32 read; + struct p9_qid qid; + + if ((rcount + virtio_p9_dentry_size(dent)) > count) { + /* seek to the previous offset and return */ + seekdir(fid->dir, old_offset); + break; + } + old_offset = dent->d_off; + lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st); + stat2qid(&st, &qid); + read = pdu->write_offset; + virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off, + dent->d_type, dent->d_name); + rcount += pdu->write_offset - read; + dent = readdir(fid->dir); + } + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", rcount); + *outlen = pdu->write_offset + rcount; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + + +static void virtio_p9_getattr(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) 
+{ + u32 fid_val; + struct stat st; + u64 request_mask; + struct p9_fid *fid; + struct p9_stat_dotl statl; + + virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask); + fid = get_fid(p9dev, fid_val); + if (lstat(fid->abs_path, &st) < 0) + goto err_out; + + virtio_p9_fill_stat(p9dev, &st, &statl); + virtio_p9_pdu_writef(pdu, "A", &statl); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +/* FIXME!! from linux/fs.h */ +/* + * Attribute flags. These should be or-ed together to figure out what + * has been changed! + */ +#define ATTR_MODE (1 << 0) +#define ATTR_UID (1 << 1) +#define ATTR_GID (1 << 2) +#define ATTR_SIZE (1 << 3) +#define ATTR_ATIME (1 << 4) +#define ATTR_MTIME (1 << 5) +#define ATTR_CTIME (1 << 6) +#define ATTR_ATIME_SET (1 << 7) +#define ATTR_MTIME_SET (1 << 8) +#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG (1 << 10) +#define ATTR_KILL_SUID (1 << 11) +#define ATTR_KILL_SGID (1 << 12) +#define ATTR_FILE (1 << 13) +#define ATTR_KILL_PRIV (1 << 14) +#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ +#define ATTR_TIMES_SET (1 << 16) + +#define ATTR_MASK 127 + +static void virtio_p9_setattr(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret = 0; + u32 fid_val; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr); + fid = get_fid(p9dev, fid_val); + + if (p9attr.valid & ATTR_MODE) { + ret = chmod(fid->abs_path, p9attr.mode); + if (ret < 0) + goto err_out; + } + if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) { + struct timespec times[2]; + if (p9attr.valid & ATTR_ATIME) { + if (p9attr.valid & ATTR_ATIME_SET) { + times[0].tv_sec = p9attr.atime_sec; + times[0].tv_nsec = p9attr.atime_nsec; + } else { + times[0].tv_nsec = UTIME_NOW; + } + } else { + times[0].tv_nsec = UTIME_OMIT; + } + if (p9attr.valid & ATTR_MTIME) { + if 
(p9attr.valid & ATTR_MTIME_SET) { + times[1].tv_sec = p9attr.mtime_sec; + times[1].tv_nsec = p9attr.mtime_nsec; + } else { + times[1].tv_nsec = UTIME_NOW; + } + } else + times[1].tv_nsec = UTIME_OMIT; + + ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW); + if (ret < 0) + goto err_out; + } + /* + * If the only valid entry in iattr is ctime we can call + * chown(-1,-1) to update the ctime of the file + */ + if ((p9attr.valid & (ATTR_UID | ATTR_GID)) || + ((p9attr.valid & ATTR_CTIME) + && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) { + if (!(p9attr.valid & ATTR_UID)) + p9attr.uid = -1; + + if (!(p9attr.valid & ATTR_GID)) + p9attr.gid = -1; + + ret = lchown(fid->abs_path, p9attr.uid, p9attr.gid); + if (ret < 0) + goto err_out; + } + if (p9attr.valid & (ATTR_SIZE)) { + ret = truncate(fid->abs_path, p9attr.size); + if (ret < 0) + goto err_out; + } + *outlen = VIRTIO_9P_HDR_LEN; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_write(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + + u64 offset; + u32 fid_val; + u32 count; + ssize_t res; + u16 iov_cnt; + void *iov_base; + size_t iov_len; + struct p9_fid *fid; + /* u32 fid + u64 offset + u32 count */ + int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32); + + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + iov_base = pdu->out_iov[0].iov_base; + iov_len = pdu->out_iov[0].iov_len; + iov_cnt = pdu->out_iov_cnt; + + /* Adjust the iovec to skip the header and meta data */ + pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size); + pdu->out_iov[0].iov_len -= (sizeof(struct p9_msg) + twrite_size); + pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count, + pdu->out_iov_cnt); + res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset); + /* + * Update the iov_base back, so that rest of + * pdu_readf works correctly. 
+ */ + pdu->out_iov[0].iov_base = iov_base; + pdu->out_iov[0].iov_len = iov_len; + pdu->out_iov_cnt = iov_cnt; + + if (res < 0) + goto err_out; + virtio_p9_pdu_writef(pdu, "d", res); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_remove(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val; + struct p9_fid *fid; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + ret = remove(fid->abs_path); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; + +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_rename(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val, new_fid_val; + struct p9_fid *fid, *new_fid; + char full_path[PATH_MAX], *new_name; + + virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name); + fid = get_fid(p9dev, fid_val); + new_fid = get_fid(p9dev, new_fid_val); + + sprintf(full_path, "%s/%s", new_fid->abs_path, new_name); + ret = rename(fid->abs_path, full_path); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; + +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_readlink(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val; + struct p9_fid *fid; + char target_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + memset(target_path, 0, PATH_MAX); + ret = readlink(fid->abs_path, target_path, PATH_MAX - 1); + if (ret < 0) + goto err_out; + + virtio_p9_pdu_writef(pdu, "s", target_path); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, 
outlen); + return; +} + +static void virtio_p9_statfs(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u64 fsid; + u32 fid_val; + struct p9_fid *fid; + struct statfs stat_buf; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + ret = statfs(fid->abs_path, &stat_buf); + if (ret < 0) + goto err_out; + /* FIXME!! f_blocks needs update based on client msize */ + fsid = (unsigned int) stat_buf.f_fsid.__val[0] | + (unsigned long long)stat_buf.f_fsid.__val[1] << 32; + virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type, + stat_buf.f_bsize, stat_buf.f_blocks, + stat_buf.f_bfree, stat_buf.f_bavail, + stat_buf.f_files, stat_buf.f_ffree, + fsid, stat_buf.f_namelen); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_mknod(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + struct stat st; + struct p9_fid *dfid; + struct p9_qid qid; + char full_path[PATH_MAX]; + u32 fid_val, mode, major, minor, gid; + + virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode, + &major, &minor, &gid); + + dfid = get_fid(p9dev, fid_val); + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = mknod(full_path, mode, makedev(major, minor)); + if (ret < 0) + goto err_out; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Q", &qid); + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_fsync(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + struct p9_fid *fid; + u32 fid_val, datasync; + + virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync); + fid = get_fid(p9dev, 
fid_val); + + if (datasync) + ret = fdatasync(fid->fd); + else + ret = fsync(fid->fd); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_symlink(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + struct stat st; + u32 fid_val, gid; + struct p9_qid qid; + struct p9_fid *dfid; + char new_name[PATH_MAX]; + char *old_path, *name; + + virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid); + + dfid = get_fid(p9dev, fid_val); + sprintf(new_name, "%s/%s", dfid->abs_path, name); + ret = symlink(old_path, new_name); + if (ret < 0) + goto err_out; + + if (lstat(new_name, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Q", &qid); + free(name); + free(old_path); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + free(old_path); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_link(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + u32 fid_val, dfid_val; + struct p9_fid *dfid, *fid; + char full_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name); + + dfid = get_fid(p9dev, dfid_val); + fid = get_fid(p9dev, fid_val); + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = link(fid->abs_path, full_path); + if (ret < 0) + goto err_out; + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; + +} + +static void virtio_p9_lock(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u8 ret; + u32 fid_val; + struct p9_flock flock; + + virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type, + &flock.flags, &flock.start, &flock.length, + &flock.proc_id, 
&flock.client_id); + + /* Just return success */ + ret = P9_LOCK_SUCCESS; + virtio_p9_pdu_writef(pdu, "d", ret); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(flock.client_id); + return; +} + +static void virtio_p9_getlock(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid_val; + struct p9_getlock glock; + virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type, + &glock.start, &glock.length, &glock.proc_id, + &glock.client_id); + + /* Just return success */ + glock.type = F_UNLCK; + virtio_p9_pdu_writef(pdu, "bqqds", glock.type, + glock.start, glock.length, glock.proc_id, + glock.client_id); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(glock.client_id); + return; +} + +static int virtio_p9_ancestor(char *path, char *ancestor) +{ + int size = strlen(ancestor); + if (!strncmp(path, ancestor, size)) { + /* + * Now check whether ancestor is a full name or + * or directory component and not just part + * of a name. 
+ */ + if (path[size] == '\0' || path[size] == '/') + return 1; + } + return 0; +} + +static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name) +{ + char tmp_name[PATH_MAX]; + size_t rp_sz = strlen(old_name); + + if (rp_sz == strlen(fid_path)) { + /* replace the full name */ + strcpy(fid_path, new_name); + return; + } + /* save the trailing path details */ + strcpy(tmp_name, fid_path + rp_sz); + sprintf(fid_path, "%s%s", new_name, tmp_name); + return; +} + +static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name) +{ + struct rb_node *node = rb_first(&p9dev->fids); + + while (node) { + struct p9_fid *fid = rb_entry(node, struct p9_fid, node); + + if (fid->fid != P9_NOFID && virtio_p9_ancestor(fid->path, old_name)) { + virtio_p9_fix_path(fid->path, old_name, new_name); + } + node = rb_next(node); + } +} + +static void virtio_p9_renameat(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *old_name, *new_name; + u32 old_dfid_val, new_dfid_val; + struct p9_fid *old_dfid, *new_dfid; + char old_full_path[PATH_MAX], new_full_path[PATH_MAX]; + + + virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name, + &new_dfid_val, &new_name); + + old_dfid = get_fid(p9dev, old_dfid_val); + new_dfid = get_fid(p9dev, new_dfid_val); + + sprintf(old_full_path, "%s/%s", old_dfid->abs_path, old_name); + sprintf(new_full_path, "%s/%s", new_dfid->abs_path, new_name); + ret = rename(old_full_path, new_full_path); + if (ret < 0) + goto err_out; + /* + * Now fix path in other fids, if the renamed path is part of + * that. 
+ */ + rename_fids(p9dev, old_name, new_name); + free(old_name); + free(new_name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(old_name); + free(new_name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_unlinkat(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + u32 fid_val, flags; + struct p9_fid *fid; + char full_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags); + fid = get_fid(p9dev, fid_val); + + sprintf(full_path, "%s/%s", fid->abs_path, name); + ret = remove(full_path); + if (ret < 0) + goto err_out; + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_flush(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u16 tag, oldtag; + + virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag); + virtio_p9_pdu_writef(pdu, "w", tag); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + + return; +} + +static void virtio_p9_eopnotsupp(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + return virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen); +} + +typedef void p9_handler(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen); + +/* FIXME should be removed when merging with latest linus tree */ +#define P9_TRENAMEAT 74 +#define P9_TUNLINKAT 76 + +static p9_handler *virtio_9p_dotl_handler [] = { + [P9_TREADDIR] = virtio_p9_readdir, + [P9_TSTATFS] = virtio_p9_statfs, + [P9_TGETATTR] = virtio_p9_getattr, + [P9_TSETATTR] = virtio_p9_setattr, + [P9_TXATTRWALK] = virtio_p9_eopnotsupp, + [P9_TXATTRCREATE] = virtio_p9_eopnotsupp, + [P9_TMKNOD] = virtio_p9_mknod, + [P9_TLOCK] = virtio_p9_lock, + [P9_TGETLOCK] = virtio_p9_getlock, + [P9_TRENAMEAT] = virtio_p9_renameat, + [P9_TREADLINK] = virtio_p9_readlink, + 
[P9_TUNLINKAT] = virtio_p9_unlinkat, + [P9_TMKDIR] = virtio_p9_mkdir, + [P9_TVERSION] = virtio_p9_version, + [P9_TLOPEN] = virtio_p9_open, + [P9_TATTACH] = virtio_p9_attach, + [P9_TWALK] = virtio_p9_walk, + [P9_TCLUNK] = virtio_p9_clunk, + [P9_TFSYNC] = virtio_p9_fsync, + [P9_TREAD] = virtio_p9_read, + [P9_TFLUSH] = virtio_p9_flush, + [P9_TLINK] = virtio_p9_link, + [P9_TSYMLINK] = virtio_p9_symlink, + [P9_TLCREATE] = virtio_p9_create, + [P9_TWRITE] = virtio_p9_write, + [P9_TREMOVE] = virtio_p9_remove, + [P9_TRENAME] = virtio_p9_rename, +}; + +static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq) +{ + struct p9_pdu *pdu = calloc(1, sizeof(*pdu)); + if (!pdu) + return NULL; + + /* skip the pdu header p9_msg */ + pdu->read_offset = VIRTIO_9P_HDR_LEN; + pdu->write_offset = VIRTIO_9P_HDR_LEN; + pdu->queue_head = virt_queue__get_inout_iov(kvm, vq, pdu->in_iov, + pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt); + return pdu; +} + +static u8 virtio_p9_get_cmd(struct p9_pdu *pdu) +{ + struct p9_msg *msg; + /* + * we can peek directly into pdu for a u8 + * value. 
The host endianess won't be an issue + */ + msg = pdu->out_iov[0].iov_base; + return msg->cmd; +} + +static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job) +{ + u8 cmd; + u32 len = 0; + p9_handler *handler; + struct p9_dev *p9dev; + struct virt_queue *vq; + struct p9_pdu *p9pdu; + + vq = job->vq; + p9dev = job->p9dev; + + p9pdu = virtio_p9_pdu_init(kvm, vq); + cmd = virtio_p9_get_cmd(p9pdu); + + if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) || + !virtio_9p_dotl_handler[cmd]) + handler = virtio_p9_eopnotsupp; + else + handler = virtio_9p_dotl_handler[cmd]; + + handler(p9dev, p9pdu, &len); + virt_queue__set_used_elem(vq, p9pdu->queue_head, len); + free(p9pdu); + return true; +} + +static void virtio_p9_do_io(struct kvm *kvm, void *param) +{ + struct p9_dev_job *job = (struct p9_dev_job *)param; + struct p9_dev *p9dev = job->p9dev; + struct virt_queue *vq = job->vq; + + while (virt_queue__available(vq)) { + virtio_p9_do_io_request(kvm, job); + p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs); + } +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct p9_dev *p9dev = dev; + + return ((u8 *)(p9dev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1 << VIRTIO_9P_MOUNT_TAG; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct p9_dev *p9dev = dev; + + p9dev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct p9_dev *p9dev = dev; + struct p9_dev_job *job; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &p9dev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + job = &p9dev->jobs[vq]; + + vring_init(&queue->vring, VIRTQUEUE_NUM, p, align); + + *job = (struct p9_dev_job) { + .vq = queue, + .p9dev = p9dev, + }; + thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job); + + return 0; +} + 
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct p9_dev *p9dev = dev; + + thread_pool__do_job(&p9dev->jobs[vq].job_id); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct p9_dev *p9dev = dev; + + return p9dev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTQUEUE_NUM; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +struct virtio_ops p9_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset) +{ + char *tag_name; + char tmp[PATH_MAX]; + struct kvm *kvm = opt->ptr; + + /* + * 9p dir can be of the form dirname,tag_name or + * just dirname. 
In the later case we use the + * default tag name + */ + tag_name = strstr(arg, ","); + if (tag_name) { + *tag_name = '\0'; + tag_name++; + } + if (realpath(arg, tmp)) { + if (virtio_9p__register(kvm, tmp, tag_name) < 0) + die("Unable to initialize virtio 9p"); + } else + die("Failed resolving 9p path"); + return 0; +} + +int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset) +{ + char path[PATH_MAX]; + struct stat st; + struct kvm *kvm = opt->ptr; + + if (stat(arg, &st) == 0 && + S_ISDIR(st.st_mode)) { + char tmp[PATH_MAX]; + + if (kvm->cfg.using_rootfs) + die("Please use only one rootfs directory atmost"); + + if (realpath(arg, tmp) == 0 || + virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + kvm->cfg.using_rootfs = 1; + return 0; + } + + snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg); + + if (stat(path, &st) == 0 && + S_ISDIR(st.st_mode)) { + char tmp[PATH_MAX]; + + if (kvm->cfg.using_rootfs) + die("Please use only one rootfs directory atmost"); + + if (realpath(path, tmp) == 0 || + virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + if (virtio_9p__register(kvm, "/", "hostfs") < 0) + die("Unable to initialize virtio 9p"); + kvm_setup_resolv(arg); + kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1; + kvm->cfg.custom_rootfs_name = arg; + return 0; + } + + return -1; +} + +int virtio_9p__init(struct kvm *kvm) +{ + struct p9_dev *p9dev; + + list_for_each_entry(p9dev, &devs, list) { + virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_9P, + VIRTIO_ID_9P, PCI_CLASS_9P); + } + + return 0; +} +virtio_dev_init(virtio_9p__init); + +int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name) +{ + struct p9_dev *p9dev; + int err = 0; + + p9dev = calloc(1, sizeof(*p9dev)); + if (!p9dev) + return -ENOMEM; + + if (!tag_name) + tag_name = VIRTIO_9P_DEFAULT_TAG; + + p9dev->config = calloc(1, 
sizeof(*p9dev->config) + strlen(tag_name) + 1); + if (p9dev->config == NULL) { + err = -ENOMEM; + goto free_p9dev; + } + + strcpy(p9dev->root_dir, root); + p9dev->config->tag_len = strlen(tag_name); + if (p9dev->config->tag_len > MAX_TAG_LEN) { + err = -EINVAL; + goto free_p9dev_config; + } + + memcpy(&p9dev->config->tag, tag_name, strlen(tag_name)); + + list_add(&p9dev->list, &devs); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO"); + + return err; + +free_p9dev_config: + free(p9dev->config); +free_p9dev: + free(p9dev); + return err; +} diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c new file mode 100644 index 000000000000..d1b64fabbc65 --- /dev/null +++ b/tools/kvm/virtio/balloon.c @@ -0,0 +1,279 @@ +#include "kvm/virtio-balloon.h" + +#include "kvm/virtio-pci-dev.h" + +#include "kvm/virtio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/guest_compat.h" +#include "kvm/kvm-ipc.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_balloon.h> + +#include <linux/kernel.h> +#include <linux/list.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <pthread.h> +#include <sys/eventfd.h> + +#define NUM_VIRT_QUEUES 3 +#define VIRTIO_BLN_QUEUE_SIZE 128 +#define VIRTIO_BLN_INFLATE 0 +#define VIRTIO_BLN_DEFLATE 1 +#define VIRTIO_BLN_STATS 2 + +struct bln_dev { + struct list_head list; + struct virtio_device vdev; + + u32 features; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct thread_pool__job jobs[NUM_VIRT_QUEUES]; + + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + struct virtio_balloon_stat *cur_stat; + u32 cur_stat_head; + u16 stat_count; + int stat_waitfd; + + struct virtio_balloon_config config; +}; + +static struct bln_dev bdev; +static int compat_id = -1; + +static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct 
virt_queue *queue) +{ + struct iovec iov[VIRTIO_BLN_QUEUE_SIZE]; + unsigned int len = 0; + u16 out, in, head; + u32 *ptrs, i; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + ptrs = iov[0].iov_base; + len = iov[0].iov_len / sizeof(u32); + + for (i = 0 ; i < len ; i++) { + void *guest_ptr; + + guest_ptr = guest_flat_to_host(kvm, ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT); + if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) { + madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED); + bdev->config.actual++; + } else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) { + bdev->config.actual--; + } + } + + virt_queue__set_used_elem(queue, head, len); + + return true; +} + +static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue) +{ + struct iovec iov[VIRTIO_BLN_QUEUE_SIZE]; + u16 out, in, head; + struct virtio_balloon_stat *stat; + u64 wait_val = 1; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + stat = iov[0].iov_base; + + /* Initial empty stat buffer */ + if (bdev->cur_stat == NULL) { + bdev->cur_stat = stat; + bdev->cur_stat_head = head; + + return true; + } + + memcpy(bdev->stats, stat, iov[0].iov_len); + + bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat); + bdev->cur_stat = stat; + bdev->cur_stat_head = head; + + if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0) + return -EFAULT; + + return 1; +} + +static void virtio_bln_do_io(struct kvm *kvm, void *param) +{ + struct virt_queue *vq = param; + + if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) { + virtio_bln_do_stat_request(kvm, &bdev, vq); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS); + return; + } + + while (virt_queue__available(vq)) { + virtio_bln_do_io_request(kvm, &bdev, vq); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs); + } +} + +static int virtio_bln__collect_stats(struct kvm *kvm) +{ + u64 tmp; + + virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head, + 
sizeof(struct virtio_balloon_stat)); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS); + + if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0) + return -EFAULT; + + return 0; +} + +static void virtio_bln__print_stats(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int r; + + if (WARN_ON(type != KVM_IPC_STAT || len)) + return; + + if (virtio_bln__collect_stats(kvm) < 0) + return; + + r = write(fd, bdev.stats, sizeof(bdev.stats)); + if (r < 0) + pr_warning("Failed sending memory stats"); +} + +static void handle_mem(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int mem; + + if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int))) + return; + + mem = *(int *)msg; + if (mem > 0) { + bdev.config.num_pages += 256 * mem; + } else if (mem < 0) { + if (bdev.config.num_pages < (u32)(256 * (-mem))) + return; + + bdev.config.num_pages += 256 * mem; + } + + /* Notify that the configuration space has changed */ + bdev.vdev.ops->signal_config(kvm, &bdev.vdev); +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct bln_dev *bdev = dev; + + return ((u8 *)(&bdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1 << VIRTIO_BALLOON_F_STATS_VQ; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct bln_dev *bdev = dev; + + bdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct bln_dev *bdev = dev; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &bdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue); + vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct bln_dev *bdev = dev; + + thread_pool__do_job(&bdev->jobs[vq]); + + return 0; +} + +static 
int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct bln_dev *bdev = dev; + + return bdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_BLN_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_bln__init(struct kvm *kvm) +{ + if (!kvm->cfg.balloon) + return 0; + + kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem); + kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats); + + bdev.stat_waitfd = eventfd(0, 0); + memset(&bdev.config, 0, sizeof(struct virtio_balloon_config)); + + virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_BLN, + VIRTIO_ID_BALLOON, PCI_CLASS_BLN); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON"); + + return 0; +} +virtio_dev_init(virtio_bln__init); + +int virtio_bln__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_bln__exit); diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c new file mode 100644 index 000000000000..44ac44baffdb --- /dev/null +++ b/tools/kvm/virtio/blk.c @@ -0,0 +1,319 @@ +#include "kvm/virtio-blk.h" + +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/ioeventfd.h" +#include "kvm/guest_compat.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_blk.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include 
<linux/types.h> +#include <pthread.h> + +#define VIRTIO_BLK_MAX_DEV 4 + +/* + * the header and status consume too entries + */ +#define DISK_SEG_MAX (VIRTIO_BLK_QUEUE_SIZE - 2) +#define VIRTIO_BLK_QUEUE_SIZE 256 +#define NUM_VIRT_QUEUES 1 + +struct blk_dev_req { + struct virt_queue *vq; + struct blk_dev *bdev; + struct iovec iov[VIRTIO_BLK_QUEUE_SIZE]; + u16 out, in, head; + struct kvm *kvm; +}; + +struct blk_dev { + struct mutex mutex; + + struct list_head list; + + struct virtio_device vdev; + struct virtio_blk_config blk_config; + struct disk_image *disk; + u32 features; + + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct blk_dev_req reqs[VIRTIO_BLK_QUEUE_SIZE]; + + pthread_t io_thread; + int io_efd; + + struct kvm *kvm; +}; + +static LIST_HEAD(bdevs); +static int compat_id = -1; + +void virtio_blk_complete(void *param, long len) +{ + struct blk_dev_req *req = param; + struct blk_dev *bdev = req->bdev; + int queueid = req->vq - bdev->vqs; + u8 *status; + + /* status */ + status = req->iov[req->out + req->in - 1].iov_base; + *status = (len < 0) ? 
VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + + mutex_lock(&bdev->mutex); + virt_queue__set_used_elem(req->vq, req->head, len); + mutex_unlock(&bdev->mutex); + + if (virtio_queue__should_signal(&bdev->vqs[queueid])) + bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid); +} + +static void virtio_blk_do_io_request(struct kvm *kvm, struct blk_dev_req *req) +{ + struct virtio_blk_outhdr *req_hdr; + ssize_t block_cnt; + struct blk_dev *bdev; + struct iovec *iov; + u16 out, in; + + block_cnt = -1; + bdev = req->bdev; + iov = req->iov; + out = req->out; + in = req->in; + req_hdr = iov[0].iov_base; + + switch (req_hdr->type) { + case VIRTIO_BLK_T_IN: + block_cnt = disk_image__read(bdev->disk, req_hdr->sector, + iov + 1, in + out - 2, req); + break; + case VIRTIO_BLK_T_OUT: + block_cnt = disk_image__write(bdev->disk, req_hdr->sector, + iov + 1, in + out - 2, req); + break; + case VIRTIO_BLK_T_FLUSH: + block_cnt = disk_image__flush(bdev->disk); + virtio_blk_complete(req, block_cnt); + break; + case VIRTIO_BLK_T_GET_ID: + block_cnt = VIRTIO_BLK_ID_BYTES; + disk_image__get_serial(bdev->disk, + (iov + 1)->iov_base, &block_cnt); + virtio_blk_complete(req, block_cnt); + break; + default: + pr_warning("request type %d", req_hdr->type); + block_cnt = -1; + break; + } +} + +static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev) +{ + struct blk_dev_req *req; + u16 head; + + while (virt_queue__available(vq)) { + head = virt_queue__pop(vq); + req = &bdev->reqs[head]; + req->head = virt_queue__get_head_iov(vq, req->iov, &req->out, + &req->in, head, kvm); + req->vq = vq; + + virtio_blk_do_io_request(kvm, req); + } +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct blk_dev *bdev = dev; + + return ((u8 *)(&bdev->blk_config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_BLK_F_SEG_MAX + | 1UL << VIRTIO_BLK_F_FLUSH + | 1UL << VIRTIO_RING_F_EVENT_IDX + | 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + 
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct blk_dev *bdev = dev; + + bdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct blk_dev *bdev = dev; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &bdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, align); + + return 0; +} + +static void *virtio_blk_thread(void *dev) +{ + struct blk_dev *bdev = dev; + u64 data; + int r; + + kvm__set_thread_name("virtio-blk-io"); + + while (1) { + r = read(bdev->io_efd, &data, sizeof(u64)); + if (r < 0) + continue; + virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev); + } + + pthread_exit(NULL); + return NULL; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct blk_dev *bdev = dev; + u64 data = 1; + int r; + + r = write(bdev->io_efd, &data, sizeof(data)); + if (r < 0) + return r; + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct blk_dev *bdev = dev; + + return bdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + /* FIXME: dynamic */ + return VIRTIO_BLK_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops blk_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk) +{ + struct blk_dev *bdev; + unsigned int i; + + if (!disk) + return -EINVAL; + + bdev = calloc(1, sizeof(struct blk_dev)); + if (bdev == NULL) + return -ENOMEM; + 
+ *bdev = (struct blk_dev) { + .mutex = MUTEX_INITIALIZER, + .disk = disk, + .blk_config = (struct virtio_blk_config) { + .capacity = disk->size / SECTOR_SIZE, + .seg_max = DISK_SEG_MAX, + }, + .io_efd = eventfd(0, 0), + .kvm = kvm, + }; + + virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_BLK, + VIRTIO_ID_BLOCK, PCI_CLASS_BLK); + + list_add_tail(&bdev->list, &bdevs); + + for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) { + bdev->reqs[i].bdev = bdev; + bdev->reqs[i].kvm = kvm; + } + + disk_image__set_callback(bdev->disk, virtio_blk_complete); + + pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev); + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK"); + + return 0; +} + +static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev) +{ + list_del(&bdev->list); + free(bdev); + + return 0; +} + +int virtio_blk__init(struct kvm *kvm) +{ + int i, r = 0; + + for (i = 0; i < kvm->nr_disks; i++) { + if (kvm->disks[i]->wwpn) + continue; + r = virtio_blk__init_one(kvm, kvm->disks[i]); + if (r < 0) + goto cleanup; + } + + return 0; +cleanup: + return virtio_blk__exit(kvm); +} +virtio_dev_init(virtio_blk__init); + +int virtio_blk__exit(struct kvm *kvm) +{ + while (!list_empty(&bdevs)) { + struct blk_dev *bdev; + + bdev = list_first_entry(&bdevs, struct blk_dev, list); + virtio_blk__exit_one(kvm, bdev); + } + + return 0; +} +virtio_dev_exit(virtio_blk__exit); diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c new file mode 100644 index 000000000000..b18d3a925928 --- /dev/null +++ b/tools/kvm/virtio/console.c @@ -0,0 +1,212 @@ +#include "kvm/virtio-console.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/virtio.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/mutex.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/irq.h" +#include 
"kvm/guest_compat.h" + +#include <linux/virtio_console.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_blk.h> + +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <termios.h> +#include <unistd.h> +#include <fcntl.h> + +#define VIRTIO_CONSOLE_QUEUE_SIZE 128 +#define VIRTIO_CONSOLE_NUM_QUEUES 2 +#define VIRTIO_CONSOLE_RX_QUEUE 0 +#define VIRTIO_CONSOLE_TX_QUEUE 1 + +struct con_dev { + struct mutex mutex; + + struct virtio_device vdev; + struct virt_queue vqs[VIRTIO_CONSOLE_NUM_QUEUES]; + struct virtio_console_config config; + u32 features; + + struct thread_pool__job jobs[VIRTIO_CONSOLE_NUM_QUEUES]; +}; + +static struct con_dev cdev = { + .mutex = MUTEX_INITIALIZER, + + .config = { + .cols = 80, + .rows = 24, + .max_nr_ports = 1, + }, +}; + +static int compat_id = -1; + +/* + * Interrupts are injected for hvc0 only. + */ +static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param) +{ + struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE]; + struct virt_queue *vq; + u16 out, in; + u16 head; + int len; + + if (kvm->cfg.active_console != CONSOLE_VIRTIO) + return; + + mutex_lock(&cdev.mutex); + + vq = param; + + if (term_readable(0) && virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = term_getc_iov(kvm, iov, in, 0); + virt_queue__set_used_elem(vq, head, len); + cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs); + } + + mutex_unlock(&cdev.mutex); +} + +void virtio_console__inject_interrupt(struct kvm *kvm) +{ + thread_pool__do_job(&cdev.jobs[VIRTIO_CONSOLE_RX_QUEUE]); +} + +static void virtio_console_handle_callback(struct kvm *kvm, void *param) +{ + struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE]; + struct virt_queue *vq; + u16 out, in; + u16 head; + u32 len; + + vq = param; + + /* + * The current Linux implementation polls for the buffer + * to be used, rather than waiting for an interrupt. + * So there is no need to inject an interrupt for the tx path. 
+ */ + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + if (kvm->cfg.active_console == CONSOLE_VIRTIO) + len = term_putc_iov(iov, out, 0); + else + len = 0; + virt_queue__set_used_elem(vq, head, len); + } + +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct con_dev *cdev = dev; + + return ((u8 *)(&cdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 0; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + /* Unused */ +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct virt_queue *queue; + void *p; + + BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES); + + compat__remove_message(compat_id); + + queue = &cdev.vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align); + + if (vq == VIRTIO_CONSOLE_TX_QUEUE) + thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue); + else if (vq == VIRTIO_CONSOLE_RX_QUEUE) + thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct con_dev *cdev = dev; + + thread_pool__do_job(&cdev->jobs[vq]); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct con_dev *cdev = dev; + + return cdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_CONSOLE_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops con_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + 
.get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_console__init(struct kvm *kvm) +{ + if (kvm->cfg.active_console != CONSOLE_VIRTIO) + return 0; + + virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_CONSOLE, + VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE); + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE"); + + return 0; +} +virtio_dev_init(virtio_console__init); + +int virtio_console__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_console__exit); diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c new file mode 100644 index 000000000000..2dfb828d177f --- /dev/null +++ b/tools/kvm/virtio/core.c @@ -0,0 +1,233 @@ +#include <linux/virtio_ring.h> +#include <linux/types.h> +#include <sys/uio.h> +#include <stdlib.h> + +#include "kvm/guest_compat.h" +#include "kvm/barrier.h" +#include "kvm/virtio.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio-mmio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + + +struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len) +{ + struct vring_used_elem *used_elem; + + used_elem = &queue->vring.used->ring[queue->vring.used->idx % queue->vring.num]; + used_elem->id = head; + used_elem->len = len; + + /* + * Use wmb to assure that used elem was updated with head and len. + * We need a wmb here since we can't advance idx unless we're ready + * to pass the used element to the guest. + */ + wmb(); + queue->vring.used->idx++; + + /* + * Use wmb to assure used idx has been increased before we signal the guest. + * Without a wmb here the guest may ignore the queue since it won't see + * an updated idx. + */ + wmb(); + + return used_elem; +} + +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->vring.num if we're + * at the end. 
+ */ +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; + + /* Check they're not leading us off end of descriptors. */ + next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); + + return next; +} + +u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm) +{ + struct vring_desc *desc; + u16 idx; + u16 max; + + idx = head; + *out = *in = 0; + max = vq->vring.num; + desc = vq->vring.desc; + + if (desc[idx].flags & VRING_DESC_F_INDIRECT) { + max = desc[idx].len / sizeof(struct vring_desc); + desc = guest_flat_to_host(kvm, desc[idx].addr); + idx = 0; + } + + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out + *in].iov_len = desc[idx].len; + iov[*out + *in].iov_base = guest_flat_to_host(kvm, desc[idx].addr); + /* If this is an input descriptor, increment that count. 
*/ + if (desc[idx].flags & VRING_DESC_F_WRITE) + (*in)++; + else + (*out)++; + } while ((idx = next_desc(desc, idx, max)) != max); + + return head; +} + +u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm) +{ + u16 head; + + head = virt_queue__pop(vq); + + return virt_queue__get_head_iov(vq, iov, out, in, head, kvm); +} + +/* in and out are relative to guest */ +u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue, + struct iovec in_iov[], struct iovec out_iov[], + u16 *in, u16 *out) +{ + struct vring_desc *desc; + u16 head, idx; + + idx = head = virt_queue__pop(queue); + *out = *in = 0; + do { + desc = virt_queue__get_desc(queue, idx); + if (desc->flags & VRING_DESC_F_WRITE) { + in_iov[*in].iov_base = guest_flat_to_host(kvm, + desc->addr); + in_iov[*in].iov_len = desc->len; + (*in)++; + } else { + out_iov[*out].iov_base = guest_flat_to_host(kvm, + desc->addr); + out_iov[*out].iov_len = desc->len; + (*out)++; + } + if (desc->flags & VRING_DESC_F_NEXT) + idx = desc->next; + else + break; + } while (1); + + return head; +} + +int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off) +{ + if (msix) { + if (offset < 4) + return VIRTIO_PCI_O_MSIX; + else + offset -= 4; + } + + *config_off = offset; + + return VIRTIO_PCI_O_CONFIG; +} + +bool virtio_queue__should_signal(struct virt_queue *vq) +{ + u16 old_idx, new_idx, event_idx; + + old_idx = vq->last_used_signalled; + new_idx = vq->vring.used->idx; + event_idx = vring_used_event(&vq->vring); + + if (vring_need_event(event_idx, new_idx, old_idx)) { + vq->last_used_signalled = new_idx; + return true; + } + + return false; +} + +int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + struct virtio_ops *ops, enum virtio_trans trans, + int device_id, int subsys_id, int class) +{ + void *virtio; + + switch (trans) { + case VIRTIO_PCI: + virtio = calloc(sizeof(struct virtio_pci), 1); + if (!virtio) + return -ENOMEM; + 
vdev->virtio = virtio; + vdev->ops = ops; + vdev->ops->signal_vq = virtio_pci__signal_vq; + vdev->ops->signal_config = virtio_pci__signal_config; + vdev->ops->init = virtio_pci__init; + vdev->ops->exit = virtio_pci__exit; + vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class); + break; + case VIRTIO_MMIO: + virtio = calloc(sizeof(struct virtio_mmio), 1); + if (!virtio) + return -ENOMEM; + vdev->virtio = virtio; + vdev->ops = ops; + vdev->ops->signal_vq = virtio_mmio_signal_vq; + vdev->ops->signal_config = virtio_mmio_signal_config; + vdev->ops->init = virtio_mmio_init; + vdev->ops->exit = virtio_mmio_exit; + vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class); + break; + default: + return -1; + }; + + return 0; +} + +int virtio_compat_add_message(const char *device, const char *config) +{ + int len = 1024; + int compat_id; + char *title; + char *desc; + + title = malloc(len); + if (!title) + return -ENOMEM; + + desc = malloc(len); + if (!desc) { + free(title); + return -ENOMEM; + } + + snprintf(title, len, "%s device was not detected.", device); + snprintf(desc, len, "While you have requested a %s device, " + "the guest kernel did not initialize it.\n" + "\tPlease make sure that the guest kernel was " + "compiled with %s=y enabled in .config.", + device, config); + + compat_id = compat__add_message(title, desc); + + free(desc); + free(title); + + return compat_id; +} diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c new file mode 100644 index 000000000000..bd30f375950c --- /dev/null +++ b/tools/kvm/virtio/mmio.c @@ -0,0 +1,271 @@ +#include "kvm/devices.h" +#include "kvm/virtio-mmio.h" +#include "kvm/ioeventfd.h" +#include "kvm/ioport.h" +#include "kvm/virtio.h" +#include "kvm/kvm.h" +#include "kvm/irq.h" + +#include <linux/virtio_mmio.h> +#include <string.h> + +static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA; + +static u32 virtio_mmio_get_io_space_block(u32 size) +{ + u32 block = virtio_mmio_io_space_blocks; + 
virtio_mmio_io_space_blocks += size; + + return block; +} + +static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param) +{ + struct virtio_mmio_ioevent_param *ioeventfd = param; + struct virtio_mmio *vmmio = ioeventfd->vdev->virtio; + + ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq); +} + +static int virtio_mmio_init_ioeventfd(struct kvm *kvm, + struct virtio_device *vdev, u32 vq) +{ + struct virtio_mmio *vmmio = vdev->virtio; + struct ioevent ioevent; + int err; + + vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) { + .vdev = vdev, + .vq = vq, + }; + + ioevent = (struct ioevent) { + .io_addr = vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, + .io_len = sizeof(u32), + .fn = virtio_mmio_ioevent_callback, + .fn_ptr = &vmmio->ioeventfds[vq], + .datamatch = vq, + .fn_kvm = kvm, + .fd = eventfd(0, 0), + }; + + if (vdev->use_vhost) + /* + * Vhost will poll the eventfd in host kernel side, + * no need to poll in userspace. + */ + err = ioeventfd__add_event(&ioevent, true, false); + else + /* Need to poll in userspace. 
*/ + err = ioeventfd__add_event(&ioevent, true, true); + if (err) + return err; + + if (vdev->ops->notify_vq_eventfd) + vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd); + + return 0; +} + +int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct virtio_mmio *vmmio = vdev->virtio; + + vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING; + kvm__irq_trigger(vmmio->kvm, vmmio->irq); + + return 0; +} + +int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + + vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG; + kvm__irq_trigger(vmmio->kvm, vmmio->irq); + + return 0; +} + +static void virtio_mmio_device_specific(u64 addr, u8 *data, u32 len, + u8 is_write, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 i; + + for (i = 0; i < len; i++) { + if (is_write) + vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] = + *(u8 *)data + i; + else + data[i] = vdev->ops->get_config(vmmio->kvm, + vmmio->dev)[addr + i]; + } +} + +static void virtio_mmio_config_in(u64 addr, void *data, u32 len, + struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 val = 0; + + switch (addr) { + case VIRTIO_MMIO_MAGIC_VALUE: + case VIRTIO_MMIO_VERSION: + case VIRTIO_MMIO_DEVICE_ID: + case VIRTIO_MMIO_VENDOR_ID: + case VIRTIO_MMIO_STATUS: + case VIRTIO_MMIO_INTERRUPT_STATUS: + ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr)); + break; + case VIRTIO_MMIO_HOST_FEATURES: + if (vmmio->hdr.host_features_sel == 0) + val = vdev->ops->get_host_features(vmmio->kvm, + vmmio->dev); + ioport__write32(data, val); + break; + case VIRTIO_MMIO_QUEUE_PFN: + val = vdev->ops->get_pfn_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel); + ioport__write32(data, val); + break; + case VIRTIO_MMIO_QUEUE_NUM_MAX: + val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel); + ioport__write32(data, val); + break; + 
default: + break; + } +} + +static void virtio_mmio_config_out(u64 addr, void *data, u32 len, + struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 val = 0; + + switch (addr) { + case VIRTIO_MMIO_HOST_FEATURES_SEL: + case VIRTIO_MMIO_GUEST_FEATURES_SEL: + case VIRTIO_MMIO_QUEUE_SEL: + case VIRTIO_MMIO_STATUS: + val = ioport__read32(data); + *(u32 *)(((void *)&vmmio->hdr) + addr) = val; + break; + case VIRTIO_MMIO_GUEST_FEATURES: + if (vmmio->hdr.guest_features_sel == 0) { + val = ioport__read32(data); + vdev->ops->set_guest_features(vmmio->kvm, + vmmio->dev, val); + } + break; + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + val = ioport__read32(data); + vmmio->hdr.guest_page_size = val; + break; + case VIRTIO_MMIO_QUEUE_NUM: + val = ioport__read32(data); + vmmio->hdr.queue_num = val; + vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel, val); + break; + case VIRTIO_MMIO_QUEUE_ALIGN: + val = ioport__read32(data); + vmmio->hdr.queue_align = val; + break; + case VIRTIO_MMIO_QUEUE_PFN: + val = ioport__read32(data); + virtio_mmio_init_ioeventfd(vmmio->kvm, vdev, vmmio->hdr.queue_sel); + vdev->ops->init_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel, + vmmio->hdr.guest_page_size, + vmmio->hdr.queue_align, + val); + break; + case VIRTIO_MMIO_QUEUE_NOTIFY: + val = ioport__read32(data); + vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val); + break; + case VIRTIO_MMIO_INTERRUPT_ACK: + val = ioport__read32(data); + vmmio->hdr.interrupt_state &= ~val; + break; + default: + break; + }; +} + +static void virtio_mmio_mmio_callback(u64 addr, u8 *data, u32 len, + u8 is_write, void *ptr) +{ + struct virtio_device *vdev = ptr; + struct virtio_mmio *vmmio = vdev->virtio; + u32 offset = addr - vmmio->addr; + + if (offset >= VIRTIO_MMIO_CONFIG) { + offset -= VIRTIO_MMIO_CONFIG; + virtio_mmio_device_specific(offset, data, len, is_write, ptr); + return; + } + + if (is_write) + virtio_mmio_config_out(offset, data, len, ptr); + else + 
virtio_mmio_config_in(offset, data, len, ptr); +} + +int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u8 pin, line; + + vmmio->addr = virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE); + vmmio->kvm = kvm; + vmmio->dev = dev; + + kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE, + false, virtio_mmio_mmio_callback, vdev); + + vmmio->hdr = (struct virtio_mmio_hdr) { + .magic = {'v', 'i', 'r', 't'}, + .version = 1, + .device_id = subsys_id, + .vendor_id = 0x4d564b4c , /* 'LKVM' */ + .queue_num_max = 256, + }; + + if (irq__register_device(subsys_id, &pin, &line) < 0) + return -1; + vmmio->irq = line; + vmmio->dev_hdr = (struct device_header) { + .bus_type = DEVICE_BUS_MMIO, + .data = vmmio, + }; + + device__register(&vmmio->dev_hdr); + + /* + * Instantiate guest virtio-mmio devices using kernel command line + * (or module) parameter, e.g + * + * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6 + */ + pr_info("virtio-mmio.devices=0x%x@0x%x:%d\n", VIRTIO_MMIO_IO_SIZE, vmmio->addr, line); + + return 0; +} + +int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + int i; + + kvm__deregister_mmio(kvm, vmmio->addr); + + for (i = 0; i < VIRTIO_MMIO_MAX_VQ; i++) + ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, i); + + return 0; +} diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c new file mode 100644 index 000000000000..68bd107254a7 --- /dev/null +++ b/tools/kvm/virtio/net.c @@ -0,0 +1,674 @@ +#include "kvm/virtio-pci-dev.h" +#include "kvm/virtio-net.h" +#include "kvm/virtio.h" +#include "kvm/types.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/irq.h" +#include "kvm/uip.h" +#include "kvm/guest_compat.h" + +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/if_tun.h> +#include <linux/types.h> + 
+#include <arpa/inet.h> +#include <net/if.h> + +#include <unistd.h> +#include <fcntl.h> + +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/eventfd.h> + +#define VIRTIO_NET_QUEUE_SIZE 256 +#define VIRTIO_NET_NUM_QUEUES 2 +#define VIRTIO_NET_RX_QUEUE 0 +#define VIRTIO_NET_TX_QUEUE 1 + +struct net_dev; + +struct net_dev_operations { + int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev); + int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev); +}; + +struct net_dev { + struct mutex mutex; + struct virtio_device vdev; + struct list_head list; + + struct virt_queue vqs[VIRTIO_NET_NUM_QUEUES]; + struct virtio_net_config config; + u32 features; + + pthread_t io_rx_thread; + struct mutex io_rx_lock; + pthread_cond_t io_rx_cond; + + pthread_t io_tx_thread; + struct mutex io_tx_lock; + pthread_cond_t io_tx_cond; + + int vhost_fd; + int tap_fd; + char tap_name[IFNAMSIZ]; + + int mode; + + struct uip_info info; + struct net_dev_operations *ops; + struct kvm *kvm; +}; + +static LIST_HEAD(ndevs); +static int compat_id = -1; + +static void *virtio_net_rx_thread(void *p) +{ + struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; + struct virt_queue *vq; + struct kvm *kvm; + struct net_dev *ndev = p; + u16 out, in; + u16 head; + int len; + + kvm__set_thread_name("virtio-net-rx"); + + kvm = ndev->kvm; + vq = &ndev->vqs[VIRTIO_NET_RX_QUEUE]; + + while (1) { + mutex_lock(&ndev->io_rx_lock); + if (!virt_queue__available(vq)) + pthread_cond_wait(&ndev->io_rx_cond, &ndev->io_rx_lock.mutex); + mutex_unlock(&ndev->io_rx_lock); + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = ndev->ops->rx(iov, in, ndev); + virt_queue__set_used_elem(vq, head, len); + + /* We should interrupt guest right now, otherwise latency is huge. 
*/ + if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_RX_QUEUE])) + ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, + VIRTIO_NET_RX_QUEUE); + } + } + + pthread_exit(NULL); + return NULL; + +} + +static void *virtio_net_tx_thread(void *p) +{ + struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; + struct virt_queue *vq; + struct kvm *kvm; + struct net_dev *ndev = p; + u16 out, in; + u16 head; + int len; + + kvm__set_thread_name("virtio-net-tx"); + + kvm = ndev->kvm; + vq = &ndev->vqs[VIRTIO_NET_TX_QUEUE]; + + while (1) { + mutex_lock(&ndev->io_tx_lock); + if (!virt_queue__available(vq)) + pthread_cond_wait(&ndev->io_tx_cond, &ndev->io_tx_lock.mutex); + mutex_unlock(&ndev->io_tx_lock); + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = ndev->ops->tx(iov, out, ndev); + virt_queue__set_used_elem(vq, head, len); + } + + if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_TX_QUEUE])) + ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, VIRTIO_NET_TX_QUEUE); + } + + pthread_exit(NULL); + + return NULL; + +} + +static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue) +{ + switch (queue) { + case VIRTIO_NET_TX_QUEUE: + mutex_lock(&ndev->io_tx_lock); + pthread_cond_signal(&ndev->io_tx_cond); + mutex_unlock(&ndev->io_tx_lock); + break; + case VIRTIO_NET_RX_QUEUE: + mutex_lock(&ndev->io_rx_lock); + pthread_cond_signal(&ndev->io_rx_cond); + mutex_unlock(&ndev->io_rx_lock); + break; + default: + pr_warning("Unknown queue index %u", queue); + } +} + +static bool virtio_net__tap_init(const struct virtio_net_params *params, + struct net_dev *ndev) +{ + int sock = socket(AF_INET, SOCK_STREAM, 0); + int pid, status, offload, hdr_len; + struct sockaddr_in sin = {0}; + struct ifreq ifr; + + /* Did the user already gave us the FD? 
*/ + if (params->fd) { + ndev->tap_fd = params->fd; + return 1; + } + + ndev->tap_fd = open("/dev/net/tun", O_RDWR); + if (ndev->tap_fd < 0) { + pr_warning("Unable to open /dev/net/tun"); + goto fail; + } + + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + if (ioctl(ndev->tap_fd, TUNSETIFF, &ifr) < 0) { + pr_warning("Config tap device error. Are you root?"); + goto fail; + } + + strncpy(ndev->tap_name, ifr.ifr_name, sizeof(ndev->tap_name)); + + if (ioctl(ndev->tap_fd, TUNSETNOCSUM, 1) < 0) { + pr_warning("Config tap device TUNSETNOCSUM error"); + goto fail; + } + + hdr_len = sizeof(struct virtio_net_hdr); + if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0) + pr_warning("Config tap device TUNSETVNETHDRSZ error"); + + offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO; + if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) { + pr_warning("Config tap device TUNSETOFFLOAD error"); + goto fail; + } + + if (strcmp(params->script, "none")) { + pid = fork(); + if (pid == 0) { + execl(params->script, params->script, ndev->tap_name, NULL); + _exit(1); + } else { + waitpid(pid, &status, 0); + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + pr_warning("Fail to setup tap by %s", params->script); + goto fail; + } + } + } else { + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name)); + sin.sin_addr.s_addr = inet_addr(params->host_ip); + memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr)); + ifr.ifr_addr.sa_family = AF_INET; + if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) { + pr_warning("Could not set ip address on tap device"); + goto fail; + } + } + + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name)); + ioctl(sock, SIOCGIFFLAGS, &ifr); + ifr.ifr_flags |= IFF_UP | IFF_RUNNING; + if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) + pr_warning("Could not bring tap device up"); + + close(sock); + + return 1; + +fail: + if (sock >= 0) + close(sock); + if 
(ndev->tap_fd >= 0) + close(ndev->tap_fd); + + return 0; +} + +static void virtio_net__io_thread_init(struct kvm *kvm, struct net_dev *ndev) +{ + mutex_init(&ndev->io_tx_lock); + mutex_init(&ndev->io_rx_lock); + + pthread_cond_init(&ndev->io_tx_cond, NULL); + pthread_cond_init(&ndev->io_rx_cond, NULL); + + pthread_create(&ndev->io_tx_thread, NULL, virtio_net_tx_thread, ndev); + pthread_create(&ndev->io_rx_thread, NULL, virtio_net_rx_thread, ndev); +} + +static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) +{ + return writev(ndev->tap_fd, iov, out); +} + +static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) +{ + return readv(ndev->tap_fd, iov, in); +} + +static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) +{ + return uip_tx(iov, out, &ndev->info); +} + +static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) +{ + return uip_rx(iov, in, &ndev->info); +} + +static struct net_dev_operations tap_ops = { + .rx = tap_ops_rx, + .tx = tap_ops_tx, +}; + +static struct net_dev_operations uip_ops = { + .rx = uip_ops_rx, + .tx = uip_ops_tx, +}; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct net_dev *ndev = dev; + + return ((u8 *)(&ndev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_NET_F_MAC + | 1UL << VIRTIO_NET_F_CSUM + | 1UL << VIRTIO_NET_F_HOST_UFO + | 1UL << VIRTIO_NET_F_HOST_TSO4 + | 1UL << VIRTIO_NET_F_HOST_TSO6 + | 1UL << VIRTIO_NET_F_GUEST_UFO + | 1UL << VIRTIO_NET_F_GUEST_TSO4 + | 1UL << VIRTIO_NET_F_GUEST_TSO6 + | 1UL << VIRTIO_RING_F_EVENT_IDX + | 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct net_dev *ndev = dev; + + ndev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct vhost_vring_state state = { .index = vq }; + struct vhost_vring_addr 
addr; + struct net_dev *ndev = dev; + struct virt_queue *queue; + void *p; + int r; + + compat__remove_message(compat_id); + + queue = &ndev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align); + + if (ndev->vhost_fd == 0) + return 0; + + state.num = queue->vring.num; + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_NUM failed"); + state.num = 0; + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_BASE failed"); + + addr = (struct vhost_vring_addr) { + .index = vq, + .desc_user_addr = (u64)(unsigned long)queue->vring.desc, + .avail_user_addr = (u64)(unsigned long)queue->vring.avail, + .used_user_addr = (u64)(unsigned long)queue->vring.used, + }; + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) + die_perror("VHOST_SET_VRING_ADDR failed"); + + return 0; +} + +static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi) +{ + struct net_dev *ndev = dev; + struct kvm_irqfd irq; + struct vhost_vring_file file; + int r; + + if (ndev->vhost_fd == 0) + return; + + irq = (struct kvm_irqfd) { + .gsi = gsi, + .fd = eventfd(0, 0), + }; + file = (struct vhost_vring_file) { + .index = vq, + .fd = irq.fd, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq); + if (r < 0) + die_perror("KVM_IRQFD failed"); + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_CALL failed"); + file.fd = ndev->tap_fd; + r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (r != 0) + die("VHOST_NET_SET_BACKEND failed %d", errno); + +} + +static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd) +{ + struct net_dev *ndev = dev; + struct vhost_vring_file file = { + .index = vq, + .fd = efd, + }; + int r; + + if (ndev->vhost_fd == 0) + return; + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file); + if 
(r < 0) + die_perror("VHOST_SET_VRING_KICK failed"); +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct net_dev *ndev = dev; + + virtio_net_handle_callback(kvm, ndev, vq); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct net_dev *ndev = dev; + + return ndev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + /* FIXME: dynamic */ + return VIRTIO_NET_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops net_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, + .notify_vq = notify_vq, + .notify_vq_gsi = notify_vq_gsi, + .notify_vq_eventfd = notify_vq_eventfd, +}; + +static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev) +{ + u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX; + struct vhost_memory *mem; + int r; + + ndev->vhost_fd = open("/dev/vhost-net", O_RDWR); + if (ndev->vhost_fd < 0) + die_perror("Failed openning vhost-net device"); + + mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region)); + if (mem == NULL) + die("Failed allocating memory for vhost memory map"); + + mem->nregions = 1; + mem->regions[0] = (struct vhost_memory_region) { + .guest_phys_addr = 0, + .memory_size = kvm->ram_size, + .userspace_addr = (unsigned long)kvm->ram_start, + }; + + r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER); + if (r != 0) + die_perror("VHOST_SET_OWNER failed"); + + r = ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_SET_FEATURES failed"); + r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem); + if (r != 0) + die_perror("VHOST_SET_MEM_TABLE failed"); + + ndev->vdev.use_vhost = true; + + free(mem); +} + 
+static inline void str_to_mac(const char *str, char *mac) +{ + sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac, mac+1, mac+2, mac+3, mac+4, mac+5); +} +static int set_net_param(struct kvm *kvm, struct virtio_net_params *p, + const char *param, const char *val) +{ + if (strcmp(param, "guest_mac") == 0) { + str_to_mac(val, p->guest_mac); + } else if (strcmp(param, "mode") == 0) { + if (!strncmp(val, "user", 4)) { + int i; + + for (i = 0; i < kvm->cfg.num_net_devices; i++) + if (kvm->cfg.net_params[i].mode == NET_MODE_USER) + die("Only one usermode network device allowed at a time"); + p->mode = NET_MODE_USER; + } else if (!strncmp(val, "tap", 3)) { + p->mode = NET_MODE_TAP; + } else if (!strncmp(val, "none", 4)) { + kvm->cfg.no_net = 1; + return -1; + } else + die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network); + } else if (strcmp(param, "script") == 0) { + p->script = strdup(val); + } else if (strcmp(param, "guest_ip") == 0) { + p->guest_ip = strdup(val); + } else if (strcmp(param, "host_ip") == 0) { + p->host_ip = strdup(val); + } else if (strcmp(param, "trans") == 0) { + p->trans = strdup(val); + } else if (strcmp(param, "vhost") == 0) { + p->vhost = atoi(val); + } else if (strcmp(param, "fd") == 0) { + p->fd = atoi(val); + } else + die("Unknown network parameter %s", param); + + return 0; +} + +int netdev_parser(const struct option *opt, const char *arg, int unset) +{ + struct virtio_net_params p; + char *buf = NULL, *cmd = NULL, *cur = NULL; + bool on_cmd = true; + struct kvm *kvm = opt->ptr; + + if (arg) { + buf = strdup(arg); + if (buf == NULL) + die("Failed allocating new net buffer"); + cur = strtok(buf, ",="); + } + + p = (struct virtio_net_params) { + .guest_ip = DEFAULT_GUEST_ADDR, + .host_ip = DEFAULT_HOST_ADDR, + .script = DEFAULT_SCRIPT, + .mode = NET_MODE_TAP, + }; + + str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac); + p.guest_mac[5] += kvm->cfg.num_net_devices; + + while (cur) { + if (on_cmd) { + cmd = cur; + } else { + if 
(set_net_param(kvm, &p, cmd, cur) < 0) + goto done; + } + on_cmd = !on_cmd; + + cur = strtok(NULL, ",="); + }; + + kvm->cfg.num_net_devices++; + + kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params)); + if (kvm->cfg.net_params == NULL) + die("Failed adding new network device"); + + kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p; + +done: + free(buf); + return 0; +} + +static int virtio_net__init_one(struct virtio_net_params *params) +{ + int i; + struct net_dev *ndev; + + ndev = calloc(1, sizeof(struct net_dev)); + if (ndev == NULL) + return -ENOMEM; + + list_add_tail(&ndev->list, &ndevs); + + ndev->kvm = params->kvm; + + mutex_init(&ndev->mutex); + ndev->config.status = VIRTIO_NET_S_LINK_UP; + + for (i = 0 ; i < 6 ; i++) { + ndev->config.mac[i] = params->guest_mac[i]; + ndev->info.guest_mac.addr[i] = params->guest_mac[i]; + ndev->info.host_mac.addr[i] = params->host_mac[i]; + } + + ndev->mode = params->mode; + if (ndev->mode == NET_MODE_TAP) { + if (!virtio_net__tap_init(params, ndev)) + die_perror("You have requested a TAP device, but creation of one has failed because"); + ndev->ops = &tap_ops; + } else { + ndev->info.host_ip = ntohl(inet_addr(params->host_ip)); + ndev->info.guest_ip = ntohl(inet_addr(params->guest_ip)); + ndev->info.guest_netmask = ntohl(inet_addr("255.255.255.0")); + ndev->info.buf_nr = 20, + uip_init(&ndev->info); + ndev->ops = &uip_ops; + } + + if (params->trans && strcmp(params->trans, "mmio") == 0) + virtio_init(params->kvm, ndev, &ndev->vdev, &net_dev_virtio_ops, + VIRTIO_MMIO, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET); + else + virtio_init(params->kvm, ndev, &ndev->vdev, &net_dev_virtio_ops, + VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET); + + if (params->vhost) + virtio_net__vhost_init(params->kvm, ndev); + else + virtio_net__io_thread_init(params->kvm, ndev); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-net", 
"CONFIG_VIRTIO_NET"); + + return 0; +} + +int virtio_net__init(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->cfg.num_net_devices; i++) { + kvm->cfg.net_params[i].kvm = kvm; + virtio_net__init_one(&kvm->cfg.net_params[i]); + } + + if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) { + struct virtio_net_params net_params; + + net_params = (struct virtio_net_params) { + .guest_ip = kvm->cfg.guest_ip, + .host_ip = kvm->cfg.host_ip, + .kvm = kvm, + .script = kvm->cfg.script, + .mode = NET_MODE_USER, + }; + str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac); + str_to_mac(kvm->cfg.host_mac, net_params.host_mac); + + virtio_net__init_one(&net_params); + } + + return 0; +} +virtio_dev_init(virtio_net__init); + +int virtio_net__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_net__exit); diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c new file mode 100644 index 000000000000..227d5674f3a6 --- /dev/null +++ b/tools/kvm/virtio/pci.c @@ -0,0 +1,410 @@ +#include "kvm/virtio-pci.h" + +#include "kvm/ioport.h" +#include "kvm/kvm.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/irq.h" +#include "kvm/virtio.h" +#include "kvm/ioeventfd.h" + +#include <sys/ioctl.h> +#include <linux/virtio_pci.h> +#include <linux/byteorder.h> +#include <string.h> + +static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param) +{ + struct virtio_pci_ioevent_param *ioeventfd = param; + struct virtio_pci *vpci = ioeventfd->vdev->virtio; + + ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq); +} + +static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct ioevent ioevent; + struct virtio_pci *vpci = vdev->virtio; + int r; + + vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) { + .vdev = vdev, + .vq = vq, + }; + + ioevent = (struct ioevent) { + .io_addr = vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY, + .io_len = sizeof(u16), + .fn = virtio_pci__ioevent_callback, + .fn_ptr = 
&vpci->ioeventfds[vq], + .datamatch = vq, + .fn_kvm = kvm, + .fd = eventfd(0, 0), + }; + + if (vdev->use_vhost) + /* + * Vhost will poll the eventfd in host kernel side, + * no need to poll in userspace. + */ + r = ioeventfd__add_event(&ioevent, true, false); + else + /* Need to poll in userspace. */ + r = ioeventfd__add_event(&ioevent, true, true); + if (r) + return r; + + if (vdev->ops->notify_vq_eventfd) + vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq, ioevent.fd); + + return 0; +} + +static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci) +{ + return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE); +} + +static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_device *vdev, u16 port, + void *data, int size, int offset) +{ + u32 config_offset; + struct virtio_pci *vpci = vdev->virtio; + int type = virtio__get_dev_specific_field(offset - 20, + virtio_pci__msix_enabled(vpci), + &config_offset); + if (type == VIRTIO_PCI_O_MSIX) { + switch (offset) { + case VIRTIO_MSI_CONFIG_VECTOR: + ioport__write16(data, vpci->config_vector); + break; + case VIRTIO_MSI_QUEUE_VECTOR: + ioport__write16(data, vpci->vq_vector[vpci->queue_selector]); + break; + }; + + return true; + } else if (type == VIRTIO_PCI_O_CONFIG) { + u8 cfg; + + cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset]; + ioport__write8(data, cfg); + return true; + } + + return false; +} + +static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + unsigned long offset; + bool ret = true; + struct virtio_device *vdev; + struct virtio_pci *vpci; + u32 val; + + vdev = ioport->priv; + vpci = vdev->virtio; + offset = port - vpci->base_addr; + + switch (offset) { + case VIRTIO_PCI_HOST_FEATURES: + val = vdev->ops->get_host_features(kvm, vpci->dev); + ioport__write32(data, val); + break; + case VIRTIO_PCI_QUEUE_PFN: + val = vdev->ops->get_pfn_vq(kvm, vpci->dev, vpci->queue_selector); + ioport__write32(data, val); + break; + 
case VIRTIO_PCI_QUEUE_NUM: + val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector); + ioport__write16(data, val); + break; + case VIRTIO_PCI_STATUS: + ioport__write8(data, vpci->status); + break; + case VIRTIO_PCI_ISR: + ioport__write8(data, vpci->isr); + kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW); + vpci->isr = VIRTIO_IRQ_LOW; + break; + default: + ret = virtio_pci__specific_io_in(kvm, vdev, port, data, size, offset); + break; + }; + + return ret; +} + +static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_device *vdev, u16 port, + void *data, int size, int offset) +{ + struct virtio_pci *vpci = vdev->virtio; + u32 config_offset, gsi, vec; + int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci), + &config_offset); + if (type == VIRTIO_PCI_O_MSIX) { + switch (offset) { + case VIRTIO_MSI_CONFIG_VECTOR: + vec = vpci->config_vector = ioport__read16(data); + if (vec == VIRTIO_MSI_NO_VECTOR) + break; + + gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg); + + vpci->config_gsi = gsi; + break; + case VIRTIO_MSI_QUEUE_VECTOR: + vec = vpci->vq_vector[vpci->queue_selector] = ioport__read16(data); + + if (vec == VIRTIO_MSI_NO_VECTOR) + break; + + gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg); + vpci->gsis[vpci->queue_selector] = gsi; + if (vdev->ops->notify_vq_gsi) + vdev->ops->notify_vq_gsi(kvm, vpci->dev, + vpci->queue_selector, gsi); + break; + }; + + return true; + } else if (type == VIRTIO_PCI_O_CONFIG) { + vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data; + + return true; + } + + return false; +} + +static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + unsigned long offset; + bool ret = true; + struct virtio_device *vdev; + struct virtio_pci *vpci; + u32 val; + + vdev = ioport->priv; + vpci = vdev->virtio; + offset = port - vpci->base_addr; + + switch (offset) { + case VIRTIO_PCI_GUEST_FEATURES: + val 
= ioport__read32(data); + vdev->ops->set_guest_features(kvm, vpci->dev, val); + break; + case VIRTIO_PCI_QUEUE_PFN: + val = ioport__read32(data); + virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector); + vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector, + 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT, + VIRTIO_PCI_VRING_ALIGN, val); + break; + case VIRTIO_PCI_QUEUE_SEL: + vpci->queue_selector = ioport__read16(data); + break; + case VIRTIO_PCI_QUEUE_NOTIFY: + val = ioport__read16(data); + vdev->ops->notify_vq(kvm, vpci->dev, val); + break; + case VIRTIO_PCI_STATUS: + vpci->status = ioport__read8(data); + break; + default: + ret = virtio_pci__specific_io_out(kvm, vdev, port, data, size, offset); + break; + }; + + return ret; +} + +static struct ioport_operations virtio_pci__io_ops = { + .io_in = virtio_pci__io_in, + .io_out = virtio_pci__io_out, +}; + +static void virtio_pci__mmio_callback(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) +{ + struct virtio_pci *vpci = ptr; + void *table; + u32 offset; + + if (addr > vpci->msix_io_block + PCI_IO_SIZE) { + table = &vpci->msix_pba; + offset = vpci->msix_io_block + PCI_IO_SIZE; + } else { + table = &vpci->msix_table; + offset = vpci->msix_io_block; + } + + if (is_write) + memcpy(table + addr - offset, data, len); + else + memcpy(data, table + addr - offset, len); +} + +static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci, int vec) +{ + struct kvm_msi msi = { + .address_lo = vpci->msix_table[vec].msg.address_lo, + .address_hi = vpci->msix_table[vec].msg.address_hi, + .data = vpci->msix_table[vec].msg.data, + }; + + ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, &msi); +} + +int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct virtio_pci *vpci = vdev->virtio; + int tbl = vpci->vq_vector[vq]; + + if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) { + if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) || + vpci->msix_table[tbl].ctrl & 
cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) { + + vpci->msix_pba |= 1 << tbl; + return 0; + } + + if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI) + virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]); + else + kvm__irq_trigger(kvm, vpci->gsis[vq]); + } else { + vpci->isr = VIRTIO_IRQ_HIGH; + kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line); + } + return 0; +} + +int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_pci *vpci = vdev->virtio; + int tbl = vpci->config_vector; + + if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) { + if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) || + vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) { + + vpci->msix_pba |= 1 << tbl; + return 0; + } + + if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI) + virtio_pci__signal_msi(kvm, vpci, tbl); + else + kvm__irq_trigger(kvm, vpci->config_gsi); + } else { + vpci->isr = VIRTIO_PCI_ISR_CONFIG; + kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line); + } + + return 0; +} + +int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class) +{ + struct virtio_pci *vpci = vdev->virtio; + u8 pin, line; + int r; + + vpci->dev = dev; + vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2); + + r = ioport__register(kvm, IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vdev); + if (r < 0) + return r; + + vpci->base_addr = (u16)r; + r = kvm__register_mmio(kvm, vpci->msix_io_block, PCI_IO_SIZE, false, + virtio_pci__mmio_callback, vpci); + if (r < 0) + goto free_ioport; + + vpci->pci_hdr = (struct pci_device_header) { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(device_id), + .header_type = PCI_HEADER_TYPE_NORMAL, + .revision_id = 0, + .class[0] = class & 0xff, + .class[1] = (class >> 8) & 0xff, + .class[2] = (class >> 16) & 0xff, + .subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET), + .subsys_id = 
cpu_to_le16(subsys_id), + .bar[0] = cpu_to_le32(vpci->base_addr + | PCI_BASE_ADDRESS_SPACE_IO), + .bar[1] = cpu_to_le32(vpci->msix_io_block + | PCI_BASE_ADDRESS_SPACE_MEMORY), + .status = cpu_to_le16(PCI_STATUS_CAP_LIST), + .capabilities = (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr, + .bar_size[0] = IOPORT_SIZE, + .bar_size[1] = PCI_IO_SIZE, + .bar_size[3] = PCI_IO_SIZE, + }; + + vpci->dev_hdr = (struct device_header) { + .bus_type = DEVICE_BUS_PCI, + .data = &vpci->pci_hdr, + }; + + vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX; + vpci->pci_hdr.msix.next = 0; + /* + * We at most have VIRTIO_PCI_MAX_VQ entries for virt queue, + * VIRTIO_PCI_MAX_CONFIG entries for config. + * + * To quote the PCI spec: + * + * System software reads this field to determine the + * MSI-X Table Size N, which is encoded as N-1. + * For example, a returned value of "00000000011" + * indicates a table size of 4. + */ + vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1); + + /* + * Both table and PBA could be mapped on the same BAR, but for now + * we're not in short of BARs + */ + vpci->pci_hdr.msix.table_offset = cpu_to_le32(1); /* Use BAR 1 */ + vpci->pci_hdr.msix.pba_offset = cpu_to_le32(1 | PCI_IO_SIZE); /* Use BAR 3 */ + vpci->config_vector = 0; + + r = irq__register_device(subsys_id, &pin, &line); + if (r < 0) + goto free_mmio; + + if (kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI)) + vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI; + + vpci->pci_hdr.irq_pin = pin; + vpci->pci_hdr.irq_line = line; + r = device__register(&vpci->dev_hdr); + if (r < 0) + goto free_ioport; + + return 0; + +free_mmio: + kvm__deregister_mmio(kvm, vpci->msix_io_block); +free_ioport: + ioport__unregister(kvm, vpci->base_addr); + return r; +} + +int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_pci *vpci = vdev->virtio; + int i; + + kvm__deregister_mmio(kvm, vpci->msix_io_block); + ioport__unregister(kvm, vpci->base_addr); + + for (i = 0; 
i < VIRTIO_PCI_MAX_VQ; i++) + ioeventfd__del_event(vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY, i); + + return 0; +} diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c new file mode 100644 index 000000000000..2ce8afdc89bf --- /dev/null +++ b/tools/kvm/virtio/rng.c @@ -0,0 +1,204 @@ +#include "kvm/virtio-rng.h" + +#include "kvm/virtio-pci-dev.h" + +#include "kvm/virtio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/threadpool.h" +#include "kvm/guest_compat.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_rng.h> + +#include <linux/list.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <pthread.h> +#include <linux/kernel.h> + +#define NUM_VIRT_QUEUES 1 +#define VIRTIO_RNG_QUEUE_SIZE 128 + +struct rng_dev_job { + struct virt_queue *vq; + struct rng_dev *rdev; + struct thread_pool__job job_id; +}; + +struct rng_dev { + struct list_head list; + struct virtio_device vdev; + + int fd; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct rng_dev_job jobs[NUM_VIRT_QUEUES]; +}; + +static LIST_HEAD(rdevs); +static int compat_id = -1; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + /* Unused */ + return 0; +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + /* Unused */ + return 0; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + /* Unused */ +} + +static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue) +{ + struct iovec iov[VIRTIO_RNG_QUEUE_SIZE]; + ssize_t len = 0; + u16 out, in, head; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + len = readv(rdev->fd, iov, in); + if (len < 0 && errno == EAGAIN) + len = 0; + + virt_queue__set_used_elem(queue, head, len); + + return true; +} + +static void virtio_rng_do_io(struct kvm *kvm, void *param) +{ + struct rng_dev_job *job = param; + struct virt_queue *vq = job->vq; + struct rng_dev *rdev = job->rdev; + + while 
(virt_queue__available(vq)) + virtio_rng_do_io_request(kvm, rdev, vq); + + rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs); +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct rng_dev *rdev = dev; + struct virt_queue *queue; + struct rng_dev_job *job; + void *p; + + compat__remove_message(compat_id); + + queue = &rdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + job = &rdev->jobs[vq]; + + vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, align); + + *job = (struct rng_dev_job) { + .vq = queue, + .rdev = rdev, + }; + + thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct rng_dev *rdev = dev; + + thread_pool__do_job(&rdev->jobs[vq].job_id); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct rng_dev *rdev = dev; + + return rdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_RNG_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops rng_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_rng__init(struct kvm *kvm) +{ + struct rng_dev *rdev; + int r; + + if (!kvm->cfg.virtio_rng) + return 0; + + rdev = malloc(sizeof(*rdev)); + if (rdev == NULL) + return -ENOMEM; + + rdev->fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + if (rdev->fd < 0) { + r = rdev->fd; + goto cleanup; + } + + r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_RNG, + VIRTIO_ID_RNG, 
PCI_CLASS_RNG); + if (r < 0) + goto cleanup; + + list_add_tail(&rdev->list, &rdevs); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO"); + return 0; +cleanup: + close(rdev->fd); + free(rdev); + + return r; +} +virtio_dev_init(virtio_rng__init); + +int virtio_rng__exit(struct kvm *kvm) +{ + struct rng_dev *rdev, *tmp; + + list_for_each_entry_safe(rdev, tmp, &rdevs, list) { + list_del(&rdev->list); + rdev->vdev.ops->exit(kvm, &rdev->vdev); + free(rdev); + } + + return 0; +} +virtio_dev_exit(virtio_rng__exit); diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c new file mode 100644 index 000000000000..05b2dc60844d --- /dev/null +++ b/tools/kvm/virtio/scsi.c @@ -0,0 +1,311 @@ +#include "kvm/virtio-scsi.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/ioeventfd.h" +#include "kvm/guest_compat.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio.h" + +#include <linux/kernel.h> +#include <linux/virtio_scsi.h> +#include <linux/vhost.h> + +#define VIRTIO_SCSI_QUEUE_SIZE 128 +#define NUM_VIRT_QUEUES 3 + +static LIST_HEAD(sdevs); +static int compat_id = -1; + +struct scsi_dev { + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct virtio_scsi_config config; + struct vhost_scsi_target target; + u32 features; + int vhost_fd; + struct virtio_device vdev; + struct list_head list; + struct kvm *kvm; +}; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct scsi_dev *sdev = dev; + + return ((u8 *)(&sdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_RING_F_EVENT_IDX | + 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct scsi_dev *sdev = dev; + + sdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct vhost_vring_state state = { 
.index = vq }; + struct vhost_vring_addr addr; + struct scsi_dev *sdev = dev; + struct virt_queue *queue; + void *p; + int r; + + compat__remove_message(compat_id); + + queue = &sdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, align); + + if (sdev->vhost_fd == 0) + return 0; + + state.num = queue->vring.num; + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_NUM failed"); + state.num = 0; + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_BASE failed"); + + addr = (struct vhost_vring_addr) { + .index = vq, + .desc_user_addr = (u64)(unsigned long)queue->vring.desc, + .avail_user_addr = (u64)(unsigned long)queue->vring.avail, + .used_user_addr = (u64)(unsigned long)queue->vring.used, + }; + + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) + die_perror("VHOST_SET_VRING_ADDR failed"); + + return 0; +} + +static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi) +{ + struct vhost_vring_file file; + struct scsi_dev *sdev = dev; + struct kvm_irqfd irq; + int r; + + if (sdev->vhost_fd == 0) + return; + + irq = (struct kvm_irqfd) { + .gsi = gsi, + .fd = eventfd(0, 0), + }; + file = (struct vhost_vring_file) { + .index = vq, + .fd = irq.fd, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq); + if (r < 0) + die_perror("KVM_IRQFD failed"); + + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_CALL failed"); + + if (vq > 0) + return; + + r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target); + if (r != 0) + die("VHOST_SCSI_SET_ENDPOINT failed %d", errno); +} + +static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd) +{ + struct scsi_dev *sdev = dev; + struct vhost_vring_file file = { + .index = vq, + .fd = efd, + }; + int r; + + if (sdev->vhost_fd == 0) + return; + + r = 
ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_KICK failed"); +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct scsi_dev *sdev = dev; + + return sdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_SCSI_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + return size; +} + +static struct virtio_ops scsi_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, + .notify_vq = notify_vq, + .notify_vq_gsi = notify_vq_gsi, + .notify_vq_eventfd = notify_vq_eventfd, +}; + +static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev) +{ + struct vhost_memory *mem; + u64 features; + int r; + + sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR); + if (sdev->vhost_fd < 0) + die_perror("Failed openning vhost-scsi device"); + + mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region)); + if (mem == NULL) + die("Failed allocating memory for vhost memory map"); + + mem->nregions = 1; + mem->regions[0] = (struct vhost_memory_region) { + .guest_phys_addr = 0, + .memory_size = kvm->ram_size, + .userspace_addr = (unsigned long)kvm->ram_start, + }; + + r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER); + if (r != 0) + die_perror("VHOST_SET_OWNER failed"); + + r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_GET_FEATURES failed"); + + r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_SET_FEATURES failed"); + r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem); + if (r != 0) + die_perror("VHOST_SET_MEM_TABLE failed"); + + sdev->vdev.use_vhost = true; 
+ + free(mem); +} + + +static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk) +{ + struct scsi_dev *sdev; + + if (!disk) + return -EINVAL; + + sdev = calloc(1, sizeof(struct scsi_dev)); + if (sdev == NULL) + return -ENOMEM; + + *sdev = (struct scsi_dev) { + .config = (struct virtio_scsi_config) { + .num_queues = NUM_VIRT_QUEUES - 2, + .seg_max = VIRTIO_SCSI_CDB_SIZE - 2, + .max_sectors = 65535, + .cmd_per_lun = 128, + .sense_size = VIRTIO_SCSI_SENSE_SIZE, + .cdb_size = VIRTIO_SCSI_CDB_SIZE, + .max_channel = 0, + .max_target = 0, + .max_lun = 16383, + .event_info_size = sizeof(struct virtio_scsi_event), + }, + .kvm = kvm, + }; + strncpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn)); + sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0); + + virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_SCSI, + VIRTIO_ID_SCSI, PCI_CLASS_BLK); + + list_add_tail(&sdev->list, &sdevs); + + virtio_scsi_vhost_init(kvm, sdev); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI"); + + return 0; +} + +static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev) +{ + int r; + + r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target); + if (r != 0) + die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno); + + list_del(&sdev->list); + free(sdev); + + return 0; +} + +int virtio_scsi_init(struct kvm *kvm) +{ + int i, r = 0; + + for (i = 0; i < kvm->nr_disks; i++) { + if (!kvm->disks[i]->wwpn) + continue; + r = virtio_scsi_init_one(kvm, kvm->disks[i]); + if (r < 0) + goto cleanup; + } + + return 0; +cleanup: + return virtio_scsi_exit(kvm); +} +virtio_dev_init(virtio_scsi_init); + +int virtio_scsi_exit(struct kvm *kvm) +{ + while (!list_empty(&sdevs)) { + struct scsi_dev *sdev; + + sdev = list_first_entry(&sdevs, struct scsi_dev, list); + virtio_scsi_exit_one(kvm, sdev); + } + + return 0; +} 
+virtio_dev_exit(virtio_scsi_exit); diff --git a/tools/kvm/x86/bios.c b/tools/kvm/x86/bios.c new file mode 100644 index 000000000000..f05cc021f02c --- /dev/null +++ b/tools/kvm/x86/bios.c @@ -0,0 +1,174 @@ +#include "kvm/kvm.h" +#include "kvm/boot-protocol.h" +#include "kvm/e820.h" +#include "kvm/interrupt.h" +#include "kvm/util.h" + +#include <string.h> +#include <asm/e820.h> + +#include "bios/bios-rom.h" + +struct irq_handler { + unsigned long address; + unsigned int irq; + void *handler; + size_t size; +}; + +#define BIOS_IRQ_PA_ADDR(name) (MB_BIOS_BEGIN + BIOS_OFFSET__##name) +#define BIOS_IRQ_FUNC(name) ((char *)&bios_rom[BIOS_OFFSET__##name]) +#define BIOS_IRQ_SIZE(name) (BIOS_ENTRY_SIZE(BIOS_OFFSET__##name)) + +#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler) \ + { \ + .irq = _irq, \ + .address = BIOS_IRQ_PA_ADDR(_handler), \ + .handler = BIOS_IRQ_FUNC(_handler), \ + .size = BIOS_IRQ_SIZE(_handler), \ + } + +static struct irq_handler bios_irq_handlers[] = { + DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10), + DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15), +}; + +static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler) +{ + struct real_intr_desc intr_desc; + void *p; + + p = guest_flat_to_host(kvm, handler->address); + memcpy(p, handler->handler, handler->size); + + intr_desc = (struct real_intr_desc) { + .segment = REAL_SEGMENT(MB_BIOS_BEGIN), + .offset = handler->address - MB_BIOS_BEGIN, + }; + + DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL); + + interrupt_table__set(&kvm->arch.interrupt_table, &intr_desc, handler->irq); +} + +/** + * e820_setup - setup some simple E820 memory map + * @kvm - guest system descriptor + */ +static void e820_setup(struct kvm *kvm) +{ + struct e820map *e820; + struct e820entry *mem_map; + unsigned int i = 0; + + e820 = guest_flat_to_host(kvm, E820_MAP_START); + mem_map = e820->map; + + mem_map[i++] = (struct e820entry) { + .addr = REAL_MODE_IVT_BEGIN, + .size = EBDA_START - REAL_MODE_IVT_BEGIN, + .type = 
E820_RAM, + }; + mem_map[i++] = (struct e820entry) { + .addr = EBDA_START, + .size = VGA_RAM_BEGIN - EBDA_START, + .type = E820_RESERVED, + }; + mem_map[i++] = (struct e820entry) { + .addr = MB_BIOS_BEGIN, + .size = MB_BIOS_END - MB_BIOS_BEGIN, + .type = E820_RESERVED, + }; + if (kvm->ram_size < KVM_32BIT_GAP_START) { + mem_map[i++] = (struct e820entry) { + .addr = BZ_KERNEL_START, + .size = kvm->ram_size - BZ_KERNEL_START, + .type = E820_RAM, + }; + } else { + mem_map[i++] = (struct e820entry) { + .addr = BZ_KERNEL_START, + .size = KVM_32BIT_GAP_START - BZ_KERNEL_START, + .type = E820_RAM, + }; + mem_map[i++] = (struct e820entry) { + .addr = KVM_32BIT_MAX_MEM_SIZE, + .size = kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE, + .type = E820_RAM, + }; + } + + BUG_ON(i > E820_X_MAX); + + e820->nr_map = i; +} + +static void setup_vga_rom(struct kvm *kvm) +{ + u16 *mode; + void *p; + + p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING); + memset(p, 0, VGA_ROM_OEM_STRING_SIZE); + strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE); + + mode = guest_flat_to_host(kvm, VGA_ROM_MODES); + mode[0] = 0x0112; + mode[1] = 0xffff; +} + +/** + * setup_bios - inject BIOS into guest memory + * @kvm - guest system descriptor + */ +void setup_bios(struct kvm *kvm) +{ + unsigned long address = MB_BIOS_BEGIN; + struct real_intr_desc intr_desc; + unsigned int i; + void *p; + + /* + * before anything else -- clean some known areas + * we definitely don't want any trash here + */ + p = guest_flat_to_host(kvm, BDA_START); + memset(p, 0, BDA_END - BDA_START); + + p = guest_flat_to_host(kvm, EBDA_START); + memset(p, 0, EBDA_END - EBDA_START); + + p = guest_flat_to_host(kvm, MB_BIOS_BEGIN); + memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN); + + p = guest_flat_to_host(kvm, VGA_ROM_BEGIN); + memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN); + + /* just copy the bios rom into the place */ + p = guest_flat_to_host(kvm, MB_BIOS_BEGIN); + memcpy(p, bios_rom, bios_rom_size); + + /* E820 memory map must be present */ + 
e820_setup(kvm); + + /* VESA needs own tricks */ + setup_vga_rom(kvm); + + /* + * Setup a *fake* real mode vector table, it has only + * one real handler which does just iret + */ + address = BIOS_IRQ_PA_ADDR(bios_intfake); + intr_desc = (struct real_intr_desc) { + .segment = REAL_SEGMENT(MB_BIOS_BEGIN), + .offset = address - MB_BIOS_BEGIN, + }; + interrupt_table__setup(&kvm->arch.interrupt_table, &intr_desc); + + for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++) + setup_irq_handler(kvm, &bios_irq_handlers[i]); + + /* we almost done */ + p = guest_flat_to_host(kvm, 0); + interrupt_table__copy(&kvm->arch.interrupt_table, p, REAL_INTR_SIZE); +} diff --git a/tools/kvm/x86/bios/.gitignore b/tools/kvm/x86/bios/.gitignore new file mode 100644 index 000000000000..1f0080bcc5f2 --- /dev/null +++ b/tools/kvm/x86/bios/.gitignore @@ -0,0 +1,3 @@ +bios-rom.bin +bios-rom.bin.elf +bios-rom.h diff --git a/tools/kvm/x86/bios/bios-rom.S b/tools/kvm/x86/bios/bios-rom.S new file mode 100644 index 000000000000..3269ce9793ae --- /dev/null +++ b/tools/kvm/x86/bios/bios-rom.S @@ -0,0 +1,12 @@ +#include <kvm/assembly.h> + + .org 0 +#ifdef CONFIG_X86_64 + .code64 +#else + .code32 +#endif + +GLOBAL(bios_rom) + .incbin "x86/bios/bios.bin" +END(bios_rom) diff --git a/tools/kvm/x86/bios/e820.c b/tools/kvm/x86/bios/e820.c new file mode 100644 index 000000000000..a9bca29bff73 --- /dev/null +++ b/tools/kvm/x86/bios/e820.c @@ -0,0 +1,72 @@ +#include "kvm/e820.h" + +#include "kvm/segment.h" +#include "kvm/bios.h" + +#include <asm/processor-flags.h> +#include <asm/e820.h> + +static inline void set_fs(u16 seg) +{ + asm volatile("movw %0,%%fs" : : "rm" (seg)); +} + +static inline u8 rdfs8(unsigned long addr) +{ + u8 v; + + asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + + return v; +} + +static inline u32 rdfs32(unsigned long addr) +{ + u32 v; + + asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr)); + + return v; +} + +bioscall void e820_query_map(struct 
biosregs *regs) +{ + struct e820map *e820; + u32 map_size; + u16 fs_seg; + u32 ndx; + + e820 = (struct e820map *)E820_MAP_START; + fs_seg = flat_to_seg16(E820_MAP_START); + set_fs(fs_seg); + + ndx = regs->ebx; + + map_size = rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg)); + + if (ndx < map_size) { + u32 start; + unsigned int i; + u8 *p; + + fs_seg = flat_to_seg16(E820_MAP_START); + set_fs(fs_seg); + + start = (u32)&e820->map[ndx]; + + p = (void *) regs->edi; + + for (i = 0; i < sizeof(struct e820entry); i++) + *p++ = rdfs8(flat_to_off16(start + i, fs_seg)); + } + + regs->eax = SMAP; + regs->ecx = sizeof(struct e820entry); + regs->ebx = ++ndx; + + /* Clear CF to indicate success. */ + regs->eflags &= ~X86_EFLAGS_CF; + + if (ndx >= map_size) + regs->ebx = 0; /* end of map */ +} diff --git a/tools/kvm/x86/bios/entry.S b/tools/kvm/x86/bios/entry.S new file mode 100644 index 000000000000..85056e9816c4 --- /dev/null +++ b/tools/kvm/x86/bios/entry.S @@ -0,0 +1,92 @@ +/* + * Our pretty trivial BIOS emulation + */ + +#include <kvm/bios.h> +#include <kvm/assembly.h> + + .org 0 + .code16gcc + +#define EFLAGS_CF (1 << 0) + +#include "macro.S" + +/* If you change these macros, remember to update 'struct biosregs' */ +.macro SAVE_BIOSREGS + pushl %fs + pushl %es + pushl %ds + pushl %edi + pushl %esi + pushl %ebp + pushl %esp + pushl %edx + pushl %ecx + pushl %ebx + pushl %eax +.endm + +.macro RESTORE_BIOSREGS + popl %eax + popl %ebx + popl %ecx + popl %edx + popl %esp + popl %ebp + popl %esi + popl %edi + popl %ds + popl %es + popl %fs +.endm + +/* + * fake interrupt handler, nothing can be faster ever + */ +ENTRY(bios_intfake) + /* + * Set CF to indicate failure. We don't want callers to think that the + * interrupt handler succeeded and then treat the return values in + * registers as valid data. 
+ */ + orl $EFLAGS_CF, 0x4(%esp) + + IRET +ENTRY_END(bios_intfake) + +/* + * int 10 - video - service + */ +ENTRY(bios_int10) + SAVE_BIOSREGS + + movl %esp, %eax + /* this is way easier than doing it in assembly */ + /* just push all the regs and jump to a C handler */ + call int10_handler + + RESTORE_BIOSREGS + + /* Clear CF to indicate success. */ + andl $~EFLAGS_CF, 0x4(%esp) + + IRET +ENTRY_END(bios_int10) + +ENTRY(bios_int15) + SAVE_BIOSREGS + + movl %esp, %eax + call int15_handler + + RESTORE_BIOSREGS + + IRET +ENTRY_END(bios_int15) + +GLOBAL(__locals) + +#include "local.S" + +END(__locals) diff --git a/tools/kvm/x86/bios/gen-offsets.sh b/tools/kvm/x86/bios/gen-offsets.sh new file mode 100644 index 000000000000..8771bbe0b1ea --- /dev/null +++ b/tools/kvm/x86/bios/gen-offsets.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +echo "/* Autogenerated file, don't edit */" +echo "#ifndef BIOS_OFFSETS_H" +echo "#define BIOS_OFFSETS_H" + +echo "" +echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)" +echo "" + +nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }' + +echo "" +echo "#endif" diff --git a/tools/kvm/x86/bios/int10.c b/tools/kvm/x86/bios/int10.c new file mode 100644 index 000000000000..7cc0b3f2162e --- /dev/null +++ b/tools/kvm/x86/bios/int10.c @@ -0,0 +1,110 @@ +#include "kvm/segment.h" +#include "kvm/bios.h" +#include "kvm/vesa.h" + +#include "bios/memcpy.h" + +#include <boot/vesa.h> + +static far_ptr gen_far_ptr(unsigned int pa) +{ + far_ptr ptr; + + ptr.seg = (pa >> 4); + ptr.off = pa - (ptr.seg << 4); + + return ptr; +} + +static inline void outb(unsigned short port, unsigned char val) +{ + asm volatile("outb %0, %1" : : "a"(val), "Nd"(port)); +} + +/* + * It's probably much more useful to make this print to the serial + * line rather than print to a non-displayed VGA memory + */ +static inline void int10_putchar(struct biosregs *args) +{ + u8 al = args->eax & 0xFF; + + outb(0x3f8, al); +} + +static void vbe_get_mode(struct 
biosregs *args) +{ + struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi; + + *info = (struct vesa_mode_info) { + .mode_attr = 0xd9, /* 11011011 */ + .logical_scan = VESA_WIDTH*4, + .h_res = VESA_WIDTH, + .v_res = VESA_HEIGHT, + .bpp = VESA_BPP, + .memory_layout = 6, + .memory_planes = 1, + .lfb_ptr = VESA_MEM_ADDR, + .rmask = 8, + .gmask = 8, + .bmask = 8, + .resv_mask = 8, + .resv_pos = 24, + .bpos = 16, + .gpos = 8, + }; +} + +static void vbe_get_info(struct biosregs *args) +{ + struct vesa_general_info *infop = (struct vesa_general_info *) args->edi; + struct vesa_general_info info; + + info = (struct vesa_general_info) { + .signature = VESA_MAGIC, + .version = 0x102, + .vendor_string = gen_far_ptr(VGA_ROM_BEGIN), + .capabilities = 0x10, + .video_mode_ptr = gen_far_ptr(VGA_ROM_MODES), + .total_memory = (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000, + }; + + memcpy16(args->es, infop, args->ds, &info, sizeof(info)); +} + +#define VBE_STATUS_OK 0x004F + +static void int10_vesa(struct biosregs *args) +{ + u8 al; + + al = args->eax & 0xff; + + switch (al) { + case 0x00: + vbe_get_info(args); + break; + case 0x01: + vbe_get_mode(args); + break; + } + + args->eax = VBE_STATUS_OK; +} + +bioscall void int10_handler(struct biosregs *args) +{ + u8 ah; + + ah = (args->eax & 0xff00) >> 8; + + switch (ah) { + case 0x0e: + int10_putchar(args); + break; + case 0x4f: + int10_vesa(args); + break; + } + +} diff --git a/tools/kvm/x86/bios/int15.c b/tools/kvm/x86/bios/int15.c new file mode 100644 index 000000000000..faf5343ea509 --- /dev/null +++ b/tools/kvm/x86/bios/int15.c @@ -0,0 +1,18 @@ +#include "kvm/bios.h" + +#include "kvm/e820.h" + +#include <asm/processor-flags.h> + +bioscall void int15_handler(struct biosregs *regs) +{ + switch (regs->eax) { + case 0xe820: + e820_query_map(regs); + break; + default: + /* Set CF to indicate failure. 
*/ + regs->eflags |= X86_EFLAGS_CF; + break; + } +} diff --git a/tools/kvm/x86/bios/local.S b/tools/kvm/x86/bios/local.S new file mode 100644 index 000000000000..f2cdbf4c3e1c --- /dev/null +++ b/tools/kvm/x86/bios/local.S @@ -0,0 +1,7 @@ +/* + * Local variables for almost every BIOS irq handler + * Must be put somewhere inside irq handler body + */ +__CALLER_SS: .int 0 +__CALLER_SP: .long 0 +__CALLER_CLOBBER: .long 0 diff --git a/tools/kvm/x86/bios/macro.S b/tools/kvm/x86/bios/macro.S new file mode 100644 index 000000000000..0d5e567e7cf1 --- /dev/null +++ b/tools/kvm/x86/bios/macro.S @@ -0,0 +1,25 @@ +/* + * handy BIOS macros + */ + +/* + * switch to BIOS stack + */ +.macro stack_swap + movw %ss, %cs:(__CALLER_SS) + movl %esp, %cs:(__CALLER_SP) + movl %edx, %cs:(__CALLER_CLOBBER) + movw $MB_BIOS_SS, %dx + movw %dx, %ss + movw $MB_BIOS_SP, %sp + movl %cs:(__CALLER_CLOBBER), %edx +.endm + +/* + * restore the original stack + */ +.macro stack_restore + movl %cs:(__CALLER_SP), %esp + movw %cs:(__CALLER_SS), %ss +.endm + diff --git a/tools/kvm/x86/bios/memcpy.c b/tools/kvm/x86/bios/memcpy.c new file mode 100644 index 000000000000..40b9b65fa9e4 --- /dev/null +++ b/tools/kvm/x86/bios/memcpy.c @@ -0,0 +1,23 @@ +#include "bios/memcpy.h" + +/* + * Copy memory area in 16-bit real mode. 
+ */ +void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len) +{ + __asm__ __volatile__ ( + "pushw %%ds \n" + "pushw %%es \n" + "movw %[src_seg], %%ds \n" + "movw %[dst_seg], %%es \n" + "rep movsb %%ds:(%%si), %%es:(%%di) \n" + "popw %%es \n" + "popw %%ds \n" + : + : "S"(src), + "D"(dst), + "c"(len), + [src_seg] "r"(src_seg), + [dst_seg] "r"(dst_seg) + : "cc", "memory"); +} diff --git a/tools/kvm/x86/bios/rom.ld.S b/tools/kvm/x86/bios/rom.ld.S new file mode 100644 index 000000000000..f4f183579327 --- /dev/null +++ b/tools/kvm/x86/bios/rom.ld.S @@ -0,0 +1,16 @@ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) + +SECTIONS { + .text 0 : { + *(.text) + } + + /DISCARD/ : { + *(.debug*) + *(.data) + *(.bss) + *(.eh_frame*) + } +} + diff --git a/tools/kvm/x86/boot.c b/tools/kvm/x86/boot.c new file mode 100644 index 000000000000..61535eb57bce --- /dev/null +++ b/tools/kvm/x86/boot.c @@ -0,0 +1,41 @@ +#include "kvm/kvm.h" + +#include "kvm/util.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <fcntl.h> + +#define BIOS_SELECTOR 0xf000 +#define BIOS_IP 0xfff0 +#define BIOS_SP 0x8000 + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + struct stat st; + void *p; + int fd; + int nr; + + fd = open(firmware_filename, O_RDONLY); + if (fd < 0) + return false; + + if (fstat(fd, &st)) + return false; + + if (st.st_size > MB_FIRMWARE_BIOS_SIZE) + die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024)); + + p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN); + + while ((nr = read(fd, p, st.st_size)) > 0) + p += nr; + + kvm->arch.boot_selector = BIOS_SELECTOR; + kvm->arch.boot_ip = BIOS_IP; + kvm->arch.boot_sp = BIOS_SP; + + return true; +} diff --git a/tools/kvm/x86/cpuid.c b/tools/kvm/x86/cpuid.c new file mode 100644 index 000000000000..4c140f0c57e6 --- /dev/null +++ b/tools/kvm/x86/cpuid.c @@ -0,0 +1,60 @@ +#include 
"kvm/kvm-cpu.h" + +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <sys/ioctl.h> +#include <stdlib.h> + +#define CPUID_FUNC_PERFMON 0x0A + +#define MAX_KVM_CPUID_ENTRIES 100 + +static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid) +{ + unsigned int i; + + /* + * Filter CPUID functions that are not supported by the hypervisor. + */ + for (i = 0; i < kvm_cpuid->nent; i++) { + struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i]; + + switch (entry->function) { + case 1: + /* Set X86_FEATURE_HYPERVISOR */ + if (entry->index == 0) + entry->ecx |= (1 << 31); + break; + case 6: + /* Clear X86_FEATURE_EPB */ + entry->ecx = entry->ecx & ~(1 << 3); + break; + case CPUID_FUNC_PERFMON: + entry->eax = 0x00; /* disable it */ + break; + default: + /* Keep the CPUID function as -is */ + break; + }; + } +} + +void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu) +{ + struct kvm_cpuid2 *kvm_cpuid; + + kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + + MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries)); + + kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES; + if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0) + die_perror("KVM_GET_SUPPORTED_CPUID failed"); + + filter_cpuid(kvm_cpuid); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0) + die_perror("KVM_SET_CPUID2 failed"); + + free(kvm_cpuid); +} diff --git a/tools/kvm/x86/include/kvm/assembly.h b/tools/kvm/x86/include/kvm/assembly.h new file mode 100644 index 000000000000..e70baab39cce --- /dev/null +++ b/tools/kvm/x86/include/kvm/assembly.h @@ -0,0 +1,24 @@ +#ifndef ASSEMBLY_H_ +#define ASSEMBLY_H_ + +#define __ALIGN .p2align 4, 0x90 +#define ENTRY(name) \ + __ALIGN; \ + .globl name; \ + name: + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY_END(name) GLOBAL(name##_end) +#define END(name) GLOBAL(name##_end) + +/* + * gas produces size override prefix with which + * we are unhappy, lets make it hardcoded for + * 16 bit mode + */ +#define IRET .byte 0xcf + +#endif /* ASSEMBLY_H_ */ diff --git 
a/tools/kvm/x86/include/kvm/barrier.h b/tools/kvm/x86/include/kvm/barrier.h new file mode 100644 index 000000000000..46d14f67b027 --- /dev/null +++ b/tools/kvm/x86/include/kvm/barrier.h @@ -0,0 +1,20 @@ +#ifndef _KVM_BARRIER_H_ +#define _KVM_BARRIER_H_ + +#define barrier() asm volatile("": : :"memory") + +#define mb() asm volatile ("mfence": : :"memory") +#define rmb() asm volatile ("lfence": : :"memory") +#define wmb() asm volatile ("sfence": : :"memory") + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#endif + +#endif /* _KVM_BARRIER_H_ */ diff --git a/tools/kvm/x86/include/kvm/bios-export.h b/tools/kvm/x86/include/kvm/bios-export.h new file mode 100644 index 000000000000..23825aab031d --- /dev/null +++ b/tools/kvm/x86/include/kvm/bios-export.h @@ -0,0 +1,13 @@ +#ifndef BIOS_EXPORT_H_ +#define BIOS_EXPORT_H_ + +struct kvm; + +extern char bios_rom[0]; +extern char bios_rom_end[0]; + +#define bios_rom_size (bios_rom_end - bios_rom) + +extern void setup_bios(struct kvm *kvm); + +#endif /* BIOS_EXPORT_H_ */ diff --git a/tools/kvm/x86/include/kvm/bios.h b/tools/kvm/x86/include/kvm/bios.h new file mode 100644 index 000000000000..ec7ed715e134 --- /dev/null +++ b/tools/kvm/x86/include/kvm/bios.h @@ -0,0 +1,93 @@ +#ifndef BIOS_H_ +#define BIOS_H_ + +/* + * X86-32 Memory Map (typical) + * start end + * Real Mode Interrupt Vector Table 0x00000000 0x000003FF + * BDA area 0x00000400 0x000004FF + * Conventional Low Memory 0x00000500 0x0009FBFF + * EBDA area 0x0009FC00 0x0009FFFF + * VIDEO RAM 0x000A0000 0x000BFFFF + * VIDEO ROM (BIOS) 0x000C0000 0x000C7FFF + * ROMs & unus. 
space (mapped hw & misc)0x000C8000 0x000EFFFF 160 KiB (typically) + * Motherboard BIOS 0x000F0000 0x000FFFFF + * Extended Memory 0x00100000 0xFEBFFFFF + * Reserved (configs, ACPI, PnP, etc) 0xFEC00000 0xFFFFFFFF + */ + +#define REAL_MODE_IVT_BEGIN 0x00000000 +#define REAL_MODE_IVT_END 0x000003ff + +#define BDA_START 0x00000400 +#define BDA_END 0x000004ff + +#define EBDA_START 0x0009fc00 +#define EBDA_END 0x0009ffff + +#define E820_MAP_START EBDA_START + +#define MB_BIOS_BEGIN 0x000f0000 +#define MB_FIRMWARE_BIOS_BEGIN 0x000e0000 +#define MB_BIOS_END 0x000fffff + +#define MB_BIOS_SIZE (MB_BIOS_END - MB_BIOS_BEGIN + 1) +#define MB_FIRMWARE_BIOS_SIZE (MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1) + +#define VGA_RAM_BEGIN 0x000a0000 +#define VGA_RAM_END 0x000bffff + +#define VGA_ROM_BEGIN 0x000c0000 +#define VGA_ROM_OEM_STRING VGA_ROM_BEGIN +#define VGA_ROM_OEM_STRING_SIZE 16 +#define VGA_ROM_MODES (VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE) +#define VGA_ROM_MODES_SIZE 32 +#define VGA_ROM_END 0x000c7fff + +/* we handle one page only */ +#define VGA_RAM_SEG (VGA_RAM_BEGIN >> 4) +#define VGA_PAGE_SIZE 0x007d0 /* 80x25 */ + +/* real mode interrupt vector table */ +#define REAL_INTR_BASE REAL_MODE_IVT_BEGIN +#define REAL_INTR_VECTORS 256 + +/* + * BIOS stack must be at absolute predefined memory address + * We reserve 64 bytes for BIOS stack + */ +#define MB_BIOS_SS 0xfff7 +#define MB_BIOS_SP 0x40 + +/* + * When interfere with assembler code we need to be sure how + * arguments are passed in real mode. 
+ */ +#define bioscall __attribute__((regparm(3))) + +#ifndef __ASSEMBLER__ + +#include <linux/types.h> + +struct biosregs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u32 ds; + u32 es; + u32 fs; + u32 eip; + u32 eflags; +}; + +extern bioscall void int10_handler(struct biosregs *regs); +extern bioscall void int15_handler(struct biosregs *regs); + +#endif + +#endif /* BIOS_H_ */ diff --git a/tools/kvm/x86/include/kvm/boot-protocol.h b/tools/kvm/x86/include/kvm/boot-protocol.h new file mode 100644 index 000000000000..85b637f585c2 --- /dev/null +++ b/tools/kvm/x86/include/kvm/boot-protocol.h @@ -0,0 +1,16 @@ +/* + * Linux boot protocol specifics + */ + +#ifndef BOOT_PROTOCOL_H_ +#define BOOT_PROTOCOL_H_ + +/* + * The protected mode kernel part of a modern bzImage is loaded + * at 1 MB by default. + */ +#define BZ_DEFAULT_SETUP_SECTS 4 +#define BZ_KERNEL_START 0x100000UL +#define INITRD_START 0x1000000UL + +#endif /* BOOT_PROTOCOL_H_ */ diff --git a/tools/kvm/x86/include/kvm/cpufeature.h b/tools/kvm/x86/include/kvm/cpufeature.h new file mode 100644 index 000000000000..bc4abbbb42f0 --- /dev/null +++ b/tools/kvm/x86/include/kvm/cpufeature.h @@ -0,0 +1,41 @@ +#ifndef KVM__CPUFEATURE_H +#define KVM__CPUFEATURE_H + +#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */ +#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */ +#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */ + +#define CPUID_VENDOR_AMD_1 0x68747541 /* "Auth" */ +#define CPUID_VENDOR_AMD_2 0x69746e65 /* "enti" */ +#define CPUID_VENDOR_AMD_3 0x444d4163 /* "cAMD" */ + +/* + * CPUID flags we need to deal with + */ +#define KVM__X86_FEATURE_VMX 5 /* Hardware virtualization */ +#define KVM__X86_FEATURE_SVM 2 /* Secure virtual machine */ +#define KVM__X86_FEATURE_XSAVE 26 /* XSAVE/XRSTOR/XSETBV/XGETBV */ + +#define cpu_feature_disable(reg, feature) \ + ((reg) & ~(1 << (feature))) +#define cpu_feature_enable(reg, feature) \ + ((reg) | (1 << (feature))) + +struct 
cpuid_regs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +static inline void host_cpuid(struct cpuid_regs *regs) +{ + asm volatile("cpuid" + : "=a" (regs->eax), + "=b" (regs->ebx), + "=c" (regs->ecx), + "=d" (regs->edx) + : "0" (regs->eax), "2" (regs->ecx)); +} + +#endif /* KVM__CPUFEATURE_H */ diff --git a/tools/kvm/x86/include/kvm/interrupt.h b/tools/kvm/x86/include/kvm/interrupt.h new file mode 100644 index 000000000000..00c7ed7dc3c1 --- /dev/null +++ b/tools/kvm/x86/include/kvm/interrupt.h @@ -0,0 +1,26 @@ +#ifndef KVM__INTERRUPT_H +#define KVM__INTERRUPT_H + +#include <linux/types.h> +#include "kvm/bios.h" +#include "kvm/bios-export.h" + +struct real_intr_desc { + u16 offset; + u16 segment; +} __attribute__((packed)); + +#define REAL_SEGMENT_SHIFT 4 +#define REAL_SEGMENT(addr) ((addr) >> REAL_SEGMENT_SHIFT) +#define REAL_OFFSET(addr) ((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1)) +#define REAL_INTR_SIZE (REAL_INTR_VECTORS * sizeof(struct real_intr_desc)) + +struct interrupt_table { + struct real_intr_desc entries[REAL_INTR_VECTORS]; +}; + +void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size); +void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry); +void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num); + +#endif /* KVM__INTERRUPT_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h b/tools/kvm/x86/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..1e0949ed9506 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-arch.h @@ -0,0 +1,36 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#include "kvm/interrupt.h" +#include "kvm/segment.h" + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> + +/* + * The hole includes VESA framebuffer and PCI memory. 
+ */ +#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32) +#define KVM_32BIT_GAP_SIZE (768 << 20) +#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE) + +#define KVM_MMIO_START KVM_32BIT_GAP_START + +/* This is the address that pci_get_io_space_block() starts allocating + * from. Note that this is a PCI bus address (though same on x86). + */ +#define KVM_PCI_MMIO_AREA (KVM_MMIO_START + 0x1000000) +#define KVM_VIRTIO_MMIO_AREA (KVM_MMIO_START + 0x2000000) + +#define VIRTIO_DEFAULT_TRANS VIRTIO_PCI + +struct kvm_arch { + u16 boot_selector; + u16 boot_ip; + u16 boot_sp; + + struct interrupt_table interrupt_table; +}; + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-config-arch.h b/tools/kvm/x86/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..3eae8dbce0b8 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-config-arch.h @@ -0,0 +1,15 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#include "kvm/parse-options.h" + +struct kvm_config_arch { + int vidmode; +}; + +#define OPT_ARCH_RUN(pfx, cfg) \ + pfx, \ + OPT_GROUP("BIOS options:"), \ + OPT_INTEGER('\0', "vidmode", &(cfg)->vidmode, "Video mode"), + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-cpu-arch.h b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..198efe68a6f0 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,49 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +/* Architecture-specific kvm_cpu definitions. 
*/ + +#include <linux/kvm.h> /* for struct kvm_regs */ +#include "kvm/kvm.h" /* for kvm__emulate_{mm}io() */ +#include <stdbool.h> +#include <pthread.h> + +struct kvm; + +struct kvm_cpu { + pthread_t thread; /* VCPU thread */ + + unsigned long cpu_id; + + struct kvm *kvm; /* parent KVM */ + int vcpu_fd; /* For VCPU ioctls() */ + struct kvm_run *kvm_run; + + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + struct kvm_msrs *msrs; /* dynamically allocated */ + + u8 is_running; + u8 paused; + u8 needs_nmi; + + struct kvm_coalesced_mmio_ring *ring; +}; + +/* + * As these are such simple wrappers, let's have them in the header so they'll + * be cheaper to call: + */ +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + return kvm__emulate_io(kvm, port, data, direction, size, count); +} + +static inline bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + return kvm__emulate_mmio(kvm, phys_addr, data, len, is_write); +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/mptable.h b/tools/kvm/x86/include/kvm/mptable.h new file mode 100644 index 000000000000..9e3cfa6be1ce --- /dev/null +++ b/tools/kvm/x86/include/kvm/mptable.h @@ -0,0 +1,9 @@ +#ifndef KVM_MPTABLE_H_ +#define KVM_MPTABLE_H_ + +struct kvm; + +int mptable__init(struct kvm *kvm); +int mptable__exit(struct kvm *kvm); + +#endif /* KVM_MPTABLE_H_ */ diff --git a/tools/kvm/x86/interrupt.c b/tools/kvm/x86/interrupt.c new file mode 100644 index 000000000000..7d478690fb54 --- /dev/null +++ b/tools/kvm/x86/interrupt.c @@ -0,0 +1,28 @@ +#include "kvm/interrupt.h" + +#include "kvm/util.h" + +#include <string.h> + +void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size) +{ + if (size < sizeof(itable->entries)) + die("An attempt to overwrite host memory"); + + memcpy(dst, itable->entries, sizeof(itable->entries)); +} + +void 
interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry) +{ + unsigned int i; + + for (i = 0; i < REAL_INTR_VECTORS; i++) + itable->entries[i] = *entry; +} + +void interrupt_table__set(struct interrupt_table *itable, + struct real_intr_desc *entry, unsigned int num) +{ + if (num < REAL_INTR_VECTORS) + itable->entries[num] = *entry; +} diff --git a/tools/kvm/x86/ioport.c b/tools/kvm/x86/ioport.c new file mode 100644 index 000000000000..824ef257cdb5 --- /dev/null +++ b/tools/kvm/x86/ioport.c @@ -0,0 +1,99 @@ +#include "kvm/ioport.h" + +#include <stdlib.h> +#include <stdio.h> + +static bool debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return 0; +} + +static struct ioport_operations debug_ops = { + .io_out = debug_io_out, +}; + +static bool seabios_debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + char ch; + + ch = ioport__read8(data); + + putchar(ch); + + return true; +} + +static struct ioport_operations seabios_debug_ops = { + .io_out = seabios_debug_io_out, +}; + +static bool dummy_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static bool dummy_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static struct ioport_operations dummy_read_write_ioport_ops = { + .io_in = dummy_io_in, + .io_out = dummy_io_out, +}; + +static struct ioport_operations dummy_write_only_ioport_ops = { + .io_out = dummy_io_out, +}; + +/* + * The "fast A20 gate" + */ + +static bool ps2_control_a_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * A20 is always enabled. 
+ */ + ioport__write8(data, 0x02); + + return true; +} + +static struct ioport_operations ps2_control_a_ops = { + .io_in = ps2_control_a_io_in, + .io_out = dummy_io_out, +}; + +void ioport__setup_arch(struct kvm *kvm) +{ + /* Legacy ioport setup */ + + /* 0x0020 - 0x003F - 8259A PIC 1 */ + ioport__register(kvm, 0x0020, &dummy_read_write_ioport_ops, 2, NULL); + + /* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */ + ioport__register(kvm, 0x0040, &dummy_read_write_ioport_ops, 4, NULL); + + /* 0092 - PS/2 system control port A */ + ioport__register(kvm, 0x0092, &ps2_control_a_ops, 1, NULL); + + /* 0x00A0 - 0x00AF - 8259A PIC 2 */ + ioport__register(kvm, 0x00A0, &dummy_read_write_ioport_ops, 2, NULL); + + /* PORT 00E0-00EF are 'motherboard specific' so we use them for our + internal debugging purposes. */ + ioport__register(kvm, IOPORT_DBG, &debug_ops, 1, NULL); + + /* PORT 00ED - DUMMY PORT FOR DELAY??? */ + ioport__register(kvm, 0x00ED, &dummy_write_only_ioport_ops, 1, NULL); + + /* 0x00F0 - 0x00FF - Math co-processor */ + ioport__register(kvm, 0x00F0, &dummy_write_only_ioport_ops, 2, NULL); + + /* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */ + ioport__register(kvm, 0x03D4, &dummy_read_write_ioport_ops, 1, NULL); + ioport__register(kvm, 0x03D5, &dummy_write_only_ioport_ops, 1, NULL); + + ioport__register(kvm, 0x402, &seabios_debug_ops, 1, NULL); +} diff --git a/tools/kvm/x86/irq.c b/tools/kvm/x86/irq.c new file mode 100644 index 000000000000..7447c6b7d7aa --- /dev/null +++ b/tools/kvm/x86/irq.c @@ -0,0 +1,222 @@ +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> +#include <sys/ioctl.h> + +#include <stddef.h> +#include <stdlib.h> + +#define IRQ_MAX_GSI 64 +#define IRQCHIP_MASTER 0 +#define IRQCHIP_SLAVE 1 +#define IRQCHIP_IOAPIC 2 + +static u8 next_line = 5; +static struct rb_root pci_tree = RB_ROOT; + +/* First 24 GSIs 
are routed between IRQCHIPs and IOAPICs */ +static u32 gsi = 24; + +struct kvm_irq_routing *irq_routing; + +static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin) +{ + if (gsi >= IRQ_MAX_GSI) + return -ENOSPC; + + irq_routing->entries[irq_routing->nr++] = + (struct kvm_irq_routing_entry) { + .gsi = gsi, + .type = type, + .u.irqchip.irqchip = irqchip, + .u.irqchip.pin = pin, + }; + + return 0; +} + +static struct pci_dev *search(struct rb_root *root, u32 id) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct pci_dev *data = rb_entry(node, struct pci_dev, node); + int result; + + result = id - data->id; + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static int insert(struct rb_root *root, struct pci_dev *data) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct pci_dev *this = container_of(*new, struct pci_dev, node); + int result = data->id - this->id; + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + + return 0; +} + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + struct pci_dev *node; + int r; + + node = search(&pci_tree, dev); + + if (!node) { + /* We haven't found a node - First device of it's kind */ + node = malloc(sizeof(*node)); + if (node == NULL) + return -ENOMEM; + + *node = (struct pci_dev) { + .id = dev, + /* + * PCI supports only INTA#,B#,C#,D# per device. + * A#,B#,C#,D# are allowed for multifunctional + * devices so stick with A# for our single + * function devices. 
+ */ + .pin = 1, + }; + + INIT_LIST_HEAD(&node->lines); + + r = insert(&pci_tree, node); + if (r) { + free(node); + return r; + } + } + + if (node) { + /* This device already has a pin assigned, give out a new line and device id */ + struct irq_line *new = malloc(sizeof(*new)); + if (new == NULL) + return -ENOMEM; + + new->line = next_line++; + *line = new->line; + *pin = node->pin; + + list_add(&new->node, &node->lines); + + return 0; + } + + return -EFAULT; +} + +int irq__init(struct kvm *kvm) +{ + int i, r; + + irq_routing = calloc(sizeof(struct kvm_irq_routing) + + IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry), 1); + if (irq_routing == NULL) + return -ENOMEM; + + /* Hook first 8 GSIs to master IRQCHIP */ + for (i = 0; i < 8; i++) + if (i != 2) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i); + + /* Hook next 8 GSIs to slave IRQCHIP */ + for (i = 8; i < 16; i++) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8); + + /* Last but not least, IOAPIC */ + for (i = 0; i < 24; i++) { + if (i == 0) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2); + else if (i != 2) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i); + } + + r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing); + if (r) { + free(irq_routing); + return errno; + } + + return 0; +} +dev_base_init(irq__init); + +int irq__exit(struct kvm *kvm) +{ + struct rb_node *ent; + + free(irq_routing); + + while ((ent = rb_first(&pci_tree))) { + struct pci_dev *dev; + struct irq_line *line; + + dev = rb_entry(ent, struct pci_dev, node); + while (!list_empty(&dev->lines)) { + line = list_first_entry(&dev->lines, struct irq_line, node); + list_del(&line->node); + free(line); + } + rb_erase(&dev->node, &pci_tree); + free(dev); + } + + return 0; +} +dev_base_exit(irq__exit); + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + int r; + + irq_routing->entries[irq_routing->nr++] = + (struct kvm_irq_routing_entry) { + .gsi = gsi, + 
.type = KVM_IRQ_ROUTING_MSI, + .u.msi.address_hi = msg->address_hi, + .u.msi.address_lo = msg->address_lo, + .u.msi.data = msg->data, + }; + + r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing); + if (r) + return r; + + return gsi++; +} + +struct rb_node *irq__get_pci_tree(void) +{ + return rb_first(&pci_tree); +} diff --git a/tools/kvm/x86/kvm-cpu.c b/tools/kvm/x86/kvm-cpu.c new file mode 100644 index 000000000000..b6190ed31395 --- /dev/null +++ b/tools/kvm/x86/kvm-cpu.c @@ -0,0 +1,425 @@ +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include <asm/msr-index.h> +#include <asm/apicdef.h> +#include <linux/err.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static inline bool is_in_protected_mode(struct kvm_cpu *vcpu) +{ + return vcpu->sregs.cr0 & 0x01; +} + +static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip) +{ + u64 cs; + + /* + * NOTE! We should take code segment base address into account here. + * Luckily it's usually zero because Linux uses flat memory model. + */ + if (is_in_protected_mode(vcpu)) + return ip; + + cs = vcpu->sregs.cs.selector; + + return ip + (cs << 4); +} + +static inline u32 selector_to_base(u16 selector) +{ + /* + * KVM on Intel requires 'base' to be 'selector * 16' in real mode. 
+ */ + return (u32)selector << 4; +} + +static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm) +{ + struct kvm_cpu *vcpu; + + vcpu = calloc(1, sizeof(*vcpu)); + if (!vcpu) + return NULL; + + vcpu->kvm = kvm; + + return vcpu; +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + if (vcpu->msrs) + free(vcpu->msrs); + + free(vcpu); +} + +static int kvm_cpu__set_lint(struct kvm_cpu *vcpu) +{ + struct local_apic lapic; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic)) + return -1; + + lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT; + lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI; + + return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic); +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int mmap_size; + int coalesced_offset; + + vcpu = kvm_cpu__new(kvm); + if (!vcpu) + return NULL; + + vcpu->cpu_id = cpu_id; + + vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO); + if (coalesced_offset) + vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE); + + if (kvm_cpu__set_lint(vcpu)) + die_perror("KVM_SET_LAPIC failed"); + + vcpu->is_running = true; + + return vcpu; +} + +static struct kvm_msrs *kvm_msrs__new(size_t nmsrs) +{ + struct kvm_msrs *vcpu = calloc(1, sizeof(*vcpu) + (sizeof(struct kvm_msr_entry) * nmsrs)); + + if (!vcpu) + die("out of memory"); + + return vcpu; +} + +#define KVM_MSR_ENTRY(_index, _data) \ + (struct kvm_msr_entry) { .index = _index, .data = _data } + +static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu) +{ + unsigned long ndx = 0; + + 
vcpu->msrs = kvm_msrs__new(100); + + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP, 0x0); +#ifdef CONFIG_X86_64 + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR, 0x0); +#endif + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING); + + vcpu->msrs->nmsrs = ndx; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0) + die_perror("KVM_SET_MSRS failed"); +} + +static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu) +{ + vcpu->fpu = (struct kvm_fpu) { + .fcw = 0x37f, + .mxcsr = 0x1f80, + }; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0) + die_perror("KVM_SET_FPU failed"); +} + +static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu) +{ + vcpu->regs = (struct kvm_regs) { + /* We start the guest in 16-bit real mode */ + .rflags = 0x0000000000000002ULL, + + .rip = vcpu->kvm->arch.boot_ip, + .rsp = vcpu->kvm->arch.boot_sp, + .rbp = vcpu->kvm->arch.boot_sp, + }; + + if (vcpu->regs.rip > USHRT_MAX) + die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0) + die_perror("KVM_SET_REGS failed"); +} + +static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu) +{ + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die_perror("KVM_GET_SREGS failed"); + + vcpu->sregs.cs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.cs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.ss.selector = vcpu->kvm->arch.boot_selector; + 
vcpu->sregs.ss.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.ds.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.ds.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.es.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.es.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.fs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.fs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.gs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.gs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0) + die_perror("KVM_SET_SREGS failed"); +} + +/** + * kvm_cpu__reset_vcpu - reset virtual CPU to a known state + */ +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + kvm_cpu__setup_cpuid(vcpu); + kvm_cpu__setup_sregs(vcpu); + kvm_cpu__setup_regs(vcpu); + kvm_cpu__setup_fpu(vcpu); + kvm_cpu__setup_msrs(vcpu); +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + return false; +} + +static void print_dtable(const char *name, struct kvm_dtable *dtable) +{ + dprintf(debug_fd, " %s %016llx %08hx\n", + name, (u64) dtable->base, (u16) dtable->limit); +} + +static void print_segment(const char *name, struct kvm_segment *seg) +{ + dprintf(debug_fd, " %s %04hx %016llx %08x %02hhx %x %x %x %x %x %x %x\n", + name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit, + (u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + unsigned long cr0, cr2, cr3; + unsigned long cr4, cr8; + unsigned long rax, rbx, rcx; + unsigned long rdx, rsi, rdi; + unsigned long rbp, r8, r9; + unsigned long r10, r11, r12; + unsigned long r13, r14, r15; + unsigned long rip, rsp; + struct kvm_sregs sregs; + unsigned long rflags; + struct kvm_regs regs; + int i; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, ®s) < 0) + die("KVM_GET_REGS failed"); + + rflags = 
regs.rflags; + + rip = regs.rip; rsp = regs.rsp; + rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx; + rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi; + rbp = regs.rbp; r8 = regs.r8; r9 = regs.r9; + r10 = regs.r10; r11 = regs.r11; r12 = regs.r12; + r13 = regs.r13; r14 = regs.r14; r15 = regs.r15; + + dprintf(debug_fd, "\n Registers:\n"); + dprintf(debug_fd, " ----------\n"); + dprintf(debug_fd, " rip: %016lx rsp: %016lx flags: %016lx\n", rip, rsp, rflags); + dprintf(debug_fd, " rax: %016lx rbx: %016lx rcx: %016lx\n", rax, rbx, rcx); + dprintf(debug_fd, " rdx: %016lx rsi: %016lx rdi: %016lx\n", rdx, rsi, rdi); + dprintf(debug_fd, " rbp: %016lx r8: %016lx r9: %016lx\n", rbp, r8, r9); + dprintf(debug_fd, " r10: %016lx r11: %016lx r12: %016lx\n", r10, r11, r12); + dprintf(debug_fd, " r13: %016lx r14: %016lx r15: %016lx\n", r13, r14, r15); + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_REGS failed"); + + cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3; + cr4 = sregs.cr4; cr8 = sregs.cr8; + + dprintf(debug_fd, " cr0: %016lx cr2: %016lx cr3: %016lx\n", cr0, cr2, cr3); + dprintf(debug_fd, " cr4: %016lx cr8: %016lx\n", cr4, cr8); + dprintf(debug_fd, "\n Segment registers:\n"); + dprintf(debug_fd, " ------------------\n"); + dprintf(debug_fd, " register selector base limit type p dpl db s l g avl\n"); + print_segment("cs ", &sregs.cs); + print_segment("ss ", &sregs.ss); + print_segment("ds ", &sregs.ds); + print_segment("es ", &sregs.es); + print_segment("fs ", &sregs.fs); + print_segment("gs ", &sregs.gs); + print_segment("tr ", &sregs.tr); + print_segment("ldt", &sregs.ldt); + print_dtable("gdt", &sregs.gdt); + print_dtable("idt", &sregs.idt); + + dprintf(debug_fd, "\n APIC:\n"); + dprintf(debug_fd, " -----\n"); + dprintf(debug_fd, " efer: %016llx apic base: %016llx nmi: %s\n", + (u64) sregs.efer, (u64) sregs.apic_base, + (vcpu->kvm->nmi_disabled ? 
"disabled" : "enabled")); + + dprintf(debug_fd, "\n Interrupt bitmap:\n"); + dprintf(debug_fd, " -----------------\n"); + for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) + dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]); + dprintf(debug_fd, "\n"); +} + +#define MAX_SYM_LEN 128 + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + unsigned int code_bytes = 64; + unsigned int code_prologue = 43; + unsigned int code_len = code_bytes; + char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym; + unsigned char c; + unsigned int i; + u8 *ip; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0) + die("KVM_GET_REGS failed"); + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die("KVM_GET_SREGS failed"); + + ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue); + + dprintf(debug_fd, "\n Code:\n"); + dprintf(debug_fd, " -----\n"); + + psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN); + if (IS_ERR(psym)) + dprintf(debug_fd, + "Warning: symbol_lookup() failed to find symbol " + "with error: %ld\n", PTR_ERR(psym)); + + dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym); + + for (i = 0; i < code_len; i++, ip++) { + if (!host_ptr_in_ram(vcpu->kvm, ip)) + break; + + c = *ip; + + if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip))) + dprintf(debug_fd, " <%02x>", c); + else + dprintf(debug_fd, " %02x", c); + } + + dprintf(debug_fd, "\n"); + + dprintf(debug_fd, "\n Stack:\n"); + dprintf(debug_fd, " ------\n"); + kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32); +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ + u64 *pte1; + u64 *pte2; + u64 *pte3; + u64 *pte4; + + if (!is_in_protected_mode(vcpu)) + return; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die("KVM_GET_SREGS failed"); + + pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3); + if (!host_ptr_in_ram(vcpu->kvm, pte4)) + return; + + pte3 = 
guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte3)) + return; + + pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte2)) + return; + + pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte1)) + return; + + dprintf(debug_fd, "Page Tables:\n"); + if (*pte2 & (1 << 7)) + dprintf(debug_fd, " pte4: %016llx pte3: %016llx" + " pte2: %016llx\n", + *pte4, *pte3, *pte2); + else + dprintf(debug_fd, " pte4: %016llx pte3: %016llx pte2: %016" + "llx pte1: %016llx\n", + *pte4, *pte3, *pte2, *pte1); +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ + struct kvm_lapic_state klapic; + struct local_apic *lapic = (void *)&klapic; + + if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0) + return; + + if (lapic->lvt_lint1.mask) + return; + + if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI) + return; + + ioctl(cpu->vcpu_fd, KVM_NMI); +} diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c new file mode 100644 index 000000000000..687e6b7acd4e --- /dev/null +++ b/tools/kvm/x86/kvm.c @@ -0,0 +1,381 @@ +#include "kvm/kvm.h" +#include "kvm/boot-protocol.h" +#include "kvm/cpufeature.h" +#include "kvm/interrupt.h" +#include "kvm/mptable.h" +#include "kvm/util.h" +#include "kvm/8250-serial.h" +#include "kvm/virtio-console.h" + +#include <asm/bootparam.h> +#include <linux/kvm.h> + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) }, + { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) }, + { DEFINE_KVM_EXT(KVM_CAP_PIT2) }, + { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) }, + { DEFINE_KVM_EXT(KVM_CAP_HLT) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) }, + { 
DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) }, + { 0, 0 } +}; + +bool kvm__arch_cpu_supports_vm(void) +{ + struct cpuid_regs regs; + u32 eax_base; + int feature; + + regs = (struct cpuid_regs) { + .eax = 0x00, + }; + host_cpuid(®s); + + switch (regs.ebx) { + case CPUID_VENDOR_INTEL_1: + eax_base = 0x00; + feature = KVM__X86_FEATURE_VMX; + break; + + case CPUID_VENDOR_AMD_1: + eax_base = 0x80000000; + feature = KVM__X86_FEATURE_SVM; + break; + + default: + return false; + } + + regs = (struct cpuid_regs) { + .eax = eax_base, + }; + host_cpuid(®s); + + if (regs.eax < eax_base + 0x01) + return false; + + regs = (struct cpuid_regs) { + .eax = eax_base + 0x01 + }; + host_cpuid(®s); + + return regs.ecx & (1 << feature); +} + +/* + * Allocating RAM size bigger than 4GB requires us to leave a gap + * in the RAM which is used for PCI MMIO, hotplug, and unconfigured + * devices (see documentation of e820_setup_gap() for details). + * + * If we're required to initialize RAM bigger than 4GB, we will create + * a gap between 0xe0000000 and 0x100000000 in the guest virtual mem space. 
+ */ + +void kvm__init_ram(struct kvm *kvm) +{ + u64 phys_start, phys_size; + void *host_mem; + + if (kvm->ram_size < KVM_32BIT_GAP_START) { + /* Use a single block of RAM for 32bit RAM */ + + phys_start = 0; + phys_size = kvm->ram_size; + host_mem = kvm->ram_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + } else { + /* First RAM range from zero to the PCI gap: */ + + phys_start = 0; + phys_size = KVM_32BIT_GAP_START; + host_mem = kvm->ram_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + + /* Second RAM range from 4GB to the end of RAM: */ + + phys_start = KVM_32BIT_MAX_MEM_SIZE; + phys_size = kvm->ram_size - phys_start; + host_mem = kvm->ram_start + phys_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + } +} + +/* Arch-specific commandline setup */ +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ + strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 " + "i8042.dumbkbd=1 i8042.nopnp=1"); + if (video) + strcat(cmdline, " video=vesafb console=tty0"); + else + strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1"); +} + +/* Architecture-specific KVM init */ +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + struct kvm_pit_config pit_config = { .flags = 0, }; + int ret; + + ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000); + if (ret < 0) + die_perror("KVM_SET_TSS_ADDR ioctl"); + + ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config); + if (ret < 0) + die_perror("KVM_CREATE_PIT2 ioctl"); + + if (ram_size < KVM_32BIT_GAP_START) { + kvm->ram_size = ram_size; + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size); + } else { + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE); + kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE; + if (kvm->ram_start != MAP_FAILED) + /* + * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that + * if we accidently write to 
it, we will know. + */ + mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE); + } + if (kvm->ram_start == MAP_FAILED) + die("out of memory"); + + madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE); + + ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP); + if (ret < 0) + die_perror("KVM_CREATE_IRQCHIP ioctl"); +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->ram_start, kvm->ram_size); +} + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + struct kvm_irq_level irq_level; + + irq_level = (struct kvm_irq_level) { + { + .irq = irq, + }, + .level = level, + }; + + if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0) + die_perror("KVM_IRQ_LINE failed"); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, 1); + kvm__irq_line(kvm, irq, 0); +} + +#define BOOT_LOADER_SELECTOR 0x1000 +#define BOOT_LOADER_IP 0x0000 +#define BOOT_LOADER_SP 0x8000 +#define BOOT_CMDLINE_OFFSET 0x20000 + +#define BOOT_PROTOCOL_REQUIRED 0x206 +#define LOAD_HIGH 0x01 + +static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset) +{ + unsigned long flat = segment_to_flat(selector, offset); + + return guest_flat_to_host(kvm, flat); +} + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline) +{ + void *p; + int nr; + + /* + * Some architectures may support loading an initrd alongside the flat kernel, + * but we do not. 
+ */ + if (fd_initrd != -1) + pr_warning("Loading initrd with flat binary not supported."); + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + kvm->arch.boot_selector = BOOT_LOADER_SELECTOR; + kvm->arch.boot_ip = BOOT_LOADER_IP; + kvm->arch.boot_sp = BOOT_LOADER_SP; + + return true; +} + +static const char *BZIMAGE_MAGIC = "HdrS"; + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + struct boot_params *kern_boot; + unsigned long setup_sects; + struct boot_params boot; + size_t cmdline_size; + ssize_t setup_size; + void *p; + int nr; + u16 vidmode; + + /* + * See Documentation/x86/boot.txt for details no bzImage on-disk and + * memory layout. + */ + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot)) + return false; + + if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC))) + return false; + + if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED) + die("Too old kernel"); + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (!boot.hdr.setup_sects) + boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS; + setup_sects = boot.hdr.setup_sects + 1; + + setup_size = setup_sects << 9; + p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP); + + /* copy setup.bin to mem*/ + if (read(fd_kernel, p, setup_size) != setup_size) + die_perror("read"); + + /* copy vmlinux.bin to BZ_KERNEL_START*/ + p = guest_flat_to_host(kvm, BZ_KERNEL_START); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET); + if (kernel_cmdline) { + cmdline_size = strlen(kernel_cmdline) + 1; + if (cmdline_size > boot.hdr.cmdline_size) + cmdline_size = boot.hdr.cmdline_size; + + memset(p, 0, boot.hdr.cmdline_size); + memcpy(p, kernel_cmdline, 
cmdline_size - 1); + } + + if (!kvm->cfg.arch.vidmode) + vidmode = -1; + + /* vidmode should be either specified or set by default */ + if (kvm->cfg.vnc || kvm->cfg.sdl) { + if (vidmode == -1) + vidmode = 0x312; + } else { + vidmode = 0; + } + + kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00); + + kern_boot->hdr.cmd_line_ptr = BOOT_CMDLINE_OFFSET; + kern_boot->hdr.type_of_loader = 0xff; + kern_boot->hdr.heap_end_ptr = 0xfe00; + kern_boot->hdr.loadflags |= CAN_USE_HEAP; + kern_boot->hdr.vid_mode = vidmode; + + /* + * Read initrd image into guest memory + */ + if (fd_initrd >= 0) { + struct stat initrd_stat; + unsigned long addr; + + if (fstat(fd_initrd, &initrd_stat)) + die_perror("fstat"); + + addr = boot.hdr.initrd_addr_max & ~0xfffff; + for (;;) { + if (addr < BZ_KERNEL_START) + die("Not enough memory for initrd"); + else if (addr < (kvm->ram_size - initrd_stat.st_size)) + break; + addr -= 0x100000; + } + + p = guest_flat_to_host(kvm, addr); + nr = read(fd_initrd, p, initrd_stat.st_size); + if (nr != initrd_stat.st_size) + die("Failed to read initrd"); + + kern_boot->hdr.ramdisk_image = addr; + kern_boot->hdr.ramdisk_size = initrd_stat.st_size; + } + + kvm->arch.boot_selector = BOOT_LOADER_SELECTOR; + /* + * The real-mode setup code starts at offset 0x200 of a bzImage. See + * Documentation/x86/boot.txt for details. + */ + kvm->arch.boot_ip = BOOT_LOADER_IP + 0x200; + kvm->arch.boot_sp = BOOT_LOADER_SP; + + return true; +} + +/** + * kvm__arch_setup_firmware - inject BIOS into guest system memory + * @kvm - guest system descriptor + * + * This function is a main routine where we poke guest memory + * and install BIOS there. 
+ */ +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + /* standart minimal configuration */ + setup_bios(kvm); + + /* FIXME: SMP, ACPI and friends here */ + + return 0; +} + +int kvm__arch_free_firmware(struct kvm *kvm) +{ + return 0; +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + serial8250__update_consoles(kvm); + virtio_console__inject_interrupt(kvm); +} diff --git a/tools/kvm/x86/mptable.c b/tools/kvm/x86/mptable.c new file mode 100644 index 000000000000..ea8c6e8c848f --- /dev/null +++ b/tools/kvm/x86/mptable.c @@ -0,0 +1,289 @@ +#include "kvm/kvm.h" +#include "kvm/bios.h" +#include "kvm/apic.h" +#include "kvm/mptable.h" +#include "kvm/util.h" +#include "kvm/irq.h" + +#include <linux/kernel.h> +#include <string.h> + +#include <asm/mpspec_def.h> +#include <linux/types.h> + +/* + * FIXME: please make sure the addresses borrowed + * for apic/ioapic never overlaped! We need a global + * tracker of system resources (including io, mmio, + * and friends). + */ + +static unsigned int mpf_checksum(unsigned char *mp, int len) +{ + unsigned int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu) +{ + /* sets enabled/disabled | BSP/AP processor */ + return ( (cpu < ncpu) ? CPU_ENABLED : 0) | + ((cpu == 0) ? CPU_BOOTPROCESSOR : 0x00); +} + +#define MPTABLE_SIG_FLOATING "_MP_" +#define MPTABLE_OEM "KVMCPU00" +#define MPTABLE_PRODUCTID "0.1 " +#define MPTABLE_PCIBUSTYPE "PCI " +#define MPTABLE_ISABUSTYPE "ISA " + +#define MPTABLE_STRNCPY(d, s) memcpy(d, s, sizeof(d)) + +/* It should be more than enough */ +#define MPTABLE_MAX_SIZE (32 << 20) + +/* + * Too many cpus will require x2apic mode + * and rather ACPI support so we limit it + * here for a while. 
+ */ +#define MPTABLE_MAX_CPUS 255 + +static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc, + u16 srcbusid, u16 srcbusirq, + u16 dstapic, u16 dstirq) +{ + *mpc_intsrc = (struct mpc_intsrc) { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQDIR_DEFAULT, + .srcbus = srcbusid, + .srcbusirq = srcbusirq, + .dstapic = dstapic, + .dstirq = dstirq + }; +} + +/** + * mptable_setup - create mptable and fill guest memory with it + */ +int mptable__init(struct kvm *kvm) +{ + unsigned long real_mpc_table, real_mpf_intel, size; + struct mpf_intel *mpf_intel; + struct mpc_table *mpc_table; + struct mpc_cpu *mpc_cpu; + struct mpc_bus *mpc_bus; + struct mpc_ioapic *mpc_ioapic; + struct mpc_intsrc *mpc_intsrc; + struct rb_node *pci_tree; + + const int pcibusid = 0; + const int isabusid = 1; + + unsigned int i, nentries = 0, ncpus = kvm->nrcpus; + unsigned int ioapicid; + void *last_addr; + + /* That is where MP table will be in guest memory */ + real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16); + + if (ncpus > MPTABLE_MAX_CPUS) { + pr_warning("Too many cpus: %d limited to %d", + ncpus, MPTABLE_MAX_CPUS); + ncpus = MPTABLE_MAX_CPUS; + } + + mpc_table = calloc(1, MPTABLE_MAX_SIZE); + if (!mpc_table) + return -ENOMEM; + + MPTABLE_STRNCPY(mpc_table->signature, MPC_SIGNATURE); + MPTABLE_STRNCPY(mpc_table->oem, MPTABLE_OEM); + MPTABLE_STRNCPY(mpc_table->productid, MPTABLE_PRODUCTID); + + mpc_table->spec = 4; + mpc_table->lapic = APIC_ADDR(0); + mpc_table->oemcount = ncpus; /* will be updated again at end */ + + /* + * CPUs enumeration. Technically speaking we should + * ask either host or HV for apic version supported + * but for a while we simply put some random value + * here. 
+ */ + mpc_cpu = (void *)&mpc_table[1]; + for (i = 0; i < ncpus; i++) { + mpc_cpu->type = MP_PROCESSOR; + mpc_cpu->apicid = i; + mpc_cpu->apicver = KVM_APIC_VERSION; + mpc_cpu->cpuflag = gen_cpu_flag(i, ncpus); + mpc_cpu->cpufeature = 0x600; /* some default value */ + mpc_cpu->featureflag = 0x201; /* some default value */ + mpc_cpu++; + } + + last_addr = (void *)mpc_cpu; + nentries += ncpus; + + /* + * PCI buses. + * FIXME: Some callback here to obtain real number + * of PCI buses present in system. + */ + mpc_bus = last_addr; + mpc_bus->type = MP_BUS; + mpc_bus->busid = pcibusid; + MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE); + + last_addr = (void *)&mpc_bus[1]; + nentries++; + + /* + * ISA bus. + * FIXME: Same issue as for PCI bus. + */ + mpc_bus = last_addr; + mpc_bus->type = MP_BUS; + mpc_bus->busid = isabusid; + MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE); + + last_addr = (void *)&mpc_bus[1]; + nentries++; + + /* + * IO-APIC chip. + */ + ioapicid = ncpus + 1; + mpc_ioapic = last_addr; + mpc_ioapic->type = MP_IOAPIC; + mpc_ioapic->apicid = ioapicid; + mpc_ioapic->apicver = KVM_APIC_VERSION; + mpc_ioapic->flags = MPC_APIC_USABLE; + mpc_ioapic->apicaddr = IOAPIC_ADDR(0); + + last_addr = (void *)&mpc_ioapic[1]; + nentries++; + + /* + * IRQ sources. + * + * FIXME: Same issue as with buses. We definitely + * need kind of collector routine which enumerate + * resources used first and pass them here. + * At moment we know we have only virtio block device + * and virtio console but this is g00berfish. + * + * Also note we use PCI irqs here, no for ISA bus yet. 
+ */ + + for (pci_tree = irq__get_pci_tree(); pci_tree; pci_tree = rb_next(pci_tree)) { + struct pci_dev *dev = rb_entry(pci_tree, struct pci_dev, node); + struct irq_line *irq_line; + + list_for_each_entry(irq_line, &dev->lines, node) { + unsigned char srcbusirq; + + srcbusirq = (dev->id << 2) | (dev->pin - 1); + + mpc_intsrc = last_addr; + + mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, irq_line->line); + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + } + } + + /* + * Local IRQs assignment (LINT0, LINT1) + */ + mpc_intsrc = last_addr; + mpc_intsrc->type = MP_LINTSRC; + mpc_intsrc->irqtype = mp_ExtINT; + mpc_intsrc->irqtype = mp_INT; + mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT; + mpc_intsrc->srcbus = isabusid; + mpc_intsrc->srcbusirq = 0; + mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */ + mpc_intsrc->dstirq = 0; /* LINT0 */ + + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + + mpc_intsrc = last_addr; + mpc_intsrc->type = MP_LINTSRC; + mpc_intsrc->irqtype = mp_NMI; + mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT; + mpc_intsrc->srcbus = isabusid; + mpc_intsrc->srcbusirq = 0; + mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */ + mpc_intsrc->dstirq = 1; /* LINT1 */ + + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + + /* + * Floating MP table finally. + */ + real_mpf_intel = ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16); + mpf_intel = (void *)((unsigned long)mpc_table + real_mpf_intel); + + MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING); + mpf_intel->length = 1; + mpf_intel->specification= 4; + mpf_intel->physptr = (unsigned int)real_mpc_table; + mpf_intel->checksum = -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel)); + + /* + * No last_addr inclrement here please, we need last + * active position here to compute table size. + */ + + /* + * Don't forget to update header in fixed table. 
+ */ + mpc_table->oemcount = nentries; + mpc_table->length = last_addr - (void *)mpc_table; + mpc_table->checksum = -mpf_checksum((unsigned char *)mpc_table, mpc_table->length); + + + /* + * We will copy the whole table, no need to separate + * floating structure and table itkvm. + */ + size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table; + + /* + * The finial check -- never get out of system bios + * area. Lets also check for allocated memory overrun, + * in real it's late but still usefull. + */ + + if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) || + size > MPTABLE_MAX_SIZE) { + free(mpc_table); + pr_err("MP table is too big"); + + return -E2BIG; + } + + /* + * OK, it is time to move it to guest memory. + */ + memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size); + + free(mpc_table); + + return 0; +} +firmware_init(mptable__init); + +int mptable__exit(struct kvm *kvm) +{ + return 0; +} +firmware_exit(mptable__exit); diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 5a824e355d04..bb8b3db0e583 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -1463,7 +1462,8 @@ static int event_read_fields(struct event_format *event, struct format_field **f if (read_expect_type(EVENT_ITEM, &token)) goto fail; - /* add signed type */ + if (strtoul(token, NULL, 0)) + field->flags |= FIELD_IS_SIGNED; free_token(token); if (read_expected(EVENT_OP, ";") < 0) @@ -1785,6 +1785,8 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) strcmp(token, "/") == 0 || strcmp(token, "<") == 0 || strcmp(token, ">") == 0 || + strcmp(token, "<=") == 0 || + strcmp(token, ">=") == 0 || strcmp(token, "==") == 0 || strcmp(token, "!=") == 0) { @@ -2481,7 +2483,7 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** free_token(token); arg = alloc_arg(); - if (!field) { + if (!arg) { do_warning("%s: not enough memory!", __func__); *tok = NULL; return EVENT_ERROR; diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 24a4bbabc5d5..7be7e89533e4 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/event-utils.h b/tools/lib/traceevent/event-utils.h index bc075006966e..e76c9acb92cd 100644 --- a/tools/lib/traceevent/event-utils.h +++ b/tools/lib/traceevent/event-utils.h @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 5ea4326ad11f..2500e75583fc 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/parse-utils.c b/tools/lib/traceevent/parse-utils.c index f023a133abb6..bba701cf10e6 100644 --- a/tools/lib/traceevent/parse-utils.c +++ b/tools/lib/traceevent/parse-utils.c @@ -1,3 +1,22 @@ +/* + * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c index b1ccc923e8a5..a57db805136a 100644 --- a/tools/lib/traceevent/trace-seq.c +++ b/tools/lib/traceevent/trace-seq.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index ef6d22e879eb..eb30044a922a 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -222,10 +222,14 @@ install-pdf: pdf #install-html: html # '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) $(OUTPUT)PERF-VERSION-FILE -include $(OUTPUT)PERF-VERSION-FILE +endif +endif # # Determine "include::" file references in asciidoc files. diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt index c1057701a7dc..8e798baae0fd 100644 --- a/tools/perf/Documentation/perf-buildid-cache.txt +++ b/tools/perf/Documentation/perf-buildid-cache.txt @@ -24,6 +24,9 @@ OPTIONS -r:: --remove=:: Remove specified file from the cache. +-M:: +--missing=:: + List missing build ids in the cache for the specified file. 
-v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index 194f37d635df..5b3123d5721f 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -22,10 +22,6 @@ specified perf.data files. OPTIONS ------- --M:: ---displacement:: - Show position displacement relative to baseline. - -D:: --dump-raw-trace:: Dump raw trace in ASCII. diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f4d91bebd59d..848a0dcb6dfd 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -57,11 +57,44 @@ OPTIONS -s:: --sort=:: - Sort by key(s): pid, comm, dso, symbol, parent, srcline. + Sort histogram entries by given key(s) - multiple keys can be specified + in CSV format. Following sort keys are available: + pid, comm, dso, symbol, parent, cpu, srcline. + + Each key has following meaning: + + - comm: command (name) of the task which can be read via /proc/<pid>/comm + - pid: command and tid of the task + - dso: name of library or module executed at the time of sample + - symbol: name of function executed at the time of sample + - parent: name of function matched to the parent regex filter. Unmatched + entries are displayed as "[other]". + - cpu: cpu number the task ran at the time of sample + - srcline: filename and line number executed at the time of sample. The + DWARF debuggin info must be provided. + + By default, comm, dso and symbol keys are used. + (i.e. --sort comm,dso,symbol) + + If --branch-stack option is used, following sort keys are also + available: + dso_from, dso_to, symbol_from, symbol_to, mispredict. 
+ + - dso_from: name of library or module branched from + - dso_to: name of library or module branched to + - symbol_from: name of function branched from + - symbol_to: name of function branched to + - mispredict: "N" for predicted branch, "Y" for mispredicted branch + + And default sort keys are changed to comm, dso_from, symbol_from, dso_to + and symbol_to, see '--branch-stack'. -p:: --parent=<regex>:: - regex filter to identify parent, see: '--sort parent' + A regex filter to identify parent. The parent is a caller of this + function and searched through the callchain, thus it requires callchain + information recorded. The pattern is in the exteneded regex format and + defaults to "\^sys_|^do_page_fault", see '--sort parent'. -x:: --exclude-other:: @@ -74,7 +107,6 @@ OPTIONS -t:: --field-separator=:: - Use a special separator character and don't pad with spaces, replacing all occurrences of this separator in symbol names (and other output) with a '.' character, that thus it's the only non valid separator. 
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index a4027f221a53..9f1f054b8432 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -336,7 +336,6 @@ scripts listed by the 'perf script -l' command e.g.: ---- root@tropicana:~# perf script -l List of available trace scripts: - workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency rw-by-file <comm> r/w activity for a program, by file rw-by-pid system-wide r/w activity @@ -402,7 +401,6 @@ should show a new entry for your script: ---- root@tropicana:~# perf script -l List of available trace scripts: - workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency rw-by-file <comm> r/w activity for a program, by file rw-by-pid system-wide r/w activity diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index b24ac40fcd58..d1d3e5121f89 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -23,6 +23,10 @@ from 'perf test list'. OPTIONS ------- +-s:: +--skip:: + Tests to skip (comma separater numeric list). + -v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 5b80d84d6b4a..a414bc95fd52 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -60,7 +60,7 @@ Default is to monitor all CPUS. -i:: --inherit:: - Child tasks inherit counters, only makes sens with -p option. + Child tasks do not inherit counters. 
-k <path>:: --vmlinux=<path>:: diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 8ab05e543ef4..b62dbc0d974a 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -50,7 +50,6 @@ include config/utilities.mak $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) --include $(OUTPUT)PERF-VERSION-FILE uname_M := $(shell uname -m 2>/dev/null || echo not) @@ -104,7 +103,7 @@ ifdef PARSER_DEBUG endif CFLAGS = -fno-omit-frame-pointer -ggdb3 -funwind-tables -Wall -Wextra -std=gnu99 $(CFLAGS_WERROR) $(CFLAGS_OPTIMIZE) $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) $(PARSER_DEBUG_CFLAGS) -EXTLIBS = -lpthread -lrt -lelf -lm +EXTLIBS = -lpthread -lrt -lelf -lm -lnuma ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE ALL_LDFLAGS = $(LDFLAGS) STRIP ?= strip @@ -153,6 +152,8 @@ INSTALL = install # explicitly what architecture to check for. Fix this up for yours.. SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) -include config/feature-tests.mak ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -fstack-protector-all,-fstack-protector-all),y) @@ -206,6 +207,8 @@ ifeq ($(call try-cc,$(SOURCE_BIONIC),$(CFLAGS),bionic),y) EXTLIBS := $(filter-out -lpthread,$(EXTLIBS)) BASIC_CFLAGS += -I. 
endif +endif # MAKECMDGOALS != tags +endif # MAKECMDGOALS != clean # Guard against environment variables BUILTIN_OBJS = @@ -230,11 +233,19 @@ endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a TE_LIB := -L$(TE_PATH) -ltraceevent +export LIBTRACEEVENT + +# python extension build directories +PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ +PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/ +PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/ +export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP + +python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so + PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources) PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py -export LIBTRACEEVENT - $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \ --quiet build_ext; \ @@ -378,8 +389,11 @@ LIB_H += util/rblist.h LIB_H += util/intlist.h LIB_H += util/perf_regs.h LIB_H += util/unwind.h -LIB_H += ui/helpline.h LIB_H += util/vdso.h +LIB_H += ui/helpline.h +LIB_H += ui/progress.h +LIB_H += ui/util.h +LIB_H += ui/ui.h LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o @@ -453,6 +467,7 @@ LIB_OBJS += $(OUTPUT)util/stat.o LIB_OBJS += $(OUTPUT)ui/setup.o LIB_OBJS += $(OUTPUT)ui/helpline.o LIB_OBJS += $(OUTPUT)ui/progress.o +LIB_OBJS += $(OUTPUT)ui/util.o LIB_OBJS += $(OUTPUT)ui/hist.o LIB_OBJS += $(OUTPUT)ui/stdio/hist.o @@ -471,11 +486,13 @@ LIB_OBJS += $(OUTPUT)tests/rdpmc.o LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o LIB_OBJS += $(OUTPUT)tests/pmu.o -LIB_OBJS += $(OUTPUT)tests/util.o +LIB_OBJS += $(OUTPUT)tests/hists_link.o +LIB_OBJS += $(OUTPUT)tests/python-use.o BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o BUILTIN_OBJS += $(OUTPUT)builtin-bench.o # Benchmark modules +BUILTIN_OBJS += $(OUTPUT)bench/numa.o BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o ifeq ($(RAW_ARCH),x86_64) @@ -510,14 
+527,13 @@ PERFLIBS = $(LIB_FILE) $(LIBTRACEEVENT) # # Platform specific tweaks # +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) # We choose to avoid "if .. else if .. else .. endif endif" # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... --include config.mak.autogen --include config.mak - ifdef NO_LIBELF NO_DWARF := 1 NO_DEMANGLE := 1 @@ -646,7 +662,6 @@ ifndef NO_NEWT LIB_OBJS += $(OUTPUT)ui/browsers/hists.o LIB_OBJS += $(OUTPUT)ui/browsers/map.o LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o - LIB_OBJS += $(OUTPUT)ui/util.o LIB_OBJS += $(OUTPUT)ui/tui/setup.o LIB_OBJS += $(OUTPUT)ui/tui/util.o LIB_OBJS += $(OUTPUT)ui/tui/helpline.o @@ -655,9 +670,6 @@ ifndef NO_NEWT LIB_H += ui/browsers/map.h LIB_H += ui/keysyms.h LIB_H += ui/libslang.h - LIB_H += ui/progress.h - LIB_H += ui/util.h - LIB_H += ui/ui.h endif endif @@ -673,14 +685,11 @@ ifndef NO_GTK2 BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null) EXTLIBS += $(shell pkg-config --libs gtk+-2.0 2>/dev/null) LIB_OBJS += $(OUTPUT)ui/gtk/browser.o + LIB_OBJS += $(OUTPUT)ui/gtk/hists.o LIB_OBJS += $(OUTPUT)ui/gtk/setup.o LIB_OBJS += $(OUTPUT)ui/gtk/util.o LIB_OBJS += $(OUTPUT)ui/gtk/helpline.o LIB_OBJS += $(OUTPUT)ui/gtk/progress.o - # Make sure that it'd be included only once. 
- ifeq ($(findstring -DNEWT_SUPPORT,$(BASIC_CFLAGS)),) - LIB_OBJS += $(OUTPUT)ui/util.o - endif endif endif @@ -707,7 +716,7 @@ disable-python = $(eval $(disable-python_code)) define disable-python_code BASIC_CFLAGS += -DNO_LIBPYTHON $(if $(1),$(warning No $(1) was found)) - $(warning Python support won't be built) + $(warning Python support will not be built) endef override PYTHON := \ @@ -715,19 +724,10 @@ override PYTHON := \ ifndef PYTHON $(call disable-python,python interpreter) - python-clean := else PYTHON_WORD := $(call shell-wordify,$(PYTHON)) - # python extension build directories - PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ - PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/ - PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/ - export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP - - python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so - ifdef NO_LIBPYTHON $(call disable-python) else @@ -843,6 +843,9 @@ ifdef ASCIIDOC8 export ASCIIDOC8 endif +endif # MAKECMDGOALS != tags +endif # MAKECMDGOALS != clean + # Shell quote (do not use $(call) to accommodate ancient setups); ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) @@ -884,7 +887,7 @@ strip: $(PROGRAMS) $(OUTPUT)perf $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ + $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@ @@ -948,7 +951,13 @@ $(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS $(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \ - '-DBINDIR="$(bindir_SQ)"' \ + '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \ + $< + +$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS + $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \ + -DPYTHONPATH='"$(OUTPUT)python"' \ + -DPYTHON='"$(PYTHON_WORD)"' \ $< $(OUTPUT)util/config.o: 
util/config.c $(OUTPUT)PERF-CFLAGS @@ -1099,7 +1108,7 @@ perfexec_instdir = $(prefix)/$(perfexecdir) endif perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) -install: all try-install-man +install-bin: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace' @@ -1120,6 +1129,8 @@ install: all try-install-man $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' +install: install-bin try-install-man + install-python_ext: $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)' diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 8f89998eeaf4..a5223e6a7b43 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -1,6 +1,7 @@ #ifndef BENCH_H #define BENCH_H +extern int bench_numa(int argc, const char **argv, const char *prefix); extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); extern int bench_mem_memcpy(int argc, const char **argv, diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c new file mode 100644 index 000000000000..30d1c3225b46 --- /dev/null +++ b/tools/perf/bench/numa.c @@ -0,0 +1,1731 @@ +/* + * numa.c + * + * numa: Simulate NUMA-sensitive workload and measure their NUMA performance + */ + +#include "../perf.h" +#include "../builtin.h" +#include "../util/util.h" +#include "../util/parse-options.h" + +#include "bench.h" + +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <assert.h> +#include <malloc.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <sys/types.h> + 
+#include <numa.h> +#include <numaif.h> + +/* + * Regular printout to the terminal, supressed if -q is specified: + */ +#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) + +/* + * Debug printf: + */ +#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) + +struct thread_data { + int curr_cpu; + cpu_set_t bind_cpumask; + int bind_node; + u8 *process_data; + int process_nr; + int thread_nr; + int task_nr; + unsigned int loops_done; + u64 val; + u64 runtime_ns; + pthread_mutex_t *process_lock; +}; + +/* Parameters set by options: */ + +struct params { + /* Startup synchronization: */ + bool serialize_startup; + + /* Task hierarchy: */ + int nr_proc; + int nr_threads; + + /* Working set sizes: */ + const char *mb_global_str; + const char *mb_proc_str; + const char *mb_proc_locked_str; + const char *mb_thread_str; + + double mb_global; + double mb_proc; + double mb_proc_locked; + double mb_thread; + + /* Access patterns to the working set: */ + bool data_reads; + bool data_writes; + bool data_backwards; + bool data_zero_memset; + bool data_rand_walk; + u32 nr_loops; + u32 nr_secs; + u32 sleep_usecs; + + /* Working set initialization: */ + bool init_zero; + bool init_random; + bool init_cpu0; + + /* Misc options: */ + int show_details; + int run_all; + int thp; + + long bytes_global; + long bytes_process; + long bytes_process_locked; + long bytes_thread; + + int nr_tasks; + bool show_quiet; + + bool show_convergence; + bool measure_convergence; + + int perturb_secs; + int nr_cpus; + int nr_nodes; + + /* Affinity options -C and -N: */ + char *cpu_list_str; + char *node_list_str; +}; + + +/* Global, read-writable area, accessible to all processes and threads: */ + +struct global_info { + u8 *data; + + pthread_mutex_t startup_mutex; + int nr_tasks_started; + + pthread_mutex_t startup_done_mutex; + + pthread_mutex_t start_work_mutex; + int nr_tasks_working; + + pthread_mutex_t stop_work_mutex; + u64 bytes_done; + 
+ struct thread_data *threads; + + /* Convergence latency measurement: */ + bool all_converged; + bool stop_work; + + int print_once; + + struct params p; +}; + +static struct global_info *g = NULL; + +static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); +static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); + +struct params p0; + +static const struct option options[] = { + OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"), + OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"), + + OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"), + OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"), + OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), + OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), + + OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"), + OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"), + OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), + + OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), + OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), + OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), + OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), + OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"), + + + OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"), + OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"), + OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"), + 
OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"), + + OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), + OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), + OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), + OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), + OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), + OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"), + OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), + + /* Special option string parsing callbacks: */ + OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", + "bind the first N tasks to these specific cpus (the rest is unbound)", + parse_cpus_opt), + OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", + "bind the first N tasks to these specific memory nodes (the rest is unbound)", + parse_nodes_opt), + OPT_END() +}; + +static const char * const bench_numa_usage[] = { + "perf bench numa <options>", + NULL +}; + +static const char * const numa_usage[] = { + "perf bench numa mem [<options>]", + NULL +}; + +static cpu_set_t bind_to_cpu(int target_cpu) +{ + cpu_set_t orig_mask, mask; + int ret; + + ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_cpu == -1) { + int cpu; + + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); + CPU_SET(target_cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static cpu_set_t bind_to_node(int target_node) +{ + int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; + cpu_set_t orig_mask, mask; + int cpu; + int ret; + + BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); + BUG_ON(!cpus_per_node); + + ret = 
sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_node == -1) { + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + int cpu_start = (target_node + 0) * cpus_per_node; + int cpu_stop = (target_node + 1) * cpus_per_node; + + BUG_ON(cpu_stop > g->p.nr_cpus); + + for (cpu = cpu_start; cpu < cpu_stop; cpu++) + CPU_SET(cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static void bind_to_cpumask(cpu_set_t mask) +{ + int ret; + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); +} + +static void mempol_restore(void) +{ + int ret; + + ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); + + BUG_ON(ret); +} + +static void bind_to_memnode(int node) +{ + unsigned long nodemask; + int ret; + + if (node == -1) + return; + + BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); + nodemask = 1L << node; + + ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); + dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + + BUG_ON(ret); +} + +#define HPSIZE (2*1024*1024) + +#define set_taskname(fmt...) 
\ +do { \ + char name[20]; \ + \ + snprintf(name, 20, fmt); \ + prctl(PR_SET_NAME, name); \ +} while (0) + +static u8 *alloc_data(ssize_t bytes0, int map_flags, + int init_zero, int init_cpu0, int thp, int init_random) +{ + cpu_set_t orig_mask; + ssize_t bytes; + u8 *buf; + int ret; + + if (!bytes0) + return NULL; + + /* Allocate and initialize all memory on CPU#0: */ + if (init_cpu0) { + orig_mask = bind_to_node(0); + bind_to_memnode(0); + } + + bytes = bytes0 + HPSIZE; + + buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); + BUG_ON(buf == (void *)-1); + + if (map_flags == MAP_PRIVATE) { + if (thp > 0) { + ret = madvise(buf, bytes, MADV_HUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); + } + } + if (thp < 0) { + ret = madvise(buf, bytes, MADV_NOHUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n"); + } + } + } + + if (init_zero) { + bzero(buf, bytes); + } else { + /* Initialize random contents, different in each word: */ + if (init_random) { + u64 *wbuf = (void *)buf; + long off = rand(); + long i; + + for (i = 0; i < bytes/8; i++) + wbuf[i] = i + off; + } + } + + /* Align to 2MB boundary: */ + buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); + + /* Restore affinity: */ + if (init_cpu0) { + bind_to_cpumask(orig_mask); + mempol_restore(); + } + + return buf; +} + +static void free_data(void *data, ssize_t bytes) +{ + int ret; + + if (!data) + return; + + ret = munmap(data, bytes); + BUG_ON(ret); +} + +/* + * Create a shared memory buffer that can be shared between processes, zeroed: + */ +static void * zalloc_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Create a shared memory buffer that can be shared between processes: + 
*/ +static void * setup_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Allocate process-local memory - this will either be shared between + * threads of this process, or only be accessed by this thread: + */ +static void * setup_private_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Return a process-shared (global) mutex: + */ +static void init_global_mutex(pthread_mutex_t *mutex) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(mutex, &attr); +} + +static int parse_cpu_list(const char *arg) +{ + p0.cpu_list_str = strdup(arg); + + dprintf("got CPU list: {%s}\n", p0.cpu_list_str); + + return 0; +} + +static void parse_setup_cpu_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.cpu_list_str) + return; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.cpu_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to CPUs:\n"); + tprintf("# "); + + while (true) { + int bind_cpu, bind_cpu_0, bind_cpu_1; + char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; + int bind_len; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single CPU specified: */ + bind_cpu_0 = bind_cpu_1 = atol(tok); + } else { + /* CPU range specified (for example: "5-11"): */ + bind_cpu_0 = atol(tok); + bind_cpu_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_cpus); + } + + /* + * Mask length. + * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', + * where the _4 means the next 4 CPUs are allowed. 
+ */ + bind_len = 1; + tok_len = strstr(tok, "_"); + if (tok_len) { + bind_len = atol(tok_len + 1); + BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); + + BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus); + BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus); + BUG_ON(bind_cpu_0 > bind_cpu_1); + + for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { + int i; + + for (i = 0; i < mul; i++) { + int cpu; + + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu); + goto out; + } + td = g->threads + t; + + if (t) + tprintf(","); + if (bind_len > 1) { + tprintf("%2d/%d", bind_cpu, bind_len); + } else { + tprintf("%2d", bind_cpu); + } + + CPU_ZERO(&td->bind_cpumask); + for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { + BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); + CPU_SET(cpu, &td->bind_cpumask); + } + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); +} + +static int parse_cpus_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_cpu_list(arg); +} + +static int parse_node_list(const char *arg) +{ + p0.node_list_str = strdup(arg); + + dprintf("got NODE list: {%s}\n", p0.node_list_str); + + return 0; +} + +static void parse_setup_node_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.node_list_str) + return; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.node_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to NODEs:\n"); + tprintf("# "); + + while (true) { + 
int bind_node, bind_node_0, bind_node_1; + char *tok, *tok_end, *tok_step, *tok_mul; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single NODE specified: */ + bind_node_0 = bind_node_1 = atol(tok); + } else { + /* NODE range specified (for example: "5-11"): */ + bind_node_0 = atol(tok); + bind_node_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_nodes); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); + + BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes); + BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes); + BUG_ON(bind_node_0 > bind_node_1); + + for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { + int i; + + for (i = 0; i < mul; i++) { + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node); + goto out; + } + td = g->threads + t; + + if (!t) + tprintf(" %2d", bind_node); + else + tprintf(",%2d", bind_node); + + td->bind_node = bind_node; + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); +} + +static int parse_nodes_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_node_list(arg); + + return 0; +} + +#define BIT(x) (1ul << x) + +static inline uint32_t lfsr_32(uint32_t lfsr) +{ + const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31); + return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps); +} + +/* + * Make sure there's real data 
dependency to RAM (when read + * accesses are enabled), so the compiler, the CPU and the + * kernel (KSM, zero page, etc.) cannot optimize away RAM + * accesses: + */ +static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) +{ + if (g->p.data_reads) + val += *data; + if (g->p.data_writes) + *data = val + 1; + return val; +} + +/* + * The worker process does two types of work, a forwards going + * loop and a backwards going loop. + * + * We do this so that on multiprocessor systems we do not create + * a 'train' of processing, with highly synchronized processes, + * skewing the whole benchmark. + */ +static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val) +{ + long words = bytes/sizeof(u64); + u64 *data = (void *)__data; + long chunk_0, chunk_1; + u64 *d0, *d, *d1; + long off; + long i; + + BUG_ON(!data && words); + BUG_ON(data && !words); + + if (!data) + return val; + + /* Very simple memset() work variant: */ + if (g->p.data_zero_memset && !g->p.data_rand_walk) { + bzero(data, bytes); + return val; + } + + /* Spread out by PID/TID nr and by loop nr: */ + chunk_0 = words/nr_max; + chunk_1 = words/g->p.nr_loops; + off = nr*chunk_0 + loop*chunk_1; + + while (off >= words) + off -= words; + + if (g->p.data_rand_walk) { + u32 lfsr = nr + loop + val; + int j; + + for (i = 0; i < words/1024; i++) { + long start, end; + + lfsr = lfsr_32(lfsr); + + start = lfsr % words; + end = min(start + 1024, words-1); + + if (g->p.data_zero_memset) { + bzero(data + start, (end-start) * sizeof(u64)); + } else { + for (j = start; j < end; j++) + val = access_data(data + j, val); + } + } + } else if (!g->p.data_backwards || (nr + loop) & 1) { + + d0 = data + off; + d = data + off + 1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d >= d1)) + d = data; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d++; + } + } else { + /* Process data backwards: */ + + d0 = data + off; + d = data + off - 
1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d < data)) + d = data + words-1; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d--; + } + } + + return val; +} + +static void update_curr_cpu(int task_nr, unsigned long bytes_worked) +{ + unsigned int cpu; + + cpu = sched_getcpu(); + + g->threads[task_nr].curr_cpu = cpu; + prctl(0, bytes_worked); +} + +#define MAX_NR_NODES 64 + +/* + * Count the number of nodes a process's threads + * are spread out on. + * + * A count of 1 means that the process is compressed + * to a single node. A count of g->p.nr_nodes means it's + * spread out on the whole system. + */ +static int count_process_nodes(int process_nr) +{ + char node_present[MAX_NR_NODES] = { 0, }; + int nodes; + int n, t; + + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int node; + + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + node = numa_node_of_cpu(td->curr_cpu); + node_present[node] = 1; + } + + nodes = 0; + + for (n = 0; n < MAX_NR_NODES; n++) + nodes += node_present[n]; + + return nodes; +} + +/* + * Count the number of distinct process-threads a node contains. + * + * A count of 1 means that the node contains only a single + * process. If all nodes on the system contain at most one + * process then we are well-converged. 
+ */ +static int count_node_processes(int node) +{ + int processes = 0; + int t, p; + + for (p = 0; p < g->p.nr_proc; p++) { + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int n; + + task_nr = p*g->p.nr_threads + t; + td = g->threads + task_nr; + + n = numa_node_of_cpu(td->curr_cpu); + if (n == node) { + processes++; + break; + } + } + } + + return processes; +} + +static void calc_convergence_compression(int *strong) +{ + unsigned int nodes_min, nodes_max; + int p; + + nodes_min = -1; + nodes_max = 0; + + for (p = 0; p < g->p.nr_proc; p++) { + unsigned int nodes = count_process_nodes(p); + + nodes_min = min(nodes, nodes_min); + nodes_max = max(nodes, nodes_max); + } + + /* Strong convergence: all threads compress on a single node: */ + if (nodes_min == 1 && nodes_max == 1) { + *strong = 1; + } else { + *strong = 0; + tprintf(" {%d-%d}", nodes_min, nodes_max); + } +} + +static void calc_convergence(double runtime_ns_max, double *convergence) +{ + unsigned int loops_done_min, loops_done_max; + int process_groups; + int nodes[MAX_NR_NODES]; + int distance; + int nr_min; + int nr_max; + int strong; + int sum; + int nr; + int node; + int cpu; + int t; + + if (!g->p.show_convergence && !g->p.measure_convergence) + return; + + for (node = 0; node < g->p.nr_nodes; node++) + nodes[node] = 0; + + loops_done_min = -1; + loops_done_max = 0; + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + unsigned int loops_done; + + cpu = td->curr_cpu; + + /* Not all threads have written it yet: */ + if (cpu < 0) + continue; + + node = numa_node_of_cpu(cpu); + + nodes[node]++; + + loops_done = td->loops_done; + loops_done_min = min(loops_done, loops_done_min); + loops_done_max = max(loops_done, loops_done_max); + } + + nr_max = 0; + nr_min = g->p.nr_tasks; + sum = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + nr = nodes[node]; + nr_min = min(nr, nr_min); + nr_max = max(nr, nr_max); + sum += nr; + } + 
BUG_ON(nr_min > nr_max); + + BUG_ON(sum > g->p.nr_tasks); + + if (0 && (sum < g->p.nr_tasks)) + return; + + /* + * Count the number of distinct process groups present + * on nodes - when we are converged this will decrease + * to g->p.nr_proc: + */ + process_groups = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + int processes = count_node_processes(node); + + nr = nodes[node]; + tprintf(" %2d/%-2d", nr, processes); + + process_groups += processes; + } + + distance = nr_max - nr_min; + + tprintf(" [%2d/%-2d]", distance, process_groups); + + tprintf(" l:%3d-%-3d (%3d)", + loops_done_min, loops_done_max, loops_done_max-loops_done_min); + + if (loops_done_min && loops_done_max) { + double skew = 1.0 - (double)loops_done_min/loops_done_max; + + tprintf(" [%4.1f%%]", skew * 100.0); + } + + calc_convergence_compression(&strong); + + if (strong && process_groups == g->p.nr_proc) { + if (!*convergence) { + *convergence = runtime_ns_max; + tprintf(" (%6.1fs converged)\n", *convergence/1e9); + if (g->p.measure_convergence) { + g->all_converged = true; + g->stop_work = true; + } + } + } else { + if (*convergence) { + tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9); + *convergence = 0; + } + tprintf("\n"); + } +} + +static void show_summary(double runtime_ns_max, int l, double *convergence) +{ + tprintf("\r # %5.1f%% [%.1f mins]", + (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0); + + calc_convergence(runtime_ns_max, convergence); + + if (g->p.show_details >= 0) + fflush(stdout); +} + +static void *worker_thread(void *__tdata) +{ + struct thread_data *td = __tdata; + struct timeval start0, start, stop, diff; + int process_nr = td->process_nr; + int thread_nr = td->thread_nr; + unsigned long last_perturbance; + int task_nr = td->task_nr; + int details = g->p.show_details; + int first_task, last_task; + double convergence = 0; + u64 val = td->val; + double runtime_ns_max; + u8 *global_data; + u8 *process_data; + u8 *thread_data; + u64 bytes_done; + 
long work_done; + u32 l; + + bind_to_cpumask(td->bind_cpumask); + bind_to_memnode(td->bind_node); + + set_taskname("thread %d/%d", process_nr, thread_nr); + + global_data = g->data; + process_data = td->process_data; + thread_data = setup_private_data(g->p.bytes_thread); + + bytes_done = 0; + + last_task = 0; + if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) + last_task = 1; + + first_task = 0; + if (process_nr == 0 && thread_nr == 0) + first_task = 1; + + if (details >= 2) { + printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", + process_nr, thread_nr, global_data, process_data, thread_data); + } + + if (g->p.serialize_startup) { + pthread_mutex_lock(&g->startup_mutex); + g->nr_tasks_started++; + pthread_mutex_unlock(&g->startup_mutex); + + /* Here we will wait for the main process to start us all at once: */ + pthread_mutex_lock(&g->start_work_mutex); + g->nr_tasks_working++; + + /* Last one wake the main process: */ + if (g->nr_tasks_working == g->p.nr_tasks) + pthread_mutex_unlock(&g->startup_done_mutex); + + pthread_mutex_unlock(&g->start_work_mutex); + } + + gettimeofday(&start0, NULL); + + start = stop = start0; + last_perturbance = start.tv_sec; + + for (l = 0; l < g->p.nr_loops; l++) { + start = stop; + + if (g->stop_work) + break; + + val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); + val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); + val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); + + if (g->p.sleep_usecs) { + pthread_mutex_lock(td->process_lock); + usleep(g->p.sleep_usecs); + pthread_mutex_unlock(td->process_lock); + } + /* + * Amount of work to be done under a process-global lock: + */ + if (g->p.bytes_process_locked) { + pthread_mutex_lock(td->process_lock); + val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); + pthread_mutex_unlock(td->process_lock); + } + + work_done = 
g->p.bytes_global + g->p.bytes_process + + g->p.bytes_process_locked + g->p.bytes_thread; + + update_curr_cpu(task_nr, work_done); + bytes_done += work_done; + + if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) + continue; + + td->loops_done = l; + + gettimeofday(&stop, NULL); + + /* Check whether our max runtime timed out: */ + if (g->p.nr_secs) { + timersub(&stop, &start0, &diff); + if (diff.tv_sec >= g->p.nr_secs) { + g->stop_work = true; + break; + } + } + + /* Update the summary at most once per second: */ + if (start.tv_sec == stop.tv_sec) + continue; + + /* + * Perturb the first task's equilibrium every g->p.perturb_secs seconds, + * by migrating to CPU#0: + */ + if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { + cpu_set_t orig_mask; + int target_cpu; + int this_cpu; + + last_perturbance = stop.tv_sec; + + /* + * Depending on where we are running, move into + * the other half of the system, to create some + * real disturbance: + */ + this_cpu = g->threads[task_nr].curr_cpu; + if (this_cpu < g->p.nr_cpus/2) + target_cpu = g->p.nr_cpus-1; + else + target_cpu = 0; + + orig_mask = bind_to_cpu(target_cpu); + + /* Here we are running on the target CPU already */ + if (details >= 1) + printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); + + bind_to_cpumask(orig_mask); + } + + if (details >= 3) { + timersub(&stop, &start, &diff); + runtime_ns_max = diff.tv_sec * 1000000000; + runtime_ns_max += diff.tv_usec * 1000; + + if (details >= 0) { + printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n", + process_nr, thread_nr, runtime_ns_max / bytes_done, val); + } + fflush(stdout); + } + if (!last_task) + continue; + + timersub(&stop, &start0, &diff); + runtime_ns_max = diff.tv_sec * 1000000000ULL; + runtime_ns_max += diff.tv_usec * 1000ULL; + + show_summary(runtime_ns_max, l, &convergence); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start0, &diff); + 
td->runtime_ns = diff.tv_sec * 1000000000ULL; + td->runtime_ns += diff.tv_usec * 1000ULL; + + free_data(thread_data, g->p.bytes_thread); + + pthread_mutex_lock(&g->stop_work_mutex); + g->bytes_done += bytes_done; + pthread_mutex_unlock(&g->stop_work_mutex); + + return NULL; +} + +/* + * A worker process starts a couple of threads: + */ +static void worker_process(int process_nr) +{ + pthread_mutex_t process_lock; + struct thread_data *td; + pthread_t *pthreads; + u8 *process_data; + int task_nr; + int ret; + int t; + + pthread_mutex_init(&process_lock, NULL); + set_taskname("process %d", process_nr); + + /* + * Pick up the memory policy and the CPU binding of our first thread, + * so that we initialize memory accordingly: + */ + task_nr = process_nr*g->p.nr_threads; + td = g->threads + task_nr; + + bind_to_memnode(td->bind_node); + bind_to_cpumask(td->bind_cpumask); + + pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); + process_data = setup_private_data(g->p.bytes_process); + + if (g->p.show_details >= 3) { + printf(" # process %2d global mem: %p, process mem: %p\n", + process_nr, g->data, process_data); + } + + for (t = 0; t < g->p.nr_threads; t++) { + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + td->process_data = process_data; + td->process_nr = process_nr; + td->thread_nr = t; + td->task_nr = task_nr; + td->val = rand(); + td->curr_cpu = -1; + td->process_lock = &process_lock; + + ret = pthread_create(pthreads + t, NULL, worker_thread, td); + BUG_ON(ret); + } + + for (t = 0; t < g->p.nr_threads; t++) { + ret = pthread_join(pthreads[t], NULL); + BUG_ON(ret); + } + + free_data(process_data, g->p.bytes_process); + free(pthreads); +} + +static void print_summary(void) +{ + if (g->p.show_details < 0) + return; + + printf("\n ###\n"); + printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", + g->p.nr_tasks, g->p.nr_tasks == 1 ? 
"task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); + printf(" # %5dx %5ldMB global shared mem operations\n", + g->p.nr_loops, g->p.bytes_global/1024/1024); + printf(" # %5dx %5ldMB process shared mem operations\n", + g->p.nr_loops, g->p.bytes_process/1024/1024); + printf(" # %5dx %5ldMB thread local mem operations\n", + g->p.nr_loops, g->p.bytes_thread/1024/1024); + + printf(" ###\n"); + + printf("\n ###\n"); fflush(stdout); +} + +static void init_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + int t; + + g->threads = zalloc_shared_data(size); + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + int cpu; + + /* Allow all nodes by default: */ + td->bind_node = -1; + + /* Allow all CPUs by default: */ + CPU_ZERO(&td->bind_cpumask); + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &td->bind_cpumask); + } +} + +static void deinit_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + + free_data(g->threads, size); +} + +static int init(void) +{ + g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); + + /* Copy over options: */ + g->p = p0; + + g->p.nr_cpus = numa_num_configured_cpus(); + + g->p.nr_nodes = numa_max_node() + 1; + + /* char array in count_process_nodes(): */ + BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + + if (g->p.show_quiet && !g->p.show_details) + g->p.show_details = -1; + + /* Some memory should be specified: */ + if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) + return -1; + + if (g->p.mb_global_str) { + g->p.mb_global = atof(g->p.mb_global_str); + BUG_ON(g->p.mb_global < 0); + } + + if (g->p.mb_proc_str) { + g->p.mb_proc = atof(g->p.mb_proc_str); + BUG_ON(g->p.mb_proc < 0); + } + + if (g->p.mb_proc_locked_str) { + g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); + BUG_ON(g->p.mb_proc_locked < 0); + BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); + } + + if (g->p.mb_thread_str) { + g->p.mb_thread = 
atof(g->p.mb_thread_str); + BUG_ON(g->p.mb_thread < 0); + } + + BUG_ON(g->p.nr_threads <= 0); + BUG_ON(g->p.nr_proc <= 0); + + g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; + + g->p.bytes_global = g->p.mb_global *1024L*1024L; + g->p.bytes_process = g->p.mb_proc *1024L*1024L; + g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; + g->p.bytes_thread = g->p.mb_thread *1024L*1024L; + + g->data = setup_shared_data(g->p.bytes_global); + + /* Startup serialization: */ + init_global_mutex(&g->start_work_mutex); + init_global_mutex(&g->startup_mutex); + init_global_mutex(&g->startup_done_mutex); + init_global_mutex(&g->stop_work_mutex); + + init_thread_data(); + + tprintf("#\n"); + parse_setup_cpu_list(); + parse_setup_node_list(); + tprintf("#\n"); + + print_summary(); + + return 0; +} + +static void deinit(void) +{ + free_data(g->data, g->p.bytes_global); + g->data = NULL; + + deinit_thread_data(); + + free_data(g, sizeof(*g)); + g = NULL; +} + +/* + * Print a short or long result, depending on the verbosity setting: + */ +static void print_res(const char *name, double val, + const char *txt_unit, const char *txt_short, const char *txt_long) +{ + if (!name) + name = "main,"; + + if (g->p.show_quiet) + printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); + else + printf(" %14.3f %s\n", val, txt_long); +} + +static int __bench_numa(const char *name) +{ + struct timeval start, stop, diff; + u64 runtime_ns_min, runtime_ns_sum; + pid_t *pids, pid, wpid; + double delta_runtime; + double runtime_avg; + double runtime_sec_max; + double runtime_sec_min; + int wait_stat; + double bytes; + int i, t; + + if (init()) + return -1; + + pids = zalloc(g->p.nr_proc * sizeof(*pids)); + pid = -1; + + /* All threads try to acquire it, this way we can wait for them to start up: */ + pthread_mutex_lock(&g->start_work_mutex); + + if (g->p.serialize_startup) { + tprintf(" #\n"); + tprintf(" # Startup synchronization: ..."); fflush(stdout); + } + + gettimeofday(&start, 
NULL); + + for (i = 0; i < g->p.nr_proc; i++) { + pid = fork(); + dprintf(" # process %2d: PID %d\n", i, pid); + + BUG_ON(pid < 0); + if (!pid) { + /* Child process: */ + worker_process(i); + + exit(0); + } + pids[i] = pid; + + } + /* Wait for all the threads to start up: */ + while (g->nr_tasks_started != g->p.nr_tasks) + usleep(1000); + + BUG_ON(g->nr_tasks_started != g->p.nr_tasks); + + if (g->p.serialize_startup) { + double startup_sec; + + pthread_mutex_lock(&g->startup_done_mutex); + + /* This will start all threads: */ + pthread_mutex_unlock(&g->start_work_mutex); + + /* This mutex is locked - the last started thread will wake us: */ + pthread_mutex_lock(&g->startup_done_mutex); + + gettimeofday(&stop, NULL); + + timersub(&stop, &start, &diff); + + startup_sec = diff.tv_sec * 1000000000.0; + startup_sec += diff.tv_usec * 1000.0; + startup_sec /= 1e9; + + tprintf(" threads initialized in %.6f seconds.\n", startup_sec); + tprintf(" #\n"); + + start = stop; + pthread_mutex_unlock(&g->startup_done_mutex); + } else { + gettimeofday(&start, NULL); + } + + /* Parent process: */ + + + for (i = 0; i < g->p.nr_proc; i++) { + wpid = waitpid(pids[i], &wait_stat, 0); + BUG_ON(wpid < 0); + BUG_ON(!WIFEXITED(wait_stat)); + + } + + runtime_ns_sum = 0; + runtime_ns_min = -1LL; + + for (t = 0; t < g->p.nr_tasks; t++) { + u64 thread_runtime_ns = g->threads[t].runtime_ns; + + runtime_ns_sum += thread_runtime_ns; + runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); + + tprintf("\n ###\n"); + tprintf("\n"); + + runtime_sec_max = diff.tv_sec * 1000000000.0; + runtime_sec_max += diff.tv_usec * 1000.0; + runtime_sec_max /= 1e9; + + runtime_sec_min = runtime_ns_min/1e9; + + bytes = g->bytes_done; + runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; + + if (g->p.measure_convergence) { + print_res(name, runtime_sec_max, + "secs,", 
"NUMA-convergence-latency", "secs latency to NUMA-converge"); + } + + print_res(name, runtime_sec_max, + "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); + + print_res(name, runtime_sec_min, + "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); + + print_res(name, runtime_avg, + "secs,", "runtime-avg/thread", "secs average thread-runtime"); + + delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; + print_res(name, delta_runtime / runtime_sec_max * 100.0, + "%,", "spread-runtime/thread", "% difference between max/avg runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9, + "GB,", "data/thread", "GB data processed, per thread"); + + print_res(name, bytes / 1e9, + "GB,", "data-total", "GB data processed, total"); + + print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), + "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, + "GB/sec,", "thread-speed", "GB/sec/thread speed"); + + print_res(name, bytes / runtime_sec_max / 1e9, + "GB/sec,", "total-speed", "GB/sec total speed"); + + free(pids); + + deinit(); + + return 0; +} + +#define MAX_ARGS 50 + +static int command_size(const char **argv) +{ + int size = 0; + + while (*argv) { + size++; + argv++; + } + + BUG_ON(size >= MAX_ARGS); + + return size; +} + +static void init_params(struct params *p, const char *name, int argc, const char **argv) +{ + int i; + + printf("\n # Running %s \"perf bench numa", name); + + for (i = 0; i < argc; i++) + printf(" %s", argv[i]); + + printf("\"\n"); + + memset(p, 0, sizeof(*p)); + + /* Initialize nonzero defaults: */ + + p->serialize_startup = 1; + p->data_reads = true; + p->data_writes = true; + p->data_backwards = true; + p->data_rand_walk = true; + p->nr_loops = -1; + p->init_random = true; +} + +static int run_bench_numa(const char *name, const char **argv) +{ + int argc = command_size(argv); + + init_params(&p0, name, argc, argv); + argc = 
parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (__bench_numa(name)) + goto err; + + return 0; + +err: + usage_with_options(numa_usage, options); + return -1; +} + +#define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk" +#define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1" + +#define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1" +#define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1" + +#define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1" +#define OPT_BW_NOTHP OPT_BW, "--thp", "-1" + +/* + * The built-in test-suite executed by "perf bench numa -a". + * + * (A minimum of 4 nodes and 16 GB of RAM is recommended.) + */ +static const char *tests[][MAX_ARGS] = { + /* Basic single-stream NUMA bandwidth measurements: */ + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM }, + { "RAM-bw-local-NOTHP,", + "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "1", OPT_BW_RAM }, + + /* 2-stream NUMA bandwidth measurements: */ + { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "0x2", OPT_BW_RAM }, + { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "1x2", OPT_BW_RAM }, + + /* Cross-stream NUMA bandwidth measurement: */ + { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,8", "-M", "1,0", OPT_BW_RAM }, + + /* Convergence latency measurements: */ + { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, + { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, + { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 4x4-convergence,", "mem", "-p", 
"4", "-t", "4", "-P", "512", OPT_CONV }, + { " 4x4-convergence-NOTHP,", + "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV }, + { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV }, + { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV }, + { " 8x4-convergence-NOTHP,", + "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV }, + { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV }, + { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV }, + { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV }, + { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV }, + + /* Various NUMA process/thread layout bandwidth measurements: */ + { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW }, + { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW }, + { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW }, + { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW }, + { " 8x1-bw-process-NOTHP,", + "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, + { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, + + { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + + { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", 
"512", OPT_BW }, + { " 4x8-bw-thread-NOTHP,", + "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, + { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + + { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, + { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, + { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, + { "numa01-bw-thread-NOTHP,", + "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP }, +}; + +static int bench_all(void) +{ + int nr = ARRAY_SIZE(tests); + int ret; + int i; + + ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); + BUG_ON(ret < 0); + + for (i = 0; i < nr; i++) { + if (run_bench_numa(tests[i][0], tests[i] + 1)) + return -1; + } + + printf("\n"); + + return 0; +} + +int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) +{ + init_params(&p0, "main,", argc, argv); + argc = parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (p0.run_all) + return bench_all(); + + if (__bench_numa(NULL)) + goto err; + + return 0; + +err: + usage_with_options(numa_usage, options); + return -1; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index cae9a5fd2ecf..e5d514bf5365 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -35,6 +35,16 @@ struct bench_suite { /* sentinel: easy for help */ #define suite_all { "all", "Test all benchmark suites", NULL } +static struct bench_suite numa_suites[] = { + { "mem", + "Benchmark for NUMA workloads", + bench_numa }, + suite_all, + { NULL, + NULL, + NULL } +}; + static struct bench_suite sched_suites[] = { { "messaging", "Benchmark for scheduler and IPC mechanisms", 
@@ -68,6 +78,9 @@ struct bench_subsys { }; static struct bench_subsys subsystems[] = { + { "numa", + "NUMA scheduling and MM behavior", + numa_suites }, { "sched", "scheduler and IPC mechanism", sched_suites }, @@ -159,6 +172,7 @@ static void all_suite(struct bench_subsys *subsys) /* FROM HERE */ printf("# Running %s/%s benchmark...\n", subsys->name, suites[i].name); + fflush(stdout); argv[1] = suites[i].name; suites[i].fn(1, argv, NULL); @@ -225,6 +239,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused) printf("# Running %s/%s benchmark...\n", subsystems[i].name, subsystems[i].suites[j].name); + fflush(stdout); status = subsystems[i].suites[j].fn(argc - 1, argv + 1, prefix); goto end; diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index fae8b250b2ca..a336014e0286 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -14,6 +14,7 @@ #include "util/parse-options.h" #include "util/strlist.h" #include "util/build-id.h" +#include "util/session.h" #include "util/symbol.h" static int build_id_cache__add_file(const char *filename, const char *debugdir) @@ -58,19 +59,59 @@ static int build_id_cache__remove_file(const char *filename, return err; } +static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) +{ + char filename[PATH_MAX]; + u8 build_id[BUILD_ID_SIZE]; + + if (dso__build_id_filename(dso, filename, sizeof(filename)) && + filename__read_build_id(filename, build_id, + sizeof(build_id)) != sizeof(build_id)) { + if (errno == ENOENT) + return false; + + pr_warning("Problems with %s file, consider removing it from the cache\n", + filename); + } else if (memcmp(dso->build_id, build_id, sizeof(dso->build_id))) { + pr_warning("Problems with %s file, consider removing it from the cache\n", + filename); + } + + return true; +} + +static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp) +{ + struct perf_session 
*session = perf_session__new(filename, O_RDONLY, + force, false, NULL); + if (session == NULL) + return -1; + + perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0); + perf_session__delete(session); + + return 0; +} + int cmd_buildid_cache(int argc, const char **argv, const char *prefix __maybe_unused) { struct strlist *list; struct str_node *pos; + int ret = 0; + bool force = false; char debugdir[PATH_MAX]; char const *add_name_list_str = NULL, - *remove_name_list_str = NULL; + *remove_name_list_str = NULL, + *missing_filename = NULL; const struct option buildid_cache_options[] = { OPT_STRING('a', "add", &add_name_list_str, "file list", "file(s) to add"), OPT_STRING('r', "remove", &remove_name_list_str, "file list", "file(s) to remove"), + OPT_STRING('M', "missing", &missing_filename, "file", + "to find missing build ids in the cache"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_END() }; @@ -125,5 +166,8 @@ int cmd_buildid_cache(int argc, const char **argv, } } - return 0; + if (missing_filename) + ret = build_id_cache__fprintf_missing(missing_filename, force, stdout); + + return ret; } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index a82d99fec83e..e74366a13218 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -44,23 +44,26 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) return fprintf(fp, "%s\n", sbuild_id); } +static bool dso__skip_buildid(struct dso *dso, int with_hits) +{ + return with_hits && !dso->hit; +} + static int perf_session__list_build_ids(bool force, bool with_hits) { struct perf_session *session; symbol__elf_init(); - - session = perf_session__new(input_name, O_RDONLY, force, false, - &build_id__mark_dso_hit_ops); - if (session == NULL) - return -1; - /* * See if this is an ELF file first: */ - if (filename__fprintf_build_id(session->filename, 
stdout)) + if (filename__fprintf_build_id(input_name, stdout)) goto out; + session = perf_session__new(input_name, O_RDONLY, force, false, + &build_id__mark_dso_hit_ops); + if (session == NULL) + return -1; /* * in pipe-mode, the only way to get the buildids is to parse * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID @@ -68,9 +71,9 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (with_hits || session->fd_pipe) perf_session__process_events(session, &build_id__mark_dso_hit_ops); - perf_session__fprintf_dsos_buildid(session, stdout, with_hits); -out: + perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits); perf_session__delete(session); +out: return 0; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 93b852f8a5d5..4af0b580b046 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -23,7 +23,6 @@ static char const *input_old = "perf.data.old", *input_new = "perf.data"; static char diff__default_sort_order[] = "dso,symbol"; static bool force; -static bool show_displacement; static bool show_period; static bool show_formula; static bool show_baseline_only; @@ -146,58 +145,47 @@ static int setup_compute(const struct option *opt, const char *str, return -EINVAL; } -static double get_period_percent(struct hist_entry *he, u64 period) +double perf_diff__period_percent(struct hist_entry *he, u64 period) { u64 total = he->hists->stats.total_period; return (period * 100.0) / total; } -double perf_diff__compute_delta(struct hist_entry *he) +double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); - double new_percent = get_period_percent(he, he->stat.period); - double old_percent = pair ? 
get_period_percent(pair, pair->stat.period) : 0.0; + double new_percent = perf_diff__period_percent(he, he->stat.period); + double old_percent = perf_diff__period_percent(pair, pair->stat.period); he->diff.period_ratio_delta = new_percent - old_percent; he->diff.computed = true; return he->diff.period_ratio_delta; } -double perf_diff__compute_ratio(struct hist_entry *he) +double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); double new_period = he->stat.period; - double old_period = pair ? pair->stat.period : 0; + double old_period = pair->stat.period; he->diff.computed = true; - he->diff.period_ratio = pair ? (new_period / old_period) : 0; + he->diff.period_ratio = new_period / old_period; return he->diff.period_ratio; } -s64 perf_diff__compute_wdiff(struct hist_entry *he) +s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); u64 new_period = he->stat.period; - u64 old_period = pair ? 
pair->stat.period : 0; + u64 old_period = pair->stat.period; he->diff.computed = true; - - if (!pair) - he->diff.wdiff = 0; - else - he->diff.wdiff = new_period * compute_wdiff_w2 - - old_period * compute_wdiff_w1; + he->diff.wdiff = new_period * compute_wdiff_w2 - + old_period * compute_wdiff_w1; return he->diff.wdiff; } -static int formula_delta(struct hist_entry *he, char *buf, size_t size) +static int formula_delta(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); - - if (!pair) - return -1; - return scnprintf(buf, size, "(%" PRIu64 " * 100 / %" PRIu64 ") - " "(%" PRIu64 " * 100 / %" PRIu64 ")", @@ -205,41 +193,36 @@ static int formula_delta(struct hist_entry *he, char *buf, size_t size) pair->stat.period, pair->hists->stats.total_period); } -static int formula_ratio(struct hist_entry *he, char *buf, size_t size) +static int formula_ratio(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); double new_period = he->stat.period; - double old_period = pair ? pair->stat.period : 0; - - if (!pair) - return -1; + double old_period = pair->stat.period; return scnprintf(buf, size, "%.0F / %.0F", new_period, old_period); } -static int formula_wdiff(struct hist_entry *he, char *buf, size_t size) +static int formula_wdiff(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); u64 new_period = he->stat.period; - u64 old_period = pair ? 
pair->stat.period : 0; - - if (!pair) - return -1; + u64 old_period = pair->stat.period; return scnprintf(buf, size, "(%" PRIu64 " * " "%" PRId64 ") - (%" PRIu64 " * " "%" PRId64 ")", new_period, compute_wdiff_w2, old_period, compute_wdiff_w1); } -int perf_diff__formula(char *buf, size_t size, struct hist_entry *he) +int perf_diff__formula(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { switch (compute) { case COMPUTE_DELTA: - return formula_delta(he, buf, size); + return formula_delta(he, pair, buf, size); case COMPUTE_RATIO: - return formula_ratio(he, buf, size); + return formula_ratio(he, pair, buf, size); case COMPUTE_WEIGHTED_DIFF: - return formula_wdiff(he, buf, size); + return formula_wdiff(he, pair, buf, size); default: BUG_ON(1); } @@ -292,48 +275,6 @@ static struct perf_tool tool = { .ordering_requires_timestamps = true, }; -static void insert_hist_entry_by_name(struct rb_root *root, - struct hist_entry *he) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct hist_entry *iter; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct hist_entry, rb_node); - if (hist_entry__cmp(he, iter) < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&he->rb_node, parent, p); - rb_insert_color(&he->rb_node, root); -} - -static void hists__name_resort(struct hists *self, bool sort) -{ - unsigned long position = 1; - struct rb_root tmp = RB_ROOT; - struct rb_node *next = rb_first(&self->entries); - - while (next != NULL) { - struct hist_entry *n = rb_entry(next, struct hist_entry, rb_node); - - next = rb_next(&n->rb_node); - n->position = position++; - - if (sort) { - rb_erase(&n->rb_node, &self->entries); - insert_hist_entry_by_name(&tmp, n); - } - } - - if (sort) - self->entries = tmp; -} - static struct perf_evsel *evsel_match(struct perf_evsel *evsel, struct perf_evlist *evlist) { @@ -346,34 +287,34 @@ static struct perf_evsel *evsel_match(struct perf_evsel *evsel, 
return NULL; } -static void perf_evlist__resort_hists(struct perf_evlist *evlist, bool name) +static void perf_evlist__collapse_resort(struct perf_evlist *evlist) { struct perf_evsel *evsel; list_for_each_entry(evsel, &evlist->entries, node) { struct hists *hists = &evsel->hists; - hists__output_resort(hists); - - /* - * The hists__name_resort only sets possition - * if name is false. - */ - if (name || ((!name) && show_displacement)) - hists__name_resort(hists, name); + hists__collapse_resort(hists); } } static void hists__baseline_only(struct hists *hists) { - struct rb_node *next = rb_first(&hists->entries); + struct rb_root *root; + struct rb_node *next; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + next = rb_first(root); while (next != NULL) { - struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node_in); - next = rb_next(&he->rb_node); + next = rb_next(&he->rb_node_in); if (!hist_entry__next_pair(he)) { - rb_erase(&he->rb_node, &hists->entries); + rb_erase(&he->rb_node_in, root); hist_entry__free(he); } } @@ -385,18 +326,21 @@ static void hists__precompute(struct hists *hists) while (next != NULL) { struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *pair = hist_entry__next_pair(he); next = rb_next(&he->rb_node); + if (!pair) + continue; switch (compute) { case COMPUTE_DELTA: - perf_diff__compute_delta(he); + perf_diff__compute_delta(he, pair); break; case COMPUTE_RATIO: - perf_diff__compute_ratio(he); + perf_diff__compute_ratio(he, pair); break; case COMPUTE_WEIGHTED_DIFF: - perf_diff__compute_wdiff(he); + perf_diff__compute_wdiff(he, pair); break; default: BUG_ON(1); @@ -470,19 +414,30 @@ static void insert_hist_entry_by_compute(struct rb_root *root, static void hists__compute_resort(struct hists *hists) { - struct rb_root tmp = RB_ROOT; - struct rb_node *next = 
rb_first(&hists->entries); + struct rb_root *root; + struct rb_node *next; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + hists->entries = RB_ROOT; + next = rb_first(root); + + hists->nr_entries = 0; + hists->stats.total_period = 0; + hists__reset_col_len(hists); while (next != NULL) { - struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *he; - next = rb_next(&he->rb_node); + he = rb_entry(next, struct hist_entry, rb_node_in); + next = rb_next(&he->rb_node_in); - rb_erase(&he->rb_node, &hists->entries); - insert_hist_entry_by_compute(&tmp, he, compute); + insert_hist_entry_by_compute(&hists->entries, he, compute); + hists__inc_nr_entries(hists, he); } - - hists->entries = tmp; } static void hists__process(struct hists *old, struct hists *new) @@ -497,6 +452,8 @@ static void hists__process(struct hists *old, struct hists *new) if (sort_compute) { hists__precompute(new); hists__compute_resort(new); + } else { + hists__output_resort(new); } hists__fprintf(new, true, 0, 0, stdout); @@ -528,8 +485,8 @@ static int __cmd_diff(void) evlist_old = older->evlist; evlist_new = newer->evlist; - perf_evlist__resort_hists(evlist_old, true); - perf_evlist__resort_hists(evlist_new, false); + perf_evlist__collapse_resort(evlist_old); + perf_evlist__collapse_resort(evlist_new); list_for_each_entry(evsel, &evlist_new->entries, node) { struct perf_evsel *evsel_old; @@ -562,8 +519,6 @@ static const char * const diff_usage[] = { static const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), - OPT_BOOLEAN('M', "displacement", &show_displacement, - "Show position displacement relative to baseline"), OPT_BOOLEAN('b', "baseline-only", &show_baseline_only, "Show only items with match in baseline"), OPT_CALLBACK('c', "compute", &compute, @@ -597,40 +552,32 @@ static const struct option options[] = { static void ui_init(void) { - 
perf_hpp__init(); - - /* No overhead column. */ - perf_hpp__column_enable(PERF_HPP__OVERHEAD, false); - /* - * Display baseline/delta/ratio/displacement/ + * Display baseline/delta/ratio * formula/periods columns. */ - perf_hpp__column_enable(PERF_HPP__BASELINE, true); + perf_hpp__column_enable(PERF_HPP__BASELINE); switch (compute) { case COMPUTE_DELTA: - perf_hpp__column_enable(PERF_HPP__DELTA, true); + perf_hpp__column_enable(PERF_HPP__DELTA); break; case COMPUTE_RATIO: - perf_hpp__column_enable(PERF_HPP__RATIO, true); + perf_hpp__column_enable(PERF_HPP__RATIO); break; case COMPUTE_WEIGHTED_DIFF: - perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF, true); + perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF); break; default: BUG_ON(1); }; - if (show_displacement) - perf_hpp__column_enable(PERF_HPP__DISPL, true); - if (show_formula) - perf_hpp__column_enable(PERF_HPP__FORMULA, true); + perf_hpp__column_enable(PERF_HPP__FORMULA); if (show_period) { - perf_hpp__column_enable(PERF_HPP__PERIOD, true); - perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE, true); + perf_hpp__column_enable(PERF_HPP__PERIOD); + perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE); } } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index c20f1dcfb7e2..1312a5e03ec7 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -15,39 +15,6 @@ #include "util/parse-options.h" #include "util/session.h" -struct perf_attr_details { - bool freq; - bool verbose; -}; - -static int comma_printf(bool *first, const char *fmt, ...) 
-{ - va_list args; - int ret = 0; - - if (!*first) { - ret += printf(","); - } else { - ret += printf(":"); - *first = false; - } - - va_start(args, fmt); - ret += vprintf(fmt, args); - va_end(args); - return ret; -} - -static int __if_print(bool *first, const char *field, u64 value) -{ - if (value == 0) - return 0; - - return comma_printf(first, " %s: %" PRIu64, field, value); -} - -#define if_print(field) __if_print(&first, #field, pos->attr.field) - static int __cmd_evlist(const char *file_name, struct perf_attr_details *details) { struct perf_session *session; @@ -57,52 +24,8 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details if (session == NULL) return -ENOMEM; - list_for_each_entry(pos, &session->evlist->entries, node) { - bool first = true; - - printf("%s", perf_evsel__name(pos)); - - if (details->verbose || details->freq) { - comma_printf(&first, " sample_freq=%" PRIu64, - (u64)pos->attr.sample_freq); - } - - if (details->verbose) { - if_print(type); - if_print(config); - if_print(config1); - if_print(config2); - if_print(size); - if_print(sample_type); - if_print(read_format); - if_print(disabled); - if_print(inherit); - if_print(pinned); - if_print(exclusive); - if_print(exclude_user); - if_print(exclude_kernel); - if_print(exclude_hv); - if_print(exclude_idle); - if_print(mmap); - if_print(comm); - if_print(freq); - if_print(inherit_stat); - if_print(enable_on_exec); - if_print(task); - if_print(watermark); - if_print(precise_ip); - if_print(mmap_data); - if_print(sample_id_all); - if_print(exclude_host); - if_print(exclude_guest); - if_print(__reserved_1); - if_print(wakeup_events); - if_print(bp_type); - if_print(branch_sample_type); - } - - putchar('\n'); - } + list_for_each_entry(pos, &session->evlist->entries, node) + perf_evsel__fprintf(pos, details, stdout); perf_session__delete(session); return 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 0b4b796167be..c746108c5d48 100644 --- 
a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -340,7 +340,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session, int n_lines, int is_caller) { struct rb_node *next; - struct machine *machine; + struct machine *machine = &session->machines.host; printf("%.102s\n", graph_dotted_line); printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr"); @@ -349,11 +349,6 @@ static void __print_result(struct rb_root *root, struct perf_session *session, next = rb_first(root); - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("__print_result: couldn't find kernel information\n"); - return; - } while (next && n_lines--) { struct alloc_stat *data = rb_entry(next, struct alloc_stat, node); @@ -614,8 +609,7 @@ static struct sort_dimension *avail_sorts[] = { &pingpong_sort_dimension, }; -#define NUM_AVAIL_SORTS \ - (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *)) +#define NUM_AVAIL_SORTS ((int)ARRAY_SIZE(avail_sorts)) static int sort_dimension__add(const char *tok, struct list_head *list) { diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index ca3f80ebc100..37a769d7f9fe 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -973,8 +973,7 @@ __cmd_buildid_list(const char *file_name, int argc, const char **argv) int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused) { - const char *file_name; - + const char *file_name = NULL; const struct option kvm_options[] = { OPT_STRING('i', "input", &file_name, "file", "Input file name"), diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f3151d3c70ce..2ac690cad411 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -224,130 +224,28 @@ static bool perf_evlist__equal(struct perf_evlist *evlist, static int perf_record__open(struct perf_record *rec) { + char msg[512]; struct perf_evsel *pos; struct perf_evlist *evlist = rec->evlist; struct perf_session *session 
= rec->session; struct perf_record_opts *opts = &rec->opts; int rc = 0; - /* - * Set the evsel leader links before we configure attributes, - * since some might depend on this info. - */ - if (opts->group) - perf_evlist__set_leader(evlist); - - perf_evlist__config_attrs(evlist, opts); + perf_evlist__config(evlist, opts); list_for_each_entry(pos, &evlist->entries, node) { - struct perf_event_attr *attr = &pos->attr; - /* - * Check if parse_single_tracepoint_event has already asked for - * PERF_SAMPLE_TIME. - * - * XXX this is kludgy but short term fix for problems introduced by - * eac23d1c that broke 'perf script' by having different sample_types - * when using multiple tracepoint events when we use a perf binary - * that tries to use sample_id_all on an older kernel. - * - * We need to move counter creation to perf_session, support - * different sample_types, etc. - */ - bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; - -fallback_missing_features: - if (opts->exclude_guest_missing) - attr->exclude_guest = attr->exclude_host = 0; -retry_sample_id: - attr->sample_id_all = opts->sample_id_all_missing ? 
0 : 1; try_again: if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) { - int err = errno; - - if (err == EPERM || err == EACCES) { - ui__error_paranoid(); - rc = -err; - goto out; - } else if (err == ENODEV && opts->target.cpu_list) { - pr_err("No such device - did you specify" - " an out-of-range profile CPU?\n"); - rc = -err; - goto out; - } else if (err == EINVAL) { - if (!opts->exclude_guest_missing && - (attr->exclude_guest || attr->exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - opts->exclude_guest_missing = true; - goto fallback_missing_features; - } else if (!opts->sample_id_all_missing) { - /* - * Old kernel, no attr->sample_id_type_all field - */ - opts->sample_id_all_missing = true; - if (!opts->sample_time && !opts->raw_samples && !time_needed) - attr->sample_type &= ~PERF_SAMPLE_TIME; - - goto retry_sample_id; - } - } - - /* - * If it's cycles then fall back to hrtimer - * based cpu-clock-tick sw counter, which - * is always available even if no PMU support. - * - * PPC returns ENXIO until 2.6.37 (behavior changed - * with commit b0a873e). - */ - if ((err == ENOENT || err == ENXIO) - && attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_HW_CPU_CYCLES) { - + if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { if (verbose) - ui__warning("The cycles event is not supported, " - "trying to fall back to cpu-clock-ticks\n"); - attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_SW_CPU_CLOCK; - if (pos->name) { - free(pos->name); - pos->name = NULL; - } + ui__warning("%s\n", msg); goto try_again; } - if (err == ENOENT) { - ui__error("The %s event is not supported.\n", - perf_evsel__name(pos)); - rc = -err; - goto out; - } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) { - ui__error("\'precise\' request may not be supported. 
" - "Try removing 'p' modifier\n"); - rc = -err; - goto out; - } - - printf("\n"); - error("sys_perf_event_open() syscall returned with %d " - "(%s) for event %s. /bin/dmesg may provide " - "additional information.\n", - err, strerror(err), perf_evsel__name(pos)); - -#if defined(__i386__) || defined(__x86_64__) - if (attr->type == PERF_TYPE_HARDWARE && - err == EOPNOTSUPP) { - pr_err("No hardware sampling interrupt available." - " No APIC? If so then you can boot the kernel" - " with the \"lapic\" boot parameter to" - " force-enable it.\n"); - rc = -err; - goto out; - } -#endif - - pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - rc = -err; + rc = -errno; + perf_evsel__open_strerror(pos, &opts->target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); goto out; } } @@ -430,10 +328,6 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) { int err; struct perf_tool *tool = data; - - if (machine__is_host(machine)) - return; - /* *As for guest kernel when processing subcommand record&report, *we arrange module mmap prior to guest kernel mmap and trigger @@ -618,12 +512,7 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) rec->post_processing_offset = lseek(output, 0, SEEK_CUR); - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("Couldn't find native kernel information.\n"); - err = -1; - goto out_delete_session; - } + machine = &session->machines.host; if (opts->pipe_output) { err = perf_event__synthesize_attrs(tool, session, @@ -676,9 +565,10 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" "Check /proc/modules permission or run as root.\n"); - if (perf_guest) - perf_session__process_machines(session, tool, - perf_event__synthesize_guest_os); + if (perf_guest) { + machines__process_guests(&session->machines, + perf_event__synthesize_guest_os, tool); + } if (!opts->target.system_wide) err = perf_event__synthesize_thread_map(tool, evsel_list->threads, @@ -875,11 +765,10 @@ static int get_stack_size(char *str, unsigned long *_size) } #endif /* LIBUNWIND_SUPPORT */ -static int -parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, - int unset) +int record_parse_callchain_opt(const struct option *opt, + const char *arg, int unset) { - struct perf_record *rec = (struct perf_record *)opt->value; + struct perf_record_opts *opts = opt->value; char *tok, *name, *saveptr = NULL; char *buf; int ret = -1; @@ -905,7 +794,7 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, /* Framepointer style */ if (!strncmp(name, "fp", sizeof("fp"))) { if (!strtok_r(NULL, ",", &saveptr)) { - rec->opts.call_graph = CALLCHAIN_FP; + opts->call_graph = CALLCHAIN_FP; ret = 0; } else pr_err("callchain: No more arguments " @@ -918,20 +807,20 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, const unsigned long default_stack_dump_size = 8192; ret = 0; - rec->opts.call_graph = CALLCHAIN_DWARF; - rec->opts.stack_dump_size = default_stack_dump_size; + opts->call_graph = CALLCHAIN_DWARF; + opts->stack_dump_size = default_stack_dump_size; tok = strtok_r(NULL, ",", &saveptr); if (tok) { unsigned long size = 0; ret = get_stack_size(tok, &size); - rec->opts.stack_dump_size = size; + opts->stack_dump_size = size; } if (!ret) pr_debug("callchain: stack dump size %d\n", - rec->opts.stack_dump_size); + opts->stack_dump_size); #endif /* LIBUNWIND_SUPPORT */ } else { pr_err("callchain: Unknown -g option " @@ -944,7 +833,7 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, 
free(buf); if (!ret) - pr_debug("callchain: type %d\n", rec->opts.call_graph); + pr_debug("callchain: type %d\n", opts->call_graph); return ret; } @@ -982,9 +871,9 @@ static struct perf_record record = { #define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: " #ifdef LIBUNWIND_SUPPORT -static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf"; +const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf"; #else -static const char callchain_help[] = CALLCHAIN_HELP "[fp]"; +const char record_callchain_help[] = CALLCHAIN_HELP "[fp]"; #endif /* @@ -1028,9 +917,9 @@ const struct option record_options[] = { "number of mmap data pages"), OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]", - callchain_help, &parse_callchain_opt, - "fp"), + OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts, + "mode[,dump_size]", record_callchain_help, + &record_parse_callchain_opt, "fp"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index fc251005dd3d..47a864478543 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -372,7 +372,7 @@ static int __cmd_report(struct perf_report *rep) if (ret) goto out_delete; - kernel_map = session->host_machine.vmlinux_maps[MAP__FUNCTION]; + kernel_map = session->machines.host.vmlinux_maps[MAP__FUNCTION]; kernel_kmap = map__kmap(kernel_map); if (kernel_map == NULL || (kernel_map->dso->hit && @@ -595,8 +595,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN(0, "stdio", &report.use_stdio, "Use the stdio interface"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", - "sort by key(s): pid, comm, dso, symbol, parent, dso_to," - " dso_from, symbol_to, symbol_from, mispredict"), + 
"sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline," + " dso_to, dso_from, symbol_to, symbol_from, mispredict"), OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, "Show sample percentage for different cpu modes"), OPT_STRING('p', "parent", &parent_pattern, "regex", @@ -692,6 +692,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) setup_browser(true); else { use_browser = 0; + perf_hpp__column_enable(PERF_HPP__OVERHEAD); perf_hpp__init(); } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cc28b85dabd5..138229439a93 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1475,9 +1475,9 @@ static int perf_sched__read_events(struct perf_sched *sched, bool destroy, goto out_delete; } - sched->nr_events = session->hists.stats.nr_events[0]; - sched->nr_lost_events = session->hists.stats.total_lost; - sched->nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST]; + sched->nr_events = session->stats.nr_events[0]; + sched->nr_lost_events = session->stats.total_lost; + sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST]; } if (destroy) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b363e7b292b2..92d4658f56fb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -692,7 +692,7 @@ static int parse_output_fields(const struct option *opt __maybe_unused, const char *arg, int unset __maybe_unused) { char *tok; - int i, imax = sizeof(all_output_options) / sizeof(struct output_option); + int i, imax = ARRAY_SIZE(all_output_options); int j; int rc = 0; char *str = strdup(arg); @@ -909,18 +909,6 @@ static const char *ends_with(const char *str, const char *suffix) return NULL; } -static char *ltrim(char *str) -{ - int len = strlen(str); - - while (len && isspace(*str)) { - len--; - str++; - } - - return str; -} - static int read_script_info(struct script_desc *desc, const char *filename) { char 
line[BUFSIZ], *p; @@ -1487,7 +1475,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } - perf_session__fprintf_info(session, stdout, show_full_info); + if (!script_name && !generate_script_lang) + perf_session__fprintf_info(session, stdout, show_full_info); if (!no_callchain) symbol_conf.use_callchain = true; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c247faca7127..1c2ac148a7d5 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -132,8 +132,6 @@ static struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) { struct perf_event_attr *attr = &evsel->attr; - bool exclude_guest_missing = false; - int ret; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -141,38 +139,16 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->inherit = !no_inherit; -retry: - if (exclude_guest_missing) - evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; - - if (perf_target__has_cpu(&target)) { - ret = perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); - if (ret) - goto check_ret; - return 0; - } + if (perf_target__has_cpu(&target)) + return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); if (!perf_target__has_task(&target) && - !perf_evsel__is_group_member(evsel)) { + perf_evsel__is_group_leader(evsel)) { attr->disabled = 1; attr->enable_on_exec = 1; } - ret = perf_evsel__open_per_thread(evsel, evsel_list->threads); - if (!ret) - return 0; - /* fall through */ -check_ret: - if (ret && errno == EINVAL) { - if (!exclude_guest_missing && - (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - exclude_guest_missing = true; - goto retry; - } - } - return ret; + return perf_evsel__open_per_thread(evsel, evsel_list->threads); } /* @@ -271,6 +247,7 @@ static int read_counter(struct perf_evsel *counter) static int 
__run_perf_stat(int argc __maybe_unused, const char **argv) { + char msg[512]; unsigned long long t0, t1; struct perf_evsel *counter; int status = 0; @@ -348,20 +325,13 @@ static int __run_perf_stat(int argc __maybe_unused, const char **argv) continue; } - if (errno == EPERM || errno == EACCES) { - error("You may not have permission to collect %sstats.\n" - "\t Consider tweaking" - " /proc/sys/kernel/perf_event_paranoid or running as root.", - target.system_wide ? "system-wide " : ""); - } else { - error("open_counter returned with %d (%s). " - "/bin/dmesg may provide additional information.\n", - errno, strerror(errno)); - } + perf_evsel__open_strerror(counter, &target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); + if (child_pid != -1) kill(child_pid, SIGTERM); - pr_err("Not all events could be opened.\n"); return -1; } counter->supported = true; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c9ff3950cd4b..7978c8117b7f 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -68,28 +68,6 @@ #include <linux/unistd.h> #include <linux/types.h> -void get_term_dimensions(struct winsize *ws) -{ - char *s = getenv("LINES"); - - if (s != NULL) { - ws->ws_row = atoi(s); - s = getenv("COLUMNS"); - if (s != NULL) { - ws->ws_col = atoi(s); - if (ws->ws_row && ws->ws_col) - return; - } - } -#ifdef TIOCGWINSZ - if (ioctl(1, TIOCGWINSZ, ws) == 0 && - ws->ws_row && ws->ws_col) - return; -#endif - ws->ws_row = 25; - ws->ws_col = 80; -} - static void perf_top__update_print_entries(struct perf_top *top) { if (top->print_entries > 9) @@ -596,7 +574,7 @@ static void *display_thread_tui(void *arg) * via --uid. 
*/ list_for_each_entry(pos, &top->evlist->entries, node) - pos->hists.uid_filter_str = top->target.uid_str; + pos->hists.uid_filter_str = top->record_opts.target.uid_str; perf_evlist__tui_browse_hists(top->evlist, help, &hbt, &top->session->header.env); @@ -716,7 +694,7 @@ static void perf_event__process_sample(struct perf_tool *tool, static struct intlist *seen; if (!seen) - seen = intlist__new(); + seen = intlist__new(NULL); if (!intlist__has_entry(seen, event->ip.pid)) { pr_err("Can't find guest [%d]'s kernel information\n", @@ -727,8 +705,8 @@ static void perf_event__process_sample(struct perf_tool *tool, } if (!machine) { - pr_err("%u unprocessable samples recorded.", - top->session->hists.stats.nr_unprocessable_samples++); + pr_err("%u unprocessable samples recorded.\n", + top->session->stats.nr_unprocessable_samples++); return; } @@ -847,13 +825,13 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) ++top->us_samples; if (top->hide_user_symbols) continue; - machine = perf_session__find_host_machine(session); + machine = &session->machines.host; break; case PERF_RECORD_MISC_KERNEL: ++top->kernel_samples; if (top->hide_kernel_symbols) continue; - machine = perf_session__find_host_machine(session); + machine = &session->machines.host; break; case PERF_RECORD_MISC_GUEST_KERNEL: ++top->guest_kernel_samples; @@ -878,7 +856,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) hists__inc_nr_events(&evsel->hists, event->header.type); machine__process_event(machine, event); } else - ++session->hists.stats.nr_unknown_events; + ++session->stats.nr_unknown_events; } } @@ -892,111 +870,31 @@ static void perf_top__mmap_read(struct perf_top *top) static void perf_top__start_counters(struct perf_top *top) { + char msg[512]; struct perf_evsel *counter; struct perf_evlist *evlist = top->evlist; + struct perf_record_opts *opts = &top->record_opts; - if (top->group) - perf_evlist__set_leader(evlist); + perf_evlist__config(evlist, opts); 
list_for_each_entry(counter, &evlist->entries, node) { - struct perf_event_attr *attr = &counter->attr; - - attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - - if (top->freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; - attr->freq = 1; - attr->sample_freq = top->freq; - } - - if (evlist->nr_entries > 1) { - attr->sample_type |= PERF_SAMPLE_ID; - attr->read_format |= PERF_FORMAT_ID; - } - - if (perf_target__has_cpu(&top->target)) - attr->sample_type |= PERF_SAMPLE_CPU; - - if (symbol_conf.use_callchain) - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; - - attr->mmap = 1; - attr->comm = 1; - attr->inherit = top->inherit; -fallback_missing_features: - if (top->exclude_guest_missing) - attr->exclude_guest = attr->exclude_host = 0; -retry_sample_id: - attr->sample_id_all = top->sample_id_all_missing ? 0 : 1; try_again: if (perf_evsel__open(counter, top->evlist->cpus, top->evlist->threads) < 0) { - int err = errno; - - if (err == EPERM || err == EACCES) { - ui__error_paranoid(); - goto out_err; - } else if (err == EINVAL) { - if (!top->exclude_guest_missing && - (attr->exclude_guest || attr->exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - top->exclude_guest_missing = true; - goto fallback_missing_features; - } else if (!top->sample_id_all_missing) { - /* - * Old kernel, no attr->sample_id_type_all field - */ - top->sample_id_all_missing = true; - goto retry_sample_id; - } - } - /* - * If it's cycles then fall back to hrtimer - * based cpu-clock-tick sw counter, which - * is always available even if no PMU support: - */ - if ((err == ENOENT || err == ENXIO) && - (attr->type == PERF_TYPE_HARDWARE) && - (attr->config == PERF_COUNT_HW_CPU_CYCLES)) { - + if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { if (verbose) - ui__warning("Cycles event not supported,\n" - "trying to fall back to cpu-clock-ticks\n"); - - attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_SW_CPU_CLOCK; - if (counter->name) { - 
free(counter->name); - counter->name = NULL; - } + ui__warning("%s\n", msg); goto try_again; } - if (err == ENOENT) { - ui__error("The %s event is not supported.\n", - perf_evsel__name(counter)); - goto out_err; - } else if (err == EMFILE) { - ui__error("Too many events are opened.\n" - "Try again after reducing the number of events\n"); - goto out_err; - } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) { - ui__error("\'precise\' request may not be supported. " - "Try removing 'p' modifier\n"); - goto out_err; - } - - ui__error("The sys_perf_event_open() syscall " - "returned with %d (%s). /bin/dmesg " - "may provide additional information.\n" - "No CONFIG_PERF_EVENTS=y kernel support " - "configured?\n", err, strerror(err)); + perf_evsel__open_strerror(counter, &opts->target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); goto out_err; } } - if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) { + if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) { ui__error("Failed to mmap with %d (%s)\n", errno, strerror(errno)); goto out_err; @@ -1016,7 +914,7 @@ static int perf_top__setup_sample_type(struct perf_top *top) ui__error("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) { + } else if (callchain_param.mode != CHAIN_NONE) { if (callchain_register_param(&callchain_param) < 0) { ui__error("Can't register callchain params.\n"); return -EINVAL; @@ -1028,6 +926,7 @@ static int perf_top__setup_sample_type(struct perf_top *top) static int __cmd_top(struct perf_top *top) { + struct perf_record_opts *opts = &top->record_opts; pthread_t thread; int ret; /* @@ -1042,17 +941,28 @@ static int __cmd_top(struct perf_top *top) if (ret) goto out_delete; - if (perf_target__has_task(&top->target)) + if (perf_target__has_task(&opts->target)) perf_event__synthesize_thread_map(&top->tool, top->evlist->threads, perf_event__process, - &top->session->host_machine); + 
&top->session->machines.host); else perf_event__synthesize_threads(&top->tool, perf_event__process, - &top->session->host_machine); + &top->session->machines.host); perf_top__start_counters(top); top->session->evlist = top->evlist; perf_session__set_id_hdr_size(top->session); + /* + * When perf is starting the traced process, all the events (apart from + * group members) have enable_on_exec=1 set, so don't spoil it by + * prematurely enabling them. + * + * XXX 'top' still doesn't start workloads like record, trace, but should, + * so leave the check here. + */ + if (!perf_target__none(&opts->target)) + perf_evlist__enable(top->evlist); + /* Wait for a minimal set of events before starting the snapshot */ poll(top->evlist->pollfd, top->evlist->nr_fds, 100); @@ -1093,116 +1003,56 @@ out_delete: static int parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct perf_top *top = (struct perf_top *)opt->value; - char *tok, *tok2; - char *endptr; - /* * --no-call-graph */ - if (unset) { - top->dont_use_callchains = true; + if (unset) return 0; - } symbol_conf.use_callchain = true; - if (!arg) - return 0; - - tok = strtok((char *)arg, ","); - if (!tok) - return -1; - - /* get the output mode */ - if (!strncmp(tok, "graph", strlen(arg))) - callchain_param.mode = CHAIN_GRAPH_ABS; - - else if (!strncmp(tok, "flat", strlen(arg))) - callchain_param.mode = CHAIN_FLAT; - - else if (!strncmp(tok, "fractal", strlen(arg))) - callchain_param.mode = CHAIN_GRAPH_REL; - - else if (!strncmp(tok, "none", strlen(arg))) { - callchain_param.mode = CHAIN_NONE; - symbol_conf.use_callchain = false; - - return 0; - } else - return -1; - - /* get the min percentage */ - tok = strtok(NULL, ","); - if (!tok) - goto setup; - - callchain_param.min_percent = strtod(tok, &endptr); - if (tok == endptr) - return -1; - - /* get the print limit */ - tok2 = strtok(NULL, ","); - if (!tok2) - goto setup; - - if (tok2[0] != 'c') { - callchain_param.print_limit = strtod(tok2, 
&endptr); - tok2 = strtok(NULL, ","); - if (!tok2) - goto setup; - } - - /* get the call chain order */ - if (!strcmp(tok2, "caller")) - callchain_param.order = ORDER_CALLER; - else if (!strcmp(tok2, "callee")) - callchain_param.order = ORDER_CALLEE; - else - return -1; -setup: - if (callchain_register_param(&callchain_param) < 0) { - fprintf(stderr, "Can't register callchain params\n"); - return -1; - } - return 0; + return record_parse_callchain_opt(opt, arg, unset); } int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) { - struct perf_evsel *pos; int status; char errbuf[BUFSIZ]; struct perf_top top = { .count_filter = 5, .delay_secs = 2, - .freq = 4000, /* 4 KHz */ - .mmap_pages = 128, - .sym_pcnt_filter = 5, - .target = { - .uses_mmap = true, + .record_opts = { + .mmap_pages = UINT_MAX, + .user_freq = UINT_MAX, + .user_interval = ULLONG_MAX, + .freq = 4000, /* 4 KHz */ + .target = { + .uses_mmap = true, + }, }, + .sym_pcnt_filter = 5, }; - char callchain_default_opt[] = "fractal,0.5,callee"; + struct perf_record_opts *opts = &top.record_opts; + struct perf_target *target = &opts->target; const struct option options[] = { OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. 
use 'perf list' to list available events", parse_events_option), - OPT_INTEGER('c', "count", &top.default_interval, - "event period to sample"), - OPT_STRING('p', "pid", &top.target.pid, "pid", + OPT_U64('c', "count", &opts->user_interval, "event period to sample"), + OPT_STRING('p', "pid", &target->pid, "pid", "profile events on existing process id"), - OPT_STRING('t', "tid", &top.target.tid, "tid", + OPT_STRING('t', "tid", &target->tid, "tid", "profile events on existing thread id"), - OPT_BOOLEAN('a', "all-cpus", &top.target.system_wide, + OPT_BOOLEAN('a', "all-cpus", &target->system_wide, "system-wide collection from all CPUs"), - OPT_STRING('C', "cpu", &top.target.cpu_list, "cpu", + OPT_STRING('C', "cpu", &target->cpu_list, "cpu", "list of cpus to monitor"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, "hide kernel symbols"), - OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"), + OPT_UINTEGER('m', "mmap-pages", &opts->mmap_pages, + "number of mmap data pages"), OPT_INTEGER('r', "realtime", &top.realtime_prio, "collect data with this RT SCHED_FIFO priority"), OPT_INTEGER('d', "delay", &top.delay_secs, @@ -1211,16 +1061,14 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "dump the symbol table used for profiling"), OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), - OPT_BOOLEAN('g', "group", &top.group, + OPT_BOOLEAN('g', "group", &opts->group, "put the counters into a counter group"), - OPT_BOOLEAN('i', "inherit", &top.inherit, - "child tasks inherit counters"), + OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit, + "child tasks do not inherit counters"), OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name", "symbol to annotate"), - OPT_BOOLEAN('z', "zero", &top.zero, - "zero history across updates"), - OPT_INTEGER('F', "freq", &top.freq, - 
"profile at this frequency"), + OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"), + OPT_UINTEGER('F', "freq", &opts->user_freq, "profile at this frequency"), OPT_INTEGER('E', "entries", &top.print_entries, "display this many functions"), OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, @@ -1233,10 +1081,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order", - "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. " - "Default: fractal,0.5,callee", &parse_callchain_opt, - callchain_default_opt), + OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts, + "mode[,dump_size]", record_callchain_help, + &parse_callchain_opt, "fp"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), OPT_STRING(0, "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", @@ -1251,7 +1098,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), - OPT_STRING('u', "uid", &top.target.uid_str, "user", "user to profile"), + OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"), OPT_END() }; const char * const top_usage[] = { @@ -1281,27 +1128,27 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) setup_browser(false); - status = perf_target__validate(&top.target); + status = perf_target__validate(target); if (status) { - perf_target__strerror(&top.target, status, errbuf, BUFSIZ); + perf_target__strerror(target, status, errbuf, BUFSIZ); ui__warning("%s", errbuf); } - status = perf_target__parse_uid(&top.target); + status = perf_target__parse_uid(target); if (status) { int saved_errno = errno; - perf_target__strerror(&top.target, status, errbuf, BUFSIZ); + perf_target__strerror(target, status, errbuf, BUFSIZ); ui__error("%s", errbuf); status = -saved_errno; goto out_delete_evlist; } - if (perf_target__none(&top.target)) - top.target.system_wide = true; + if (perf_target__none(target)) + target->system_wide = true; - if (perf_evlist__create_maps(top.evlist, &top.target) < 0) + if (perf_evlist__create_maps(top.evlist, target) < 0) usage_with_options(top_usage, options); if (!top.evlist->nr_entries && @@ -1315,24 +1162,22 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) if (top.delay_secs < 1) top.delay_secs = 1; + if (opts->user_interval != ULLONG_MAX) + opts->default_interval = opts->user_interval; + if (opts->user_freq != UINT_MAX) + opts->freq = opts->user_freq; + /* * User specified count overrides default frequency. 
*/ - if (top.default_interval) - top.freq = 0; - else if (top.freq) { - top.default_interval = top.freq; + if (opts->default_interval) + opts->freq = 0; + else if (opts->freq) { + opts->default_interval = opts->freq; } else { ui__error("frequency and count are zero, aborting\n"); - exit(EXIT_FAILURE); - } - - list_for_each_entry(pos, &top.evlist->entries, node) { - /* - * Fill in the ones not specifically initialized via -c: - */ - if (!pos->attr.sample_period) - pos->attr.sample_period = top.default_interval; + status = -EINVAL; + goto out_delete_evlist; } top.sym_evsel = perf_evlist__first(top.evlist); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 7932ffa29889..d222d7fc7e96 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -455,7 +455,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config_attrs(evlist, &trace->opts); + perf_evlist__config(evlist, &trace->opts); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak index e5413125e6bb..8ef3bd30a549 100644 --- a/tools/perf/config/utilities.mak +++ b/tools/perf/config/utilities.mak @@ -13,7 +13,7 @@ newline := $(newline) # what should replace a newline when escaping # newlines; the default is a bizarre string. 
# -nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n) +nl-escape = $(if $(1),$(1),m822df3020w6a44id34bt574ctac44eb9f4n) # escape-nl # @@ -173,9 +173,9 @@ _ge-abspath = $(if $(is-executable),$(1)) # Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default) # define get-executable-or-default -$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) +$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2),$(1))) endef -_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) +_ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_err,$(2))) _gea_warn = $(warning The path '$(1)' is not executable.) _gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 0f661fbce6a8..095b88207cd3 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -328,14 +328,23 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode)) return 0; + status = 1; /* Check for ENOSPC and EIO errors.. 
*/ - if (fflush(stdout)) - die("write failure on standard output: %s", strerror(errno)); - if (ferror(stdout)) - die("unknown write failure on standard output"); - if (fclose(stdout)) - die("close failed on standard output: %s", strerror(errno)); - return 0; + if (fflush(stdout)) { + fprintf(stderr, "write failure on standard output: %s", strerror(errno)); + goto out; + } + if (ferror(stdout)) { + fprintf(stderr, "unknown write failure on standard output"); + goto out; + } + if (fclose(stdout)) { + fprintf(stderr, "close failed on standard output: %s", strerror(errno)); + goto out; + } + status = 0; +out: + return status; } static void handle_internal_command(int argc, const char **argv) @@ -467,7 +476,8 @@ int main(int argc, const char **argv) cmd += 5; argv[0] = cmd; handle_internal_command(argc, argv); - die("cannot handle %s internally", cmd); + fprintf(stderr, "cannot handle %s internally", cmd); + goto out; } /* Look for flags.. */ @@ -485,7 +495,7 @@ int main(int argc, const char **argv) printf("\n usage: %s\n\n", perf_usage_string); list_common_cmds_help(); printf("\n %s\n\n", perf_more_info_string); - exit(1); + goto out; } cmd = argv[0]; @@ -517,7 +527,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Expansion of alias '%s' failed; " "'%s' is not a perf-command\n", cmd, argv[0]); - exit(1); + goto out; } if (!done_help) { cmd = argv[0] = help_unknown_cmd(cmd); @@ -528,6 +538,6 @@ int main(int argc, const char **argv) fprintf(stderr, "Failed to run command '%s': %s\n", cmd, strerror(errno)); - +out: return 1; } diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2c340e7da458..8f3bf388e414 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -1,10 +1,6 @@ #ifndef _PERF_PERF_H #define _PERF_PERF_H -struct winsize; - -void get_term_dimensions(struct winsize *ws); - #include <asm/unistd.h> #if defined(__i386__) @@ -237,8 +233,6 @@ struct perf_record_opts { bool raw_samples; bool sample_address; bool sample_time; - bool 
sample_id_all_missing; - bool exclude_guest_missing; bool period; unsigned int freq; unsigned int mmap_pages; diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record deleted file mode 100644 index 8edda9078d5d..000000000000 --- a/tools/perf/scripts/perl/bin/workqueue-stats-record +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -perf record -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@ diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report deleted file mode 100644 index 6d91411d248c..000000000000 --- a/tools/perf/scripts/perl/bin/workqueue-stats-report +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# description: workqueue stats (ins/exe/create/destroy) -perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl deleted file mode 100644 index a8eaff5119e0..000000000000 --- a/tools/perf/scripts/perl/workqueue-stats.pl +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/perl -w -# (c) 2009, Tom Zanussi <tzanussi@gmail.com> -# Licensed under the terms of the GNU GPL License version 2 - -# Displays workqueue stats -# -# Usage: -# -# perf record -c 1 -f -a -R -e workqueue:workqueue_creation -e -# workqueue:workqueue_destruction -e workqueue:workqueue_execution -# -e workqueue:workqueue_insertion -# -# perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl - -use 5.010000; -use strict; -use warnings; - -use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib"; -use lib "./Perf-Trace-Util/lib"; -use Perf::Trace::Core; -use Perf::Trace::Util; - -my @cpus; - -sub workqueue::workqueue_destruction -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid) = @_; - - 
$cpus[$common_cpu]{$thread_pid}{destroyed}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_creation -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $cpu) = @_; - - $cpus[$common_cpu]{$thread_pid}{created}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_execution -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $func) = @_; - - $cpus[$common_cpu]{$thread_pid}{executed}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_insertion -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $func) = @_; - - $cpus[$common_cpu]{$thread_pid}{inserted}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub trace_end -{ - print "workqueue work stats:\n\n"; - my $cpu = 0; - printf("%3s %6s %6s\t%-20s\n", "cpu", "ins", "exec", "name"); - printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----"); - foreach my $pidhash (@cpus) { - while ((my $pid, my $wqhash) = each %$pidhash) { - my $ins = $$wqhash{'inserted'} || 0; - my $exe = $$wqhash{'executed'} || 0; - my $comm = $$wqhash{'comm'} || ""; - if ($ins || $exe) { - printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm); - } - } - $cpu++; - } - - $cpu = 0; - print "\nworkqueue lifecycle stats:\n\n"; - printf("%3s %6s %6s\t%-20s\n", "cpu", "created", "destroyed", "name"); - printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----"); - foreach my $pidhash (@cpus) { - while ((my $pid, my $wqhash) = each %$pidhash) { - my $created = $$wqhash{'created'} || 0; - my $destroyed = $$wqhash{'destroyed'} || 0; - my $comm = $$wqhash{'comm'} || ""; - if ($created || $destroyed) { - printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed, - $comm); - } - } 
- $cpu++; - } - - print_unhandled(); -} - -my %unhandled; - -sub print_unhandled -{ - if ((scalar keys %unhandled) == 0) { - return; - } - - print "\nunhandled events:\n\n"; - - printf("%-40s %10s\n", "event", "count"); - printf("%-40s %10s\n", "----------------------------------------", - "-----------"); - - foreach my $event_name (keys %unhandled) { - printf("%-40s %10d\n", $event_name, $unhandled{$event_name}); - } -} - -sub trace_unhandled -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; - - $unhandled{$event_name}++; -} diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 25638a986257..f61dd3fb546b 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -33,8 +33,6 @@ extern int verbose; -bool test_attr__enabled; - static char *dir; void test_attr__init(void) @@ -146,7 +144,7 @@ static int run_dir(const char *d, const char *perf) { char cmd[3*PATH_MAX]; - snprintf(cmd, 3*PATH_MAX, "python %s/attr.py -d %s/attr/ -p %s %s", + snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %s", d, d, perf, verbose ? 
"-v" : ""); return system(cmd); diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py index e702b82dcb86..2f629ca485bc 100644 --- a/tools/perf/tests/attr.py +++ b/tools/perf/tests/attr.py @@ -68,7 +68,7 @@ class Event(dict): self[key] = val def __init__(self, name, data, base): - log.info(" Event %s" % name); + log.debug(" Event %s" % name); self.name = name; self.group = '' self.add(base) @@ -97,6 +97,14 @@ class Event(dict): return False return True + def diff(self, other): + for t in Event.terms: + if not self.has_key(t) or not other.has_key(t): + continue + if not self.compare_data(self[t], other[t]): + log.warning("expected %s=%s, got %s" % (t, self[t], other[t])) + + # Test file description needs to have following sections: # [config] # - just single instance in file @@ -113,7 +121,7 @@ class Test(object): parser = ConfigParser.SafeConfigParser() parser.read(path) - log.warning("running '%s'" % path) + log.debug("running '%s'" % path) self.path = path self.test_dir = options.test_dir @@ -128,7 +136,7 @@ class Test(object): self.expect = {} self.result = {} - log.info(" loading expected events"); + log.debug(" loading expected events"); self.load_events(path, self.expect) def is_event(self, name): @@ -164,7 +172,7 @@ class Test(object): self.perf, self.command, tempdir, self.args) ret = os.WEXITSTATUS(os.system(cmd)) - log.info(" running '%s' ret %d " % (cmd, ret)) + log.warning(" running '%s' ret %d " % (cmd, ret)) if ret != int(self.ret): raise Unsup(self) @@ -172,7 +180,7 @@ class Test(object): def compare(self, expect, result): match = {} - log.info(" compare"); + log.debug(" compare"); # For each expected event find all matching # events in result. Fail if there's not any. 
@@ -187,10 +195,11 @@ class Test(object): else: log.debug(" ->FAIL"); - log.info(" match: [%s] matches %s" % (exp_name, str(exp_list))) + log.debug(" match: [%s] matches %s" % (exp_name, str(exp_list))) # we did not any matching event - fail if (not exp_list): + exp_event.diff(res_event) raise Fail(self, 'match failure'); match[exp_name] = exp_list @@ -208,10 +217,10 @@ class Test(object): if res_group not in match[group]: raise Fail(self, 'group failure') - log.info(" group: [%s] matches group leader %s" % + log.debug(" group: [%s] matches group leader %s" % (exp_name, str(match[group]))) - log.info(" matched") + log.debug(" matched") def resolve_groups(self, events): for name, event in events.items(): @@ -233,7 +242,7 @@ class Test(object): self.run_cmd(tempdir); # load events expectation for the test - log.info(" loading result events"); + log.debug(" loading result events"); for f in glob.glob(tempdir + '/event*'): self.load_events(f, self.result); diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index f1485d8e6a0b..5bc3880f7be5 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -7,7 +7,7 @@ size=96 config=0 sample_period=4000 sample_type=263 -read_format=7 +read_format=0 disabled=1 inherit=1 pinned=0 diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group index a6599e9a19d3..57739cacdb2a 100644 --- a/tools/perf/tests/attr/test-record-group +++ b/tools/perf/tests/attr/test-record-group @@ -6,12 +6,14 @@ args = --group -e cycles,instructions kill >/dev/null 2>&1 fd=1 group_fd=-1 sample_type=327 +read_format=4 [event-2:base-record] fd=2 group_fd=1 config=1 sample_type=327 +read_format=4 mmap=0 comm=0 enable_on_exec=0 diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1 index 5a8359da38af..c5548d054aff 100644 --- a/tools/perf/tests/attr/test-record-group1 +++ b/tools/perf/tests/attr/test-record-group1 @@ -1,11 
+1,12 @@ [config] command = record -args = -e '{cycles,instructions}' kill >/tmp/krava 2>&1 +args = -e '{cycles,instructions}' kill >/dev/null 2>&1 [event-1:base-record] fd=1 group_fd=-1 sample_type=327 +read_format=4 [event-2:base-record] fd=2 @@ -13,6 +14,7 @@ group_fd=1 type=0 config=1 sample_type=327 +read_format=4 mmap=0 comm=0 enable_on_exec=0 diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 186f67535494..acb98e0e39f2 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -4,6 +4,7 @@ * Builtin regression testing command: ever growing number of sanity tests */ #include "builtin.h" +#include "intlist.h" #include "tests.h" #include "debug.h" #include "color.h" @@ -69,6 +70,14 @@ static struct test { .func = test__attr, }, { + .desc = "Test matching and linking mutliple hists", + .func = test__hists_link, + }, + { + .desc = "Try 'use perf' in python, checking link problems", + .func = test__python_use, + }, + { .func = NULL, }, }; @@ -97,7 +106,7 @@ static bool perf_test__matches(int curr, int argc, const char *argv[]) return false; } -static int __cmd_test(int argc, const char *argv[]) +static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) { int i = 0; int width = 0; @@ -118,13 +127,28 @@ static int __cmd_test(int argc, const char *argv[]) continue; pr_info("%2d: %-*s:", i, width, tests[curr].desc); + + if (intlist__find(skiplist, i)) { + color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n"); + continue; + } + pr_debug("\n--- start ---\n"); err = tests[curr].func(); pr_debug("---- end ----\n%s:", tests[curr].desc); - if (err) - color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); - else + + switch (err) { + case TEST_OK: pr_info(" Ok\n"); + break; + case TEST_SKIP: + color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip\n"); + break; + case TEST_FAIL: + default: + color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); + break; + } } return 0; @@ -152,11 +176,14 @@ 
int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused) "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]", NULL, }; + const char *skip = NULL; const struct option test_options[] = { + OPT_STRING('s', "skip", &skip, "tests", "tests to skip"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() }; + struct intlist *skiplist = NULL; argc = parse_options(argc, argv, test_options, test_usage, 0); if (argc >= 1 && !strcmp(argv[0], "list")) @@ -169,5 +196,8 @@ int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused) if (symbol__init() < 0) return -1; - return __cmd_test(argc, argv); + if (skip != NULL) + skiplist = intlist__new(skip); + + return __cmd_test(argc, argv, skiplist); } diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index e61fc828a158..0fd99a9adb91 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -22,7 +22,7 @@ static int perf_evsel__roundtrip_cache_name_test(void) for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { __perf_evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); - err = parse_events(evlist, name, 0); + err = parse_events(evlist, name); if (err) ret = err; } @@ -70,7 +70,7 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names) return -ENOMEM; for (i = 0; i < nr_names; ++i) { - err = parse_events(evlist, names[i], 0); + err = parse_events(evlist, names[i]); if (err) { pr_debug("failed to parse event '%s', err %d\n", names[i], err); diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c new file mode 100644 index 000000000000..0afd9223bde7 --- /dev/null +++ b/tools/perf/tests/hists_link.c @@ -0,0 +1,499 @@ +#include "perf.h" +#include "tests.h" +#include "debug.h" +#include "symbol.h" +#include "sort.h" +#include "evsel.h" +#include "evlist.h" +#include "machine.h" 
+#include "thread.h" +#include "parse-events.h" + +static struct { + u32 pid; + const char *comm; +} fake_threads[] = { + { 100, "perf" }, + { 200, "perf" }, + { 300, "bash" }, +}; + +static struct { + u32 pid; + u64 start; + const char *filename; +} fake_mmap_info[] = { + { 100, 0x40000, "perf" }, + { 100, 0x50000, "libc" }, + { 100, 0xf0000, "[kernel]" }, + { 200, 0x40000, "perf" }, + { 200, 0x50000, "libc" }, + { 200, 0xf0000, "[kernel]" }, + { 300, 0x40000, "bash" }, + { 300, 0x50000, "libc" }, + { 300, 0xf0000, "[kernel]" }, +}; + +struct fake_sym { + u64 start; + u64 length; + const char *name; +}; + +static struct fake_sym perf_syms[] = { + { 700, 100, "main" }, + { 800, 100, "run_command" }, + { 900, 100, "cmd_record" }, +}; + +static struct fake_sym bash_syms[] = { + { 700, 100, "main" }, + { 800, 100, "xmalloc" }, + { 900, 100, "xfree" }, +}; + +static struct fake_sym libc_syms[] = { + { 700, 100, "malloc" }, + { 800, 100, "free" }, + { 900, 100, "realloc" }, +}; + +static struct fake_sym kernel_syms[] = { + { 700, 100, "schedule" }, + { 800, 100, "page_fault" }, + { 900, 100, "sys_perf_event_open" }, +}; + +static struct { + const char *dso_name; + struct fake_sym *syms; + size_t nr_syms; +} fake_symbols[] = { + { "perf", perf_syms, ARRAY_SIZE(perf_syms) }, + { "bash", bash_syms, ARRAY_SIZE(bash_syms) }, + { "libc", libc_syms, ARRAY_SIZE(libc_syms) }, + { "[kernel]", kernel_syms, ARRAY_SIZE(kernel_syms) }, +}; + +static struct machine *setup_fake_machine(struct machines *machines) +{ + struct machine *machine = machines__find(machines, HOST_KERNEL_ID); + size_t i; + + if (machine == NULL) { + pr_debug("Not enough memory for machine setup\n"); + return NULL; + } + + for (i = 0; i < ARRAY_SIZE(fake_threads); i++) { + struct thread *thread; + + thread = machine__findnew_thread(machine, fake_threads[i].pid); + if (thread == NULL) + goto out; + + thread__set_comm(thread, fake_threads[i].comm); + } + + for (i = 0; i < ARRAY_SIZE(fake_mmap_info); i++) { + union 
perf_event fake_mmap_event = { + .mmap = { + .header = { .misc = PERF_RECORD_MISC_USER, }, + .pid = fake_mmap_info[i].pid, + .start = fake_mmap_info[i].start, + .len = 0x1000ULL, + .pgoff = 0ULL, + }, + }; + + strcpy(fake_mmap_event.mmap.filename, + fake_mmap_info[i].filename); + + machine__process_mmap_event(machine, &fake_mmap_event); + } + + for (i = 0; i < ARRAY_SIZE(fake_symbols); i++) { + size_t k; + struct dso *dso; + + dso = __dsos__findnew(&machine->user_dsos, + fake_symbols[i].dso_name); + if (dso == NULL) + goto out; + + /* emulate dso__load() */ + dso__set_loaded(dso, MAP__FUNCTION); + + for (k = 0; k < fake_symbols[i].nr_syms; k++) { + struct symbol *sym; + struct fake_sym *fsym = &fake_symbols[i].syms[k]; + + sym = symbol__new(fsym->start, fsym->length, + STB_GLOBAL, fsym->name); + if (sym == NULL) + goto out; + + symbols__insert(&dso->symbols[MAP__FUNCTION], sym); + } + } + + return machine; + +out: + pr_debug("Not enough memory for machine setup\n"); + machine__delete_threads(machine); + machine__delete(machine); + return NULL; +} + +struct sample { + u32 pid; + u64 ip; + struct thread *thread; + struct map *map; + struct symbol *sym; +}; + +static struct sample fake_common_samples[] = { + /* perf [kernel] schedule() */ + { .pid = 100, .ip = 0xf0000 + 700, }, + /* perf [perf] main() */ + { .pid = 200, .ip = 0x40000 + 700, }, + /* perf [perf] cmd_record() */ + { .pid = 200, .ip = 0x40000 + 900, }, + /* bash [bash] xmalloc() */ + { .pid = 300, .ip = 0x40000 + 800, }, + /* bash [libc] malloc() */ + { .pid = 300, .ip = 0x50000 + 700, }, +}; + +static struct sample fake_samples[][5] = { + { + /* perf [perf] run_command() */ + { .pid = 100, .ip = 0x40000 + 800, }, + /* perf [libc] malloc() */ + { .pid = 100, .ip = 0x50000 + 700, }, + /* perf [kernel] page_fault() */ + { .pid = 100, .ip = 0xf0000 + 800, }, + /* perf [kernel] sys_perf_event_open() */ + { .pid = 200, .ip = 0xf0000 + 900, }, + /* bash [libc] free() */ + { .pid = 300, .ip = 0x50000 + 800, }, + 
}, + { + /* perf [libc] free() */ + { .pid = 200, .ip = 0x50000 + 800, }, + /* bash [libc] malloc() */ + { .pid = 300, .ip = 0x50000 + 700, }, /* will be merged */ + /* bash [bash] xfee() */ + { .pid = 300, .ip = 0x40000 + 900, }, + /* bash [libc] realloc() */ + { .pid = 300, .ip = 0x50000 + 900, }, + /* bash [kernel] page_fault() */ + { .pid = 300, .ip = 0xf0000 + 800, }, + }, +}; + +static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine) +{ + struct perf_evsel *evsel; + struct addr_location al; + struct hist_entry *he; + struct perf_sample sample = { .cpu = 0, }; + size_t i = 0, k; + + /* + * each evsel will have 10 samples - 5 common and 5 distinct. + * However the second evsel also has a collapsed entry for + * "bash [libc] malloc" so total 9 entries will be in the tree. + */ + list_for_each_entry(evsel, &evlist->entries, node) { + for (k = 0; k < ARRAY_SIZE(fake_common_samples); k++) { + const union perf_event event = { + .ip = { + .header = { + .misc = PERF_RECORD_MISC_USER, + }, + .pid = fake_common_samples[k].pid, + .ip = fake_common_samples[k].ip, + }, + }; + + if (perf_event__preprocess_sample(&event, machine, &al, + &sample, 0) < 0) + goto out; + + he = __hists__add_entry(&evsel->hists, &al, NULL, 1); + if (he == NULL) + goto out; + + fake_common_samples[k].thread = al.thread; + fake_common_samples[k].map = al.map; + fake_common_samples[k].sym = al.sym; + } + + for (k = 0; k < ARRAY_SIZE(fake_samples[i]); k++) { + const union perf_event event = { + .ip = { + .header = { + .misc = PERF_RECORD_MISC_USER, + }, + .pid = fake_samples[i][k].pid, + .ip = fake_samples[i][k].ip, + }, + }; + + if (perf_event__preprocess_sample(&event, machine, &al, + &sample, 0) < 0) + goto out; + + he = __hists__add_entry(&evsel->hists, &al, NULL, 1); + if (he == NULL) + goto out; + + fake_samples[i][k].thread = al.thread; + fake_samples[i][k].map = al.map; + fake_samples[i][k].sym = al.sym; + } + i++; + } + + return 0; + +out: + pr_debug("Not enough 
memory for adding a hist entry\n"); + return -1; +} + +static int find_sample(struct sample *samples, size_t nr_samples, + struct thread *t, struct map *m, struct symbol *s) +{ + while (nr_samples--) { + if (samples->thread == t && samples->map == m && + samples->sym == s) + return 1; + samples++; + } + return 0; +} + +static int __validate_match(struct hists *hists) +{ + size_t count = 0; + struct rb_root *root; + struct rb_node *node; + + /* + * Only entries from fake_common_samples should have a pair. + */ + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + if (hist_entry__has_pairs(he)) { + if (find_sample(fake_common_samples, + ARRAY_SIZE(fake_common_samples), + he->thread, he->ms.map, he->ms.sym)) { + count++; + } else { + pr_debug("Can't find the matched entry\n"); + return -1; + } + } + + node = rb_next(node); + } + + if (count != ARRAY_SIZE(fake_common_samples)) { + pr_debug("Invalid count for matched entries: %zd of %zd\n", + count, ARRAY_SIZE(fake_common_samples)); + return -1; + } + + return 0; +} + +static int validate_match(struct hists *leader, struct hists *other) +{ + return __validate_match(leader) || __validate_match(other); +} + +static int __validate_link(struct hists *hists, int idx) +{ + size_t count = 0; + size_t count_pair = 0; + size_t count_dummy = 0; + struct rb_root *root; + struct rb_node *node; + + /* + * Leader hists (idx = 0) will have dummy entries from other, + * and some entries will have no pair. However every entry + * in other hists should have (dummy) pair. 
+ */ + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + if (hist_entry__has_pairs(he)) { + if (!find_sample(fake_common_samples, + ARRAY_SIZE(fake_common_samples), + he->thread, he->ms.map, he->ms.sym) && + !find_sample(fake_samples[idx], + ARRAY_SIZE(fake_samples[idx]), + he->thread, he->ms.map, he->ms.sym)) { + count_dummy++; + } + count_pair++; + } else if (idx) { + pr_debug("A entry from the other hists should have pair\n"); + return -1; + } + + count++; + node = rb_next(node); + } + + /* + * Note that we have a entry collapsed in the other (idx = 1) hists. + */ + if (idx == 0) { + if (count_dummy != ARRAY_SIZE(fake_samples[1]) - 1) { + pr_debug("Invalid count of dummy entries: %zd of %zd\n", + count_dummy, ARRAY_SIZE(fake_samples[1]) - 1); + return -1; + } + if (count != count_pair + ARRAY_SIZE(fake_samples[0])) { + pr_debug("Invalid count of total leader entries: %zd of %zd\n", + count, count_pair + ARRAY_SIZE(fake_samples[0])); + return -1; + } + } else { + if (count != count_pair) { + pr_debug("Invalid count of total other entries: %zd of %zd\n", + count, count_pair); + return -1; + } + if (count_dummy > 0) { + pr_debug("Other hists should not have dummy entries: %zd\n", + count_dummy); + return -1; + } + } + + return 0; +} + +static int validate_link(struct hists *leader, struct hists *other) +{ + return __validate_link(leader, 0) || __validate_link(other, 1); +} + +static void print_hists(struct hists *hists) +{ + int i = 0; + struct rb_root *root; + struct rb_node *node; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + pr_info("----- %s --------\n", __func__); + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + pr_info("%2d: entry: %-8s [%-8s] %20s: 
period = %"PRIu64"\n", + i, he->thread->comm, he->ms.map->dso->short_name, + he->ms.sym->name, he->stat.period); + + i++; + node = rb_next(node); + } +} + +int test__hists_link(void) +{ + int err = -1; + struct machines machines; + struct machine *machine = NULL; + struct perf_evsel *evsel, *first; + struct perf_evlist *evlist = perf_evlist__new(NULL, NULL); + + if (evlist == NULL) + return -ENOMEM; + + err = parse_events(evlist, "cpu-clock"); + if (err) + goto out; + err = parse_events(evlist, "task-clock"); + if (err) + goto out; + + /* default sort order (comm,dso,sym) will be used */ + setup_sorting(NULL, NULL); + + machines__init(&machines); + + /* setup threads/dso/map/symbols also */ + machine = setup_fake_machine(&machines); + if (!machine) + goto out; + + if (verbose > 1) + machine__fprintf(machine, stderr); + + /* process sample events */ + err = add_hist_entries(evlist, machine); + if (err < 0) + goto out; + + list_for_each_entry(evsel, &evlist->entries, node) { + hists__collapse_resort(&evsel->hists); + + if (verbose > 2) + print_hists(&evsel->hists); + } + + first = perf_evlist__first(evlist); + evsel = perf_evlist__last(evlist); + + /* match common entries */ + hists__match(&first->hists, &evsel->hists); + err = validate_match(&first->hists, &evsel->hists); + if (err) + goto out; + + /* link common and/or dummy entries */ + hists__link(&first->hists, &evsel->hists); + err = validate_link(&first->hists, &evsel->hists); + if (err) + goto out; + + err = 0; + +out: + /* tear down everything */ + perf_evlist__delete(evlist); + machines__exit(&machines); + + return err; +} diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index e1746811e14b..cdd50755af51 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -22,36 +22,16 @@ int test__basic_mmap(void) struct thread_map *threads; struct cpu_map *cpus; struct perf_evlist *evlist; - struct perf_event_attr attr = { - .type = PERF_TYPE_TRACEPOINT, - 
.read_format = PERF_FORMAT_ID, - .sample_type = PERF_SAMPLE_ID, - .watermark = 0, - }; cpu_set_t cpu_set; const char *syscall_names[] = { "getsid", "getppid", "getpgrp", "getpgid", }; pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp, (void*)getpgid }; #define nsyscalls ARRAY_SIZE(syscall_names) - int ids[nsyscalls]; unsigned int nr_events[nsyscalls], expected_nr_events[nsyscalls], i, j; struct perf_evsel *evsels[nsyscalls], *evsel; - for (i = 0; i < nsyscalls; ++i) { - char name[64]; - - snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); - ids[i] = trace_event__id(name); - if (ids[i] < 0) { - pr_debug("Is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - nr_events[i] = 0; - expected_nr_events[i] = random() % 257; - } - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); @@ -79,18 +59,19 @@ int test__basic_mmap(void) goto out_free_cpus; } - /* anonymous union fields, can't be initialized above */ - attr.wakeup_events = 1; - attr.sample_period = 1; - for (i = 0; i < nsyscalls; ++i) { - attr.config = ids[i]; - evsels[i] = perf_evsel__new(&attr, i); + char name[64]; + + snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); + evsels[i] = perf_evsel__newtp("syscalls", name, i); if (evsels[i] == NULL) { pr_debug("perf_evsel__new\n"); goto out_free_evlist; } + evsels[i]->attr.wakeup_events = 1; + perf_evsel__set_sample_id(evsels[i]); + perf_evlist__add(evlist, evsels[i]); if (perf_evsel__open(evsels[i], cpus, threads) < 0) { @@ -99,6 +80,9 @@ int test__basic_mmap(void) strerror(errno)); goto out_close_fd; } + + nr_events[i] = 0; + expected_nr_events[i] = 1 + rand() % 127; } if (perf_evlist__mmap(evlist, 128, true) < 0) { @@ -128,6 +112,7 @@ int test__basic_mmap(void) goto out_munmap; } + err = -1; evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 @@ -137,16 +122,17 @@ int test__basic_mmap(void) nr_events[evsel->idx]++; 
} + err = 0; list_for_each_entry(evsel, &evlist->entries, node) { if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) { pr_debug("expected %d %s events, got %d\n", expected_nr_events[evsel->idx], perf_evsel__name(evsel), nr_events[evsel->idx]); + err = -1; goto out_munmap; } } - err = 0; out_munmap: perf_evlist__munmap(evlist); out_close_fd: diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c index 31072aba0d54..9b920a0cce79 100644 --- a/tools/perf/tests/open-syscall-all-cpus.c +++ b/tools/perf/tests/open-syscall-all-cpus.c @@ -7,20 +7,12 @@ int test__open_syscall_event_on_all_cpus(void) { int err = -1, fd, cpu; - struct thread_map *threads; struct cpu_map *cpus; struct perf_evsel *evsel; - struct perf_event_attr attr; unsigned int nr_open_calls = 111, i; cpu_set_t cpu_set; - int id = trace_event__id("sys_enter_open"); + struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX); - if (id < 0) { - pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); return -1; @@ -32,15 +24,11 @@ int test__open_syscall_event_on_all_cpus(void) goto out_thread_map_delete; } - CPU_ZERO(&cpu_set); - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_TRACEPOINT; - attr.config = id; - evsel = perf_evsel__new(&attr, 0); + evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0); if (evsel == NULL) { - pr_debug("perf_evsel__new\n"); + pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); goto out_thread_map_delete; } diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c index 98be8b518b4f..befc0671f95d 100644 --- a/tools/perf/tests/open-syscall.c +++ b/tools/perf/tests/open-syscall.c @@ -6,29 +6,18 @@ int test__open_syscall_event(void) { int err = -1, fd; - struct thread_map *threads; struct perf_evsel *evsel; - struct perf_event_attr attr; unsigned int nr_open_calls 
= 111, i; - int id = trace_event__id("sys_enter_open"); + struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX); - if (id < 0) { - pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); return -1; } - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_TRACEPOINT; - attr.config = id; - evsel = perf_evsel__new(&attr, 0); + evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0); if (evsel == NULL) { - pr_debug("perf_evsel__new\n"); + pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); goto out_thread_map_delete; } diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 32ee478905eb..20acaff295d2 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -3,6 +3,7 @@ #include "evsel.h" #include "evlist.h" #include "sysfs.h" +#include "debugfs.h" #include "tests.h" #include <linux/hw_breakpoint.h> @@ -463,10 +464,10 @@ static int test__checkevent_pmu_events(struct perf_evlist *evlist) static int test__checkterms_simple(struct list_head *terms) { - struct parse_events__term *term; + struct parse_events_term *term; /* config=10 */ - term = list_entry(terms->next, struct parse_events__term, list); + term = list_entry(terms->next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG); TEST_ASSERT_VAL("wrong type val", @@ -475,7 +476,7 @@ static int test__checkterms_simple(struct list_head *terms) TEST_ASSERT_VAL("wrong config", !term->config); /* config1 */ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG1); TEST_ASSERT_VAL("wrong type val", @@ -484,7 +485,7 @@ static int test__checkterms_simple(struct list_head *terms) 
TEST_ASSERT_VAL("wrong config", !term->config); /* config2=3 */ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG2); TEST_ASSERT_VAL("wrong type val", @@ -493,7 +494,7 @@ static int test__checkterms_simple(struct list_head *terms) TEST_ASSERT_VAL("wrong config", !term->config); /* umask=1*/ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_USER); TEST_ASSERT_VAL("wrong type val", @@ -521,7 +522,7 @@ static int test__group1(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* cycles:upp */ evsel = perf_evsel__next(evsel); @@ -557,7 +558,7 @@ static int test__group2(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* cache-references + :u modifier */ evsel = perf_evsel__next(evsel); @@ -583,7 +584,7 @@ static int test__group2(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", 
!perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } @@ -606,7 +607,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group1")); @@ -636,7 +637,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group2")); @@ -663,7 +664,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } @@ -687,7 +688,7 @@ static int test__group4(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 1); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions:kp 
+ p */ evsel = perf_evsel__next(evsel); @@ -724,7 +725,7 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions + G */ evsel = perf_evsel__next(evsel); @@ -751,7 +752,7 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions:G */ evsel = perf_evsel__next(evsel); @@ -777,18 +778,75 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } -struct test__event_st { +static int count_tracepoints(void) +{ + char events_path[PATH_MAX]; + struct dirent *events_ent; + DIR *events_dir; + int cnt = 0; + + scnprintf(events_path, PATH_MAX, "%s/tracing/events", + debugfs_find_mountpoint()); + + events_dir = opendir(events_path); + + TEST_ASSERT_VAL("Can't open events dir", events_dir); + + while ((events_ent = readdir(events_dir))) { + char sys_path[PATH_MAX]; + struct dirent *sys_ent; + DIR *sys_dir; + + if (!strcmp(events_ent->d_name, ".") + || !strcmp(events_ent->d_name, "..") + || !strcmp(events_ent->d_name, "enable") 
+ || !strcmp(events_ent->d_name, "header_event") + || !strcmp(events_ent->d_name, "header_page")) + continue; + + scnprintf(sys_path, PATH_MAX, "%s/%s", + events_path, events_ent->d_name); + + sys_dir = opendir(sys_path); + TEST_ASSERT_VAL("Can't open sys dir", sys_dir); + + while ((sys_ent = readdir(sys_dir))) { + if (!strcmp(sys_ent->d_name, ".") + || !strcmp(sys_ent->d_name, "..") + || !strcmp(sys_ent->d_name, "enable") + || !strcmp(sys_ent->d_name, "filter")) + continue; + + cnt++; + } + + closedir(sys_dir); + } + + closedir(events_dir); + return cnt; +} + +static int test__all_tracepoints(struct perf_evlist *evlist) +{ + TEST_ASSERT_VAL("wrong events count", + count_tracepoints() == evlist->nr_entries); + + return test__checkevent_tracepoint_multi(evlist); +} + +struct evlist_test { const char *name; __u32 type; int (*check)(struct perf_evlist *evlist); }; -static struct test__event_st test__events[] = { +static struct evlist_test test__events[] = { [0] = { .name = "syscalls:sys_enter_open", .check = test__checkevent_tracepoint, @@ -921,9 +979,13 @@ static struct test__event_st test__events[] = { .name = "{cycles,instructions}:G,{cycles:G,instructions:G},cycles", .check = test__group5, }, + [33] = { + .name = "*:*", + .check = test__all_tracepoints, + }, }; -static struct test__event_st test__events_pmu[] = { +static struct evlist_test test__events_pmu[] = { [0] = { .name = "cpu/config=10,config1,config2=3,period=1000/u", .check = test__checkevent_pmu, @@ -934,20 +996,20 @@ static struct test__event_st test__events_pmu[] = { }, }; -struct test__term { +struct terms_test { const char *str; __u32 type; int (*check)(struct list_head *terms); }; -static struct test__term test__terms[] = { +static struct terms_test test__terms[] = { [0] = { .str = "config=10,config1,config2=3,umask=1", .check = test__checkterms_simple, }, }; -static int test_event(struct test__event_st *e) +static int test_event(struct evlist_test *e) { struct perf_evlist *evlist; int ret; @@ 
-956,7 +1018,7 @@ static int test_event(struct test__event_st *e) if (evlist == NULL) return -ENOMEM; - ret = parse_events(evlist, e->name, 0); + ret = parse_events(evlist, e->name); if (ret) { pr_debug("failed to parse event '%s', err %d\n", e->name, ret); @@ -969,13 +1031,13 @@ static int test_event(struct test__event_st *e) return ret; } -static int test_events(struct test__event_st *events, unsigned cnt) +static int test_events(struct evlist_test *events, unsigned cnt) { int ret1, ret2 = 0; unsigned i; for (i = 0; i < cnt; i++) { - struct test__event_st *e = &events[i]; + struct evlist_test *e = &events[i]; pr_debug("running test %d '%s'\n", i, e->name); ret1 = test_event(e); @@ -986,7 +1048,7 @@ static int test_events(struct test__event_st *events, unsigned cnt) return ret2; } -static int test_term(struct test__term *t) +static int test_term(struct terms_test *t) { struct list_head *terms; int ret; @@ -1010,13 +1072,13 @@ static int test_term(struct test__term *t) return ret; } -static int test_terms(struct test__term *terms, unsigned cnt) +static int test_terms(struct terms_test *terms, unsigned cnt) { int ret = 0; unsigned i; for (i = 0; i < cnt; i++) { - struct test__term *t = &terms[i]; + struct terms_test *t = &terms[i]; pr_debug("running test %d '%s'\n", i, t->str); ret = test_term(t); @@ -1067,7 +1129,7 @@ static int test_pmu_events(void) while (!ret && (ent = readdir(dir))) { #define MAX_NAME 100 - struct test__event_st e; + struct evlist_test e; char name[MAX_NAME]; if (!strcmp(ent->d_name, ".") || diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 70e0d4421df8..6ea66cf6791b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -103,10 +103,10 @@ int test__PERF_RECORD(void) * Config the evsels, setting attr->comm on the first one, etc. 
*/ evsel = perf_evlist__first(evlist); - evsel->attr.sample_type |= PERF_SAMPLE_CPU; - evsel->attr.sample_type |= PERF_SAMPLE_TID; - evsel->attr.sample_type |= PERF_SAMPLE_TIME; - perf_evlist__config_attrs(evlist, &opts); + perf_evsel__set_sample_bit(evsel, CPU); + perf_evsel__set_sample_bit(evsel, TID); + perf_evsel__set_sample_bit(evsel, TIME); + perf_evlist__config(evlist, &opts); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); if (err < 0) { diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index a5f379863b8f..12b322fa3475 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -19,10 +19,8 @@ static struct test_format { { "krava23", "config2:28-29,38\n", }, }; -#define TEST_FORMATS_CNT (sizeof(test_formats) / sizeof(struct test_format)) - /* Simulated users input. */ -static struct parse_events__term test_terms[] = { +static struct parse_events_term test_terms[] = { { .config = (char *) "krava01", .val.num = 15, @@ -78,7 +76,6 @@ static struct parse_events__term test_terms[] = { .type_term = PARSE_EVENTS__TERM_TYPE_USER, }, }; -#define TERMS_CNT (sizeof(test_terms) / sizeof(struct parse_events__term)) /* * Prepare format directory data, exported by kernel @@ -93,7 +90,7 @@ static char *test_format_dir_get(void) if (!mkdtemp(dir)) return NULL; - for (i = 0; i < TEST_FORMATS_CNT; i++) { + for (i = 0; i < ARRAY_SIZE(test_formats); i++) { static char name[PATH_MAX]; struct test_format *format = &test_formats[i]; FILE *file; @@ -130,14 +127,12 @@ static struct list_head *test_terms_list(void) static LIST_HEAD(terms); unsigned int i; - for (i = 0; i < TERMS_CNT; i++) + for (i = 0; i < ARRAY_SIZE(test_terms); i++) list_add_tail(&test_terms[i].list, &terms); return &terms; } -#undef TERMS_CNT - int test__pmu(void) { char *format = test_format_dir_get(); diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c new file mode 100644 index 000000000000..7760277c6def --- /dev/null +++ 
b/tools/perf/tests/python-use.c @@ -0,0 +1,23 @@ +/* + * Just test if we can load the python binding. + */ + +#include <stdio.h> +#include <stdlib.h> +#include "tests.h" + +extern int verbose; + +int test__python_use(void) +{ + char *cmd; + int ret; + + if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s", + PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0) + return -1; + + ret = system(cmd) ? -1 : 0; + free(cmd); + return ret; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index fc121edab016..5de0be1ff4b6 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -1,6 +1,12 @@ #ifndef TESTS_H #define TESTS_H +enum { + TEST_OK = 0, + TEST_FAIL = -1, + TEST_SKIP = -2, +}; + /* Tests */ int test__vmlinux_matches_kallsyms(void); int test__open_syscall_event(void); @@ -15,8 +21,7 @@ int test__pmu(void); int test__attr(void); int test__dso_data(void); int test__parse_events(void); - -/* Util */ -int trace_event__id(const char *evname); +int test__hists_link(void); +int test__python_use(void); #endif /* TESTS_H */ diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c deleted file mode 100644 index 748f2e8f6961..000000000000 --- a/tools/perf/tests/util.c +++ /dev/null @@ -1,30 +0,0 @@ -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include "tests.h" -#include "debugfs.h" - -int trace_event__id(const char *evname) -{ - char *filename; - int err = -1, fd; - - if (asprintf(&filename, - "%s/syscalls/%s/id", - tracing_events_path, evname) < 0) - return -1; - - fd = open(filename, O_RDONLY); - if (fd >= 0) { - char id[16]; - if (read(fd, id, sizeof(id)) > 0) - err = atoi(id); - close(fd); - } - - free(filename); - return err; -} diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 0d1cdbee2f59..a1a8442829b4 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ 
b/tools/perf/tests/vmlinux-kallsyms.c @@ -101,7 +101,8 @@ int test__vmlinux_matches_kallsyms(void) */ if (machine__load_vmlinux_path(&vmlinux, type, vmlinux_matches_kallsyms_filter) <= 0) { - pr_debug("machine__load_vmlinux_path "); + pr_debug("Couldn't find a vmlinux that matches the kernel running on this machine, skipping test\n"); + err = TEST_SKIP; goto out; } diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 4aeb7d5df939..588bcb2d008b 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -471,7 +471,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *browser) return row; } -static struct ui_browser__colorset { +static struct ui_browser_colorset { const char *name, *fg, *bg; int colorset; } ui_browser__colorsets[] = { @@ -706,7 +706,7 @@ void ui_browser__init(void) perf_config(ui_browser__color_config, NULL); while (ui_browser__colorsets[i].name) { - struct ui_browser__colorset *c = &ui_browser__colorsets[i++]; + struct ui_browser_colorset *c = &ui_browser__colorsets[i++]; sltt_set_color(c->colorset, c->name, c->fg, c->bg); } diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 5dab3ca96980..7dca1555c610 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -182,6 +182,16 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int ab->selection = dl; } +static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sym) +{ + if (!dl || !dl->ins || !ins__is_jump(dl->ins) + || !disasm_line__has_offset(dl) + || dl->ops.target.offset >= symbol__size(sym)) + return false; + + return true; +} + static void annotate_browser__draw_current_jump(struct ui_browser *browser) { struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); @@ -195,8 +205,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) if (strstr(sym->name, "@plt")) return; - if (!cursor || 
!cursor->ins || !ins__is_jump(cursor->ins) || - !disasm_line__has_offset(cursor)) + if (!disasm_line__is_valid_jump(cursor, sym)) return; target = ab->offsets[cursor->ops.target.offset]; @@ -788,17 +797,9 @@ static void annotate_browser__mark_jump_targets(struct annotate_browser *browser struct disasm_line *dl = browser->offsets[offset], *dlt; struct browser_disasm_line *bdlt; - if (!dl || !dl->ins || !ins__is_jump(dl->ins) || - !disasm_line__has_offset(dl)) + if (!disasm_line__is_valid_jump(dl, sym)) continue; - if (dl->ops.target.offset >= size) { - ui__error("jump to after symbol!\n" - "size: %zx, jump target: %" PRIx64, - size, dl->ops.target.offset); - continue; - } - dlt = browser->offsets[dl->ops.target.offset]; /* * FIXME: Oops, no jump target? Buggy disassembler? Or do we @@ -921,11 +922,11 @@ out_free_offsets: #define ANNOTATE_CFG(n) \ { .name = #n, .value = &annotate_browser__opts.n, } - + /* * Keep the entries sorted, they are bsearch'ed */ -static struct annotate__config { +static struct annotate_config { const char *name; bool *value; } annotate__configs[] = { @@ -939,7 +940,7 @@ static struct annotate__config { static int annotate_config__cmp(const void *name, const void *cfgp) { - const struct annotate__config *cfg = cfgp; + const struct annotate_config *cfg = cfgp; return strcmp(name, cfg->name); } @@ -947,7 +948,7 @@ static int annotate_config__cmp(const void *name, const void *cfgp) static int annotate__config(const char *var, const char *value, void *data __maybe_unused) { - struct annotate__config *cfg; + struct annotate_config *cfg; const char *name; if (prefixcmp(var, "annotate.") != 0) @@ -955,7 +956,7 @@ static int annotate__config(const char *var, const char *value, name = var + 9; cfg = bsearch(name, annotate__configs, ARRAY_SIZE(annotate__configs), - sizeof(struct annotate__config), annotate_config__cmp); + sizeof(struct annotate_config), annotate_config__cmp); if (cfg == NULL) return -1; diff --git a/tools/perf/ui/browsers/hists.c 
b/tools/perf/ui/browsers/hists.c index ccc4bd161420..57b82c26cd05 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -587,6 +587,8 @@ HPP__COLOR_FN(overhead_guest_us, period_guest_us) void hist_browser__init_hpp(void) { + perf_hpp__column_enable(PERF_HPP__OVERHEAD); + perf_hpp__init(); perf_hpp__format[PERF_HPP__OVERHEAD].color = @@ -607,12 +609,13 @@ static int hist_browser__show_entry(struct hist_browser *browser, { char s[256]; double percent; - int i, printed = 0; + int printed = 0; int width = browser->b.width; char folded_sign = ' '; bool current_entry = ui_browser__is_current_entry(&browser->b, row); off_t row_offset = entry->row_offset; bool first = true; + struct perf_hpp_fmt *fmt; if (current_entry) { browser->he_selection = entry; @@ -629,12 +632,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, .buf = s, .size = sizeof(s), }; + int i = 0; ui_browser__gotorc(&browser->b, row, 0); - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; + perf_hpp__for_each_format(fmt) { if (!first) { slsmg_printf(" "); @@ -642,14 +644,14 @@ static int hist_browser__show_entry(struct hist_browser *browser, } first = false; - if (perf_hpp__format[i].color) { + if (fmt->color) { hpp.ptr = &percent; /* It will set percent for us. See HPP__COLOR_FN above. 
*/ - width -= perf_hpp__format[i].color(&hpp, entry); + width -= fmt->color(&hpp, entry); ui_browser__set_percent_color(&browser->b, percent, current_entry); - if (i == PERF_HPP__OVERHEAD && symbol_conf.use_callchain) { + if (!i && symbol_conf.use_callchain) { slsmg_printf("%c ", folded_sign); width -= 2; } @@ -659,9 +661,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, if (!current_entry || !browser->b.navkeypressed) ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL); } else { - width -= perf_hpp__format[i].entry(&hpp, entry); + width -= fmt->entry(&hpp, entry); slsmg_printf("%s", s); } + + i++; } /* The scroll bar isn't being used */ diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index 253b6219a39e..c95012cdb438 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -8,15 +8,13 @@ #include <signal.h> -#define MAX_COLUMNS 32 - -static void perf_gtk__signal(int sig) +void perf_gtk__signal(int sig) { perf_gtk__exit(false); psignal(sig, "perf"); } -static void perf_gtk__resize_window(GtkWidget *window) +void perf_gtk__resize_window(GtkWidget *window) { GdkRectangle rect; GdkScreen *screen; @@ -36,7 +34,7 @@ static void perf_gtk__resize_window(GtkWidget *window) gtk_window_resize(GTK_WINDOW(window), width, height); } -static const char *perf_gtk__get_percent_color(double percent) +const char *perf_gtk__get_percent_color(double percent) { if (percent >= MIN_RED) return "<span fgcolor='red'>"; @@ -45,155 +43,8 @@ static const char *perf_gtk__get_percent_color(double percent) return NULL; } -#define HPP__COLOR_FN(_name, _field) \ -static int perf_gtk__hpp_color_ ## _name(struct perf_hpp *hpp, \ - struct hist_entry *he) \ -{ \ - struct hists *hists = he->hists; \ - double percent = 100.0 * he->stat._field / hists->stats.total_period; \ - const char *markup; \ - int ret = 0; \ - \ - markup = perf_gtk__get_percent_color(percent); \ - if (markup) \ - ret += scnprintf(hpp->buf, hpp->size, "%s", markup); 
\ - ret += scnprintf(hpp->buf + ret, hpp->size - ret, "%6.2f%%", percent); \ - if (markup) \ - ret += scnprintf(hpp->buf + ret, hpp->size - ret, "</span>"); \ - \ - return ret; \ -} - -HPP__COLOR_FN(overhead, period) -HPP__COLOR_FN(overhead_sys, period_sys) -HPP__COLOR_FN(overhead_us, period_us) -HPP__COLOR_FN(overhead_guest_sys, period_guest_sys) -HPP__COLOR_FN(overhead_guest_us, period_guest_us) - -#undef HPP__COLOR_FN - -void perf_gtk__init_hpp(void) -{ - perf_hpp__init(); - - perf_hpp__format[PERF_HPP__OVERHEAD].color = - perf_gtk__hpp_color_overhead; - perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color = - perf_gtk__hpp_color_overhead_sys; - perf_hpp__format[PERF_HPP__OVERHEAD_US].color = - perf_gtk__hpp_color_overhead_us; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color = - perf_gtk__hpp_color_overhead_guest_sys; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color = - perf_gtk__hpp_color_overhead_guest_us; -} - -static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) -{ - GType col_types[MAX_COLUMNS]; - GtkCellRenderer *renderer; - struct sort_entry *se; - GtkListStore *store; - struct rb_node *nd; - GtkWidget *view; - int i, col_idx; - int nr_cols; - char s[512]; - - struct perf_hpp hpp = { - .buf = s, - .size = sizeof(s), - }; - - nr_cols = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - col_types[nr_cols++] = G_TYPE_STRING; - } - - list_for_each_entry(se, &hist_entry__sort_list, list) { - if (se->elide) - continue; - - col_types[nr_cols++] = G_TYPE_STRING; - } - - store = gtk_list_store_newv(nr_cols, col_types); - - view = gtk_tree_view_new(); - - renderer = gtk_cell_renderer_text_new(); - - col_idx = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - perf_hpp__format[i].header(&hpp); - - gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), - -1, s, - renderer, "markup", - col_idx++, NULL); - } - - list_for_each_entry(se, 
&hist_entry__sort_list, list) { - if (se->elide) - continue; - - gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), - -1, se->se_header, - renderer, "text", - col_idx++, NULL); - } - - gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store)); - - g_object_unref(GTK_TREE_MODEL(store)); - - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { - struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - GtkTreeIter iter; - - if (h->filtered) - continue; - - gtk_list_store_append(store, &iter); - - col_idx = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - if (perf_hpp__format[i].color) - perf_hpp__format[i].color(&hpp, h); - else - perf_hpp__format[i].entry(&hpp, h); - - gtk_list_store_set(store, &iter, col_idx++, s, -1); - } - - list_for_each_entry(se, &hist_entry__sort_list, list) { - if (se->elide) - continue; - - se->se_snprintf(h, s, ARRAY_SIZE(s), - hists__col_len(hists, se->se_width_idx)); - - gtk_list_store_set(store, &iter, col_idx++, s, -1); - } - } - - gtk_container_add(GTK_CONTAINER(window), view); -} - #ifdef HAVE_GTK_INFO_BAR -static GtkWidget *perf_gtk__setup_info_bar(void) +GtkWidget *perf_gtk__setup_info_bar(void) { GtkWidget *info_bar; GtkWidget *label; @@ -220,7 +71,7 @@ static GtkWidget *perf_gtk__setup_info_bar(void) } #endif -static GtkWidget *perf_gtk__setup_statusbar(void) +GtkWidget *perf_gtk__setup_statusbar(void) { GtkWidget *stbar; unsigned ctxid; @@ -234,79 +85,3 @@ static GtkWidget *perf_gtk__setup_statusbar(void) return stbar; } - -int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, - const char *help, - struct hist_browser_timer *hbt __maybe_unused) -{ - struct perf_evsel *pos; - GtkWidget *vbox; - GtkWidget *notebook; - GtkWidget *info_bar; - GtkWidget *statbar; - GtkWidget *window; - - signal(SIGSEGV, perf_gtk__signal); - signal(SIGFPE, perf_gtk__signal); - signal(SIGINT, perf_gtk__signal); - signal(SIGQUIT, perf_gtk__signal); - 
signal(SIGTERM, perf_gtk__signal); - - window = gtk_window_new(GTK_WINDOW_TOPLEVEL); - - gtk_window_set_title(GTK_WINDOW(window), "perf report"); - - g_signal_connect(window, "delete_event", gtk_main_quit, NULL); - - pgctx = perf_gtk__activate_context(window); - if (!pgctx) - return -1; - - vbox = gtk_vbox_new(FALSE, 0); - - notebook = gtk_notebook_new(); - - list_for_each_entry(pos, &evlist->entries, node) { - struct hists *hists = &pos->hists; - const char *evname = perf_evsel__name(pos); - GtkWidget *scrolled_window; - GtkWidget *tab_label; - - scrolled_window = gtk_scrolled_window_new(NULL, NULL); - - gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), - GTK_POLICY_AUTOMATIC, - GTK_POLICY_AUTOMATIC); - - perf_gtk__show_hists(scrolled_window, hists); - - tab_label = gtk_label_new(evname); - - gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label); - } - - gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); - - info_bar = perf_gtk__setup_info_bar(); - if (info_bar) - gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0); - - statbar = perf_gtk__setup_statusbar(); - gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); - - gtk_container_add(GTK_CONTAINER(window), vbox); - - gtk_widget_show_all(window); - - perf_gtk__resize_window(window); - - gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER); - - ui_helpline__push(help); - - gtk_main(); - - perf_gtk__deactivate_context(&pgctx); - - return 0; -} diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 856320e2cc05..5d3693754828 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -33,7 +33,14 @@ void perf_gtk__init_helpline(void); void perf_gtk__init_progress(void); void perf_gtk__init_hpp(void); -#ifndef HAVE_GTK_INFO_BAR +void perf_gtk__signal(int sig); +void perf_gtk__resize_window(GtkWidget *window); +const char *perf_gtk__get_percent_color(double percent); +GtkWidget *perf_gtk__setup_statusbar(void); + 
+#ifdef HAVE_GTK_INFO_BAR +GtkWidget *perf_gtk__setup_info_bar(void); +#else static inline GtkWidget *perf_gtk__setup_info_bar(void) { return NULL; diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c index 5db4432ff12a..3388cbd12186 100644 --- a/tools/perf/ui/gtk/helpline.c +++ b/tools/perf/ui/gtk/helpline.c @@ -24,17 +24,7 @@ static void gtk_helpline_push(const char *msg) pgctx->statbar_ctx_id, msg); } -static struct ui_helpline gtk_helpline_fns = { - .pop = gtk_helpline_pop, - .push = gtk_helpline_push, -}; - -void perf_gtk__init_helpline(void) -{ - helpline_fns = >k_helpline_fns; -} - -int perf_gtk__show_helpline(const char *fmt, va_list ap) +static int gtk_helpline_show(const char *fmt, va_list ap) { int ret; char *ptr; @@ -54,3 +44,14 @@ int perf_gtk__show_helpline(const char *fmt, va_list ap) return ret; } + +static struct ui_helpline gtk_helpline_fns = { + .pop = gtk_helpline_pop, + .push = gtk_helpline_push, + .show = gtk_helpline_show, +}; + +void perf_gtk__init_helpline(void) +{ + helpline_fns = >k_helpline_fns; +} diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c new file mode 100644 index 000000000000..c03da79d524f --- /dev/null +++ b/tools/perf/ui/gtk/hists.c @@ -0,0 +1,226 @@ +#include "../evlist.h" +#include "../cache.h" +#include "../evsel.h" +#include "../sort.h" +#include "../hist.h" +#include "../helpline.h" +#include "gtk.h" + +#define MAX_COLUMNS 32 + +#define HPP__COLOR_FN(_name, _field) \ +static int perf_gtk__hpp_color_ ## _name(struct perf_hpp *hpp, \ + struct hist_entry *he) \ +{ \ + struct hists *hists = he->hists; \ + double percent = 100.0 * he->stat._field / hists->stats.total_period; \ + const char *markup; \ + int ret = 0; \ + \ + markup = perf_gtk__get_percent_color(percent); \ + if (markup) \ + ret += scnprintf(hpp->buf, hpp->size, "%s", markup); \ + ret += scnprintf(hpp->buf + ret, hpp->size - ret, "%6.2f%%", percent); \ + if (markup) \ + ret += scnprintf(hpp->buf + ret, hpp->size - ret, 
"</span>"); \ + \ + return ret; \ +} + +HPP__COLOR_FN(overhead, period) +HPP__COLOR_FN(overhead_sys, period_sys) +HPP__COLOR_FN(overhead_us, period_us) +HPP__COLOR_FN(overhead_guest_sys, period_guest_sys) +HPP__COLOR_FN(overhead_guest_us, period_guest_us) + +#undef HPP__COLOR_FN + + +void perf_gtk__init_hpp(void) +{ + perf_hpp__column_enable(PERF_HPP__OVERHEAD); + + perf_hpp__init(); + + perf_hpp__format[PERF_HPP__OVERHEAD].color = + perf_gtk__hpp_color_overhead; + perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color = + perf_gtk__hpp_color_overhead_sys; + perf_hpp__format[PERF_HPP__OVERHEAD_US].color = + perf_gtk__hpp_color_overhead_us; + perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color = + perf_gtk__hpp_color_overhead_guest_sys; + perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color = + perf_gtk__hpp_color_overhead_guest_us; +} + +static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) +{ + struct perf_hpp_fmt *fmt; + GType col_types[MAX_COLUMNS]; + GtkCellRenderer *renderer; + struct sort_entry *se; + GtkListStore *store; + struct rb_node *nd; + GtkWidget *view; + int col_idx; + int nr_cols; + char s[512]; + + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + }; + + nr_cols = 0; + + perf_hpp__for_each_format(fmt) + col_types[nr_cols++] = G_TYPE_STRING; + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + col_types[nr_cols++] = G_TYPE_STRING; + } + + store = gtk_list_store_newv(nr_cols, col_types); + + view = gtk_tree_view_new(); + + renderer = gtk_cell_renderer_text_new(); + + col_idx = 0; + + perf_hpp__for_each_format(fmt) { + fmt->header(&hpp); + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, s, + renderer, "markup", + col_idx++, NULL); + } + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, se->se_header, + renderer, "text", + col_idx++, NULL); + } + + 
gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store)); + + g_object_unref(GTK_TREE_MODEL(store)); + + for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + GtkTreeIter iter; + + if (h->filtered) + continue; + + gtk_list_store_append(store, &iter); + + col_idx = 0; + + perf_hpp__for_each_format(fmt) { + if (fmt->color) + fmt->color(&hpp, h); + else + fmt->entry(&hpp, h); + + gtk_list_store_set(store, &iter, col_idx++, s, -1); + } + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + se->se_snprintf(h, s, ARRAY_SIZE(s), + hists__col_len(hists, se->se_width_idx)); + + gtk_list_store_set(store, &iter, col_idx++, s, -1); + } + } + + gtk_container_add(GTK_CONTAINER(window), view); +} + +int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, + const char *help, + struct hist_browser_timer *hbt __maybe_unused) +{ + struct perf_evsel *pos; + GtkWidget *vbox; + GtkWidget *notebook; + GtkWidget *info_bar; + GtkWidget *statbar; + GtkWidget *window; + + signal(SIGSEGV, perf_gtk__signal); + signal(SIGFPE, perf_gtk__signal); + signal(SIGINT, perf_gtk__signal); + signal(SIGQUIT, perf_gtk__signal); + signal(SIGTERM, perf_gtk__signal); + + window = gtk_window_new(GTK_WINDOW_TOPLEVEL); + + gtk_window_set_title(GTK_WINDOW(window), "perf report"); + + g_signal_connect(window, "delete_event", gtk_main_quit, NULL); + + pgctx = perf_gtk__activate_context(window); + if (!pgctx) + return -1; + + vbox = gtk_vbox_new(FALSE, 0); + + notebook = gtk_notebook_new(); + + gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); + + info_bar = perf_gtk__setup_info_bar(); + if (info_bar) + gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0); + + statbar = perf_gtk__setup_statusbar(); + gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); + + gtk_container_add(GTK_CONTAINER(window), vbox); + + list_for_each_entry(pos, &evlist->entries, node) { + 
struct hists *hists = &pos->hists; + const char *evname = perf_evsel__name(pos); + GtkWidget *scrolled_window; + GtkWidget *tab_label; + + scrolled_window = gtk_scrolled_window_new(NULL, NULL); + + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), + GTK_POLICY_AUTOMATIC, + GTK_POLICY_AUTOMATIC); + + perf_gtk__show_hists(scrolled_window, hists); + + tab_label = gtk_label_new(evname); + + gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label); + } + + gtk_widget_show_all(window); + + perf_gtk__resize_window(window); + + gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER); + + ui_helpline__push(help); + + gtk_main(); + + perf_gtk__deactivate_context(&pgctx); + + return 0; +} diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c index a49bcf3c190b..700fb3cfa1c7 100644 --- a/tools/perf/ui/helpline.c +++ b/tools/perf/ui/helpline.c @@ -16,9 +16,16 @@ static void nop_helpline__push(const char *msg __maybe_unused) { } +static int nop_helpline__show(const char *fmt __maybe_unused, + va_list ap __maybe_unused) +{ + return 0; +} + static struct ui_helpline default_helpline_fns = { .pop = nop_helpline__pop, .push = nop_helpline__push, + .show = nop_helpline__show, }; struct ui_helpline *helpline_fns = &default_helpline_fns; @@ -59,3 +66,8 @@ void ui_helpline__puts(const char *msg) ui_helpline__pop(); ui_helpline__push(msg); } + +int ui_helpline__vshow(const char *fmt, va_list ap) +{ + return helpline_fns->show(fmt, ap); +} diff --git a/tools/perf/ui/helpline.h b/tools/perf/ui/helpline.h index baa28a4d16b9..46181f4fc07e 100644 --- a/tools/perf/ui/helpline.h +++ b/tools/perf/ui/helpline.h @@ -9,6 +9,7 @@ struct ui_helpline { void (*pop)(void); void (*push)(const char *msg); + int (*show)(const char *fmt, va_list ap); }; extern struct ui_helpline *helpline_fns; @@ -20,28 +21,9 @@ void ui_helpline__push(const char *msg); void ui_helpline__vpush(const char *fmt, va_list ap); void ui_helpline__fpush(const char *fmt, 
...); void ui_helpline__puts(const char *msg); +int ui_helpline__vshow(const char *fmt, va_list ap); extern char ui_helpline__current[512]; - -#ifdef NEWT_SUPPORT extern char ui_helpline__last_msg[]; -int ui_helpline__show_help(const char *format, va_list ap); -#else -static inline int ui_helpline__show_help(const char *format __maybe_unused, - va_list ap __maybe_unused) -{ - return 0; -} -#endif /* NEWT_SUPPORT */ - -#ifdef GTK2_SUPPORT -int perf_gtk__show_helpline(const char *format, va_list ap); -#else -static inline int perf_gtk__show_helpline(const char *format __maybe_unused, - va_list ap __maybe_unused) -{ - return 0; -} -#endif /* GTK2_SUPPORT */ #endif /* _PERF_UI_HELPLINE_H_ */ diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index aa84130024d5..1889c12ca81f 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -268,14 +268,18 @@ static int hpp__width_delta(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_delta(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%7.7s"; char buf[32] = " "; - double diff; + double diff = 0.0; - if (he->diff.computed) - diff = he->diff.period_ratio_delta; - else - diff = perf_diff__compute_delta(he); + if (pair) { + if (he->diff.computed) + diff = he->diff.period_ratio_delta; + else + diff = perf_diff__compute_delta(he, pair); + } else + diff = perf_diff__period_percent(he, he->stat.period); if (fabs(diff) >= 0.01) scnprintf(buf, sizeof(buf), "%+4.2F%%", diff); @@ -297,14 +301,17 @@ static int hpp__width_ratio(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_ratio(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? 
"%s" : "%14s"; char buf[32] = " "; - double ratio; + double ratio = 0.0; - if (he->diff.computed) - ratio = he->diff.period_ratio; - else - ratio = perf_diff__compute_ratio(he); + if (pair) { + if (he->diff.computed) + ratio = he->diff.period_ratio; + else + ratio = perf_diff__compute_ratio(he, pair); + } if (ratio > 0.0) scnprintf(buf, sizeof(buf), "%+14.6F", ratio); @@ -326,14 +333,17 @@ static int hpp__width_wdiff(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_wdiff(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%14s"; char buf[32] = " "; - s64 wdiff; + s64 wdiff = 0; - if (he->diff.computed) - wdiff = he->diff.wdiff; - else - wdiff = perf_diff__compute_wdiff(he); + if (pair) { + if (he->diff.computed) + wdiff = he->diff.wdiff; + else + wdiff = perf_diff__compute_wdiff(he, pair); + } if (wdiff != 0) scnprintf(buf, sizeof(buf), "%14ld", wdiff); @@ -341,30 +351,6 @@ static int hpp__entry_wdiff(struct perf_hpp *hpp, struct hist_entry *he) return scnprintf(hpp->buf, hpp->size, fmt, buf); } -static int hpp__header_displ(struct perf_hpp *hpp) -{ - return scnprintf(hpp->buf, hpp->size, "Displ."); -} - -static int hpp__width_displ(struct perf_hpp *hpp __maybe_unused) -{ - return 6; -} - -static int hpp__entry_displ(struct perf_hpp *hpp, - struct hist_entry *he) -{ - struct hist_entry *pair = hist_entry__next_pair(he); - long displacement = pair ? pair->position - he->position : 0; - const char *fmt = symbol_conf.field_sep ? "%s" : "%6.6s"; - char buf[32] = " "; - - if (displacement) - scnprintf(buf, sizeof(buf), "%+4ld", displacement); - - return scnprintf(hpp->buf, hpp->size, fmt, buf); -} - static int hpp__header_formula(struct perf_hpp *hpp) { const char *fmt = symbol_conf.field_sep ? 
"%s" : "%70s"; @@ -379,67 +365,80 @@ static int hpp__width_formula(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_formula(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%-70s"; char buf[96] = " "; - perf_diff__formula(buf, sizeof(buf), he); + if (pair) + perf_diff__formula(he, pair, buf, sizeof(buf)); + return scnprintf(hpp->buf, hpp->size, fmt, buf); } -#define HPP__COLOR_PRINT_FNS(_name) \ - .header = hpp__header_ ## _name, \ - .width = hpp__width_ ## _name, \ - .color = hpp__color_ ## _name, \ - .entry = hpp__entry_ ## _name +#define HPP__COLOR_PRINT_FNS(_name) \ + { \ + .header = hpp__header_ ## _name, \ + .width = hpp__width_ ## _name, \ + .color = hpp__color_ ## _name, \ + .entry = hpp__entry_ ## _name \ + } -#define HPP__PRINT_FNS(_name) \ - .header = hpp__header_ ## _name, \ - .width = hpp__width_ ## _name, \ - .entry = hpp__entry_ ## _name +#define HPP__PRINT_FNS(_name) \ + { \ + .header = hpp__header_ ## _name, \ + .width = hpp__width_ ## _name, \ + .entry = hpp__entry_ ## _name \ + } struct perf_hpp_fmt perf_hpp__format[] = { - { .cond = false, HPP__COLOR_PRINT_FNS(baseline) }, - { .cond = true, HPP__COLOR_PRINT_FNS(overhead) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_sys) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_us) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_sys) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_us) }, - { .cond = false, HPP__PRINT_FNS(samples) }, - { .cond = false, HPP__PRINT_FNS(period) }, - { .cond = false, HPP__PRINT_FNS(period_baseline) }, - { .cond = false, HPP__PRINT_FNS(delta) }, - { .cond = false, HPP__PRINT_FNS(ratio) }, - { .cond = false, HPP__PRINT_FNS(wdiff) }, - { .cond = false, HPP__PRINT_FNS(displ) }, - { .cond = false, HPP__PRINT_FNS(formula) } + HPP__COLOR_PRINT_FNS(baseline), + HPP__COLOR_PRINT_FNS(overhead), + HPP__COLOR_PRINT_FNS(overhead_sys), + 
HPP__COLOR_PRINT_FNS(overhead_us), + HPP__COLOR_PRINT_FNS(overhead_guest_sys), + HPP__COLOR_PRINT_FNS(overhead_guest_us), + HPP__PRINT_FNS(samples), + HPP__PRINT_FNS(period), + HPP__PRINT_FNS(period_baseline), + HPP__PRINT_FNS(delta), + HPP__PRINT_FNS(ratio), + HPP__PRINT_FNS(wdiff), + HPP__PRINT_FNS(formula) }; +LIST_HEAD(perf_hpp__list); + #undef HPP__COLOR_PRINT_FNS #undef HPP__PRINT_FNS void perf_hpp__init(void) { if (symbol_conf.show_cpu_utilization) { - perf_hpp__format[PERF_HPP__OVERHEAD_SYS].cond = true; - perf_hpp__format[PERF_HPP__OVERHEAD_US].cond = true; + perf_hpp__column_enable(PERF_HPP__OVERHEAD_SYS); + perf_hpp__column_enable(PERF_HPP__OVERHEAD_US); if (perf_guest) { - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].cond = true; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].cond = true; + perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_SYS); + perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_US); } } if (symbol_conf.show_nr_samples) - perf_hpp__format[PERF_HPP__SAMPLES].cond = true; + perf_hpp__column_enable(PERF_HPP__SAMPLES); if (symbol_conf.show_total_period) - perf_hpp__format[PERF_HPP__PERIOD].cond = true; + perf_hpp__column_enable(PERF_HPP__PERIOD); +} + +void perf_hpp__column_register(struct perf_hpp_fmt *format) +{ + list_add_tail(&format->list, &perf_hpp__list); } -void perf_hpp__column_enable(unsigned col, bool enable) +void perf_hpp__column_enable(unsigned col) { BUG_ON(col >= PERF_HPP__MAX_INDEX); - perf_hpp__format[col].cond = enable; + perf_hpp__column_register(&perf_hpp__format[col]); } static inline void advance_hpp(struct perf_hpp *hpp, int inc) @@ -452,27 +451,29 @@ int hist_entry__period_snprintf(struct perf_hpp *hpp, struct hist_entry *he, bool color) { const char *sep = symbol_conf.field_sep; + struct perf_hpp_fmt *fmt; char *start = hpp->buf; - int i, ret; + int ret; bool first = true; if (symbol_conf.exclude_other && !he->parent) return 0; - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - 
continue; - + perf_hpp__for_each_format(fmt) { + /* + * If there's no field_sep, we still need + * to display initial ' '. + */ if (!sep || !first) { ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: " "); advance_hpp(hpp, ret); + } else first = false; - } - if (color && perf_hpp__format[i].color) - ret = perf_hpp__format[i].color(hpp, he); + if (color && fmt->color) + ret = fmt->color(hpp, he); else - ret = perf_hpp__format[i].entry(hpp, he); + ret = fmt->entry(hpp, he); advance_hpp(hpp, ret); } @@ -504,16 +505,15 @@ int hist_entry__sort_snprintf(struct hist_entry *he, char *s, size_t size, */ unsigned int hists__sort_list_width(struct hists *hists) { + struct perf_hpp_fmt *fmt; struct sort_entry *se; - int i, ret = 0; + int i = 0, ret = 0; - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; + perf_hpp__for_each_format(fmt) { if (i) ret += 2; - ret += perf_hpp__format[i].width(NULL); + ret += fmt->width(NULL); } list_for_each_entry(se, &hist_entry__sort_list, list) diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c index ebb4cc107876..166f13df3134 100644 --- a/tools/perf/ui/setup.c +++ b/tools/perf/ui/setup.c @@ -30,6 +30,7 @@ void setup_browser(bool fallback_to_pager) if (fallback_to_pager) setup_pager(); + perf_hpp__column_enable(PERF_HPP__OVERHEAD); perf_hpp__init(); break; } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index f0ee204f99bb..f9798298e3e0 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -335,13 +335,14 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, FILE *fp) { + struct perf_hpp_fmt *fmt; struct sort_entry *se; struct rb_node *nd; size_t ret = 0; unsigned int width; const char *sep = symbol_conf.field_sep; const char *col_width = symbol_conf.col_width_list_str; - int idx, nr_rows = 0; + int nr_rows = 0; char bf[96]; struct perf_hpp dummy_hpp 
= { .buf = bf, @@ -355,16 +356,14 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, goto print_entries; fprintf(fp, "# "); - for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) { - if (!perf_hpp__format[idx].cond) - continue; + perf_hpp__for_each_format(fmt) { if (!first) fprintf(fp, "%s", sep ?: " "); else first = false; - perf_hpp__format[idx].header(&dummy_hpp); + fmt->header(&dummy_hpp); fprintf(fp, "%s", bf); } @@ -400,18 +399,16 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, first = true; fprintf(fp, "# "); - for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) { - unsigned int i; - if (!perf_hpp__format[idx].cond) - continue; + perf_hpp__for_each_format(fmt) { + unsigned int i; if (!first) fprintf(fp, "%s", sep ?: " "); else first = false; - width = perf_hpp__format[idx].width(&dummy_hpp); + width = fmt->width(&dummy_hpp); for (i = 0; i < width; i++) fprintf(fp, "."); } @@ -462,7 +459,7 @@ out: return ret; } -size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) { int i; size_t ret = 0; @@ -470,7 +467,7 @@ size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { const char *name; - if (hists->stats.nr_events[i] == 0) + if (stats->nr_events[i] == 0) continue; name = perf_event__name(i); @@ -478,7 +475,7 @@ size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) continue; ret += fprintf(fp, "%16s events: %10d\n", name, - hists->stats.nr_events[i]); + stats->nr_events[i]); } return ret; diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c index 2884d2f41e33..1c8b9afd5d6e 100644 --- a/tools/perf/ui/tui/helpline.c +++ b/tools/perf/ui/tui/helpline.c @@ -8,6 +8,8 @@ #include "../ui.h" #include "../libslang.h" +char ui_helpline__last_msg[1024]; + static void tui_helpline__pop(void) { } @@ -23,20 +25,7 @@ static void tui_helpline__push(const char *msg) 
strncpy(ui_helpline__current, msg, sz)[sz - 1] = '\0'; } -struct ui_helpline tui_helpline_fns = { - .pop = tui_helpline__pop, - .push = tui_helpline__push, -}; - -void ui_helpline__init(void) -{ - helpline_fns = &tui_helpline_fns; - ui_helpline__puts(" "); -} - -char ui_helpline__last_msg[1024]; - -int ui_helpline__show_help(const char *format, va_list ap) +static int tui_helpline__show(const char *format, va_list ap) { int ret; static int backlog; @@ -55,3 +44,15 @@ int ui_helpline__show_help(const char *format, va_list ap) return ret; } + +struct ui_helpline tui_helpline_fns = { + .pop = tui_helpline__pop, + .push = tui_helpline__push, + .show = tui_helpline__show, +}; + +void ui_helpline__init(void) +{ + helpline_fns = &tui_helpline_fns; + ui_helpline__puts(" "); +} diff --git a/tools/perf/ui/util.c b/tools/perf/ui/util.c index 4f989774c8c6..e3e0a963d03a 100644 --- a/tools/perf/ui/util.c +++ b/tools/perf/ui/util.c @@ -52,7 +52,6 @@ int ui__warning(const char *format, ...) return ret; } - /** * perf_error__register - Register error logging functions * @eops: The pointer to error logging function struct diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index 6aa34e5afdcf..055fef34b6f6 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -26,13 +26,13 @@ VN=$(expr "$VN" : v*'\(.*\)') if test -r $GVF then - VC=$(sed -e 's/^PERF_VERSION = //' <$GVF) + VC=$(sed -e 's/^#define PERF_VERSION "\(.*\)"/\1/' <$GVF) else VC=unset fi test "$VN" = "$VC" || { echo >&2 "PERF_VERSION = $VN" - echo "PERF_VERSION = $VN" >$GVF + echo "#define PERF_VERSION \"$VN\"" >$GVF } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index eb340571e7d6..3ee9f67d5af0 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -143,4 +143,9 @@ static inline void callchain_cursor_advance(struct callchain_cursor *cursor) cursor->curr = cursor->curr->next; cursor->pos++; } + +struct option; + +int 
record_parse_callchain_opt(const struct option *opt, const char *arg, int unset); +extern const char record_callchain_help[]; #endif /* __PERF_CALLCHAIN_H */ diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 03f830b48148..399e74c34c1a 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -23,10 +23,8 @@ int eprintf(int level, const char *fmt, ...) if (verbose >= level) { va_start(args, fmt); - if (use_browser == 1) - ret = ui_helpline__show_help(fmt, args); - else if (use_browser == 2) - ret = perf_gtk__show_helpline(fmt, args); + if (use_browser >= 1) + ui_helpline__vshow(fmt, args); else ret = vfprintf(stderr, fmt, args); va_end(args); @@ -49,28 +47,6 @@ int dump_printf(const char *fmt, ...) return ret; } -#if !defined(NEWT_SUPPORT) && !defined(GTK2_SUPPORT) -int ui__warning(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - return 0; -} -#endif - -int ui__error_paranoid(void) -{ - return ui__error("Permission error - are you root?\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv\n"); -} - void trace_event(union perf_event *event) { unsigned char *raw_event = (void *)event; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 83e8d234af6b..efbd98805ad0 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -5,6 +5,8 @@ #include <stdbool.h> #include "event.h" #include "../ui/helpline.h" +#include "../ui/progress.h" +#include "../ui/util.h" extern int verbose; extern bool quiet, dump_trace; @@ -12,39 +14,7 @@ extern bool quiet, dump_trace; int dump_printf(const char *fmt, ...) 
__attribute__((format(printf, 1, 2))); void trace_event(union perf_event *event); -struct ui_progress; -struct perf_error_ops; - -#if defined(NEWT_SUPPORT) || defined(GTK2_SUPPORT) - -#include "../ui/progress.h" int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2))); -#include "../ui/util.h" - -#else - -static inline void ui_progress__update(u64 curr __maybe_unused, - u64 total __maybe_unused, - const char *title __maybe_unused) {} -static inline void ui_progress__finish(void) {} - -#define ui__error(format, arg...) ui__warning(format, ##arg) - -static inline int -perf_error__register(struct perf_error_ops *eops __maybe_unused) -{ - return 0; -} - -static inline int -perf_error__unregister(struct perf_error_ops *eops __maybe_unused) -{ - return 0; -} - -#endif /* NEWT_SUPPORT || GTK2_SUPPORT */ - int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2))); -int ui__error_paranoid(void); #endif /* __PERF_DEBUG_H */ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index d6d9a465acdb..6f7d5a9d6b05 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -539,13 +539,13 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name) } size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, - bool with_hits) + bool (skip)(struct dso *dso, int parm), int parm) { struct dso *pos; size_t ret = 0; list_for_each_entry(pos, head, node) { - if (with_hits && !pos->hit) + if (skip && skip(pos, parm)) continue; ret += dso__fprintf_buildid(pos, fp); ret += fprintf(fp, " %s\n", pos->long_name); @@ -583,7 +583,7 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp) if (dso->short_name != dso->long_name) ret += fprintf(fp, "%s, ", dso->long_name); ret += fprintf(fp, "%s, %sloaded, ", map_type__name[type], - dso->loaded ? "" : "NOT "); + dso__loaded(dso, type) ? 
"" : "NOT "); ret += dso__fprintf_buildid(dso, fp); ret += fprintf(fp, ")\n"); for (nd = rb_first(&dso->symbols[type]); nd; nd = rb_next(nd)) { diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index e03276940b99..450199ab51b5 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -138,7 +138,7 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name); bool __dsos__read_build_ids(struct list_head *head, bool with_hits); size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, - bool with_hits); + bool (skip)(struct dso *dso, int parm), int parm); size_t __dsos__fprintf(struct list_head *head, FILE *fp); size_t dso__fprintf_buildid(struct dso *dso, FILE *fp); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 705293489e3c..dc8aee97a488 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -49,10 +49,16 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, return evlist; } -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts) +void perf_evlist__config(struct perf_evlist *evlist, + struct perf_record_opts *opts) { struct perf_evsel *evsel; + /* + * Set the evsel leader links before we configure attributes, + * since some might depend on this info. 
+ */ + if (opts->group) + perf_evlist__set_leader(evlist); if (evlist->cpus->map[0] < 0) opts->no_inherit = true; @@ -61,7 +67,7 @@ void perf_evlist__config_attrs(struct perf_evlist *evlist, perf_evsel__config(evsel, opts); if (evlist->nr_entries > 1) - evsel->attr.sample_type |= PERF_SAMPLE_ID; + perf_evsel__set_sample_id(evsel); } } @@ -111,7 +117,6 @@ void __perf_evlist__set_leader(struct list_head *list) struct perf_evsel *evsel, *leader; leader = list_entry(list->next, struct perf_evsel, node); - leader->leader = NULL; list_for_each_entry(evsel, list, node) { if (evsel != leader) @@ -222,7 +227,7 @@ void perf_evlist__disable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { - if (perf_evsel__is_group_member(pos)) + if (!perf_evsel__is_group_leader(pos)) continue; for (thread = 0; thread < evlist->threads->nr; thread++) ioctl(FD(pos, cpu, thread), @@ -238,7 +243,7 @@ void perf_evlist__enable(struct perf_evlist *evlist) for (cpu = 0; cpu < cpu_map__nr(evlist->cpus); cpu++) { list_for_each_entry(pos, &evlist->entries, node) { - if (perf_evsel__is_group_member(pos)) + if (!perf_evsel__is_group_leader(pos)) continue; for (thread = 0; thread < evlist->threads->nr; thread++) ioctl(FD(pos, cpu, thread), diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 56003f779e60..457e2350d21d 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -76,8 +76,8 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx); int perf_evlist__open(struct perf_evlist *evlist); -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts); +void perf_evlist__config(struct perf_evlist *evlist, + struct perf_record_opts *opts); int perf_evlist__prepare_workload(struct perf_evlist *evlist, struct perf_record_opts *opts, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1b16dd1edc8e..e45332d08a58 100644 --- 
a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -22,6 +22,11 @@ #include <linux/perf_event.h> #include "perf_regs.h" +static struct { + bool sample_id_all; + bool exclude_guest; +} perf_missing_features; + #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) static int __perf_evsel__sample_size(u64 sample_type) @@ -50,11 +55,36 @@ void hists__init(struct hists *hists) pthread_mutex_init(&hists->lock, NULL); } +void __perf_evsel__set_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit) +{ + if (!(evsel->attr.sample_type & bit)) { + evsel->attr.sample_type |= bit; + evsel->sample_size += sizeof(u64); + } +} + +void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit) +{ + if (evsel->attr.sample_type & bit) { + evsel->attr.sample_type &= ~bit; + evsel->sample_size -= sizeof(u64); + } +} + +void perf_evsel__set_sample_id(struct perf_evsel *evsel) +{ + perf_evsel__set_sample_bit(evsel, ID); + evsel->attr.read_format |= PERF_FORMAT_ID; +} + void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) { evsel->idx = idx; evsel->attr = *attr; + evsel->leader = evsel; INIT_LIST_HEAD(&evsel->node); hists__init(&evsel->hists); evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); @@ -438,13 +468,11 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_event_attr *attr = &evsel->attr; int track = !evsel->idx; /* only the first counter needs these */ - attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1; + attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1; attr->inherit = !opts->no_inherit; - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING | - PERF_FORMAT_ID; - attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; + perf_evsel__set_sample_bit(evsel, IP); + perf_evsel__set_sample_bit(evsel, TID); /* * We default some events to a 1 default interval. 
But keep @@ -453,7 +481,7 @@ void perf_evsel__config(struct perf_evsel *evsel, if (!attr->sample_period || (opts->user_freq != UINT_MAX && opts->user_interval != ULLONG_MAX)) { if (opts->freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; + perf_evsel__set_sample_bit(evsel, PERIOD); attr->freq = 1; attr->sample_freq = opts->freq; } else { @@ -468,16 +496,16 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->inherit_stat = 1; if (opts->sample_address) { - attr->sample_type |= PERF_SAMPLE_ADDR; + perf_evsel__set_sample_bit(evsel, ADDR); attr->mmap_data = track; } if (opts->call_graph) { - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + perf_evsel__set_sample_bit(evsel, CALLCHAIN); if (opts->call_graph == CALLCHAIN_DWARF) { - attr->sample_type |= PERF_SAMPLE_REGS_USER | - PERF_SAMPLE_STACK_USER; + perf_evsel__set_sample_bit(evsel, REGS_USER); + perf_evsel__set_sample_bit(evsel, STACK_USER); attr->sample_regs_user = PERF_REGS_MASK; attr->sample_stack_user = opts->stack_dump_size; attr->exclude_callchain_user = 1; @@ -485,20 +513,20 @@ void perf_evsel__config(struct perf_evsel *evsel, } if (perf_target__has_cpu(&opts->target)) - attr->sample_type |= PERF_SAMPLE_CPU; + perf_evsel__set_sample_bit(evsel, CPU); if (opts->period) - attr->sample_type |= PERF_SAMPLE_PERIOD; + perf_evsel__set_sample_bit(evsel, PERIOD); - if (!opts->sample_id_all_missing && + if (!perf_missing_features.sample_id_all && (opts->sample_time || !opts->no_inherit || perf_target__has_cpu(&opts->target))) - attr->sample_type |= PERF_SAMPLE_TIME; + perf_evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples) { - attr->sample_type |= PERF_SAMPLE_TIME; - attr->sample_type |= PERF_SAMPLE_RAW; - attr->sample_type |= PERF_SAMPLE_CPU; + perf_evsel__set_sample_bit(evsel, TIME); + perf_evsel__set_sample_bit(evsel, RAW); + perf_evsel__set_sample_bit(evsel, CPU); } if (opts->no_delay) { @@ -506,7 +534,7 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->wakeup_events = 1; } if (opts->branch_stack) 
{ - attr->sample_type |= PERF_SAMPLE_BRANCH_STACK; + perf_evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = opts->branch_stack; } @@ -519,14 +547,14 @@ void perf_evsel__config(struct perf_evsel *evsel, * Disabling only independent events or group leaders, * keeping group members enabled. */ - if (!perf_evsel__is_group_member(evsel)) + if (perf_evsel__is_group_leader(evsel)) attr->disabled = 1; /* * Setting enable_on_exec for independent events and * group leaders for traced executed by perf. */ - if (perf_target__none(&opts->target) && !perf_evsel__is_group_member(evsel)) + if (perf_target__none(&opts->target) && perf_evsel__is_group_leader(evsel)) attr->enable_on_exec = 1; } @@ -707,7 +735,7 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread) struct perf_evsel *leader = evsel->leader; int fd; - if (!perf_evsel__is_group_member(evsel)) + if (perf_evsel__is_group_leader(evsel)) return -1; /* @@ -738,6 +766,13 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, pid = evsel->cgrp->fd; } +fallback_missing_features: + if (perf_missing_features.exclude_guest) + evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; +retry_sample_id: + if (perf_missing_features.sample_id_all) + evsel->attr.sample_id_all = 0; + for (cpu = 0; cpu < cpus->nr; cpu++) { for (thread = 0; thread < threads->nr; thread++) { @@ -754,13 +789,26 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, group_fd, flags); if (FD(evsel, cpu, thread) < 0) { err = -errno; - goto out_close; + goto try_fallback; } } } return 0; +try_fallback: + if (err != -EINVAL || cpu > 0 || thread > 0) + goto out_close; + + if (!perf_missing_features.exclude_guest && + (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { + perf_missing_features.exclude_guest = true; + goto fallback_missing_features; + } else if (!perf_missing_features.sample_id_all) { + perf_missing_features.sample_id_all = true; + goto 
retry_sample_id; + } + out_close: do { while (--thread >= 0) { @@ -1205,3 +1253,205 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, return 0; } + +static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) +{ + va_list args; + int ret = 0; + + if (!*first) { + ret += fprintf(fp, ","); + } else { + ret += fprintf(fp, ":"); + *first = false; + } + + va_start(args, fmt); + ret += vfprintf(fp, fmt, args); + va_end(args); + return ret; +} + +static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value) +{ + if (value == 0) + return 0; + + return comma_fprintf(fp, first, " %s: %" PRIu64, field, value); +} + +#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field) + +struct bit_names { + int bit; + const char *name; +}; + +static int bits__fprintf(FILE *fp, const char *field, u64 value, + struct bit_names *bits, bool *first) +{ + int i = 0, printed = comma_fprintf(fp, first, " %s: ", field); + bool first_bit = true; + + do { + if (value & bits[i].bit) { + printed += fprintf(fp, "%s%s", first_bit ? 
"" : "|", bits[i].name); + first_bit = false; + } + } while (bits[++i].name != NULL); + + return printed; +} + +static int sample_type__fprintf(FILE *fp, bool *first, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_##n, #n } + struct bit_names bits[] = { + bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), + bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), + bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), + bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), + { .name = NULL, } + }; +#undef bit_name + return bits__fprintf(fp, "sample_type", value, bits, first); +} + +static int read_format__fprintf(FILE *fp, bool *first, u64 value) +{ +#define bit_name(n) { PERF_FORMAT_##n, #n } + struct bit_names bits[] = { + bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), + bit_name(ID), bit_name(GROUP), + { .name = NULL, } + }; +#undef bit_name + return bits__fprintf(fp, "read_format", value, bits, first); +} + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp) +{ + bool first = true; + int printed = fprintf(fp, "%s", perf_evsel__name(evsel)); + + if (details->verbose || details->freq) { + printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64, + (u64)evsel->attr.sample_freq); + } + + if (details->verbose) { + if_print(type); + if_print(config); + if_print(config1); + if_print(config2); + if_print(size); + printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type); + if (evsel->attr.read_format) + printed += read_format__fprintf(fp, &first, evsel->attr.read_format); + if_print(disabled); + if_print(inherit); + if_print(pinned); + if_print(exclusive); + if_print(exclude_user); + if_print(exclude_kernel); + if_print(exclude_hv); + if_print(exclude_idle); + if_print(mmap); + if_print(comm); + if_print(freq); + if_print(inherit_stat); + if_print(enable_on_exec); + if_print(task); + if_print(watermark); + if_print(precise_ip); + if_print(mmap_data); + 
if_print(sample_id_all); + if_print(exclude_host); + if_print(exclude_guest); + if_print(__reserved_1); + if_print(wakeup_events); + if_print(bp_type); + if_print(branch_sample_type); + } + + fputc('\n', fp); + return ++printed; +} + +bool perf_evsel__fallback(struct perf_evsel *evsel, int err, + char *msg, size_t msgsize) +{ + if ((err == ENOENT || err == ENXIO) && + evsel->attr.type == PERF_TYPE_HARDWARE && + evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) { + /* + * If it's cycles then fall back to hrtimer based + * cpu-clock-tick sw counter, which is always available even if + * no PMU support. + * + * PPC returns ENXIO until 2.6.37 (behavior changed with commit + * b0a873e). + */ + scnprintf(msg, msgsize, "%s", +"The cycles event is not supported, trying to fall back to cpu-clock-ticks"); + + evsel->attr.type = PERF_TYPE_SOFTWARE; + evsel->attr.config = PERF_COUNT_SW_CPU_CLOCK; + + free(evsel->name); + evsel->name = NULL; + return true; + } + + return false; +} + +int perf_evsel__open_strerror(struct perf_evsel *evsel, + struct perf_target *target, + int err, char *msg, size_t size) +{ + switch (err) { + case EPERM: + case EACCES: + return scnprintf(msg, size, "%s", + "You may not have permission to collect %sstats.\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" + " -1 - Not paranoid at all\n" + " 0 - Disallow raw tracepoint access for unpriv\n" + " 1 - Disallow cpu events for unpriv\n" + " 2 - Disallow kernel profiling for unpriv", + target->system_wide ? 
"system-wide " : ""); + case ENOENT: + return scnprintf(msg, size, "The %s event is not supported.", + perf_evsel__name(evsel)); + case EMFILE: + return scnprintf(msg, size, "%s", + "Too many events are opened.\n" + "Try again after reducing the number of events."); + case ENODEV: + if (target->cpu_list) + return scnprintf(msg, size, "%s", + "No such device - did you specify an out-of-range profile CPU?\n"); + break; + case EOPNOTSUPP: + if (evsel->attr.precise_ip) + return scnprintf(msg, size, "%s", + "\'precise\' request may not be supported. Try removing 'p' modifier."); +#if defined(__i386__) || defined(__x86_64__) + if (evsel->attr.type == PERF_TYPE_HARDWARE) + return scnprintf(msg, size, "%s", + "No hardware sampling interrupt available.\n" + "No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it."); +#endif + break; + default: + break; + } + + return scnprintf(msg, size, + "The sys_perf_event_open() syscall returned with %d (%s) for event (%s). 
\n" + "/bin/dmesg may provide additional information.\n" + "No CONFIG_PERF_EVENTS=y kernel support configured?\n", + err, strerror(err), perf_evsel__name(evsel)); +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 3d2b8017438c..c68d1b82e843 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -118,6 +118,19 @@ void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__free_id(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); +void __perf_evsel__set_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit); +void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit); + +#define perf_evsel__set_sample_bit(evsel, bit) \ + __perf_evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit) + +#define perf_evsel__reset_sample_bit(evsel, bit) \ + __perf_evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit) + +void perf_evsel__set_sample_id(struct perf_evsel *evsel); + int perf_evsel__set_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); @@ -226,8 +239,22 @@ static inline struct perf_evsel *perf_evsel__next(struct perf_evsel *evsel) return list_entry(evsel->node.next, struct perf_evsel, node); } -static inline bool perf_evsel__is_group_member(const struct perf_evsel *evsel) +static inline bool perf_evsel__is_group_leader(const struct perf_evsel *evsel) { - return evsel->leader != NULL; + return evsel->leader == evsel; } + +struct perf_attr_details { + bool freq; + bool verbose; +}; + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp); + +bool perf_evsel__fallback(struct perf_evsel *evsel, int err, + char *msg, size_t msgsize); +int perf_evsel__open_strerror(struct perf_evsel *evsel, + struct perf_target *target, + int err, char *msg, size_t size); #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 
b7da4634a047..fccd69dbbbb9 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -148,7 +148,7 @@ static char *do_read_string(int fd, struct perf_header *ph) u32 len; char *buf; - sz = read(fd, &len, sizeof(len)); + sz = readn(fd, &len, sizeof(len)); if (sz < (ssize_t)sizeof(len)) return NULL; @@ -159,7 +159,7 @@ static char *do_read_string(int fd, struct perf_header *ph) if (!buf) return NULL; - ret = read(fd, buf, len); + ret = readn(fd, buf, len); if (ret == (ssize_t)len) { /* * strings are padded by zeroes @@ -287,12 +287,12 @@ static int dsos__write_buildid_table(struct perf_header *header, int fd) struct perf_session *session = container_of(header, struct perf_session, header); struct rb_node *nd; - int err = machine__write_buildid_table(&session->host_machine, fd); + int err = machine__write_buildid_table(&session->machines.host, fd); if (err) return err; - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); err = machine__write_buildid_table(pos, fd); if (err) @@ -448,9 +448,9 @@ static int perf_session__cache_build_ids(struct perf_session *session) if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) return -1; - ret = machine__cache_build_ids(&session->host_machine, debugdir); + ret = machine__cache_build_ids(&session->machines.host, debugdir); - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__cache_build_ids(pos, debugdir); } @@ -467,9 +467,9 @@ static bool machine__read_build_ids(struct machine *machine, bool with_hits) static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) { struct rb_node *nd; - bool ret = machine__read_build_ids(&session->host_machine, with_hits); + bool ret = 
machine__read_build_ids(&session->machines.host, with_hits); - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__read_build_ids(pos, with_hits); } @@ -1051,16 +1051,25 @@ static int write_pmu_mappings(int fd, struct perf_header *h __maybe_unused, struct perf_pmu *pmu = NULL; off_t offset = lseek(fd, 0, SEEK_CUR); __u32 pmu_num = 0; + int ret; /* write real pmu_num later */ - do_write(fd, &pmu_num, sizeof(pmu_num)); + ret = do_write(fd, &pmu_num, sizeof(pmu_num)); + if (ret < 0) + return ret; while ((pmu = perf_pmu__scan(pmu))) { if (!pmu->name) continue; pmu_num++; - do_write(fd, &pmu->type, sizeof(pmu->type)); - do_write_string(fd, pmu->name); + + ret = do_write(fd, &pmu->type, sizeof(pmu->type)); + if (ret < 0) + return ret; + + ret = do_write_string(fd, pmu->name); + if (ret < 0) + return ret; } if (pwrite(fd, &pmu_num, sizeof(pmu_num), offset) != sizeof(pmu_num)) { @@ -1209,14 +1218,14 @@ read_event_desc(struct perf_header *ph, int fd) size_t msz; /* number of events */ - ret = read(fd, &nre, sizeof(nre)); + ret = readn(fd, &nre, sizeof(nre)); if (ret != (ssize_t)sizeof(nre)) goto error; if (ph->needs_swap) nre = bswap_32(nre); - ret = read(fd, &sz, sizeof(sz)); + ret = readn(fd, &sz, sizeof(sz)); if (ret != (ssize_t)sizeof(sz)) goto error; @@ -1244,7 +1253,7 @@ read_event_desc(struct perf_header *ph, int fd) * must read entire on-file attr struct to * sync up with layout. 
*/ - ret = read(fd, buf, sz); + ret = readn(fd, buf, sz); if (ret != (ssize_t)sz) goto error; @@ -1253,7 +1262,7 @@ read_event_desc(struct perf_header *ph, int fd) memcpy(&evsel->attr, buf, msz); - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != (ssize_t)sizeof(nr)) goto error; @@ -1274,7 +1283,7 @@ read_event_desc(struct perf_header *ph, int fd) evsel->id = id; for (j = 0 ; j < nr; j++) { - ret = read(fd, id, sizeof(*id)); + ret = readn(fd, id, sizeof(*id)); if (ret != (ssize_t)sizeof(*id)) goto error; if (ph->needs_swap) @@ -1506,14 +1515,14 @@ static int perf_header__read_build_ids_abi_quirk(struct perf_header *header, while (offset < limit) { ssize_t len; - if (read(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev)) + if (readn(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev)) return -1; if (header->needs_swap) perf_event_header__bswap(&old_bev.header); len = old_bev.header.size - sizeof(old_bev); - if (read(input, filename, len) != len) + if (readn(input, filename, len) != len) return -1; bev.header = old_bev.header; @@ -1548,14 +1557,14 @@ static int perf_header__read_build_ids(struct perf_header *header, while (offset < limit) { ssize_t len; - if (read(input, &bev, sizeof(bev)) != sizeof(bev)) + if (readn(input, &bev, sizeof(bev)) != sizeof(bev)) goto out; if (header->needs_swap) perf_event_header__bswap(&bev.header); len = bev.header.size - sizeof(bev); - if (read(input, filename, len) != len) + if (readn(input, filename, len) != len) goto out; /* * The a1645ce1 changeset: @@ -1641,7 +1650,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, size_t ret; u32 nr; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1650,7 +1659,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, ph->env.nr_cpus_online = nr; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; 
@@ -1684,7 +1693,7 @@ static int process_total_mem(struct perf_file_section *section __maybe_unused, uint64_t mem; size_t ret; - ret = read(fd, &mem, sizeof(mem)); + ret = readn(fd, &mem, sizeof(mem)); if (ret != sizeof(mem)) return -1; @@ -1756,7 +1765,7 @@ static int process_cmdline(struct perf_file_section *section __maybe_unused, u32 nr, i; struct strbuf sb; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1792,7 +1801,7 @@ static int process_cpu_topology(struct perf_file_section *section __maybe_unused char *str; struct strbuf sb; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1813,7 +1822,7 @@ static int process_cpu_topology(struct perf_file_section *section __maybe_unused } ph->env.sibling_cores = strbuf_detach(&sb, NULL); - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1850,7 +1859,7 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse struct strbuf sb; /* nr nodes */ - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) goto error; @@ -1862,15 +1871,15 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse for (i = 0; i < nr; i++) { /* node number */ - ret = read(fd, &node, sizeof(node)); + ret = readn(fd, &node, sizeof(node)); if (ret != sizeof(node)) goto error; - ret = read(fd, &mem_total, sizeof(u64)); + ret = readn(fd, &mem_total, sizeof(u64)); if (ret != sizeof(u64)) goto error; - ret = read(fd, &mem_free, sizeof(u64)); + ret = readn(fd, &mem_free, sizeof(u64)); if (ret != sizeof(u64)) goto error; @@ -1909,7 +1918,7 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused u32 type; struct strbuf sb; - ret = read(fd, &pmu_num, sizeof(pmu_num)); + ret = readn(fd, &pmu_num, sizeof(pmu_num)); if (ret != sizeof(pmu_num)) return -1; @@ -1925,7 +1934,7 
@@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused strbuf_init(&sb, 128); while (pmu_num) { - if (read(fd, &type, sizeof(type)) != sizeof(type)) + if (readn(fd, &type, sizeof(type)) != sizeof(type)) goto error; if (ph->needs_swap) type = bswap_32(type); @@ -2912,7 +2921,7 @@ int perf_event__process_tracing_data(union perf_event *event, session->repipe); padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read; - if (read(session->fd, buf, padding) < 0) + if (readn(session->fd, buf, padding) < 0) die("reading input file"); if (session->repipe) { int retw = write(STDOUT_FILENO, buf, padding); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index cb17e2a8c6ed..8170a3d11ffa 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -82,6 +82,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_DSO, len); } + if (h->parent) + hists__new_col_len(hists, HISTC_PARENT, h->parent->namelen); + if (h->branch_info) { int symlen; /* @@ -242,6 +245,14 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template) if (he->ms.map) he->ms.map->referenced = true; + + if (he->branch_info) { + if (he->branch_info->from.map) + he->branch_info->from.map->referenced = true; + if (he->branch_info->to.map) + he->branch_info->to.map->referenced = true; + } + if (symbol_conf.use_callchain) callchain_init(he->callchain); @@ -251,7 +262,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template) return he; } -static void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h) +void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h) { if (!h->filtered) { hists__calc_col_len(hists, h); @@ -285,7 +296,13 @@ static struct hist_entry *add_hist_entry(struct hists *hists, parent = *p; he = rb_entry(parent, struct hist_entry, rb_node_in); - cmp = hist_entry__cmp(entry, he); + /* + * Make sure that it receives arguments in a same order as + * 
hist_entry__collapse() so that we can use an appropriate + * function when searching an entry regardless which sort + * keys were used. + */ + cmp = hist_entry__cmp(he, entry); if (!cmp) { he_stat__add_period(&he->stat, period); @@ -711,25 +728,38 @@ int hist_entry__annotate(struct hist_entry *he, size_t privsize) return symbol__annotate(he->ms.sym, he->ms.map, privsize); } +void events_stats__inc(struct events_stats *stats, u32 type) +{ + ++stats->nr_events[0]; + ++stats->nr_events[type]; +} + void hists__inc_nr_events(struct hists *hists, u32 type) { - ++hists->stats.nr_events[0]; - ++hists->stats.nr_events[type]; + events_stats__inc(&hists->stats, type); } static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *pair) { - struct rb_node **p = &hists->entries.rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; struct hist_entry *he; int cmp; + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + p = &root->rb_node; + while (*p != NULL) { parent = *p; - he = rb_entry(parent, struct hist_entry, rb_node); + he = rb_entry(parent, struct hist_entry, rb_node_in); - cmp = hist_entry__cmp(pair, he); + cmp = hist_entry__collapse(he, pair); if (!cmp) goto out; @@ -744,8 +774,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, if (he) { memset(&he->stat, 0, sizeof(he->stat)); he->hists = hists; - rb_link_node(&he->rb_node, parent, p); - rb_insert_color(&he->rb_node, &hists->entries); + rb_link_node(&he->rb_node_in, parent, p); + rb_insert_color(&he->rb_node_in, root); hists__inc_nr_entries(hists, he); } out: @@ -755,11 +785,16 @@ out: static struct hist_entry *hists__find_entry(struct hists *hists, struct hist_entry *he) { - struct rb_node *n = hists->entries.rb_node; + struct rb_node *n; + + if (sort__need_collapse) + n = hists->entries_collapsed.rb_node; + else + n = hists->entries_in->rb_node; while (n) { - struct hist_entry *iter = 
rb_entry(n, struct hist_entry, rb_node); - int64_t cmp = hist_entry__cmp(he, iter); + struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node_in); + int64_t cmp = hist_entry__collapse(iter, he); if (cmp < 0) n = n->rb_left; @@ -777,15 +812,21 @@ static struct hist_entry *hists__find_entry(struct hists *hists, */ void hists__match(struct hists *leader, struct hists *other) { + struct rb_root *root; struct rb_node *nd; struct hist_entry *pos, *pair; - for (nd = rb_first(&leader->entries); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct hist_entry, rb_node); + if (sort__need_collapse) + root = &leader->entries_collapsed; + else + root = leader->entries_in; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct hist_entry, rb_node_in); pair = hists__find_entry(other, pos); if (pair) - hist__entry_add_pair(pos, pair); + hist_entry__add_pair(pair, pos); } } @@ -796,17 +837,23 @@ void hists__match(struct hists *leader, struct hists *other) */ int hists__link(struct hists *leader, struct hists *other) { + struct rb_root *root; struct rb_node *nd; struct hist_entry *pos, *pair; - for (nd = rb_first(&other->entries); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct hist_entry, rb_node); + if (sort__need_collapse) + root = &other->entries_collapsed; + else + root = other->entries_in; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct hist_entry, rb_node_in); if (!hist_entry__has_pairs(pos)) { pair = hists__add_dummy_entry(leader, pos); if (pair == NULL) return -1; - hist__entry_add_pair(pair, pos); + hist_entry__add_pair(pos, pair); } } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 8b091a51e4a2..38624686ee9a 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -96,8 +96,10 @@ void hists__decay_entries_threaded(struct hists *hists, bool zap_user, bool zap_kernel); void hists__output_recalc_col_len(struct hists *hists, int max_rows); +void hists__inc_nr_entries(struct 
hists *hists, struct hist_entry *h); void hists__inc_nr_events(struct hists *self, u32 type); -size_t hists__fprintf_nr_events(struct hists *self, FILE *fp); +void events_stats__inc(struct events_stats *stats, u32 type); +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp); size_t hists__fprintf(struct hists *self, bool show_header, int max_rows, int max_cols, FILE *fp); @@ -126,13 +128,19 @@ struct perf_hpp { }; struct perf_hpp_fmt { - bool cond; int (*header)(struct perf_hpp *hpp); int (*width)(struct perf_hpp *hpp); int (*color)(struct perf_hpp *hpp, struct hist_entry *he); int (*entry)(struct perf_hpp *hpp, struct hist_entry *he); + + struct list_head list; }; +extern struct list_head perf_hpp__list; + +#define perf_hpp__for_each_format(format) \ + list_for_each_entry(format, &perf_hpp__list, list) + extern struct perf_hpp_fmt perf_hpp__format[]; enum { @@ -148,14 +156,14 @@ enum { PERF_HPP__DELTA, PERF_HPP__RATIO, PERF_HPP__WEIGHTED_DIFF, - PERF_HPP__DISPL, PERF_HPP__FORMULA, PERF_HPP__MAX_INDEX }; void perf_hpp__init(void); -void perf_hpp__column_enable(unsigned col, bool enable); +void perf_hpp__column_register(struct perf_hpp_fmt *format); +void perf_hpp__column_enable(unsigned col); int hist_entry__period_snprintf(struct perf_hpp *hpp, struct hist_entry *he, bool color); @@ -219,8 +227,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist __maybe_unused, unsigned int hists__sort_list_width(struct hists *self); -double perf_diff__compute_delta(struct hist_entry *he); -double perf_diff__compute_ratio(struct hist_entry *he); -s64 perf_diff__compute_wdiff(struct hist_entry *he); -int perf_diff__formula(char *buf, size_t size, struct hist_entry *he); +double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair); +double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair); +s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair); +int perf_diff__formula(struct hist_entry 
*he, struct hist_entry *pair, + char *buf, size_t size); +double perf_diff__period_percent(struct hist_entry *he, u64 period); #endif /* __PERF_HIST_H */ diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index a55d8cf083c9..45cf10a562bd 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -14,6 +14,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32)) +#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE) #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ diff --git a/tools/perf/util/intlist.c b/tools/perf/util/intlist.c index 9d0740024ba8..11a8d86f7fea 100644 --- a/tools/perf/util/intlist.c +++ b/tools/perf/util/intlist.c @@ -59,16 +59,40 @@ void intlist__remove(struct intlist *ilist, struct int_node *node) struct int_node *intlist__find(struct intlist *ilist, int i) { - struct int_node *node = NULL; - struct rb_node *rb_node = rblist__find(&ilist->rblist, (void *)((long)i)); + struct int_node *node; + struct rb_node *rb_node; + if (ilist == NULL) + return NULL; + + node = NULL; + rb_node = rblist__find(&ilist->rblist, (void *)((long)i)); if (rb_node) node = container_of(rb_node, struct int_node, rb_node); return node; } -struct intlist *intlist__new(void) +static int intlist__parse_list(struct intlist *ilist, const char *s) +{ + char *sep; + int err; + + do { + long value = strtol(s, &sep, 10); + err = -EINVAL; + if (*sep != ',' && *sep != '\0') + break; + err = intlist__add(ilist, value); + if (err) + break; + s = sep + 1; + } while (*sep != '\0'); + + return err; +} + +struct intlist *intlist__new(const char *slist) { struct intlist *ilist = malloc(sizeof(*ilist)); @@ -77,9 +101,15 @@ struct intlist *intlist__new(void) ilist->rblist.node_cmp = intlist__node_cmp; 
ilist->rblist.node_new = intlist__node_new; ilist->rblist.node_delete = intlist__node_delete; + + if (slist && intlist__parse_list(ilist, slist)) + goto out_delete; } return ilist; +out_delete: + intlist__delete(ilist); + return NULL; } void intlist__delete(struct intlist *ilist) diff --git a/tools/perf/util/intlist.h b/tools/perf/util/intlist.h index 6d63ab90db50..62351dad848f 100644 --- a/tools/perf/util/intlist.h +++ b/tools/perf/util/intlist.h @@ -15,7 +15,7 @@ struct intlist { struct rblist rblist; }; -struct intlist *intlist__new(void); +struct intlist *intlist__new(const char *slist); void intlist__delete(struct intlist *ilist); void intlist__remove(struct intlist *ilist, struct int_node *in); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 1f09d0581e6b..efdb38e65a92 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1,10 +1,15 @@ +#include "callchain.h" #include "debug.h" #include "event.h" +#include "evsel.h" +#include "hist.h" #include "machine.h" #include "map.h" +#include "sort.h" #include "strlist.h" #include "thread.h" #include <stdbool.h> +#include "unwind.h" int machine__init(struct machine *machine, const char *root_dir, pid_t pid) { @@ -48,6 +53,29 @@ static void dsos__delete(struct list_head *dsos) } } +void machine__delete_dead_threads(struct machine *machine) +{ + struct thread *n, *t; + + list_for_each_entry_safe(t, n, &machine->dead_threads, node) { + list_del(&t->node); + thread__delete(t); + } +} + +void machine__delete_threads(struct machine *machine) +{ + struct rb_node *nd = rb_first(&machine->threads); + + while (nd) { + struct thread *t = rb_entry(nd, struct thread, rb_node); + + rb_erase(&t->rb_node, &machine->threads); + nd = rb_next(nd); + thread__delete(t); + } +} + void machine__exit(struct machine *machine) { map_groups__exit(&machine->kmaps); @@ -63,10 +91,22 @@ void machine__delete(struct machine *machine) free(machine); } -struct machine *machines__add(struct rb_root *machines, 
pid_t pid, +void machines__init(struct machines *machines) +{ + machine__init(&machines->host, "", HOST_KERNEL_ID); + machines->guests = RB_ROOT; +} + +void machines__exit(struct machines *machines) +{ + machine__exit(&machines->host); + /* XXX exit guest */ +} + +struct machine *machines__add(struct machines *machines, pid_t pid, const char *root_dir) { - struct rb_node **p = &machines->rb_node; + struct rb_node **p = &machines->guests.rb_node; struct rb_node *parent = NULL; struct machine *pos, *machine = malloc(sizeof(*machine)); @@ -88,18 +128,21 @@ struct machine *machines__add(struct rb_root *machines, pid_t pid, } rb_link_node(&machine->rb_node, parent, p); - rb_insert_color(&machine->rb_node, machines); + rb_insert_color(&machine->rb_node, &machines->guests); return machine; } -struct machine *machines__find(struct rb_root *machines, pid_t pid) +struct machine *machines__find(struct machines *machines, pid_t pid) { - struct rb_node **p = &machines->rb_node; + struct rb_node **p = &machines->guests.rb_node; struct rb_node *parent = NULL; struct machine *machine; struct machine *default_machine = NULL; + if (pid == HOST_KERNEL_ID) + return &machines->host; + while (*p != NULL) { parent = *p; machine = rb_entry(parent, struct machine, rb_node); @@ -116,7 +159,7 @@ struct machine *machines__find(struct rb_root *machines, pid_t pid) return default_machine; } -struct machine *machines__findnew(struct rb_root *machines, pid_t pid) +struct machine *machines__findnew(struct machines *machines, pid_t pid) { char path[PATH_MAX]; const char *root_dir = ""; @@ -150,12 +193,12 @@ out: return machine; } -void machines__process(struct rb_root *machines, - machine__process_t process, void *data) +void machines__process_guests(struct machines *machines, + machine__process_t process, void *data) { struct rb_node *nd; - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, 
struct machine, rb_node); process(pos, data); } @@ -175,12 +218,14 @@ char *machine__mmap_name(struct machine *machine, char *bf, size_t size) return bf; } -void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size) +void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size) { struct rb_node *node; struct machine *machine; - for (node = rb_first(machines); node; node = rb_next(node)) { + machines->host.id_hdr_size = id_hdr_size; + + for (node = rb_first(&machines->guests); node; node = rb_next(node)) { machine = rb_entry(node, struct machine, rb_node); machine->id_hdr_size = id_hdr_size; } @@ -264,6 +309,537 @@ int machine__process_lost_event(struct machine *machine __maybe_unused, return 0; } +struct map *machine__new_module(struct machine *machine, u64 start, + const char *filename) +{ + struct map *map; + struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename); + + if (dso == NULL) + return NULL; + + map = map__new2(start, dso, MAP__FUNCTION); + if (map == NULL) + return NULL; + + if (machine__is_host(machine)) + dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; + else + dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; + map_groups__insert(&machine->kmaps, map); + return map; +} + +size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) +{ + struct rb_node *nd; + size_t ret = __dsos__fprintf(&machines->host.kernel_dsos, fp) + + __dsos__fprintf(&machines->host.user_dsos, fp); + + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret += __dsos__fprintf(&pos->kernel_dsos, fp); + ret += __dsos__fprintf(&pos->user_dsos, fp); + } + + return ret; +} + +size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm) +{ + return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, skip, parm) + + __dsos__fprintf_buildid(&machine->user_dsos, fp, skip, parm); +} + +size_t 
machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm) +{ + struct rb_node *nd; + size_t ret = machine__fprintf_dsos_buildid(&machines->host, fp, skip, parm); + + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret += machine__fprintf_dsos_buildid(pos, fp, skip, parm); + } + return ret; +} + +size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) +{ + int i; + size_t printed = 0; + struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso; + + if (kdso->has_build_id) { + char filename[PATH_MAX]; + if (dso__build_id_filename(kdso, filename, sizeof(filename))) + printed += fprintf(fp, "[0] %s\n", filename); + } + + for (i = 0; i < vmlinux_path__nr_entries; ++i) + printed += fprintf(fp, "[%d] %s\n", + i + kdso->has_build_id, vmlinux_path[i]); + + return printed; +} + +size_t machine__fprintf(struct machine *machine, FILE *fp) +{ + size_t ret = 0; + struct rb_node *nd; + + for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { + struct thread *pos = rb_entry(nd, struct thread, rb_node); + + ret += thread__fprintf(pos, fp); + } + + return ret; +} + +static struct dso *machine__get_kernel(struct machine *machine) +{ + const char *vmlinux_name = NULL; + struct dso *kernel; + + if (machine__is_host(machine)) { + vmlinux_name = symbol_conf.vmlinux_name; + if (!vmlinux_name) + vmlinux_name = "[kernel.kallsyms]"; + + kernel = dso__kernel_findnew(machine, vmlinux_name, + "[kernel]", + DSO_TYPE_KERNEL); + } else { + char bf[PATH_MAX]; + + if (machine__is_default_guest(machine)) + vmlinux_name = symbol_conf.default_guest_vmlinux_name; + if (!vmlinux_name) + vmlinux_name = machine__mmap_name(machine, bf, + sizeof(bf)); + + kernel = dso__kernel_findnew(machine, vmlinux_name, + "[guest.kernel]", + DSO_TYPE_GUEST_KERNEL); + } + + if (kernel != NULL && (!kernel->has_build_id)) + 
dso__read_running_kernel_build_id(kernel, machine); + + return kernel; +} + +struct process_args { + u64 start; +}; + +static int symbol__in_kernel(void *arg, const char *name, + char type __maybe_unused, u64 start) +{ + struct process_args *args = arg; + + if (strchr(name, '[')) + return 0; + + args->start = start; + return 1; +} + +/* Figure out the start address of kernel map from /proc/kallsyms */ +static u64 machine__get_kernel_start_addr(struct machine *machine) +{ + const char *filename; + char path[PATH_MAX]; + struct process_args args; + + if (machine__is_host(machine)) { + filename = "/proc/kallsyms"; + } else { + if (machine__is_default_guest(machine)) + filename = (char *)symbol_conf.default_guest_kallsyms; + else { + sprintf(path, "%s/proc/kallsyms", machine->root_dir); + filename = path; + } + } + + if (symbol__restricted_filename(filename, "/proc/kallsyms")) + return 0; + + if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0) + return 0; + + return args.start; +} + +int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) +{ + enum map_type type; + u64 start = machine__get_kernel_start_addr(machine); + + for (type = 0; type < MAP__NR_TYPES; ++type) { + struct kmap *kmap; + + machine->vmlinux_maps[type] = map__new2(start, kernel, type); + if (machine->vmlinux_maps[type] == NULL) + return -1; + + machine->vmlinux_maps[type]->map_ip = + machine->vmlinux_maps[type]->unmap_ip = + identity__map_ip; + kmap = map__kmap(machine->vmlinux_maps[type]); + kmap->kmaps = &machine->kmaps; + map_groups__insert(&machine->kmaps, + machine->vmlinux_maps[type]); + } + + return 0; +} + +void machine__destroy_kernel_maps(struct machine *machine) +{ + enum map_type type; + + for (type = 0; type < MAP__NR_TYPES; ++type) { + struct kmap *kmap; + + if (machine->vmlinux_maps[type] == NULL) + continue; + + kmap = map__kmap(machine->vmlinux_maps[type]); + map_groups__remove(&machine->kmaps, + machine->vmlinux_maps[type]); + if (kmap->ref_reloc_sym) 
{ + /* + * ref_reloc_sym is shared among all maps, so free just + * on one of them. + */ + if (type == MAP__FUNCTION) { + free((char *)kmap->ref_reloc_sym->name); + kmap->ref_reloc_sym->name = NULL; + free(kmap->ref_reloc_sym); + } + kmap->ref_reloc_sym = NULL; + } + + map__delete(machine->vmlinux_maps[type]); + machine->vmlinux_maps[type] = NULL; + } +} + +int machines__create_guest_kernel_maps(struct machines *machines) +{ + int ret = 0; + struct dirent **namelist = NULL; + int i, items = 0; + char path[PATH_MAX]; + pid_t pid; + char *endp; + + if (symbol_conf.default_guest_vmlinux_name || + symbol_conf.default_guest_modules || + symbol_conf.default_guest_kallsyms) { + machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID); + } + + if (symbol_conf.guestmount) { + items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL); + if (items <= 0) + return -ENOENT; + for (i = 0; i < items; i++) { + if (!isdigit(namelist[i]->d_name[0])) { + /* Filter out . and .. */ + continue; + } + pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10); + if ((*endp != '\0') || + (endp == namelist[i]->d_name) || + (errno == ERANGE)) { + pr_debug("invalid directory (%s). 
Skipping.\n", + namelist[i]->d_name); + continue; + } + sprintf(path, "%s/%s/proc/kallsyms", + symbol_conf.guestmount, + namelist[i]->d_name); + ret = access(path, R_OK); + if (ret) { + pr_debug("Can't access file %s\n", path); + goto failure; + } + machines__create_kernel_maps(machines, pid); + } +failure: + free(namelist); + } + + return ret; +} + +void machines__destroy_kernel_maps(struct machines *machines) +{ + struct rb_node *next = rb_first(&machines->guests); + + machine__destroy_kernel_maps(&machines->host); + + while (next) { + struct machine *pos = rb_entry(next, struct machine, rb_node); + + next = rb_next(&pos->rb_node); + rb_erase(&pos->rb_node, &machines->guests); + machine__delete(pos); + } +} + +int machines__create_kernel_maps(struct machines *machines, pid_t pid) +{ + struct machine *machine = machines__findnew(machines, pid); + + if (machine == NULL) + return -1; + + return machine__create_kernel_maps(machine); +} + +int machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, symbol_filter_t filter) +{ + struct map *map = machine->vmlinux_maps[type]; + int ret = dso__load_kallsyms(map->dso, filename, map, filter); + + if (ret > 0) { + dso__set_loaded(map->dso, type); + /* + * Since /proc/kallsyms will have multiple sessions for the + * kernel, with modules between them, fixup the end of all + * sections. 
+ */ + __map_groups__fixup_end(&machine->kmaps, type); + } + + return ret; +} + +int machine__load_vmlinux_path(struct machine *machine, enum map_type type, + symbol_filter_t filter) +{ + struct map *map = machine->vmlinux_maps[type]; + int ret = dso__load_vmlinux_path(map->dso, map, filter); + + if (ret > 0) { + dso__set_loaded(map->dso, type); + map__reloc_vmlinux(map); + } + + return ret; +} + +static void map_groups__fixup_end(struct map_groups *mg) +{ + int i; + for (i = 0; i < MAP__NR_TYPES; ++i) + __map_groups__fixup_end(mg, i); +} + +static char *get_kernel_version(const char *root_dir) +{ + char version[PATH_MAX]; + FILE *file; + char *name, *tmp; + const char *prefix = "Linux version "; + + sprintf(version, "%s/proc/version", root_dir); + file = fopen(version, "r"); + if (!file) + return NULL; + + version[0] = '\0'; + tmp = fgets(version, sizeof(version), file); + fclose(file); + + name = strstr(version, prefix); + if (!name) + return NULL; + name += strlen(prefix); + tmp = strchr(name, ' '); + if (tmp) + *tmp = '\0'; + + return strdup(name); +} + +static int map_groups__set_modules_path_dir(struct map_groups *mg, + const char *dir_name) +{ + struct dirent *dent; + DIR *dir = opendir(dir_name); + int ret = 0; + + if (!dir) { + pr_debug("%s: cannot open %s dir\n", __func__, dir_name); + return -1; + } + + while ((dent = readdir(dir)) != NULL) { + char path[PATH_MAX]; + struct stat st; + + /*sshfs might return bad dent->d_type, so we have to stat*/ + snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); + if (stat(path, &st)) + continue; + + if (S_ISDIR(st.st_mode)) { + if (!strcmp(dent->d_name, ".") || + !strcmp(dent->d_name, "..")) + continue; + + ret = map_groups__set_modules_path_dir(mg, path); + if (ret < 0) + goto out; + } else { + char *dot = strrchr(dent->d_name, '.'), + dso_name[PATH_MAX]; + struct map *map; + char *long_name; + + if (dot == NULL || strcmp(dot, ".ko")) + continue; + snprintf(dso_name, sizeof(dso_name), "[%.*s]", + 
(int)(dot - dent->d_name), dent->d_name); + + strxfrchar(dso_name, '-', '_'); + map = map_groups__find_by_name(mg, MAP__FUNCTION, + dso_name); + if (map == NULL) + continue; + + long_name = strdup(path); + if (long_name == NULL) { + ret = -1; + goto out; + } + dso__set_long_name(map->dso, long_name); + map->dso->lname_alloc = 1; + dso__kernel_module_get_build_id(map->dso, ""); + } + } + +out: + closedir(dir); + return ret; +} + +static int machine__set_modules_path(struct machine *machine) +{ + char *version; + char modules_path[PATH_MAX]; + + version = get_kernel_version(machine->root_dir); + if (!version) + return -1; + + snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel", + machine->root_dir, version); + free(version); + + return map_groups__set_modules_path_dir(&machine->kmaps, modules_path); +} + +static int machine__create_modules(struct machine *machine) +{ + char *line = NULL; + size_t n; + FILE *file; + struct map *map; + const char *modules; + char path[PATH_MAX]; + + if (machine__is_default_guest(machine)) + modules = symbol_conf.default_guest_modules; + else { + sprintf(path, "%s/proc/modules", machine->root_dir); + modules = path; + } + + if (symbol__restricted_filename(path, "/proc/modules")) + return -1; + + file = fopen(modules, "r"); + if (file == NULL) + return -1; + + while (!feof(file)) { + char name[PATH_MAX]; + u64 start; + char *sep; + int line_len; + + line_len = getline(&line, &n, file); + if (line_len < 0) + break; + + if (!line) + goto out_failure; + + line[--line_len] = '\0'; /* \n */ + + sep = strrchr(line, 'x'); + if (sep == NULL) + continue; + + hex2u64(sep + 1, &start); + + sep = strchr(line, ' '); + if (sep == NULL) + continue; + + *sep = '\0'; + + snprintf(name, sizeof(name), "[%s]", line); + map = machine__new_module(machine, start, name); + if (map == NULL) + goto out_delete_line; + dso__kernel_module_get_build_id(map->dso, machine->root_dir); + } + + free(line); + fclose(file); + + return 
machine__set_modules_path(machine); + +out_delete_line: + free(line); +out_failure: + return -1; +} + +int machine__create_kernel_maps(struct machine *machine) +{ + struct dso *kernel = machine__get_kernel(machine); + + if (kernel == NULL || + __machine__create_kernel_maps(machine, kernel) < 0) + return -1; + + if (symbol_conf.use_modules && machine__create_modules(machine) < 0) { + if (machine__is_host(machine)) + pr_debug("Problems creating module maps, " + "continuing anyway...\n"); + else + pr_debug("Problems creating module maps for guest %d, " + "continuing anyway...\n", machine->pid); + } + + /* + * Now that we have all the maps created, just set the ->end of them: + */ + map_groups__fixup_end(&machine->kmaps); + return 0; +} + static void machine__set_kernel_mmap_len(struct machine *machine, union perf_event *event) { @@ -462,3 +1038,189 @@ int machine__process_event(struct machine *machine, union perf_event *event) return ret; } + +void machine__remove_thread(struct machine *machine, struct thread *th) +{ + machine->last_match = NULL; + rb_erase(&th->rb_node, &machine->threads); + /* + * We may have references to this thread, for instance in some hist_entry + * instances, so just move them to a separate list. 
+ */ + list_add_tail(&th->node, &machine->dead_threads); +} + +static bool symbol__match_parent_regex(struct symbol *sym) +{ + if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0)) + return 1; + + return 0; +} + +static const u8 cpumodes[] = { + PERF_RECORD_MISC_USER, + PERF_RECORD_MISC_KERNEL, + PERF_RECORD_MISC_GUEST_USER, + PERF_RECORD_MISC_GUEST_KERNEL +}; +#define NCPUMODES (sizeof(cpumodes)/sizeof(u8)) + +static void ip__resolve_ams(struct machine *machine, struct thread *thread, + struct addr_map_symbol *ams, + u64 ip) +{ + struct addr_location al; + size_t i; + u8 m; + + memset(&al, 0, sizeof(al)); + + for (i = 0; i < NCPUMODES; i++) { + m = cpumodes[i]; + /* + * We cannot use the header.misc hint to determine whether a + * branch stack address is user, kernel, guest, hypervisor. + * Branches may straddle the kernel/user/hypervisor boundaries. + * Thus, we have to try consecutively until we find a match + * or else, the symbol is unknown + */ + thread__find_addr_location(thread, machine, m, MAP__FUNCTION, + ip, &al, NULL); + if (al.sym) + goto found; + } +found: + ams->addr = ip; + ams->al_addr = al.addr; + ams->sym = al.sym; + ams->map = al.map; +} + +struct branch_info *machine__resolve_bstack(struct machine *machine, + struct thread *thr, + struct branch_stack *bs) +{ + struct branch_info *bi; + unsigned int i; + + bi = calloc(bs->nr, sizeof(struct branch_info)); + if (!bi) + return NULL; + + for (i = 0; i < bs->nr; i++) { + ip__resolve_ams(machine, thr, &bi[i].to, bs->entries[i].to); + ip__resolve_ams(machine, thr, &bi[i].from, bs->entries[i].from); + bi[i].flags = bs->entries[i].flags; + } + return bi; +} + +static int machine__resolve_callchain_sample(struct machine *machine, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent) + +{ + u8 cpumode = PERF_RECORD_MISC_USER; + unsigned int i; + int err; + + callchain_cursor_reset(&callchain_cursor); + + if (chain->nr > PERF_MAX_STACK_DEPTH) { + pr_warning("corrupted 
callchain. skipping...\n"); + return 0; + } + + for (i = 0; i < chain->nr; i++) { + u64 ip; + struct addr_location al; + + if (callchain_param.order == ORDER_CALLEE) + ip = chain->ips[i]; + else + ip = chain->ips[chain->nr - i - 1]; + + if (ip >= PERF_CONTEXT_MAX) { + switch (ip) { + case PERF_CONTEXT_HV: + cpumode = PERF_RECORD_MISC_HYPERVISOR; + break; + case PERF_CONTEXT_KERNEL: + cpumode = PERF_RECORD_MISC_KERNEL; + break; + case PERF_CONTEXT_USER: + cpumode = PERF_RECORD_MISC_USER; + break; + default: + pr_debug("invalid callchain context: " + "%"PRId64"\n", (s64) ip); + /* + * It seems the callchain is corrupted. + * Discard all. + */ + callchain_cursor_reset(&callchain_cursor); + return 0; + } + continue; + } + + al.filtered = false; + thread__find_addr_location(thread, machine, cpumode, + MAP__FUNCTION, ip, &al, NULL); + if (al.sym != NULL) { + if (sort__has_parent && !*parent && + symbol__match_parent_regex(al.sym)) + *parent = al.sym; + if (!symbol_conf.use_callchain) + break; + } + + err = callchain_cursor_append(&callchain_cursor, + ip, al.map, al.sym); + if (err) + return err; + } + + return 0; +} + +static int unwind_entry(struct unwind_entry *entry, void *arg) +{ + struct callchain_cursor *cursor = arg; + return callchain_cursor_append(cursor, entry->ip, + entry->map, entry->sym); +} + +int machine__resolve_callchain(struct machine *machine, + struct perf_evsel *evsel, + struct thread *thread, + struct perf_sample *sample, + struct symbol **parent) + +{ + int ret; + + callchain_cursor_reset(&callchain_cursor); + + ret = machine__resolve_callchain_sample(machine, thread, + sample->callchain, parent); + if (ret) + return ret; + + /* Can we do dwarf post unwind? */ + if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && + (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) + return 0; + + /* Bail out if nothing was captured. 
*/ + if ((!sample->user_regs.regs) || + (!sample->user_stack.size)) + return 0; + + return unwind__get_entries(unwind_entry, &callchain_cursor, machine, + thread, evsel->attr.sample_regs_user, + sample); + +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index b7cde7467d55..5ac5892f2326 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -47,23 +47,32 @@ int machine__process_event(struct machine *machine, union perf_event *event); typedef void (*machine__process_t)(struct machine *machine, void *data); -void machines__process(struct rb_root *machines, - machine__process_t process, void *data); +struct machines { + struct machine host; + struct rb_root guests; +}; + +void machines__init(struct machines *machines); +void machines__exit(struct machines *machines); -struct machine *machines__add(struct rb_root *machines, pid_t pid, +void machines__process_guests(struct machines *machines, + machine__process_t process, void *data); + +struct machine *machines__add(struct machines *machines, pid_t pid, const char *root_dir); -struct machine *machines__find_host(struct rb_root *machines); -struct machine *machines__find(struct rb_root *machines, pid_t pid); -struct machine *machines__findnew(struct rb_root *machines, pid_t pid); +struct machine *machines__find_host(struct machines *machines); +struct machine *machines__find(struct machines *machines, pid_t pid); +struct machine *machines__findnew(struct machines *machines, pid_t pid); -void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size); +void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size); char *machine__mmap_name(struct machine *machine, char *bf, size_t size); int machine__init(struct machine *machine, const char *root_dir, pid_t pid); void machine__exit(struct machine *machine); +void machine__delete_dead_threads(struct machine *machine); +void machine__delete_threads(struct machine *machine); void machine__delete(struct machine 
*machine); - struct branch_info *machine__resolve_bstack(struct machine *machine, struct thread *thread, struct branch_stack *bs); @@ -129,19 +138,19 @@ int machine__load_kallsyms(struct machine *machine, const char *filename, int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter); -size_t machine__fprintf_dsos_buildid(struct machine *machine, - FILE *fp, bool with_hits); -size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp); -size_t machines__fprintf_dsos_buildid(struct rb_root *machines, - FILE *fp, bool with_hits); +size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm); +size_t machines__fprintf_dsos(struct machines *machines, FILE *fp); +size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm); void machine__destroy_kernel_maps(struct machine *machine); int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel); int machine__create_kernel_maps(struct machine *machine); -int machines__create_kernel_maps(struct rb_root *machines, pid_t pid); -int machines__create_guest_kernel_maps(struct rb_root *machines); -void machines__destroy_guest_kernel_maps(struct rb_root *machines); +int machines__create_kernel_maps(struct machines *machines, pid_t pid); +int machines__create_guest_kernel_maps(struct machines *machines); +void machines__destroy_kernel_maps(struct machines *machines); size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 0328d45c4f2a..ff94425779a2 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -19,7 +19,8 @@ const char *map_type__name[MAP__NR_TYPES] = { static inline int is_anon_memory(const char *filename) { - return strcmp(filename, "//anon") == 0; + return !strcmp(filename, "//anon") || + !strcmp(filename, "/anon_hugepage (deleted)"); } 
static inline int is_no_dso_memory(const char *filename) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2d8d53bec17e..02f6421f03a0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -380,8 +380,8 @@ static int add_tracepoint(struct list_head **listp, int *idx, return 0; } -static int add_tracepoint_multi(struct list_head **list, int *idx, - char *sys_name, char *evt_name) +static int add_tracepoint_multi_event(struct list_head **list, int *idx, + char *sys_name, char *evt_name) { char evt_path[MAXPATHLEN]; struct dirent *evt_ent; @@ -408,6 +408,47 @@ static int add_tracepoint_multi(struct list_head **list, int *idx, ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name); } + closedir(evt_dir); + return ret; +} + +static int add_tracepoint_event(struct list_head **list, int *idx, + char *sys_name, char *evt_name) +{ + return strpbrk(evt_name, "*?") ? + add_tracepoint_multi_event(list, idx, sys_name, evt_name) : + add_tracepoint(list, idx, sys_name, evt_name); +} + +static int add_tracepoint_multi_sys(struct list_head **list, int *idx, + char *sys_name, char *evt_name) +{ + struct dirent *events_ent; + DIR *events_dir; + int ret = 0; + + events_dir = opendir(tracing_events_path); + if (!events_dir) { + perror("Can't open event dir"); + return -1; + } + + while (!ret && (events_ent = readdir(events_dir))) { + if (!strcmp(events_ent->d_name, ".") + || !strcmp(events_ent->d_name, "..") + || !strcmp(events_ent->d_name, "enable") + || !strcmp(events_ent->d_name, "header_event") + || !strcmp(events_ent->d_name, "header_page")) + continue; + + if (!strglobmatch(events_ent->d_name, sys_name)) + continue; + + ret = add_tracepoint_event(list, idx, events_ent->d_name, + evt_name); + } + + closedir(events_dir); return ret; } @@ -420,9 +461,10 @@ int parse_events_add_tracepoint(struct list_head **list, int *idx, if (ret) return ret; - return strpbrk(event, "*?") ? 
- add_tracepoint_multi(list, idx, sys, event) : - add_tracepoint(list, idx, sys, event); + if (strpbrk(sys, "*?")) + return add_tracepoint_multi_sys(list, idx, sys, event); + else + return add_tracepoint_event(list, idx, sys, event); } static int @@ -492,7 +534,7 @@ int parse_events_add_breakpoint(struct list_head **list, int *idx, } static int config_term(struct perf_event_attr *attr, - struct parse_events__term *term) + struct parse_events_term *term) { #define CHECK_TYPE_VAL(type) \ do { \ @@ -537,7 +579,7 @@ do { \ static int config_attr(struct perf_event_attr *attr, struct list_head *head, int fail) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head, list) if (config_term(attr, term) && fail) @@ -563,14 +605,14 @@ int parse_events_add_numeric(struct list_head **list, int *idx, return add_event(list, idx, &attr, NULL); } -static int parse_events__is_name_term(struct parse_events__term *term) +static int parse_events__is_name_term(struct parse_events_term *term) { return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME; } static char *pmu_event_name(struct list_head *head_terms) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head_terms, list) if (parse_events__is_name_term(term)) @@ -814,7 +856,7 @@ static int parse_events__scanner(const char *str, void *data, int start_token) */ int parse_events_terms(struct list_head *terms, const char *str) { - struct parse_events_data__terms data = { + struct parse_events_terms data = { .terms = NULL, }; int ret; @@ -830,10 +872,9 @@ int parse_events_terms(struct list_head *terms, const char *str) return ret; } -int parse_events(struct perf_evlist *evlist, const char *str, - int unset __maybe_unused) +int parse_events(struct perf_evlist *evlist, const char *str) { - struct parse_events_data__events data = { + struct parse_events_evlist data = { .list = LIST_HEAD_INIT(data.list), .idx = evlist->nr_entries, }; @@ -858,7 +899,7 
@@ int parse_events_option(const struct option *opt, const char *str, int unset __maybe_unused) { struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; - int ret = parse_events(evlist, str, unset); + int ret = parse_events(evlist, str); if (ret) { fprintf(stderr, "invalid or unsupported event: '%s'\n", str); @@ -1121,16 +1162,16 @@ void print_events(const char *event_glob, bool name_only) print_tracepoint_events(NULL, NULL, name_only); } -int parse_events__is_hardcoded_term(struct parse_events__term *term) +int parse_events__is_hardcoded_term(struct parse_events_term *term) { return term->type_term != PARSE_EVENTS__TERM_TYPE_USER; } -static int new_term(struct parse_events__term **_term, int type_val, +static int new_term(struct parse_events_term **_term, int type_val, int type_term, char *config, char *str, u64 num) { - struct parse_events__term *term; + struct parse_events_term *term; term = zalloc(sizeof(*term)); if (!term) @@ -1156,21 +1197,21 @@ static int new_term(struct parse_events__term **_term, int type_val, return 0; } -int parse_events__term_num(struct parse_events__term **term, +int parse_events_term__num(struct parse_events_term **term, int type_term, char *config, u64 num) { return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term, config, NULL, num); } -int parse_events__term_str(struct parse_events__term **term, +int parse_events_term__str(struct parse_events_term **term, int type_term, char *config, char *str) { return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term, config, str, 0); } -int parse_events__term_sym_hw(struct parse_events__term **term, +int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx) { struct event_symbol *sym; @@ -1188,8 +1229,8 @@ int parse_events__term_sym_hw(struct parse_events__term **term, (char *) "event", (char *) sym->symbol, 0); } -int parse_events__term_clone(struct parse_events__term **new, - struct parse_events__term *term) +int 
parse_events_term__clone(struct parse_events_term **new, + struct parse_events_term *term) { return new_term(new, term->type_val, term->type_term, term->config, term->val.str, term->val.num); @@ -1197,7 +1238,7 @@ int parse_events__term_clone(struct parse_events__term **new, void parse_events__free_terms(struct list_head *terms) { - struct parse_events__term *term, *h; + struct parse_events_term *term, *h; list_for_each_entry_safe(term, h, terms, list) free(term); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b7af80b8bdda..2cd2c42a69c5 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -29,8 +29,7 @@ const char *event_type(int type); extern int parse_events_option(const struct option *opt, const char *str, int unset); -extern int parse_events(struct perf_evlist *evlist, const char *str, - int unset); +extern int parse_events(struct perf_evlist *evlist, const char *str); extern int parse_events_terms(struct list_head *terms, const char *str); extern int parse_filter(const struct option *opt, const char *str, int unset); @@ -51,7 +50,7 @@ enum { PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE, }; -struct parse_events__term { +struct parse_events_term { char *config; union { char *str; @@ -62,24 +61,24 @@ struct parse_events__term { struct list_head list; }; -struct parse_events_data__events { +struct parse_events_evlist { struct list_head list; int idx; }; -struct parse_events_data__terms { +struct parse_events_terms { struct list_head *terms; }; -int parse_events__is_hardcoded_term(struct parse_events__term *term); -int parse_events__term_num(struct parse_events__term **_term, +int parse_events__is_hardcoded_term(struct parse_events_term *term); +int parse_events_term__num(struct parse_events_term **_term, int type_term, char *config, u64 num); -int parse_events__term_str(struct parse_events__term **_term, +int parse_events_term__str(struct parse_events_term **_term, int type_term, char *config, char 
*str); -int parse_events__term_sym_hw(struct parse_events__term **term, +int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx); -int parse_events__term_clone(struct parse_events__term **new, - struct parse_events__term *term); +int parse_events_term__clone(struct parse_events_term **new, + struct parse_events_term *term); void parse_events__free_terms(struct list_head *terms); int parse_events__modifier_event(struct list_head *list, char *str, bool add); int parse_events__modifier_group(struct list_head *list, char *event_mod); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 0f9914ae6bac..9d43c86176ff 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -68,7 +68,7 @@ do { \ char *str; u64 num; struct list_head *head; - struct parse_events__term *term; + struct parse_events_term *term; } %% @@ -79,7 +79,7 @@ PE_START_TERMS start_terms start_events: groups { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; parse_events_update_lists($1, &data->list); } @@ -186,7 +186,7 @@ event_def: event_pmu | event_pmu: PE_NAME '/' event_config '/' { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_pmu(&list, &data->idx, $1, $3)); @@ -202,7 +202,7 @@ PE_VALUE_SYM_SW event_legacy_symbol: value_sym '/' event_config '/' { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; @@ -215,7 +215,7 @@ value_sym '/' event_config '/' | value_sym sep_slash_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; @@ -228,7 +228,7 @@ value_sym sep_slash_dc event_legacy_cache: PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT 
'-' PE_NAME_CACHE_OP_RESULT { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, $5)); @@ -237,7 +237,7 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT | PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, NULL)); @@ -246,7 +246,7 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT | PE_NAME_CACHE_TYPE { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, NULL, NULL)); @@ -256,7 +256,7 @@ PE_NAME_CACHE_TYPE event_legacy_mem: PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, @@ -266,7 +266,7 @@ PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc | PE_PREFIX_MEM PE_VALUE sep_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, @@ -277,7 +277,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc event_legacy_tracepoint: PE_NAME ':' PE_NAME { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_tracepoint(&list, &data->idx, $1, $3)); @@ -287,7 +287,7 @@ PE_NAME ':' PE_NAME event_legacy_numeric: PE_VALUE ':' PE_VALUE { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_numeric(&list, &data->idx, (u32)$1, $3, NULL)); @@ -297,7 
+297,7 @@ PE_VALUE ':' PE_VALUE event_legacy_raw: PE_RAW { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_numeric(&list, &data->idx, @@ -307,7 +307,7 @@ PE_RAW start_terms: event_config { - struct parse_events_data__terms *data = _data; + struct parse_events_terms *data = _data; data->terms = $1; } @@ -315,7 +315,7 @@ event_config: event_config ',' event_term { struct list_head *head = $1; - struct parse_events__term *term = $3; + struct parse_events_term *term = $3; ABORT_ON(!head); list_add_tail(&term->list, head); @@ -325,7 +325,7 @@ event_config ',' event_term event_term { struct list_head *head = malloc(sizeof(*head)); - struct parse_events__term *term = $1; + struct parse_events_term *term = $1; ABORT_ON(!head); INIT_LIST_HEAD(head); @@ -336,70 +336,70 @@ event_term event_term: PE_NAME '=' PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_str(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, $3)); $$ = term; } | PE_NAME '=' PE_VALUE { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, $3)); $$ = term; } | PE_NAME '=' PE_VALUE_SYM_HW { - struct parse_events__term *term; + struct parse_events_term *term; int config = $3 & 255; - ABORT_ON(parse_events__term_sym_hw(&term, $1, config)); + ABORT_ON(parse_events_term__sym_hw(&term, $1, config)); $$ = term; } | PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, 1)); $$ = term; } | PE_VALUE_SYM_HW { - struct parse_events__term *term; + struct parse_events_term 
*term; int config = $1 & 255; - ABORT_ON(parse_events__term_sym_hw(&term, NULL, config)); + ABORT_ON(parse_events_term__sym_hw(&term, NULL, config)); $$ = term; } | PE_TERM '=' PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_str(&term, (int)$1, NULL, $3)); + ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3)); $$ = term; } | PE_TERM '=' PE_VALUE { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, $3)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3)); $$ = term; } | PE_TERM { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, 1)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1)); $$ = term; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 9bdc60c6f138..4c6f9c490a8d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1,4 +1,3 @@ - #include <linux/list.h> #include <sys/types.h> #include <sys/stat.h> @@ -11,6 +10,19 @@ #include "parse-events.h" #include "cpumap.h" +struct perf_pmu_alias { + char *name; + struct list_head terms; + struct list_head list; +}; + +struct perf_pmu_format { + char *name; + int value; + DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); + struct list_head list; +}; + #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/" int perf_pmu_parse(struct list_head *list, char *name); @@ -85,7 +97,7 @@ static int pmu_format(char *name, struct list_head *format) static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) { - struct perf_pmu__alias *alias; + struct perf_pmu_alias *alias; char buf[256]; int ret; @@ -172,15 +184,15 @@ static int pmu_aliases(char *name, struct list_head *head) return 0; } -static int pmu_alias_terms(struct perf_pmu__alias *alias, +static int pmu_alias_terms(struct perf_pmu_alias *alias, struct list_head *terms) { - struct 
parse_events__term *term, *clone; + struct parse_events_term *term, *clone; LIST_HEAD(list); int ret; list_for_each_entry(term, &alias->terms, list) { - ret = parse_events__term_clone(&clone, term); + ret = parse_events_term__clone(&clone, term); if (ret) { parse_events__free_terms(&list); return ret; @@ -360,10 +372,10 @@ struct perf_pmu *perf_pmu__find(char *name) return pmu_lookup(name); } -static struct perf_pmu__format* +static struct perf_pmu_format * pmu_find_format(struct list_head *formats, char *name) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; list_for_each_entry(format, formats, list) if (!strcmp(format->name, name)) @@ -403,9 +415,9 @@ static __u64 pmu_format_value(unsigned long *format, __u64 value) */ static int pmu_config_term(struct list_head *formats, struct perf_event_attr *attr, - struct parse_events__term *term) + struct parse_events_term *term) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; __u64 *vp; /* @@ -450,7 +462,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct perf_event_attr *attr, struct list_head *head_terms) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head_terms, list) if (pmu_config_term(formats, attr, term)) @@ -471,10 +483,10 @@ int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, return perf_pmu__config_terms(&pmu->format, attr, head_terms); } -static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu, - struct parse_events__term *term) +static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu, + struct parse_events_term *term) { - struct perf_pmu__alias *alias; + struct perf_pmu_alias *alias; char *name; if (parse_events__is_hardcoded_term(term)) @@ -507,8 +519,8 @@ static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu, */ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) { - struct parse_events__term *term, *h; - struct 
perf_pmu__alias *alias; + struct parse_events_term *term, *h; + struct perf_pmu_alias *alias; int ret; list_for_each_entry_safe(term, h, head_terms, list) { @@ -527,7 +539,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) int perf_pmu__new_format(struct list_head *list, char *name, int config, unsigned long *bits) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; format = zalloc(sizeof(*format)); if (!format) @@ -548,7 +560,7 @@ void perf_pmu__set_format(unsigned long *bits, long from, long to) if (!to) to = from; - memset(bits, 0, BITS_TO_LONGS(PERF_PMU_FORMAT_BITS)); + memset(bits, 0, BITS_TO_BYTES(PERF_PMU_FORMAT_BITS)); for (b = from; b <= to; b++) set_bit(b, bits); } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index a313ed76a49a..32fe55b659fa 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -12,19 +12,6 @@ enum { #define PERF_PMU_FORMAT_BITS 64 -struct perf_pmu__format { - char *name; - int value; - DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); - struct list_head list; -}; - -struct perf_pmu__alias { - char *name; - struct list_head terms; - struct list_head list; -}; - struct perf_pmu { char *name; __u32 type; @@ -42,7 +29,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct list_head *head_terms); int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms); struct list_head *perf_pmu__alias(struct perf_pmu *pmu, - struct list_head *head_terms); + struct list_head *head_terms); int perf_pmu_wrap(void); void perf_pmu_error(struct list_head *list, char *name, char const *msg); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1daf5c14e751..be0329394d56 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -413,12 +413,12 @@ static int convert_variable_type(Dwarf_Die *vr_die, dwarf_diename(vr_die), dwarf_diename(&type)); return -EINVAL; } + if (die_get_real_type(&type, &type) == NULL) { 
+ pr_warning("Failed to get a type" + " information.\n"); + return -ENOENT; + } if (ret == DW_TAG_pointer_type) { - if (die_get_real_type(&type, &type) == NULL) { - pr_warning("Failed to get a type" - " information.\n"); - return -ENOENT; - } while (*ref_ptr) ref_ptr = &(*ref_ptr)->next; /* Add new reference with offset +0 */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index a2657fd96837..925e0c3e6d91 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1045,3 +1045,12 @@ error: if (PyErr_Occurred()) PyErr_SetString(PyExc_ImportError, "perf: Init failed!"); } + +/* + * Dummy, to avoid dragging all the test_attr infrastructure in the python + * binding. + */ +void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu, + int fd, int group_fd, unsigned long flags) +{ +} diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index f80605eb1855..eacec859f299 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -292,6 +292,7 @@ static void perl_process_tracepoint(union perf_event *perf_event __maybe_unused, ns = nsecs - s * NSECS_PER_SEC; scripting_context->event_data = data; + scripting_context->pevent = evsel->tp_format->pevent; ENTER; SAVETMPS; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 14683dfca2ee..e87aa5d9696b 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -265,6 +265,7 @@ static void python_process_tracepoint(union perf_event *perf_event ns = nsecs - s * NSECS_PER_SEC; scripting_context->event_data = data; + scripting_context->pevent = evsel->tp_format->pevent; context = PyCObject_FromVoidPtr(scripting_context, NULL); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 
ce6f51162386..bd85280bb6e8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -16,7 +16,6 @@ #include "cpumap.h" #include "event-parse.h" #include "perf_regs.h" -#include "unwind.h" #include "vdso.h" static int perf_session__open(struct perf_session *self, bool force) @@ -87,13 +86,12 @@ void perf_session__set_id_hdr_size(struct perf_session *session) { u16 id_hdr_size = perf_evlist__id_hdr_size(session->evlist); - session->host_machine.id_hdr_size = id_hdr_size; machines__set_id_hdr_size(&session->machines, id_hdr_size); } int perf_session__create_kernel_maps(struct perf_session *self) { - int ret = machine__create_kernel_maps(&self->host_machine); + int ret = machine__create_kernel_maps(&self->machines.host); if (ret >= 0) ret = machines__create_guest_kernel_maps(&self->machines); @@ -102,8 +100,7 @@ int perf_session__create_kernel_maps(struct perf_session *self) static void perf_session__destroy_kernel_maps(struct perf_session *self) { - machine__destroy_kernel_maps(&self->host_machine); - machines__destroy_guest_kernel_maps(&self->machines); + machines__destroy_kernel_maps(&self->machines); } struct perf_session *perf_session__new(const char *filename, int mode, @@ -128,22 +125,11 @@ struct perf_session *perf_session__new(const char *filename, int mode, goto out; memcpy(self->filename, filename, len); - /* - * On 64bit we can mmap the data file in one go. No need for tiny mmap - * slices. On 32bit we use 32MB. 
- */ -#if BITS_PER_LONG == 64 - self->mmap_window = ULLONG_MAX; -#else - self->mmap_window = 32 * 1024 * 1024ULL; -#endif - self->machines = RB_ROOT; self->repipe = repipe; INIT_LIST_HEAD(&self->ordered_samples.samples); INIT_LIST_HEAD(&self->ordered_samples.sample_cache); INIT_LIST_HEAD(&self->ordered_samples.to_free); - machine__init(&self->host_machine, "", HOST_KERNEL_ID); - hists__init(&self->hists); + machines__init(&self->machines); if (mode == O_RDONLY) { if (perf_session__open(self, force) < 0) @@ -171,37 +157,30 @@ out_delete: return NULL; } -static void machine__delete_dead_threads(struct machine *machine) -{ - struct thread *n, *t; - - list_for_each_entry_safe(t, n, &machine->dead_threads, node) { - list_del(&t->node); - thread__delete(t); - } -} - static void perf_session__delete_dead_threads(struct perf_session *session) { - machine__delete_dead_threads(&session->host_machine); + machine__delete_dead_threads(&session->machines.host); } -static void machine__delete_threads(struct machine *self) +static void perf_session__delete_threads(struct perf_session *session) { - struct rb_node *nd = rb_first(&self->threads); - - while (nd) { - struct thread *t = rb_entry(nd, struct thread, rb_node); - - rb_erase(&t->rb_node, &self->threads); - nd = rb_next(nd); - thread__delete(t); - } + machine__delete_threads(&session->machines.host); } -static void perf_session__delete_threads(struct perf_session *session) +static void perf_session_env__delete(struct perf_session_env *env) { - machine__delete_threads(&session->host_machine); + free(env->hostname); + free(env->os_release); + free(env->version); + free(env->arch); + free(env->cpu_desc); + free(env->cpuid); + + free(env->cmdline); + free(env->sibling_cores); + free(env->sibling_threads); + free(env->numa_nodes); + free(env->pmu_mappings); } void perf_session__delete(struct perf_session *self) @@ -209,198 +188,13 @@ void perf_session__delete(struct perf_session *self) perf_session__destroy_kernel_maps(self); 
perf_session__delete_dead_threads(self); perf_session__delete_threads(self); - machine__exit(&self->host_machine); + perf_session_env__delete(&self->header.env); + machines__exit(&self->machines); close(self->fd); free(self); vdso__exit(); } -void machine__remove_thread(struct machine *self, struct thread *th) -{ - self->last_match = NULL; - rb_erase(&th->rb_node, &self->threads); - /* - * We may have references to this thread, for instance in some hist_entry - * instances, so just move them to a separate list. - */ - list_add_tail(&th->node, &self->dead_threads); -} - -static bool symbol__match_parent_regex(struct symbol *sym) -{ - if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0)) - return 1; - - return 0; -} - -static const u8 cpumodes[] = { - PERF_RECORD_MISC_USER, - PERF_RECORD_MISC_KERNEL, - PERF_RECORD_MISC_GUEST_USER, - PERF_RECORD_MISC_GUEST_KERNEL -}; -#define NCPUMODES (sizeof(cpumodes)/sizeof(u8)) - -static void ip__resolve_ams(struct machine *self, struct thread *thread, - struct addr_map_symbol *ams, - u64 ip) -{ - struct addr_location al; - size_t i; - u8 m; - - memset(&al, 0, sizeof(al)); - - for (i = 0; i < NCPUMODES; i++) { - m = cpumodes[i]; - /* - * We cannot use the header.misc hint to determine whether a - * branch stack address is user, kernel, guest, hypervisor. - * Branches may straddle the kernel/user/hypervisor boundaries. 
- * Thus, we have to try consecutively until we find a match - * or else, the symbol is unknown - */ - thread__find_addr_location(thread, self, m, MAP__FUNCTION, - ip, &al, NULL); - if (al.sym) - goto found; - } -found: - ams->addr = ip; - ams->al_addr = al.addr; - ams->sym = al.sym; - ams->map = al.map; -} - -struct branch_info *machine__resolve_bstack(struct machine *self, - struct thread *thr, - struct branch_stack *bs) -{ - struct branch_info *bi; - unsigned int i; - - bi = calloc(bs->nr, sizeof(struct branch_info)); - if (!bi) - return NULL; - - for (i = 0; i < bs->nr; i++) { - ip__resolve_ams(self, thr, &bi[i].to, bs->entries[i].to); - ip__resolve_ams(self, thr, &bi[i].from, bs->entries[i].from); - bi[i].flags = bs->entries[i].flags; - } - return bi; -} - -static int machine__resolve_callchain_sample(struct machine *machine, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent) - -{ - u8 cpumode = PERF_RECORD_MISC_USER; - unsigned int i; - int err; - - callchain_cursor_reset(&callchain_cursor); - - if (chain->nr > PERF_MAX_STACK_DEPTH) { - pr_warning("corrupted callchain. skipping...\n"); - return 0; - } - - for (i = 0; i < chain->nr; i++) { - u64 ip; - struct addr_location al; - - if (callchain_param.order == ORDER_CALLEE) - ip = chain->ips[i]; - else - ip = chain->ips[chain->nr - i - 1]; - - if (ip >= PERF_CONTEXT_MAX) { - switch (ip) { - case PERF_CONTEXT_HV: - cpumode = PERF_RECORD_MISC_HYPERVISOR; - break; - case PERF_CONTEXT_KERNEL: - cpumode = PERF_RECORD_MISC_KERNEL; - break; - case PERF_CONTEXT_USER: - cpumode = PERF_RECORD_MISC_USER; - break; - default: - pr_debug("invalid callchain context: " - "%"PRId64"\n", (s64) ip); - /* - * It seems the callchain is corrupted. - * Discard all. 
- */ - callchain_cursor_reset(&callchain_cursor); - return 0; - } - continue; - } - - al.filtered = false; - thread__find_addr_location(thread, machine, cpumode, - MAP__FUNCTION, ip, &al, NULL); - if (al.sym != NULL) { - if (sort__has_parent && !*parent && - symbol__match_parent_regex(al.sym)) - *parent = al.sym; - if (!symbol_conf.use_callchain) - break; - } - - err = callchain_cursor_append(&callchain_cursor, - ip, al.map, al.sym); - if (err) - return err; - } - - return 0; -} - -static int unwind_entry(struct unwind_entry *entry, void *arg) -{ - struct callchain_cursor *cursor = arg; - return callchain_cursor_append(cursor, entry->ip, - entry->map, entry->sym); -} - -int machine__resolve_callchain(struct machine *machine, - struct perf_evsel *evsel, - struct thread *thread, - struct perf_sample *sample, - struct symbol **parent) - -{ - int ret; - - callchain_cursor_reset(&callchain_cursor); - - ret = machine__resolve_callchain_sample(machine, thread, - sample->callchain, parent); - if (ret) - return ret; - - /* Can we do dwarf post unwind? */ - if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && - (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) - return 0; - - /* Bail out if nothing was captured. 
*/ - if ((!sample->user_regs.regs) || - (!sample->user_stack.size)) - return 0; - - return unwind__get_entries(unwind_entry, &callchain_cursor, machine, - thread, evsel->attr.sample_regs_user, - sample); - -} - static int process_event_synth_tracing_data_stub(union perf_event *event __maybe_unused, struct perf_session *session @@ -1027,7 +821,7 @@ static struct machine * return perf_session__findnew_machine(session, pid); } - return perf_session__find_host_machine(session); + return &session->machines.host; } static int perf_session_deliver_event(struct perf_session *session, @@ -1065,11 +859,11 @@ static int perf_session_deliver_event(struct perf_session *session, case PERF_RECORD_SAMPLE: dump_sample(evsel, event, sample); if (evsel == NULL) { - ++session->hists.stats.nr_unknown_id; + ++session->stats.nr_unknown_id; return 0; } if (machine == NULL) { - ++session->hists.stats.nr_unprocessable_samples; + ++session->stats.nr_unprocessable_samples; return 0; } return tool->sample(tool, event, sample, evsel, machine); @@ -1083,7 +877,7 @@ static int perf_session_deliver_event(struct perf_session *session, return tool->exit(tool, event, sample, machine); case PERF_RECORD_LOST: if (tool->lost == perf_event__process_lost) - session->hists.stats.total_lost += event->lost.lost; + session->stats.total_lost += event->lost.lost; return tool->lost(tool, event, sample, machine); case PERF_RECORD_READ: return tool->read(tool, event, sample, evsel, machine); @@ -1092,7 +886,7 @@ static int perf_session_deliver_event(struct perf_session *session, case PERF_RECORD_UNTHROTTLE: return tool->unthrottle(tool, event, sample, machine); default: - ++session->hists.stats.nr_unknown_events; + ++session->stats.nr_unknown_events; return -1; } } @@ -1106,8 +900,8 @@ static int perf_session__preprocess_sample(struct perf_session *session, if (!ip_callchain__valid(sample->callchain, event)) { pr_debug("call-chain problem with event, skipping it.\n"); - ++session->hists.stats.nr_invalid_chains; - 
session->hists.stats.total_invalid_chains += sample->period; + ++session->stats.nr_invalid_chains; + session->stats.total_invalid_chains += sample->period; return -EINVAL; } return 0; @@ -1165,7 +959,7 @@ static int perf_session__process_event(struct perf_session *session, if (event->header.type >= PERF_RECORD_HEADER_MAX) return -EINVAL; - hists__inc_nr_events(&session->hists, event->header.type); + events_stats__inc(&session->stats, event->header.type); if (event->header.type >= PERF_RECORD_USER_TYPE_START) return perf_session__process_user_event(session, event, tool, file_offset); @@ -1201,7 +995,7 @@ void perf_event_header__bswap(struct perf_event_header *self) struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) { - return machine__findnew_thread(&session->host_machine, pid); + return machine__findnew_thread(&session->machines.host, pid); } static struct thread *perf_session__register_idle_thread(struct perf_session *self) @@ -1220,39 +1014,39 @@ static void perf_session__warn_about_errors(const struct perf_session *session, const struct perf_tool *tool) { if (tool->lost == perf_event__process_lost && - session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) { + session->stats.nr_events[PERF_RECORD_LOST] != 0) { ui__warning("Processed %d events and lost %d chunks!\n\n" "Check IO/CPU overload!\n\n", - session->hists.stats.nr_events[0], - session->hists.stats.nr_events[PERF_RECORD_LOST]); + session->stats.nr_events[0], + session->stats.nr_events[PERF_RECORD_LOST]); } - if (session->hists.stats.nr_unknown_events != 0) { + if (session->stats.nr_unknown_events != 0) { ui__warning("Found %u unknown events!\n\n" "Is this an older tool processing a perf.data " "file generated by a more recent tool?\n\n" "If that is not the case, consider " "reporting to linux-kernel@vger.kernel.org.\n\n", - session->hists.stats.nr_unknown_events); + session->stats.nr_unknown_events); } - if (session->hists.stats.nr_unknown_id != 0) { + if 
(session->stats.nr_unknown_id != 0) { ui__warning("%u samples with id not present in the header\n", - session->hists.stats.nr_unknown_id); + session->stats.nr_unknown_id); } - if (session->hists.stats.nr_invalid_chains != 0) { + if (session->stats.nr_invalid_chains != 0) { ui__warning("Found invalid callchains!\n\n" "%u out of %u events were discarded for this reason.\n\n" "Consider reporting to linux-kernel@vger.kernel.org.\n\n", - session->hists.stats.nr_invalid_chains, - session->hists.stats.nr_events[PERF_RECORD_SAMPLE]); + session->stats.nr_invalid_chains, + session->stats.nr_events[PERF_RECORD_SAMPLE]); } - if (session->hists.stats.nr_unprocessable_samples != 0) { + if (session->stats.nr_unprocessable_samples != 0) { ui__warning("%u unprocessable samples recorded.\n" "Do you have a KVM guest running and not using 'perf kvm'?\n", - session->hists.stats.nr_unprocessable_samples); + session->stats.nr_unprocessable_samples); } } @@ -1369,6 +1163,18 @@ fetch_mmaped_event(struct perf_session *session, return event; } +/* + * On 64bit we can mmap the data file in one go. No need for tiny mmap + * slices. On 32bit we use 32MB. 
+ */ +#if BITS_PER_LONG == 64 +#define MMAP_SIZE ULLONG_MAX +#define NUM_MMAPS 1 +#else +#define MMAP_SIZE (32 * 1024 * 1024ULL) +#define NUM_MMAPS 128 +#endif + int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, u64 file_size, struct perf_tool *tool) @@ -1376,7 +1182,7 @@ int __perf_session__process_events(struct perf_session *session, u64 head, page_offset, file_offset, file_pos, progress_next; int err, mmap_prot, mmap_flags, map_idx = 0; size_t mmap_size; - char *buf, *mmaps[8]; + char *buf, *mmaps[NUM_MMAPS]; union perf_event *event; uint32_t size; @@ -1391,7 +1197,7 @@ int __perf_session__process_events(struct perf_session *session, progress_next = file_size / 16; - mmap_size = session->mmap_window; + mmap_size = MMAP_SIZE; if (mmap_size > file_size) mmap_size = file_size; @@ -1526,16 +1332,13 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps, size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp) { - return __dsos__fprintf(&self->host_machine.kernel_dsos, fp) + - __dsos__fprintf(&self->host_machine.user_dsos, fp) + - machines__fprintf_dsos(&self->machines, fp); + return machines__fprintf_dsos(&self->machines, fp); } size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp, - bool with_hits) + bool (skip)(struct dso *dso, int parm), int parm) { - size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits); - return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits); + return machines__fprintf_dsos_buildid(&self->machines, fp, skip, parm); } size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) @@ -1543,11 +1346,11 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) struct perf_evsel *pos; size_t ret = fprintf(fp, "Aggregated stats:\n"); - ret += hists__fprintf_nr_events(&session->hists, fp); + ret += events_stats__fprintf(&session->stats, fp); list_for_each_entry(pos, 
&session->evlist->entries, node) { ret += fprintf(fp, "%s stats:\n", perf_evsel__name(pos)); - ret += hists__fprintf_nr_events(&pos->hists, fp); + ret += events_stats__fprintf(&pos->hists.stats, fp); } return ret; @@ -1559,7 +1362,7 @@ size_t perf_session__fprintf(struct perf_session *session, FILE *fp) * FIXME: Here we have to actually print all the machines in this * session, not just the host... */ - return machine__fprintf(&session->host_machine, fp); + return machine__fprintf(&session->machines.host, fp); } void perf_session__remove_thread(struct perf_session *session, @@ -1568,10 +1371,10 @@ void perf_session__remove_thread(struct perf_session *session, /* * FIXME: This one makes no sense, we need to remove the thread from * the machine it belongs to, perf_session can have many machines, so - * doing it always on ->host_machine is wrong. Fix when auditing all + * doing it always on ->machines.host is wrong. Fix when auditing all * the 'perf kvm' code. */ - machine__remove_thread(&session->host_machine, th); + machine__remove_thread(&session->machines.host, th); } struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index cea133a6bdf1..b5c0847edfa9 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -30,16 +30,10 @@ struct ordered_samples { struct perf_session { struct perf_header header; unsigned long size; - unsigned long mmap_window; - struct machine host_machine; - struct rb_root machines; + struct machines machines; struct perf_evlist *evlist; struct pevent *pevent; - /* - * FIXME: Need to split this up further, we need global - * stats + per event stats. 
- */ - struct hists hists; + struct events_stats stats; int fd; bool fd_pipe; bool repipe; @@ -54,7 +48,7 @@ struct perf_tool; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, struct perf_tool *tool); -void perf_session__delete(struct perf_session *self); +void perf_session__delete(struct perf_session *session); void perf_event_header__bswap(struct perf_event_header *self); @@ -81,43 +75,24 @@ void perf_session__set_id_hdr_size(struct perf_session *session); void perf_session__remove_thread(struct perf_session *self, struct thread *th); static inline -struct machine *perf_session__find_host_machine(struct perf_session *self) -{ - return &self->host_machine; -} - -static inline struct machine *perf_session__find_machine(struct perf_session *self, pid_t pid) { - if (pid == HOST_KERNEL_ID) - return &self->host_machine; return machines__find(&self->machines, pid); } static inline struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t pid) { - if (pid == HOST_KERNEL_ID) - return &self->host_machine; return machines__findnew(&self->machines, pid); } -static inline -void perf_session__process_machines(struct perf_session *self, - struct perf_tool *tool, - machine__process_t process) -{ - process(&self->host_machine, tool); - return machines__process(&self->machines, process, tool); -} - struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); size_t perf_session__fprintf(struct perf_session *self, FILE *fp); size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); -size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, - FILE *fp, bool with_hits); +size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp, + bool (fn)(struct dso *dso, int parm), int parm); size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index cfd1c0feb32d..7ad62393aa88 
100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -60,7 +60,7 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%*s:%5d", width, + return repsep_snprintf(bf, size, "%*s:%5d", width - 6, self->thread->comm ?: "", self->thread->pid); } @@ -97,6 +97,16 @@ static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf, return repsep_snprintf(bf, size, "%*s", width, self->thread->comm); } +struct sort_entry sort_comm = { + .se_header = "Command", + .se_cmp = sort__comm_cmp, + .se_collapse = sort__comm_collapse, + .se_snprintf = hist_entry__comm_snprintf, + .se_width_idx = HISTC_COMM, +}; + +/* --sort dso */ + static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) { struct dso *dso_l = map_l ? map_l->dso : NULL; @@ -117,22 +127,38 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) return strcmp(dso_name_l, dso_name_r); } -struct sort_entry sort_comm = { - .se_header = "Command", - .se_cmp = sort__comm_cmp, - .se_collapse = sort__comm_collapse, - .se_snprintf = hist_entry__comm_snprintf, - .se_width_idx = HISTC_COMM, -}; - -/* --sort dso */ - static int64_t sort__dso_cmp(struct hist_entry *left, struct hist_entry *right) { return _sort__dso_cmp(left->ms.map, right->ms.map); } +static int _hist_entry__dso_snprintf(struct map *map, char *bf, + size_t size, unsigned int width) +{ + if (map && map->dso) { + const char *dso_name = !verbose ? 
map->dso->short_name : + map->dso->long_name; + return repsep_snprintf(bf, size, "%-*s", width, dso_name); + } + + return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); +} + +static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, + size_t size, unsigned int width) +{ + return _hist_entry__dso_snprintf(self->ms.map, bf, size, width); +} + +struct sort_entry sort_dso = { + .se_header = "Shared Object", + .se_cmp = sort__dso_cmp, + .se_snprintf = hist_entry__dso_snprintf, + .se_width_idx = HISTC_DSO, +}; + +/* --sort symbol */ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r, u64 ip_l, u64 ip_r) @@ -143,35 +169,35 @@ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r, if (sym_l == sym_r) return 0; - if (sym_l) - ip_l = sym_l->start; - if (sym_r) - ip_r = sym_r->start; + ip_l = sym_l->start; + ip_r = sym_r->start; return (int64_t)(ip_r - ip_l); } -static int _hist_entry__dso_snprintf(struct map *map, char *bf, - size_t size, unsigned int width) +static int64_t +sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) { - if (map && map->dso) { - const char *dso_name = !verbose ? 
map->dso->short_name : - map->dso->long_name; - return repsep_snprintf(bf, size, "%-*s", width, dso_name); - } + u64 ip_l, ip_r; - return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); -} + if (!left->ms.sym && !right->ms.sym) + return right->level - left->level; -static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width) -{ - return _hist_entry__dso_snprintf(self->ms.map, bf, size, width); + if (!left->ms.sym || !right->ms.sym) + return cmp_null(left->ms.sym, right->ms.sym); + + if (left->ms.sym == right->ms.sym) + return 0; + + ip_l = left->ms.sym->start; + ip_r = right->ms.sym->start; + + return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r); } static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, u64 ip, char level, char *bf, size_t size, - unsigned int width __maybe_unused) + unsigned int width) { size_t ret = 0; @@ -197,43 +223,13 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, return ret; } - -struct sort_entry sort_dso = { - .se_header = "Shared Object", - .se_cmp = sort__dso_cmp, - .se_snprintf = hist_entry__dso_snprintf, - .se_width_idx = HISTC_DSO, -}; - static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { return _hist_entry__sym_snprintf(self->ms.map, self->ms.sym, self->ip, self->level, bf, size, width); } -/* --sort symbol */ -static int64_t -sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) -{ - u64 ip_l, ip_r; - - if (!left->ms.sym && !right->ms.sym) - return right->level - left->level; - - if (!left->ms.sym || !right->ms.sym) - return cmp_null(left->ms.sym, right->ms.sym); - - if (left->ms.sym == right->ms.sym) - return 0; - - ip_l = left->ms.sym->start; - ip_r = right->ms.sym->start; - - return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r); -} - struct sort_entry sort_sym = { .se_header = "Symbol", .se_cmp 
= sort__sym_cmp, @@ -335,7 +331,7 @@ sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__cpu_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*d", width, self->cpu); + return repsep_snprintf(bf, size, "%*d", width, self->cpu); } struct sort_entry sort_cpu = { @@ -345,6 +341,8 @@ struct sort_entry sort_cpu = { .se_width_idx = HISTC_CPU, }; +/* sort keys for branch stacks */ + static int64_t sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -359,13 +357,6 @@ static int hist_entry__dso_from_snprintf(struct hist_entry *self, char *bf, bf, size, width); } -struct sort_entry sort_dso_from = { - .se_header = "Source Shared Object", - .se_cmp = sort__dso_from_cmp, - .se_snprintf = hist_entry__dso_from_snprintf, - .se_width_idx = HISTC_DSO_FROM, -}; - static int64_t sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -406,8 +397,7 @@ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right) } static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { struct addr_map_symbol *from = &self->branch_info->from; return _hist_entry__sym_snprintf(from->map, from->sym, from->addr, @@ -416,8 +406,7 @@ static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf, } static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { struct addr_map_symbol *to = &self->branch_info->to; return _hist_entry__sym_snprintf(to->map, to->sym, to->addr, @@ -425,6 +414,13 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf, } +struct sort_entry sort_dso_from = { + .se_header = "Source Shared Object", + .se_cmp = sort__dso_from_cmp, + .se_snprintf = hist_entry__dso_from_snprintf, + .se_width_idx = 
HISTC_DSO_FROM, +}; + struct sort_entry sort_dso_to = { .se_header = "Target Shared Object", .se_cmp = sort__dso_to_cmp, @@ -484,30 +480,40 @@ struct sort_dimension { #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) } -static struct sort_dimension sort_dimensions[] = { +static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_PID, "pid", sort_thread), DIM(SORT_COMM, "comm", sort_comm), DIM(SORT_DSO, "dso", sort_dso), - DIM(SORT_DSO_FROM, "dso_from", sort_dso_from), - DIM(SORT_DSO_TO, "dso_to", sort_dso_to), DIM(SORT_SYM, "symbol", sort_sym), - DIM(SORT_SYM_TO, "symbol_from", sort_sym_from), - DIM(SORT_SYM_FROM, "symbol_to", sort_sym_to), DIM(SORT_PARENT, "parent", sort_parent), DIM(SORT_CPU, "cpu", sort_cpu), - DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), DIM(SORT_SRCLINE, "srcline", sort_srcline), }; +#undef DIM + +#define DIM(d, n, func) [d - __SORT_BRANCH_STACK] = { .name = n, .entry = &(func) } + +static struct sort_dimension bstack_sort_dimensions[] = { + DIM(SORT_DSO_FROM, "dso_from", sort_dso_from), + DIM(SORT_DSO_TO, "dso_to", sort_dso_to), + DIM(SORT_SYM_FROM, "symbol_from", sort_sym_from), + DIM(SORT_SYM_TO, "symbol_to", sort_sym_to), + DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), +}; + +#undef DIM + int sort_dimension__add(const char *tok) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) { - struct sort_dimension *sd = &sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { + struct sort_dimension *sd = &common_sort_dimensions[i]; if (strncasecmp(tok, sd->name, strlen(tok))) continue; + if (sd->entry == &sort_parent) { int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); if (ret) { @@ -518,9 +524,7 @@ int sort_dimension__add(const char *tok) return -EINVAL; } sort__has_parent = 1; - } else if (sd->entry == &sort_sym || - sd->entry == &sort_sym_from || - sd->entry == &sort_sym_to) { + } else if (sd->entry == &sort_sym) { sort__has_sym = 1; } @@ -530,36 +534,42 
@@ int sort_dimension__add(const char *tok) if (sd->entry->se_collapse) sort__need_collapse = 1; - if (list_empty(&hist_entry__sort_list)) { - if (!strcmp(sd->name, "pid")) - sort__first_dimension = SORT_PID; - else if (!strcmp(sd->name, "comm")) - sort__first_dimension = SORT_COMM; - else if (!strcmp(sd->name, "dso")) - sort__first_dimension = SORT_DSO; - else if (!strcmp(sd->name, "symbol")) - sort__first_dimension = SORT_SYM; - else if (!strcmp(sd->name, "parent")) - sort__first_dimension = SORT_PARENT; - else if (!strcmp(sd->name, "cpu")) - sort__first_dimension = SORT_CPU; - else if (!strcmp(sd->name, "symbol_from")) - sort__first_dimension = SORT_SYM_FROM; - else if (!strcmp(sd->name, "symbol_to")) - sort__first_dimension = SORT_SYM_TO; - else if (!strcmp(sd->name, "dso_from")) - sort__first_dimension = SORT_DSO_FROM; - else if (!strcmp(sd->name, "dso_to")) - sort__first_dimension = SORT_DSO_TO; - else if (!strcmp(sd->name, "mispredict")) - sort__first_dimension = SORT_MISPREDICT; - } + if (list_empty(&hist_entry__sort_list)) + sort__first_dimension = i; list_add_tail(&sd->entry->list, &hist_entry__sort_list); sd->taken = 1; return 0; } + + for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { + struct sort_dimension *sd = &bstack_sort_dimensions[i]; + + if (strncasecmp(tok, sd->name, strlen(tok))) + continue; + + if (sort__branch_mode != 1) + return -EINVAL; + + if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) + sort__has_sym = 1; + + if (sd->taken) + return 0; + + if (sd->entry->se_collapse) + sort__need_collapse = 1; + + if (list_empty(&hist_entry__sort_list)) + sort__first_dimension = i + __SORT_BRANCH_STACK; + + list_add_tail(&sd->entry->list, &hist_entry__sort_list); + sd->taken = 1; + + return 0; + } + return -ESRCH; } @@ -569,7 +579,11 @@ void setup_sorting(const char * const usagestr[], const struct option *opts) for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { - if (sort_dimension__add(tok) < 0) { + 
int ret = sort_dimension__add(tok); + if (ret == -EINVAL) { + error("Invalid --sort key: `%s'", tok); + usage_with_options(usagestr, opts); + } else if (ret == -ESRCH) { error("Unknown --sort key: `%s'", tok); usage_with_options(usagestr, opts); } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index b4e8c3ba559d..e994ad3e9897 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -55,9 +55,6 @@ struct he_stat { struct hist_entry_diff { bool computed; - /* PERF_HPP__DISPL */ - int displacement; - /* PERF_HPP__DELTA */ double period_ratio_delta; @@ -118,25 +115,29 @@ static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) return NULL; } -static inline void hist__entry_add_pair(struct hist_entry *he, +static inline void hist_entry__add_pair(struct hist_entry *he, struct hist_entry *pair) { list_add_tail(&he->pairs.head, &pair->pairs.node); } enum sort_type { + /* common sort keys */ SORT_PID, SORT_COMM, SORT_DSO, SORT_SYM, SORT_PARENT, SORT_CPU, - SORT_DSO_FROM, + SORT_SRCLINE, + + /* branch stack specific sort keys */ + __SORT_BRANCH_STACK, + SORT_DSO_FROM = __SORT_BRANCH_STACK, SORT_DSO_TO, SORT_SYM_FROM, SORT_SYM_TO, SORT_MISPREDICT, - SORT_SRCLINE, }; /* diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index 346707df04b9..29c7b2cb2521 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -332,6 +332,24 @@ char *strxfrchar(char *s, char from, char to) } /** + * ltrim - Removes leading whitespace from @s. + * @s: The string to be stripped. + * + * Return pointer to the first non-whitespace character in @s. + */ +char *ltrim(char *s) +{ + int len = strlen(s); + + while (len && isspace(*s)) { + len--; + s++; + } + + return s; +} + +/** * rtrim - Removes trailing whitespace from @s. * @s: The string to be stripped. 
* diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index db0cc92cf2ea..54efcb5659ac 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1,6 +1,3 @@ -#include <libelf.h> -#include <gelf.h> -#include <elf.h> #include <fcntl.h> #include <stdio.h> #include <errno.h> @@ -718,6 +715,17 @@ int dso__load_sym(struct dso *dso, struct map *map, sym.st_value); used_opd = true; } + /* + * When loading symbols in a data mapping, ABS symbols (which + * has a value of SHN_ABS in its st_shndx) failed at + * elf_getscn(). And it marks the loading as a failure so + * already loaded symbols cannot be fixed up. + * + * I'm not sure what should be done. Just ignore them for now. + * - Namhyung Kim + */ + if (sym.st_shndx == SHN_ABS) + continue; sec = elf_getscn(runtime_ss->elf, sym.st_shndx); if (!sec) diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 259f8f2ea9c9..a7390cde63bc 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -1,6 +1,5 @@ #include "symbol.h" -#include <elf.h> #include <stdio.h> #include <fcntl.h> #include <string.h> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 295f8d4feedf..e6432d85b43d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -28,8 +28,8 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); -static int vmlinux_path__nr_entries; -static char **vmlinux_path; +int vmlinux_path__nr_entries; +char **vmlinux_path; struct symbol_conf symbol_conf = { .exclude_other = true, @@ -202,13 +202,6 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) curr->end = ~0ULL; } -static void map_groups__fixup_end(struct map_groups *mg) -{ - int i; - for (i = 0; i < MAP__NR_TYPES; ++i) - __map_groups__fixup_end(mg, i); -} - struct symbol *symbol__new(u64 
start, u64 len, u8 binding, const char *name) { size_t namelen = strlen(name) + 1; @@ -652,8 +645,8 @@ discard_symbol: rb_erase(&pos->rb_node, root); return count + moved; } -static bool symbol__restricted_filename(const char *filename, - const char *restricted_filename) +bool symbol__restricted_filename(const char *filename, + const char *restricted_filename) { bool restricted = false; @@ -775,10 +768,6 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) else machine = NULL; - name = malloc(PATH_MAX); - if (!name) - return -1; - dso->adjust_symbols = 0; if (strncmp(dso->name, "/tmp/perf-", 10) == 0) { @@ -802,6 +791,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) if (machine) root_dir = machine->root_dir; + name = malloc(PATH_MAX); + if (!name) + return -1; + /* Iterate over candidate debug images. * Keep track of "interesting" ones (those which have a symtab, dynsym, * and/or opd section) for processing. @@ -887,200 +880,6 @@ struct map *map_groups__find_by_name(struct map_groups *mg, return NULL; } -static int map_groups__set_modules_path_dir(struct map_groups *mg, - const char *dir_name) -{ - struct dirent *dent; - DIR *dir = opendir(dir_name); - int ret = 0; - - if (!dir) { - pr_debug("%s: cannot open %s dir\n", __func__, dir_name); - return -1; - } - - while ((dent = readdir(dir)) != NULL) { - char path[PATH_MAX]; - struct stat st; - - /*sshfs might return bad dent->d_type, so we have to stat*/ - snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); - if (stat(path, &st)) - continue; - - if (S_ISDIR(st.st_mode)) { - if (!strcmp(dent->d_name, ".") || - !strcmp(dent->d_name, "..")) - continue; - - ret = map_groups__set_modules_path_dir(mg, path); - if (ret < 0) - goto out; - } else { - char *dot = strrchr(dent->d_name, '.'), - dso_name[PATH_MAX]; - struct map *map; - char *long_name; - - if (dot == NULL || strcmp(dot, ".ko")) - continue; - snprintf(dso_name, sizeof(dso_name), "[%.*s]", - (int)(dot - 
dent->d_name), dent->d_name); - - strxfrchar(dso_name, '-', '_'); - map = map_groups__find_by_name(mg, MAP__FUNCTION, - dso_name); - if (map == NULL) - continue; - - long_name = strdup(path); - if (long_name == NULL) { - ret = -1; - goto out; - } - dso__set_long_name(map->dso, long_name); - map->dso->lname_alloc = 1; - dso__kernel_module_get_build_id(map->dso, ""); - } - } - -out: - closedir(dir); - return ret; -} - -static char *get_kernel_version(const char *root_dir) -{ - char version[PATH_MAX]; - FILE *file; - char *name, *tmp; - const char *prefix = "Linux version "; - - sprintf(version, "%s/proc/version", root_dir); - file = fopen(version, "r"); - if (!file) - return NULL; - - version[0] = '\0'; - tmp = fgets(version, sizeof(version), file); - fclose(file); - - name = strstr(version, prefix); - if (!name) - return NULL; - name += strlen(prefix); - tmp = strchr(name, ' '); - if (tmp) - *tmp = '\0'; - - return strdup(name); -} - -static int machine__set_modules_path(struct machine *machine) -{ - char *version; - char modules_path[PATH_MAX]; - - version = get_kernel_version(machine->root_dir); - if (!version) - return -1; - - snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel", - machine->root_dir, version); - free(version); - - return map_groups__set_modules_path_dir(&machine->kmaps, modules_path); -} - -struct map *machine__new_module(struct machine *machine, u64 start, - const char *filename) -{ - struct map *map; - struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename); - - if (dso == NULL) - return NULL; - - map = map__new2(start, dso, MAP__FUNCTION); - if (map == NULL) - return NULL; - - if (machine__is_host(machine)) - dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; - else - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; - map_groups__insert(&machine->kmaps, map); - return map; -} - -static int machine__create_modules(struct machine *machine) -{ - char *line = NULL; - size_t n; - FILE *file; - struct map *map; 
- const char *modules; - char path[PATH_MAX]; - - if (machine__is_default_guest(machine)) - modules = symbol_conf.default_guest_modules; - else { - sprintf(path, "%s/proc/modules", machine->root_dir); - modules = path; - } - - if (symbol__restricted_filename(path, "/proc/modules")) - return -1; - - file = fopen(modules, "r"); - if (file == NULL) - return -1; - - while (!feof(file)) { - char name[PATH_MAX]; - u64 start; - char *sep; - int line_len; - - line_len = getline(&line, &n, file); - if (line_len < 0) - break; - - if (!line) - goto out_failure; - - line[--line_len] = '\0'; /* \n */ - - sep = strrchr(line, 'x'); - if (sep == NULL) - continue; - - hex2u64(sep + 1, &start); - - sep = strchr(line, ' '); - if (sep == NULL) - continue; - - *sep = '\0'; - - snprintf(name, sizeof(name), "[%s]", line); - map = machine__new_module(machine, start, name); - if (map == NULL) - goto out_delete_line; - dso__kernel_module_get_build_id(map->dso, machine->root_dir); - } - - free(line); - fclose(file); - - return machine__set_modules_path(machine); - -out_delete_line: - free(line); -out_failure: - return -1; -} - int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, symbol_filter_t filter) { @@ -1124,8 +923,10 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map, filename = dso__build_id_filename(dso, NULL, 0); if (filename != NULL) { err = dso__load_vmlinux(dso, map, filename, filter); - if (err > 0) + if (err > 0) { + dso->lname_alloc = 1; goto out; + } free(filename); } @@ -1133,6 +934,7 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map, err = dso__load_vmlinux(dso, map, vmlinux_path[i], filter); if (err > 0) { dso__set_long_name(dso, strdup(vmlinux_path[i])); + dso->lname_alloc = 1; break; } } @@ -1172,6 +974,7 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map, if (err > 0) { dso__set_long_name(dso, strdup(symbol_conf.vmlinux_name)); + dso->lname_alloc = 1; goto out_fixup; } return err; @@ -1300,195 +1103,6 @@ 
out_try_fixup: return err; } -size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp) -{ - struct rb_node *nd; - size_t ret = 0; - - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += __dsos__fprintf(&pos->kernel_dsos, fp); - ret += __dsos__fprintf(&pos->user_dsos, fp); - } - - return ret; -} - -size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, - bool with_hits) -{ - return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, with_hits) + - __dsos__fprintf_buildid(&machine->user_dsos, fp, with_hits); -} - -size_t machines__fprintf_dsos_buildid(struct rb_root *machines, - FILE *fp, bool with_hits) -{ - struct rb_node *nd; - size_t ret = 0; - - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += machine__fprintf_dsos_buildid(pos, fp, with_hits); - } - return ret; -} - -static struct dso *machine__get_kernel(struct machine *machine) -{ - const char *vmlinux_name = NULL; - struct dso *kernel; - - if (machine__is_host(machine)) { - vmlinux_name = symbol_conf.vmlinux_name; - if (!vmlinux_name) - vmlinux_name = "[kernel.kallsyms]"; - - kernel = dso__kernel_findnew(machine, vmlinux_name, - "[kernel]", - DSO_TYPE_KERNEL); - } else { - char bf[PATH_MAX]; - - if (machine__is_default_guest(machine)) - vmlinux_name = symbol_conf.default_guest_vmlinux_name; - if (!vmlinux_name) - vmlinux_name = machine__mmap_name(machine, bf, - sizeof(bf)); - - kernel = dso__kernel_findnew(machine, vmlinux_name, - "[guest.kernel]", - DSO_TYPE_GUEST_KERNEL); - } - - if (kernel != NULL && (!kernel->has_build_id)) - dso__read_running_kernel_build_id(kernel, machine); - - return kernel; -} - -struct process_args { - u64 start; -}; - -static int symbol__in_kernel(void *arg, const char *name, - char type __maybe_unused, u64 start) -{ - struct process_args *args = arg; - - if (strchr(name, '[')) - return 0; - - args->start = 
start; - return 1; -} - -/* Figure out the start address of kernel map from /proc/kallsyms */ -static u64 machine__get_kernel_start_addr(struct machine *machine) -{ - const char *filename; - char path[PATH_MAX]; - struct process_args args; - - if (machine__is_host(machine)) { - filename = "/proc/kallsyms"; - } else { - if (machine__is_default_guest(machine)) - filename = (char *)symbol_conf.default_guest_kallsyms; - else { - sprintf(path, "%s/proc/kallsyms", machine->root_dir); - filename = path; - } - } - - if (symbol__restricted_filename(filename, "/proc/kallsyms")) - return 0; - - if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0) - return 0; - - return args.start; -} - -int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) -{ - enum map_type type; - u64 start = machine__get_kernel_start_addr(machine); - - for (type = 0; type < MAP__NR_TYPES; ++type) { - struct kmap *kmap; - - machine->vmlinux_maps[type] = map__new2(start, kernel, type); - if (machine->vmlinux_maps[type] == NULL) - return -1; - - machine->vmlinux_maps[type]->map_ip = - machine->vmlinux_maps[type]->unmap_ip = - identity__map_ip; - kmap = map__kmap(machine->vmlinux_maps[type]); - kmap->kmaps = &machine->kmaps; - map_groups__insert(&machine->kmaps, - machine->vmlinux_maps[type]); - } - - return 0; -} - -void machine__destroy_kernel_maps(struct machine *machine) -{ - enum map_type type; - - for (type = 0; type < MAP__NR_TYPES; ++type) { - struct kmap *kmap; - - if (machine->vmlinux_maps[type] == NULL) - continue; - - kmap = map__kmap(machine->vmlinux_maps[type]); - map_groups__remove(&machine->kmaps, - machine->vmlinux_maps[type]); - if (kmap->ref_reloc_sym) { - /* - * ref_reloc_sym is shared among all maps, so free just - * on one of them. 
- */ - if (type == MAP__FUNCTION) { - free((char *)kmap->ref_reloc_sym->name); - kmap->ref_reloc_sym->name = NULL; - free(kmap->ref_reloc_sym); - } - kmap->ref_reloc_sym = NULL; - } - - map__delete(machine->vmlinux_maps[type]); - machine->vmlinux_maps[type] = NULL; - } -} - -int machine__create_kernel_maps(struct machine *machine) -{ - struct dso *kernel = machine__get_kernel(machine); - - if (kernel == NULL || - __machine__create_kernel_maps(machine, kernel) < 0) - return -1; - - if (symbol_conf.use_modules && machine__create_modules(machine) < 0) { - if (machine__is_host(machine)) - pr_debug("Problems creating module maps, " - "continuing anyway...\n"); - else - pr_debug("Problems creating module maps for guest %d, " - "continuing anyway...\n", machine->pid); - } - - /* - * Now that we have all the maps created, just set the ->end of them: - */ - map_groups__fixup_end(&machine->kmaps); - return 0; -} - static void vmlinux_path__exit(void) { while (--vmlinux_path__nr_entries >= 0) { @@ -1549,25 +1163,6 @@ out_fail: return -1; } -size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) -{ - int i; - size_t printed = 0; - struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso; - - if (kdso->has_build_id) { - char filename[PATH_MAX]; - if (dso__build_id_filename(kdso, filename, sizeof(filename))) - printed += fprintf(fp, "[0] %s\n", filename); - } - - for (i = 0; i < vmlinux_path__nr_entries; ++i) - printed += fprintf(fp, "[%d] %s\n", - i + kdso->has_build_id, vmlinux_path[i]); - - return printed; -} - static int setup_list(struct strlist **list, const char *list_str, const char *list_name) { @@ -1671,108 +1266,3 @@ void symbol__exit(void) symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL; symbol_conf.initialized = false; } - -int machines__create_kernel_maps(struct rb_root *machines, pid_t pid) -{ - struct machine *machine = machines__findnew(machines, pid); - - if (machine == NULL) - return -1; - - return 
machine__create_kernel_maps(machine); -} - -int machines__create_guest_kernel_maps(struct rb_root *machines) -{ - int ret = 0; - struct dirent **namelist = NULL; - int i, items = 0; - char path[PATH_MAX]; - pid_t pid; - char *endp; - - if (symbol_conf.default_guest_vmlinux_name || - symbol_conf.default_guest_modules || - symbol_conf.default_guest_kallsyms) { - machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID); - } - - if (symbol_conf.guestmount) { - items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL); - if (items <= 0) - return -ENOENT; - for (i = 0; i < items; i++) { - if (!isdigit(namelist[i]->d_name[0])) { - /* Filter out . and .. */ - continue; - } - pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10); - if ((*endp != '\0') || - (endp == namelist[i]->d_name) || - (errno == ERANGE)) { - pr_debug("invalid directory (%s). Skipping.\n", - namelist[i]->d_name); - continue; - } - sprintf(path, "%s/%s/proc/kallsyms", - symbol_conf.guestmount, - namelist[i]->d_name); - ret = access(path, R_OK); - if (ret) { - pr_debug("Can't access file %s\n", path); - goto failure; - } - machines__create_kernel_maps(machines, pid); - } -failure: - free(namelist); - } - - return ret; -} - -void machines__destroy_guest_kernel_maps(struct rb_root *machines) -{ - struct rb_node *next = rb_first(machines); - - while (next) { - struct machine *pos = rb_entry(next, struct machine, rb_node); - - next = rb_next(&pos->rb_node); - rb_erase(&pos->rb_node, machines); - machine__delete(pos); - } -} - -int machine__load_kallsyms(struct machine *machine, const char *filename, - enum map_type type, symbol_filter_t filter) -{ - struct map *map = machine->vmlinux_maps[type]; - int ret = dso__load_kallsyms(map->dso, filename, map, filter); - - if (ret > 0) { - dso__set_loaded(map->dso, type); - /* - * Since /proc/kallsyms will have multiple sessions for the - * kernel, with modules between them, fixup the end of all - * sections. 
- */ - __map_groups__fixup_end(&machine->kmaps, type); - } - - return ret; -} - -int machine__load_vmlinux_path(struct machine *machine, enum map_type type, - symbol_filter_t filter) -{ - struct map *map = machine->vmlinux_maps[type]; - int ret = dso__load_vmlinux_path(map->dso, map, filter); - - if (ret > 0) { - dso__set_loaded(map->dso, type); - map__reloc_vmlinux(map); - } - - return ret; -} diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index de68f98b236d..d97377ac2f16 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -16,8 +16,8 @@ #ifdef LIBELF_SUPPORT #include <libelf.h> #include <gelf.h> -#include <elf.h> #endif +#include <elf.h> #include "dso.h" @@ -120,6 +120,8 @@ struct symbol_conf { }; extern struct symbol_conf symbol_conf; +extern int vmlinux_path__nr_entries; +extern char **vmlinux_path; static inline void *symbol__priv(struct symbol *sym) { @@ -223,6 +225,8 @@ size_t symbol__fprintf_symname_offs(const struct symbol *sym, size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp); size_t symbol__fprintf(struct symbol *sym, FILE *fp); bool symbol_type__is_a(char symbol_type, enum map_type map_type); +bool symbol__restricted_filename(const char *filename, + const char *restricted_filename); int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, symbol_filter_t filter, diff --git a/tools/perf/util/sysfs.c b/tools/perf/util/sysfs.c index 48c6902e749f..f71e9eafe15a 100644 --- a/tools/perf/util/sysfs.c +++ b/tools/perf/util/sysfs.c @@ -8,7 +8,7 @@ static const char * const sysfs_known_mountpoints[] = { }; static int sysfs_found; -char sysfs_mountpoint[PATH_MAX]; +char sysfs_mountpoint[PATH_MAX + 1]; static int sysfs_valid_mountpoint(const char *sysfs) { diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index df59623ac763..632e40e5ceca 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -54,10 +54,10 @@ int 
thread__comm_len(struct thread *self) return self->comm_len; } -static size_t thread__fprintf(struct thread *self, FILE *fp) +size_t thread__fprintf(struct thread *thread, FILE *fp) { - return fprintf(fp, "Thread %d %s\n", self->pid, self->comm) + - map_groups__fprintf(&self->mg, verbose, fp); + return fprintf(fp, "Thread %d %s\n", thread->pid, thread->comm) + + map_groups__fprintf(&thread->mg, verbose, fp); } void thread__insert_map(struct thread *self, struct map *map) @@ -84,17 +84,3 @@ int thread__fork(struct thread *self, struct thread *parent) return -ENOMEM; return 0; } - -size_t machine__fprintf(struct machine *machine, FILE *fp) -{ - size_t ret = 0; - struct rb_node *nd; - - for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { - struct thread *pos = rb_entry(nd, struct thread, rb_node); - - ret += thread__fprintf(pos, fp); - } - - return ret; -} diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index f2fa17caa7d5..5ad266403098 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -30,6 +30,7 @@ int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); void thread__insert_map(struct thread *self, struct map *map); int thread__fork(struct thread *self, struct thread *parent); +size_t thread__fprintf(struct thread *thread, FILE *fp); static inline struct map *thread__find_map(struct thread *self, enum map_type type, u64 addr) diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 884dde9b9bc1..54d37a4753c5 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -26,6 +26,8 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) float samples_per_sec = top->samples / top->delay_secs; float ksamples_per_sec = top->kernel_samples / top->delay_secs; float esamples_percent = (100.0 * top->exact_samples) / top->samples; + struct perf_record_opts *opts = &top->record_opts; + struct perf_target *target = &opts->target; size_t ret = 0; 
if (!perf_guest) { @@ -61,31 +63,31 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) struct perf_evsel *first = perf_evlist__first(top->evlist); ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ", (uint64_t)first->attr.sample_period, - top->freq ? "Hz" : ""); + opts->freq ? "Hz" : ""); } ret += SNPRINTF(bf + ret, size - ret, "%s", perf_evsel__name(top->sym_evsel)); ret += SNPRINTF(bf + ret, size - ret, "], "); - if (top->target.pid) + if (target->pid) ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %s", - top->target.pid); - else if (top->target.tid) + target->pid); + else if (target->tid) ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %s", - top->target.tid); - else if (top->target.uid_str != NULL) + target->tid); + else if (target->uid_str != NULL) ret += SNPRINTF(bf + ret, size - ret, " (uid: %s", - top->target.uid_str); + target->uid_str); else ret += SNPRINTF(bf + ret, size - ret, " (all"); - if (top->target.cpu_list) + if (target->cpu_list) ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", top->evlist->cpus->nr > 1 ? "s" : "", - top->target.cpu_list); + target->cpu_list); else { - if (top->target.tid) + if (target->tid) ret += SNPRINTF(bf + ret, size - ret, ")"); else ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 86ff1b15059b..7ebf357dc9e1 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -14,7 +14,7 @@ struct perf_session; struct perf_top { struct perf_tool tool; struct perf_evlist *evlist; - struct perf_target target; + struct perf_record_opts record_opts; /* * Symbols will be added here in perf_event__process_sample and will * get out after decayed. 
@@ -24,24 +24,16 @@ struct perf_top { u64 exact_samples; u64 guest_us_samples, guest_kernel_samples; int print_entries, count_filter, delay_secs; - int freq; bool hide_kernel_symbols, hide_user_symbols, zero; bool use_tui, use_stdio; bool sort_has_symbols; - bool dont_use_callchains; bool kptr_restrict_warned; bool vmlinux_warned; - bool inherit; - bool group; - bool sample_id_all_missing; - bool exclude_guest_missing; bool dump_symtab; struct hist_entry *sym_filter_entry; struct perf_evsel *sym_evsel; struct perf_session *session; struct winsize winsize; - unsigned int mmap_pages; - int default_interval; int realtime_prio; int sym_pcnt_filter; const char *sym_filter; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 5906e8426cc7..805d1f52c5b4 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -12,6 +12,8 @@ */ unsigned int page_size; +bool test_attr__enabled; + bool perf_host = true; bool perf_guest = false; @@ -218,3 +220,25 @@ void dump_stack(void) #else void dump_stack(void) {} #endif + +void get_term_dimensions(struct winsize *ws) +{ + char *s = getenv("LINES"); + + if (s != NULL) { + ws->ws_row = atoi(s); + s = getenv("COLUMNS"); + if (s != NULL) { + ws->ws_col = atoi(s); + if (ws->ws_row && ws->ws_col) + return; + } + } +#ifdef TIOCGWINSZ + if (ioctl(1, TIOCGWINSZ, ws) == 0 && + ws->ws_row && ws->ws_col) + return; +#endif + ws->ws_row = 25; + ws->ws_col = 80; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index c2330918110c..09b4c26b71aa 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -265,10 +265,14 @@ bool is_power_of_2(unsigned long n) size_t hex_width(u64 v); int hex2u64(const char *ptr, u64 *val); +char *ltrim(char *s); char *rtrim(char *s); void dump_stack(void); extern unsigned int page_size; +struct winsize; +void get_term_dimensions(struct winsize *ws); + #endif |