diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-01-28 19:19:54 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2013-01-28 19:19:54 +1100 |
commit | 9203a18694285cf12becb6a695feac055268f675 (patch) | |
tree | 685bec0b563948ca7602de2d42ca03489317a463 /tools | |
parent | fa7201203eac6d21f8cd1bdbca8136672e82636c (diff) | |
parent | ec4add71968abd7e4cbcab8c73421aae7100b611 (diff) |
Merge remote-tracking branch 'tip/auto-latest'
Diffstat (limited to 'tools')
360 files changed, 32070 insertions, 2676 deletions
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore new file mode 100644 index 000000000000..60dd6dba3a50 --- /dev/null +++ b/tools/kvm/.gitignore @@ -0,0 +1,12 @@ +/lkvm +/vm +*.o +*.d +.cscope +tags +include/common-cmds.h +tests/boot/boot_test.iso +tests/boot/rootfs/ +guest/init +guest/init_stage2 +KVMTOOLS-VERSION-FILE diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git new file mode 100644 index 000000000000..c2ddcb3acbd0 --- /dev/null +++ b/tools/kvm/CREDITS-Git @@ -0,0 +1,30 @@ +Most of the infrastructure that 'perf' uses here has been reused +from the Git project, as of version: + + 66996ec: Sync with 1.6.2.4 + +Here is an (incomplete!) list of main contributors to those files +in util/* and elsewhere: + + Alex Riesen + Christian Couder + Dmitry Potapov + Jeff King + Johannes Schindelin + Johannes Sixt + Junio C Hamano + Linus Torvalds + Matthias Kestenholz + Michal Ostrowski + Miklos Vajna + Petr Baudis + Pierre Habouzit + René Scharfe + Samuel Tardieu + Shawn O. Pearce + Steffen Prohaska + Steve Haslam + +Thanks guys! + +The full history of the files can be found in the upstream Git commits. diff --git a/tools/kvm/Documentation/kernel-debugging.txt b/tools/kvm/Documentation/kernel-debugging.txt new file mode 100644 index 000000000000..98b943829cbb --- /dev/null +++ b/tools/kvm/Documentation/kernel-debugging.txt @@ -0,0 +1,15 @@ +This document explains how to debug a guests' kernel using KGDB. + +1. Run the guest: + 'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1' + +And see which PTY got assigned to ttyS1 (you'll see: +' Info: Assigned terminal 1 to pty /dev/pts/X'). + +2. Run GDB on the host: + 'gdb [vmlinuz]' + +3. Connect to the guest (from within GDB): + 'target remote /dev/pty/X' + +4. Start debugging! (enter 'continue' to continue boot). 
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt new file mode 100644 index 000000000000..efc0a87e68c7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-balloon.txt @@ -0,0 +1,24 @@ +lkvm-balloon(1) +================ + +NAME +---- +lkvm-balloon - Inflate or deflate the virtio balloon + +SYNOPSIS +-------- +[verse] +'lkvm balloon [command] [size] [instance]' + +DESCRIPTION +----------- +The command inflates or deflates the virtio balloon located in the +specified instance. +For a list of running instances see 'lkvm list'. + +Command can be either 'inflate' or 'deflate'. Inflate increases the +size of the balloon, thus decreasing the amount of virtual RAM available +for the guest. Deflation returns previously inflated memory back to the +guest. + +size is specified in Mb. diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt new file mode 100644 index 000000000000..a8eb2c0196f7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-debug.txt @@ -0,0 +1,16 @@ +lkvm-debug(1) +================ + +NAME +---- +lkvm-debug - Print debug information from a running instance + +SYNOPSIS +-------- +[verse] +'lkvm debug [instance]' + +DESCRIPTION +----------- +The command prints debug information from a running instance. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt new file mode 100644 index 000000000000..a245607d4d97 --- /dev/null +++ b/tools/kvm/Documentation/kvm-list.txt @@ -0,0 +1,16 @@ +lkvm-list(1) +================ + +NAME +---- +lkvm-list - Print a list of running instances on the host. + +SYNOPSIS +-------- +[verse] +'lkvm list' + +DESCRIPTION +----------- +This command prints a list of running instances on the host which +belong to the user who currently ran 'lkvm list'. 
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt new file mode 100644 index 000000000000..1ea2a239cc75 --- /dev/null +++ b/tools/kvm/Documentation/kvm-pause.txt @@ -0,0 +1,16 @@ +lkvm-pause(1) +================ + +NAME +---- +lkvm-pause - Pause the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm pause [instance]' + +DESCRIPTION +----------- +The command pauses a virtual machine. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt new file mode 100644 index 000000000000..a36c4df40d76 --- /dev/null +++ b/tools/kvm/Documentation/kvm-resume.txt @@ -0,0 +1,16 @@ +lkvm-resume(1) +================ + +NAME +---- +lkvm-resume - Resume the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm resume [instance]' + +DESCRIPTION +----------- +The command resumes a virtual machine. +For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt new file mode 100644 index 000000000000..8ddf470145d1 --- /dev/null +++ b/tools/kvm/Documentation/kvm-run.txt @@ -0,0 +1,62 @@ +lkvm-run(1) +================ + +NAME +---- +lkvm-run - Start the virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm run' [-k <kernel image> | --kernel <kernel image>] + +DESCRIPTION +----------- +The command starts a virtual machine. + +OPTIONS +------- +-m:: +--mem=:: + Virtual machine memory size in MiB. + +-p:: +--params:: + Additional kernel command line arguments. + +-r:: +--initrd=:: + Initial RAM disk image. + +-k:: +--kernel=:: + The virtual machine kernel. + +--dev=:: + KVM device file. + +-i:: +--image=:: + A disk image file. + +-s:: +--single-step:: + Enable single stepping. + +-g:: +--ioport-debug:: + Enable ioport debugging. + +-c:: +--enable-virtio-console:: + Enable the virtual IO console. + +--cpus:: + The number of virtual CPUs to run. + +--debug:: + Enable debug messages. 
+ +SEE ALSO +-------- +linkkvm: diff --git a/tools/kvm/Documentation/kvm-sandbox.txt b/tools/kvm/Documentation/kvm-sandbox.txt new file mode 100644 index 000000000000..2d7f558e5d52 --- /dev/null +++ b/tools/kvm/Documentation/kvm-sandbox.txt @@ -0,0 +1,16 @@ +lkvm-sandbox(1) +================ + +NAME +---- +lkvm-sandbox - Run a command in a sandboxed guest + +SYNOPSIS +-------- +[verse] +'lkvm sandbox ['lkvm run' arguments] -- [sandboxed command]' + +DESCRIPTION +----------- +The sandboxed command will run in a guest as part of it's init +command. diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt new file mode 100644 index 000000000000..4b6e3318b0a7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-setup.txt @@ -0,0 +1,15 @@ +lkvm-setup(1) +================ + +NAME +---- +lkvm-setup - Setup a new virtual machine + +SYNOPSIS +-------- +[verse] +'lkvm setup <name>' + +DESCRIPTION +----------- +The command setups a virtual machine. diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt new file mode 100644 index 000000000000..101ce7ac12a7 --- /dev/null +++ b/tools/kvm/Documentation/kvm-stat.txt @@ -0,0 +1,19 @@ +lkvm-stat(1) +================ + +NAME +---- +lkvm-stat - Print statistics about a running instance + +SYNOPSIS +-------- +[verse] +'lkvm [command] [-n instance] [-p instance pid] [--all]' + +DESCRIPTION +----------- +The command prints statistics about a running instance. +For a list of running instances see 'lkvm list'. + +Commands: + --memory, -m Display memory statistics diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt new file mode 100644 index 000000000000..6e4bc831e2af --- /dev/null +++ b/tools/kvm/Documentation/kvm-stop.txt @@ -0,0 +1,16 @@ +lkvm-stop(1) +================ + +NAME +---- +lkvm-stop - Stop a running instance + +SYNOPSIS +-------- +[verse] +'lkvm stop [instance]' + +DESCRIPTION +----------- +The command stops a running instance. 
+For a list of running instances see 'lkvm list'. diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt new file mode 100644 index 000000000000..41003d2b99bb --- /dev/null +++ b/tools/kvm/Documentation/kvm-version.txt @@ -0,0 +1,21 @@ +lkvm-version(1) +================ + +NAME +---- +lkvm-version - Print the version of the kernel tree kvm tools +was built on. + +SYNOPSIS +-------- +[verse] +'lkvm version' + +DESCRIPTION +----------- +The command prints the version of the kernel that was used to build +kvm tools. + +Note that the version is not the version of the kernel which is currently +running on the host, but is the version of the kernel tree from which kvm +tools was built. diff --git a/tools/kvm/Documentation/virtio-console.txt b/tools/kvm/Documentation/virtio-console.txt new file mode 100644 index 000000000000..4a58d567c991 --- /dev/null +++ b/tools/kvm/Documentation/virtio-console.txt @@ -0,0 +1,41 @@ +General +-------- + +virtio-console as the name implies is a console over virtio transport. Here is +a simple head to head comparison of the virtio-console vs regular 8250 console: + +8250 serial console: + + - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs, +which are enabled almost everywhere. + - Doesn't require guest-side changes. + - Compatible with older guests. + +virtio-console: + + - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies), +which got enabled only in recent kernels (but not all of them). + - Much faster. + - Consumes less processing resources. + - Requires guest-side changes. + +Enabling virtio-console +------------------------ + +First, make sure guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. 
Once this +is done, the following has to be done inside guest image: + + - Add the following line to /etc/inittab: + 'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0' + - Add 'hvc0' to /etc/securetty (so you could actually log on) + - Start the guest with '--console virtio' + +Common errors +-------------- + +Q: I don't see anything on the screen! +A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel, also +make sure you've updated /etc/inittab + +Q: It won't accept my username/password, but I enter them correctly! +A: You didn't add 'hvc0' to /etc/securetty diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile new file mode 100644 index 000000000000..0c59faaa0779 --- /dev/null +++ b/tools/kvm/Makefile @@ -0,0 +1,491 @@ +# +# Define WERROR=0 to disable -Werror. +# + +ifeq ($(strip $(V)),) + E = @echo + Q = @ +else + E = @\# + Q = +endif +ifneq ($(I), ) + KINCL_PATH=$(I) +else + KINCL_PATH=../.. +endif +export E Q KINCL_PATH + +include config/utilities.mak +include config/feature-tests.mak + +CC := $(CROSS_COMPILE)gcc +LD := $(CROSS_COMPILE)ld + +FIND := find +CSCOPE := cscope +TAGS := ctags +INSTALL := install + +prefix = $(HOME) +bindir_relative = bin +bindir = $(prefix)/$(bindir_relative) + +DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) +bindir_SQ = $(subst ','\'',$(bindir)) + +PROGRAM := lkvm +PROGRAM_ALIAS := vm + +GUEST_INIT := guest/init + +OBJS += builtin-balloon.o +OBJS += builtin-debug.o +OBJS += builtin-help.o +OBJS += builtin-list.o +OBJS += builtin-stat.o +OBJS += builtin-pause.o +OBJS += builtin-resume.o +OBJS += builtin-run.o +OBJS += builtin-setup.o +OBJS += builtin-stop.o +OBJS += builtin-version.o +OBJS += devices.o +OBJS += disk/core.o +OBJS += framebuffer.o +OBJS += guest_compat.o +OBJS += hw/rtc.o +OBJS += hw/serial.o +OBJS += ioport.o +OBJS += kvm-cpu.o +OBJS += kvm.o +OBJS += main.o +OBJS += mmio.o +OBJS += pci.o +OBJS += term.o +OBJS += virtio/blk.o +OBJS += virtio/scsi.o +OBJS += virtio/console.o +OBJS += virtio/core.o +OBJS += 
virtio/net.o +OBJS += virtio/rng.o +OBJS += virtio/balloon.o +OBJS += virtio/pci.o +OBJS += disk/blk.o +OBJS += disk/qcow.o +OBJS += disk/raw.o +OBJS += ioeventfd.o +OBJS += net/uip/core.o +OBJS += net/uip/arp.o +OBJS += net/uip/icmp.o +OBJS += net/uip/ipv4.o +OBJS += net/uip/tcp.o +OBJS += net/uip/udp.o +OBJS += net/uip/buf.o +OBJS += net/uip/csum.o +OBJS += net/uip/dhcp.o +OBJS += kvm-cmd.o +OBJS += util/init.o +OBJS += util/rbtree.o +OBJS += util/threadpool.o +OBJS += util/parse-options.o +OBJS += util/rbtree-interval.o +OBJS += util/strbuf.o +OBJS += util/read-write.o +OBJS += util/util.o +OBJS += virtio/9p.o +OBJS += virtio/9p-pdu.o +OBJS += hw/vesa.o +OBJS += hw/pci-shmem.o +OBJS += kvm-ipc.o +OBJS += builtin-sandbox.o +OBJS += virtio/mmio.o + +# Translate uname -m into ARCH string +ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \ + -e s/armv7.*/arm/ -e s/aarch64.*/arm64/) + +ifeq ($(ARCH),i386) + ARCH := x86 + DEFINES += -DCONFIG_X86_32 +endif +ifeq ($(ARCH),x86_64) + ARCH := x86 + DEFINES += -DCONFIG_X86_64 +endif + +LIBFDT_SRC = fdt.o fdt_ro.o fdt_wip.o fdt_sw.o fdt_rw.o fdt_strerror.o +LIBFDT_OBJS = $(patsubst %,../../scripts/dtc/libfdt/%,$(LIBFDT_SRC)) + +### Arch-specific stuff + +#x86 +ifeq ($(ARCH),x86) + DEFINES += -DCONFIG_X86 + OBJS += x86/boot.o + OBJS += x86/cpuid.o + OBJS += x86/interrupt.o + OBJS += x86/ioport.o + OBJS += x86/irq.o + OBJS += x86/kvm.o + OBJS += x86/kvm-cpu.o + OBJS += x86/mptable.o + OBJS += hw/i8042.o +# Exclude BIOS object files from header dependencies. + OTHEROBJS += x86/bios.o + OTHEROBJS += x86/bios/bios-rom.o + ARCH_INCLUDE := x86/include +endif +# POWER/ppc: Actually only support ppc64 currently. 
+ifeq ($(ARCH), powerpc) + DEFINES += -DCONFIG_PPC + OBJS += powerpc/boot.o + OBJS += powerpc/ioport.o + OBJS += powerpc/irq.o + OBJS += powerpc/kvm.o + OBJS += powerpc/cpu_info.o + OBJS += powerpc/kvm-cpu.o + OBJS += powerpc/spapr_hcall.o + OBJS += powerpc/spapr_rtas.o + OBJS += powerpc/spapr_hvcons.o + OBJS += powerpc/spapr_pci.o + OBJS += powerpc/xics.o +# We use libfdt, but it's sometimes not packaged 64bit. It's small too, +# so just build it in: + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) + ARCH_INCLUDE := powerpc/include + CFLAGS += -m64 +endif + +# ARM +OBJS_ARM_COMMON := arm/fdt.o arm/gic.o arm/ioport.o arm/irq.o \ + arm/kvm.o arm/kvm-cpu.o +HDRS_ARM_COMMON := arm/include +ifeq ($(ARCH), arm) + DEFINES += -DCONFIG_ARM + OBJS += $(OBJS_ARM_COMMON) + OBJS += arm/aarch32/cortex-a15.o + OBJS += arm/aarch32/kvm-cpu.o + ARCH_INCLUDE := $(HDRS_ARM_COMMON) + ARCH_INCLUDE += -Iarm/aarch32/include + CFLAGS += -march=armv7-a + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) +endif + +# ARM64 +ifeq ($(ARCH), arm64) + DEFINES += -DCONFIG_ARM64 + OBJS += $(OBJS_ARM_COMMON) + OBJS += arm/aarch64/cortex-a57.o + OBJS += arm/aarch64/kvm-cpu.o + ARCH_INCLUDE := $(HDRS_ARM_COMMON) + ARCH_INCLUDE += -Iarm/aarch64/include + CFLAGS += -I../../scripts/dtc/libfdt + OTHEROBJS += $(LIBFDT_OBJS) +endif + +### + +ifeq (,$(ARCH_INCLUDE)) + UNSUPP_ERR = @echo "This architecture is not supported in kvmtool." && exit 1 +else + UNSUPP_ERR = +endif + +### + +# Detect optional features. +# On a given system, some libs may link statically, some may not; so, check +# both and only build those that link! 
+ +FLAGS_BFD := $(CFLAGS) -lbfd +ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_BFD + OBJS_STATOPT += symbol.o + LIBS_STATOPT += -lbfd +endif + +FLAGS_VNCSERVER := $(CFLAGS) -lvncserver +ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER)),y) + OBJS_DYNOPT += ui/vnc.o + CFLAGS_DYNOPT += -DCONFIG_HAS_VNCSERVER + LIBS_DYNOPT += -lvncserver +endif +ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER) -static),y) + OBJS_STATOPT += ui/vnc.o + CFLAGS_STATOPT += -DCONFIG_HAS_VNCSERVER + LIBS_STATOPT += -lvncserver +endif + +FLAGS_SDL := $(CFLAGS) -lSDL +ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL)),y) + OBJS_DYNOPT += ui/sdl.o + CFLAGS_DYNOPT += -DCONFIG_HAS_SDL + LIBS_DYNOPT += -lSDL +endif +ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL) -static), y) + OBJS_STATOPT += ui/sdl.o + CFLAGS_STATOPT += -DCONFIG_HAS_SDL + LIBS_STATOPT += -lSDL +endif + +FLAGS_ZLIB := $(CFLAGS) -lz +ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB)),y) + CFLAGS_DYNOPT += -DCONFIG_HAS_ZLIB + LIBS_DYNOPT += -lz +endif +ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_ZLIB + LIBS_STATOPT += -lz +endif + +FLAGS_AIO := $(CFLAGS) -laio +ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO)),y) + CFLAGS_DYNOPT += -DCONFIG_HAS_AIO + LIBS_DYNOPT += -laio +endif +ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO) -static),y) + CFLAGS_STATOPT += -DCONFIG_HAS_AIO + LIBS_STATOPT += -laio +endif + +ifeq ($(LTO),1) + FLAGS_LTO := -flto + ifeq ($(call try-cc,$(SOURCE_HELLO),$(FLAGS_LTO)),y) + CFLAGS += $(FLAGS_LTO) + endif +endif + +ifneq ($(call try-build,$(SOURCE_STATIC),-static,),y) +$(error No static libc found. Please install glibc-static package.) 
+endif +### + +LIBS += -lrt +LIBS += -lpthread +LIBS += -lutil + + +DEPS := $(patsubst %.o,%.d,$(OBJS)) + +DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_GNU_SOURCE +DEFINES += -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"' +DEFINES += -DBUILD_ARCH='"$(ARCH)"' + +KVM_INCLUDE := include +CFLAGS += $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -I$(KINCL_PATH)/include/uapi -I$(KINCL_PATH)/include -I$(KINCL_PATH)/arch/$(ARCH)/include/uapi -I$(KINCL_PATH)/arch/$(ARCH)/include/ -O2 -fno-strict-aliasing -g + +WARNINGS += -Wall +WARNINGS += -Wformat=2 +WARNINGS += -Winit-self +WARNINGS += -Wmissing-declarations +WARNINGS += -Wmissing-prototypes +WARNINGS += -Wnested-externs +WARNINGS += -Wno-system-headers +WARNINGS += -Wold-style-definition +WARNINGS += -Wredundant-decls +WARNINGS += -Wsign-compare +WARNINGS += -Wstrict-prototypes +WARNINGS += -Wundef +WARNINGS += -Wvolatile-register-var +WARNINGS += -Wwrite-strings + +CFLAGS += $(WARNINGS) + +# Some targets may use 'external' sources that don't build totally cleanly. +CFLAGS_EASYGOING := $(CFLAGS) + +ifneq ($(WERROR),0) + CFLAGS += -Werror +endif + +all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) $(GUEST_INIT) + +arch_support_check: + $(UNSUPP_ERR) + +KVMTOOLS-VERSION-FILE: + @$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT) +-include $(OUTPUT)KVMTOOLS-VERSION-FILE + +# When building -static all objects are built with appropriate flags, which +# may differ between static & dynamic .o. The objects are separated into +# .o and .static.o. See the %.o: %.c rules below. +# +# $(OTHEROBJS) are things that do not get substituted like this. 
+# +STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT)) +GUEST_OBJS = guest/guest_init.o + +$(PROGRAM)-static: $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT) + $(E) " LINK " $@ + $(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_STATOPT) -o $@ + +$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT) + $(E) " LINK " $@ + $(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_DYNOPT) -o $@ + +$(PROGRAM_ALIAS): $(PROGRAM) + $(E) " LN " $@ + $(Q) ln -f $(PROGRAM) $@ + +$(GUEST_INIT): guest/init.c + $(E) " LINK " $@ + $(Q) $(CC) -static guest/init.c -o $@ + $(Q) $(LD) -r -b binary -o guest/guest_init.o $(GUEST_INIT) + +$(DEPS): + +util/rbtree.d: ../../lib/rbtree.c + $(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $< -o $@ + +%.d: %.c + $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@ + +%.s: %.c + $(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $< + +# The header file common-cmds.h is needed for compilation of builtin-help.c. +builtin-help.d: $(KVM_INCLUDE)/common-cmds.h + +$(OBJS): + +# This rule relaxes the -Werror on libfdt, since for now it still has +# a bunch of warnings. 
:( +../../scripts/dtc/libfdt/%.o: ../../scripts/dtc/libfdt/%.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS_EASYGOING) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS_EASYGOING) $< -o $@ + +util/rbtree.static.o util/rbtree.o: ../../lib/rbtree.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $< -o $@ + +%.static.o: %.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@ + +%.o: %.c +ifeq ($(C),1) + $(E) " CHECK " $@ + $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@ +endif + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@ + + +$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt + +$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt) + $(E) " GEN " $@ + $(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@ + +# +# BIOS assembly weirdness +# +BIOS_CFLAGS += -m32 +BIOS_CFLAGS += -march=i386 +BIOS_CFLAGS += -mregparm=3 + +BIOS_CFLAGS += -fno-stack-protector +BIOS_CFLAGS += -I../../arch/$(ARCH) + +x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h + +x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S + $(E) " CC x86/bios/memcpy.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/memcpy.c -o x86/bios/memcpy.o + $(E) " CC x86/bios/e820.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/e820.c -o x86/bios/e820.o + $(E) " CC x86/bios/int10.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int10.c -o x86/bios/int10.o + $(E) " CC x86/bios/int15.o" + $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int15.c -o x86/bios/int15.o + $(E) " CC x86/bios/entry.o" + $(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/entry.S -o x86/bios/entry.o + $(E) " LD " $@ + $(Q) 
$(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o + +x86/bios/bios.bin: x86/bios/bios.bin.elf + $(E) " OBJCOPY " $@ + $(Q) objcopy -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin + +x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o + +x86/bios/bios-rom.h: x86/bios/bios.bin.elf + $(E) " NM " $@ + $(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd .. + +check: all + $(MAKE) -C tests + ./$(PROGRAM) run tests/pit/tick.bin + ./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init" +.PHONY: check + +install: all + $(E) " INSTALL" + $(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' + $(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)' +.PHONY: install + +clean: + $(E) " CLEAN" + $(Q) rm -f x86/bios/*.bin + $(Q) rm -f x86/bios/*.elf + $(Q) rm -f x86/bios/*.o + $(Q) rm -f x86/bios/bios-rom.h + $(Q) rm -f tests/boot/boot_test.iso + $(Q) rm -rf tests/boot/rootfs/ + $(Q) rm -f $(DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_OBJS) + $(Q) rm -f cscope.* + $(Q) rm -f tags + $(Q) rm -f TAGS + $(Q) rm -f $(KVM_INCLUDE)/common-cmds.h + $(Q) rm -f KVMTOOLS-VERSION-FILE +.PHONY: clean + +KVM_DEV ?= /dev/kvm + +$(KVM_DEV): + $(E) " MKNOD " $@ + $(Q) mknod $@ char 10 232 + +devices: $(KVM_DEV) +.PHONY: devices + +TAGS: + $(E) " GEN" $@ + $(Q) $(RM) -f TAGS + $(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a +.PHONY: TAGS + +tags: + $(E) " GEN" $@ + $(Q) $(RM) -f tags + $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a +.PHONY: tags + +cscope: + $(E) " GEN" $@ + $(Q) $(FIND) . 
-name '*.[hcS]' -print > cscope.files + $(Q) $(CSCOPE) -bkqu +.PHONY: cscope + +# Deps +-include $(DEPS) diff --git a/tools/kvm/README b/tools/kvm/README new file mode 100644 index 000000000000..358fa23cbab3 --- /dev/null +++ b/tools/kvm/README @@ -0,0 +1,112 @@ +Native Linux KVM tool +===================== +The goal of this tool is to provide a clean, from-scratch, lightweight +KVM host tool implementation that can boot Linux guest images (just a +hobby, won't be big and professional like QEMU) with no BIOS +dependencies and with only the minimal amount of legacy device +emulation. + +It's great as a learning tool if you want to get your feet wet in +virtualization land: it's only 5 KLOC of clean C code that can already +boot a guest Linux image. + +Right now it can boot a Linux image and provide you output via a serial +console, over the host terminal, i.e. you can use it to boot a guest +Linux image in a terminal or over ssh and log into the guest without +much guest or host side setup work needed. + +1. To try out the tool, clone the git repository: + + git clone git://github.com/penberg/linux-kvm.git + +or alternatively, if you already have a kernel source tree: + + git remote add kvm-tool git://github.com/penberg/linux-kvm.git + git remote update + git checkout -b kvm-tool/master kvm-tool + +2. Compile the tool: + + cd tools/kvm && make + +3. Download a raw userspace image: + + wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2 +linux-0.2.img.bz2 + +4. The guest kernel has to be built with the following configuration: + + - For the default console output: + CONFIG_SERIAL_8250=y + CONFIG_SERIAL_8250_CONSOLE=y + + - For running 32bit images on 64bit hosts: + CONFIG_IA32_EMULATION=y + + - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS). 
+ + - For all virtio devices listed below: + CONFIG_VIRTIO=y + CONFIG_VIRTIO_RING=y + CONFIG_VIRTIO_PCI=y + + - For virtio-blk devices (--disk, -d): + CONFIG_VIRTIO_BLK=y + + - For virtio-net devices ([--network, -n] virtio): + CONFIG_VIRTIO_NET=y + + - For virtio-9p devices (--virtio-9p): + CONFIG_NET_9P=y + CONFIG_NET_9P_VIRTIO=y + CONFIG_9P_FS=y + + - For virtio-balloon device (--balloon): + CONFIG_VIRTIO_BALLOON=y + + - For virtio-console device (--console virtio): + CONFIG_VIRTIO_CONSOLE=y + + - For virtio-rng device (--rng): + CONFIG_HW_RANDOM_VIRTIO=y + + - For vesa device (--sdl or --vnc): + CONFIG_FB_VESA=y + + +5. And finally, launch the hypervisor: + + ./lkvm run --disk linux-0.2.img \ + --kernel ../../arch/x86/boot/bzImage \ +or + + sudo ./lkvm run --disk linux-0.2.img \ + --kernel ../../arch/x86/boot/bzImage \ + --network virtio + +The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He, +Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help +on KVM internals and Ingo Molnar for all-around support and encouragement! 
+ +See the following thread for original discussion for motivation of this +project: + +http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620 + +Build dependencies +===================== +For deb based systems: +32-bit: +sudo apt-get install build-essential +64-bit: +sudo apt-get install build-essential libc6-dev-i386 + +For rpm based systems: +32-bit: +yum install glibc-devel +64-bit: +yum install glibc-devel glibc-static + +On 64-bit Arch Linux make sure the multilib repository is enabled in your +/etc/pacman.conf and run +pacman -Sy lib32-glibc diff --git a/tools/kvm/arm/aarch32/cortex-a15.c b/tools/kvm/arm/aarch32/cortex-a15.c new file mode 100644 index 000000000000..80317474c8bc --- /dev/null +++ b/tools/kvm/arm/aarch32/cortex-a15.c @@ -0,0 +1,94 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/types.h> + +#define CPU_NAME_MAX_LEN 8 +static void generate_cpu_nodes(void *fdt, struct kvm *kvm) +{ + int cpu; + + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (cpu = 0; cpu < kvm->nrcpus; ++cpu) { + char cpu_name[CPU_NAME_MAX_LEN]; + + if (kvm->cpus[cpu]->cpu_type != KVM_ARM_TARGET_CORTEX_A15) { + pr_warning("Ignoring unknown type for CPU %d\n", cpu); + continue; + } + + snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%d", cpu); + + _FDT(fdt_begin_node(fdt, cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15")); + + if (kvm->nrcpus > 1) + _FDT(fdt_property_string(fdt, "enable-method", "psci")); + + _FDT(fdt_property_cell(fdt, "reg", cpu)); + _FDT(fdt_end_node(fdt)); + } + + _FDT(fdt_end_node(fdt)); +} + +static void generate_timer_nodes(void *fdt, struct kvm *kvm) +{ + u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \ + & 
GIC_FDT_IRQ_PPI_CPU_MASK; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(13), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(14), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(11), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(10), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + _FDT(fdt_begin_node(fdt, "timer")); + _FDT(fdt_property_string(fdt, "compatible", "arm,armv7-timer")); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle) +{ + generate_cpu_nodes(fdt, kvm); + gic__generate_fdt_nodes(fdt, gic_phandle); + generate_timer_nodes(fdt, kvm); +} + +static int cortex_a15__vcpu_init(struct kvm_cpu *vcpu) +{ + vcpu->generate_fdt_nodes = generate_fdt_nodes; + return 0; +} + +static struct kvm_arm_target target_cortex_a15 = { + .id = KVM_ARM_TARGET_CORTEX_A15, + .init = cortex_a15__vcpu_init, +}; + +static int cortex_a15__core_init(struct kvm *kvm) +{ + return kvm_cpu__register_kvm_arm_target(&target_cortex_a15); +} +core_init(cortex_a15__core_init); diff --git a/tools/kvm/arm/aarch32/include/kvm/barrier.h b/tools/kvm/arm/aarch32/include/kvm/barrier.h new file mode 100644 index 000000000000..94913a9564d4 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/barrier.h @@ -0,0 +1,10 @@ +#ifndef KVM__KVM_BARRIER_H +#define KVM__KVM_BARRIER_H + +#define dmb() asm volatile ("dmb" : : : "memory") + +#define mb() dmb() +#define rmb() dmb() +#define wmb() dmb() + +#endif /* KVM__KVM_BARRIER_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..1632e3c5e834 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h 
@@ -0,0 +1,13 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#define ARM_GIC_DIST_SIZE 0x1000 +#define ARM_GIC_CPUI_SIZE 0x2000 + +#define ARM_KERN_OFFSET(...) 0x8000 + +#define ARM_MAX_MEMORY(...) ARM_LOMAP_MAX_MEMORY + +#include "arm-common/kvm-arch.h" + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..acf0d2387774 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h @@ -0,0 +1,8 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#define ARM_OPT_ARCH_RUN(...) + +#include "arm-common/kvm-config-arch.h" + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..b9fda07d1e55 --- /dev/null +++ b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,12 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +#include "kvm/kvm.h" + +#include "arm-common/kvm-cpu-arch.h" + +#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \ + [0] = (!!(cpuid) << KVM_ARM_VCPU_POWER_OFF), \ +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/aarch32/kvm-cpu.c b/tools/kvm/arm/aarch32/kvm-cpu.c new file mode 100644 index 000000000000..a5287897ee72 --- /dev/null +++ b/tools/kvm/arm/aarch32/kvm-cpu.c @@ -0,0 +1,106 @@ +#include "kvm/kvm-cpu.h" +#include "kvm/kvm.h" + +#include <asm/ptrace.h> + +#define ARM_CORE_REG(x) (KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | \ + KVM_REG_ARM_CORE_REG(x)) + +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u32 data; + + /* Who said future-proofing was a good idea? 
*/ + reg.addr = (u64)(unsigned long)&data; + + /* cpsr = IRQs/FIQs masked */ + data = PSR_I_BIT | PSR_F_BIT | SVC_MODE; + reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (cpsr)"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id != 0) + return; + + /* r0 = 0 */ + data = 0; + reg.id = ARM_CORE_REG(usr_regs.ARM_r0); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r0)"); + + /* r1 = machine type (-1) */ + data = -1; + reg.id = ARM_CORE_REG(usr_regs.ARM_r1); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r1)"); + + /* r2 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM_CORE_REG(usr_regs.ARM_r2); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r2)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + u32 data; + + reg.addr = (u64)(unsigned long)&data; + + printf("*pc:\n"); + reg.id = ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ PC)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); + + printf("*lr (svc):\n"); + reg.id = ARM_CORE_REG(svc_regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ LR_svc)"); + data &= ~0x1; + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + u32 data; + int debug_fd = kvm_cpu__get_debug_fd(); + + reg.addr = (u64)(unsigned long)&data; + dprintf(debug_fd, "\n Registers:\n"); + + reg.id =
ARM_CORE_REG(usr_regs.ARM_pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (pc)"); + dprintf(debug_fd, " PC: 0x%x\n", data); + + reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (cpsr)"); + dprintf(debug_fd, " CPSR: 0x%x\n", data); + + reg.id = ARM_CORE_REG(svc_regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (SP_svc)"); + dprintf(debug_fd, " SP_svc: 0x%x\n", data); + + reg.id = ARM_CORE_REG(svc_regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (LR_svc)"); + dprintf(debug_fd, " LR_svc: 0x%x\n", data); +} diff --git a/tools/kvm/arm/aarch64/cortex-a57.c b/tools/kvm/arm/aarch64/cortex-a57.c new file mode 100644 index 000000000000..4fd11ba1b8c2 --- /dev/null +++ b/tools/kvm/arm/aarch64/cortex-a57.c @@ -0,0 +1,95 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/types.h> + +#define CPU_NAME_MAX_LEN 8 +static void generate_cpu_nodes(void *fdt, struct kvm *kvm) +{ + int cpu; + + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (cpu = 0; cpu < kvm->nrcpus; ++cpu) { + char cpu_name[CPU_NAME_MAX_LEN]; + + if (kvm->cpus[cpu]->cpu_type != KVM_ARM_TARGET_CORTEX_A57) { + pr_warning("Ignoring unknown type for CPU %d\n", cpu); + continue; + } + + snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%d", cpu); + + _FDT(fdt_begin_node(fdt, cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a57")); + + if (kvm->nrcpus > 1) + _FDT(fdt_property_string(fdt, "enable-method", "psci")); + + _FDT(fdt_property_cell(fdt, "reg", cpu)); + _FDT(fdt_end_node(fdt)); + } + + _FDT(fdt_end_node(fdt)); +} + +static void
generate_timer_nodes(void *fdt, struct kvm *kvm) +{ + u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \ + & GIC_FDT_IRQ_PPI_CPU_MASK; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(13), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(14), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(11), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI), + cpu_to_fdt32(10), + cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + _FDT(fdt_begin_node(fdt, "timer")); + _FDT(fdt_property_string(fdt, "compatible", "arm,armv8-timer")); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle) +{ + generate_cpu_nodes(fdt, kvm); + gic__generate_fdt_nodes(fdt, gic_phandle); + generate_timer_nodes(fdt, kvm); +} + + +static int cortex_a57__vcpu_init(struct kvm_cpu *vcpu) +{ + vcpu->generate_fdt_nodes = generate_fdt_nodes; + return 0; +} + +static struct kvm_arm_target target_cortex_a57 = { + .id = KVM_ARM_TARGET_CORTEX_A57, + .init = cortex_a57__vcpu_init, +}; + +static int cortex_a57__core_init(struct kvm *kvm) +{ + return kvm_cpu__register_kvm_arm_target(&target_cortex_a57); +} +core_init(cortex_a57__core_init); diff --git a/tools/kvm/arm/aarch64/include/kvm/barrier.h b/tools/kvm/arm/aarch64/include/kvm/barrier.h new file mode 100644 index 000000000000..97ab252171e1 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/barrier.h @@ -0,0 +1,8 @@ +#ifndef KVM__KVM_BARRIER_H +#define KVM__KVM_BARRIER_H + +#define mb() asm volatile ("dmb ish" : : : "memory") +#define rmb() asm volatile ("dmb ishld" : : : "memory") +#define wmb() asm volatile ("dmb ishst" : : : "memory") + +#endif /* KVM__KVM_BARRIER_H */ diff --git 
a/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..2f08a26306d7 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h @@ -0,0 +1,17 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#define ARM_GIC_DIST_SIZE 0x10000 +#define ARM_GIC_CPUI_SIZE 0x10000 + +#define ARM_KERN_OFFSET(kvm) ((kvm)->cfg.arch.aarch32_guest ? \ + 0x8000 : \ + 0x80000) + +#define ARM_MAX_MEMORY(kvm) ((kvm)->cfg.arch.aarch32_guest ? \ + ARM_LOMAP_MAX_MEMORY : \ + ARM_HIMAP_MAX_MEMORY) + +#include "arm-common/kvm-arch.h" + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..89860ae3166c --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h @@ -0,0 +1,10 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#define ARM_OPT_ARCH_RUN(cfg) \ + OPT_BOOLEAN('\0', "aarch32", &(cfg)->aarch32_guest, \ + "Run AArch32 guest"), + +#include "arm-common/kvm-config-arch.h" + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..d85c583421c5 --- /dev/null +++ b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,13 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +#include "kvm/kvm.h" + +#include "arm-common/kvm-cpu-arch.h" + +#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \ + [0] = ((!!(cpuid) << KVM_ARM_VCPU_POWER_OFF) | \ + (!!(kvm)->cfg.arch.aarch32_guest << KVM_ARM_VCPU_EL1_32BIT)) \ +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/aarch64/kvm-cpu.c b/tools/kvm/arm/aarch64/kvm-cpu.c new file mode 100644 index 000000000000..2eb06eab7e4b --- /dev/null +++ b/tools/kvm/arm/aarch64/kvm-cpu.c @@ -0,0 +1,160 @@ +#include "kvm/kvm-cpu.h" +#include "kvm/kvm.h" + +#include 
<asm/ptrace.h> + +#define COMPAT_PSR_F_BIT 0x00000040 +#define COMPAT_PSR_I_BIT 0x00000080 +#define COMPAT_PSR_MODE_SVC 0x00000013 + +#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ + KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) + +static void reset_vcpu_aarch32(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u64 data; + + reg.addr = (u64)&data; + + /* pstate = all interrupts masked */ + data = COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT | COMPAT_PSR_MODE_SVC; + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (spsr[EL1])"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id != 0) + return; + + /* r0 = 0 */ + data = 0; + reg.id = ARM64_CORE_REG(regs.regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r0)"); + + /* r1 = machine type (-1) */ + data = -1; + reg.id = ARM64_CORE_REG(regs.regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r1)"); + + /* r2 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM64_CORE_REG(regs.regs[2]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (r2)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); +} + +static void reset_vcpu_aarch64(struct kvm_cpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_one_reg reg; + u64 data; + + reg.addr = (u64)&data; + + /* pstate = all interrupts masked */ + data = PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h; + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (spsr[EL1])"); + + /* x1...x3 = 0 */ + data = 0; + reg.id =
ARM64_CORE_REG(regs.regs[1]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x1)"); + + reg.id = ARM64_CORE_REG(regs.regs[2]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x2)"); + + reg.id = ARM64_CORE_REG(regs.regs[3]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x3)"); + + /* Secondary cores are stopped awaiting PSCI wakeup */ + if (vcpu->cpu_id == 0) { + /* x0 = physical address of the device tree blob */ + data = kvm->arch.dtb_guest_start; + reg.id = ARM64_CORE_REG(regs.regs[0]); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (x0)"); + + /* pc = start of kernel image */ + data = kvm->arch.kern_guest_start; + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) + die_perror("KVM_SET_ONE_REG failed (pc)"); + } +} + +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + if (vcpu->kvm->cfg.arch.aarch32_guest) + return reset_vcpu_aarch32(vcpu); + else + return reset_vcpu_aarch64(vcpu); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + unsigned long data; + + reg.addr = (u64)&data; + + printf("*pc:\n"); + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ PC)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); + + printf("*lr:\n"); + reg.id = ARM64_CORE_REG(regs.regs[30]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (show_code @ LR)"); + + kvm__dump_mem(vcpu->kvm, data, 32); + printf("\n"); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_one_reg reg; + unsigned long data; + int debug_fd = kvm_cpu__get_debug_fd(); + + reg.addr = (u64)&data; + dprintf(debug_fd, "\n Registers:\n"); + + reg.id = ARM64_CORE_REG(regs.pc); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + 
die("KVM_GET_ONE_REG failed (pc)"); + dprintf(debug_fd, " PC: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(regs.pstate); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (pstate)"); + dprintf(debug_fd, " PSTATE: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(sp_el1); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (sp_el1)"); + dprintf(debug_fd, " SP_EL1: 0x%lx\n", data); + + reg.id = ARM64_CORE_REG(regs.regs[30]); + if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) + die("KVM_GET_ONE_REG failed (lr)"); + dprintf(debug_fd, " LR: 0x%lx\n", data); +} diff --git a/tools/kvm/arm/fdt.c b/tools/kvm/arm/fdt.c new file mode 100644 index 000000000000..20e03084e518 --- /dev/null +++ b/tools/kvm/arm/fdt.c @@ -0,0 +1,250 @@ +#include "kvm/devices.h" +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/virtio-mmio.h" + +#include "arm-common/gic.h" + +#include <stdbool.h> + +#include <asm/setup.h> +#include <linux/byteorder.h> +#include <linux/kernel.h> +#include <linux/sizes.h> + +static char kern_cmdline[COMMAND_LINE_SIZE]; + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + return false; +} + +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + return 0; +} + +static void dump_fdt(const char *dtb_file, void *fdt) +{ + int count, fd; + + fd = open(dtb_file, O_CREAT | O_TRUNC | O_RDWR, 0666); + if (fd < 0) + die("Failed to write dtb to %s", dtb_file); + + count = write(fd, fdt, FDT_MAX_SIZE); + if (count < 0) + die_perror("Failed to dump dtb"); + + pr_info("Wrote %d bytes to dtb %s\n", count, dtb_file); + close(fd); +} + +#define DEVICE_NAME_MAX_LEN 32 +static void generate_virtio_mmio_node(void *fdt, struct virtio_mmio *vmmio) +{ + char dev_name[DEVICE_NAME_MAX_LEN]; + u64 addr = vmmio->addr; + u64 reg_prop[] = { + cpu_to_fdt64(addr), + cpu_to_fdt64(VIRTIO_MMIO_IO_SIZE) + }; + u32 irq_prop[] = { + cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI), + 
cpu_to_fdt32(vmmio->irq - GIC_SPI_IRQ_BASE), + cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI), + }; + + snprintf(dev_name, DEVICE_NAME_MAX_LEN, "virtio@%llx", addr); + + _FDT(fdt_begin_node(fdt, dev_name)); + _FDT(fdt_property_string(fdt, "compatible", "virtio,mmio")); + _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop))); + _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop))); + _FDT(fdt_end_node(fdt)); +} + +static int setup_fdt(struct kvm *kvm) +{ + struct device_header *dev_hdr; + u8 staging_fdt[FDT_MAX_SIZE]; + u32 gic_phandle = fdt__alloc_phandle(); + u64 mem_reg_prop[] = { + cpu_to_fdt64(kvm->arch.memory_guest_start), + cpu_to_fdt64(kvm->ram_size), + }; + void *fdt = staging_fdt; + void *fdt_dest = guest_flat_to_host(kvm, + kvm->arch.dtb_guest_start); + void (*generate_cpu_nodes)(void *, struct kvm *, u32) + = kvm->cpus[0]->generate_fdt_nodes; + + /* Create new tree without a reserve map */ + _FDT(fdt_create(fdt, FDT_MAX_SIZE)); + _FDT(fdt_finish_reservemap(fdt)); + + /* Header */ + _FDT(fdt_begin_node(fdt, "")); + _FDT(fdt_property_cell(fdt, "interrupt-parent", gic_phandle)); + _FDT(fdt_property_string(fdt, "compatible", "linux,dummy-virt")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x2)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x2)); + + /* /chosen */ + _FDT(fdt_begin_node(fdt, "chosen")); + _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline)); + + /* Initrd */ + if (kvm->arch.initrd_size != 0) { + u32 ird_st_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start); + u32 ird_end_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start + + kvm->arch.initrd_size); + + _FDT(fdt_property(fdt, "linux,initrd-start", + &ird_st_prop, sizeof(ird_st_prop))); + _FDT(fdt_property(fdt, "linux,initrd-end", + &ird_end_prop, sizeof(ird_end_prop))); + } + _FDT(fdt_end_node(fdt)); + + /* Memory */ + _FDT(fdt_begin_node(fdt, "memory")); + _FDT(fdt_property_string(fdt, "device_type", "memory")); + _FDT(fdt_property(fdt, "reg", mem_reg_prop, 
sizeof(mem_reg_prop))); + _FDT(fdt_end_node(fdt)); + + /* CPU and peripherals (interrupt controller, timers, etc) */ + if (generate_cpu_nodes) + generate_cpu_nodes(fdt, kvm, gic_phandle); + + /* Virtio MMIO devices */ + dev_hdr = device__first_dev(DEVICE_BUS_MMIO); + while (dev_hdr) { + generate_virtio_mmio_node(fdt, dev_hdr->data); + dev_hdr = device__next_dev(dev_hdr); + } + + /* PSCI firmware */ + _FDT(fdt_begin_node(fdt, "psci")); + _FDT(fdt_property_string(fdt, "compatible", "arm,psci")); + _FDT(fdt_property_string(fdt, "method", "hvc")); + _FDT(fdt_property_cell(fdt, "cpu_suspend", KVM_PSCI_FN_CPU_SUSPEND)); + _FDT(fdt_property_cell(fdt, "cpu_off", KVM_PSCI_FN_CPU_OFF)); + _FDT(fdt_property_cell(fdt, "cpu_on", KVM_PSCI_FN_CPU_ON)); + _FDT(fdt_property_cell(fdt, "migrate", KVM_PSCI_FN_MIGRATE)); + _FDT(fdt_end_node(fdt)); + + /* Finalise. */ + _FDT(fdt_end_node(fdt)); + _FDT(fdt_finish(fdt)); + + _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE)); + _FDT(fdt_pack(fdt_dest)); + + if (kvm->cfg.arch.dump_dtb_filename) + dump_fdt(kvm->cfg.arch.dump_dtb_filename, fdt_dest); + return 0; +} +late_init(setup_fdt); + +static int read_image(int fd, void **pos, void *limit) +{ + int count; + + while (((count = xread(fd, *pos, SZ_64K)) > 0) && *pos <= limit) + *pos += count; + + if (pos < 0) + die_perror("xread"); + + return *pos < limit ? 0 : -ENOMEM; +} + +#define FDT_ALIGN SZ_2M +#define INITRD_ALIGN 4 +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + void *pos, *kernel_end, *limit; + unsigned long guest_addr; + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + /* + * Linux requires the initrd and dtb to be mapped inside lowmem, + * so we can't just place them at the top of memory. 
+ */ + limit = kvm->ram_start + min(kvm->ram_size, (u64)SZ_256M) - 1; + + pos = kvm->ram_start + ARM_KERN_OFFSET(kvm); + kvm->arch.kern_guest_start = host_to_guest_flat(kvm, pos); + if (read_image(fd_kernel, &pos, limit) == -ENOMEM) + die("kernel image too big to contain in guest memory."); + + kernel_end = pos; + pr_info("Loaded kernel to 0x%llx (%llu bytes)", + kvm->arch.kern_guest_start, + host_to_guest_flat(kvm, pos) - kvm->arch.kern_guest_start); + + /* + * Now load backwards from the end of memory so the kernel + * decompressor has plenty of space to work with. First up is + * the device tree blob... + */ + pos = limit; + pos -= (FDT_MAX_SIZE + FDT_ALIGN); + guest_addr = ALIGN(host_to_guest_flat(kvm, pos), FDT_ALIGN); + pos = guest_flat_to_host(kvm, guest_addr); + if (pos < kernel_end) + die("fdt overlaps with kernel image."); + + kvm->arch.dtb_guest_start = guest_addr; + pr_info("Placing fdt at 0x%llx - 0x%llx", + kvm->arch.dtb_guest_start, + host_to_guest_flat(kvm, limit)); + limit = pos; + + /* ... and finally the initrd, if we have one. 
*/ + if (fd_initrd != -1) { + struct stat sb; + unsigned long initrd_start; + + if (lseek(fd_initrd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (fstat(fd_initrd, &sb)) + die_perror("fstat"); + + pos -= (sb.st_size + INITRD_ALIGN); + guest_addr = ALIGN(host_to_guest_flat(kvm, pos), INITRD_ALIGN); + pos = guest_flat_to_host(kvm, guest_addr); + if (pos < kernel_end) + die("initrd overlaps with kernel image."); + + initrd_start = guest_addr; + if (read_image(fd_initrd, &pos, limit) == -ENOMEM) + die("initrd too big to contain in guest memory."); + + kvm->arch.initrd_guest_start = initrd_start; + kvm->arch.initrd_size = host_to_guest_flat(kvm, pos) - initrd_start; + pr_info("Loaded initrd to 0x%llx (%llu bytes)", + kvm->arch.initrd_guest_start, + kvm->arch.initrd_size); + } else { + kvm->arch.initrd_size = 0; + } + + strncpy(kern_cmdline, kernel_cmdline, COMMAND_LINE_SIZE); + kern_cmdline[COMMAND_LINE_SIZE - 1] = '\0'; + + return true; +} + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + /* To b or not to b? That is the zImage. 
*/ + return false; +} diff --git a/tools/kvm/arm/gic.c b/tools/kvm/arm/gic.c new file mode 100644 index 000000000000..3f42c3a11d16 --- /dev/null +++ b/tools/kvm/arm/gic.c @@ -0,0 +1,92 @@ +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/virtio.h" + +#include "arm-common/gic.h" + +#include <linux/byteorder.h> +#include <linux/kvm.h> + +static int irq_ids; + +int gic__alloc_irqnum(void) +{ + int irq = GIC_SPI_IRQ_BASE + irq_ids++; + + if (irq > GIC_MAX_IRQ) + die("GIC IRQ limit %d reached!", GIC_MAX_IRQ); + + return irq; +} + +int gic__init_irqchip(struct kvm *kvm) +{ + int err; + struct kvm_device_address gic_addr[] = { + [0] = { + .id = (KVM_ARM_DEVICE_VGIC_V2 << KVM_DEVICE_ID_SHIFT) |\ + KVM_VGIC_V2_ADDR_TYPE_DIST, + .addr = ARM_GIC_DIST_BASE, + }, + [1] = { + .id = (KVM_ARM_DEVICE_VGIC_V2 << KVM_DEVICE_ID_SHIFT) |\ + KVM_VGIC_V2_ADDR_TYPE_CPU, + .addr = ARM_GIC_CPUI_BASE, + } + }; + + if (kvm->nrcpus > GIC_MAX_CPUS) { + pr_warning("%d CPUS greater than maximum of %d -- truncating\n", + kvm->nrcpus, GIC_MAX_CPUS); + kvm->nrcpus = GIC_MAX_CPUS; + } + + err = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP); + if (err) + return err; + + err = ioctl(kvm->vm_fd, KVM_SET_DEVICE_ADDRESS, &gic_addr[0]); + if (err) + return err; + + err = ioctl(kvm->vm_fd, KVM_SET_DEVICE_ADDRESS, &gic_addr[1]); + return err; +} + +void gic__generate_fdt_nodes(void *fdt, u32 phandle) +{ + u64 reg_prop[] = { + cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE), + cpu_to_fdt64(ARM_GIC_CPUI_BASE), cpu_to_fdt64(ARM_GIC_CPUI_SIZE), + }; + + _FDT(fdt_begin_node(fdt, "intc")); + _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15-gic")); + _FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS)); + _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0)); + _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop))); + _FDT(fdt_property_cell(fdt, "phandle", phandle)); + _FDT(fdt_end_node(fdt)); +} + +#define KVM_IRQCHIP_IRQ(x) (KVM_ARM_IRQ_TYPE_SPI << 
KVM_ARM_IRQ_TYPE_SHIFT) |\ + ((x) & KVM_ARM_IRQ_NUM_MASK) + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + struct kvm_irq_level irq_level = { + .irq = KVM_IRQCHIP_IRQ(irq), + .level = !!level, + }; + + if (irq < GIC_SPI_IRQ_BASE || irq > GIC_MAX_IRQ) + pr_warning("Ignoring invalid GIC IRQ %d", irq); + else if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0) + pr_warning("Could not KVM_IRQ_LINE for irq %d", irq); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH); + kvm__irq_line(kvm, irq, VIRTIO_IRQ_LOW); +} diff --git a/tools/kvm/arm/include/arm-common/gic.h b/tools/kvm/arm/include/arm-common/gic.h new file mode 100644 index 000000000000..850edc78e427 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/gic.h @@ -0,0 +1,35 @@ +#ifndef ARM_COMMON__GIC_H +#define ARM_COMMON__GIC_H + +#define GIC_SGI_IRQ_BASE 0 +#define GIC_PPI_IRQ_BASE 16 +#define GIC_SPI_IRQ_BASE 32 + +#define GIC_FDT_IRQ_NUM_CELLS 3 + +#define GIC_FDT_IRQ_TYPE_SPI 0 +#define GIC_FDT_IRQ_TYPE_PPI 1 + +#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI 1 +#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO 2 +#define GIC_FDT_IRQ_FLAGS_LEVEL_HI 4 +#define GIC_FDT_IRQ_FLAGS_LEVEL_LO 8 + +#define GIC_FDT_IRQ_PPI_CPU_SHIFT 8 +#define GIC_FDT_IRQ_PPI_CPU_MASK (0xff << GIC_FDT_IRQ_PPI_CPU_SHIFT) + +#define GIC_CPUI_CTLR_EN (1 << 0) +#define GIC_CPUI_PMR_MIN_PRIO 0xff + +#define GIC_CPUI_OFF_PMR 4 + +#define GIC_MAX_CPUS 8 +#define GIC_MAX_IRQ 255 + +struct kvm; + +int gic__alloc_irqnum(void); +int gic__init_irqchip(struct kvm *kvm); +void gic__generate_fdt_nodes(void *fdt, u32 phandle); + +#endif /* ARM_COMMON__GIC_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-arch.h b/tools/kvm/arm/include/arm-common/kvm-arch.h new file mode 100644 index 000000000000..7860e1729ca1 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-arch.h @@ -0,0 +1,57 @@ +#ifndef ARM_COMMON__KVM_ARCH_H +#define ARM_COMMON__KVM_ARCH_H + +#include <stdbool.h> +#include <linux/const.h> 
+#include <linux/types.h> + +#define ARM_MMIO_AREA _AC(0x0000000000000000, UL) +#define ARM_AXI_AREA _AC(0x0000000040000000, UL) +#define ARM_MEMORY_AREA _AC(0x0000000080000000, UL) + +#define ARM_LOMAP_MAX_MEMORY ((1ULL << 32) - ARM_MEMORY_AREA) +#define ARM_HIMAP_MAX_MEMORY ((1ULL << 40) - ARM_MEMORY_AREA) + +#define ARM_GIC_DIST_BASE (ARM_AXI_AREA - ARM_GIC_DIST_SIZE) +#define ARM_GIC_CPUI_BASE (ARM_GIC_DIST_BASE - ARM_GIC_CPUI_SIZE) +#define ARM_GIC_SIZE (ARM_GIC_DIST_SIZE + ARM_GIC_CPUI_SIZE) + +#define ARM_VIRTIO_MMIO_SIZE (ARM_AXI_AREA - ARM_GIC_SIZE) +#define ARM_PCI_MMIO_SIZE (ARM_MEMORY_AREA - ARM_AXI_AREA) + +#define KVM_PCI_MMIO_AREA ARM_AXI_AREA +#define KVM_VIRTIO_MMIO_AREA ARM_MMIO_AREA + +#define VIRTIO_DEFAULT_TRANS VIRTIO_MMIO + +static inline bool arm_addr_in_virtio_mmio_region(u64 phys_addr) +{ + u64 limit = KVM_VIRTIO_MMIO_AREA + ARM_VIRTIO_MMIO_SIZE; + return phys_addr >= KVM_VIRTIO_MMIO_AREA && phys_addr < limit; +} + +static inline bool arm_addr_in_pci_mmio_region(u64 phys_addr) +{ + u64 limit = KVM_PCI_MMIO_AREA + ARM_PCI_MMIO_SIZE; + return phys_addr >= KVM_PCI_MMIO_AREA && phys_addr < limit; +} + +struct kvm_arch { + /* + * We may have to align the guest memory for virtio, so keep the + * original pointers here for munmap. + */ + void *ram_alloc_start; + u64 ram_alloc_size; + + /* + * Guest addresses for memory layout. 
+ */ + u64 memory_guest_start; + u64 kern_guest_start; + u64 initrd_guest_start; + u64 initrd_size; + u64 dtb_guest_start; +}; + +#endif /* ARM_COMMON__KVM_ARCH_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-config-arch.h b/tools/kvm/arm/include/arm-common/kvm-config-arch.h new file mode 100644 index 000000000000..7ac6f6e88550 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-config-arch.h @@ -0,0 +1,17 @@ +#ifndef ARM_COMMON__KVM_CONFIG_ARCH_H +#define ARM_COMMON__KVM_CONFIG_ARCH_H + +#include "kvm/parse-options.h" + +struct kvm_config_arch { + const char *dump_dtb_filename; + bool aarch32_guest; +}; + +#define OPT_ARCH_RUN(pfx, cfg) \ + pfx, \ + ARM_OPT_ARCH_RUN(cfg) \ + OPT_STRING('\0', "dump-dtb", &(cfg)->dump_dtb_filename, \ + ".dtb file", "Dump generated .dtb to specified file"), + +#endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h new file mode 100644 index 000000000000..351fbe68e5a9 --- /dev/null +++ b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h @@ -0,0 +1,46 @@ +#ifndef ARM_COMMON__KVM_CPU_ARCH_H +#define ARM_COMMON__KVM_CPU_ARCH_H + +#include <linux/kvm.h> +#include <pthread.h> +#include <stdbool.h> + +struct kvm; + +struct kvm_cpu { + pthread_t thread; + + unsigned long cpu_id; + unsigned long cpu_type; + + struct kvm *kvm; + int vcpu_fd; + struct kvm_run *kvm_run; + + u8 is_running; + u8 paused; + u8 needs_nmi; + + struct kvm_coalesced_mmio_ring *ring; + + void (*generate_fdt_nodes)(void *fdt, struct kvm* kvm, + u32 gic_phandle); +}; + +struct kvm_arm_target { + u32 id; + int (*init)(struct kvm_cpu *vcpu); +}; + +int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target); + +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, + int direction, int size, u32 count) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, + u8 is_write); + +#endif /* 
ARM_COMMON__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/arm/ioport.c b/tools/kvm/arm/ioport.c new file mode 100644 index 000000000000..3c03fa05dd05 --- /dev/null +++ b/tools/kvm/arm/ioport.c @@ -0,0 +1,5 @@ +#include "kvm/ioport.h" + +void ioport__setup_arch(struct kvm *kvm) +{ +} diff --git a/tools/kvm/arm/irq.c b/tools/kvm/arm/irq.c new file mode 100644 index 000000000000..e173e04f3668 --- /dev/null +++ b/tools/kvm/arm/irq.c @@ -0,0 +1,17 @@ +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include "arm-common/gic.h" + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + *line = gic__alloc_irqnum(); + return 0; +} + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + die(__FUNCTION__); + return 0; +} diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c new file mode 100644 index 000000000000..7a0eff45d4ca --- /dev/null +++ b/tools/kvm/arm/kvm-cpu.c @@ -0,0 +1,109 @@ +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static struct kvm_arm_target *kvm_arm_targets[KVM_ARM_NUM_TARGETS]; +int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target) +{ + unsigned int i = 0; + + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) { + if (!kvm_arm_targets[i]) { + kvm_arm_targets[i] = target; + return 0; + } + } + + return -ENOSPC; +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int coalesced_offset, mmap_size, err = -1; + unsigned int i; + struct kvm_vcpu_init vcpu_init = { + .features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id) + }; + + vcpu = calloc(1, sizeof(struct kvm_cpu)); + if (!vcpu) + return NULL; + + vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + 
die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, + vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + /* Find an appropriate target CPU type. */ + for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) { + vcpu_init.target = kvm_arm_targets[i]->id; + err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init); + if (!err) + break; + } + + if (err || kvm_arm_targets[i]->init(vcpu)) + die("Unable to initialise ARM vcpu"); + + coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, + KVM_CAP_COALESCED_MMIO); + if (coalesced_offset) + vcpu->ring = (void *)vcpu->kvm_run + + (coalesced_offset * PAGE_SIZE); + + /* Populate the vcpu structure. */ + vcpu->kvm = kvm; + vcpu->cpu_id = cpu_id; + vcpu->cpu_type = vcpu_init.target; + vcpu->is_running = true; + return vcpu; +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + free(vcpu); +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, + u8 is_write) +{ + if (arm_addr_in_virtio_mmio_region(phys_addr)) + return kvm__emulate_mmio(kvm, phys_addr, data, len, is_write); + else if (arm_addr_in_pci_mmio_region(phys_addr)) + die("PCI emulation not supported on ARM!"); + + return false; +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ +} diff --git a/tools/kvm/arm/kvm.c b/tools/kvm/arm/kvm.c new file mode 100644 index 000000000000..1bcfce3c1f44 --- /dev/null +++ b/tools/kvm/arm/kvm.c @@ -0,0 +1,82 @@ +#include "kvm/kvm.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/virtio-console.h" + +#include "arm-common/gic.h" + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/sizes.h> + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) }, + { DEFINE_KVM_EXT(KVM_CAP_ONE_REG) }, + { DEFINE_KVM_EXT(KVM_CAP_ARM_PSCI) }, + { 0, 0 }, +}; 
+ +bool kvm__arch_cpu_supports_vm(void) +{ + /* The KVM capability check is enough. */ + return true; +} + +void kvm__init_ram(struct kvm *kvm) +{ + int err; + u64 phys_start, phys_size; + void *host_mem; + + phys_start = ARM_MEMORY_AREA; + phys_size = kvm->ram_size; + host_mem = kvm->ram_start; + + err = kvm__register_mem(kvm, phys_start, phys_size, host_mem); + if (err) + die("Failed to register %lld bytes of memory at physical " + "address 0x%llx [err %d]", phys_size, phys_start, err); + + kvm->arch.memory_guest_start = phys_start; +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size); +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + if (term_readable(0)) + virtio_console__inject_interrupt(kvm); +} + +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ +} + +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + /* + * Allocate guest memory. We must align out buffer to 64K to + * correlate with the maximum guest page size for virtio-mmio. + */ + kvm->ram_size = min(ram_size, (u64)ARM_MAX_MEMORY(kvm)); + kvm->arch.ram_alloc_size = kvm->ram_size + SZ_64K; + kvm->arch.ram_alloc_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, + kvm->arch.ram_alloc_size); + + if (kvm->arch.ram_alloc_start == MAP_FAILED) + die("Failed to map %lld bytes for guest memory (%d)", + kvm->arch.ram_alloc_size, errno); + + kvm->ram_start = (void *)ALIGN((unsigned long)kvm->arch.ram_alloc_start, + SZ_64K); + + madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size, + MADV_MERGEABLE); + + /* Initialise the virtual GIC. 
*/ + if (gic__init_irqchip(kvm)) + die("Failed to initialise virtual GIC"); +} diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c new file mode 100644 index 000000000000..d158acec63a1 --- /dev/null +++ b/tools/kvm/builtin-balloon.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-balloon.h> +#include <kvm/parse-options.h> +#include <kvm/kvm.h> +#include <kvm/kvm-ipc.h> + +static const char *instance_name; +static u64 inflate; +static u64 deflate; + +static const char * const balloon_usage[] = { + "lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]", + NULL +}; + +static const struct option balloon_options[] = { + OPT_GROUP("Instance options:"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_GROUP("Balloon options:"), + OPT_U64('i', "inflate", &inflate, "Amount to inflate (in MB)"), + OPT_U64('d', "deflate", &deflate, "Amount to deflate (in MB)"), + OPT_END(), +}; + +void kvm_balloon_help(void) +{ + usage_with_options(balloon_usage, balloon_options); +} + +static void parse_balloon_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, balloon_options, balloon_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_balloon_help(); + } +} + +int kvm_cmd_balloon(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + int amount; + + parse_balloon_options(argc, argv); + + if (inflate == 0 && deflate == 0) + kvm_balloon_help(); + + if (instance_name == NULL) + kvm_balloon_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + if (inflate) + amount = inflate; + else if (deflate) + amount = -deflate; + else + kvm_balloon_help(); + + r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON, + sizeof(amount), (u8 *)&amount); + + close(instance); + + if (r < 0) + return -1; + + return 0; 
+} diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c new file mode 100644 index 000000000000..4ae51d200374 --- /dev/null +++ b/tools/kvm/builtin-debug.c @@ -0,0 +1,110 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-debug.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> +#include <kvm/read-write.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#define BUFFER_SIZE 100 + +static bool all; +static int nmi = -1; +static bool dump; +static const char *instance_name; +static const char *sysrq; + +static const char * const debug_usage[] = { + "lkvm debug [--all] [-n name] [-d] [-m vcpu]", + NULL +}; + +static const struct option debug_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"), + OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"), + OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"), + OPT_GROUP("Instance options:"), + OPT_BOOLEAN('a', "all", &all, "Debug all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_debug_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, debug_options, debug_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_debug_help(); + } +} + +void kvm_debug_help(void) +{ + usage_with_options(debug_usage, debug_options); +} + +static int do_debug(const char *name, int sock) +{ + char buff[BUFFER_SIZE]; + struct debug_cmd_params cmd = {.dbg_type = 0}; + int r; + + if (dump) + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP; + + if (nmi != -1) { + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI; + cmd.cpu = nmi; + } + + if (sysrq) { + cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ; + cmd.sysrq = sysrq[0]; + } + + r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd); + if (r < 0) + return r; + + if (!dump) + return 0; + + do { + r = xread(sock, buff, 
BUFFER_SIZE); + if (r < 0) + return 0; + printf("%.*s", r, buff); + } while (r > 0); + + return 0; +} + +int kvm_cmd_debug(int argc, const char **argv, const char *prefix) +{ + parse_debug_options(argc, argv); + int instance; + int r; + + if (all) + return kvm__enumerate_instances(do_debug); + + if (instance_name == NULL) + kvm_debug_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_debug(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c new file mode 100644 index 000000000000..5970fb7484f6 --- /dev/null +++ b/tools/kvm/builtin-help.c @@ -0,0 +1,63 @@ +#include <stdio.h> +#include <string.h> + +/* user defined headers */ +#include <common-cmds.h> + +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-help.h> +#include <kvm/kvm.h> + + +const char kvm_usage_string[] = + "lkvm COMMAND [ARGS]"; + +const char kvm_more_info_string[] = + "See 'lkvm help COMMAND' for more information on a specific command."; + + +static void list_common_cmds_help(void) +{ + unsigned int i, longest = 0; + + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + if (longest < strlen(common_cmds[i].name)) + longest = strlen(common_cmds[i].name); + } + + puts(" The most commonly used lkvm commands are:"); + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + printf(" %-*s ", longest, common_cmds[i].name); + puts(common_cmds[i].help); + } +} + +static void kvm_help(void) +{ + printf("\n To start a simple non-privileged shell run '%s run'\n\n" + "usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string); + list_common_cmds_help(); + printf("\n %s\n\n", kvm_more_info_string); +} + + +static void help_cmd(const char *cmd) +{ + struct cmd_struct *p; + p = kvm_get_command(kvm_commands, cmd); + if (!p) + kvm_help(); + else if (p->help) + p->help(); +} + +int kvm_cmd_help(int argc, const char **argv, const char *prefix) +{ + if (!argv 
|| !*argv) { + kvm_help(); + return 0; + } + help_cmd(argv[0]); + return 0; +} diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c new file mode 100644 index 000000000000..9299f17b6f00 --- /dev/null +++ b/tools/kvm/builtin-list.c @@ -0,0 +1,149 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <dirent.h> +#include <stdio.h> +#include <string.h> +#include <signal.h> +#include <fcntl.h> + +static bool run; +static bool rootfs; + +static const char * const list_usage[] = { + "lkvm list", + NULL +}; + +static const struct option list_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('i', "run", &run, "List running instances"), + OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"), + OPT_END() +}; + +#define KVM_INSTANCE_RUNNING "running" +#define KVM_INSTANCE_PAUSED "paused" +#define KVM_INSTANCE_SHUTOFF "shut off" + +void kvm_list_help(void) +{ + usage_with_options(list_usage, list_options); +} + +static pid_t get_pid(int sock) +{ + pid_t pid; + int r; + + r = kvm_ipc__send(sock, KVM_IPC_PID); + if (r < 0) + return r; + + r = read(sock, &pid, sizeof(pid)); + if (r < 0) + return r; + + return pid; +} + +int get_vmstate(int sock) +{ + int vmstate; + int r; + + r = kvm_ipc__send(sock, KVM_IPC_VMSTATE); + if (r < 0) + return r; + + r = read(sock, &vmstate, sizeof(vmstate)); + if (r < 0) + return r; + + return vmstate; + +} + +static int print_guest(const char *name, int sock) +{ + pid_t pid; + int vmstate; + + pid = get_pid(sock); + vmstate = get_vmstate(sock); + + if ((int)pid < 0 || vmstate < 0) + return -1; + + if (vmstate == KVM_VMSTATE_PAUSED) + printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED); + else + printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING); + + return 0; +} + +static int kvm_list_running_instances(void) +{ + return kvm__enumerate_instances(print_guest); +} + +static int 
kvm_list_rootfs(void) +{ + DIR *dir; + struct dirent *dirent; + + dir = opendir(kvm__get_dir()); + if (dir == NULL) + return -1; + + while ((dirent = readdir(dir))) { + if (dirent->d_type == DT_DIR && + strcmp(dirent->d_name, ".") && + strcmp(dirent->d_name, "..")) + printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF); + } + + return 0; +} + +static void parse_setup_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, list_options, list_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_list_help(); + } +} + +int kvm_cmd_list(int argc, const char **argv, const char *prefix) +{ + int r; + + parse_setup_options(argc, argv); + + if (!run && !rootfs) + run = rootfs = true; + + printf("%6s %-20s %s\n", "PID", "NAME", "STATE"); + printf("------------------------------------\n"); + + if (run) { + r = kvm_list_running_instances(); + if (r < 0) + perror("Error listing instances"); + } + + if (rootfs) { + r = kvm_list_rootfs(); + if (r < 0) + perror("Error listing rootfs"); + } + + return 0; +} diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c new file mode 100644 index 000000000000..c08595a304d1 --- /dev/null +++ b/tools/kvm/builtin-pause.c @@ -0,0 +1,88 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-pause.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const pause_usage[] = { + "lkvm pause [--all] [-n name]", + NULL +}; + +static const struct option pause_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Pause all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_pause_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, 
argv, pause_options, pause_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_pause_help(); + } +} + +void kvm_pause_help(void) +{ + usage_with_options(pause_usage, pause_options); +} + +static int do_pause(const char *name, int sock) +{ + int r; + int vmstate; + + vmstate = get_vmstate(sock); + if (vmstate < 0) + return vmstate; + if (vmstate == KVM_VMSTATE_PAUSED) { + printf("Guest %s is already paused.\n", name); + return 0; + } + + r = kvm_ipc__send(sock, KVM_IPC_PAUSE); + if (r) + return r; + + printf("Guest %s paused\n", name); + + return 0; +} + +int kvm_cmd_pause(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_pause_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_pause); + + if (instance_name == NULL) + kvm_pause_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_pause(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c new file mode 100644 index 000000000000..0e954b405ee8 --- /dev/null +++ b/tools/kvm/builtin-resume.c @@ -0,0 +1,88 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-resume.h> +#include <kvm/builtin-list.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const resume_usage[] = { + "lkvm resume [--all] [-n name]", + NULL +}; + +static const struct option resume_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Resume all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_resume_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, resume_options, resume_usage, + 
PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_resume_help(); + } +} + +void kvm_resume_help(void) +{ + usage_with_options(resume_usage, resume_options); +} + +static int do_resume(const char *name, int sock) +{ + int r; + int vmstate; + + vmstate = get_vmstate(sock); + if (vmstate < 0) + return vmstate; + if (vmstate == KVM_VMSTATE_RUNNING) { + printf("Guest %s is still running.\n", name); + return 0; + } + + r = kvm_ipc__send(sock, KVM_IPC_RESUME); + if (r) + return r; + + printf("Guest %s resumed\n", name); + + return 0; +} + +int kvm_cmd_resume(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_resume_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_resume); + + if (instance_name == NULL) + kvm_resume_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_resume(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c new file mode 100644 index 000000000000..d0b876a78819 --- /dev/null +++ b/tools/kvm/builtin-run.c @@ -0,0 +1,702 @@ +#include "kvm/builtin-run.h" + +#include "kvm/builtin-setup.h" +#include "kvm/virtio-balloon.h" +#include "kvm/virtio-console.h" +#include "kvm/parse-options.h" +#include "kvm/8250-serial.h" +#include "kvm/framebuffer.h" +#include "kvm/disk-image.h" +#include "kvm/threadpool.h" +#include "kvm/virtio-scsi.h" +#include "kvm/virtio-blk.h" +#include "kvm/virtio-net.h" +#include "kvm/virtio-rng.h" +#include "kvm/ioeventfd.h" +#include "kvm/virtio-9p.h" +#include "kvm/barrier.h" +#include "kvm/kvm-cpu.h" +#include "kvm/ioport.h" +#include "kvm/symbol.h" +#include "kvm/i8042.h" +#include "kvm/mutex.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/strbuf.h" +#include "kvm/vesa.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/rtc.h" +#include "kvm/sdl.h" +#include "kvm/vnc.h" 
+#include "kvm/guest_compat.h" +#include "kvm/pci-shmem.h" +#include "kvm/kvm-ipc.h" +#include "kvm/builtin-debug.h" + +#include <linux/types.h> +#include <linux/err.h> + +#include <sys/utsname.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <termios.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> +#include <stdio.h> + +#define MB_SHIFT (20) +#define KB_SHIFT (10) +#define GB_SHIFT (30) + +__thread struct kvm_cpu *current_kvm_cpu; + +static int kvm_run_wrapper; + +bool do_debug_print = false; + +extern char _binary_guest_init_start; +extern char _binary_guest_init_size; + +static const char * const run_usage[] = { + "lkvm run [<options>] [<kernel image>]", + NULL +}; + +enum { + KVM_RUN_DEFAULT, + KVM_RUN_SANDBOX, +}; + +static int img_name_parser(const struct option *opt, const char *arg, int unset) +{ + char path[PATH_MAX]; + struct stat st; + + snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg); + + if ((stat(arg, &st) == 0 && S_ISDIR(st.st_mode)) || + (stat(path, &st) == 0 && S_ISDIR(st.st_mode))) + return virtio_9p_img_name_parser(opt, arg, unset); + return disk_img_name_parser(opt, arg, unset); +} + +void kvm_run_set_wrapper_sandbox(void) +{ + kvm_run_wrapper = KVM_RUN_SANDBOX; +} + +#ifndef OPT_ARCH_RUN +#define OPT_ARCH_RUN(...) 
+#endif + +#define BUILD_OPTIONS(name, cfg, kvm) \ + struct option name[] = { \ + OPT_GROUP("Basic options:"), \ + OPT_STRING('\0', "name", &(cfg)->guest_name, "guest name", \ + "A name for the guest"), \ + OPT_INTEGER('c', "cpus", &(cfg)->nrcpus, "Number of CPUs"), \ + OPT_U64('m', "mem", &(cfg)->ram_size, "Virtual machine memory" \ + " size in MiB."), \ + OPT_CALLBACK('\0', "shmem", NULL, \ + "[pci:]<addr>:<size>[:handle=<handle>][:create]", \ + "Share host shmem with guest via pci device", \ + shmem_parser, NULL), \ + OPT_CALLBACK('d', "disk", kvm, "image or rootfs_dir", "Disk " \ + " image or rootfs directory", img_name_parser, \ + kvm), \ + OPT_BOOLEAN('\0', "balloon", &(cfg)->balloon, "Enable virtio" \ + " balloon"), \ + OPT_BOOLEAN('\0', "vnc", &(cfg)->vnc, "Enable VNC framebuffer"),\ + OPT_BOOLEAN('\0', "sdl", &(cfg)->sdl, "Enable SDL framebuffer"),\ + OPT_BOOLEAN('\0', "rng", &(cfg)->virtio_rng, "Enable virtio" \ + " Random Number Generator"), \ + OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name", \ + "Enable virtio 9p to share files between host and" \ + " guest", virtio_9p_rootdir_parser, kvm), \ + OPT_STRING('\0', "console", &(cfg)->console, "serial, virtio or"\ + " hv", "Console to use"), \ + OPT_STRING('\0', "dev", &(cfg)->dev, "device_file", \ + "KVM device file"), \ + OPT_CALLBACK('\0', "tty", NULL, "tty id", \ + "Remap guest TTY into a pty on the host", \ + tty_parser, NULL), \ + OPT_STRING('\0', "sandbox", &(cfg)->sandbox, "script", \ + "Run this script when booting into custom" \ + " rootfs"), \ + OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path", \ + "Hugetlbfs path"), \ + \ + OPT_GROUP("Kernel options:"), \ + OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel", \ + "Kernel to boot in virtual machine"), \ + OPT_STRING('i', "initrd", &(cfg)->initrd_filename, "initrd", \ + "Initial RAM disk image"), \ + OPT_STRING('p', "params", &(cfg)->kernel_cmdline, "params", \ + "Kernel command line arguments"), \ + OPT_STRING('f', 
"firmware", &(cfg)->firmware_filename, "firmware",\ + "Firmware image to boot in virtual machine"), \ + \ + OPT_GROUP("Networking options:"), \ + OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params", \ + "Create a new guest NIC", \ + netdev_parser, NULL, kvm), \ + OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \ + " DHCP in rootfs mode"), \ + \ + OPT_GROUP("Debug options:"), \ + OPT_BOOLEAN('\0', "debug", &do_debug_print, \ + "Enable debug messages"), \ + OPT_BOOLEAN('\0', "debug-single-step", &(cfg)->single_step, \ + "Enable single stepping"), \ + OPT_BOOLEAN('\0', "debug-ioport", &(cfg)->ioport_debug, \ + "Enable ioport debugging"), \ + OPT_BOOLEAN('\0', "debug-mmio", &(cfg)->mmio_debug, \ + "Enable MMIO debugging"), \ + OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay, \ + "Delay IO by millisecond"), \ + \ + OPT_ARCH(RUN, cfg) \ + OPT_END() \ + }; + +static void handle_sigalrm(int sig, siginfo_t *si, void *uc) +{ + struct kvm *kvm = si->si_value.sival_ptr; + + kvm__arch_periodic_poll(kvm); +} + +static void *kvm_cpu_thread(void *arg) +{ + char name[16]; + + current_kvm_cpu = arg; + + sprintf(name, "kvm-vcpu-%lu", current_kvm_cpu->cpu_id); + kvm__set_thread_name(name); + + if (kvm_cpu__start(current_kvm_cpu)) + goto panic_kvm; + + return (void *) (intptr_t) 0; + +panic_kvm: + fprintf(stderr, "KVM exit reason: %u (\"%s\")\n", + current_kvm_cpu->kvm_run->exit_reason, + kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]); + if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN) + fprintf(stderr, "KVM exit code: 0x%Lu\n", + current_kvm_cpu->kvm_run->hw.hardware_exit_reason); + + kvm_cpu__set_debug_fd(STDOUT_FILENO); + kvm_cpu__show_registers(current_kvm_cpu); + kvm_cpu__show_code(current_kvm_cpu); + kvm_cpu__show_page_tables(current_kvm_cpu); + + return (void *) (intptr_t) 1; +} + +static char kernel[PATH_MAX]; + +static const char *host_kernels[] = { + "/boot/vmlinuz", + "/boot/bzImage", + NULL +}; + +static const char 
*default_kernels[] = { + "./bzImage", + "arch/" BUILD_ARCH "/boot/bzImage", + "../../arch/" BUILD_ARCH "/boot/bzImage", + NULL +}; + +static const char *default_vmlinux[] = { + "vmlinux", + "../../../vmlinux", + "../../vmlinux", + NULL +}; + +static void kernel_usage_with_options(void) +{ + const char **k; + struct utsname uts; + + fprintf(stderr, "Fatal: could not find default kernel image in:\n"); + k = &default_kernels[0]; + while (*k) { + fprintf(stderr, "\t%s\n", *k); + k++; + } + + if (uname(&uts) < 0) + return; + + k = &host_kernels[0]; + while (*k) { + if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0) + return; + fprintf(stderr, "\t%s\n", kernel); + k++; + } + fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n", + KVM_BINARY_NAME); +} + +static u64 host_ram_size(void) +{ + long page_size; + long nr_pages; + + nr_pages = sysconf(_SC_PHYS_PAGES); + if (nr_pages < 0) { + pr_warning("sysconf(_SC_PHYS_PAGES) failed"); + return 0; + } + + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size < 0) { + pr_warning("sysconf(_SC_PAGE_SIZE) failed"); + return 0; + } + + return (nr_pages * page_size) >> MB_SHIFT; +} + +/* + * If user didn't specify how much memory it wants to allocate for the guest, + * avoid filling the whole host RAM. 
+ */ +#define RAM_SIZE_RATIO 0.8 + +static u64 get_ram_size(int nr_cpus) +{ + u64 available; + u64 ram_size; + + ram_size = 64 * (nr_cpus + 3); + + available = host_ram_size() * RAM_SIZE_RATIO; + if (!available) + available = MIN_RAM_SIZE_MB; + + if (ram_size > available) + ram_size = available; + + return ram_size; +} + +static const char *find_kernel(void) +{ + const char **k; + struct stat st; + struct utsname uts; + + k = &default_kernels[0]; + while (*k) { + if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) { + k++; + continue; + } + strncpy(kernel, *k, PATH_MAX); + return kernel; + } + + if (uname(&uts) < 0) + return NULL; + + k = &host_kernels[0]; + while (*k) { + if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0) + return NULL; + + if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) { + k++; + continue; + } + return kernel; + + } + return NULL; +} + +static const char *find_vmlinux(void) +{ + const char **vmlinux; + + vmlinux = &default_vmlinux[0]; + while (*vmlinux) { + struct stat st; + + if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) { + vmlinux++; + continue; + } + return *vmlinux; + } + return NULL; +} + +void kvm_run_help(void) +{ + struct kvm *kvm = NULL; + + BUILD_OPTIONS(options, &kvm->cfg, kvm); + usage_with_options(run_usage, options); +} + +static int kvm_setup_guest_init(struct kvm *kvm) +{ + const char *rootfs = kvm->cfg.custom_rootfs_name; + char tmp[PATH_MAX]; + size_t size; + int fd, ret; + char *data; + + /* Setup /virt/init */ + size = (size_t)&_binary_guest_init_size; + data = (char *)&_binary_guest_init_start; + snprintf(tmp, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), rootfs); + remove(tmp); + fd = open(tmp, O_CREAT | O_WRONLY, 0755); + if (fd < 0) + die("Fail to setup %s", tmp); + ret = xwrite(fd, data, size); + if (ret < 0) + die("Fail to setup %s", tmp); + close(fd); + + return 0; +} + +static int kvm_run_set_sandbox(struct kvm *kvm) +{ + const char *guestfs_name = kvm->cfg.custom_rootfs_name; + char path[PATH_MAX], 
script[PATH_MAX], *tmp; + + snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name); + + remove(path); + + if (kvm->cfg.sandbox == NULL) + return 0; + + tmp = realpath(kvm->cfg.sandbox, NULL); + if (tmp == NULL) + return -ENOMEM; + + snprintf(script, PATH_MAX, "/host/%s", tmp); + free(tmp); + + return symlink(script, path); +} + +static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg) +{ + const char *single_quote; + + if (!*arg) { /* zero length string */ + if (write(fd, "''", 2) <= 0) + die("Failed writing sandbox script"); + return; + } + + while (*arg) { + single_quote = strchrnul(arg, '\''); + + /* write non-single-quote string as #('string') */ + if (arg != single_quote) { + if (write(fd, "'", 1) <= 0 || + write(fd, arg, single_quote - arg) <= 0 || + write(fd, "'", 1) <= 0) + die("Failed writing sandbox script"); + } + + /* write single quote as #("'") */ + if (*single_quote) { + if (write(fd, "\"'\"", 3) <= 0) + die("Failed writing sandbox script"); + } else + break; + + arg = single_quote + 1; + } +} + +static void resolve_program(const char *src, char *dst, size_t len) +{ + struct stat st; + int err; + + err = stat(src, &st); + + if (!err && S_ISREG(st.st_mode)) { + char resolved_path[PATH_MAX]; + + if (!realpath(src, resolved_path)) + die("Unable to resolve program %s: %s\n", src, strerror(errno)); + + snprintf(dst, len, "/host%s", resolved_path); + } else + strncpy(dst, src, len); +} + +static void kvm_run_write_sandbox_cmd(struct kvm *kvm, const char **argv, int argc) +{ + const char script_hdr[] = "#! 
/bin/bash\n\n"; + char program[PATH_MAX]; + int fd; + + remove(kvm->cfg.sandbox); + + fd = open(kvm->cfg.sandbox, O_RDWR | O_CREAT, 0777); + if (fd < 0) + die("Failed creating sandbox script"); + + if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0) + die("Failed writing sandbox script"); + + resolve_program(argv[0], program, PATH_MAX); + kvm_write_sandbox_cmd_exactly(fd, program); + + argv++; + argc--; + + while (argc) { + if (write(fd, " ", 1) <= 0) + die("Failed writing sandbox script"); + + kvm_write_sandbox_cmd_exactly(fd, argv[0]); + argv++; + argc--; + } + if (write(fd, "\n", 1) <= 0) + die("Failed writing sandbox script"); + + close(fd); +} + +static struct kvm *kvm_cmd_run_init(int argc, const char **argv) +{ + static char real_cmdline[2048], default_name[20]; + unsigned int nr_online_cpus; + struct sigaction sa; + struct kvm *kvm = kvm__new(); + + if (IS_ERR(kvm)) + return kvm; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = handle_sigalrm; + sigemptyset(&sa.sa_mask); + sigaction(SIGALRM, &sa, NULL); + + nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN); + kvm->cfg.custom_rootfs_name = "default"; + + while (argc != 0) { + BUILD_OPTIONS(options, &kvm->cfg, kvm); + argc = parse_options(argc, argv, options, run_usage, + PARSE_OPT_STOP_AT_NON_OPTION | + PARSE_OPT_KEEP_DASHDASH); + if (argc != 0) { + /* Cusrom options, should have been handled elsewhere */ + if (strcmp(argv[0], "--") == 0) { + if (kvm_run_wrapper == KVM_RUN_SANDBOX) { + kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME; + kvm_run_write_sandbox_cmd(kvm, argv+1, argc-1); + break; + } + } + + if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kvm->cfg.kernel_filename) || + (kvm_run_wrapper == KVM_RUN_SANDBOX && kvm->cfg.sandbox)) { + fprintf(stderr, "Cannot handle parameter: " + "%s\n", argv[0]); + usage_with_options(run_usage, options); + free(kvm); + return ERR_PTR(-EINVAL); + } + if (kvm_run_wrapper == KVM_RUN_SANDBOX) { + /* + * first unhandled parameter is treated as + * sandbox command + */ + 
kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME; + kvm_run_write_sandbox_cmd(kvm, argv, argc); + } else { + /* + * first unhandled parameter is treated as a kernel + * image + */ + kvm->cfg.kernel_filename = argv[0]; + } + argv++; + argc--; + } + + } + + kvm->nr_disks = kvm->cfg.image_count; + + if (!kvm->cfg.kernel_filename) + kvm->cfg.kernel_filename = find_kernel(); + + if (!kvm->cfg.kernel_filename) { + kernel_usage_with_options(); + return ERR_PTR(-EINVAL); + } + + kvm->cfg.vmlinux_filename = find_vmlinux(); + kvm->vmlinux = kvm->cfg.vmlinux_filename; + + if (kvm->cfg.nrcpus == 0) + kvm->cfg.nrcpus = nr_online_cpus; + + if (!kvm->cfg.ram_size) + kvm->cfg.ram_size = get_ram_size(kvm->cfg.nrcpus); + + if (kvm->cfg.ram_size < MIN_RAM_SIZE_MB) + die("Not enough memory specified: %lluMB (min %lluMB)", kvm->cfg.ram_size, MIN_RAM_SIZE_MB); + + if (kvm->cfg.ram_size > host_ram_size()) + pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB", kvm->cfg.ram_size, host_ram_size()); + + kvm->cfg.ram_size <<= MB_SHIFT; + + if (!kvm->cfg.dev) + kvm->cfg.dev = DEFAULT_KVM_DEV; + + if (!kvm->cfg.console) + kvm->cfg.console = DEFAULT_CONSOLE; + + if (!strncmp(kvm->cfg.console, "virtio", 6)) + kvm->cfg.active_console = CONSOLE_VIRTIO; + else if (!strncmp(kvm->cfg.console, "serial", 6)) + kvm->cfg.active_console = CONSOLE_8250; + else if (!strncmp(kvm->cfg.console, "hv", 2)) + kvm->cfg.active_console = CONSOLE_HV; + else + pr_warning("No console!"); + + if (!kvm->cfg.host_ip) + kvm->cfg.host_ip = DEFAULT_HOST_ADDR; + + if (!kvm->cfg.guest_ip) + kvm->cfg.guest_ip = DEFAULT_GUEST_ADDR; + + if (!kvm->cfg.guest_mac) + kvm->cfg.guest_mac = DEFAULT_GUEST_MAC; + + if (!kvm->cfg.host_mac) + kvm->cfg.host_mac = DEFAULT_HOST_MAC; + + if (!kvm->cfg.script) + kvm->cfg.script = DEFAULT_SCRIPT; + + if (!kvm->cfg.network) + kvm->cfg.network = DEFAULT_NETWORK; + + memset(real_cmdline, 0, sizeof(real_cmdline)); + kvm__arch_set_cmdline(real_cmdline, kvm->cfg.vnc || kvm->cfg.sdl); 
+ + if (strlen(real_cmdline) > 0) + strcat(real_cmdline, " "); + + if (kvm->cfg.kernel_cmdline) + strlcat(real_cmdline, kvm->cfg.kernel_cmdline, sizeof(real_cmdline)); + + if (!kvm->cfg.guest_name) { + if (kvm->cfg.custom_rootfs) { + kvm->cfg.guest_name = kvm->cfg.custom_rootfs_name; + } else { + sprintf(default_name, "guest-%u", getpid()); + kvm->cfg.guest_name = default_name; + } + } + + if (!kvm->cfg.using_rootfs && !kvm->cfg.disk_image[0].filename && !kvm->cfg.initrd_filename) { + char tmp[PATH_MAX]; + + kvm_setup_create_new(kvm->cfg.custom_rootfs_name); + kvm_setup_resolv(kvm->cfg.custom_rootfs_name); + + snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default"); + if (virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + if (virtio_9p__register(kvm, "/", "hostfs") < 0) + die("Unable to initialize virtio 9p"); + kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1; + } + + if (kvm->cfg.using_rootfs) { + strcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p"); + if (kvm->cfg.custom_rootfs) { + kvm_run_set_sandbox(kvm); + + strcat(real_cmdline, " init=/virt/init"); + + if (!kvm->cfg.no_dhcp) + strcat(real_cmdline, " ip=dhcp"); + if (kvm_setup_guest_init(kvm)) + die("Failed to setup init for guest."); + } + } else if (!strstr(real_cmdline, "root=")) { + strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline)); + } + + kvm->cfg.real_cmdline = real_cmdline; + + printf(" # %s run -k %s -m %Lu -c %d --name %s\n", KVM_BINARY_NAME, + kvm->cfg.kernel_filename, kvm->cfg.ram_size / 1024 / 1024, kvm->cfg.nrcpus, kvm->cfg.guest_name); + + if (init_list__init(kvm) < 0) + die ("Initialisation failed"); + + return kvm; +} + +static int kvm_cmd_run_work(struct kvm *kvm) +{ + int i; + void *ret = NULL; + + for (i = 0; i < kvm->nrcpus; i++) { + if (pthread_create(&kvm->cpus[i]->thread, NULL, kvm_cpu_thread, kvm->cpus[i]) != 0) + die("unable to create KVM VCPU thread"); + } + + /* Only VCPU #0 
is going to exit by itself when shutting down */ + return pthread_join(kvm->cpus[0]->thread, &ret); +} + +static void kvm_cmd_run_exit(struct kvm *kvm, int guest_ret) +{ + compat__print_all_messages(); + + init_list__exit(kvm); + + if (guest_ret == 0) + printf("\n # KVM session ended normally.\n"); +} + +int kvm_cmd_run(int argc, const char **argv, const char *prefix) +{ + int ret = -EFAULT; + struct kvm *kvm; + + kvm = kvm_cmd_run_init(argc, argv); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + + ret = kvm_cmd_run_work(kvm); + kvm_cmd_run_exit(kvm, ret); + + return ret; +} diff --git a/tools/kvm/builtin-sandbox.c b/tools/kvm/builtin-sandbox.c new file mode 100644 index 000000000000..433f5361e8a8 --- /dev/null +++ b/tools/kvm/builtin-sandbox.c @@ -0,0 +1,9 @@ +#include "kvm/builtin-sandbox.h" +#include "kvm/builtin-run.h" + +int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix) +{ + kvm_run_set_wrapper_sandbox(); + + return kvm_cmd_run(argc, argv, prefix); +} diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c new file mode 100644 index 000000000000..8b45c5645ad4 --- /dev/null +++ b/tools/kvm/builtin-setup.c @@ -0,0 +1,258 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-setup.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/read-write.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <limits.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/mman.h> +#include <fcntl.h> + +extern char _binary_guest_init_start; +extern char _binary_guest_init_size; + +static const char *instance_name; + +static const char * const setup_usage[] = { + "lkvm setup [name]", + NULL +}; + +static const struct option setup_options[] = { + OPT_END() +}; + +static void parse_setup_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, setup_options, setup_usage, + PARSE_OPT_STOP_AT_NON_OPTION); 
+ if (argc != 0 && instance_name) + kvm_setup_help(); + else + instance_name = argv[0]; + argv++; + argc--; + } +} + +void kvm_setup_help(void) +{ + printf("\n%s setup creates a new rootfs under %s.\n" + "This can be used later by the '-d' parameter of '%s run'.\n", + KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME); + usage_with_options(setup_usage, setup_options); +} + +static int copy_file(const char *from, const char *to) +{ + int in_fd, out_fd; + void *src, *dst; + struct stat st; + int err = -1; + + in_fd = open(from, O_RDONLY); + if (in_fd < 0) + return err; + + if (fstat(in_fd, &st) < 0) + goto error_close_in; + + out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)); + if (out_fd < 0) + goto error_close_in; + + src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0); + if (src == MAP_FAILED) + goto error_close_out; + + if (ftruncate(out_fd, st.st_size) < 0) + goto error_munmap_src; + + dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0); + if (dst == MAP_FAILED) + goto error_munmap_src; + + memcpy(dst, src, st.st_size); + + if (fsync(out_fd) < 0) + goto error_munmap_dst; + + err = 0; + +error_munmap_dst: + munmap(dst, st.st_size); +error_munmap_src: + munmap(src, st.st_size); +error_close_out: + close(out_fd); +error_close_in: + close(in_fd); + + return err; +} + +static const char *guestfs_dirs[] = { + "/dev", + "/etc", + "/home", + "/host", + "/proc", + "/root", + "/sys", + "/tmp", + "/var", + "/var/lib", + "/virt", + "/virt/home", +}; + +static const char *guestfs_symlinks[] = { + "/bin", + "/lib", + "/lib64", + "/sbin", + "/usr", + "/etc/ld.so.conf", +}; + +static int copy_init(const char *guestfs_name) +{ + char path[PATH_MAX]; + size_t size; + int fd, ret; + char *data; + + size = (size_t)&_binary_guest_init_size; + data = (char *)&_binary_guest_init_start; + snprintf(path, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), guestfs_name); + remove(path); + fd = open(path, O_CREAT | O_WRONLY, 
0755); + if (fd < 0) + die("Fail to setup %s", path); + ret = xwrite(fd, data, size); + if (ret < 0) + die("Fail to setup %s", path); + close(fd); + + return 0; +} + +static int copy_passwd(const char *guestfs_name) +{ + char path[PATH_MAX]; + FILE *file; + int ret; + + snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name); + + file = fopen(path, "w"); + if (!file) + return -1; + + ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n"); + if (ret > 0) + ret = 0; + + fclose(file); + + return ret; +} + +static int make_guestfs_symlink(const char *guestfs_name, const char *path) +{ + char target[PATH_MAX]; + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path); + + snprintf(target, PATH_MAX, "/host%s", path); + + return symlink(target, name); +} + +static int make_dir(const char *dir) +{ + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir); + + return mkdir(name, 0777); +} + +static void make_guestfs_dir(const char *guestfs_name, const char *dir) +{ + char name[PATH_MAX]; + + snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir); + + make_dir(name); +} + +void kvm_setup_resolv(const char *guestfs_name) +{ + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name); + + copy_file("/etc/resolv.conf", path); +} + +static int do_setup(const char *guestfs_name) +{ + unsigned int i; + int ret; + + ret = make_dir(guestfs_name); + if (ret < 0) + return ret; + + for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++) + make_guestfs_dir(guestfs_name, guestfs_dirs[i]); + + for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++) { + make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]); + } + + ret = copy_init(guestfs_name); + if (ret < 0) + return ret; + + return copy_passwd(guestfs_name); +} + +int kvm_setup_create_new(const char *guestfs_name) +{ + return do_setup(guestfs_name); +} + +int kvm_cmd_setup(int argc, const char **argv, const char *prefix) +{ + 
int r; + + parse_setup_options(argc, argv); + + if (instance_name == NULL) + kvm_setup_help(); + + r = do_setup(instance_name); + if (r == 0) + printf("A new rootfs '%s' has been created in '%s%s'.\n\n" + "You can now start it by running the following command:\n\n" + " %s run -d %s\n", + instance_name, kvm__get_dir(), instance_name, + KVM_BINARY_NAME,instance_name); + else + printf("Unable to create rootfs in %s%s: %s\n", + kvm__get_dir(), instance_name, strerror(errno)); + + return r; +} diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c new file mode 100644 index 000000000000..ffd72e80ba16 --- /dev/null +++ b/tools/kvm/builtin-stat.c @@ -0,0 +1,127 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-stat.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + +#include <sys/select.h> +#include <stdio.h> +#include <string.h> +#include <signal.h> + +#include <linux/virtio_balloon.h> + +static bool mem; +static bool all; +static const char *instance_name; + +static const char * const stat_usage[] = { + "lkvm stat [command] [--all] [-n name]", + NULL +}; + +static const struct option stat_options[] = { + OPT_GROUP("Commands options:"), + OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"), + OPT_GROUP("Instance options:"), + OPT_BOOLEAN('a', "all", &all, "All instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_stat_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, stat_options, stat_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_stat_help(); + } +} + +void kvm_stat_help(void) +{ + usage_with_options(stat_usage, stat_options); +} + +static int do_memstat(const char *name, int sock) +{ + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + fd_set fdset; + struct timeval t = { .tv_sec = 1 }; + int r; + u8 i; + + FD_ZERO(&fdset); + FD_SET(sock, &fdset); + r = 
kvm_ipc__send(sock, KVM_IPC_STAT); + if (r < 0) + return r; + + r = select(1, &fdset, NULL, NULL, &t); + if (r < 0) { + pr_err("Could not retrieve mem stats from %s", name); + return r; + } + r = read(sock, &stats, sizeof(stats)); + if (r < 0) + return r; + + printf("\n\n\t*** Guest memory statistics ***\n\n"); + for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { + switch (stats[i].tag) { + case VIRTIO_BALLOON_S_SWAP_IN: + printf("The amount of memory that has been swapped in (in bytes):"); + break; + case VIRTIO_BALLOON_S_SWAP_OUT: + printf("The amount of memory that has been swapped out to disk (in bytes):"); + break; + case VIRTIO_BALLOON_S_MAJFLT: + printf("The number of major page faults that have occurred:"); + break; + case VIRTIO_BALLOON_S_MINFLT: + printf("The number of minor page faults that have occurred:"); + break; + case VIRTIO_BALLOON_S_MEMFREE: + printf("The amount of memory not being used for any purpose (in bytes):"); + break; + case VIRTIO_BALLOON_S_MEMTOT: + printf("The total amount of memory available (in bytes):"); + break; + } + printf("%llu\n", stats[i].val); + } + printf("\n"); + + return 0; +} + +int kvm_cmd_stat(int argc, const char **argv, const char *prefix) +{ + int instance; + int r = 0; + + parse_stat_options(argc, argv); + + if (!mem) + usage_with_options(stat_usage, stat_options); + + if (mem && all) + return kvm__enumerate_instances(do_memstat); + + if (instance_name == NULL) + kvm_stat_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + if (mem) + r = do_memstat(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c new file mode 100644 index 000000000000..6067630568df --- /dev/null +++ b/tools/kvm/builtin-stop.c @@ -0,0 +1,70 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-stop.h> +#include <kvm/kvm.h> +#include <kvm/parse-options.h> +#include <kvm/kvm-ipc.h> + 
+#include <stdio.h> +#include <string.h> +#include <signal.h> + +static bool all; +static const char *instance_name; + +static const char * const stop_usage[] = { + "lkvm stop [--all] [-n name]", + NULL +}; + +static const struct option stop_options[] = { + OPT_GROUP("General options:"), + OPT_BOOLEAN('a', "all", &all, "Stop all instances"), + OPT_STRING('n', "name", &instance_name, "name", "Instance name"), + OPT_END() +}; + +static void parse_stop_options(int argc, const char **argv) +{ + while (argc != 0) { + argc = parse_options(argc, argv, stop_options, stop_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc != 0) + kvm_stop_help(); + } +} + +void kvm_stop_help(void) +{ + usage_with_options(stop_usage, stop_options); +} + +static int do_stop(const char *name, int sock) +{ + return kvm_ipc__send(sock, KVM_IPC_STOP); +} + +int kvm_cmd_stop(int argc, const char **argv, const char *prefix) +{ + int instance; + int r; + + parse_stop_options(argc, argv); + + if (all) + return kvm__enumerate_instances(do_stop); + + if (instance_name == NULL) + kvm_stop_help(); + + instance = kvm__get_sock_by_instance(instance_name); + + if (instance <= 0) + die("Failed locating instance"); + + r = do_stop(instance_name, instance); + + close(instance); + + return r; +} diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c new file mode 100644 index 000000000000..b8bb8597b97d --- /dev/null +++ b/tools/kvm/builtin-version.c @@ -0,0 +1,15 @@ +#include <kvm/util.h> +#include <kvm/kvm-cmd.h> +#include <kvm/builtin-version.h> +#include <kvm/kvm.h> + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +int kvm_cmd_version(int argc, const char **argv, const char *prefix) +{ + printf("kvm tool %s\n", KVMTOOLS_VERSION); + + return 0; +} diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h new file mode 100644 index 000000000000..d93e48010b61 --- /dev/null +++ b/tools/kvm/code16gcc.h @@ -0,0 +1,15 @@ +/* + * code16gcc.h + * + * This file is -include'd when 
compiling 16-bit C code. + * Note: this asm() needs to be emitted before gcc emits any code. + * Depending on gcc version, this requires -fno-unit-at-a-time or + * -fno-toplevel-reorder. + * + * Hopefully gcc will eventually have a real -m16 option so we can + * drop this hack long term. + */ + +#ifndef __ASSEMBLY__ +asm(".code16gcc"); +#endif diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt new file mode 100644 index 000000000000..d93597dc551d --- /dev/null +++ b/tools/kvm/command-list.txt @@ -0,0 +1,15 @@ +# +# List of known perf commands. +# command name category [deprecated] [common] +# +lkvm-run mainporcelain common +lkvm-setup mainporcelain common +lkvm-pause common +lkvm-resume common +lkvm-version common +lkvm-list common +lkvm-debug common +lkvm-balloon common +lkvm-stop common +lkvm-stat common +lkvm-sandbox common diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak new file mode 100644 index 000000000000..4a81f562903b --- /dev/null +++ b/tools/kvm/config/feature-tests.mak @@ -0,0 +1,177 @@ +define SOURCE_HELLO +#include <stdio.h> +int main(void) +{ + return puts(\"hi\"); +} +endef + +ifndef NO_DWARF +define SOURCE_DWARF +#include <dwarf.h> +#include <elfutils/libdw.h> +#include <elfutils/version.h> +#ifndef _ELFUTILS_PREREQ +#error +#endif + +int main(void) +{ + Dwarf *dbg = dwarf_begin(0, DWARF_C_READ); + return (long)dbg; +} +endef +endif + +define SOURCE_LIBELF +#include <libelf.h> + +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ, 0); + return (long)elf; +} +endef + +define SOURCE_GLIBC +#include <gnu/libc-version.h> + +int main(void) +{ + const char *version = gnu_get_libc_version(); + return (long)version; +} +endef + +define SOURCE_ELF_MMAP +#include <libelf.h> +int main(void) +{ + Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0); + return (long)elf; +} +endef + +ifndef NO_NEWT +define SOURCE_NEWT +#include <newt.h> + +int main(void) +{ + newtInit(); + newtCls(); + return newtFinished(); +} 
+endef +endif + +ifndef NO_LIBPERL +define SOURCE_PERL_EMBED +#include <EXTERN.h> +#include <perl.h> + +int main(void) +{ +perl_alloc(); +return 0; +} +endef +endif + +ifndef NO_LIBPYTHON +define SOURCE_PYTHON_VERSION +#include <Python.h> +#if PY_VERSION_HEX >= 0x03000000 + #error +#endif +int main(void){} +endef +define SOURCE_PYTHON_EMBED +#include <Python.h> +int main(void) +{ + Py_Initialize(); + return 0; +} +endef +endif + +define SOURCE_BFD +#include <bfd.h> + +int main(void) +{ + bfd_demangle(0, 0, 0); + return 0; +} +endef + +define SOURCE_CPLUS_DEMANGLE +extern char *cplus_demangle(const char *, int); + +int main(void) +{ + cplus_demangle(0, 0); + return 0; +} +endef + +define SOURCE_STRLCPY +#include <stdlib.h> +extern size_t strlcpy(char *dest, const char *src, size_t size); + +int main(void) +{ + strlcpy(NULL, NULL, 0); + return 0; +} +endef + +define SOURCE_VNCSERVER +#include <rfb/rfb.h> + +int main(void) +{ + rfbIsActive((void *)0); + return 0; +} +endef + +define SOURCE_SDL +#include <SDL/SDL.h> + +int main(void) +{ + SDL_Init(SDL_INIT_VIDEO); + return 0; +} +endef + +define SOURCE_ZLIB +#include <zlib.h> + +int main(void) +{ + inflateInit2(NULL, 0); + return 0; +} +endef + +define SOURCE_AIO +#include <libaio.h> + +int main(void) +{ + io_setup(0, NULL); + return 0; +} +endef + +define SOURCE_STATIC +#include <stdlib.h> + +int main(void) +{ + return 0; +} +endef diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak new file mode 100644 index 000000000000..a70963b33b0f --- /dev/null +++ b/tools/kvm/config/utilities.mak @@ -0,0 +1,196 @@ +# This allows us to work with the newline character: +define newline + + +endef +newline := $(newline) + +# nl-escape +# +# Usage: escape = $(call nl-escape[,escape]) +# +# This is used as the common way to specify +# what should replace a newline when escaping +# newlines; the default is a bizarre string. 
+# +nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n) + +# escape-nl +# +# Usage: escaped-text = $(call escape-nl,text[,escape]) +# +# GNU make's $(shell ...) function converts to a +# single space each newline character in the output +# produced during the expansion; this may not be +# desirable. +# +# The only solution is to change each newline into +# something that won't be converted, so that the +# information can be recovered later with +# $(call unescape-nl...) +# +escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1)) + +# unescape-nl +# +# Usage: text = $(call unescape-nl,escaped-text[,escape]) +# +# See escape-nl. +# +unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1)) + +# shell-escape-nl +# +# Usage: $(shell some-command | $(call shell-escape-nl[,escape])) +# +# Use this to escape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as a string constant +# in an `awk' program that is delimited by shell +# single-quotes, so be wary of the characters +# that are chosen. +# +define shell-escape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}' +endef + +# shell-unescape-nl +# +# Usage: $(shell some-command | $(call shell-unescape-nl[,escape])) +# +# Use this to unescape newlines from within a shell call; +# the default escape is a bizarre string. +# +# NOTE: The escape is used directly as an extended regular +# expression constant in an `awk' program that is +# delimited by shell single-quotes, so be wary +# of the characters that are chosen. +# +# (The bash shell has a bug where `{gsub(...),...}' is +# misinterpreted as a brace expansion; this can be +# overcome by putting a space between `{' and `gsub'). 
+# +define shell-unescape-nl +awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }' +endef + +# escape-for-shell-sq +# +# Usage: embeddable-text = $(call escape-for-shell-sq,text) +# +# This function produces text that is suitable for +# embedding in a shell string that is delimited by +# single-quotes. +# +escape-for-shell-sq = $(subst ','\'',$(1)) + +# shell-sq +# +# Usage: single-quoted-and-escaped-text = $(call shell-sq,text) +# +shell-sq = '$(escape-for-shell-sq)' + +# shell-wordify +# +# Usage: wordified-text = $(call shell-wordify,text) +# +# For instance: +# +# |define text +# |hello +# |world +# |endef +# | +# |target: +# | echo $(call shell-wordify,$(text)) +# +# At least GNU make gets confused by expanding a newline +# within the context of a command line of a makefile rule +# (this is in constrast to a `$(shell ...)' function call, +# which can handle it just fine). +# +# This function avoids the problem by producing a string +# that works as a shell word, regardless of whether or +# not it contains a newline. +# +# If the text to be wordified contains a newline, then +# an intrictate shell command substitution is constructed +# to render the text as a single line; when the shell +# processes the resulting escaped text, it transforms +# it into the original unescaped text. +# +# If the text does not contain a newline, then this function +# produces the same results as the `$(shell-sq)' function. +# +shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq)) +define _sw-esc-nl +"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))" +endef + +# is-absolute +# +# Usage: bool-value = $(call is-absolute,path) +# +is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y) + +# lookup +# +# Usage: absolute-executable-path-or-empty = $(call lookup,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). 
+# +lookup = $(call unescape-nl,$(shell sh -c $(_l-sh))) +_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,)) + +# is-executable +# +# Usage: bool-value = $(call is-executable,path) +# +# (It's necessary to use `sh -c' because GNU make messes up by +# trying too hard and getting things wrong). +# +is-executable = $(call _is-executable-helper,$(shell-sq)) +_is-executable-helper = $(shell sh -c $(_is-executable-sh)) +_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y) + +# get-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable,path) +# +# The goal is to get an absolute path for an executable; +# the `command -v' is defined by POSIX, but it's not +# necessarily very portable, so it's only used if +# relative path resolution is requested, as determined +# by the presence of a leading `/'. +# +get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup))) +_ge-abspath = $(if $(is-executable),$(1)) + +# get-supplied-or-default-executable +# +# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default) +# +define get-executable-or-default +$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) +endef +_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) +_gea_warn = $(warning The path '$(1)' is not executable.) 
+_gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) + +# try-cc +# Usage: option = $(call try-cc, source-to-build, cc-options) +try-cc = $(shell sh -c \ + 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ + echo "$(1)" | \ + $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \ + rm -f "$$TMP"') + +# try-build +# Usage: option = $(call try-build, source-to-build, cc-options, link-options) +try-build = $(shell sh -c \ + 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \ + echo "$(1)" | \ + $(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y; \ + rm -f "$$TMP"') diff --git a/tools/kvm/devices.c b/tools/kvm/devices.c new file mode 100644 index 000000000000..9f1941d8f7c4 --- /dev/null +++ b/tools/kvm/devices.c @@ -0,0 +1,86 @@ +#include "kvm/devices.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <linux/rbtree.h> + +struct device_bus { + struct rb_root root; + int dev_num; +}; + +static struct device_bus device_trees[DEVICE_BUS_MAX] = { + [0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 }, +}; + +int device__register(struct device_header *dev) +{ + struct device_bus *bus; + struct rb_node **node, *parent = NULL; + + if (dev->bus_type >= DEVICE_BUS_MAX) { + pr_warning("Ignoring device registration on unknown bus %d\n", + dev->bus_type); + return -EINVAL; + } + + bus = &device_trees[dev->bus_type]; + dev->dev_num = bus->dev_num++; + + node = &bus->root.rb_node; + while (*node) { + int num = rb_entry(*node, struct device_header, node)->dev_num; + int result = dev->dev_num - num; + + if (result < 0) + node = &((*node)->rb_left); + else if (result > 0) + node = &((*node)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&dev->node, parent, node); + rb_insert_color(&dev->node, &bus->root); + return 0; +} + +struct device_header *device__find_dev(enum device_bus_type bus_type, u8 dev_num) +{ + struct rb_node *node; + + if (bus_type >= DEVICE_BUS_MAX) + return ERR_PTR(-EINVAL); + + node = device_trees[bus_type].root.rb_node; + while (node) { + struct device_header 
*dev = rb_entry(node, struct device_header, + node); + if (dev_num < dev->dev_num) { + node = node->rb_left; + } else if (dev_num > dev->dev_num) { + node = node->rb_right; + } else { + return dev; + } + } + + return NULL; +} + +struct device_header *device__first_dev(enum device_bus_type bus_type) +{ + struct rb_node *node; + + if (bus_type >= DEVICE_BUS_MAX) + return NULL; + + node = rb_first(&device_trees[bus_type].root); + return node ? rb_entry(node, struct device_header, node) : NULL; +} + +struct device_header *device__next_dev(struct device_header *dev) +{ + struct rb_node *node = rb_next(&dev->node); + return node ? rb_entry(node, struct device_header, node) : NULL; +} diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c new file mode 100644 index 000000000000..37581d33136b --- /dev/null +++ b/tools/kvm/disk/blk.c @@ -0,0 +1,76 @@ +#include "kvm/disk-image.h" + +#include <linux/err.h> +#include <mntent.h> + +/* + * raw image and blk dev are similar, so reuse raw image ops. + */ +static struct disk_image_operations blk_dev_ops = { + .read = raw_image__read, + .write = raw_image__write, +}; + +static bool is_mounted(struct stat *st) +{ + struct stat st_buf; + struct mntent *mnt; + FILE *f; + + f = setmntent("/proc/mounts", "r"); + if (!f) + return false; + + while ((mnt = getmntent(f)) != NULL) { + if (stat(mnt->mnt_fsname, &st_buf) == 0 && + S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) { + fclose(f); + return true; + } + } + + fclose(f); + return false; +} + +struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st) +{ + struct disk_image *disk; + int fd, r; + u64 size; + + if (!S_ISBLK(st->st_mode)) + return ERR_PTR(-EINVAL); + + if (is_mounted(st)) { + pr_err("Block device %s is already mounted! Unmount before use.", + filename); + return ERR_PTR(-EINVAL); + } + + /* + * Be careful! We are opening host block device! + * Open it readonly since we do not want to break user's data on disk. 
+ */ + fd = open(filename, flags); + if (fd < 0) + return ERR_PTR(fd); + + if (ioctl(fd, BLKGETSIZE64, &size) < 0) { + r = -errno; + close(fd); + return ERR_PTR(r); + } + + /* + * FIXME: This will not work on 32-bit host because we can not + * mmap large disk. There is not enough virtual address space + * in 32-bit host. However, this works on 64-bit host. + */ + disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + return disk; +} diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c new file mode 100644 index 000000000000..4e9bda01c6d0 --- /dev/null +++ b/tools/kvm/disk/core.c @@ -0,0 +1,356 @@ +#include "kvm/disk-image.h" +#include "kvm/qcow.h" +#include "kvm/virtio-blk.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <sys/eventfd.h> +#include <sys/poll.h> + +#define AIO_MAX 256 + +int debug_iodelay; + +static int disk_image__close(struct disk_image *disk); + +int disk_img_name_parser(const struct option *opt, const char *arg, int unset) +{ + const char *cur; + char *sep; + struct kvm *kvm = opt->ptr; + + if (kvm->cfg.image_count >= MAX_DISK_IMAGES) + die("Currently only 4 images are supported"); + + kvm->cfg.disk_image[kvm->cfg.image_count].filename = arg; + cur = arg; + + if (strncmp(arg, "scsi:", 5) == 0) { + sep = strstr(arg, ":"); + if (sep) + kvm->cfg.disk_image[kvm->cfg.image_count].wwpn = sep + 1; + sep = strstr(sep + 1, ":"); + if (sep) { + *sep = 0; + kvm->cfg.disk_image[kvm->cfg.image_count].tpgt = sep + 1; + } + cur = sep + 1; + } + + do { + sep = strstr(cur, ","); + if (sep) { + if (strncmp(sep + 1, "ro", 2) == 0) + kvm->cfg.disk_image[kvm->cfg.image_count].readonly = true; + else if (strncmp(sep + 1, "direct", 6) == 0) + kvm->cfg.disk_image[kvm->cfg.image_count].direct = true; + *sep = 0; + cur = sep + 1; + } + } while (sep); + + kvm->cfg.image_count++; + + return 0; +} + +#ifdef CONFIG_HAS_AIO +static void *disk_image__thread(void 
*param) +{ + struct disk_image *disk = param; + struct io_event event[AIO_MAX]; + struct timespec notime = {0}; + int nr, i; + u64 dummy; + + kvm__set_thread_name("disk-image-io"); + + while (read(disk->evt, &dummy, sizeof(dummy)) > 0) { + nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, ¬ime); + for (i = 0; i < nr; i++) + disk->disk_req_cb(event[i].data, event[i].res); + } + + return NULL; +} +#endif + +struct disk_image *disk_image__new(int fd, u64 size, + struct disk_image_operations *ops, + int use_mmap) +{ + struct disk_image *disk; + int r; + + disk = malloc(sizeof *disk); + if (!disk) + return ERR_PTR(-ENOMEM); + + *disk = (struct disk_image) { + .fd = fd, + .size = size, + .ops = ops, + }; + + if (use_mmap == DISK_IMAGE_MMAP) { + /* + * The write to disk image will be discarded + */ + disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0); + if (disk->priv == MAP_FAILED) { + r = -errno; + free(disk); + return ERR_PTR(r); + } + } + +#ifdef CONFIG_HAS_AIO + { + pthread_t thread; + + disk->evt = eventfd(0, 0); + io_setup(AIO_MAX, &disk->ctx); + r = pthread_create(&thread, NULL, disk_image__thread, disk); + if (r) { + r = -errno; + free(disk); + return ERR_PTR(r); + } + } +#endif + return disk; +} + +static struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct) +{ + struct disk_image *disk; + struct stat st; + int fd, flags; + + if (readonly) + flags = O_RDONLY; + else + flags = O_RDWR; + if (direct) + flags |= O_DIRECT; + + if (stat(filename, &st) < 0) + return ERR_PTR(-errno); + + /* blk device ?*/ + disk = blkdev__probe(filename, flags, &st); + if (!IS_ERR_OR_NULL(disk)) + return disk; + + fd = open(filename, flags); + if (fd < 0) + return ERR_PTR(fd); + + /* qcow image ?*/ + disk = qcow_probe(fd, true); + if (!IS_ERR_OR_NULL(disk)) { + pr_warning("Forcing read-only support for QCOW"); + return disk; + } + + /* raw image ?*/ + disk = raw_image__probe(fd, &st, readonly); + if (!IS_ERR_OR_NULL(disk)) 
+ return disk; + + if (close(fd) < 0) + pr_warning("close() failed"); + + return ERR_PTR(-ENOSYS); +} + +static struct disk_image **disk_image__open_all(struct kvm *kvm) +{ + struct disk_image **disks; + const char *filename; + const char *wwpn; + const char *tpgt; + bool readonly; + bool direct; + void *err; + int i; + struct disk_image_params *params = (struct disk_image_params *)&kvm->cfg.disk_image; + int count = kvm->cfg.image_count; + + if (!count) + return ERR_PTR(-EINVAL); + if (count > MAX_DISK_IMAGES) + return ERR_PTR(-ENOSPC); + + disks = calloc(count, sizeof(*disks)); + if (!disks) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + filename = params[i].filename; + readonly = params[i].readonly; + direct = params[i].direct; + wwpn = params[i].wwpn; + tpgt = params[i].tpgt; + + if (wwpn) { + disks[i] = malloc(sizeof(struct disk_image)); + if (!disks[i]) + return ERR_PTR(-ENOMEM); + disks[i]->wwpn = wwpn; + disks[i]->tpgt = tpgt; + continue; + } + + if (!filename) + continue; + + disks[i] = disk_image__open(filename, readonly, direct); + if (IS_ERR_OR_NULL(disks[i])) { + pr_err("Loading disk image '%s' failed", filename); + err = disks[i]; + goto error; + } + disks[i]->debug_iodelay = kvm->cfg.debug_iodelay; + } + + return disks; +error: + for (i = 0; i < count; i++) + if (!IS_ERR_OR_NULL(disks[i])) + disk_image__close(disks[i]); + + free(disks); + return err; +} + +int disk_image__flush(struct disk_image *disk) +{ + if (disk->ops->flush) + return disk->ops->flush(disk); + + return fsync(disk->fd); +} + +static int disk_image__close(struct disk_image *disk) +{ + /* If there was no disk image then there's nothing to do: */ + if (!disk) + return 0; + + if (disk->ops->close) + return disk->ops->close(disk); + + if (close(disk->fd) < 0) + pr_warning("close() failed"); + + free(disk); + + return 0; +} + +static int disk_image__close_all(struct disk_image **disks, int count) +{ + while (count) + disk_image__close(disks[--count]); + + free(disks); + + 
return 0; +} + +/* + * Fill iov with disk data, starting from sector 'sector'. + * Return amount of bytes read. + */ +ssize_t disk_image__read(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t total = 0; + + if (debug_iodelay) + msleep(debug_iodelay); + + if (disk->ops->read) { + total = disk->ops->read(disk, sector, iov, iovcount, param); + if (total < 0) { + pr_info("disk_image__read error: total=%ld\n", (long)total); + return total; + } + } + + if (!disk->async && disk->disk_req_cb) + disk->disk_req_cb(param, total); + + return total; +} + +/* + * Write iov to disk, starting from sector 'sector'. + * Return amount of bytes written. + */ +ssize_t disk_image__write(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t total = 0; + + if (debug_iodelay) + msleep(debug_iodelay); + + if (disk->ops->write) { + /* + * Try writev based operation first + */ + + total = disk->ops->write(disk, sector, iov, iovcount, param); + if (total < 0) { + pr_info("disk_image__write error: total=%ld\n", (long)total); + return total; + } + } else { + /* Do nothing */ + } + + if (!disk->async && disk->disk_req_cb) + disk->disk_req_cb(param, total); + + return total; +} + +ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len) +{ + struct stat st; + int r; + + r = fstat(disk->fd, &st); + if (r) + return r; + + *len = snprintf(buffer, *len, "%llu%llu%llu", + (u64)st.st_dev, (u64)st.st_rdev, (u64)st.st_ino); + return *len; +} + +void disk_image__set_callback(struct disk_image *disk, + void (*disk_req_cb)(void *param, long len)) +{ + disk->disk_req_cb = disk_req_cb; +} + +int disk_image__init(struct kvm *kvm) +{ + if (kvm->cfg.image_count) { + kvm->disks = disk_image__open_all(kvm); + if (IS_ERR(kvm->disks)) + return PTR_ERR(kvm->disks); + } + + return 0; +} +dev_base_init(disk_image__init); + +int disk_image__exit(struct kvm *kvm) +{ + return 
disk_image__close_all(kvm->disks, kvm->nr_disks); +} +dev_base_exit(disk_image__exit); diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c new file mode 100644 index 000000000000..64a25509899e --- /dev/null +++ b/tools/kvm/disk/qcow.c @@ -0,0 +1,1527 @@ +#include "kvm/qcow.h" + +#include "kvm/disk-image.h" +#include "kvm/read-write.h" +#include "kvm/mutex.h" +#include "kvm/util.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#ifdef CONFIG_HAS_ZLIB +#include <zlib.h> +#endif + +#include <linux/err.h> +#include <linux/byteorder.h> +#include <linux/kernel.h> +#include <linux/types.h> + +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append); +static int qcow_write_refcount_table(struct qcow *q); +static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref); +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size); + +static inline int qcow_pwrite_sync(int fd, + void *buf, size_t count, off_t offset) +{ + if (pwrite_in_full(fd, buf, count, offset) < 0) + return -1; + + return fdatasync(fd); +} + +static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new) +{ + struct rb_node **link = &(root->rb_node), *parent = NULL; + u64 offset = new->offset; + + /* search the tree */ + while (*link) { + struct qcow_l2_table *t; + + t = rb_entry(*link, struct qcow_l2_table, node); + if (!t) + goto error; + + parent = *link; + + if (t->offset > offset) + link = &(*link)->rb_left; + else if (t->offset < offset) + link = &(*link)->rb_right; + else + goto out; + } + + /* add new node */ + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); +out: + return 0; +error: + return -1; +} + +static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset) +{ + struct rb_node *link = root->rb_node; + + while (link) { + struct qcow_l2_table *t; + + t = 
rb_entry(link, struct qcow_l2_table, node); + if (!t) + goto out; + + if (t->offset > offset) + link = link->rb_left; + else if (t->offset < offset) + link = link->rb_right; + else + return t; + } +out: + return NULL; +} + +static void l1_table_free_cache(struct qcow_l1_table *l1t) +{ + struct rb_root *r = &l1t->root; + struct list_head *pos, *n; + struct qcow_l2_table *t; + + list_for_each_safe(pos, n, &l1t->lru_list) { + /* Remove cache table from the list and RB tree */ + list_del(pos); + t = list_entry(pos, struct qcow_l2_table, list); + rb_erase(&t->node, r); + + /* Free the cached node */ + free(t); + } +} + +static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c) +{ + struct qcow_header *header = q->header; + u64 size; + + if (!c->dirty) + return 0; + + size = 1 << header->l2_bits; + + if (qcow_pwrite_sync(q->fd, c->table, + size * sizeof(u64), c->offset) < 0) + return -1; + + c->dirty = 0; + + return 0; +} + +static int cache_table(struct qcow *q, struct qcow_l2_table *c) +{ + struct qcow_l1_table *l1t = &q->table; + struct rb_root *r = &l1t->root; + struct qcow_l2_table *lru; + + if (l1t->nr_cached == MAX_CACHE_NODES) { + /* + * The node at the head of the list is least recently used + * node. Remove it from the list and replaced with a new node. 
+ */ + lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list); + + /* Remove the node from the cache */ + rb_erase(&lru->node, r); + list_del_init(&lru->list); + l1t->nr_cached--; + + /* Free the LRUed node */ + free(lru); + } + + /* Add new node in RB Tree: Helps in searching faster */ + if (l2_table_insert(r, c) < 0) + goto error; + + /* Add in LRU replacement list */ + list_add_tail(&c->list, &l1t->lru_list); + l1t->nr_cached++; + + return 0; +error: + return -1; +} + +static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset) +{ + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + + l2t = l2_table_lookup(&l1t->root, offset); + if (!l2t) + return NULL; + + /* Update the LRU state, by moving the searched node to list tail */ + list_move_tail(&l2t->list, &l1t->lru_list); + + return l2t; +} + +/* Allocates a new node for caching L2 table */ +static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + struct qcow_l2_table *c; + u64 l2t_sz; + u64 size; + + l2t_sz = 1 << header->l2_bits; + size = sizeof(*c) + l2t_sz * sizeof(u64); + c = calloc(1, size); + if (!c) + goto out; + + c->offset = offset; + RB_CLEAR_NODE(&c->node); + INIT_LIST_HEAD(&c->list); +out: + return c; +} + +static inline u64 get_l1_index(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return offset >> (header->l2_bits + header->cluster_bits); +} + +static inline u64 get_l2_index(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1); +} + +static inline u64 get_cluster_offset(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + + return offset & ((1 << header->cluster_bits)-1); +} + +static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset) +{ + struct qcow_header *header = q->header; + struct qcow_l2_table *l2t; + u64 size; + + size = 1 
<< header->l2_bits; + + /* search an entry for offset in cache */ + l2t = l2_table_search(q, offset); + if (l2t) + return l2t; + + /* allocate new node for caching l2 table */ + l2t = new_cache_table(q, offset); + if (!l2t) + goto error; + + /* table not cached: read from the disk */ + if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0) + goto error; + + /* cache the table */ + if (cache_table(q, l2t) < 0) + goto error; + + return l2t; +error: + free(l2t); + return NULL; +} + +static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size, + const u8 *buf, int buf_size) +{ +#ifdef CONFIG_HAS_ZLIB + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (u8 *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + + inflateEnd(strm); + return 0; +#else + return -1; +#endif +} + +static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, + void *dst, u32 dst_len) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 clust_offset; + u64 clust_start; + u64 l2t_offset; + size_t length; + u64 l2t_size; + u64 l1_idx; + u64 l2_idx; + int coffset; + int csize; + + l1_idx = get_l1_index(q, offset); + if (l1_idx >= l1t->table_size) + return -1; + + clust_offset = get_cluster_offset(q, offset); + if (clust_offset >= q->cluster_size) + return -1; + + length = q->cluster_size - clust_offset; + if (length > dst_len) + length = dst_len; + + mutex_lock(&q->mutex); + + l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); + if (!l2t_offset) + goto zero_cluster; + + l2t_size = 1 << header->l2_bits; + + /* read and cache level 2 table */ + l2t = 
qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto out_error; + + l2_idx = get_l2_index(q, offset); + if (l2_idx >= l2t_size) + goto out_error; + + clust_start = be64_to_cpu(l2t->table[l2_idx]); + if (clust_start & QCOW1_OFLAG_COMPRESSED) { + coffset = clust_start & q->cluster_offset_mask; + csize = clust_start >> (63 - q->header->cluster_bits); + csize &= (q->cluster_size - 1); + + if (pread_in_full(q->fd, q->cluster_data, csize, + coffset) < 0) + goto out_error; + + if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, + q->cluster_data, csize) < 0) + goto out_error; + + memcpy(dst, q->cluster_cache + clust_offset, length); + mutex_unlock(&q->mutex); + } else { + if (!clust_start) + goto zero_cluster; + + mutex_unlock(&q->mutex); + + if (pread_in_full(q->fd, dst, length, + clust_start + clust_offset) < 0) + return -1; + } + + return length; + +zero_cluster: + mutex_unlock(&q->mutex); + memset(dst, 0, length); + return length; + +out_error: + mutex_unlock(&q->mutex); + length = -1; + return -1; +} + +static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset, + void *dst, u32 dst_len) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 clust_offset; + u64 clust_start; + u64 l2t_offset; + size_t length; + u64 l2t_size; + u64 l1_idx; + u64 l2_idx; + int coffset; + int sector_offset; + int nb_csectors; + int csize; + + l1_idx = get_l1_index(q, offset); + if (l1_idx >= l1t->table_size) + return -1; + + clust_offset = get_cluster_offset(q, offset); + if (clust_offset >= q->cluster_size) + return -1; + + length = q->cluster_size - clust_offset; + if (length > dst_len) + length = dst_len; + + mutex_lock(&q->mutex); + + l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]); + + l2t_offset &= ~QCOW2_OFLAG_COPIED; + if (!l2t_offset) + goto zero_cluster; + + l2t_size = 1 << header->l2_bits; + + /* read and cache level 2 table */ + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto out_error; 
+ + l2_idx = get_l2_index(q, offset); + if (l2_idx >= l2t_size) + goto out_error; + + clust_start = be64_to_cpu(l2t->table[l2_idx]); + if (clust_start & QCOW2_OFLAG_COMPRESSED) { + coffset = clust_start & q->cluster_offset_mask; + nb_csectors = ((clust_start >> q->csize_shift) + & q->csize_mask) + 1; + sector_offset = coffset & (SECTOR_SIZE - 1); + csize = nb_csectors * SECTOR_SIZE - sector_offset; + + if (pread_in_full(q->fd, q->cluster_data, + nb_csectors * SECTOR_SIZE, + coffset & ~(SECTOR_SIZE - 1)) < 0) { + goto out_error; + } + + if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size, + q->cluster_data + sector_offset, + csize) < 0) { + goto out_error; + } + + memcpy(dst, q->cluster_cache + clust_offset, length); + mutex_unlock(&q->mutex); + } else { + clust_start &= QCOW2_OFFSET_MASK; + if (!clust_start) + goto zero_cluster; + + mutex_unlock(&q->mutex); + + if (pread_in_full(q->fd, dst, length, + clust_start + clust_offset) < 0) + return -1; + } + + return length; + +zero_cluster: + mutex_unlock(&q->mutex); + memset(dst, 0, length); + return length; + +out_error: + mutex_unlock(&q->mutex); + length = -1; + return -1; +} + +static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector, + void *dst, u32 dst_len) +{ + struct qcow *q = disk->priv; + struct qcow_header *header = q->header; + u32 nr_read; + u64 offset; + char *buf; + u32 nr; + + buf = dst; + nr_read = 0; + + while (nr_read < dst_len) { + offset = sector << SECTOR_SHIFT; + if (offset >= header->size) + return -1; + + if (q->version == QCOW1_VERSION) + nr = qcow1_read_cluster(q, offset, buf, + dst_len - nr_read); + else + nr = qcow2_read_cluster(q, offset, buf, + dst_len - nr_read); + + if (nr <= 0) + return -1; + + nr_read += nr; + buf += nr; + sector += (nr >> SECTOR_SHIFT); + } + + return dst_len; +} + +static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t nr, total = 0; + + while (iovcount--) { + 
nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len); + if (nr != (ssize_t)iov->iov_len) { + pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); + return -1; + } + + sector += iov->iov_len >> SECTOR_SHIFT; + total += nr; + iov++; + } + + return total; +} + +static void refcount_table_free_cache(struct qcow_refcount_table *rft) +{ + struct rb_root *r = &rft->root; + struct list_head *pos, *n; + struct qcow_refcount_block *t; + + list_for_each_safe(pos, n, &rft->lru_list) { + list_del(pos); + t = list_entry(pos, struct qcow_refcount_block, list); + rb_erase(&t->node, r); + + free(t); + } +} + +static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new) +{ + struct rb_node **link = &(root->rb_node), *parent = NULL; + u64 offset = new->offset; + + /* search the tree */ + while (*link) { + struct qcow_refcount_block *t; + + t = rb_entry(*link, struct qcow_refcount_block, node); + if (!t) + goto error; + + parent = *link; + + if (t->offset > offset) + link = &(*link)->rb_left; + else if (t->offset < offset) + link = &(*link)->rb_right; + else + goto out; + } + + /* add new node */ + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); +out: + return 0; +error: + return -1; +} + +static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb) +{ + if (!rfb->dirty) + return 0; + + if (qcow_pwrite_sync(q->fd, rfb->entries, + rfb->size * sizeof(u16), rfb->offset) < 0) + return -1; + + rfb->dirty = 0; + + return 0; +} + +static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c) +{ + struct qcow_refcount_table *rft = &q->refcount_table; + struct rb_root *r = &rft->root; + struct qcow_refcount_block *lru; + + if (rft->nr_cached == MAX_CACHE_NODES) { + lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list); + + rb_erase(&lru->node, r); + list_del_init(&lru->list); + rft->nr_cached--; + + free(lru); + } + + if 
(refcount_block_insert(r, c) < 0) + goto error; + + list_add_tail(&c->list, &rft->lru_list); + rft->nr_cached++; + + return 0; +error: + return -1; +} + +static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset) +{ + struct qcow_refcount_block *rfb; + + rfb = malloc(sizeof *rfb + q->cluster_size); + if (!rfb) + return NULL; + + rfb->offset = rfb_offset; + rfb->size = q->cluster_size / sizeof(u16); + RB_CLEAR_NODE(&rfb->node); + INIT_LIST_HEAD(&rfb->list); + + return rfb; +} + +static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset) +{ + struct rb_node *link = root->rb_node; + + while (link) { + struct qcow_refcount_block *t; + + t = rb_entry(link, struct qcow_refcount_block, node); + if (!t) + goto out; + + if (t->offset > offset) + link = link->rb_left; + else if (t->offset < offset) + link = link->rb_right; + else + return t; + } +out: + return NULL; +} + +static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset) +{ + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + + rfb = refcount_block_lookup(&rft->root, offset); + if (!rfb) + return NULL; + + /* Update the LRU state, by moving the searched node to list tail */ + list_move_tail(&rfb->list, &rft->lru_list); + + return rfb; +} + +static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q, + u64 clust_idx) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + u64 new_block_offset; + u64 rft_idx; + + rft_idx = clust_idx >> (header->cluster_bits - + QCOW_REFCOUNT_BLOCK_SHIFT); + + if (rft_idx >= rft->rf_size) { + pr_warning("Don't support grow refcount block table"); + return NULL; + } + + new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0); + if (new_block_offset < 0) + return NULL; + + rfb = new_refcount_block(q, new_block_offset); + if (!rfb) + return NULL; + + 
memset(rfb->entries, 0x00, q->cluster_size); + rfb->dirty = 1; + + /* write refcount block */ + if (write_refcount_block(q, rfb) < 0) + goto free_rfb; + + if (cache_refcount_block(q, rfb) < 0) + goto free_rfb; + + rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset); + if (update_cluster_refcount(q, new_block_offset >> + header->cluster_bits, 1) < 0) + goto recover_rft; + + if (qcow_write_refcount_table(q) < 0) + goto recover_rft; + + return rfb; + +recover_rft: + rft->rf_table[rft_idx] = 0; +free_rfb: + free(rfb); + return NULL; +} + +static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + struct qcow_refcount_block *rfb; + u64 rfb_offset; + u64 rft_idx; + + rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT); + if (rft_idx >= rft->rf_size) + return ERR_PTR(-ENOSPC); + + rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]); + if (!rfb_offset) + return ERR_PTR(-ENOSPC); + + rfb = refcount_block_search(q, rfb_offset); + if (rfb) + return rfb; + + rfb = new_refcount_block(q, rfb_offset); + if (!rfb) + return NULL; + + if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0) + goto error_free_rfb; + + if (cache_refcount_block(q, rfb) < 0) + goto error_free_rfb; + + return rfb; + +error_free_rfb: + free(rfb); + + return NULL; +} + +static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx) +{ + struct qcow_refcount_block *rfb = NULL; + struct qcow_header *header = q->header; + u64 rfb_idx; + + rfb = qcow_read_refcount_block(q, clust_idx); + if (PTR_ERR(rfb) == -ENOSPC) + return 0; + else if (IS_ERR_OR_NULL(rfb)) { + pr_warning("Error while reading refcount table"); + return -1; + } + + rfb_idx = clust_idx & (((1ULL << + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); + + if (rfb_idx >= rfb->size) { + pr_warning("L1: refcount block index out of bounds"); + return -1; + } + + 
return be16_to_cpu(rfb->entries[rfb_idx]); +} + +static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append) +{ + struct qcow_refcount_block *rfb = NULL; + struct qcow_header *header = q->header; + u16 refcount; + u64 rfb_idx; + + rfb = qcow_read_refcount_block(q, clust_idx); + if (PTR_ERR(rfb) == -ENOSPC) { + rfb = qcow_grow_refcount_block(q, clust_idx); + if (!rfb) { + pr_warning("error while growing refcount table"); + return -1; + } + } else if (IS_ERR_OR_NULL(rfb)) { + pr_warning("error while reading refcount table"); + return -1; + } + + rfb_idx = clust_idx & (((1ULL << + (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1)); + if (rfb_idx >= rfb->size) { + pr_warning("refcount block index out of bounds"); + return -1; + } + + refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append; + rfb->entries[rfb_idx] = cpu_to_be16(refcount); + rfb->dirty = 1; + + /* write refcount block */ + if (write_refcount_block(q, rfb) < 0) { + pr_warning("refcount block index out of bounds"); + return -1; + } + + /* update free_clust_idx since refcount becomes zero */ + if (!refcount && clust_idx < q->free_clust_idx) + q->free_clust_idx = clust_idx; + + return 0; +} + +static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size) +{ + struct qcow_header *header = q->header; + u64 start, end, offset; + + start = clust_start & ~(q->cluster_size - 1); + end = (clust_start + size - 1) & ~(q->cluster_size - 1); + for (offset = start; offset <= end; offset += q->cluster_size) + update_cluster_refcount(q, offset >> header->cluster_bits, -1); +} + +/* + * Allocate clusters according to the size. Find a postion that + * can satisfy the size. free_clust_idx is initialized to zero and + * Record last position. 
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+	struct qcow_header *header = q->header;
+	/*
+	 * NOTE(review): clust_refcount is u16, so the 'clust_refcount < 0'
+	 * error check below is always false; qcow_get_refcount()'s -1 error
+	 * return is truncated to 0xffff and mistaken for "cluster in use".
+	 * Likewise 'return -1' from this u64-returning function produces
+	 * ~0ULL, which callers test with '< 0' (also always false).
+	 * TODO: report errors via a signed type or an out-parameter.
+	 */
+	u16 clust_refcount;
+	u32 clust_idx = 0, i;
+	u64 clust_num;
+
+	/* Number of whole clusters needed to cover 'size' bytes. */
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	/* Scan forward from free_clust_idx for clust_num consecutive free clusters. */
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		if (clust_refcount < 0)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	clust_idx++;
+
+	/* Optionally bump the refcount of every cluster in the new run. */
+	if (update_ref)
+		for (i = 0; i < clust_num; i++)
+			if (update_cluster_refcount(q,
+				clust_idx - clust_num + i, 1))
+				return -1;
+
+	/* Byte offset of the first cluster of the allocated run. */
+	return (clust_idx - clust_num) << header->cluster_bits;
+}
+
+/* Write the in-memory L1 table to its on-disk location (synced via qcow_pwrite_sync). */
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get l2 table. If the table has been copied, read table directly.
+ * If the table exists, allocate a new cluster and copy the table
+ * to the new cluster.
+ */ +static int get_cluster_table(struct qcow *q, u64 offset, + struct qcow_l2_table **result_l2t, u64 *result_l2_index) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *l1t = &q->table; + struct qcow_l2_table *l2t; + u64 l1t_idx; + u64 l2t_offset; + u64 l2t_idx; + u64 l2t_size; + u64 l2t_new_offset; + + l2t_size = 1 << header->l2_bits; + + l1t_idx = get_l1_index(q, offset); + if (l1t_idx >= l1t->table_size) + return -1; + + l2t_idx = get_l2_index(q, offset); + if (l2t_idx >= l2t_size) + return -1; + + l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]); + if (l2t_offset & QCOW2_OFLAG_COPIED) { + l2t_offset &= ~QCOW2_OFLAG_COPIED; + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto error; + } else { + l2t_new_offset = qcow_alloc_clusters(q, + l2t_size*sizeof(u64), 1); + + if (l2t_new_offset < 0) + goto error; + + l2t = new_cache_table(q, l2t_new_offset); + if (!l2t) + goto free_cluster; + + if (l2t_offset) { + l2t = qcow_read_l2_table(q, l2t_offset); + if (!l2t) + goto free_cache; + } else + memset(l2t->table, 0x00, l2t_size * sizeof(u64)); + + /* write l2 table */ + l2t->dirty = 1; + if (qcow_l2_cache_write(q, l2t) < 0) + goto free_cache; + + /* cache l2 table */ + if (cache_table(q, l2t)) + goto free_cache; + + /* update the l1 talble */ + l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset + | QCOW2_OFLAG_COPIED); + if (qcow_write_l1_table(q)) { + pr_warning("Update l1 table error"); + goto free_cache; + } + + /* free old cluster */ + qcow_free_clusters(q, l2t_offset, q->cluster_size); + } + + *result_l2t = l2t; + *result_l2_index = l2t_idx; + + return 0; + +free_cache: + free(l2t); + +free_cluster: + qcow_free_clusters(q, l2t_new_offset, q->cluster_size); + +error: + return -1; +} + +/* + * If the cluster has been copied, write data directly. If not, + * read the original data and write it to the new cluster with + * modification. 
+ */ +static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, + void *buf, u32 src_len) +{ + struct qcow_l2_table *l2t; + u64 clust_new_start; + u64 clust_start; + u64 clust_flags; + u64 clust_off; + u64 l2t_idx; + u64 len; + + l2t = NULL; + + clust_off = get_cluster_offset(q, offset); + if (clust_off >= q->cluster_size) + return -1; + + len = q->cluster_size - clust_off; + if (len > src_len) + len = src_len; + + mutex_lock(&q->mutex); + + if (get_cluster_table(q, offset, &l2t, &l2t_idx)) { + pr_warning("Get l2 table error"); + goto error; + } + + clust_start = be64_to_cpu(l2t->table[l2t_idx]); + clust_flags = clust_start & QCOW2_OFLAGS_MASK; + + clust_start &= QCOW2_OFFSET_MASK; + if (!(clust_flags & QCOW2_OFLAG_COPIED)) { + clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1); + if (clust_new_start < 0) { + pr_warning("Cluster alloc error"); + goto error; + } + + offset &= ~(q->cluster_size - 1); + + /* if clust_start is not zero, read the original data*/ + if (clust_start) { + mutex_unlock(&q->mutex); + if (qcow2_read_cluster(q, offset, q->copy_buff, + q->cluster_size) < 0) { + pr_warning("Read copy cluster error"); + qcow_free_clusters(q, clust_new_start, + q->cluster_size); + return -1; + } + mutex_lock(&q->mutex); + } else + memset(q->copy_buff, 0x00, q->cluster_size); + + memcpy(q->copy_buff + clust_off, buf, len); + + /* Write actual data */ + if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size, + clust_new_start) < 0) + goto free_cluster; + + /* update l2 table*/ + l2t->table[l2t_idx] = cpu_to_be64(clust_new_start + | QCOW2_OFLAG_COPIED); + l2t->dirty = 1; + + if (qcow_l2_cache_write(q, l2t)) + goto free_cluster; + + /* free old cluster*/ + if (clust_flags & QCOW2_OFLAG_COMPRESSED) { + int size; + size = ((clust_start >> q->csize_shift) & + q->csize_mask) + 1; + size *= 512; + clust_start &= q->cluster_offset_mask; + clust_start &= ~511; + + qcow_free_clusters(q, clust_start, size); + } else if (clust_start) + qcow_free_clusters(q, 
clust_start, q->cluster_size); + + } else { + /* Write actual data */ + if (pwrite_in_full(q->fd, buf, len, + clust_start + clust_off) < 0) + goto error; + } + mutex_unlock(&q->mutex); + return len; + +free_cluster: + qcow_free_clusters(q, clust_new_start, q->cluster_size); + +error: + mutex_unlock(&q->mutex); + return -1; +} + +static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len) +{ + struct qcow *q = disk->priv; + struct qcow_header *header = q->header; + u32 nr_written; + char *buf; + u64 offset; + ssize_t nr; + + buf = src; + nr_written = 0; + offset = sector << SECTOR_SHIFT; + + while (nr_written < src_len) { + if (offset >= header->size) + return -1; + + nr = qcow_write_cluster(q, offset, buf, src_len - nr_written); + if (nr < 0) + return -1; + + nr_written += nr; + buf += nr; + offset += nr; + } + + return nr_written; +} + +static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param) +{ + ssize_t nr, total = 0; + + while (iovcount--) { + nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len); + if (nr != (ssize_t)iov->iov_len) { + pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len); + return -1; + } + + sector += iov->iov_len >> SECTOR_SHIFT; + iov++; + total += nr; + } + + return total; +} + +static int qcow_disk_flush(struct disk_image *disk) +{ + struct qcow *q = disk->priv; + struct qcow_refcount_table *rft; + struct list_head *pos, *n; + struct qcow_l1_table *l1t; + + l1t = &q->table; + rft = &q->refcount_table; + + mutex_lock(&q->mutex); + + list_for_each_safe(pos, n, &rft->lru_list) { + struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list); + + if (write_refcount_block(q, c) < 0) + goto error_unlock; + } + + list_for_each_safe(pos, n, &l1t->lru_list) { + struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list); + + if (qcow_l2_cache_write(q, c) < 0) 
+ goto error_unlock; + } + + if (qcow_write_l1_table < 0) + goto error_unlock; + + mutex_unlock(&q->mutex); + + return fsync(disk->fd); + +error_unlock: + mutex_unlock(&q->mutex); + return -1; +} + +static int qcow_disk_close(struct disk_image *disk) +{ + struct qcow *q; + + if (!disk) + return 0; + + q = disk->priv; + + refcount_table_free_cache(&q->refcount_table); + l1_table_free_cache(&q->table); + free(q->copy_buff); + free(q->cluster_data); + free(q->cluster_cache); + free(q->refcount_table.rf_table); + free(q->table.l1_table); + free(q->header); + free(q); + + return 0; +} + +static struct disk_image_operations qcow_disk_readonly_ops = { + .read = qcow_read_sector, + .close = qcow_disk_close, +}; + +static struct disk_image_operations qcow_disk_ops = { + .read = qcow_read_sector, + .write = qcow_write_sector, + .flush = qcow_disk_flush, + .close = qcow_disk_close, +}; + +static int qcow_read_refcount_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + + rft->rf_size = (header->refcount_table_size * q->cluster_size) + / sizeof(u64); + + rft->rf_table = calloc(rft->rf_size, sizeof(u64)); + if (!rft->rf_table) + return -1; + + rft->root = RB_ROOT; + INIT_LIST_HEAD(&rft->lru_list); + + return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset); +} + +static int qcow_write_refcount_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_refcount_table *rft = &q->refcount_table; + + return qcow_pwrite_sync(q->fd, rft->rf_table, + rft->rf_size * sizeof(u64), header->refcount_table_offset); +} + +static int qcow_read_l1_table(struct qcow *q) +{ + struct qcow_header *header = q->header; + struct qcow_l1_table *table = &q->table; + + table->table_size = header->l1_size; + + table->l1_table = calloc(table->table_size, sizeof(u64)); + if (!table->l1_table) + return -1; + + return pread_in_full(q->fd, table->l1_table, sizeof(u64) * 
table->table_size, header->l1_table_offset); +} + +static void *qcow2_read_header(int fd) +{ + struct qcow2_header_disk f_header; + struct qcow_header *header; + + header = malloc(sizeof(struct qcow_header)); + if (!header) + return NULL; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) { + free(header); + return NULL; + } + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + be64_to_cpus(&f_header.backing_file_offset); + be32_to_cpus(&f_header.backing_file_size); + be32_to_cpus(&f_header.cluster_bits); + be64_to_cpus(&f_header.size); + be32_to_cpus(&f_header.crypt_method); + be32_to_cpus(&f_header.l1_size); + be64_to_cpus(&f_header.l1_table_offset); + be64_to_cpus(&f_header.refcount_table_offset); + be32_to_cpus(&f_header.refcount_table_clusters); + be32_to_cpus(&f_header.nb_snapshots); + be64_to_cpus(&f_header.snapshots_offset); + + *header = (struct qcow_header) { + .size = f_header.size, + .l1_table_offset = f_header.l1_table_offset, + .l1_size = f_header.l1_size, + .cluster_bits = f_header.cluster_bits, + .l2_bits = f_header.cluster_bits - 3, + .refcount_table_offset = f_header.refcount_table_offset, + .refcount_table_size = f_header.refcount_table_clusters, + }; + + return header; +} + +static struct disk_image *qcow2_probe(int fd, bool readonly) +{ + struct disk_image *disk_image; + struct qcow_l1_table *l1t; + struct qcow_header *h; + struct qcow *q; + + q = calloc(1, sizeof(struct qcow)); + if (!q) + return NULL; + + mutex_init(&q->mutex); + q->fd = fd; + + l1t = &q->table; + + l1t->root = RB_ROOT; + INIT_LIST_HEAD(&l1t->lru_list); + + h = q->header = qcow2_read_header(fd); + if (!h) + goto free_qcow; + + q->version = QCOW2_VERSION; + q->csize_shift = (62 - (q->header->cluster_bits - 8)); + q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1; + q->cluster_offset_mask = (1LL << q->csize_shift) - 1; + q->cluster_size = 1 << q->header->cluster_bits; + + q->copy_buff = malloc(q->cluster_size); + if 
(!q->copy_buff) { + pr_warning("copy buff malloc error"); + goto free_header; + } + + q->cluster_data = malloc(q->cluster_size); + if (!q->cluster_data) { + pr_warning("cluster data malloc error"); + goto free_copy_buff; + } + + q->cluster_cache = malloc(q->cluster_size); + if (!q->cluster_cache) { + pr_warning("cluster cache malloc error"); + goto free_cluster_data; + } + + if (qcow_read_l1_table(q) < 0) + goto free_cluster_cache; + + if (qcow_read_refcount_table(q) < 0) + goto free_l1_table; + + /* + * Do not use mmap use read/write instead + */ + if (readonly) + disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); + else + disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); + + if (IS_ERR_OR_NULL(disk_image)) + goto free_refcount_table; + + disk_image->async = 0; + disk_image->priv = q; + + return disk_image; + +free_refcount_table: + if (q->refcount_table.rf_table) + free(q->refcount_table.rf_table); +free_l1_table: + if (q->table.l1_table) + free(q->table.l1_table); +free_cluster_cache: + if (q->cluster_cache) + free(q->cluster_cache); +free_cluster_data: + if (q->cluster_data) + free(q->cluster_data); +free_copy_buff: + if (q->copy_buff) + free(q->copy_buff); +free_header: + if (q->header) + free(q->header); +free_qcow: + free(q); + + return NULL; +} + +static bool qcow2_check_image(int fd) +{ + struct qcow2_header_disk f_header; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) + return false; + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + + if (f_header.magic != QCOW_MAGIC) + return false; + + if (f_header.version != QCOW2_VERSION) + return false; + + return true; +} + +static void *qcow1_read_header(int fd) +{ + struct qcow1_header_disk f_header; + struct qcow_header *header; + + header = malloc(sizeof(struct qcow_header)); + if (!header) + return NULL; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) { + 
free(header); + return NULL; + } + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + be64_to_cpus(&f_header.backing_file_offset); + be32_to_cpus(&f_header.backing_file_size); + be32_to_cpus(&f_header.mtime); + be64_to_cpus(&f_header.size); + be32_to_cpus(&f_header.crypt_method); + be64_to_cpus(&f_header.l1_table_offset); + + *header = (struct qcow_header) { + .size = f_header.size, + .l1_table_offset = f_header.l1_table_offset, + .l1_size = f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)), + .cluster_bits = f_header.cluster_bits, + .l2_bits = f_header.l2_bits, + }; + + return header; +} + +static struct disk_image *qcow1_probe(int fd, bool readonly) +{ + struct disk_image *disk_image; + struct qcow_l1_table *l1t; + struct qcow_header *h; + struct qcow *q; + + q = calloc(1, sizeof(struct qcow)); + if (!q) + return NULL; + + mutex_init(&q->mutex); + q->fd = fd; + + l1t = &q->table; + + l1t->root = RB_ROOT; + INIT_LIST_HEAD(&l1t->lru_list); + + h = q->header = qcow1_read_header(fd); + if (!h) + goto free_qcow; + + q->version = QCOW1_VERSION; + q->cluster_size = 1 << q->header->cluster_bits; + q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1; + q->free_clust_idx = 0; + + q->cluster_data = malloc(q->cluster_size); + if (!q->cluster_data) { + pr_warning("cluster data malloc error"); + goto free_header; + } + + q->cluster_cache = malloc(q->cluster_size); + if (!q->cluster_cache) { + pr_warning("cluster cache malloc error"); + goto free_cluster_data; + } + + if (qcow_read_l1_table(q) < 0) + goto free_cluster_cache; + + /* + * Do not use mmap use read/write instead + */ + if (readonly) + disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR); + else + disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR); + + if (!disk_image) + goto free_l1_table; + + disk_image->async = 1; + disk_image->priv = q; + + return disk_image; + +free_l1_table: + if 
(q->table.l1_table) + free(q->table.l1_table); +free_cluster_cache: + if (q->cluster_cache) + free(q->cluster_cache); +free_cluster_data: + if (q->cluster_data) + free(q->cluster_data); +free_header: + if (q->header) + free(q->header); +free_qcow: + free(q); + + return NULL; +} + +static bool qcow1_check_image(int fd) +{ + struct qcow1_header_disk f_header; + + if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) + return false; + + be32_to_cpus(&f_header.magic); + be32_to_cpus(&f_header.version); + + if (f_header.magic != QCOW_MAGIC) + return false; + + if (f_header.version != QCOW1_VERSION) + return false; + + return true; +} + +struct disk_image *qcow_probe(int fd, bool readonly) +{ + if (qcow1_check_image(fd)) + return qcow1_probe(fd, readonly); + + if (qcow2_check_image(fd)) + return qcow2_probe(fd, readonly); + + return NULL; +} diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c new file mode 100644 index 000000000000..93b2b4e8db1f --- /dev/null +++ b/tools/kvm/disk/raw.c @@ -0,0 +1,141 @@ +#include "kvm/disk-image.h" + +#include <linux/err.h> + +#ifdef CONFIG_HAS_AIO +#include <libaio.h> +#endif + +ssize_t raw_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + +#ifdef CONFIG_HAS_AIO + struct iocb iocb; + + return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, offset, + disk->evt, param); +#else + return preadv_in_full(disk->fd, iov, iovcount, offset); +#endif +} + +ssize_t raw_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + +#ifdef CONFIG_HAS_AIO + struct iocb iocb; + + return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, offset, + disk->evt, param); +#else + return pwritev_in_full(disk->fd, iov, iovcount, offset); +#endif +} + +ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov, + 
int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + ssize_t total = 0; + + while (iovcount--) { + memcpy(iov->iov_base, disk->priv + offset, iov->iov_len); + + sector += iov->iov_len >> SECTOR_SHIFT; + offset += iov->iov_len; + total += iov->iov_len; + iov++; + } + + return total; +} + +ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param) +{ + u64 offset = sector << SECTOR_SHIFT; + ssize_t total = 0; + + while (iovcount--) { + memcpy(disk->priv + offset, iov->iov_base, iov->iov_len); + + sector += iov->iov_len >> SECTOR_SHIFT; + offset += iov->iov_len; + total += iov->iov_len; + iov++; + } + + return total; +} + +int raw_image__close(struct disk_image *disk) +{ + int ret = 0; + + if (disk->priv != MAP_FAILED) + ret = munmap(disk->priv, disk->size); + + close(disk->evt); + +#ifdef CONFIG_HAS_VIRTIO + io_destroy(disk->ctx); +#endif + + return ret; +} + +/* + * multiple buffer based disk image operations + */ +static struct disk_image_operations raw_image_regular_ops = { + .read = raw_image__read, + .write = raw_image__write, +}; + +struct disk_image_operations ro_ops = { + .read = raw_image__read_mmap, + .write = raw_image__write_mmap, + .close = raw_image__close, +}; + +struct disk_image_operations ro_ops_nowrite = { + .read = raw_image__read, +}; + +struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly) +{ + struct disk_image *disk; + + if (readonly) { + /* + * Use mmap's MAP_PRIVATE to implement non-persistent write + * FIXME: This does not work on 32-bit host. 
+ */ + struct disk_image *disk; + + disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP); + if (IS_ERR_OR_NULL(disk)) { + disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + } + + return disk; + } else { + /* + * Use read/write instead of mmap + */ + disk = disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR); +#ifdef CONFIG_HAS_AIO + if (!IS_ERR_OR_NULL(disk)) + disk->async = 1; +#endif + return disk; + } +} diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c new file mode 100644 index 000000000000..fb8f51dd1de7 --- /dev/null +++ b/tools/kvm/framebuffer.c @@ -0,0 +1,80 @@ +#include "kvm/framebuffer.h" +#include "kvm/kvm.h" + +#include <linux/kernel.h> +#include <linux/list.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <errno.h> + +static LIST_HEAD(framebuffers); + +struct framebuffer *fb__register(struct framebuffer *fb) +{ + INIT_LIST_HEAD(&fb->node); + list_add(&fb->node, &framebuffers); + + return fb; +} + +int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops) +{ + if (fb->nr_targets >= FB_MAX_TARGETS) + return -ENOSPC; + + fb->targets[fb->nr_targets++] = ops; + + return 0; +} + +static int start_targets(struct framebuffer *fb) +{ + unsigned long i; + + for (i = 0; i < fb->nr_targets; i++) { + struct fb_target_operations *ops = fb->targets[i]; + int err = 0; + + if (ops->start) + err = ops->start(fb); + + if (err) + return err; + } + + return 0; +} + +int fb__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + list_for_each_entry(fb, &framebuffers, node) { + int err; + + err = start_targets(fb); + if (err) + return err; + } + + return 0; +} +firmware_init(fb__init); + +int fb__exit(struct kvm *kvm) +{ + struct framebuffer *fb; + + list_for_each_entry(fb, &framebuffers, node) { + u32 i; + + for (i = 0; i < fb->nr_targets; i++) + if (fb->targets[i]->stop) + 
fb->targets[i]->stop(fb); + + munmap(fb->mem, fb->mem_size); + } + + return 0; +} +firmware_exit(fb__exit); diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c new file mode 100644 index 000000000000..8c49a0323451 --- /dev/null +++ b/tools/kvm/guest/init.c @@ -0,0 +1,76 @@ +/* + * This is a simple init for shared rootfs guests. This part should be limited + * to doing mounts and running stage 2 of the init process. + */ +#include <sys/mount.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> +#include <linux/reboot.h> + +static int run_process(char *filename) +{ + char *new_argv[] = { filename, NULL }; + char *new_env[] = { "TERM=linux", "DISPLAY=192.168.33.1:0", + "HOME=/virt/home", NULL }; + + return execve(filename, new_argv, new_env); +} + +static int run_process_sandbox(char *filename) +{ + char *new_argv[] = { filename, "/virt/sandbox.sh", NULL }; + char *new_env[] = { "TERM=linux", "HOME=/virt/home", NULL }; + + return execve(filename, new_argv, new_env); +} + +static void do_mounts(void) +{ + mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L"); + mount("", "/sys", "sysfs", 0, NULL); + mount("proc", "/proc", "proc", 0, NULL); + mount("devtmpfs", "/dev", "devtmpfs", 0, NULL); + mkdir("/dev/pts", 0755); + mount("devpts", "/dev/pts", "devpts", 0, NULL); +} + +int main(int argc, char *argv[]) +{ + pid_t child; + int status; + + puts("Mounting..."); + + do_mounts(); + + /* get session leader */ + setsid(); + + /* set controlling terminal */ + ioctl(0, TIOCSCTTY, 1); + + child = fork(); + if (child < 0) { + printf("Fatal: fork() failed with %d\n", child); + return 0; + } else if (child == 0) { + if (access("/virt/sandbox.sh", R_OK) == 0) + run_process_sandbox("/bin/sh"); + else + run_process("/bin/sh"); + } else { + pid_t corpse; + + do { + corpse = waitpid(-1, &status, 0); + } while (corpse != child); + } + + reboot(LINUX_REBOOT_CMD_RESTART); + + printf("Init failed: %s\n", strerror(errno)); + + 
return 0; +} diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c new file mode 100644 index 000000000000..fd4704b20b16 --- /dev/null +++ b/tools/kvm/guest_compat.c @@ -0,0 +1,99 @@ +#include "kvm/guest_compat.h" + +#include "kvm/mutex.h" + +#include <linux/kernel.h> +#include <linux/list.h> + +struct compat_message { + int id; + char *title; + char *desc; + + struct list_head list; +}; + +static int id; +static DEFINE_MUTEX(compat_mtx); +static LIST_HEAD(messages); + +static void compat__free(struct compat_message *msg) +{ + free(msg->title); + free(msg->desc); + free(msg); +} + +int compat__add_message(const char *title, const char *desc) +{ + struct compat_message *msg; + int msg_id; + + msg = malloc(sizeof(*msg)); + if (msg == NULL) + goto cleanup; + + msg->title = strdup(title); + msg->desc = strdup(desc); + + if (msg->title == NULL || msg->desc == NULL) + goto cleanup; + + mutex_lock(&compat_mtx); + + msg->id = msg_id = id++; + list_add_tail(&msg->list, &messages); + + mutex_unlock(&compat_mtx); + + return msg_id; + +cleanup: + if (msg) + compat__free(msg); + + return -ENOMEM; +} + +int compat__remove_message(int id) +{ + struct compat_message *pos, *n; + + mutex_lock(&compat_mtx); + + list_for_each_entry_safe(pos, n, &messages, list) { + if (pos->id == id) { + list_del(&pos->list); + compat__free(pos); + + mutex_unlock(&compat_mtx); + + return 0; + } + } + + mutex_unlock(&compat_mtx); + + return -ENOENT; +} + +int compat__print_all_messages(void) +{ + mutex_lock(&compat_mtx); + + while (!list_empty(&messages)) { + struct compat_message *msg; + + msg = list_first_entry(&messages, struct compat_message, list); + + printf("\n # KVM compatibility warning.\n\t%s\n\t%s\n", + msg->title, msg->desc); + + list_del(&msg->list); + compat__free(msg); + } + + mutex_unlock(&compat_mtx); + + return 0; +} diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c new file mode 100644 index 000000000000..90357326e171 --- /dev/null +++ b/tools/kvm/hw/i8042.c @@ -0,0 
+1,355 @@ +#include "kvm/read-write.h" +#include "kvm/ioport.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/kvm.h" +#include "kvm/i8042.h" +#include "kvm/kvm-cpu.h" + +#include <stdint.h> + +/* + * IRQs + */ +#define KBD_IRQ 1 +#define AUX_IRQ 12 + +/* + * Registers + */ +#define I8042_DATA_REG 0x60 +#define I8042_COMMAND_REG 0x64 + +/* + * Commands + */ +#define I8042_CMD_CTL_RCTR 0x20 +#define I8042_CMD_CTL_WCTR 0x60 +#define I8042_CMD_AUX_LOOP 0xD3 +#define I8042_CMD_AUX_SEND 0xD4 +#define I8042_CMD_AUX_TEST 0xA9 +#define I8042_CMD_AUX_DISABLE 0xA7 +#define I8042_CMD_AUX_ENABLE 0xA8 +#define I8042_CMD_SYSTEM_RESET 0xFE + +#define RESPONSE_ACK 0xFA + +#define MODE_DISABLE_AUX 0x20 + +#define AUX_ENABLE_REPORTING 0x20 +#define AUX_SCALING_FLAG 0x10 +#define AUX_DEFAULT_RESOLUTION 0x2 +#define AUX_DEFAULT_SAMPLE 100 + +/* + * Status register bits + */ +#define I8042_STR_AUXDATA 0x20 +#define I8042_STR_KEYLOCK 0x10 +#define I8042_STR_CMDDAT 0x08 +#define I8042_STR_MUXERR 0x04 +#define I8042_STR_OBF 0x01 + +#define KBD_MODE_KBD_INT 0x01 +#define KBD_MODE_SYS 0x02 + +#define QUEUE_SIZE 128 + +/* + * This represents the current state of the PS/2 keyboard system, + * including the AUX device (the mouse) + */ +struct kbd_state { + struct kvm *kvm; + + char kq[QUEUE_SIZE]; /* Keyboard queue */ + int kread, kwrite; /* Indexes into the queue */ + int kcount; /* number of elements in queue */ + + char mq[QUEUE_SIZE]; + int mread, mwrite; + int mcount; + + u8 mstatus; /* Mouse status byte */ + u8 mres; /* Current mouse resolution */ + u8 msample; /* Current mouse samples/second */ + + u8 mode; /* i8042 mode register */ + u8 status; /* i8042 status register */ + /* + * Some commands (on port 0x64) have arguments; + * we store the command here while we wait for the argument + */ + u32 write_cmd; +}; + +static struct kbd_state state; + +/* + * If there are packets to be read, set the appropriate IRQs high + */ +static void 
kbd_update_irq(void) +{ + u8 klevel = 0; + u8 mlevel = 0; + + /* First, clear the kbd and aux output buffer full bits */ + state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA); + + if (state.kcount > 0) { + state.status |= I8042_STR_OBF; + klevel = 1; + } + + /* Keyboard has higher priority than mouse */ + if (klevel == 0 && state.mcount != 0) { + state.status |= I8042_STR_OBF | I8042_STR_AUXDATA; + mlevel = 1; + } + + kvm__irq_line(state.kvm, KBD_IRQ, klevel); + kvm__irq_line(state.kvm, AUX_IRQ, mlevel); +} + +/* + * Add a byte to the mouse queue, then set IRQs + */ +void mouse_queue(u8 c) +{ + if (state.mcount >= QUEUE_SIZE) + return; + + state.mq[state.mwrite++ % QUEUE_SIZE] = c; + + state.mcount++; + kbd_update_irq(); +} + +/* + * Add a byte to the keyboard queue, then set IRQs + */ +void kbd_queue(u8 c) +{ + if (state.kcount >= QUEUE_SIZE) + return; + + state.kq[state.kwrite++ % QUEUE_SIZE] = c; + + state.kcount++; + kbd_update_irq(); +} + +static void kbd_write_command(struct kvm *kvm, u8 val) +{ + switch (val) { + case I8042_CMD_CTL_RCTR: + kbd_queue(state.mode); + break; + case I8042_CMD_CTL_WCTR: + case I8042_CMD_AUX_SEND: + case I8042_CMD_AUX_LOOP: + state.write_cmd = val; + break; + case I8042_CMD_AUX_TEST: + /* 0 means we're a normal PS/2 mouse */ + mouse_queue(0); + break; + case I8042_CMD_AUX_DISABLE: + state.mode |= MODE_DISABLE_AUX; + break; + case I8042_CMD_AUX_ENABLE: + state.mode &= ~MODE_DISABLE_AUX; + break; + case I8042_CMD_SYSTEM_RESET: + kvm_cpu__reboot(kvm); + break; + default: + break; + } +} + +/* + * Called when the OS reads from port 0x60 (PS/2 data) + */ +static u32 kbd_read_data(void) +{ + u32 ret; + int i; + + if (state.kcount != 0) { + /* Keyboard data gets read first */ + ret = state.kq[state.kread++ % QUEUE_SIZE]; + state.kcount--; + kvm__irq_line(state.kvm, KBD_IRQ, 0); + kbd_update_irq(); + } else if (state.mcount > 0) { + /* Followed by the mouse */ + ret = state.mq[state.mread++ % QUEUE_SIZE]; + state.mcount--; + 
kvm__irq_line(state.kvm, AUX_IRQ, 0); + kbd_update_irq(); + } else { + i = state.kread - 1; + if (i < 0) + i = QUEUE_SIZE; + ret = state.kq[i]; + } + return ret; +} + +/* + * Called when the OS read from port 0x64, the command port + */ +static u32 kbd_read_status(void) +{ + return (u32)state.status; +} + +/* + * Called when the OS writes to port 0x60 (data port) + * Things written here are generally arguments to commands previously + * written to port 0x64 and stored in state.write_cmd + */ +static void kbd_write_data(u32 val) +{ + switch (state.write_cmd) { + case I8042_CMD_CTL_WCTR: + state.mode = val; + kbd_update_irq(); + break; + case I8042_CMD_AUX_LOOP: + mouse_queue(val); + mouse_queue(RESPONSE_ACK); + break; + case I8042_CMD_AUX_SEND: + /* The OS wants to send a command to the mouse */ + mouse_queue(RESPONSE_ACK); + switch (val) { + case 0xe6: + /* set scaling = 1:1 */ + state.mstatus &= ~AUX_SCALING_FLAG; + break; + case 0xe8: + /* set resolution */ + state.mres = val; + break; + case 0xe9: + /* Report mouse status/config */ + mouse_queue(state.mstatus); + mouse_queue(state.mres); + mouse_queue(state.msample); + break; + case 0xf2: + /* send ID */ + mouse_queue(0); /* normal mouse */ + break; + case 0xf3: + /* set sample rate */ + state.msample = val; + break; + case 0xf4: + /* enable reporting */ + state.mstatus |= AUX_ENABLE_REPORTING; + break; + case 0xf5: + state.mstatus &= ~AUX_ENABLE_REPORTING; + break; + case 0xf6: + /* set defaults, just fall through to reset */ + case 0xff: + /* reset */ + state.mstatus = 0x0; + state.mres = AUX_DEFAULT_RESOLUTION; + state.msample = AUX_DEFAULT_SAMPLE; + break; + default: + break; + } + break; + case 0: + /* Just send the ID */ + kbd_queue(RESPONSE_ACK); + kbd_queue(0xab); + kbd_queue(0x41); + kbd_update_irq(); + break; + default: + /* Yeah whatever */ + break; + } + state.write_cmd = 0; +} + +static void kbd_reset(void) +{ + state = (struct kbd_state) { + .status = I8042_STR_MUXERR | I8042_STR_CMDDAT | 
I8042_STR_KEYLOCK, /* 0x1c */ + .mode = KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */ + .mres = AUX_DEFAULT_RESOLUTION, + .msample = AUX_DEFAULT_SAMPLE, + }; +} + +/* + * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64) + */ +static bool kbd_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (port) { + case I8042_COMMAND_REG: { + u8 value = kbd_read_status(); + ioport__write8(data, value); + break; + } + case I8042_DATA_REG: { + u32 value = kbd_read_data(); + ioport__write32(data, value); + break; + } + default: + return false; + } + + return true; +} + +static bool kbd_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (port) { + case I8042_COMMAND_REG: { + u8 value = ioport__read8(data); + kbd_write_command(kvm, value); + break; + } + case I8042_DATA_REG: { + u32 value = ioport__read32(data); + kbd_write_data(value); + break; + } + default: + return false; + } + + return true; +} + +static struct ioport_operations kbd_ops = { + .io_in = kbd_in, + .io_out = kbd_out, +}; + +int kbd__init(struct kvm *kvm) +{ +#ifndef CONFIG_X86 + return 0; +#endif + + kbd_reset(); + state.kvm = kvm; + ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL); + ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL); + + return 0; +} +dev_init(kbd__init); diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c new file mode 100644 index 000000000000..ec3f7711b986 --- /dev/null +++ b/tools/kvm/hw/pci-shmem.c @@ -0,0 +1,411 @@ +#include "kvm/devices.h" +#include "kvm/pci-shmem.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/util.h" +#include "kvm/ioport.h" +#include "kvm/ioeventfd.h" + +#include <linux/kvm.h> +#include <linux/byteorder.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <sys/mman.h> + +#define MB_SHIFT (20) +#define KB_SHIFT (10) +#define GB_SHIFT (30) + +static struct pci_device_header 
pci_shmem_pci_device = { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(0x1110), + .header_type = PCI_HEADER_TYPE_NORMAL, + .class[2] = 0xFF, /* misc pci device */ + .status = cpu_to_le16(PCI_STATUS_CAP_LIST), + .capabilities = (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device, + .msix.cap = PCI_CAP_ID_MSIX, + .msix.ctrl = cpu_to_le16(1), + .msix.table_offset = cpu_to_le32(1), /* Use BAR 1 */ + .msix.pba_offset = cpu_to_le32(0x1001), /* Use BAR 1 */ +}; + +static struct device_header pci_shmem_device = { + .bus_type = DEVICE_BUS_PCI, + .data = &pci_shmem_pci_device, +}; + +/* registers for the Inter-VM shared memory device */ +enum ivshmem_registers { + INTRMASK = 0, + INTRSTATUS = 4, + IVPOSITION = 8, + DOORBELL = 12, +}; + +static struct shmem_info *shmem_region; +static u16 ivshmem_registers; +static int local_fd; +static u32 local_id; +static u64 msix_block; +static u64 msix_pba; +static struct msix_table msix_table[2]; + +int pci_shmem__register_mem(struct shmem_info *si) +{ + if (shmem_region == NULL) { + shmem_region = si; + } else { + pr_warning("only single shmem currently avail. 
ignoring.\n"); + free(si); + } + return 0; +} + +static bool shmem_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u16 offset = port - ivshmem_registers; + + switch (offset) { + case INTRMASK: + break; + case INTRSTATUS: + break; + case IVPOSITION: + ioport__write32(data, local_id); + break; + case DOORBELL: + break; + }; + + return true; +} + +static bool shmem_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u16 offset = port - ivshmem_registers; + + switch (offset) { + case INTRMASK: + break; + case INTRSTATUS: + break; + case IVPOSITION: + break; + case DOORBELL: + break; + }; + + return true; +} + +static struct ioport_operations shmem_pci__io_ops = { + .io_in = shmem_pci__io_in, + .io_out = shmem_pci__io_out, +}; + +static void callback_mmio_msix(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) +{ + void *mem; + + if (addr - msix_block < 0x1000) + mem = &msix_table; + else + mem = &msix_pba; + + if (is_write) + memcpy(mem + addr - msix_block, data, len); + else + memcpy(data, mem + addr - msix_block, len); +} + +/* + * Return an irqfd which can be used by other guests to signal this guest + * whenever they need to poke it + */ +int pci_shmem__get_local_irqfd(struct kvm *kvm) +{ + int fd, gsi, r; + struct kvm_irqfd irqfd; + + if (local_fd == 0) { + fd = eventfd(0, 0); + if (fd < 0) + return fd; + + if (pci_shmem_pci_device.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE)) { + gsi = irq__add_msix_route(kvm, &msix_table[0].msg); + } else { + gsi = pci_shmem_pci_device.irq_line; + } + + irqfd = (struct kvm_irqfd) { + .fd = fd, + .gsi = gsi, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd); + if (r < 0) + return r; + + local_fd = fd; + } + + return local_fd; +} + +/* + * Connect a new client to ivshmem by adding the appropriate datamatch + * to the DOORBELL + */ +int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd) +{ + struct kvm_ioeventfd ioevent; + + ioevent = (struct 
kvm_ioeventfd) { + .addr = ivshmem_registers + DOORBELL, + .len = sizeof(u32), + .datamatch = id, + .fd = fd, + .flags = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent); +} + +/* + * Remove a client connected to ivshmem by removing the appropriate datamatch + * from the DOORBELL + */ +int pci_shmem__remove_client(struct kvm *kvm, u32 id) +{ + struct kvm_ioeventfd ioevent; + + ioevent = (struct kvm_ioeventfd) { + .addr = ivshmem_registers + DOORBELL, + .len = sizeof(u32), + .datamatch = id, + .flags = KVM_IOEVENTFD_FLAG_PIO + | KVM_IOEVENTFD_FLAG_DATAMATCH + | KVM_IOEVENTFD_FLAG_DEASSIGN, + }; + + return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent); +} + +static void *setup_shmem(const char *key, size_t len, int creating) +{ + int fd; + int rtn; + void *mem; + int flag = O_RDWR; + + if (creating) + flag |= O_CREAT; + + fd = shm_open(key, flag, S_IRUSR | S_IWUSR); + if (fd < 0) { + pr_warning("Failed to open shared memory file %s\n", key); + return NULL; + } + + if (creating) { + rtn = ftruncate(fd, (off_t) len); + if (rtn < 0) + pr_warning("Can't ftruncate(fd,%zu)\n", len); + } + mem = mmap(NULL, len, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0); + if (mem == MAP_FAILED) { + pr_warning("Failed to mmap shared memory file"); + mem = NULL; + } + close(fd); + + return mem; +} + +int shmem_parser(const struct option *opt, const char *arg, int unset) +{ + const u64 default_size = SHMEM_DEFAULT_SIZE; + const u64 default_phys_addr = SHMEM_DEFAULT_ADDR; + const char *default_handle = SHMEM_DEFAULT_HANDLE; + struct shmem_info *si = malloc(sizeof(struct shmem_info)); + u64 phys_addr; + u64 size; + char *handle = NULL; + int create = 0; + const char *p = arg; + char *next; + int base = 10; + int verbose = 0; + + const int skip_pci = strlen("pci:"); + if (verbose) + pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset); + /* parse out optional addr family */ + if (strcasestr(p, "pci:")) { + p += 
skip_pci; + } else if (strcasestr(p, "mem:")) { + die("I can't add to E820 map yet.\n"); + } + /* parse out physical addr */ + base = 10; + if (strcasestr(p, "0x")) + base = 16; + phys_addr = strtoll(p, &next, base); + if (next == p && phys_addr == 0) { + pr_info("shmem: no physical addr specified, using default."); + phys_addr = default_phys_addr; + } + if (*next != ':' && *next != '\0') + die("shmem: unexpected chars after phys addr.\n"); + if (*next == '\0') + p = next; + else + p = next + 1; + /* parse out size */ + base = 10; + if (strcasestr(p, "0x")) + base = 16; + size = strtoll(p, &next, base); + if (next == p && size == 0) { + pr_info("shmem: no size specified, using default."); + size = default_size; + } + /* look for [KMGkmg][Bb]* uses base 2. */ + int skip_B = 0; + if (strspn(next, "KMGkmg")) { /* might have a prefix */ + if (*(next + 1) == 'B' || *(next + 1) == 'b') + skip_B = 1; + switch (*next) { + case 'K': + case 'k': + size = size << KB_SHIFT; + break; + case 'M': + case 'm': + size = size << MB_SHIFT; + break; + case 'G': + case 'g': + size = size << GB_SHIFT; + break; + default: + die("shmem: bug in detecting size prefix."); + break; + } + next += 1 + skip_B; + } + if (*next != ':' && *next != '\0') { + die("shmem: unexpected chars after phys size. <%c><%c>\n", + *next, *p); + } + if (*next == '\0') + p = next; + else + p = next + 1; + /* parse out optional shmem handle */ + const int skip_handle = strlen("handle="); + next = strcasestr(p, "handle="); + if (*p && next) { + if (p != next) + die("unexpected chars before handle\n"); + p += skip_handle; + next = strchrnul(p, ':'); + if (next - p) { + handle = malloc(next - p + 1); + strncpy(handle, p, next - p); + handle[next - p] = '\0'; /* just in case. */ + } + if (*next == '\0') + p = next; + else + p = next + 1; + } + /* parse optional create flag to see if we should create shm seg. 
*/ + if (*p && strcasestr(p, "create")) { + create = 1; + p += strlen("create"); + } + if (*p != '\0') + die("shmem: unexpected trailing chars\n"); + if (handle == NULL) { + handle = malloc(strlen(default_handle) + 1); + strcpy(handle, default_handle); + } + if (verbose) { + pr_info("shmem: phys_addr = %llx", phys_addr); + pr_info("shmem: size = %llx", size); + pr_info("shmem: handle = %s", handle); + pr_info("shmem: create = %d", create); + } + + si->phys_addr = phys_addr; + si->size = size; + si->handle = handle; + si->create = create; + pci_shmem__register_mem(si); /* ownership of si, etc. passed on. */ + return 0; +} + +int pci_shmem__init(struct kvm *kvm) +{ + u8 line, pin; + char *mem; + int r; + + if (shmem_region == NULL) + return 0; + + /* Register good old INTx */ + r = irq__register_device(PCI_DEVICE_ID_PCI_SHMEM, &pin, &line); + if (r < 0) + return r; + + pci_shmem_pci_device.irq_pin = pin; + pci_shmem_pci_device.irq_line = line; + + /* Register MMIO space for MSI-X */ + r = ioport__register(kvm, IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL); + if (r < 0) + return r; + ivshmem_registers = (u16)r; + + msix_block = pci_get_io_space_block(0x1010); + kvm__register_mmio(kvm, msix_block, 0x1010, false, callback_mmio_msix, NULL); + + /* + * This registers 3 BARs: + * + * 0 - ivshmem registers + * 1 - MSI-X MMIO space + * 2 - Shared memory block + */ + pci_shmem_pci_device.bar[0] = cpu_to_le32(ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO); + pci_shmem_pci_device.bar_size[0] = shmem_region->size; + pci_shmem_pci_device.bar[1] = cpu_to_le32(msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY); + pci_shmem_pci_device.bar_size[1] = 0x1010; + pci_shmem_pci_device.bar[2] = cpu_to_le32(shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY); + pci_shmem_pci_device.bar_size[2] = shmem_region->size; + + device__register(&pci_shmem_device); + + /* Open shared memory and plug it into the guest */ + mem = setup_shmem(shmem_region->handle, shmem_region->size, + 
shmem_region->create); + if (mem == NULL) + return -EINVAL; + + kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size, + mem); + return 0; +} +dev_init(pci_shmem__init); + +int pci_shmem__exit(struct kvm *kvm) +{ + return 0; +} +dev_exit(pci_shmem__exit); diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c new file mode 100644 index 000000000000..5232bd791873 --- /dev/null +++ b/tools/kvm/hw/rtc.c @@ -0,0 +1,155 @@ +#include "kvm/rtc.h" + +#include "kvm/ioport.h" +#include "kvm/kvm.h" + +#include <time.h> + +/* + * MC146818 RTC registers + */ +#define RTC_SECONDS 0x00 +#define RTC_SECONDS_ALARM 0x01 +#define RTC_MINUTES 0x02 +#define RTC_MINUTES_ALARM 0x03 +#define RTC_HOURS 0x04 +#define RTC_HOURS_ALARM 0x05 +#define RTC_DAY_OF_WEEK 0x06 +#define RTC_DAY_OF_MONTH 0x07 +#define RTC_MONTH 0x08 +#define RTC_YEAR 0x09 +#define RTC_CENTURY 0x32 + +#define RTC_REG_A 0x0A +#define RTC_REG_B 0x0B +#define RTC_REG_C 0x0C +#define RTC_REG_D 0x0D + +struct rtc_device { + u8 cmos_idx; + u8 cmos_data[128]; +}; + +static struct rtc_device rtc; + +static inline unsigned char bin2bcd(unsigned val) +{ + return ((val / 10) << 4) + val % 10; +} + +static bool cmos_ram_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + struct tm *tm; + time_t ti; + + time(&ti); + + tm = gmtime(&ti); + + switch (rtc.cmos_idx) { + case RTC_SECONDS: + ioport__write8(data, bin2bcd(tm->tm_sec)); + break; + case RTC_MINUTES: + ioport__write8(data, bin2bcd(tm->tm_min)); + break; + case RTC_HOURS: + ioport__write8(data, bin2bcd(tm->tm_hour)); + break; + case RTC_DAY_OF_WEEK: + ioport__write8(data, bin2bcd(tm->tm_wday + 1)); + break; + case RTC_DAY_OF_MONTH: + ioport__write8(data, bin2bcd(tm->tm_mday)); + break; + case RTC_MONTH: + ioport__write8(data, bin2bcd(tm->tm_mon + 1)); + break; + case RTC_YEAR: { + int year; + + year = tm->tm_year + 1900; + + ioport__write8(data, bin2bcd(year % 100)); + + break; + } + case RTC_CENTURY: { + int year; + + year = 
tm->tm_year + 1900; + + ioport__write8(data, bin2bcd(year / 100)); + + break; + } + default: + ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]); + break; + } + + return true; +} + +static bool cmos_ram_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + switch (rtc.cmos_idx) { + case RTC_REG_C: + case RTC_REG_D: + /* Read-only */ + break; + default: + rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data); + break; + } + + return true; +} + +static struct ioport_operations cmos_ram_data_ioport_ops = { + .io_out = cmos_ram_data_out, + .io_in = cmos_ram_data_in, +}; + +static bool cmos_ram_index_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + u8 value = ioport__read8(data); + + kvm->nmi_disabled = value & (1UL << 7); + rtc.cmos_idx = value & ~(1UL << 7); + + return true; +} + +static struct ioport_operations cmos_ram_index_ioport_ops = { + .io_out = cmos_ram_index_out, +}; + +int rtc__init(struct kvm *kvm) +{ + int r = 0; + + /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */ + r = ioport__register(kvm, 0x0070, &cmos_ram_index_ioport_ops, 1, NULL); + if (r < 0) + return r; + + r = ioport__register(kvm, 0x0071, &cmos_ram_data_ioport_ops, 1, NULL); + if (r < 0) { + ioport__unregister(kvm, 0x0071); + return r; + } + + return r; +} +dev_init(rtc__init); + +int rtc__exit(struct kvm *kvm) +{ + /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */ + ioport__unregister(kvm, 0x0070); + ioport__unregister(kvm, 0x0071); + + return 0; +} +dev_exit(rtc__exit); diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c new file mode 100644 index 000000000000..53b684ab9dc3 --- /dev/null +++ b/tools/kvm/hw/serial.c @@ -0,0 +1,452 @@ +#include "kvm/8250-serial.h" + +#include "kvm/read-write.h" +#include "kvm/ioport.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/kvm.h" + +#include <linux/types.h> +#include <linux/serial_reg.h> + +#include <pthread.h> + +/* + * This fakes a 
U6_16550A. The fifo len needs to be 64 as the kernel + * expects that for autodetection. + */ +#define FIFO_LEN 64 +#define FIFO_MASK (FIFO_LEN - 1) + +#define UART_IIR_TYPE_BITS 0xc0 + +struct serial8250_device { + struct mutex mutex; + u8 id; + + u16 iobase; + u8 irq; + u8 irq_state; + int txcnt; + int rxcnt; + int rxdone; + char txbuf[FIFO_LEN]; + char rxbuf[FIFO_LEN]; + + u8 dll; + u8 dlm; + u8 iir; + u8 ier; + u8 fcr; + u8 lcr; + u8 mcr; + u8 lsr; + u8 msr; + u8 scr; +}; + +#define SERIAL_REGS_SETTING \ + .iir = UART_IIR_NO_INT, \ + .lsr = UART_LSR_TEMT | UART_LSR_THRE, \ + .msr = UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \ + .mcr = UART_MCR_OUT2, + +static struct serial8250_device devices[] = { + /* ttyS0 */ + [0] = { + .mutex = MUTEX_INITIALIZER, + + .id = 0, + .iobase = 0x3f8, + .irq = 4, + + SERIAL_REGS_SETTING + }, + /* ttyS1 */ + [1] = { + .mutex = MUTEX_INITIALIZER, + + .id = 1, + .iobase = 0x2f8, + .irq = 3, + + SERIAL_REGS_SETTING + }, + /* ttyS2 */ + [2] = { + .mutex = MUTEX_INITIALIZER, + + .id = 2, + .iobase = 0x3e8, + .irq = 4, + + SERIAL_REGS_SETTING + }, + /* ttyS3 */ + [3] = { + .mutex = MUTEX_INITIALIZER, + + .id = 3, + .iobase = 0x2e8, + .irq = 3, + + SERIAL_REGS_SETTING + }, +}; + +static void serial8250_flush_tx(struct kvm *kvm, struct serial8250_device *dev) +{ + dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE; + + if (dev->txcnt) { + if (kvm->cfg.active_console == CONSOLE_8250) + term_putc(dev->txbuf, dev->txcnt, dev->id); + dev->txcnt = 0; + } +} + +static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev) +{ + u8 iir = 0; + + /* Handle clear rx */ + if (dev->lcr & UART_FCR_CLEAR_RCVR) { + dev->lcr &= ~UART_FCR_CLEAR_RCVR; + dev->rxcnt = dev->rxdone = 0; + dev->lsr &= ~UART_LSR_DR; + } + + /* Handle clear tx */ + if (dev->lcr & UART_FCR_CLEAR_XMIT) { + dev->lcr &= ~UART_FCR_CLEAR_XMIT; + dev->txcnt = 0; + dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE; + } + + /* Data ready and rcv interrupt enabled ? 
*/ + if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR)) + iir |= UART_IIR_RDI; + + /* Transmitter empty and interrupt enabled ? */ + if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT)) + iir |= UART_IIR_THRI; + + /* Now update the irq line, if necessary */ + if (!iir) { + dev->iir = UART_IIR_NO_INT; + if (dev->irq_state) + kvm__irq_line(kvm, dev->irq, 0); + } else { + dev->iir = iir; + if (!dev->irq_state) + kvm__irq_line(kvm, dev->irq, 1); + } + dev->irq_state = iir; + + /* + * If the kernel disabled the tx interrupt, we know that there + * is nothing more to transmit, so we can reset our tx logic + * here. + */ + if (!(dev->ier & UART_IER_THRI)) + serial8250_flush_tx(kvm, dev); +} + +#define SYSRQ_PENDING_NONE 0 + +static int sysrq_pending; + +static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev) +{ + dev->lsr |= UART_LSR_DR | UART_LSR_BI; + dev->rxbuf[dev->rxcnt++] = sysrq_pending; + sysrq_pending = SYSRQ_PENDING_NONE; +} + +static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev, + bool handle_sysrq) +{ + int c; + + /* + * If the guest transmitted a full fifo, we clear the + * TEMT/THRE bits to let the kernel escape from the 8250 + * interrupt handler. We come here only once a ms, so that + * should give the kernel the desired pause. That also flushes + * the tx fifo to the terminal. 
+ */ + serial8250_flush_tx(kvm, dev); + + if (dev->mcr & UART_MCR_LOOP) + return; + + if ((dev->lsr & UART_LSR_DR) || dev->rxcnt) + return; + + if (handle_sysrq && sysrq_pending) { + serial8250__sysrq(kvm, dev); + return; + } + + if (kvm->cfg.active_console != CONSOLE_8250) + return; + + while (term_readable(dev->id) && + dev->rxcnt < FIFO_LEN) { + + c = term_getc(kvm, dev->id); + + if (c < 0) + break; + dev->rxbuf[dev->rxcnt++] = c; + dev->lsr |= UART_LSR_DR; + } +} + +void serial8250__update_consoles(struct kvm *kvm) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + mutex_lock(&dev->mutex); + + /* Restrict sysrq injection to the first port */ + serial8250__receive(kvm, dev, i == 0); + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + } +} + +void serial8250__inject_sysrq(struct kvm *kvm, char sysrq) +{ + sysrq_pending = sysrq; +} + +static struct serial8250_device *find_device(u16 port) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + if (dev->iobase == (port & ~0x7)) + return dev; + } + return NULL; +} + +static bool serial8250_out(struct ioport *ioport, struct kvm *kvm, u16 port, + void *data, int size) +{ + struct serial8250_device *dev; + u16 offset; + bool ret = true; + char *addr = data; + + dev = find_device(port); + if (!dev) + return false; + + mutex_lock(&dev->mutex); + + offset = port - dev->iobase; + + switch (offset) { + case UART_TX: + if (dev->lcr & UART_LCR_DLAB) { + dev->dll = ioport__read8(data); + break; + } + + /* Loopback mode */ + if (dev->mcr & UART_MCR_LOOP) { + if (dev->rxcnt < FIFO_LEN) { + dev->rxbuf[dev->rxcnt++] = *addr; + dev->lsr |= UART_LSR_DR; + } + break; + } + + if (dev->txcnt < FIFO_LEN) { + dev->txbuf[dev->txcnt++] = *addr; + dev->lsr &= ~UART_LSR_TEMT; + if (dev->txcnt == FIFO_LEN / 2) + dev->lsr &= ~UART_LSR_THRE; + } else { + /* Should never happpen */ + dev->lsr &= 
~(UART_LSR_TEMT | UART_LSR_THRE); + } + break; + case UART_IER: + if (!(dev->lcr & UART_LCR_DLAB)) + dev->ier = ioport__read8(data) & 0x0f; + else + dev->dlm = ioport__read8(data); + break; + case UART_FCR: + dev->fcr = ioport__read8(data); + break; + case UART_LCR: + dev->lcr = ioport__read8(data); + break; + case UART_MCR: + dev->mcr = ioport__read8(data); + break; + case UART_LSR: + /* Factory test */ + break; + case UART_MSR: + /* Not used */ + break; + case UART_SCR: + dev->scr = ioport__read8(data); + break; + default: + ret = false; + break; + } + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + + return ret; +} + +static void serial8250_rx(struct serial8250_device *dev, void *data) +{ + if (dev->rxdone == dev->rxcnt) + return; + + /* Break issued ? */ + if (dev->lsr & UART_LSR_BI) { + dev->lsr &= ~UART_LSR_BI; + ioport__write8(data, 0); + return; + } + + ioport__write8(data, dev->rxbuf[dev->rxdone++]); + if (dev->rxcnt == dev->rxdone) { + dev->lsr &= ~UART_LSR_DR; + dev->rxcnt = dev->rxdone = 0; + } +} + +static bool serial8250_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + struct serial8250_device *dev; + u16 offset; + bool ret = true; + + dev = find_device(port); + if (!dev) + return false; + + mutex_lock(&dev->mutex); + + offset = port - dev->iobase; + + switch (offset) { + case UART_RX: + if (dev->lcr & UART_LCR_DLAB) + ioport__write8(data, dev->dll); + else + serial8250_rx(dev, data); + break; + case UART_IER: + if (dev->lcr & UART_LCR_DLAB) + ioport__write8(data, dev->dlm); + else + ioport__write8(data, dev->ier); + break; + case UART_IIR: + ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS); + break; + case UART_LCR: + ioport__write8(data, dev->lcr); + break; + case UART_MCR: + ioport__write8(data, dev->mcr); + break; + case UART_LSR: + ioport__write8(data, dev->lsr); + break; + case UART_MSR: + ioport__write8(data, dev->msr); + break; + case UART_SCR: + ioport__write8(data, dev->scr); + break; + 
default: + ret = false; + break; + } + + serial8250_update_irq(kvm, dev); + + mutex_unlock(&dev->mutex); + + return ret; +} + +static struct ioport_operations serial8250_ops = { + .io_in = serial8250_in, + .io_out = serial8250_out, +}; + +static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev) +{ + int r; + + r = ioport__register(kvm, dev->iobase, &serial8250_ops, 8, NULL); + kvm__irq_line(kvm, dev->irq, 0); + + return r; +} + +int serial8250__init(struct kvm *kvm) +{ + unsigned int i, j; + int r = 0; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + r = serial8250__device_init(kvm, dev); + if (r < 0) + goto cleanup; + } + + return r; +cleanup: + for (j = 0; j <= i; j++) { + struct serial8250_device *dev = &devices[j]; + + ioport__unregister(kvm, dev->iobase); + } + + return r; +} +dev_init(serial8250__init); + +int serial8250__exit(struct kvm *kvm) +{ + unsigned int i; + int r; + + for (i = 0; i < ARRAY_SIZE(devices); i++) { + struct serial8250_device *dev = &devices[i]; + + r = ioport__unregister(kvm, dev->iobase); + if (r < 0) + return r; + } + + return 0; +} +dev_exit(serial8250__exit); diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c new file mode 100644 index 000000000000..33a675f633a0 --- /dev/null +++ b/tools/kvm/hw/vesa.c @@ -0,0 +1,95 @@ +#include "kvm/vesa.h" + +#include "kvm/devices.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/framebuffer.h" +#include "kvm/kvm-cpu.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" + +#include <linux/byteorder.h> +#include <sys/mman.h> +#include <linux/err.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <inttypes.h> +#include <unistd.h> + +static bool vesa_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static bool vesa_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int 
size) +{ + return true; +} + +static struct ioport_operations vesa_io_ops = { + .io_in = vesa_pci_io_in, + .io_out = vesa_pci_io_out, +}; + +static struct pci_device_header vesa_pci_device = { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(PCI_DEVICE_ID_VESA), + .header_type = PCI_HEADER_TYPE_NORMAL, + .revision_id = 0, + .class[2] = 0x03, + .subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET), + .subsys_id = cpu_to_le16(PCI_SUBSYSTEM_ID_VESA), + .bar[1] = cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY), + .bar_size[1] = VESA_MEM_SIZE, +}; + +static struct device_header vesa_device = { + .bus_type = DEVICE_BUS_PCI, + .data = &vesa_pci_device, +}; + +static struct framebuffer vesafb; + +struct framebuffer *vesa__init(struct kvm *kvm) +{ + u16 vesa_base_addr; + u8 line, pin; + char *mem; + int r; + + if (!kvm->cfg.vnc && !kvm->cfg.sdl) + return NULL; + + r = irq__register_device(PCI_DEVICE_ID_VESA, &pin, &line); + if (r < 0) + return ERR_PTR(r); + + r = ioport__register(kvm, IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL); + if (r < 0) + return ERR_PTR(r); + + vesa_pci_device.irq_pin = pin; + vesa_pci_device.irq_line = line; + vesa_base_addr = (u16)r; + vesa_pci_device.bar[0] = cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO); + device__register(&vesa_device); + + mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0); + if (mem == MAP_FAILED) + ERR_PTR(-errno); + + kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem); + + vesafb = (struct framebuffer) { + .width = VESA_WIDTH, + .height = VESA_HEIGHT, + .depth = VESA_BPP, + .mem = mem, + .mem_addr = VESA_MEM_ADDR, + .mem_size = VESA_MEM_SIZE, + .kvm = kvm, + }; + return fb__register(&vesafb); +} diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h new file mode 100644 index 000000000000..1a439777bb45 --- /dev/null +++ b/tools/kvm/include/asm/hweight.h @@ -0,0 +1,8 @@ +#ifndef _KVM_ASM_HWEIGHT_H_ 
+#define _KVM_ASM_HWEIGHT_H_ + +#include <linux/types.h> +unsigned int hweight32(unsigned int w); +unsigned long hweight64(__u64 w); + +#endif /* _KVM_ASM_HWEIGHT_H_ */ diff --git a/tools/kvm/include/bios/memcpy.h b/tools/kvm/include/bios/memcpy.h new file mode 100644 index 000000000000..e0210449e80f --- /dev/null +++ b/tools/kvm/include/bios/memcpy.h @@ -0,0 +1,9 @@ +#ifndef KVM_BIOS_MEMCPY_H +#define KVM_BIOS_MEMCPY_H + +#include <linux/types.h> +#include <stddef.h> + +void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len); + +#endif /* KVM_BIOS_MEMCPY_H */ diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h new file mode 100644 index 000000000000..e9545517351c --- /dev/null +++ b/tools/kvm/include/kvm/8250-serial.h @@ -0,0 +1,11 @@ +#ifndef KVM__8250_SERIAL_H +#define KVM__8250_SERIAL_H + +struct kvm; + +int serial8250__init(struct kvm *kvm); +int serial8250__exit(struct kvm *kvm); +void serial8250__update_consoles(struct kvm *kvm); +void serial8250__inject_sysrq(struct kvm *kvm, char sysrq); + +#endif /* KVM__8250_SERIAL_H */ diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h new file mode 100644 index 000000000000..212999709f1b --- /dev/null +++ b/tools/kvm/include/kvm/apic.h @@ -0,0 +1,17 @@ +#ifndef KVM_APIC_H_ +#define KVM_APIC_H_ + +#include <asm/apicdef.h> + +/* + * APIC, IOAPIC stuff + */ +#define APIC_BASE_ADDR_STEP 0x00400000 +#define IOAPIC_BASE_ADDR_STEP 0x00100000 + +#define APIC_ADDR(apic) (APIC_DEFAULT_PHYS_BASE + apic * APIC_BASE_ADDR_STEP) +#define IOAPIC_ADDR(ioapic) (IO_APIC_DEFAULT_PHYS_BASE + ioapic * IOAPIC_BASE_ADDR_STEP) + +#define KVM_APIC_VERSION 0x14 /* xAPIC */ + +#endif /* KVM_APIC_H_ */ diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h new file mode 100644 index 000000000000..29f72e0e8e0d --- /dev/null +++ b/tools/kvm/include/kvm/brlock.h @@ -0,0 +1,41 @@ +#ifndef KVM__BRLOCK_H +#define KVM__BRLOCK_H + +#include "kvm/kvm.h" 
+#include "kvm/barrier.h" + +/* + * brlock is a lock which is very cheap for reads, but very expensive + * for writes. + * This lock will be used when updates are very rare and reads are common. + * This lock is currently implemented by stopping the guest while + * performing the updates. We assume that the only threads whichread from + * the locked data are VCPU threads, and the only writer isn't a VCPU thread. + */ + +#ifndef barrier +#define barrier() __asm__ __volatile__("": : :"memory") +#endif + +#ifdef KVM_BRLOCK_DEBUG + +#include "kvm/rwsem.h" + +DECLARE_RWSEM(brlock_sem); + +#define br_read_lock(kvm) down_read(&brlock_sem); +#define br_read_unlock(kvm) up_read(&brlock_sem); + +#define br_write_lock(kvm) down_write(&brlock_sem); +#define br_write_unlock(kvm) up_write(&brlock_sem); + +#else + +#define br_read_lock(kvm) barrier() +#define br_read_unlock(kvm) barrier() + +#define br_write_lock(kvm) kvm__pause(kvm) +#define br_write_unlock(kvm) kvm__continue(kvm) +#endif + +#endif diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h new file mode 100644 index 000000000000..77ee65605070 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-balloon.h @@ -0,0 +1,9 @@ +#ifndef KVM__BALLOON_H +#define KVM__BALLOON_H + +#include <kvm/util.h> + +int kvm_cmd_balloon(int argc, const char **argv, const char *prefix); +void kvm_balloon_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h new file mode 100644 index 000000000000..efa0268402b3 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-debug.h @@ -0,0 +1,20 @@ +#ifndef KVM__DEBUG_H +#define KVM__DEBUG_H + +#include <kvm/util.h> +#include <linux/types.h> + +#define KVM_DEBUG_CMD_TYPE_DUMP (1 << 0) +#define KVM_DEBUG_CMD_TYPE_NMI (1 << 1) +#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2) + +struct debug_cmd_params { + u32 dbg_type; + u32 cpu; + char sysrq; +}; + +int kvm_cmd_debug(int argc, const char **argv, const char 
*prefix); +void kvm_debug_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h new file mode 100644 index 000000000000..2946743b689b --- /dev/null +++ b/tools/kvm/include/kvm/builtin-help.h @@ -0,0 +1,6 @@ +#ifndef __KVM_HELP_H__ +#define __KVM_HELP_H__ + +int kvm_cmd_help(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h new file mode 100644 index 000000000000..47029caa25e6 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-list.h @@ -0,0 +1,10 @@ +#ifndef KVM__LIST_H +#define KVM__LIST_H + +#include <kvm/util.h> + +int kvm_cmd_list(int argc, const char **argv, const char *prefix); +void kvm_list_help(void) NORETURN; +int get_vmstate(int sock); + +#endif diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h new file mode 100644 index 000000000000..84aaee320fbc --- /dev/null +++ b/tools/kvm/include/kvm/builtin-pause.h @@ -0,0 +1,9 @@ +#ifndef KVM__PAUSE_H +#define KVM__PAUSE_H + +#include <kvm/util.h> + +int kvm_cmd_pause(int argc, const char **argv, const char *prefix); +void kvm_pause_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h new file mode 100644 index 000000000000..7de999b2c304 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-resume.h @@ -0,0 +1,9 @@ +#ifndef KVM__RESUME_H +#define KVM__RESUME_H + +#include <kvm/util.h> + +int kvm_cmd_resume(int argc, const char **argv, const char *prefix); +void kvm_resume_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h new file mode 100644 index 000000000000..91521a58a59c --- /dev/null +++ b/tools/kvm/include/kvm/builtin-run.h @@ -0,0 +1,11 @@ +#ifndef __KVM_RUN_H__ +#define __KVM_RUN_H__ + +#include <kvm/util.h> + +int kvm_cmd_run(int argc, const char **argv, const char 
*prefix); +void kvm_run_help(void) NORETURN; + +void kvm_run_set_wrapper_sandbox(void); + +#endif diff --git a/tools/kvm/include/kvm/builtin-sandbox.h b/tools/kvm/include/kvm/builtin-sandbox.h new file mode 100644 index 000000000000..98cd6bee3f32 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-sandbox.h @@ -0,0 +1,6 @@ +#ifndef KVM__SANDBOX_H +#define KVM__SANDBOX_H + +int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h new file mode 100644 index 000000000000..4a8d7ee39425 --- /dev/null +++ b/tools/kvm/include/kvm/builtin-setup.h @@ -0,0 +1,11 @@ +#ifndef KVM__SETUP_H +#define KVM__SETUP_H + +#include <kvm/util.h> + +int kvm_cmd_setup(int argc, const char **argv, const char *prefix); +void kvm_setup_help(void) NORETURN; +int kvm_setup_create_new(const char *guestfs_name); +void kvm_setup_resolv(const char *guestfs_name); + +#endif diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h new file mode 100644 index 000000000000..4fecb37901dd --- /dev/null +++ b/tools/kvm/include/kvm/builtin-stat.h @@ -0,0 +1,9 @@ +#ifndef KVM__STAT_H +#define KVM__STAT_H + +#include <kvm/util.h> + +int kvm_cmd_stat(int argc, const char **argv, const char *prefix); +void kvm_stat_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h new file mode 100644 index 000000000000..b26b2750a0ca --- /dev/null +++ b/tools/kvm/include/kvm/builtin-stop.h @@ -0,0 +1,9 @@ +#ifndef KVM__STOP_H +#define KVM__STOP_H + +#include <kvm/util.h> + +int kvm_cmd_stop(int argc, const char **argv, const char *prefix); +void kvm_stop_help(void) NORETURN; + +#endif diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h new file mode 100644 index 000000000000..83cac4d8c71e --- /dev/null +++ b/tools/kvm/include/kvm/builtin-version.h @@ -0,0 +1,6 @@ +#ifndef 
KVM__VERSION_H +#define KVM__VERSION_H + +int kvm_cmd_version(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h new file mode 100644 index 000000000000..2013a8351704 --- /dev/null +++ b/tools/kvm/include/kvm/compiler.h @@ -0,0 +1,10 @@ +#ifndef KVM_COMPILER_H_ +#define KVM_COMPILER_H_ + +#ifndef __compiletime_error +# define __compiletime_error(message) +#endif + +#define notrace __attribute__((no_instrument_function)) + +#endif /* KVM_COMPILER_H_ */ diff --git a/tools/kvm/include/kvm/devices.h b/tools/kvm/include/kvm/devices.h new file mode 100644 index 000000000000..c5de3de2737e --- /dev/null +++ b/tools/kvm/include/kvm/devices.h @@ -0,0 +1,27 @@ +#ifndef KVM__DEVICES_H +#define KVM__DEVICES_H + +#include <linux/rbtree.h> +#include <linux/types.h> + +enum device_bus_type { + DEVICE_BUS_PCI, + DEVICE_BUS_MMIO, + DEVICE_BUS_MAX, +}; + +struct device_header { + enum device_bus_type bus_type; + void *data; + int dev_num; + struct rb_node node; +}; + +int device__register(struct device_header *dev); +struct device_header *device__find_dev(enum device_bus_type bus_type, + u8 dev_num); + +struct device_header *device__first_dev(enum device_bus_type bus_type); +struct device_header *device__next_dev(struct device_header *dev); + +#endif /* KVM__DEVICES_H */ diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h new file mode 100644 index 000000000000..b72805242d4d --- /dev/null +++ b/tools/kvm/include/kvm/disk-image.h @@ -0,0 +1,96 @@ +#ifndef KVM__DISK_IMAGE_H +#define KVM__DISK_IMAGE_H + +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/parse-options.h" + +#include <linux/types.h> +#include <linux/fs.h> /* for BLKGETSIZE64 */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <sys/uio.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include 
<unistd.h> +#include <fcntl.h> + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1UL << SECTOR_SHIFT) + +enum { + DISK_IMAGE_REGULAR, + DISK_IMAGE_MMAP, +}; + +#define MAX_DISK_IMAGES 4 + +struct disk_image; + +struct disk_image_operations { + ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); + ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); + int (*flush)(struct disk_image *disk); + int (*close)(struct disk_image *disk); +}; + +struct disk_image_params { + const char *filename; + /* + * wwpn == World Wide Port Number + * tpgt == Target Portal Group Tag + */ + const char *wwpn; + const char *tpgt; + bool readonly; + bool direct; +}; + +struct disk_image { + int fd; + u64 size; + struct disk_image_operations *ops; + void *priv; + void *disk_req_cb_param; + void (*disk_req_cb)(void *param, long len); + bool async; + int evt; +#ifdef CONFIG_HAS_AIO + io_context_t ctx; +#endif + const char *wwpn; + const char *tpgt; + int debug_iodelay; +}; + +int disk_img_name_parser(const struct option *opt, const char *arg, int unset); +int disk_image__init(struct kvm *kvm); +int disk_image__exit(struct kvm *kvm); +struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap); +int disk_image__flush(struct disk_image *disk); +ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); +ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov, + int iovcount, void *param); +ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len); + +struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly); +struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st); + +ssize_t raw_image__read(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t 
raw_image__write(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, + const struct iovec *iov, int iovcount, void *param); +int raw_image__close(struct disk_image *disk); +void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len)); +#endif /* KVM__DISK_IMAGE_H */ diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h new file mode 100644 index 000000000000..15f62cc660ef --- /dev/null +++ b/tools/kvm/include/kvm/e820.h @@ -0,0 +1,13 @@ +#ifndef KVM_E820_H +#define KVM_E820_H + +#include <linux/types.h> +#include <kvm/bios.h> + +#define SMAP 0x534d4150 /* ASCII "SMAP" */ + +struct biosregs; + +extern bioscall void e820_query_map(struct biosregs *regs); + +#endif /* KVM_E820_H */ diff --git a/tools/kvm/include/kvm/fdt.h b/tools/kvm/include/kvm/fdt.h new file mode 100644 index 000000000000..19f95ac24f0f --- /dev/null +++ b/tools/kvm/include/kvm/fdt.h @@ -0,0 +1,26 @@ +#ifndef KVM__FDT_H +#define KVM__FDT_H + +#include "libfdt.h" + +#include <linux/types.h> + +#define FDT_MAX_SIZE 0x10000 + +/* Helper for the various bits of code that generate FDT nodes */ +#define _FDT(exp) \ + do { \ + int ret = (exp); \ + if (ret < 0) { \ + die("Error creating device tree: %s: %s\n", \ + #exp, fdt_strerror(ret)); \ + } \ + } while (0) + +static inline u32 fdt__alloc_phandle(void) +{ + static u32 phandle = 0; + return ++phandle; +} + +#endif /* KVM__FDT_H */ diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h new file mode 100644 index 000000000000..e3200e5b16de --- /dev/null +++ b/tools/kvm/include/kvm/framebuffer.h @@ -0,0 +1,36 @@ +#ifndef KVM__FRAMEBUFFER_H +#define KVM__FRAMEBUFFER_H + +#include <linux/types.h> +#include <linux/list.h> + +struct framebuffer; + 
+struct fb_target_operations { + int (*start)(struct framebuffer *fb); + int (*stop)(struct framebuffer *fb); +}; + +#define FB_MAX_TARGETS 2 + +struct framebuffer { + struct list_head node; + + u32 width; + u32 height; + u8 depth; + char *mem; + u64 mem_addr; + u64 mem_size; + struct kvm *kvm; + + unsigned long nr_targets; + struct fb_target_operations *targets[FB_MAX_TARGETS]; +}; + +struct framebuffer *fb__register(struct framebuffer *fb); +int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops); +int fb__init(struct kvm *kvm); +int fb__exit(struct kvm *kvm); + +#endif /* KVM__FRAMEBUFFER_H */ diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h new file mode 100644 index 000000000000..ae7abbdb8be5 --- /dev/null +++ b/tools/kvm/include/kvm/guest_compat.h @@ -0,0 +1,9 @@ +#ifndef KVM__GUEST_COMPAT_H +#define KVM__GUEST_COMPAT_H + +int compat__print_all_messages(void); +int compat__remove_message(int id); +int compat__add_message(const char *title, const char *description); + + +#endif
\ No newline at end of file diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h new file mode 100644 index 000000000000..3b4ab688b840 --- /dev/null +++ b/tools/kvm/include/kvm/i8042.h @@ -0,0 +1,12 @@ +#ifndef KVM__PCKBD_H +#define KVM__PCKBD_H + +#include <linux/types.h> + +struct kvm; + +void mouse_queue(u8 c); +void kbd_queue(u8 c); +int kbd__init(struct kvm *kvm); + +#endif diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h new file mode 100644 index 000000000000..d71fa4066eb9 --- /dev/null +++ b/tools/kvm/include/kvm/ioeventfd.h @@ -0,0 +1,28 @@ +#ifndef KVM__IOEVENTFD_H +#define KVM__IOEVENTFD_H + +#include <linux/types.h> +#include <linux/list.h> +#include <sys/eventfd.h> +#include "kvm/util.h" + +struct kvm; + +struct ioevent { + u64 io_addr; + u8 io_len; + void (*fn)(struct kvm *kvm, void *ptr); + struct kvm *fn_kvm; + void *fn_ptr; + int fd; + u64 datamatch; + + struct list_head list; +}; + +int ioeventfd__init(struct kvm *kvm); +int ioeventfd__exit(struct kvm *kvm); +int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace); +int ioeventfd__del_event(u64 addr, u64 datamatch); + +#endif diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h new file mode 100644 index 000000000000..6660acb66b96 --- /dev/null +++ b/tools/kvm/include/kvm/ioport.h @@ -0,0 +1,70 @@ +#ifndef KVM__IOPORT_H +#define KVM__IOPORT_H + +#include "kvm/rbtree-interval.h" + +#include <stdbool.h> +#include <limits.h> +#include <asm/types.h> +#include <linux/types.h> +#include <linux/byteorder.h> + +/* some ports we reserve for own use */ +#define IOPORT_DBG 0xe0 +#define IOPORT_START 0x6200 +#define IOPORT_SIZE 0x400 + +#define IOPORT_EMPTY USHRT_MAX + +struct kvm; + +struct ioport { + struct rb_int_node node; + struct ioport_operations *ops; + void *priv; +}; + +struct ioport_operations { + bool (*io_in)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size); + bool 
(*io_out)(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size); +}; + +void ioport__setup_arch(struct kvm *kvm); + +int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, + int count, void *param); +int ioport__unregister(struct kvm *kvm, u16 port); +int ioport__init(struct kvm *kvm); +int ioport__exit(struct kvm *kvm); + +static inline u8 ioport__read8(u8 *data) +{ + return *data; +} +/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */ +static inline u16 ioport__read16(u16 *data) +{ + return le16_to_cpu(*data); +} + +static inline u32 ioport__read32(u32 *data) +{ + return le32_to_cpu(*data); +} + +static inline void ioport__write8(u8 *data, u8 value) +{ + *data = value; +} + +static inline void ioport__write16(u16 *data, u16 value) +{ + *data = cpu_to_le16(value); +} + +static inline void ioport__write32(u32 *data, u32 value) +{ + *data = cpu_to_le32(value); +} + +#endif /* KVM__IOPORT_H */ diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h new file mode 100644 index 000000000000..5c1274b98610 --- /dev/null +++ b/tools/kvm/include/kvm/irq.h @@ -0,0 +1,33 @@ +#ifndef KVM__IRQ_H +#define KVM__IRQ_H + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> + +#include "kvm/msi.h" + +struct kvm; + +struct irq_line { + u8 line; + struct list_head node; +}; + +struct pci_dev { + struct rb_node node; + u32 id; + u8 pin; + struct list_head lines; +}; + +int irq__register_device(u32 dev, u8 *pin, u8 *line); + +struct rb_node *irq__get_pci_tree(void); + +int irq__init(struct kvm *kvm); +int irq__exit(struct kvm *kvm); +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg); + +#endif diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h new file mode 100644 index 000000000000..0a73bce077b9 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-cmd.h @@ -0,0 +1,17 @@ +#ifndef __KVM_CMD_H__ +#define __KVM_CMD_H__ + +struct 
cmd_struct { + const char *cmd; + int (*fn)(int, const char **, const char *); + void (*help)(void); + int option; +}; + +extern struct cmd_struct kvm_commands[]; +struct cmd_struct *kvm_get_command(struct cmd_struct *command, + const char *cmd); + +int handle_command(struct cmd_struct *command, int argc, const char **argv); + +#endif diff --git a/tools/kvm/include/kvm/kvm-config.h b/tools/kvm/include/kvm/kvm-config.h new file mode 100644 index 000000000000..c66f48144bd2 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-config.h @@ -0,0 +1,61 @@ +#ifndef KVM_CONFIG_H_ +#define KVM_CONFIG_H_ + +#include "kvm/disk-image.h" +#include "kvm/kvm-config-arch.h" + +#define DEFAULT_KVM_DEV "/dev/kvm" +#define DEFAULT_CONSOLE "serial" +#define DEFAULT_NETWORK "user" +#define DEFAULT_HOST_ADDR "192.168.33.1" +#define DEFAULT_GUEST_ADDR "192.168.33.15" +#define DEFAULT_GUEST_MAC "02:15:15:15:15:15" +#define DEFAULT_HOST_MAC "02:01:01:01:01:01" +#define DEFAULT_SCRIPT "none" +#define DEFAULT_SANDBOX_FILENAME "guest/sandbox.sh" + +#define MIN_RAM_SIZE_MB (64ULL) +#define MIN_RAM_SIZE_BYTE (MIN_RAM_SIZE_MB << MB_SHIFT) + +struct kvm_config { + struct kvm_config_arch arch; + struct disk_image_params disk_image[MAX_DISK_IMAGES]; + u64 ram_size; + u8 image_count; + u8 num_net_devices; + bool virtio_rng; + int active_console; + int debug_iodelay; + int nrcpus; + const char *kernel_cmdline; + const char *kernel_filename; + const char *vmlinux_filename; + const char *initrd_filename; + const char *firmware_filename; + const char *console; + const char *dev; + const char *network; + const char *host_ip; + const char *guest_ip; + const char *guest_mac; + const char *host_mac; + const char *script; + const char *guest_name; + const char *sandbox; + const char *hugetlbfs_path; + const char *custom_rootfs_name; + const char *real_cmdline; + struct virtio_net_params *net_params; + bool single_step; + bool vnc; + bool sdl; + bool balloon; + bool using_rootfs; + bool custom_rootfs; + bool no_net; 
+ bool no_dhcp; + bool ioport_debug; + bool mmio_debug; +}; + +#endif diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h new file mode 100644 index 000000000000..0ece28c32d4b --- /dev/null +++ b/tools/kvm/include/kvm/kvm-cpu.h @@ -0,0 +1,26 @@ +#ifndef KVM__KVM_CPU_H +#define KVM__KVM_CPU_H + +#include "kvm/kvm-cpu-arch.h" +#include <stdbool.h> + +int kvm_cpu__init(struct kvm *kvm); +int kvm_cpu__exit(struct kvm *kvm); +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id); +void kvm_cpu__delete(struct kvm_cpu *vcpu); +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu); +void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu); +void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu); +void kvm_cpu__run(struct kvm_cpu *vcpu); +void kvm_cpu__reboot(struct kvm *kvm); +int kvm_cpu__start(struct kvm_cpu *cpu); +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu); + +int kvm_cpu__get_debug_fd(void); +void kvm_cpu__set_debug_fd(int fd); +void kvm_cpu__show_code(struct kvm_cpu *vcpu); +void kvm_cpu__show_registers(struct kvm_cpu *vcpu); +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu); +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu); + +#endif /* KVM__KVM_CPU_H */ diff --git a/tools/kvm/include/kvm/kvm-ipc.h b/tools/kvm/include/kvm/kvm-ipc.h new file mode 100644 index 000000000000..5494da4c52a7 --- /dev/null +++ b/tools/kvm/include/kvm/kvm-ipc.h @@ -0,0 +1,26 @@ +#ifndef KVM__IPC_H_ +#define KVM__IPC_H_ + +#include <linux/types.h> +#include "kvm/kvm.h" + +enum { + KVM_IPC_BALLOON = 1, + KVM_IPC_DEBUG = 2, + KVM_IPC_STAT = 3, + KVM_IPC_PAUSE = 4, + KVM_IPC_RESUME = 5, + KVM_IPC_STOP = 6, + KVM_IPC_PID = 7, + KVM_IPC_VMSTATE = 8, +}; + +int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, + int fd, u32 type, u32 len, u8 *msg)); +int kvm_ipc__init(struct kvm *kvm); +int kvm_ipc__exit(struct kvm *kvm); + +int kvm_ipc__send(int fd, u32 type); +int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg); + +#endif diff --git 
a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h new file mode 100644 index 000000000000..acb08182c07b --- /dev/null +++ b/tools/kvm/include/kvm/kvm.h @@ -0,0 +1,133 @@ +#ifndef KVM__KVM_H +#define KVM__KVM_H + +#include "kvm/kvm-arch.h" +#include "kvm/kvm-config.h" +#include "kvm/util-init.h" +#include "kvm/kvm.h" + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> +#include <signal.h> +#include <sys/prctl.h> + +#define SIGKVMEXIT (SIGRTMIN + 0) +#define SIGKVMPAUSE (SIGRTMIN + 1) + +#define KVM_PID_FILE_PATH "/.lkvm/" +#define HOME_DIR getenv("HOME") +#define KVM_BINARY_NAME "lkvm" + +#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE)) + +#define DEFINE_KVM_EXT(ext) \ + .name = #ext, \ + .code = ext + +enum { + KVM_VMSTATE_RUNNING, + KVM_VMSTATE_PAUSED, +}; + +struct kvm_ext { + const char *name; + int code; +}; + +struct kvm_mem_bank { + struct list_head list; + u64 guest_phys_addr; + void *host_addr; + u64 size; +}; + +struct kvm { + struct kvm_arch arch; + struct kvm_config cfg; + int sys_fd; /* For system ioctls(), i.e. 
/dev/kvm */ + int vm_fd; /* For VM ioctls() */ + timer_t timerid; /* Posix timer for interrupts */ + + int nrcpus; /* Number of cpus to run */ + struct kvm_cpu **cpus; + + u32 mem_slots; /* for KVM_SET_USER_MEMORY_REGION */ + u64 ram_size; + void *ram_start; + u64 ram_pagesize; + struct list_head mem_banks; + + bool nmi_disabled; + + const char *vmlinux; + struct disk_image **disks; + int nr_disks; + + int vm_state; +}; + +void kvm__set_dir(const char *fmt, ...); +const char *kvm__get_dir(void); + +int kvm__init(struct kvm *kvm); +struct kvm *kvm__new(void); +int kvm__recommended_cpus(struct kvm *kvm); +int kvm__max_cpus(struct kvm *kvm); +void kvm__init_ram(struct kvm *kvm); +int kvm__exit(struct kvm *kvm); +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename); +bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename, + const char *initrd_filename, const char *kernel_cmdline); +int kvm_timer__init(struct kvm *kvm); +int kvm_timer__exit(struct kvm *kvm); +void kvm__irq_line(struct kvm *kvm, int irq, int level); +void kvm__irq_trigger(struct kvm *kvm, int irq); +bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count); +bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write); +int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr); +int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce, + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), + void *ptr); +bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr); +void kvm__pause(struct kvm *kvm); +void kvm__continue(struct kvm *kvm); +void kvm__notify_paused(void); +int kvm__get_sock_by_instance(const char *name); +int kvm__enumerate_instances(int (*callback)(const char *name, int pid)); +void kvm__remove_socket(const char *name); + +void kvm__arch_set_cmdline(char *cmdline, bool video); +void kvm__arch_init(struct kvm *kvm, const char 
*hugetlbfs_path, u64 ram_size); +void kvm__arch_delete_ram(struct kvm *kvm); +int kvm__arch_setup_firmware(struct kvm *kvm); +int kvm__arch_free_firmware(struct kvm *kvm); +bool kvm__arch_cpu_supports_vm(void); +void kvm__arch_periodic_poll(struct kvm *kvm); + +void *guest_flat_to_host(struct kvm *kvm, u64 offset); +u64 host_to_guest_flat(struct kvm *kvm, void *ptr); + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline); +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline); + +/* + * Debugging + */ +void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size); + +extern const char *kvm_exit_reasons[]; + +static inline bool host_ptr_in_ram(struct kvm *kvm, void *p) +{ + return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size); +} + +bool kvm__supports_extension(struct kvm *kvm, unsigned int extension); + +static inline void kvm__set_thread_name(const char *name) +{ + prctl(PR_SET_NAME, name); +} + +#endif /* KVM__KVM_H */ diff --git a/tools/kvm/include/kvm/msi.h b/tools/kvm/include/kvm/msi.h new file mode 100644 index 000000000000..885eb5b95ed7 --- /dev/null +++ b/tools/kvm/include/kvm/msi.h @@ -0,0 +1,10 @@ +#ifndef LKVM_MSI_H +#define LKVM_MSI_H + +struct msi_msg { + u32 address_lo; /* low 32 bits of msi message address */ + u32 address_hi; /* high 32 bits of msi message address */ + u32 data; /* 16 bits of msi message data */ +}; + +#endif /* LKVM_MSI_H */ diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h new file mode 100644 index 000000000000..a90584b9db87 --- /dev/null +++ b/tools/kvm/include/kvm/mutex.h @@ -0,0 +1,39 @@ +#ifndef KVM__MUTEX_H +#define KVM__MUTEX_H + +#include <pthread.h> + +#include "kvm/util.h" + +/* + * Kernel-alike mutex API - to make it easier for kernel developers + * to write user-space code! 
:-) + */ + +struct mutex { + pthread_mutex_t mutex; +}; +#define MUTEX_INITIALIZER (struct mutex) { .mutex = PTHREAD_MUTEX_INITIALIZER } + +#define DEFINE_MUTEX(mtx) struct mutex mtx = MUTEX_INITIALIZER + +static inline void mutex_init(struct mutex *lock) +{ + if (pthread_mutex_init(&lock->mutex, NULL) != 0) + die("unexpected pthread_mutex_init() failure!"); +} + +static inline void mutex_lock(struct mutex *lock) +{ + if (pthread_mutex_lock(&lock->mutex) != 0) + die("unexpected pthread_mutex_lock() failure!"); + +} + +static inline void mutex_unlock(struct mutex *lock) +{ + if (pthread_mutex_unlock(&lock->mutex) != 0) + die("unexpected pthread_mutex_unlock() failure!"); +} + +#endif /* KVM__MUTEX_H */ diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h new file mode 100644 index 000000000000..09a5fca71117 --- /dev/null +++ b/tools/kvm/include/kvm/parse-options.h @@ -0,0 +1,221 @@ +#ifndef __PARSE_OPTIONS_H__ +#define __PARSE_OPTIONS_H__ + +#include <inttypes.h> +#include <kvm/util.h> + +enum parse_opt_type { + /* special types */ + OPTION_END, + OPTION_ARGUMENT, + OPTION_GROUP, + /* options with no arguments */ + OPTION_BIT, + OPTION_BOOLEAN, + OPTION_INCR, + OPTION_SET_UINT, + OPTION_SET_PTR, + /* options with arguments (usually) */ + OPTION_STRING, + OPTION_INTEGER, + OPTION_LONG, + OPTION_CALLBACK, + OPTION_U64, + OPTION_UINTEGER, +}; + +enum parse_opt_flags { + PARSE_OPT_KEEP_DASHDASH = 1, + PARSE_OPT_STOP_AT_NON_OPTION = 2, + PARSE_OPT_KEEP_ARGV0 = 4, + PARSE_OPT_KEEP_UNKNOWN = 8, + PARSE_OPT_NO_INTERNAL_HELP = 16, +}; + +enum parse_opt_option_flags { + PARSE_OPT_OPTARG = 1, + PARSE_OPT_NOARG = 2, + PARSE_OPT_NONEG = 4, + PARSE_OPT_HIDDEN = 8, + PARSE_OPT_LASTARG_DEFAULT = 16, +}; + +struct option; +typedef int parse_opt_cb(const struct option *, const char *arg, int unset); +/* + * `type`:: + * holds the type of the option, you must have an OPTION_END last in your + * array. 
+ * + * `short_name`:: + * the character to use as a short option name, '\0' if none. + * + * `long_name`:: + * the long option name, without the leading dashes, NULL if none. + * + * `value`:: + * stores pointers to the values to be filled. + * + * `argh`:: + * token to explain the kind of argument this option wants. Keep it + * homogenous across the repository. + * + * `help`:: + * the short help associated to what the option does. + * Must never be NULL (except for OPTION_END). + * OPTION_GROUP uses this pointer to store the group header. + * + * `flags`:: + * mask of parse_opt_option_flags. + * PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs) + * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs + * PARSE_OPT_NONEG: says that this option cannot be negated + * PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in + * the long one. + * + * `callback`:: + * pointer to the callback to use for OPTION_CALLBACK. + * + * `defval`:: + * default value to fill (*->value) with for PARSE_OPT_OPTARG. + * OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in + * the value when met. + * CALLBACKS can use it like they want. 
+ */ +struct option { + enum parse_opt_type type; + int short_name; + const char *long_name; + void *value; + const char *argh; + const char *help; + void *ptr; + + int flags; + parse_opt_cb *callback; + intptr_t defval; +}; + +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +#define check_vtype(v, type) \ + (BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v) + +#define OPT_INTEGER(s, l, v, h) \ +{ \ + .type = OPTION_INTEGER, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, int *), \ + .help = (h) \ +} + +#define OPT_U64(s, l, v, h) \ +{ \ + .type = OPTION_U64, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, u64 *), \ + .help = (h) \ +} + +#define OPT_STRING(s, l, v, a, h) \ +{ \ + .type = OPTION_STRING, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, const char **), (a), \ + .help = (h) \ +} + +#define OPT_BOOLEAN(s, l, v, h) \ +{ \ + .type = OPTION_BOOLEAN, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, bool *), \ + .help = (h) \ +} + +#define OPT_INCR(s, l, v, h) \ +{ \ + .type = OPTION_INCR, \ + .short_name = (s), \ + .long_name = (l), \ + .value = check_vtype(v, int *), \ + .help = (h) \ +} + +#define OPT_GROUP(h) \ +{ \ + .type = OPTION_GROUP, \ + .help = (h) \ +} + +#define OPT_CALLBACK(s, l, v, a, h, f, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), \ + (a), \ + .help = (h), \ + .callback = (f), \ + .ptr = (p), \ +} + +#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), \ + (a), \ + .help = (h), \ + .callback = (f), \ + .flags = PARSE_OPT_NOARG, \ + .ptr = (p), \ +} + +#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d, p) \ +{ \ + .type = OPTION_CALLBACK, \ + .short_name = (s), \ + .long_name = (l), \ + .value = (v), (a), \ + .help = (h), \ + .callback = (f), \ + .defval = 
(intptr_t)d, \ + .flags = PARSE_OPT_LASTARG_DEFAULT, \ + .ptr = (p) \ +} + +#define OPT_END() { .type = OPTION_END } + +#define OPT_ARCH(cmd, cfg) \ + OPT_ARCH_##cmd(OPT_GROUP("Arch-specific options:"), &(cfg)->arch) + +enum { + PARSE_OPT_HELP = -1, + PARSE_OPT_DONE, + PARSE_OPT_UNKNOWN, +}; + +/* + * It's okay for the caller to consume argv/argc in the usual way. + * Other fields of that structure are private to parse-options and should not + * be modified in any way. + **/ +struct parse_opt_ctx_t { + const char **argv; + const char **out; + int argc, cpidx; + const char *opt; + int flags; +}; + +/* global functions */ +void usage_with_options(const char * const *usagestr, + const struct option *opts) NORETURN; +int parse_options(int argc, const char **argv, const struct option *options, + const char * const usagestr[], int flags); +#endif diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h new file mode 100644 index 000000000000..6cff2b85bfd3 --- /dev/null +++ b/tools/kvm/include/kvm/pci-shmem.h @@ -0,0 +1,32 @@ +#ifndef KVM__PCI_SHMEM_H +#define KVM__PCI_SHMEM_H + +#include <linux/types.h> +#include <linux/list.h> + +#include "kvm/parse-options.h" + +#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT) +#define SHMEM_DEFAULT_ADDR (0xc8000000) +#define SHMEM_DEFAULT_HANDLE "/kvm_shmem" + +struct kvm; +struct shmem_info; + +struct shmem_info { + u64 phys_addr; + u64 size; + char *handle; + int create; +}; + +int pci_shmem__init(struct kvm *kvm); +int pci_shmem__exit(struct kvm *kvm); +int pci_shmem__register_mem(struct shmem_info *si); +int shmem_parser(const struct option *opt, const char *arg, int unset); + +int pci_shmem__get_local_irqfd(struct kvm *kvm); +int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd); +int pci_shmem__remove_client(struct kvm *kvm, u32 id); + +#endif diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h new file mode 100644 index 000000000000..3da381175c8d --- /dev/null +++ 
b/tools/kvm/include/kvm/pci.h @@ -0,0 +1,93 @@ +#ifndef KVM__PCI_H +#define KVM__PCI_H + +#include <linux/types.h> +#include <linux/kvm.h> +#include <linux/pci_regs.h> +#include <endian.h> + +#include "kvm/kvm.h" +#include "kvm/msi.h" + +/* + * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1. + * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for + * details. + */ +#define PCI_CONFIG_ADDRESS 0xcf8 +#define PCI_CONFIG_DATA 0xcfc +#define PCI_CONFIG_BUS_FORWARD 0xcfa +#define PCI_IO_SIZE 0x100 + +union pci_config_address { + struct { +#if __BYTE_ORDER == __LITTLE_ENDIAN + unsigned reg_offset : 2; /* 1 .. 0 */ + unsigned register_number : 6; /* 7 .. 2 */ + unsigned function_number : 3; /* 10 .. 8 */ + unsigned device_number : 5; /* 15 .. 11 */ + unsigned bus_number : 8; /* 23 .. 16 */ + unsigned reserved : 7; /* 30 .. 24 */ + unsigned enable_bit : 1; /* 31 */ +#else + unsigned enable_bit : 1; /* 31 */ + unsigned reserved : 7; /* 30 .. 24 */ + unsigned bus_number : 8; /* 23 .. 16 */ + unsigned device_number : 5; /* 15 .. 11 */ + unsigned function_number : 3; /* 10 .. 8 */ + unsigned register_number : 6; /* 7 .. 2 */ + unsigned reg_offset : 2; /* 1 .. 
0 */ +#endif + }; + u32 w; +}; + +struct msix_table { + struct msi_msg msg; + u32 ctrl; +}; + +struct msix_cap { + u8 cap; + u8 next; + u16 ctrl; + u32 table_offset; + u32 pba_offset; +}; + +struct pci_device_header { + u16 vendor_id; + u16 device_id; + u16 command; + u16 status; + u8 revision_id; + u8 class[3]; + u8 cacheline_size; + u8 latency_timer; + u8 header_type; + u8 bist; + u32 bar[6]; + u32 card_bus; + u16 subsys_vendor_id; + u16 subsys_id; + u32 exp_rom_bar; + u8 capabilities; + u8 reserved1[3]; + u32 reserved2; + u8 irq_line; + u8 irq_pin; + u8 min_gnt; + u8 max_lat; + struct msix_cap msix; + u8 empty[136]; /* Rest of PCI config space */ + u32 bar_size[6]; +} __attribute__((packed)); + +int pci__init(struct kvm *kvm); +int pci__exit(struct kvm *kvm); +struct pci_device_header *pci__find_dev(u8 dev_num); +u32 pci_get_io_space_block(u32 size); +void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size); +void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size); + +#endif /* KVM__PCI_H */ diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h new file mode 100644 index 000000000000..f8492462ddaa --- /dev/null +++ b/tools/kvm/include/kvm/qcow.h @@ -0,0 +1,133 @@ +#ifndef KVM__QCOW_H +#define KVM__QCOW_H + +#include "kvm/mutex.h" + +#include <linux/types.h> +#include <stdbool.h> +#include <linux/rbtree.h> +#include <linux/list.h> + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW1_VERSION 1 +#define QCOW2_VERSION 2 + +#define QCOW1_OFLAG_COMPRESSED (1ULL << 63) + +#define QCOW2_OFLAG_COPIED (1ULL << 63) +#define QCOW2_OFLAG_COMPRESSED (1ULL << 62) + +#define QCOW2_OFLAGS_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED) + +#define QCOW2_OFFSET_MASK (~QCOW2_OFLAGS_MASK) + +#define MAX_CACHE_NODES 32 + +struct qcow_l2_table { + u64 offset; + struct rb_node node; + struct list_head list; + u8 dirty; + u64 table[]; +}; + +struct qcow_l1_table { + 
u32 table_size; + u64 *l1_table; + + /* Level2 caching data structures */ + struct rb_root root; + struct list_head lru_list; + int nr_cached; +}; + +#define QCOW_REFCOUNT_BLOCK_SHIFT 1 + +struct qcow_refcount_block { + u64 offset; + struct rb_node node; + struct list_head list; + u64 size; + u8 dirty; + u16 entries[]; +}; + +struct qcow_refcount_table { + u32 rf_size; + u64 *rf_table; + + /* Refcount block caching data structures */ + struct rb_root root; + struct list_head lru_list; + int nr_cached; +}; + +struct qcow_header { + u64 size; /* in bytes */ + u64 l1_table_offset; + u32 l1_size; + u8 cluster_bits; + u8 l2_bits; + u64 refcount_table_offset; + u32 refcount_table_size; +}; + +struct qcow { + struct mutex mutex; + struct qcow_header *header; + struct qcow_l1_table table; + struct qcow_refcount_table refcount_table; + int fd; + int csize_shift; + int csize_mask; + u32 version; + u64 cluster_size; + u64 cluster_offset_mask; + u64 free_clust_idx; + void *cluster_cache; + void *cluster_data; + void *copy_buff; +}; + +struct qcow1_header_disk { + u32 magic; + u32 version; + + u64 backing_file_offset; + u32 backing_file_size; + u32 mtime; + + u64 size; /* in bytes */ + + u8 cluster_bits; + u8 l2_bits; + u32 crypt_method; + + u64 l1_table_offset; +}; + +struct qcow2_header_disk { + u32 magic; + u32 version; + + u64 backing_file_offset; + u32 backing_file_size; + + u32 cluster_bits; + u64 size; /* in bytes */ + u32 crypt_method; + + u32 l1_size; + u64 l1_table_offset; + + u64 refcount_table_offset; + u32 refcount_table_clusters; + + u32 nb_snapshots; + u64 snapshots_offset; +}; + +struct disk_image *qcow_probe(int fd, bool readonly); + +#endif /* KVM__QCOW_H */ diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h new file mode 100644 index 000000000000..730eb5e8551d --- /dev/null +++ b/tools/kvm/include/kvm/rbtree-interval.h @@ -0,0 +1,30 @@ +#ifndef KVM__INTERVAL_RBTREE_H +#define KVM__INTERVAL_RBTREE_H + +#include 
<linux/rbtree.h> +#include <linux/types.h> + +#define RB_INT_INIT(l, h) \ + (struct rb_int_node){.low = l, .high = h} +#define rb_int(n) rb_entry(n, struct rb_int_node, node) + +struct rb_int_node { + struct rb_node node; + u64 low; + u64 high; +}; + +/* Return the rb_int_node interval in which 'point' is located. */ +struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point); + +/* Return the rb_int_node in which start:len is located. */ +struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high); + +int rb_int_insert(struct rb_root *root, struct rb_int_node *data); + +static inline void rb_int_erase(struct rb_root *root, struct rb_int_node *node) +{ + rb_erase(&node->node, root); +} + +#endif diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h new file mode 100644 index 000000000000..67571f9671c7 --- /dev/null +++ b/tools/kvm/include/kvm/read-write.h @@ -0,0 +1,43 @@ +#ifndef KVM_READ_WRITE_H +#define KVM_READ_WRITE_H + +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> + +#ifdef CONFIG_HAS_AIO +#include <libaio.h> +#endif + +ssize_t xread(int fd, void *buf, size_t count); +ssize_t xwrite(int fd, const void *buf, size_t count); + +ssize_t read_in_full(int fd, void *buf, size_t count); +ssize_t write_in_full(int fd, const void *buf, size_t count); + +ssize_t xpread(int fd, void *buf, size_t count, off_t offset); +ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset); + +ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset); + +ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt); +ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt); + +ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt); +ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt); + +ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset); +ssize_t 
xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset); + +ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset); +ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset); + +#ifdef CONFIG_HAS_AIO +int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param); +int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param); +#endif + +#endif /* KVM_READ_WRITE_H */ diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h new file mode 100644 index 000000000000..6aa929913c6a --- /dev/null +++ b/tools/kvm/include/kvm/rtc.h @@ -0,0 +1,9 @@ +#ifndef KVM__RTC_H +#define KVM__RTC_H + +struct kvm; + +int rtc__init(struct kvm *kvm); +int rtc__exit(struct kvm *kvm); + +#endif /* KVM__RTC_H */ diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h new file mode 100644 index 000000000000..75a22f835d20 --- /dev/null +++ b/tools/kvm/include/kvm/rwsem.h @@ -0,0 +1,39 @@ +#ifndef KVM__RWSEM_H +#define KVM__RWSEM_H + +#include <pthread.h> + +#include "kvm/util.h" + +/* + * Kernel-alike rwsem API - to make it easier for kernel developers + * to write user-space code! 
:-) + */ + +#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER + +static inline void down_read(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_rdlock(rwsem) != 0) + die("unexpected pthread_rwlock_rdlock() failure!"); +} + +static inline void down_write(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_wrlock(rwsem) != 0) + die("unexpected pthread_rwlock_wrlock() failure!"); +} + +static inline void up_read(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_unlock(rwsem) != 0) + die("unexpected pthread_rwlock_unlock() failure!"); +} + +static inline void up_write(pthread_rwlock_t *rwsem) +{ + if (pthread_rwlock_unlock(rwsem) != 0) + die("unexpected pthread_rwlock_unlock() failure!"); +} + +#endif /* KVM__RWSEM_H */ diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h new file mode 100644 index 000000000000..2f0c213e3dba --- /dev/null +++ b/tools/kvm/include/kvm/sdl.h @@ -0,0 +1,28 @@ +#ifndef KVM__SDL_H +#define KVM__SDL_H + +#include "kvm/util.h" + +struct framebuffer; + +#ifdef CONFIG_HAS_SDL +int sdl__init(struct kvm *kvm); +int sdl__exit(struct kvm *kvm); +#else +static inline int sdl__init(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + die("SDL support not compiled in. (install the SDL-dev[el] package)"); + + return 0; +} +static inline int sdl__exit(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + die("SDL support not compiled in. 
(install the SDL-dev[el] package)"); + + return 0; +} +#endif + +#endif /* KVM__SDL_H */ diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h new file mode 100644 index 000000000000..9387a820f137 --- /dev/null +++ b/tools/kvm/include/kvm/segment.h @@ -0,0 +1,21 @@ +#ifndef KVM_SEGMENT_H +#define KVM_SEGMENT_H + +#include <linux/types.h> + +static inline u32 segment_to_flat(u16 selector, u16 offset) +{ + return ((u32)selector << 4) + (u32) offset; +} + +static inline u16 flat_to_seg16(u32 address) +{ + return address >> 4; +} + +static inline u16 flat_to_off16(u32 address, u32 segment) +{ + return address - (segment << 4); +} + +#endif /* KVM_SEGMENT_H */ diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h new file mode 100644 index 000000000000..2beefbc3f3fd --- /dev/null +++ b/tools/kvm/include/kvm/strbuf.h @@ -0,0 +1,20 @@ +#ifndef __STRBUF_H__ +#define __STRBUF_H__ + +#include <sys/types.h> +#include <string.h> + +int prefixcmp(const char *str, const char *prefix); + +extern size_t strlcat(char *dest, const char *src, size_t count); +extern size_t strlcpy(char *dest, const char *src, size_t size); + +/* some inline functions */ + +static inline const char *skip_prefix(const char *str, const char *prefix) +{ + size_t len = strlen(prefix); + return strncmp(str, prefix, len) ? 
NULL : str + len; +} + +#endif diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h new file mode 100644 index 000000000000..725bbaf8fa23 --- /dev/null +++ b/tools/kvm/include/kvm/symbol.h @@ -0,0 +1,30 @@ +#ifndef KVM__SYMBOL_H +#define KVM__SYMBOL_H + +#include <stddef.h> +#include <string.h> + +struct kvm; + +#define SYMBOL_DEFAULT_UNKNOWN "<unknown>" + +#ifdef CONFIG_HAS_BFD + +int symbol_init(struct kvm *kvm); +int symbol_exit(struct kvm *kvm); +char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size); + +#else + +static inline int symbol_init(struct kvm *kvm) { return 0; } +static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size) +{ + char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size); + sym[size - 1] = '\0'; + return s; +} +static inline int symbol_exit(struct kvm *kvm) { return 0; } + +#endif + +#endif /* KVM__SYMBOL_H */ diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h new file mode 100644 index 000000000000..5f6345719656 --- /dev/null +++ b/tools/kvm/include/kvm/term.h @@ -0,0 +1,24 @@ +#ifndef KVM__TERM_H +#define KVM__TERM_H + +#include "kvm/kvm.h" + +#include <sys/uio.h> +#include <stdbool.h> + +#define CONSOLE_8250 1 +#define CONSOLE_VIRTIO 2 +#define CONSOLE_HV 3 + +int term_putc_iov(struct iovec *iov, int iovcnt, int term); +int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term); +int term_putc(char *addr, int cnt, int term); +int term_getc(struct kvm *kvm, int term); + +bool term_readable(int term); +void term_set_tty(int term); +int term_init(struct kvm *kvm); +int term_exit(struct kvm *kvm); +int tty_parser(const struct option *opt, const char *arg, int unset); + +#endif /* KVM__TERM_H */ diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h new file mode 100644 index 000000000000..bacb2434e6f1 --- /dev/null +++ b/tools/kvm/include/kvm/threadpool.h @@ -0,0 +1,38 @@ +#ifndef 
KVM__THREADPOOL_H +#define KVM__THREADPOOL_H + +#include "kvm/mutex.h" + +#include <linux/list.h> + +struct kvm; + +typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data); + +struct thread_pool__job { + kvm_thread_callback_fn_t callback; + struct kvm *kvm; + void *data; + + int signalcount; + struct mutex mutex; + + struct list_head queue; +}; + +static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data) +{ + *job = (struct thread_pool__job) { + .kvm = kvm, + .callback = callback, + .data = data, + .mutex = MUTEX_INITIALIZER, + }; +} + +int thread_pool__init(struct kvm *kvm); +int thread_pool__exit(struct kvm *kvm); + +void thread_pool__do_job(struct thread_pool__job *job); + +#endif diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h new file mode 100644 index 000000000000..0cbc5fbc8549 --- /dev/null +++ b/tools/kvm/include/kvm/types.h @@ -0,0 +1,7 @@ +#ifndef KVM_TYPES_H +#define KVM_TYPES_H + +/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complains */ +#define __be16 u16 + +#endif /* KVM_TYPES_H */ diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h new file mode 100644 index 000000000000..ac248d2b7757 --- /dev/null +++ b/tools/kvm/include/kvm/uip.h @@ -0,0 +1,360 @@ +#ifndef KVM__UIP_H +#define KVM__UIP_H + +#include "linux/types.h" +#include "kvm/mutex.h" + +#include <netinet/in.h> +#include <sys/uio.h> + +#define UIP_BUF_STATUS_FREE 0 +#define UIP_BUF_STATUS_INUSE 1 +#define UIP_BUF_STATUS_USED 2 + +#define UIP_ETH_P_IP 0X0800 +#define UIP_ETH_P_ARP 0X0806 + +#define UIP_IP_VER_4 0X40 +#define UIP_IP_HDR_LEN 0X05 +#define UIP_IP_TTL 0X40 +#define UIP_IP_P_UDP 0X11 +#define UIP_IP_P_TCP 0X06 +#define UIP_IP_P_ICMP 0X01 + +#define UIP_TCP_HDR_LEN 0x50 +#define UIP_TCP_WIN_SIZE 14600 +#define UIP_TCP_FLAG_FIN 1 +#define UIP_TCP_FLAG_SYN 2 +#define UIP_TCP_FLAG_RST 4 +#define UIP_TCP_FLAG_PSH 8 +#define 
UIP_TCP_FLAG_ACK 16 +#define UIP_TCP_FLAG_URG 32 + +#define UIP_BOOTP_VENDOR_SPECIFIC_LEN 64 +#define UIP_BOOTP_MAX_PAYLOAD_LEN 300 +#define UIP_DHCP_VENDOR_SPECIFIC_LEN 312 +#define UIP_DHCP_PORT_SERVER 67 +#define UIP_DHCP_PORT_CLIENT 68 +#define UIP_DHCP_MACPAD_LEN 10 +#define UIP_DHCP_HOSTNAME_LEN 64 +#define UIP_DHCP_FILENAME_LEN 128 +#define UIP_DHCP_MAGIC_COOKIE 0x63825363 +#define UIP_DHCP_MAGIC_COOKIE_LEN 4 +#define UIP_DHCP_LEASE_TIME 0x00003840 +#define UIP_DHCP_MAX_PAYLOAD_LEN (UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN + UIP_DHCP_VENDOR_SPECIFIC_LEN) +#define UIP_DHCP_OPTION_LEN (UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN) +#define UIP_DHCP_DISCOVER 1 +#define UIP_DHCP_OFFER 2 +#define UIP_DHCP_REQUEST 3 +#define UIP_DHCP_ACK 5 +#define UIP_DHCP_MAX_DNS_SERVER_NR 3 +#define UIP_DHCP_MAX_DOMAIN_NAME_LEN 256 +#define UIP_DHCP_TAG_MSG_TYPE 53 +#define UIP_DHCP_TAG_MSG_TYPE_LEN 1 +#define UIP_DHCP_TAG_SERVER_ID 54 +#define UIP_DHCP_TAG_SERVER_ID_LEN 4 +#define UIP_DHCP_TAG_LEASE_TIME 51 +#define UIP_DHCP_TAG_LEASE_TIME_LEN 4 +#define UIP_DHCP_TAG_SUBMASK 1 +#define UIP_DHCP_TAG_SUBMASK_LEN 4 +#define UIP_DHCP_TAG_ROUTER 3 +#define UIP_DHCP_TAG_ROUTER_LEN 4 +#define UIP_DHCP_TAG_ROOT 17 +#define UIP_DHCP_TAG_ROOT_LEN 4 +#define UIP_DHCP_TAG_DNS_SERVER 6 +#define UIP_DHCP_TAG_DNS_SERVER_LEN 4 +#define UIP_DHCP_TAG_DOMAIN_NAME 15 +#define UIP_DHCP_TAG_END 255 + +/* + * IP package maxium len == 64 KBytes + * IP header == 20 Bytes + * TCP header == 20 Bytes + * UDP header == 8 Bytes + */ +#define UIP_MAX_TCP_PAYLOAD (64*1024 - 20 - 20 - 1) +#define UIP_MAX_UDP_PAYLOAD (64*1024 - 20 - 8 - 1) + +struct uip_eth_addr { + u8 addr[6]; +}; + +struct uip_eth { + struct uip_eth_addr dst; + struct uip_eth_addr src; + u16 type; +} __attribute__((packed)); + +struct uip_arp { + struct uip_eth eth; + u16 hwtype; + u16 proto; + u8 hwlen; + u8 protolen; + u16 op; + struct uip_eth_addr smac; + u32 sip; + struct uip_eth_addr dmac; + u32 dip; +} 
__attribute__((packed)); + +struct uip_ip { + struct uip_eth eth; + u8 vhl; + u8 tos; + /* + * len = IP hdr + IP payload + */ + u16 len; + u16 id; + u16 flgfrag; + u8 ttl; + u8 proto; + u16 csum; + u32 sip; + u32 dip; +} __attribute__((packed)); + +struct uip_icmp { + struct uip_ip ip; + u8 type; + u8 code; + u16 csum; + u16 id; + u16 seq; +} __attribute__((packed)); + +struct uip_udp { + /* + * FIXME: IP Options (IP hdr len > 20 bytes) are not supported + */ + struct uip_ip ip; + u16 sport; + u16 dport; + /* + * len = UDP hdr + UDP payload + */ + u16 len; + u16 csum; + u8 payload[0]; +} __attribute__((packed)); + +struct uip_tcp { + /* + * FIXME: IP Options (IP hdr len > 20 bytes) are not supported + */ + struct uip_ip ip; + u16 sport; + u16 dport; + u32 seq; + u32 ack; + u8 off; + u8 flg; + u16 win; + u16 csum; + u16 urgent; +} __attribute__((packed)); + +struct uip_pseudo_hdr { + u32 sip; + u32 dip; + u8 zero; + u8 proto; + u16 len; +} __attribute__((packed)); + +struct uip_dhcp { + struct uip_udp udp; + u8 msg_type; + u8 hardware_type; + u8 hardware_len; + u8 hops; + u32 id; + u16 time; + u16 flg; + u32 client_ip; + u32 your_ip; + u32 server_ip; + u32 agent_ip; + struct uip_eth_addr client_mac; + u8 pad[UIP_DHCP_MACPAD_LEN]; + u8 server_hostname[UIP_DHCP_HOSTNAME_LEN]; + u8 boot_filename[UIP_DHCP_FILENAME_LEN]; + u32 magic_cookie; + u8 option[UIP_DHCP_OPTION_LEN]; +} __attribute__((packed)); + +struct uip_info { + struct list_head udp_socket_head; + struct list_head tcp_socket_head; + struct mutex udp_socket_lock; + struct mutex tcp_socket_lock; + struct uip_eth_addr guest_mac; + struct uip_eth_addr host_mac; + pthread_cond_t buf_free_cond; + pthread_cond_t buf_used_cond; + struct list_head buf_head; + struct mutex buf_lock; + pthread_t udp_thread; + int udp_epollfd; + int buf_free_nr; + int buf_used_nr; + u32 guest_ip; + u32 guest_netmask; + u32 host_ip; + u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR]; + char *domain_name; + u32 buf_nr; +}; + +struct uip_buf { + 
struct list_head list; + struct uip_info *info; + int vnet_len; + int eth_len; + int status; + char *vnet; + char *eth; + int id; +}; + +struct uip_udp_socket { + struct sockaddr_in addr; + struct list_head list; + struct mutex *lock; + u32 dport, sport; + u32 dip, sip; + int fd; +}; + +struct uip_tcp_socket { + struct sockaddr_in addr; + struct list_head list; + struct uip_info *info; + pthread_cond_t cond; + struct mutex *lock; + pthread_t thread; + u32 dport, sport; + u32 guest_acked; + u16 window_size; + /* + * Initial Sequence Number + */ + u32 isn_server; + u32 isn_guest; + u32 ack_server; + u32 seq_server; + int write_done; + int read_done; + u32 dip, sip; + u8 *payload; + int fd; +}; + +struct uip_tx_arg { + struct virtio_net_hdr *vnet; + struct uip_info *info; + struct uip_eth *eth; + int vnet_len; + int eth_len; +}; + +static inline u16 uip_ip_hdrlen(struct uip_ip *ip) +{ + return (ip->vhl & 0x0f) * 4; +} + +static inline u16 uip_ip_len(struct uip_ip *ip) +{ + return htons(ip->len); +} + +static inline u16 uip_udp_hdrlen(struct uip_udp *udp) +{ + return 8; +} + +static inline u16 uip_udp_len(struct uip_udp *udp) +{ + return ntohs(udp->len); +} + +static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp) +{ + return (tcp->off >> 4) * 4; +} + +static inline u16 uip_tcp_len(struct uip_tcp *tcp) +{ + struct uip_ip *ip; + + ip = &tcp->ip; + + return uip_ip_len(ip) - uip_ip_hdrlen(ip); +} + +static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp) +{ + return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp); +} + +static inline u8 *uip_tcp_payload(struct uip_tcp *tcp) +{ + return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp); +} + +static inline bool uip_tcp_is_syn(struct uip_tcp *tcp) +{ + return (tcp->flg & UIP_TCP_FLAG_SYN) != 0; +} + +static inline bool uip_tcp_is_fin(struct uip_tcp *tcp) +{ + return (tcp->flg & UIP_TCP_FLAG_FIN) != 0; +} + +static inline u32 uip_tcp_isn(struct uip_tcp *tcp) +{ + return ntohl(tcp->seq); +} + +static inline u32 uip_tcp_isn_alloc(void) +{ + /* 
+ * FIXME: should increase every 4ms + */ + return 10000000; +} + +static inline u16 uip_eth_hdrlen(struct uip_eth *eth) +{ + return sizeof(*eth); +} + +int uip_tx(struct iovec *iov, u16 out, struct uip_info *info); +int uip_rx(struct iovec *iov, u16 in, struct uip_info *info); +int uip_init(struct uip_info *info); + +int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg); +int uip_tx_do_ipv4(struct uip_tx_arg *arg); +int uip_tx_do_arp(struct uip_tx_arg *arg); + +u16 uip_csum_icmp(struct uip_icmp *icmp); +u16 uip_csum_udp(struct uip_udp *udp); +u16 uip_csum_tcp(struct uip_tcp *tcp); +u16 uip_csum_ip(struct uip_ip *ip); + +struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf); +struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf); +struct uip_buf *uip_buf_get_used(struct uip_info *info); +struct uip_buf *uip_buf_get_free(struct uip_info *info); +struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg); + +int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len); +bool uip_udp_is_dhcp(struct uip_udp *udp); + +int uip_dhcp_get_dns(struct uip_info *info); +#endif /* KVM__UIP_H */ diff --git a/tools/kvm/include/kvm/util-init.h b/tools/kvm/include/kvm/util-init.h new file mode 100644 index 000000000000..13d4f04df678 --- /dev/null +++ b/tools/kvm/include/kvm/util-init.h @@ -0,0 +1,51 @@ +#ifndef KVM__UTIL_INIT_H +#define KVM__UTIL_INIT_H + +struct kvm; + +struct init_item { + struct hlist_node n; + const char *fn_name; + int (*init)(struct kvm *); +}; + +int init_list__init(struct kvm *kvm); +int init_list__exit(struct kvm *kvm); + +int init_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name); +int exit_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const 
char *name); + +#define __init_list_add(cb, l) \ +static void __attribute__ ((constructor)) __init__##cb(void) \ +{ \ + static char name[] = #cb; \ + static struct init_item t; \ + init_list_add(&t, cb, l, name); \ +} + +#define __exit_list_add(cb, l) \ +static void __attribute__ ((constructor)) __init__##cb(void) \ +{ \ + static char name[] = #cb; \ + static struct init_item t; \ + exit_list_add(&t, cb, l, name); \ +} + +#define core_init(cb) __init_list_add(cb, 0) +#define base_init(cb) __init_list_add(cb, 2) +#define dev_base_init(cb) __init_list_add(cb, 4) +#define dev_init(cb) __init_list_add(cb, 5) +#define virtio_dev_init(cb) __init_list_add(cb, 6) +#define firmware_init(cb) __init_list_add(cb, 7) +#define late_init(cb) __init_list_add(cb, 9) + +#define core_exit(cb) __exit_list_add(cb, 0) +#define base_exit(cb) __exit_list_add(cb, 2) +#define dev_base_exit(cb) __exit_list_add(cb, 4) +#define dev_exit(cb) __exit_list_add(cb, 5) +#define virtio_dev_exit(cb) __exit_list_add(cb, 6) +#define firmware_exit(cb) __exit_list_add(cb, 7) +#define late_exit(cb) __exit_list_add(cb, 9) +#endif diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h new file mode 100644 index 000000000000..0df9f0dfdb43 --- /dev/null +++ b/tools/kvm/include/kvm/util.h @@ -0,0 +1,97 @@ +#include <linux/stringify.h> + +#ifndef KVM__UTIL_H +#define KVM__UTIL_H + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * Some bits are stolen from perf tool :) + */ + +#include <unistd.h> +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <stdbool.h> +#include <signal.h> +#include <errno.h> +#include <limits.h> +#include <sys/param.h> +#include <sys/types.h> +#include <linux/types.h> + +#ifdef __GNUC__ +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#ifndef __attribute__ +#define __attribute__(x) +#endif +#endif + +extern bool do_debug_print; + +#define PROT_RW (PROT_READ|PROT_WRITE) 
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE) + +extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2))); +extern void die_perror(const char *s) NORETURN; +extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN); + +#define pr_debug(fmt, ...) \ + do { \ + if (do_debug_print) \ + pr_info("(%s) %s:%d: " fmt, __FILE__, \ + __func__, __LINE__, ##__VA_ARGS__); \ + } while (0) + + +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#ifndef BUG_ON_HANDLER +# define BUG_ON_HANDLER(condition) \ + do { \ + if ((condition)) { \ + pr_err("BUG at %s:%d", __FILE__, __LINE__); \ + raise(SIGABRT); \ + } \ + } while (0) +#endif + +#define BUG_ON(condition) BUG_ON_HANDLER((condition)) + +#define DIE_IF(cnd) \ +do { \ + if (cnd) \ + die(" at (" __FILE__ ":" __stringify(__LINE__) "): " \ + __stringify(cnd) "\n"); \ +} while (0) + +#define WARN_ON(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (__ret_warn_on) \ + pr_warning("(%s) %s:%d: failed condition: %s", \ + __FILE__, __func__, __LINE__, \ + __stringify(condition)); \ + __ret_warn_on; \ +}) + +#define MSECS_TO_USECS(s) ((s) * 1000) + +/* Millisecond sleep */ +static inline void msleep(unsigned int msecs) +{ + usleep(MSECS_TO_USECS(msecs)); +} + +struct kvm; +void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size); +void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size); + +#endif /* KVM__UTIL_H */ diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h new file mode 100644 index 000000000000..ac041d9d34e3 --- /dev/null +++ b/tools/kvm/include/kvm/vesa.h @@ -0,0 +1,18 @@ +#ifndef KVM__VESA_H 
+#define KVM__VESA_H + +#include <linux/types.h> + +#define VESA_WIDTH 640 +#define VESA_HEIGHT 480 + +#define VESA_MEM_ADDR 0xd0000000 +#define VESA_MEM_SIZE (4*VESA_WIDTH*VESA_HEIGHT) +#define VESA_BPP 32 + +struct kvm; +struct biosregs; + +struct framebuffer *vesa__init(struct kvm *self); + +#endif diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h new file mode 100644 index 000000000000..19ffe505a74c --- /dev/null +++ b/tools/kvm/include/kvm/virtio-9p.h @@ -0,0 +1,76 @@ +#ifndef KVM__VIRTIO_9P_H +#define KVM__VIRTIO_9P_H +#include "kvm/virtio.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/parse-options.h" + +#include <sys/types.h> +#include <dirent.h> +#include <linux/list.h> +#include <linux/rbtree.h> + +#define NUM_VIRT_QUEUES 1 +#define VIRTQUEUE_NUM 128 +#define VIRTIO_9P_DEFAULT_TAG "kvm_9p" +#define VIRTIO_9P_HDR_LEN (sizeof(u32)+sizeof(u8)+sizeof(u16)) +#define VIRTIO_9P_VERSION_DOTL "9P2000.L" +#define MAX_TAG_LEN 32 + +struct p9_msg { + u32 size; + u8 cmd; + u16 tag; + u8 msg[0]; +} __attribute__((packed)); + +struct p9_fid { + u32 fid; + u32 uid; + char abs_path[PATH_MAX]; + char *path; + DIR *dir; + int fd; + struct rb_node node; +}; + +struct p9_dev_job { + struct virt_queue *vq; + struct p9_dev *p9dev; + struct thread_pool__job job_id; +}; + +struct p9_dev { + struct list_head list; + struct virtio_device vdev; + struct rb_root fids; + + struct virtio_9p_config *config; + u32 features; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct p9_dev_job jobs[NUM_VIRT_QUEUES]; + char root_dir[PATH_MAX]; +}; + +struct p9_pdu { + u32 queue_head; + size_t read_offset; + size_t write_offset; + u16 out_iov_cnt; + u16 in_iov_cnt; + struct iovec in_iov[VIRTQUEUE_NUM]; + struct iovec out_iov[VIRTQUEUE_NUM]; +}; + +struct kvm; + +int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset); +int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int 
unset); +int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name); +int virtio_9p__init(struct kvm *kvm); +int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...); +int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...); + +#endif diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h new file mode 100644 index 000000000000..844a1bab7e41 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-balloon.h @@ -0,0 +1,9 @@ +#ifndef KVM__BLN_VIRTIO_H +#define KVM__BLN_VIRTIO_H + +struct kvm; + +int virtio_bln__init(struct kvm *kvm); +int virtio_bln__exit(struct kvm *kvm); + +#endif /* KVM__BLN_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h new file mode 100644 index 000000000000..12e59b6b21fa --- /dev/null +++ b/tools/kvm/include/kvm/virtio-blk.h @@ -0,0 +1,12 @@ +#ifndef KVM__BLK_VIRTIO_H +#define KVM__BLK_VIRTIO_H + +#include "kvm/disk-image.h" + +struct kvm; + +int virtio_blk__init(struct kvm *kvm); +int virtio_blk__exit(struct kvm *kvm); +void virtio_blk_complete(void *param, long len); + +#endif /* KVM__BLK_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h new file mode 100644 index 000000000000..89809208786b --- /dev/null +++ b/tools/kvm/include/kvm/virtio-console.h @@ -0,0 +1,10 @@ +#ifndef KVM__CONSOLE_VIRTIO_H +#define KVM__CONSOLE_VIRTIO_H + +struct kvm; + +int virtio_console__init(struct kvm *kvm); +void virtio_console__inject_interrupt(struct kvm *kvm); +int virtio_console__exit(struct kvm *kvm); + +#endif /* KVM__CONSOLE_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-mmio.h b/tools/kvm/include/kvm/virtio-mmio.h new file mode 100644 index 000000000000..983c8fc1eed9 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-mmio.h @@ -0,0 +1,59 @@ +#ifndef KVM__VIRTIO_MMIO_H +#define KVM__VIRTIO_MMIO_H + +#include <linux/types.h> +#include <linux/virtio_mmio.h> + +#define 
VIRTIO_MMIO_MAX_VQ 3 +#define VIRTIO_MMIO_MAX_CONFIG 1 +#define VIRTIO_MMIO_IO_SIZE 0x200 + +struct kvm; + +struct virtio_mmio_ioevent_param { + struct virtio_device *vdev; + u32 vq; +}; + +struct virtio_mmio_hdr { + char magic[4]; + u32 version; + u32 device_id; + u32 vendor_id; + u32 host_features; + u32 host_features_sel; + u32 reserved_1[2]; + u32 guest_features; + u32 guest_features_sel; + u32 guest_page_size; + u32 reserved_2; + u32 queue_sel; + u32 queue_num_max; + u32 queue_num; + u32 queue_align; + u32 queue_pfn; + u32 reserved_3[3]; + u32 queue_notify; + u32 reserved_4[3]; + u32 interrupt_state; + u32 interrupt_ack; + u32 reserved_5[2]; + u32 status; +} __attribute__((packed)); + +struct virtio_mmio { + u32 addr; + void *dev; + struct kvm *kvm; + u8 irq; + struct virtio_mmio_hdr hdr; + struct device_header dev_hdr; + struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ]; +}; + +int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq); +int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev); +int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev); +int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); +#endif diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h new file mode 100644 index 000000000000..db43d9874796 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-net.h @@ -0,0 +1,30 @@ +#ifndef KVM__VIRTIO_NET_H +#define KVM__VIRTIO_NET_H + +#include "kvm/parse-options.h" + +struct kvm; + +struct virtio_net_params { + const char *guest_ip; + const char *host_ip; + const char *script; + const char *trans; + char guest_mac[6]; + char host_mac[6]; + struct kvm *kvm; + int mode; + int vhost; + int fd; +}; + +int virtio_net__init(struct kvm *kvm); +int virtio_net__exit(struct kvm *kvm); +int netdev_parser(const struct option *opt, const char *arg, int unset); + +enum { + NET_MODE_USER, + NET_MODE_TAP 
+}; + +#endif /* KVM__VIRTIO_NET_H */ diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h new file mode 100644 index 000000000000..48ae018e43e3 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-pci-dev.h @@ -0,0 +1,38 @@ +#ifndef VIRTIO_PCI_DEV_H_ +#define VIRTIO_PCI_DEV_H_ + +#include <linux/virtio_ids.h> + +/* + * Virtio PCI device constants and resources + * they do use (such as irqs and pins). + */ + +#define PCI_DEVICE_ID_VIRTIO_NET 0x1000 +#define PCI_DEVICE_ID_VIRTIO_BLK 0x1001 +#define PCI_DEVICE_ID_VIRTIO_CONSOLE 0x1003 +#define PCI_DEVICE_ID_VIRTIO_RNG 0x1004 +#define PCI_DEVICE_ID_VIRTIO_BLN 0x1005 +#define PCI_DEVICE_ID_VIRTIO_SCSI 0x1008 +#define PCI_DEVICE_ID_VIRTIO_9P 0x1009 +#define PCI_DEVICE_ID_VESA 0x2000 +#define PCI_DEVICE_ID_PCI_SHMEM 0x0001 + +#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4 +#define PCI_VENDOR_ID_PCI_SHMEM 0x0001 +#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET 0x1af4 + +#define PCI_SUBSYSTEM_ID_VESA 0x0004 +#define PCI_SUBSYSTEM_ID_PCI_SHMEM 0x0001 + +#define PCI_CLASS_BLK 0x018000 +#define PCI_CLASS_NET 0x020000 +#define PCI_CLASS_CONSOLE 0x078000 +/* + * 0xFF Device does not fit in any defined classes + */ +#define PCI_CLASS_RNG 0xff0000 +#define PCI_CLASS_BLN 0xff0000 +#define PCI_CLASS_9P 0xff0000 + +#endif /* VIRTIO_PCI_DEV_H_ */ diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h new file mode 100644 index 000000000000..6d9a55868df3 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-pci.h @@ -0,0 +1,51 @@ +#ifndef KVM__VIRTIO_PCI_H +#define KVM__VIRTIO_PCI_H + +#include "kvm/devices.h" +#include "kvm/pci.h" + +#include <linux/types.h> + +#define VIRTIO_PCI_MAX_VQ 3 +#define VIRTIO_PCI_MAX_CONFIG 1 + +struct kvm; + +struct virtio_pci_ioevent_param { + struct virtio_device *vdev; + u32 vq; +}; + +#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0) + +struct virtio_pci { + struct pci_device_header pci_hdr; + struct device_header dev_hdr; + void *dev; + + u16 
base_addr; + u8 status; + u8 isr; + u32 features; + + /* MSI-X */ + u16 config_vector; + u32 config_gsi; + u32 vq_vector[VIRTIO_PCI_MAX_VQ]; + u32 gsis[VIRTIO_PCI_MAX_VQ]; + u32 msix_io_block; + u64 msix_pba; + struct msix_table msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG]; + + /* virtio queue */ + u16 queue_selector; + struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ]; +}; + +int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq); +int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev); +int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev); +int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); + +#endif diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h new file mode 100644 index 000000000000..b585b372cd49 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-rng.h @@ -0,0 +1,9 @@ +#ifndef KVM__RNG_VIRTIO_H +#define KVM__RNG_VIRTIO_H + +struct kvm; + +int virtio_rng__init(struct kvm *kvm); +int virtio_rng__exit(struct kvm *kvm); + +#endif /* KVM__RNG_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio-scsi.h b/tools/kvm/include/kvm/virtio-scsi.h new file mode 100644 index 000000000000..a780d7eee790 --- /dev/null +++ b/tools/kvm/include/kvm/virtio-scsi.h @@ -0,0 +1,26 @@ +#ifndef KVM__SCSI_VIRTIO_H +#define KVM__SCSI_VIRTIO_H + +#include "kvm/disk-image.h" + +struct kvm; + +int virtio_scsi_init(struct kvm *kvm); +int virtio_scsi_exit(struct kvm *kvm); + +/*----------------------------------------------------*/ +/* TODO: Remove this when tcm_vhost goes upstream */ +#define TRANSPORT_IQN_LEN 224 +#define VHOST_SCSI_ABI_VERSION 0 +struct vhost_scsi_target { + int abi_version; + unsigned char vhost_wwpn[TRANSPORT_IQN_LEN]; + unsigned short vhost_tpgt; +}; +/* VHOST_SCSI specific defines */ +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target) +#define 
VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target) +#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, struct vhost_scsi_target) +/*----------------------------------------------------*/ + +#endif /* KVM__SCSI_VIRTIO_H */ diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h new file mode 100644 index 000000000000..924279b1ba03 --- /dev/null +++ b/tools/kvm/include/kvm/virtio.h @@ -0,0 +1,92 @@ +#ifndef KVM__VIRTIO_H +#define KVM__VIRTIO_H + +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> + +#include <linux/types.h> +#include <sys/uio.h> + +#include "kvm/kvm.h" + +#define VIRTIO_IRQ_LOW 0 +#define VIRTIO_IRQ_HIGH 1 + +#define VIRTIO_PCI_O_CONFIG 0 +#define VIRTIO_PCI_O_MSIX 1 + +struct virt_queue { + struct vring vring; + u32 pfn; + /* The last_avail_idx field is an index to ->ring of struct vring_avail. + It's where we assume the next request index is at. */ + u16 last_avail_idx; + u16 last_used_signalled; +}; + +static inline u16 virt_queue__pop(struct virt_queue *queue) +{ + return queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num]; +} + +static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx) +{ + return &queue->vring.desc[desc_ndx]; +} + +static inline bool virt_queue__available(struct virt_queue *vq) +{ + if (!vq->vring.avail) + return 0; + + vring_avail_event(&vq->vring) = vq->last_avail_idx; + return vq->vring.avail->idx != vq->last_avail_idx; +} + +struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len); + +bool virtio_queue__should_signal(struct virt_queue *vq); +u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], + u16 *out, u16 *in, struct kvm *kvm); +u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], + u16 *out, u16 *in, u16 head, struct kvm *kvm); +u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue, + struct iovec in_iov[], struct 
iovec out_iov[], + u16 *in, u16 *out); +int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off); + +enum virtio_trans { + VIRTIO_PCI, + VIRTIO_MMIO, +}; + +struct virtio_device { + bool use_vhost; + void *virtio; + struct virtio_ops *ops; +}; + +struct virtio_ops { + u8 *(*get_config)(struct kvm *kvm, void *dev); + u32 (*get_host_features)(struct kvm *kvm, void *dev); + void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features); + int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 page_size, + u32 align, u32 pfn); + int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq); + int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size); + void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi); + void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd); + int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid); + int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev); + int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class); + int (*exit)(struct kvm *kvm, struct virtio_device *vdev); +}; + +int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + struct virtio_ops *ops, enum virtio_trans trans, + int device_id, int subsys_id, int class); +int virtio_compat_add_message(const char *device, const char *config); +#endif /* KVM__VIRTIO_H */ diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h new file mode 100644 index 000000000000..c2934a45f6dc --- /dev/null +++ b/tools/kvm/include/kvm/vnc.h @@ -0,0 +1,22 @@ +#ifndef KVM__VNC_H +#define KVM__VNC_H + +#include "kvm/kvm.h" + +struct framebuffer; + +#ifdef CONFIG_HAS_VNCSERVER +int vnc__init(struct kvm *kvm); +int vnc__exit(struct kvm *kvm); +#else +static inline int vnc__init(struct kvm *kvm) +{ + return 0; +} +static inline int 
vnc__exit(struct kvm *kvm) +{ + return 0; +} +#endif + +#endif /* KVM__VNC_H */ diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h new file mode 100644 index 000000000000..56448b71ebbf --- /dev/null +++ b/tools/kvm/include/linux/bitops.h @@ -0,0 +1,33 @@ +#ifndef _KVM_LINUX_BITOPS_H_ +#define _KVM_LINUX_BITOPS_H_ + +#include <linux/kernel.h> +#include <linux/compiler.h> +#include <asm/hweight.h> + +#define BITS_PER_LONG __WORDSIZE +#define BITS_PER_BYTE 8 +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) + +static inline void set_bit(int nr, unsigned long *addr) +{ + addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); +} + +static inline void clear_bit(int nr, unsigned long *addr) +{ + addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); +} + +static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} + +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? 
hweight32(w) : hweight64(w); +} + +#endif diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h new file mode 100644 index 000000000000..c490de8a89f4 --- /dev/null +++ b/tools/kvm/include/linux/byteorder.h @@ -0,0 +1,7 @@ +#ifndef __BYTE_ORDER_H__ +#define __BYTE_ORDER_H__ + +#include <asm/byteorder.h> +#include <linux/byteorder/generic.h> + +#endif diff --git a/tools/kvm/include/linux/compiler.h b/tools/kvm/include/linux/compiler.h new file mode 100644 index 000000000000..898420b81aec --- /dev/null +++ b/tools/kvm/include/linux/compiler.h @@ -0,0 +1,20 @@ +#ifndef _PERF_LINUX_COMPILER_H_ +#define _PERF_LINUX_COMPILER_H_ + +#ifndef __always_inline +#define __always_inline inline +#endif +#define __user + +#ifndef __attribute_const__ +#define __attribute_const__ +#endif + +#define __used __attribute__((__unused__)) +#define __packed __attribute__((packed)) +#define __iomem +#define __force +#define __must_check +#define unlikely + +#endif diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h new file mode 100644 index 000000000000..1e9abe9a4d0c --- /dev/null +++ b/tools/kvm/include/linux/kernel.h @@ -0,0 +1,41 @@ + +#ifndef KVM__LINUX_KERNEL_H_ +#define KVM__LINUX_KERNEL_H_ + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define true 1 + +#endif diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h new file mode 100644 index 000000000000..0e4c6a3986f5 --- /dev/null +++ b/tools/kvm/include/linux/module.h @@ -0,0 +1,6 @@ +#ifndef KVM__LINUX_MODULE_H +#define KVM__LINUX_MODULE_H + +#define EXPORT_SYMBOL(name) + +#endif diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h new file mode 100644 index 000000000000..62f67889c52f --- /dev/null +++ b/tools/kvm/include/linux/prefetch.h @@ -0,0 +1,6 @@ +#ifndef KVM__LINUX_PREFETCH_H +#define KVM__LINUX_PREFETCH_H + +static inline void prefetch(void *a __attribute__((unused))) { } + +#endif diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h new file mode 100644 index 000000000000..39da8088d942 --- /dev/null +++ b/tools/kvm/include/linux/stddef.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_STDDEF_H +#define _LINUX_STDDEF_H + +#undef NULL +#define NULL ((void *)0) + +#undef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) + +#endif diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h new file mode 100644 index 000000000000..5e20f10f8830 --- /dev/null +++ b/tools/kvm/include/linux/types.h @@ -0,0 +1,51 @@ +#ifndef LINUX_TYPES_H +#define LINUX_TYPES_H + +#include <kvm/compiler.h> +#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#include <asm/types.h> + +typedef __u64 u64; +typedef __s64 s64; + +typedef __u32 u32; +typedef __s32 s32; + 
+typedef __u16 u16; +typedef __s16 s16; + +typedef __u8 u8; +typedef __s8 s8; + +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#ifdef __CHECK_ENDIAN__ +#define __bitwise __bitwise__ +#else +#define __bitwise +#endif + + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +struct list_head { + struct list_head *next, *prev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#endif /* LINUX_TYPES_H */ diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c new file mode 100644 index 000000000000..ff665d410ba5 --- /dev/null +++ b/tools/kvm/ioeventfd.c @@ -0,0 +1,218 @@ +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <pthread.h> +#include <unistd.h> +#include <stdio.h> +#include <signal.h> + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/types.h> + +#include "kvm/ioeventfd.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#define IOEVENTFD_MAX_EVENTS 20 + +static struct epoll_event events[IOEVENTFD_MAX_EVENTS]; +static int epoll_fd, epoll_stop_fd; +static LIST_HEAD(used_ioevents); +static bool ioeventfd_avail; + +static void *ioeventfd__thread(void *param) +{ + u64 tmp = 1; + + kvm__set_thread_name("ioeventfd-worker"); + + for (;;) { + int nfds, i; + + nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1); + for (i = 0; i < nfds; i++) { + struct ioevent *ioevent; + + if (events[i].data.fd == epoll_stop_fd) + goto done; + + ioevent = events[i].data.ptr; + + if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0) + die("Failed reading event"); + + ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr); + } + } + +done: + tmp = write(epoll_stop_fd, &tmp, sizeof(tmp)); + + return NULL; +} + +static int ioeventfd__start(void) +{ + pthread_t thread; + + if (!ioeventfd_avail) + 
return -ENOSYS; + + return pthread_create(&thread, NULL, ioeventfd__thread, NULL); +} + +int ioeventfd__init(struct kvm *kvm) +{ + struct epoll_event epoll_event = {.events = EPOLLIN}; + int r; + + ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD); + if (!ioeventfd_avail) + return 1; /* Not fatal, but let caller determine no-go. */ + + epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS); + if (epoll_fd < 0) + return -errno; + + epoll_stop_fd = eventfd(0, 0); + epoll_event.data.fd = epoll_stop_fd; + + r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event); + if (r < 0) + goto cleanup; + + r = ioeventfd__start(); + if (r < 0) + goto cleanup; + + r = 0; + + return r; + +cleanup: + close(epoll_stop_fd); + close(epoll_fd); + + return r; +} +base_init(ioeventfd__init); + +int ioeventfd__exit(struct kvm *kvm) +{ + u64 tmp = 1; + int r; + + if (!ioeventfd_avail) + return 0; + + r = write(epoll_stop_fd, &tmp, sizeof(tmp)); + if (r < 0) + return r; + + r = read(epoll_stop_fd, &tmp, sizeof(tmp)); + if (r < 0) + return r; + + close(epoll_fd); + close(epoll_stop_fd); + + return 0; +} +base_exit(ioeventfd__exit); + +int ioeventfd__add_event(struct ioevent *ioevent, bool is_pio, bool poll_in_userspace) +{ + struct kvm_ioeventfd kvm_ioevent; + struct epoll_event epoll_event; + struct ioevent *new_ioevent; + int event, r; + + if (!ioeventfd_avail) + return -ENOSYS; + + new_ioevent = malloc(sizeof(*new_ioevent)); + if (new_ioevent == NULL) + return -ENOMEM; + + *new_ioevent = *ioevent; + event = new_ioevent->fd; + + kvm_ioevent = (struct kvm_ioeventfd) { + .addr = ioevent->io_addr, + .len = ioevent->io_len, + .datamatch = ioevent->datamatch, + .fd = event, + .flags = KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + if (is_pio) + kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO; + + r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent); + if (r) { + r = -errno; + goto cleanup; + } + + if (!poll_in_userspace) + return 0; + + epoll_event = (struct epoll_event) { + 
.events = EPOLLIN, + .data.ptr = new_ioevent, + }; + + r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event); + if (r) { + r = -errno; + goto cleanup; + } + + list_add_tail(&new_ioevent->list, &used_ioevents); + + return 0; + +cleanup: + free(new_ioevent); + return r; +} + +int ioeventfd__del_event(u64 addr, u64 datamatch) +{ + struct kvm_ioeventfd kvm_ioevent; + struct ioevent *ioevent; + u8 found = 0; + + if (!ioeventfd_avail) + return -ENOSYS; + + list_for_each_entry(ioevent, &used_ioevents, list) { + if (ioevent->io_addr == addr) { + found = 1; + break; + } + } + + if (found == 0 || ioevent == NULL) + return -ENOENT; + + kvm_ioevent = (struct kvm_ioeventfd) { + .addr = ioevent->io_addr, + .len = ioevent->io_len, + .datamatch = ioevent->datamatch, + .flags = KVM_IOEVENTFD_FLAG_PIO + | KVM_IOEVENTFD_FLAG_DEASSIGN + | KVM_IOEVENTFD_FLAG_DATAMATCH, + }; + + ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent); + + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL); + + list_del(&ioevent->list); + + close(ioevent->fd); + free(ioevent); + + return 0; +} diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c new file mode 100644 index 000000000000..a4f15827acb4 --- /dev/null +++ b/tools/kvm/ioport.c @@ -0,0 +1,198 @@ +#include "kvm/ioport.h" + +#include "kvm/kvm.h" +#include "kvm/util.h" +#include "kvm/brlock.h" +#include "kvm/rbtree-interval.h" +#include "kvm/mutex.h" + +#include <linux/kvm.h> /* for KVM_EXIT_* */ +#include <linux/types.h> + +#include <stdbool.h> +#include <limits.h> +#include <stdlib.h> +#include <stdio.h> + +#define ioport_node(n) rb_entry(n, struct ioport, node) + +DEFINE_MUTEX(ioport_mutex); + +static u16 free_io_port_idx; /* protected by ioport_mutex */ + +static struct rb_root ioport_tree = RB_ROOT; + +static u16 ioport__find_free_port(void) +{ + u16 free_port; + + mutex_lock(&ioport_mutex); + free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE; + free_io_port_idx++; + mutex_unlock(&ioport_mutex); + + return free_port; +} + 
+static struct ioport *ioport_search(struct rb_root *root, u64 addr) +{ + struct rb_int_node *node; + + node = rb_int_search_single(root, addr); + if (node == NULL) + return NULL; + + return ioport_node(node); +} + +static int ioport_insert(struct rb_root *root, struct ioport *data) +{ + return rb_int_insert(root, &data->node); +} + +static void ioport_remove(struct rb_root *root, struct ioport *data) +{ + rb_int_erase(root, &data->node); +} + +int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, int count, void *param) +{ + struct ioport *entry; + int r; + + br_write_lock(kvm); + if (port == IOPORT_EMPTY) + port = ioport__find_free_port(); + + entry = ioport_search(&ioport_tree, port); + if (entry) { + pr_warning("ioport re-registered: %x", port); + rb_int_erase(&ioport_tree, &entry->node); + } + + entry = malloc(sizeof(*entry)); + if (entry == NULL) + return -ENOMEM; + + *entry = (struct ioport) { + .node = RB_INT_INIT(port, port + count), + .ops = ops, + .priv = param, + }; + + r = ioport_insert(&ioport_tree, entry); + if (r < 0) { + free(entry); + br_write_unlock(kvm); + return r; + } + br_write_unlock(kvm); + + return port; +} + +int ioport__unregister(struct kvm *kvm, u16 port) +{ + struct ioport *entry; + int r; + + br_write_lock(kvm); + + r = -ENOENT; + entry = ioport_search(&ioport_tree, port); + if (!entry) + goto done; + + ioport_remove(&ioport_tree, entry); + + free(entry); + + r = 0; + +done: + br_write_unlock(kvm); + + return r; +} + +static void ioport__unregister_all(void) +{ + struct ioport *entry; + struct rb_node *rb; + struct rb_int_node *rb_node; + + rb = rb_first(&ioport_tree); + while (rb) { + rb_node = rb_int(rb); + entry = ioport_node(rb_node); + ioport_remove(&ioport_tree, entry); + free(entry); + rb = rb_first(&ioport_tree); + } +} + +static const char *to_direction(int direction) +{ + if (direction == KVM_EXIT_IO_IN) + return "IN"; + else + return "OUT"; +} + +static void ioport_error(u16 port, void *data, int 
direction, int size, u32 count) +{ + fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count); +} + +bool kvm__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + struct ioport_operations *ops; + bool ret = false; + struct ioport *entry; + void *ptr = data; + + br_read_lock(); + entry = ioport_search(&ioport_tree, port); + if (!entry) + goto error; + + ops = entry->ops; + + while (count--) { + if (direction == KVM_EXIT_IO_IN && ops->io_in) + ret = ops->io_in(entry, kvm, port, ptr, size); + else if (ops->io_out) + ret = ops->io_out(entry, kvm, port, ptr, size); + + ptr += size; + } + + br_read_unlock(); + + if (!ret) + goto error; + + return true; +error: + br_read_unlock(); + + if (kvm->cfg.ioport_debug) + ioport_error(port, data, direction, size, count); + + return !kvm->cfg.ioport_debug; +} + +int ioport__init(struct kvm *kvm) +{ + ioport__setup_arch(kvm); + + return 0; +} +dev_base_init(ioport__init); + +int ioport__exit(struct kvm *kvm) +{ + ioport__unregister_all(); + return 0; +} +dev_base_exit(ioport__exit); diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c new file mode 100644 index 000000000000..2520b08847e8 --- /dev/null +++ b/tools/kvm/kvm-cmd.c @@ -0,0 +1,91 @@ +#include <stdio.h> +#include <string.h> +#include <errno.h> + +/* user defined header files */ +#include "kvm/builtin-debug.h" +#include "kvm/builtin-pause.h" +#include "kvm/builtin-resume.h" +#include "kvm/builtin-balloon.h" +#include "kvm/builtin-list.h" +#include "kvm/builtin-version.h" +#include "kvm/builtin-setup.h" +#include "kvm/builtin-stop.h" +#include "kvm/builtin-stat.h" +#include "kvm/builtin-help.h" +#include "kvm/builtin-sandbox.h" +#include "kvm/kvm-cmd.h" +#include "kvm/builtin-run.h" +#include "kvm/util.h" + +struct cmd_struct kvm_commands[] = { + { "pause", kvm_cmd_pause, kvm_pause_help, 0 }, + { "resume", kvm_cmd_resume, kvm_resume_help, 0 }, + { "debug", kvm_cmd_debug, kvm_debug_help, 
0 }, + { "balloon", kvm_cmd_balloon, kvm_balloon_help, 0 }, + { "list", kvm_cmd_list, kvm_list_help, 0 }, + { "version", kvm_cmd_version, NULL, 0 }, + { "--version", kvm_cmd_version, NULL, 0 }, + { "stop", kvm_cmd_stop, kvm_stop_help, 0 }, + { "stat", kvm_cmd_stat, kvm_stat_help, 0 }, + { "help", kvm_cmd_help, NULL, 0 }, + { "setup", kvm_cmd_setup, kvm_setup_help, 0 }, + { "run", kvm_cmd_run, kvm_run_help, 0 }, + { "sandbox", kvm_cmd_sandbox, kvm_run_help, 0 }, + { NULL, NULL, NULL, 0 }, +}; + +/* + * kvm_get_command: Searches the command in an array of the commands and + * returns a pointer to cmd_struct if a match is found. + * + * Input parameters: + * command: Array of possible commands. The last entry in the array must be + * NULL. + * cmd: A string command to search in the array + * + * Return Value: + * NULL: If the cmd is not matched with any of the command in the command array + * p: Pointer to cmd_struct of the matching command + */ +struct cmd_struct *kvm_get_command(struct cmd_struct *command, + const char *cmd) +{ + struct cmd_struct *p = command; + + while (p->cmd) { + if (!strcmp(p->cmd, cmd)) + return p; + p++; + } + return NULL; +} + +int handle_command(struct cmd_struct *command, int argc, const char **argv) +{ + struct cmd_struct *p; + const char *prefix = NULL; + int ret = 0; + + if (!argv || !*argv) { + p = kvm_get_command(command, "help"); + BUG_ON(!p); + return p->fn(argc, argv, prefix); + } + + p = kvm_get_command(command, argv[0]); + if (!p) { + p = kvm_get_command(command, "help"); + BUG_ON(!p); + p->fn(0, NULL, prefix); + return EINVAL; + } + + ret = p->fn(argc - 1, &argv[1], prefix); + if (ret < 0) { + if (errno == EPERM) + die("Permission error - are you root?"); + } + + return ret; +} diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c new file mode 100644 index 000000000000..be05c4988dcd --- /dev/null +++ b/tools/kvm/kvm-cpu.c @@ -0,0 +1,242 @@ +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include 
"kvm/kvm.h" + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +extern __thread struct kvm_cpu *current_kvm_cpu; + +void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu) +{ + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP, + }; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0) + pr_warning("KVM_SET_GUEST_DEBUG failed"); +} + +void kvm_cpu__run(struct kvm_cpu *vcpu) +{ + int err; + + if (!vcpu->is_running) + return; + + err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0); + if (err < 0 && (errno != EINTR && errno != EAGAIN)) + die_perror("KVM_RUN failed"); +} + +static void kvm_cpu_signal_handler(int signum) +{ + if (signum == SIGKVMEXIT) { + if (current_kvm_cpu && current_kvm_cpu->is_running) { + current_kvm_cpu->is_running = false; + kvm__continue(current_kvm_cpu->kvm); + } + } else if (signum == SIGKVMPAUSE) { + current_kvm_cpu->paused = 1; + } +} + +static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu) +{ + if (cpu->ring) { + while (cpu->ring->first != cpu->ring->last) { + struct kvm_coalesced_mmio *m; + m = &cpu->ring->coalesced_mmio[cpu->ring->first]; + kvm_cpu__emulate_mmio(cpu->kvm, + m->phys_addr, + m->data, + m->len, + 1); + cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX; + } + } +} + +void kvm_cpu__reboot(struct kvm *kvm) +{ + int i; + + /* The kvm->cpus array contains a null pointer in the last location */ + for (i = 0; ; i++) { + if (kvm->cpus[i]) + pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT); + else + break; + } +} + +int kvm_cpu__start(struct kvm_cpu *cpu) +{ + sigset_t sigset; + + sigemptyset(&sigset); + sigaddset(&sigset, SIGALRM); + + pthread_sigmask(SIG_BLOCK, &sigset, NULL); + + signal(SIGKVMEXIT, kvm_cpu_signal_handler); + signal(SIGKVMPAUSE, kvm_cpu_signal_handler); + + kvm_cpu__reset_vcpu(cpu); + + if (cpu->kvm->cfg.single_step) + kvm_cpu__enable_singlestep(cpu); + 
+ while (cpu->is_running) { + if (cpu->paused) { + kvm__notify_paused(); + cpu->paused = 0; + } + + if (cpu->needs_nmi) { + kvm_cpu__arch_nmi(cpu); + cpu->needs_nmi = 0; + } + + kvm_cpu__run(cpu); + + switch (cpu->kvm_run->exit_reason) { + case KVM_EXIT_UNKNOWN: + break; + case KVM_EXIT_DEBUG: + kvm_cpu__show_registers(cpu); + kvm_cpu__show_code(cpu); + break; + case KVM_EXIT_IO: { + bool ret; + + ret = kvm_cpu__emulate_io(cpu->kvm, + cpu->kvm_run->io.port, + (u8 *)cpu->kvm_run + + cpu->kvm_run->io.data_offset, + cpu->kvm_run->io.direction, + cpu->kvm_run->io.size, + cpu->kvm_run->io.count); + + if (!ret) + goto panic_kvm; + break; + } + case KVM_EXIT_MMIO: { + bool ret; + + /* + * If we had MMIO exit, coalesced ring should be processed + * *before* processing the exit itself + */ + kvm_cpu__handle_coalesced_mmio(cpu); + + ret = kvm_cpu__emulate_mmio(cpu->kvm, + cpu->kvm_run->mmio.phys_addr, + cpu->kvm_run->mmio.data, + cpu->kvm_run->mmio.len, + cpu->kvm_run->mmio.is_write); + + if (!ret) + goto panic_kvm; + break; + } + case KVM_EXIT_INTR: + if (cpu->is_running) + break; + goto exit_kvm; + case KVM_EXIT_SHUTDOWN: + goto exit_kvm; + default: { + bool ret; + + ret = kvm_cpu__handle_exit(cpu); + if (!ret) + goto panic_kvm; + break; + } + } + kvm_cpu__handle_coalesced_mmio(cpu); + } + +exit_kvm: + return 0; + +panic_kvm: + return 1; +} + +int kvm_cpu__init(struct kvm *kvm) +{ + int max_cpus, recommended_cpus, i; + + max_cpus = kvm__max_cpus(kvm); + recommended_cpus = kvm__recommended_cpus(kvm); + + if (kvm->cfg.nrcpus > max_cpus) { + printf(" # Limit the number of CPUs to %d\n", max_cpus); + kvm->cfg.nrcpus = max_cpus; + } else if (kvm->cfg.nrcpus > recommended_cpus) { + printf(" # Warning: The maximum recommended amount of VCPUs" + " is %d\n", recommended_cpus); + } + + kvm->nrcpus = kvm->cfg.nrcpus; + + /* Alloc one pointer too many, so array ends up 0-terminated */ + kvm->cpus = calloc(kvm->nrcpus + 1, sizeof(void *)); + if (!kvm->cpus) { + pr_warning("Couldn't 
allocate array for %d CPUs", kvm->nrcpus); + return -ENOMEM; + } + + for (i = 0; i < kvm->nrcpus; i++) { + kvm->cpus[i] = kvm_cpu__arch_init(kvm, i); + if (!kvm->cpus[i]) { + pr_warning("unable to initialize KVM VCPU"); + goto fail_alloc; + } + } + + return 0; + +fail_alloc: + for (i = 0; i < kvm->nrcpus; i++) + free(kvm->cpus[i]); + return -ENOMEM; +} +base_init(kvm_cpu__init); + +int kvm_cpu__exit(struct kvm *kvm) +{ + int i, r; + void *ret = NULL; + + kvm_cpu__delete(kvm->cpus[0]); + kvm->cpus[0] = NULL; + + for (i = 1; i < kvm->nrcpus; i++) { + if (kvm->cpus[i]->is_running) { + pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT); + if (pthread_join(kvm->cpus[i]->thread, &ret) != 0) + die("pthread_join"); + kvm_cpu__delete(kvm->cpus[i]); + } + if (ret == NULL) + r = 0; + } + + free(kvm->cpus); + + kvm->nrcpus = 0; + + return r; +} +late_exit(kvm_cpu__exit); diff --git a/tools/kvm/kvm-ipc.c b/tools/kvm/kvm-ipc.c new file mode 100644 index 000000000000..bdcc0d1f6b73 --- /dev/null +++ b/tools/kvm/kvm-ipc.c @@ -0,0 +1,500 @@ +#include <sys/epoll.h> +#include <sys/un.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/eventfd.h> +#include <dirent.h> + +#include "kvm/kvm-ipc.h" +#include "kvm/rwsem.h" +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/builtin-debug.h" +#include "kvm/strbuf.h" +#include "kvm/kvm-cpu.h" +#include "kvm/8250-serial.h" + +struct kvm_ipc_head { + u32 type; + u32 len; +}; + +#define KVM_IPC_MAX_MSGS 16 + +#define KVM_SOCK_SUFFIX ".sock" +#define KVM_SOCK_SUFFIX_LEN ((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1) + +extern __thread struct kvm_cpu *current_kvm_cpu; +static void (*msgs[KVM_IPC_MAX_MSGS])(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg); +static DECLARE_RWSEM(msgs_rwlock); +static int epoll_fd, server_fd, stop_fd; +static pthread_t thread; + +static int kvm__create_socket(struct kvm *kvm) +{ + char full_name[PATH_MAX]; + unsigned int s; + struct sockaddr_un local; + int len, r; + + /* 
This usually 108 bytes long */ + BUILD_BUG_ON(sizeof(local.sun_path) < 32); + + snprintf(full_name, sizeof(full_name), "%s/%s%s", + kvm__get_dir(), kvm->cfg.guest_name, KVM_SOCK_SUFFIX); + if (access(full_name, F_OK) == 0) { + pr_err("Socket file %s already exist", full_name); + return -EEXIST; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) + return s; + local.sun_family = AF_UNIX; + strlcpy(local.sun_path, full_name, sizeof(local.sun_path)); + len = strlen(local.sun_path) + sizeof(local.sun_family); + r = bind(s, (struct sockaddr *)&local, len); + if (r < 0) + goto fail; + + r = listen(s, 5); + if (r < 0) + goto fail; + + return s; + +fail: + close(s); + return r; +} + +void kvm__remove_socket(const char *name) +{ + char full_name[PATH_MAX]; + + snprintf(full_name, sizeof(full_name), "%s/%s%s", + kvm__get_dir(), name, KVM_SOCK_SUFFIX); + unlink(full_name); +} + +int kvm__get_sock_by_instance(const char *name) +{ + int s, len, r; + char sock_file[PATH_MAX]; + struct sockaddr_un local; + + snprintf(sock_file, sizeof(sock_file), "%s/%s%s", + kvm__get_dir(), name, KVM_SOCK_SUFFIX); + s = socket(AF_UNIX, SOCK_STREAM, 0); + + local.sun_family = AF_UNIX; + strlcpy(local.sun_path, sock_file, sizeof(local.sun_path)); + len = strlen(local.sun_path) + sizeof(local.sun_family); + + r = connect(s, &local, len); + if (r < 0 && errno == ECONNREFUSED) { + /* Tell the user clean ghost socket file */ + pr_err("\"%s\" could be a ghost socket file, please remove it", + sock_file); + return r; + } else if (r < 0) { + return r; + } + + return s; +} + +int kvm__enumerate_instances(int (*callback)(const char *name, int fd)) +{ + int sock; + DIR *dir; + struct dirent entry, *result; + int ret = 0; + + dir = opendir(kvm__get_dir()); + if (!dir) + return -errno; + + for (;;) { + readdir_r(dir, &entry, &result); + if (result == NULL) + break; + if (entry.d_type == DT_SOCK) { + ssize_t name_len = strlen(entry.d_name); + char *p; + + if (name_len <= KVM_SOCK_SUFFIX_LEN) + continue; 
+ + p = &entry.d_name[name_len - KVM_SOCK_SUFFIX_LEN]; + if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN)) + continue; + + *p = 0; + sock = kvm__get_sock_by_instance(entry.d_name); + if (sock < 0) + continue; + ret = callback(entry.d_name, sock); + close(sock); + if (ret < 0) + break; + } + } + + closedir(dir); + + return ret; +} + +int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)) +{ + if (type >= KVM_IPC_MAX_MSGS) + return -ENOSPC; + + down_write(&msgs_rwlock); + msgs[type] = cb; + up_write(&msgs_rwlock); + + return 0; +} + +int kvm_ipc__send(int fd, u32 type) +{ + struct kvm_ipc_head head = {.type = type, .len = 0,}; + + if (write_in_full(fd, &head, sizeof(head)) < 0) + return -1; + + return 0; +} + +int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg) +{ + struct kvm_ipc_head head = {.type = type, .len = len,}; + + if (write_in_full(fd, &head, sizeof(head)) < 0) + return -1; + + if (write_in_full(fd, msg, len) < 0) + return -1; + + return 0; +} + +static int kvm_ipc__handle(struct kvm *kvm, int fd, u32 type, u32 len, u8 *data) +{ + void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg); + + if (type >= KVM_IPC_MAX_MSGS) + return -ENOSPC; + + down_read(&msgs_rwlock); + cb = msgs[type]; + up_read(&msgs_rwlock); + + if (cb == NULL) { + pr_warning("No device handles type %u\n", type); + return -ENODEV; + } + + cb(kvm, fd, type, len, data); + + return 0; +} + +static int kvm_ipc__new_conn(int fd) +{ + int client; + struct epoll_event ev; + + client = accept(fd, NULL, NULL); + if (client < 0) + return -1; + + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.fd = client; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) { + close(client); + return -1; + } + + return client; +} + +static void kvm_ipc__close_conn(int fd) +{ + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL); + close(fd); +} + +static int kvm_ipc__receive(struct kvm *kvm, int fd) +{ + struct kvm_ipc_head head; + u8 *msg = NULL; + u32 
n; + + n = read(fd, &head, sizeof(head)); + if (n != sizeof(head)) + goto done; + + msg = malloc(head.len); + if (msg == NULL) + goto done; + + n = read_in_full(fd, msg, head.len); + if (n != head.len) + goto done; + + kvm_ipc__handle(kvm, fd, head.type, head.len, msg); + + return 0; + +done: + free(msg); + return -1; +} + +static void *kvm_ipc__thread(void *param) +{ + struct epoll_event event; + struct kvm *kvm = param; + + kvm__set_thread_name("kvm-ipc"); + + for (;;) { + int nfds; + + nfds = epoll_wait(epoll_fd, &event, 1, -1); + if (nfds > 0) { + int fd = event.data.fd; + + if (fd == stop_fd && event.events & EPOLLIN) { + break; + } else if (fd == server_fd) { + int client, r; + + client = kvm_ipc__new_conn(fd); + /* + * Handle multiple IPC cmd at a time + */ + do { + r = kvm_ipc__receive(kvm, client); + } while (r == 0); + + } else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) { + kvm_ipc__close_conn(fd); + } else { + kvm_ipc__receive(kvm, fd); + } + } + } + + return NULL; +} + +static void kvm__pid(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + pid_t pid = getpid(); + int r = 0; + + if (type == KVM_IPC_PID) + r = write(fd, &pid, sizeof(pid)); + + if (r < 0) + pr_warning("Failed sending PID"); +} + +static void handle_stop(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + if (WARN_ON(type != KVM_IPC_STOP || len)) + return; + + kvm_cpu__reboot(kvm); +} + +/* Pause/resume the guest using SIGUSR2 */ +static int is_paused; + +static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + if (WARN_ON(len)) + return; + + if (type == KVM_IPC_RESUME && is_paused) { + kvm->vm_state = KVM_VMSTATE_RUNNING; + kvm__continue(kvm); + } else if (type == KVM_IPC_PAUSE && !is_paused) { + kvm->vm_state = KVM_VMSTATE_PAUSED; + ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL); + kvm__pause(kvm); + } else { + return; + } + + is_paused = !is_paused; +} + +static void handle_vmstate(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int r = 
0; + + if (type == KVM_IPC_VMSTATE) + r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state)); + + if (r < 0) + pr_warning("Failed sending VMSTATE"); +} + +/* + * Serialize debug printout so that the output of multiple vcpus does not + * get mixed up: + */ +static int printout_done; + +static void handle_sigusr1(int sig) +{ + struct kvm_cpu *cpu = current_kvm_cpu; + int fd = kvm_cpu__get_debug_fd(); + + if (!cpu || cpu->needs_nmi) + return; + + dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id); + kvm_cpu__show_registers(cpu); + kvm_cpu__show_code(cpu); + kvm_cpu__show_page_tables(cpu); + fflush(stdout); + printout_done = 1; +} + +static void handle_debug(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int i; + struct debug_cmd_params *params; + u32 dbg_type; + u32 vcpu; + + if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params))) + return; + + params = (void *)msg; + dbg_type = params->dbg_type; + vcpu = params->cpu; + + if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ) + serial8250__inject_sysrq(kvm, params->sysrq); + + if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) { + if ((int)vcpu >= kvm->nrcpus) + return; + + kvm->cpus[vcpu]->needs_nmi = 1; + pthread_kill(kvm->cpus[vcpu]->thread, SIGUSR1); + } + + if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP)) + return; + + for (i = 0; i < kvm->nrcpus; i++) { + struct kvm_cpu *cpu = kvm->cpus[i]; + + if (!cpu) + continue; + + printout_done = 0; + + kvm_cpu__set_debug_fd(fd); + pthread_kill(cpu->thread, SIGUSR1); + /* + * Wait for the vCPU to dump state before signalling + * the next thread. 
Since this is debug code it does + * not matter that we are burning CPU time a bit: + */ + while (!printout_done) + sleep(0); + } + + close(fd); + + serial8250__inject_sysrq(kvm, 'p'); +} + +int kvm_ipc__init(struct kvm *kvm) +{ + int ret; + int sock = kvm__create_socket(kvm); + struct epoll_event ev = {0}; + + server_fd = sock; + + epoll_fd = epoll_create(KVM_IPC_MAX_MSGS); + if (epoll_fd < 0) { + ret = epoll_fd; + goto err; + } + + ev.events = EPOLLIN | EPOLLET; + ev.data.fd = sock; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) { + pr_err("Failed starting IPC thread"); + ret = -EFAULT; + goto err_epoll; + } + + stop_fd = eventfd(0, 0); + if (stop_fd < 0) { + ret = stop_fd; + goto err_epoll; + } + + ev.events = EPOLLIN | EPOLLET; + ev.data.fd = stop_fd; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) { + pr_err("Failed adding stop event to epoll"); + ret = -EFAULT; + goto err_stop; + } + + if (pthread_create(&thread, NULL, kvm_ipc__thread, kvm) != 0) { + pr_err("Failed starting IPC thread"); + ret = -EFAULT; + goto err_stop; + } + + kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid); + kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug); + kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause); + kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause); + kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop); + kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate); + signal(SIGUSR1, handle_sigusr1); + + return 0; + +err_stop: + close(stop_fd); +err_epoll: + close(epoll_fd); +err: + return ret; +} +base_init(kvm_ipc__init); + +int kvm_ipc__exit(struct kvm *kvm) +{ + u64 val = 1; + int ret; + + ret = write(stop_fd, &val, sizeof(val)); + if (ret < 0) + return ret; + + close(server_fd); + close(epoll_fd); + + kvm__remove_socket(kvm->cfg.guest_name); + + return ret; +} +base_exit(kvm_ipc__exit); diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c new file mode 100644 index 000000000000..a6b3c2346ad4 --- /dev/null +++ b/tools/kvm/kvm.c @@ -0,0 
+1,512 @@ +#include "kvm/kvm.h" +#include "kvm/read-write.h" +#include "kvm/util.h" +#include "kvm/strbuf.h" +#include "kvm/mutex.h" +#include "kvm/kvm-cpu.h" +#include "kvm/kvm-ipc.h" + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/list.h> +#include <linux/err.h> + +#include <sys/un.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <limits.h> +#include <signal.h> +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> +#include <time.h> +#include <sys/eventfd.h> +#include <asm/unistd.h> +#include <dirent.h> + +#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason + +const char *kvm_exit_reasons[] = { + DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI), + DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR), +#ifdef CONFIG_PPC64 + DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL), +#endif +}; + +static int pause_event; +static DEFINE_MUTEX(pause_lock); +extern struct kvm_ext kvm_req_ext[]; + +static char kvm_dir[PATH_MAX]; + +static int set_dir(const char *fmt, va_list args) +{ + char tmp[PATH_MAX]; + + vsnprintf(tmp, sizeof(tmp), fmt, args); + + mkdir(tmp, 0777); + + if (!realpath(tmp, kvm_dir)) + 
return -errno; + + strcat(kvm_dir, "/"); + + return 0; +} + +void kvm__set_dir(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + set_dir(fmt, args); + va_end(args); +} + +const char *kvm__get_dir(void) +{ + return kvm_dir; +} + +bool kvm__supports_extension(struct kvm *kvm, unsigned int extension) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension); + if (ret < 0) + return false; + + return ret; +} + +static int kvm__check_extensions(struct kvm *kvm) +{ + int i; + + for (i = 0; ; i++) { + if (!kvm_req_ext[i].name) + break; + if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) { + pr_err("Unsuppored KVM extension detected: %s", + kvm_req_ext[i].name); + return -i; + } + } + + return 0; +} + +struct kvm *kvm__new(void) +{ + struct kvm *kvm = calloc(1, sizeof(*kvm)); + if (!kvm) + return ERR_PTR(-ENOMEM); + + kvm->sys_fd = -1; + kvm->vm_fd = -1; + + return kvm; +} + +int kvm__exit(struct kvm *kvm) +{ + struct kvm_mem_bank *bank, *tmp; + + kvm__arch_delete_ram(kvm); + + list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) { + list_del(&bank->list); + free(bank); + } + + free(kvm); + return 0; +} +core_exit(kvm__exit); + +/* + * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping + * memory regions to it. Therefore, be careful if you use this function for + * registering memory regions for emulating hardware. 
+ */ +int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr) +{ + struct kvm_userspace_memory_region mem; + struct kvm_mem_bank *bank; + int ret; + + bank = malloc(sizeof(*bank)); + if (!bank) + return -ENOMEM; + + INIT_LIST_HEAD(&bank->list); + bank->guest_phys_addr = guest_phys; + bank->host_addr = userspace_addr; + bank->size = size; + + mem = (struct kvm_userspace_memory_region) { + .slot = kvm->mem_slots++, + .guest_phys_addr = guest_phys, + .memory_size = size, + .userspace_addr = (unsigned long)userspace_addr, + }; + + ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem); + if (ret < 0) + return -errno; + + list_add(&bank->list, &kvm->mem_banks); + return 0; +} + +void *guest_flat_to_host(struct kvm *kvm, u64 offset) +{ + struct kvm_mem_bank *bank; + + list_for_each_entry(bank, &kvm->mem_banks, list) { + u64 bank_start = bank->guest_phys_addr; + u64 bank_end = bank_start + bank->size; + + if (offset >= bank_start && offset < bank_end) + return bank->host_addr + (offset - bank_start); + } + + pr_warning("unable to translate guest address 0x%llx to host", + (unsigned long long)offset); + return NULL; +} + +u64 host_to_guest_flat(struct kvm *kvm, void *ptr) +{ + struct kvm_mem_bank *bank; + + list_for_each_entry(bank, &kvm->mem_banks, list) { + void *bank_start = bank->host_addr; + void *bank_end = bank_start + bank->size; + + if (ptr >= bank_start && ptr < bank_end) + return bank->guest_phys_addr + (ptr - bank_start); + } + + pr_warning("unable to translate host address %p to guest", ptr); + return 0; +} + +int kvm__recommended_cpus(struct kvm *kvm) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS); + if (ret <= 0) + /* + * api.txt states that if KVM_CAP_NR_VCPUS does not exist, + * assume 4. + */ + return 4; + + return ret; +} + +/* + * The following hack should be removed once 'x86: Raise the hard + * VCPU count limit' makes it's way into the mainline. 
+ */ +#ifndef KVM_CAP_MAX_VCPUS +#define KVM_CAP_MAX_VCPUS 66 +#endif + +int kvm__max_cpus(struct kvm *kvm) +{ + int ret; + + ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS); + if (ret <= 0) + ret = kvm__recommended_cpus(kvm); + + return ret; +} + +int kvm__init(struct kvm *kvm) +{ + int ret; + + if (!kvm__arch_cpu_supports_vm()) { + pr_err("Your CPU does not support hardware virtualization"); + ret = -ENOSYS; + goto err; + } + + kvm->sys_fd = open(kvm->cfg.dev, O_RDWR); + if (kvm->sys_fd < 0) { + if (errno == ENOENT) + pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM " + "enabled and that the KVM modules are loaded.", kvm->cfg.dev); + else if (errno == ENODEV) + pr_err("'%s' KVM driver not available.\n # (If the KVM " + "module is loaded then 'dmesg' may offer further clues " + "about the failure.)", kvm->cfg.dev); + else + pr_err("Could not open %s: ", kvm->cfg.dev); + + ret = -errno; + goto err_free; + } + + ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0); + if (ret != KVM_API_VERSION) { + pr_err("KVM_API_VERSION ioctl"); + ret = -errno; + goto err_sys_fd; + } + + kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, 0); + if (kvm->vm_fd < 0) { + ret = kvm->vm_fd; + goto err_sys_fd; + } + + if (kvm__check_extensions(kvm)) { + pr_err("A required KVM extension is not supported by OS"); + ret = -ENOSYS; + goto err_vm_fd; + } + + kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size); + + INIT_LIST_HEAD(&kvm->mem_banks); + kvm__init_ram(kvm); + + if (!kvm->cfg.firmware_filename) { + if (!kvm__load_kernel(kvm, kvm->cfg.kernel_filename, + kvm->cfg.initrd_filename, kvm->cfg.real_cmdline)) + die("unable to load kernel %s", kvm->cfg.kernel_filename); + } + + if (kvm->cfg.firmware_filename) { + if (!kvm__load_firmware(kvm, kvm->cfg.firmware_filename)) + die("unable to load firmware image %s: %s", kvm->cfg.firmware_filename, strerror(errno)); + } else { + ret = kvm__arch_setup_firmware(kvm); + if (ret < 0) + 
die("kvm__arch_setup_firmware() failed with error %d\n", ret); + } + + return 0; + +err_vm_fd: + close(kvm->vm_fd); +err_sys_fd: + close(kvm->sys_fd); +err_free: + free(kvm); +err: + return ret; +} +core_init(kvm__init); + +/* RFC 1952 */ +#define GZIP_ID1 0x1f +#define GZIP_ID2 0x8b +#define CPIO_MAGIC "0707" +/* initrd may be gzipped, or a plain cpio */ +static bool initrd_check(int fd) +{ + unsigned char id[4]; + + if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0) + return false; + + if (lseek(fd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) || + !memcmp(id, CPIO_MAGIC, 4); +} + +bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename, + const char *initrd_filename, const char *kernel_cmdline) +{ + bool ret; + int fd_kernel = -1, fd_initrd = -1; + + fd_kernel = open(kernel_filename, O_RDONLY); + if (fd_kernel < 0) + die("Unable to open kernel %s", kernel_filename); + + if (initrd_filename) { + fd_initrd = open(initrd_filename, O_RDONLY); + if (fd_initrd < 0) + die("Unable to open initrd %s", initrd_filename); + + if (!initrd_check(fd_initrd)) + die("%s is not an initrd", initrd_filename); + } + + ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline); + + if (ret) + goto found_kernel; + + pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename); + + ret = load_flat_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline); + + if (ret) + goto found_kernel; + + if (initrd_filename) + close(fd_initrd); + close(fd_kernel); + + die("%s is not a valid bzImage or flat binary", kernel_filename); + +found_kernel: + if (initrd_filename) + close(fd_initrd); + close(fd_kernel); + + return ret; +} + +#define TIMER_INTERVAL_NS 1000000 /* 1 msec */ + +/* + * This function sets up a timer that's used to inject interrupts from the + * userspace hypervisor into the guest at periodical intervals. Please note + * that clock interrupt, for example, is not handled here. 
+ */ +int kvm_timer__init(struct kvm *kvm) +{ + struct itimerspec its; + struct sigevent sev; + int r; + + memset(&sev, 0, sizeof(struct sigevent)); + sev.sigev_value.sival_int = 0; + sev.sigev_notify = SIGEV_THREAD_ID; + sev.sigev_signo = SIGALRM; + sev.sigev_value.sival_ptr = kvm; + sev._sigev_un._tid = syscall(__NR_gettid); + + r = timer_create(CLOCK_REALTIME, &sev, &kvm->timerid); + if (r < 0) + return r; + + its.it_value.tv_sec = TIMER_INTERVAL_NS / 1000000000; + its.it_value.tv_nsec = TIMER_INTERVAL_NS % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + r = timer_settime(kvm->timerid, 0, &its, NULL); + if (r < 0) { + timer_delete(kvm->timerid); + return r; + } + + return 0; +} +firmware_init(kvm_timer__init); + +int kvm_timer__exit(struct kvm *kvm) +{ + if (kvm->timerid) + if (timer_delete(kvm->timerid) < 0) + die("timer_delete()"); + + kvm->timerid = 0; + + return 0; +} +firmware_exit(kvm_timer__exit); + +void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size) +{ + unsigned char *p; + unsigned long n; + + size &= ~7; /* mod 8 */ + if (!size) + return; + + p = guest_flat_to_host(kvm, addr); + + for (n = 0; n < size; n += 8) { + if (!host_ptr_in_ram(kvm, p + n)) + break; + + printf(" 0x%08lx: %02x %02x %02x %02x %02x %02x %02x %02x\n", + addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3], + p[n + 4], p[n + 5], p[n + 6], p[n + 7]); + } +} + +void kvm__pause(struct kvm *kvm) +{ + int i, paused_vcpus = 0; + + /* Check if the guest is running */ + if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0) + return; + + mutex_lock(&pause_lock); + + pause_event = eventfd(0, 0); + if (pause_event < 0) + die("Failed creating pause notification event"); + for (i = 0; i < kvm->nrcpus; i++) + pthread_kill(kvm->cpus[i]->thread, SIGKVMPAUSE); + + while (paused_vcpus < kvm->nrcpus) { + u64 cur_read; + + if (read(pause_event, &cur_read, sizeof(cur_read)) < 0) + die("Failed reading pause event"); + 
paused_vcpus += cur_read; + } + close(pause_event); +} + +void kvm__continue(struct kvm *kvm) +{ + /* Check if the guest is running */ + if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0) + return; + + mutex_unlock(&pause_lock); +} + +void kvm__notify_paused(void) +{ + u64 p = 1; + + if (write(pause_event, &p, sizeof(p)) < 0) + die("Failed notifying of paused VCPU."); + + mutex_lock(&pause_lock); + mutex_unlock(&pause_lock); +} diff --git a/tools/kvm/main.c b/tools/kvm/main.c new file mode 100644 index 000000000000..05bc82c8c6fa --- /dev/null +++ b/tools/kvm/main.c @@ -0,0 +1,19 @@ +#include "kvm/kvm.h" + +#include <stdlib.h> +#include <stdio.h> + +/* user defined header files */ +#include <kvm/kvm-cmd.h> + +static int handle_kvm_command(int argc, char **argv) +{ + return handle_command(kvm_commands, argc, (const char **) &argv[0]); +} + +int main(int argc, char *argv[]) +{ + kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH); + + return handle_kvm_command(argc - 1, &argv[1]); +} diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c new file mode 100644 index 000000000000..5d65d280391b --- /dev/null +++ b/tools/kvm/mmio.c @@ -0,0 +1,139 @@ +#include "kvm/kvm.h" +#include "kvm/rbtree-interval.h" +#include "kvm/brlock.h" + +#include <stdio.h> +#include <stdlib.h> + +#include <sys/ioctl.h> +#include <linux/kvm.h> +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/err.h> +#include <errno.h> + +#define mmio_node(n) rb_entry(n, struct mmio_mapping, node) + +struct mmio_mapping { + struct rb_int_node node; + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr); + void *ptr; +}; + +static struct rb_root mmio_tree = RB_ROOT; + +static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len) +{ + struct rb_int_node *node; + + node = rb_int_search_range(root, addr, addr + len); + if (node == NULL) + return NULL; + + return mmio_node(node); +} + +/* Find lowest match, Check for overlap */ +static struct mmio_mapping 
*mmio_search_single(struct rb_root *root, u64 addr) +{ + struct rb_int_node *node; + + node = rb_int_search_single(root, addr); + if (node == NULL) + return NULL; + + return mmio_node(node); +} + +static int mmio_insert(struct rb_root *root, struct mmio_mapping *data) +{ + return rb_int_insert(root, &data->node); +} + +static const char *to_direction(u8 is_write) +{ + if (is_write) + return "write"; + + return "read"; +} + +int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce, + void (*mmio_fn)(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr), + void *ptr) +{ + struct mmio_mapping *mmio; + struct kvm_coalesced_mmio_zone zone; + int ret; + + mmio = malloc(sizeof(*mmio)); + if (mmio == NULL) + return -ENOMEM; + + *mmio = (struct mmio_mapping) { + .node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len), + .mmio_fn = mmio_fn, + .ptr = ptr, + }; + + if (coalesce) { + zone = (struct kvm_coalesced_mmio_zone) { + .addr = phys_addr, + .size = phys_addr_len, + }; + ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone); + if (ret < 0) { + free(mmio); + return -errno; + } + } + br_write_lock(kvm); + ret = mmio_insert(&mmio_tree, mmio); + br_write_unlock(kvm); + + return ret; +} + +bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr) +{ + struct mmio_mapping *mmio; + struct kvm_coalesced_mmio_zone zone; + + br_write_lock(kvm); + mmio = mmio_search_single(&mmio_tree, phys_addr); + if (mmio == NULL) { + br_write_unlock(kvm); + return false; + } + + zone = (struct kvm_coalesced_mmio_zone) { + .addr = phys_addr, + .size = 1, + }; + ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone); + + rb_int_erase(&mmio_tree, &mmio->node); + br_write_unlock(kvm); + + free(mmio); + return true; +} + +bool kvm__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + struct mmio_mapping *mmio; + + br_read_lock(); + mmio = mmio_search(&mmio_tree, phys_addr, len); + + if (mmio) + mmio->mmio_fn(phys_addr, data, len, 
is_write, mmio->ptr); + else { + if (kvm->cfg.mmio_debug) + fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n", + to_direction(is_write), phys_addr, len); + } + br_read_unlock(); + + return true; +} diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c new file mode 100644 index 000000000000..98423da6cb19 --- /dev/null +++ b/tools/kvm/net/uip/arp.c @@ -0,0 +1,30 @@ +#include "kvm/uip.h" + +int uip_tx_do_arp(struct uip_tx_arg *arg) +{ + struct uip_arp *arp, *arp2; + struct uip_info *info; + struct uip_buf *buf; + + info = arg->info; + buf = uip_buf_clone(arg); + + arp = (struct uip_arp *)(arg->eth); + arp2 = (struct uip_arp *)(buf->eth); + + /* + * ARP replay code: 2 + */ + arp2->op = htons(0x2); + arp2->dmac = arp->smac; + arp2->dip = arp->sip; + + if (arp->dip == htonl(info->host_ip)) { + arp2->smac = info->host_mac; + arp2->sip = htonl(info->host_ip); + + uip_buf_set_used(info, buf); + } + + return 0; +} diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c new file mode 100644 index 000000000000..f29ad41cb8fc --- /dev/null +++ b/tools/kvm/net/uip/buf.c @@ -0,0 +1,114 @@ +#include "kvm/uip.h" + +#include <linux/kernel.h> +#include <linux/list.h> + +struct uip_buf *uip_buf_get_used(struct uip_info *info) +{ + struct uip_buf *buf; + bool found = false; + + mutex_lock(&info->buf_lock); + + while (!(info->buf_used_nr > 0)) + pthread_cond_wait(&info->buf_used_cond, &info->buf_lock.mutex); + + list_for_each_entry(buf, &info->buf_head, list) { + if (buf->status == UIP_BUF_STATUS_USED) { + /* + * Set status to INUSE immediately to prevent + * someone from using this buf until we free it + */ + buf->status = UIP_BUF_STATUS_INUSE; + info->buf_used_nr--; + found = true; + break; + } + } + + mutex_unlock(&info->buf_lock); + + return found ? 
buf : NULL; +} + +struct uip_buf *uip_buf_get_free(struct uip_info *info) +{ + struct uip_buf *buf; + bool found = false; + + mutex_lock(&info->buf_lock); + + while (!(info->buf_free_nr > 0)) + pthread_cond_wait(&info->buf_free_cond, &info->buf_lock.mutex); + + list_for_each_entry(buf, &info->buf_head, list) { + if (buf->status == UIP_BUF_STATUS_FREE) { + /* + * Set status to INUSE immediately to prevent + * someone from using this buf until we free it + */ + buf->status = UIP_BUF_STATUS_INUSE; + info->buf_free_nr--; + found = true; + break; + } + } + + mutex_unlock(&info->buf_lock); + + return found ? buf : NULL; +} + +struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf) +{ + mutex_lock(&info->buf_lock); + + buf->status = UIP_BUF_STATUS_USED; + info->buf_used_nr++; + pthread_cond_signal(&info->buf_used_cond); + + mutex_unlock(&info->buf_lock); + + return buf; +} + +struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf) +{ + mutex_lock(&info->buf_lock); + + buf->status = UIP_BUF_STATUS_FREE; + info->buf_free_nr++; + pthread_cond_signal(&info->buf_free_cond); + + mutex_unlock(&info->buf_lock); + + return buf; +} + +struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg) +{ + struct uip_buf *buf; + struct uip_eth *eth2; + struct uip_info *info; + + info = arg->info; + + /* + * Get buffer from device to guest + */ + buf = uip_buf_get_free(info); + + /* + * Clone buffer + */ + memcpy(buf->vnet, arg->vnet, arg->vnet_len); + memcpy(buf->eth, arg->eth, arg->eth_len); + buf->vnet_len = arg->vnet_len; + buf->eth_len = arg->eth_len; + + eth2 = (struct uip_eth *)buf->eth; + eth2->src = info->host_mac; + eth2->dst = arg->eth->src; + + return buf; +} diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c new file mode 100644 index 000000000000..4e5bb82e48b1 --- /dev/null +++ b/tools/kvm/net/uip/core.c @@ -0,0 +1,190 @@ +#include "kvm/mutex.h" +#include "kvm/uip.h" + +#include <linux/virtio_net.h> +#include <linux/kernel.h> 
+#include <linux/list.h> + +int uip_tx(struct iovec *iov, u16 out, struct uip_info *info) +{ + struct virtio_net_hdr *vnet; + struct uip_tx_arg arg; + int eth_len, vnet_len; + struct uip_eth *eth; + u8 *buf = NULL; + u16 proto; + int i; + + /* + * Buffer from guest to device + */ + vnet_len = iov[0].iov_len; + vnet = iov[0].iov_base; + + eth_len = iov[1].iov_len; + eth = iov[1].iov_base; + + /* + * In case, ethernet frame is in more than one iov entry. + * Copy iov buffer into one linear buffer. + */ + if (out > 2) { + eth_len = 0; + for (i = 1; i < out; i++) + eth_len += iov[i].iov_len; + + buf = malloc(eth_len); + if (!buf) + return -1; + + eth = (struct uip_eth *)buf; + for (i = 1; i < out; i++) { + memcpy(buf, iov[i].iov_base, iov[i].iov_len); + buf += iov[i].iov_len; + } + } + + memset(&arg, 0, sizeof(arg)); + + arg.vnet_len = vnet_len; + arg.eth_len = eth_len; + arg.info = info; + arg.vnet = vnet; + arg.eth = eth; + + /* + * Check package type + */ + proto = ntohs(eth->type); + + switch (proto) { + case UIP_ETH_P_ARP: + uip_tx_do_arp(&arg); + break; + case UIP_ETH_P_IP: + uip_tx_do_ipv4(&arg); + break; + default: + break; + } + + if (out > 2 && buf) + free(eth); + + return vnet_len + eth_len; +} + +int uip_rx(struct iovec *iov, u16 in, struct uip_info *info) +{ + struct virtio_net_hdr *vnet; + struct uip_eth *eth; + struct uip_buf *buf; + int vnet_len; + int eth_len; + char *p; + int len; + int cnt; + int i; + + /* + * Sleep until there is a buffer for guest + */ + buf = uip_buf_get_used(info); + + /* + * Fill device to guest buffer, vnet hdr fisrt + */ + vnet_len = iov[0].iov_len; + vnet = iov[0].iov_base; + if (buf->vnet_len > vnet_len) { + len = -1; + goto out; + } + memcpy(vnet, buf->vnet, buf->vnet_len); + + /* + * Then, the real eth data + * Note: Be sure buf->eth_len is not bigger than the buffer len that guest provides + */ + cnt = buf->eth_len; + p = buf->eth; + for (i = 1; i < in; i++) { + eth_len = iov[i].iov_len; + eth = iov[i].iov_base; + if (cnt 
> eth_len) { + memcpy(eth, p, eth_len); + cnt -= eth_len; + p += eth_len; + } else { + memcpy(eth, p, cnt); + cnt -= cnt; + break; + } + } + + if (cnt) { + pr_warning("uip_rx error"); + len = -1; + goto out; + } + + len = buf->vnet_len + buf->eth_len; + +out: + uip_buf_set_free(info, buf); + return len; +} + +int uip_init(struct uip_info *info) +{ + struct list_head *udp_socket_head; + struct list_head *tcp_socket_head; + struct list_head *buf_head; + struct uip_buf *buf; + int buf_nr; + int i; + + udp_socket_head = &info->udp_socket_head; + tcp_socket_head = &info->tcp_socket_head; + buf_head = &info->buf_head; + buf_nr = info->buf_nr; + + INIT_LIST_HEAD(udp_socket_head); + INIT_LIST_HEAD(tcp_socket_head); + INIT_LIST_HEAD(buf_head); + + mutex_init(&info->udp_socket_lock); + mutex_init(&info->tcp_socket_lock); + mutex_init(&info->buf_lock); + + pthread_cond_init(&info->buf_used_cond, NULL); + pthread_cond_init(&info->buf_free_cond, NULL); + + + for (i = 0; i < buf_nr; i++) { + buf = malloc(sizeof(*buf)); + memset(buf, 0, sizeof(*buf)); + + buf->status = UIP_BUF_STATUS_FREE; + buf->info = info; + buf->id = i; + list_add_tail(&buf->list, buf_head); + } + + list_for_each_entry(buf, buf_head, list) { + buf->vnet = malloc(sizeof(struct virtio_net_hdr)); + buf->vnet_len = sizeof(struct virtio_net_hdr); + buf->eth = malloc(1024*64 + sizeof(struct uip_pseudo_hdr)); + buf->eth_len = 1024*64 + sizeof(struct uip_pseudo_hdr); + + memset(buf->vnet, 0, buf->vnet_len); + memset(buf->eth, 0, buf->eth_len); + } + + info->buf_free_nr = buf_nr; + info->buf_used_nr = 0; + + uip_dhcp_get_dns(info); + + return 0; +} diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c new file mode 100644 index 000000000000..7ca8badaaeee --- /dev/null +++ b/tools/kvm/net/uip/csum.c @@ -0,0 +1,92 @@ +#include "kvm/uip.h" + +static u16 uip_csum(u16 csum, u8 *addr, u16 count) +{ + long sum = csum; + + while (count > 1) { + sum += *(u16 *)addr; + addr += 2; + count -= 2; + } + + if (count > 0) 
+ sum += *(unsigned char *)addr; + + while (sum>>16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +u16 uip_csum_ip(struct uip_ip *ip) +{ + return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip)); +} + +u16 uip_csum_icmp(struct uip_icmp *icmp) +{ + struct uip_ip *ip; + + ip = &icmp->ip; + return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */ +} + +u16 uip_csum_udp(struct uip_udp *udp) +{ + struct uip_pseudo_hdr hdr; + struct uip_ip *ip; + int udp_len; + u8 *pad; + + ip = &udp->ip; + + hdr.sip = ip->sip; + hdr.dip = ip->dip; + hdr.zero = 0; + hdr.proto = ip->proto; + hdr.len = udp->len; + + udp_len = uip_udp_len(udp); + + if (udp_len % 2) { + pad = (u8 *)&udp->sport + udp_len; + *pad = 0; + memcpy((u8 *)&udp->sport + udp_len + 1, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&udp->sport, udp_len + 1 + sizeof(hdr)); + } else { + memcpy((u8 *)&udp->sport + udp_len, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&udp->sport, udp_len + sizeof(hdr)); + } + +} + +u16 uip_csum_tcp(struct uip_tcp *tcp) +{ + struct uip_pseudo_hdr hdr; + struct uip_ip *ip; + u16 tcp_len; + u8 *pad; + + ip = &tcp->ip; + tcp_len = ntohs(ip->len) - uip_ip_hdrlen(ip); + + hdr.sip = ip->sip; + hdr.dip = ip->dip; + hdr.zero = 0; + hdr.proto = ip->proto; + hdr.len = htons(tcp_len); + + if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20) + pr_warning("tcp_len(%d) is too large", tcp_len); + + if (tcp_len % 2) { + pad = (u8 *)&tcp->sport + tcp_len; + *pad = 0; + memcpy((u8 *)&tcp->sport + tcp_len + 1, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&tcp->sport, tcp_len + 1 + sizeof(hdr)); + } else { + memcpy((u8 *)&tcp->sport + tcp_len, &hdr, sizeof(hdr)); + return uip_csum(0, (u8 *)&tcp->sport, tcp_len + sizeof(hdr)); + } +} diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c new file mode 100644 index 000000000000..b17d35239321 --- /dev/null +++ b/tools/kvm/net/uip/dhcp.c @@ -0,0 +1,202 @@ +#include "kvm/uip.h" + +#include <arpa/inet.h> 
+ +#define EMPTY_ADDR "0.0.0.0" + +static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp) +{ + return (dhcp->option[2] == UIP_DHCP_DISCOVER && + dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN && + dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE); +} + +static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp) +{ + return (dhcp->option[2] == UIP_DHCP_REQUEST && + dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN && + dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE); +} + +bool uip_udp_is_dhcp(struct uip_udp *udp) +{ + struct uip_dhcp *dhcp; + + if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT || + ntohs(udp->dport) != UIP_DHCP_PORT_SERVER) + return false; + + dhcp = (struct uip_dhcp *)udp; + + if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE) + return false; + + return true; +} + +int uip_dhcp_get_dns(struct uip_info *info) +{ + char key[256], val[256]; + struct in_addr addr; + int ret = -1; + int n = 0; + FILE *fp; + u32 ip; + + fp = fopen("/etc/resolv.conf", "r"); + if (!fp) + return ret; + + while (!feof(fp)) { + if (fscanf(fp, "%s %s\n", key, val) != 2) + continue; + if (strncmp("domain", key, 6) == 0) + info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN); + else if (strncmp("nameserver", key, 10) == 0) { + if (!inet_aton(val, &addr)) + continue; + ip = ntohl(addr.s_addr); + if (n < UIP_DHCP_MAX_DNS_SERVER_NR) + info->dns_ip[n++] = ip; + ret = 0; + } + } + + fclose(fp); + return ret; +} + +static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i) +{ + u8 domain_name_len; + u32 *addr; + int n; + + if (info->domain_name) { + domain_name_len = strlen(info->domain_name); + opt[i++] = UIP_DHCP_TAG_DOMAIN_NAME; + opt[i++] = domain_name_len; + memcpy(&opt[i], info->domain_name, domain_name_len); + i += domain_name_len; + } + + for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) { + if (info->dns_ip[n] == 0) + continue; + opt[i++] = UIP_DHCP_TAG_DNS_SERVER; + opt[i++] = UIP_DHCP_TAG_DNS_SERVER_LEN; + addr = (u32 *)&opt[i]; + *addr = 
htonl(info->dns_ip[n]); + i += UIP_DHCP_TAG_DNS_SERVER_LEN; + } + + return i; +} +static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type) +{ + int i = 0; + u32 *addr; + u8 *opt; + + opt = dhcp->option; + + opt[i++] = UIP_DHCP_TAG_MSG_TYPE; + opt[i++] = UIP_DHCP_TAG_MSG_TYPE_LEN; + opt[i++] = reply_msg_type; + + opt[i++] = UIP_DHCP_TAG_SERVER_ID; + opt[i++] = UIP_DHCP_TAG_SERVER_ID_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->host_ip); + i += UIP_DHCP_TAG_SERVER_ID_LEN; + + opt[i++] = UIP_DHCP_TAG_LEASE_TIME; + opt[i++] = UIP_DHCP_TAG_LEASE_TIME_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(UIP_DHCP_LEASE_TIME); + i += UIP_DHCP_TAG_LEASE_TIME_LEN; + + opt[i++] = UIP_DHCP_TAG_SUBMASK; + opt[i++] = UIP_DHCP_TAG_SUBMASK_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->guest_netmask); + i += UIP_DHCP_TAG_SUBMASK_LEN; + + opt[i++] = UIP_DHCP_TAG_ROUTER; + opt[i++] = UIP_DHCP_TAG_ROUTER_LEN; + addr = (u32 *)&opt[i]; + *addr = htonl(info->host_ip); + i += UIP_DHCP_TAG_ROUTER_LEN; + + opt[i++] = UIP_DHCP_TAG_ROOT; + opt[i++] = strlen(EMPTY_ADDR); + addr = (u32 *)&opt[i]; + strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR)); + i += strlen(EMPTY_ADDR); + + i = uip_dhcp_fill_option_name_and_server(info, opt, i); + + opt[i++] = UIP_DHCP_TAG_END; + + return 0; +} + +static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type) +{ + struct uip_dhcp *dhcp; + + dhcp = (struct uip_dhcp *)buf->eth; + + dhcp->msg_type = 2; + dhcp->client_ip = 0; + dhcp->your_ip = htonl(info->guest_ip); + dhcp->server_ip = htonl(info->host_ip); + dhcp->agent_ip = 0; + + uip_dhcp_fill_option(info, dhcp, reply_msg_type); + + sk->sip = htonl(info->guest_ip); + sk->dip = htonl(info->host_ip); + sk->sport = htons(UIP_DHCP_PORT_CLIENT); + sk->dport = htons(UIP_DHCP_PORT_SERVER); + + return 0; +} + +int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg) +{ + struct uip_udp_socket sk; + struct uip_dhcp 
*dhcp; + struct uip_info *info; + struct uip_buf *buf; + u8 reply_msg_type; + + dhcp = (struct uip_dhcp *)arg->eth; + + if (uip_dhcp_is_discovery(dhcp)) + reply_msg_type = UIP_DHCP_OFFER; + else if (uip_dhcp_is_request(dhcp)) + reply_msg_type = UIP_DHCP_ACK; + else + return -1; + + buf = uip_buf_clone(arg); + info = arg->info; + + /* + * Cook DHCP pkg + */ + uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type); + + /* + * Cook UDP pkg + */ + uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN); + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + + return 0; +} diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c new file mode 100644 index 000000000000..233297caf44b --- /dev/null +++ b/tools/kvm/net/uip/icmp.c @@ -0,0 +1,29 @@ +#include "kvm/uip.h" + +int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg) +{ + struct uip_ip *ip, *ip2; + struct uip_icmp *icmp2; + struct uip_buf *buf; + + buf = uip_buf_clone(arg); + + icmp2 = (struct uip_icmp *)(buf->eth); + ip2 = (struct uip_ip *)(buf->eth); + ip = (struct uip_ip *)(arg->eth); + + ip2->sip = ip->dip; + ip2->dip = ip->sip; + ip2->csum = 0; + /* + * ICMP reply: 0 + */ + icmp2->type = 0; + icmp2->csum = 0; + ip2->csum = uip_csum_ip(ip2); + icmp2->csum = uip_csum_icmp(icmp2); + + uip_buf_set_used(arg->info, buf); + + return 0; +} diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c new file mode 100644 index 000000000000..58373fd022e0 --- /dev/null +++ b/tools/kvm/net/uip/ipv4.c @@ -0,0 +1,29 @@ +#include "kvm/uip.h" + +int uip_tx_do_ipv4(struct uip_tx_arg *arg) +{ + struct uip_ip *ip; + + ip = (struct uip_ip *)(arg->eth); + + if (uip_ip_hdrlen(ip) != 20) { + pr_warning("IP header length is not 20 bytes"); + return -1; + } + + switch (ip->proto) { + case UIP_IP_P_ICMP: + uip_tx_do_ipv4_icmp(arg); + break; + case UIP_IP_P_TCP: + uip_tx_do_ipv4_tcp(arg); + break; + case UIP_IP_P_UDP: + uip_tx_do_ipv4_udp(arg); + break; + default: + break; + } + + return 0; +} 
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c new file mode 100644 index 000000000000..9044f40ba2d0 --- /dev/null +++ b/tools/kvm/net/uip/tcp.c @@ -0,0 +1,348 @@ +#include "kvm/uip.h" + +#include <kvm/kvm.h> +#include <linux/virtio_net.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <arpa/inet.h> + +static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how) +{ + shutdown(sk->fd, how); + + if (sk->write_done && sk->read_done) { + shutdown(sk->fd, SHUT_RDWR); + close(sk->fd); + + mutex_lock(sk->lock); + list_del(&sk->list); + mutex_unlock(sk->lock); + + free(sk); + } + + return 0; +} + +static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct mutex *sk_lock; + struct uip_tcp_socket *sk; + + sk_head = &arg->info->tcp_socket_head; + sk_lock = &arg->info->tcp_socket_lock; + + mutex_lock(sk_lock); + list_for_each_entry(sk, sk_head, list) { + if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) { + mutex_unlock(sk_lock); + return sk; + } + } + mutex_unlock(sk_lock); + + return NULL; +} + +static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct uip_tcp_socket *sk; + struct mutex *sk_lock; + struct uip_tcp *tcp; + struct uip_ip *ip; + int ret; + + tcp = (struct uip_tcp *)arg->eth; + ip = (struct uip_ip *)arg->eth; + + sk_head = &arg->info->tcp_socket_head; + sk_lock = &arg->info->tcp_socket_lock; + + sk = malloc(sizeof(*sk)); + memset(sk, 0, sizeof(*sk)); + + sk->lock = sk_lock; + sk->info = arg->info; + + sk->fd = socket(AF_INET, SOCK_STREAM, 0); + sk->addr.sin_family = AF_INET; + sk->addr.sin_port = dport; + sk->addr.sin_addr.s_addr = dip; + + pthread_cond_init(&sk->cond, NULL); + + if (ntohl(dip) == arg->info->host_ip) + sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + ret = connect(sk->fd, (struct 
sockaddr *)&sk->addr, sizeof(sk->addr)); + if (ret) { + free(sk); + return NULL; + } + + sk->sip = ip->sip; + sk->dip = ip->dip; + sk->sport = tcp->sport; + sk->dport = tcp->dport; + + mutex_lock(sk_lock); + list_add_tail(&sk->list, sk_head); + mutex_unlock(sk_lock); + + return sk; +} + +static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len) +{ + struct uip_info *info; + struct uip_eth *eth2; + struct uip_tcp *tcp2; + struct uip_buf *buf; + struct uip_ip *ip2; + + info = sk->info; + + /* + * Get free buffer to send data to guest + */ + buf = uip_buf_get_free(info); + + /* + * Cook a ethernet frame + */ + tcp2 = (struct uip_tcp *)buf->eth; + eth2 = (struct uip_eth *)buf->eth; + ip2 = (struct uip_ip *)buf->eth; + + eth2->src = info->host_mac; + eth2->dst = info->guest_mac; + eth2->type = htons(UIP_ETH_P_IP); + + ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN; + ip2->tos = 0; + ip2->id = 0; + ip2->flgfrag = 0; + ip2->ttl = UIP_IP_TTL; + ip2->proto = UIP_IP_P_TCP; + ip2->csum = 0; + ip2->sip = sk->dip; + ip2->dip = sk->sip; + + tcp2->sport = sk->dport; + tcp2->dport = sk->sport; + tcp2->seq = htonl(sk->seq_server); + tcp2->ack = htonl(sk->ack_server); + /* + * Diable TCP options, tcp hdr len equals 20 bytes + */ + tcp2->off = UIP_TCP_HDR_LEN; + tcp2->flg = flag; + tcp2->win = htons(UIP_TCP_WIN_SIZE); + tcp2->csum = 0; + tcp2->urgent = 0; + + if (payload_len > 0) + memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len); + + ip2->len = htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2)); + ip2->csum = uip_csum_ip(ip2); + tcp2->csum = uip_csum_tcp(tcp2); + + /* + * virtio_net_hdr + */ + buf->vnet_len = sizeof(struct virtio_net_hdr); + memset(buf->vnet, 0, buf->vnet_len); + + buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth); + + /* + * Increase server seq + */ + sk->seq_server += payload_len; + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + + return 0; +} + +static void 
*uip_tcp_socket_thread(void *p) +{ + struct uip_tcp_socket *sk; + int len, left, ret; + u8 *payload, *pos; + + kvm__set_thread_name("uip-tcp"); + + sk = p; + + payload = malloc(UIP_MAX_TCP_PAYLOAD); + if (!payload) + goto out; + + while (1) { + pos = payload; + + ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD); + + if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD) + goto out; + + left = ret; + + while (left > 0) { + mutex_lock(sk->lock); + while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0) + pthread_cond_wait(&sk->cond, &sk->lock->mutex); + mutex_unlock(sk->lock); + + sk->payload = pos; + if (len > left) + len = left; + if (len > UIP_MAX_TCP_PAYLOAD) + len = UIP_MAX_TCP_PAYLOAD; + left -= len; + pos += len; + + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len); + } + } + +out: + /* + * Close server to guest TCP connection + */ + uip_tcp_socket_close(sk, SHUT_RD); + + uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0); + sk->seq_server += 1; + + sk->read_done = 1; + + free(payload); + pthread_exit(NULL); + + return NULL; +} + +static int uip_tcp_socket_receive(struct uip_tcp_socket *sk) +{ + if (sk->thread == 0) + return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk); + + return 0; +} + +static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp) +{ + int len; + int ret; + u8 *payload; + + if (sk->write_done) + return 0; + + payload = uip_tcp_payload(tcp); + len = uip_tcp_payloadlen(tcp); + + ret = write(sk->fd, payload, len); + if (ret != len) + pr_warning("tcp send error"); + + return ret; +} + +int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg) +{ + struct uip_tcp_socket *sk; + struct uip_tcp *tcp; + struct uip_ip *ip; + int ret; + + tcp = (struct uip_tcp *)arg->eth; + ip = (struct uip_ip *)arg->eth; + + /* + * Guest is trying to start a TCP session, let's fake SYN-ACK to guest + */ + if (uip_tcp_is_syn(tcp)) { + sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport); + if (!sk) + 
return -1; + + sk->window_size = ntohs(tcp->win); + + /* + * Setup ISN number + */ + sk->isn_guest = uip_tcp_isn(tcp); + sk->isn_server = uip_tcp_isn_alloc(); + + sk->seq_server = sk->isn_server; + sk->ack_server = sk->isn_guest + 1; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0); + sk->seq_server += 1; + + /* + * Start receive thread for data from remote to guest + */ + uip_tcp_socket_receive(sk); + + goto out; + } + + /* + * Find socket we have allocated + */ + sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport); + if (!sk) + return -1; + + mutex_lock(sk->lock); + sk->window_size = ntohs(tcp->win); + sk->guest_acked = ntohl(tcp->ack); + pthread_cond_signal(&sk->cond); + mutex_unlock(sk->lock); + + if (uip_tcp_is_fin(tcp)) { + if (sk->write_done) + goto out; + + sk->write_done = 1; + sk->ack_server += 1; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0); + + /* + * Close guest to server TCP connection + */ + uip_tcp_socket_close(sk, SHUT_WR); + + goto out; + } + + /* + * Ignore guest to server frames with zero tcp payload + */ + if (uip_tcp_payloadlen(tcp) == 0) + goto out; + + /* + * Sent out TCP data to remote host + */ + ret = uip_tcp_socket_send(sk, tcp); + if (ret < 0) + return -1; + /* + * Send ACK to guest imediately + */ + sk->ack_server += ret; + uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0); + +out: + return 0; +} diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c new file mode 100644 index 000000000000..31c417cd5ca9 --- /dev/null +++ b/tools/kvm/net/uip/udp.c @@ -0,0 +1,239 @@ +#include "kvm/uip.h" + +#include <kvm/kvm.h> +#include <linux/virtio_net.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <sys/socket.h> +#include <sys/epoll.h> +#include <fcntl.h> + +#define UIP_UDP_MAX_EVENTS 1000 + +static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport) +{ + struct list_head *sk_head; + struct uip_udp_socket *sk; + struct mutex *sk_lock; 
+ struct epoll_event ev; + int flags; + int ret; + + sk_head = &arg->info->udp_socket_head; + sk_lock = &arg->info->udp_socket_lock; + + /* + * Find existing sk + */ + mutex_lock(sk_lock); + list_for_each_entry(sk, sk_head, list) { + if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) { + mutex_unlock(sk_lock); + return sk; + } + } + mutex_unlock(sk_lock); + + /* + * Allocate new one + */ + sk = malloc(sizeof(*sk)); + memset(sk, 0, sizeof(*sk)); + + sk->lock = sk_lock; + + sk->fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sk->fd < 0) + goto out; + + /* + * Set non-blocking + */ + flags = fcntl(sk->fd, F_GETFL, 0); + flags |= O_NONBLOCK; + fcntl(sk->fd, F_SETFL, flags); + + /* + * Add sk->fd to epoll_wait + */ + ev.events = EPOLLIN; + ev.data.fd = sk->fd; + ev.data.ptr = sk; + if (arg->info->udp_epollfd <= 0) + arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS); + ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev); + if (ret == -1) + pr_warning("epoll_ctl error"); + + sk->addr.sin_family = AF_INET; + sk->addr.sin_addr.s_addr = dip; + sk->addr.sin_port = dport; + + sk->sip = sip; + sk->dip = dip; + sk->sport = sport; + sk->dport = dport; + + mutex_lock(sk_lock); + list_add_tail(&sk->list, sk_head); + mutex_unlock(sk_lock); + + return sk; + +out: + free(sk); + return NULL; +} + +static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp) +{ + int len; + int ret; + + len = ntohs(udp->len) - uip_udp_hdrlen(udp); + + ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr)); + if (ret != len) + return -1; + + return 0; +} + +int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len) +{ + struct uip_eth *eth2; + struct uip_udp *udp2; + struct uip_ip *ip2; + + /* + * Cook a ethernet frame + */ + udp2 = (struct uip_udp *)(buf->eth); + eth2 = (struct uip_eth *)buf->eth; + ip2 = (struct uip_ip *)(buf->eth); + + 
eth2->src = info->host_mac; + eth2->dst = info->guest_mac; + eth2->type = htons(UIP_ETH_P_IP); + + ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN; + ip2->tos = 0; + ip2->id = 0; + ip2->flgfrag = 0; + ip2->ttl = UIP_IP_TTL; + ip2->proto = UIP_IP_P_UDP; + ip2->csum = 0; + + ip2->sip = sk->dip; + ip2->dip = sk->sip; + udp2->sport = sk->dport; + udp2->dport = sk->sport; + + udp2->len = htons(payload_len + uip_udp_hdrlen(udp2)); + udp2->csum = 0; + + if (payload) + memcpy(udp2->payload, payload, payload_len); + + ip2->len = udp2->len + htons(uip_ip_hdrlen(ip2)); + ip2->csum = uip_csum_ip(ip2); + udp2->csum = uip_csum_udp(udp2); + + /* + * virtio_net_hdr + */ + buf->vnet_len = sizeof(struct virtio_net_hdr); + memset(buf->vnet, 0, buf->vnet_len); + + buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth); + + return 0; +} + +static void *uip_udp_socket_thread(void *p) +{ + struct epoll_event events[UIP_UDP_MAX_EVENTS]; + struct uip_udp_socket *sk; + struct uip_info *info; + struct uip_buf *buf; + int payload_len; + u8 *payload; + int nfds; + int i; + + kvm__set_thread_name("uip-udp"); + + info = p; + + do { + payload = malloc(UIP_MAX_UDP_PAYLOAD); + } while (!payload); + + while (1) { + nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1); + + if (nfds == -1) + continue; + + for (i = 0; i < nfds; i++) { + + sk = events[i].data.ptr; + payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL); + if (payload_len < 0) + continue; + + /* + * Get free buffer to send data to guest + */ + buf = uip_buf_get_free(info); + + uip_udp_make_pkg(info, sk, buf, payload, payload_len); + + /* + * Send data received from socket to guest + */ + uip_buf_set_used(info, buf); + } + } + + free(payload); + pthread_exit(NULL); + return NULL; +} + +int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg) +{ + struct uip_udp_socket *sk; + struct uip_info *info; + struct uip_udp *udp; + struct uip_ip *ip; + int ret; + + udp = (struct uip_udp *)(arg->eth); + ip = (struct uip_ip 
*)(arg->eth); + info = arg->info; + + if (uip_udp_is_dhcp(udp)) { + uip_tx_do_ipv4_udp_dhcp(arg); + return 0; + } + + /* + * Find socket we have allocated before, otherwise allocate one + */ + sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport); + if (!sk) + return -1; + + /* + * Send out UDP data to remote host + */ + ret = uip_udp_socket_send(sk, udp); + if (ret) + return -1; + + if (!info->udp_thread) + pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info); + + return 0; +} diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c new file mode 100644 index 000000000000..8d3732d35842 --- /dev/null +++ b/tools/kvm/pci.c @@ -0,0 +1,200 @@ +#include "kvm/devices.h" +#include "kvm/pci.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <assert.h> + +#define PCI_BAR_OFFSET(b) (offsetof(struct pci_device_header, bar[b])) + +static union pci_config_address pci_config_address; + +/* This is within our PCI gap - in an unused area. + * Note this is a PCI *bus address*, is used to assign BARs etc.! + * (That's why it can still 32bit even with 64bit guests-- 64bit + * PCI isn't currently supported.) 
+ */ +static u32 io_space_blocks = KVM_PCI_MMIO_AREA; + +u32 pci_get_io_space_block(u32 size) +{ + u32 block = io_space_blocks; + io_space_blocks += size; + + return block; +} + +static void *pci_config_address_ptr(u16 port) +{ + unsigned long offset; + void *base; + + offset = port - PCI_CONFIG_ADDRESS; + base = &pci_config_address; + + return base + offset; +} + +static bool pci_config_address_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + void *p = pci_config_address_ptr(port); + + memcpy(p, data, size); + + return true; +} + +static bool pci_config_address_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + void *p = pci_config_address_ptr(port); + + memcpy(data, p, size); + + return true; +} + +static struct ioport_operations pci_config_address_ops = { + .io_in = pci_config_address_in, + .io_out = pci_config_address_out, +}; + +static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number) +{ + if (pci_config_address.bus_number != bus_number) + return false; + + if (pci_config_address.function_number != function_number) + return false; + + return !IS_ERR_OR_NULL(device__find_dev(DEVICE_BUS_PCI, device_number)); +} + +static bool pci_config_data_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * If someone accesses PCI configuration space offsets that are not + * aligned to 4 bytes, it uses ioports to signify that. + */ + pci_config_address.reg_offset = port - PCI_CONFIG_DATA; + + pci__config_wr(kvm, pci_config_address, data, size); + + return true; +} + +static bool pci_config_data_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * If someone accesses PCI configuration space offsets that are not + * aligned to 4 bytes, it uses ioports to signify that. 
+ */ + pci_config_address.reg_offset = port - PCI_CONFIG_DATA; + + pci__config_rd(kvm, pci_config_address, data, size); + + return true; +} + +static struct ioport_operations pci_config_data_ops = { + .io_in = pci_config_data_in, + .io_out = pci_config_data_out, +}; + +void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size) +{ + u8 dev_num; + + dev_num = addr.device_number; + + if (pci_device_exists(0, dev_num, 0)) { + unsigned long offset; + + offset = addr.w & 0xff; + if (offset < sizeof(struct pci_device_header)) { + void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data; + struct pci_device_header *hdr = p; + u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32)); + u32 sz = PCI_IO_SIZE; + + if (bar < 6 && hdr->bar_size[bar]) + sz = hdr->bar_size[bar]; + + /* + * If the kernel masks the BAR it would expect to find the + * size of the BAR there next time it reads from it. + * When the kernel got the size it would write the address + * back. + */ + if (*(u32 *)(p + offset)) { + /* See if kernel tries to mask one of the BARs */ + if ((offset >= PCI_BAR_OFFSET(0)) && + (offset <= PCI_BAR_OFFSET(6)) && + (ioport__read32(data) == 0xFFFFFFFF)) + memcpy(p + offset, &sz, sizeof(sz)); + else + memcpy(p + offset, data, size); + } + } + } +} + +void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size) +{ + u8 dev_num; + + dev_num = addr.device_number; + + if (pci_device_exists(0, dev_num, 0)) { + unsigned long offset; + + offset = addr.w & 0xff; + if (offset < sizeof(struct pci_device_header)) { + void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data; + + memcpy(data, p + offset, size); + } else { + memset(data, 0x00, size); + } + } else { + memset(data, 0xff, size); + } +} + +struct pci_device_header *pci__find_dev(u8 dev_num) +{ + struct device_header *hdr = device__find_dev(DEVICE_BUS_PCI, dev_num); + + if (IS_ERR_OR_NULL(hdr)) + return NULL; + + return hdr->data; +} + +int pci__init(struct kvm 
*kvm) +{ + int r; + + r = ioport__register(kvm, PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL); + if (r < 0) + return r; + + r = ioport__register(kvm, PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL); + if (r < 0) { + ioport__unregister(kvm, PCI_CONFIG_DATA); + return r; + } + + return 0; +} +dev_base_init(pci__init); + +int pci__exit(struct kvm *kvm) +{ + ioport__unregister(kvm, PCI_CONFIG_DATA); + ioport__unregister(kvm, PCI_CONFIG_ADDRESS); + + return 0; +} +dev_base_exit(pci__exit); diff --git a/tools/kvm/powerpc/boot.c b/tools/kvm/powerpc/boot.c new file mode 100644 index 000000000000..2557fc077e42 --- /dev/null +++ b/tools/kvm/powerpc/boot.c @@ -0,0 +1,8 @@ +#include "kvm/kvm.h" + +#include <stdbool.h> + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + return false; +} diff --git a/tools/kvm/powerpc/cpu_info.c b/tools/kvm/powerpc/cpu_info.c new file mode 100644 index 000000000000..11ca14e23b8a --- /dev/null +++ b/tools/kvm/powerpc/cpu_info.c @@ -0,0 +1,195 @@ +/* + * PPC CPU identification + * + * This is a very simple "host CPU info" struct to get us going. + * For the little host information we need, I don't want to grub about + * parsing stuff in /proc/device-tree so just match host PVR to differentiate + * PPC970 and POWER7 (which is all that's currently supported). + * + * Qemu does something similar but this is MUCH simpler! + * + * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include <kvm/kvm.h> +#include <sys/ioctl.h> + +#include "cpu_info.h" +#include "kvm/util.h" + +/* POWER7 */ + +static struct cpu_info cpu_power7_info = { + .name = "POWER7", + .tb_freq = 512000000, + .d_bsize = 128, + .i_bsize = 128, + .flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX, + .mmu_info = { + .flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS, + .slb_size = 32, + }, +}; + +/* PPC970/G5 */ + +static struct cpu_info cpu_970_info = { + .name = "G5", + .tb_freq = 33333333, + .d_bsize = 128, + .i_bsize = 128, + .flags = CPUINFO_FLAG_VMX, +}; + +/* This is a default catchall for 'no match' on PVR: */ +static struct cpu_info cpu_dummy_info = { .name = "unknown" }; + +static struct pvr_info host_pvr_info[] = { + { 0xffffffff, 0x0f000003, &cpu_power7_info }, + { 0xffff0000, 0x003f0000, &cpu_power7_info }, + { 0xffff0000, 0x004a0000, &cpu_power7_info }, + { 0xffff0000, 0x00390000, &cpu_970_info }, + { 0xffff0000, 0x003c0000, &cpu_970_info }, + { 0xffff0000, 0x00440000, &cpu_970_info }, + { 0xffff0000, 0x00450000, &cpu_970_info }, +}; + +/* If we can't query the kernel for supported page sizes assume 4K and 16M */ +static struct kvm_ppc_one_seg_page_size fallback_sps[] = { + [0] = { + .page_shift = 12, + .slb_enc = 0, + .enc = { + [0] = { + .page_shift = 12, + .pte_enc = 0, + }, + }, + }, + [1] = { + .page_shift = 24, + .slb_enc = 0x100, + .enc = { + [0] = { + .page_shift = 24, + .pte_enc = 0, + }, + }, + }, +}; + + +static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info) +{ + static struct kvm_ppc_smmu_info *mmu_info; + struct kvm_ppc_one_seg_page_size *sps; + int i, j, k, valid; + + if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) { + memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps)); + } else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) { + die_perror("KVM_PPC_GET_SMMU_INFO failed"); + } + + mmu_info = &cpu_info->mmu_info; + + if (!(mmu_info->flags & 
KVM_PPC_PAGE_SIZES_REAL)) + /* Guest pages are not restricted by the backing page size */ + return; + + /* Filter based on backing page size */ + + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &mmu_info->sps[i]; + + if (!sps->page_shift) + break; + + if (kvm->ram_pagesize < (1ul << sps->page_shift)) { + /* Mark the whole segment size invalid */ + sps->page_shift = 0; + continue; + } + + /* Check each page size for the segment */ + for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (!sps->enc[j].page_shift) + break; + + if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift)) + sps->enc[j].page_shift = 0; + else + valid++; + } + + if (!valid) { + /* Mark the whole segment size invalid */ + sps->page_shift = 0; + continue; + } + + /* Mark any trailing entries invalid if we broke out early */ + for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) + sps->enc[k].page_shift = 0; + + /* Collapse holes */ + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (sps->enc[j].page_shift) + continue; + + for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) { + if (sps->enc[k].page_shift) { + sps->enc[j] = sps->enc[k]; + sps->enc[k].page_shift = 0; + break; + } + } + } + } + + /* Mark any trailing entries invalid if we broke out early */ + for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) + mmu_info->sps[j].page_shift = 0; + + /* Collapse holes */ + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + if (mmu_info->sps[i].page_shift) + continue; + + for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (mmu_info->sps[j].page_shift) { + mmu_info->sps[i] = mmu_info->sps[j]; + mmu_info->sps[j].page_shift = 0; + break; + } + } + } +} + +struct cpu_info *find_cpu_info(struct kvm *kvm) +{ + struct cpu_info *info; + unsigned int i; + u32 pvr = kvm->arch.pvr; + + for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) { + if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) { + info = host_pvr_info[i].cpu_info; + break; + } + } + + /* Didn't find 
anything? Rut-ro. */ + if (!info) { + pr_warning("Host CPU unsupported by kvmtool\n"); + info = &cpu_dummy_info; + } + + setup_mmu_info(kvm, info); + + return info; +} diff --git a/tools/kvm/powerpc/cpu_info.h b/tools/kvm/powerpc/cpu_info.h new file mode 100644 index 000000000000..f61707a8075d --- /dev/null +++ b/tools/kvm/powerpc/cpu_info.h @@ -0,0 +1,42 @@ +/* + * PPC CPU identification + * + * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef CPU_INFO_H +#define CPU_INFO_H + +#include <kvm/kvm.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kvm.h> + +struct cpu_info { + const char *name; + u32 tb_freq; /* timebase frequency */ + u32 d_bsize; /* d-cache block size */ + u32 i_bsize; /* i-cache block size */ + u32 flags; + struct kvm_ppc_smmu_info mmu_info; +}; + +struct pvr_info { + u32 pvr_mask; + u32 pvr; + struct cpu_info *cpu_info; +}; + +/* Misc capabilities/CPU properties */ +#define CPUINFO_FLAG_DFP 0x00000001 +#define CPUINFO_FLAG_VMX 0x00000002 +#define CPUINFO_FLAG_VSX 0x00000004 + +struct cpu_info *find_cpu_info(struct kvm *kvm); + +#endif diff --git a/tools/kvm/powerpc/include/kvm/barrier.h b/tools/kvm/powerpc/include/kvm/barrier.h new file mode 100644 index 000000000000..dd5115acaff6 --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/barrier.h @@ -0,0 +1,6 @@ +#ifndef _KVM_BARRIER_H_ +#define _KVM_BARRIER_H_ + +#include <asm/barrier.h> + +#endif /* _KVM_BARRIER_H_ */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-arch.h b/tools/kvm/powerpc/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..d93e1429e0ba --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/kvm-arch.h @@ -0,0 +1,59 @@ +/* + * PPC64 architecture-specific definitions + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> + +/* + * MMIO lives after RAM, but it'd be nice if it didn't constantly move. + * Choose a suitably high address, e.g. 63T... This limits RAM size. + */ +#define PPC_MMIO_START 0x3F0000000000UL +#define PPC_MMIO_SIZE 0x010000000000UL + +#define KERNEL_LOAD_ADDR 0x0000000000000000 +#define KERNEL_START_ADDR 0x0000000000000000 +#define KERNEL_SECONDARY_START_ADDR 0x0000000000000060 +#define INITRD_LOAD_ADDR 0x0000000002800000 + +#define RTAS_MAX_SIZE 0x10000 + +#define TIMEBASE_FREQ 512000000ULL + +#define KVM_MMIO_START PPC_MMIO_START + +/* + * This is the address that pci_get_io_space_block() starts allocating + * from. Note that this is a PCI bus address. + */ +#define KVM_PCI_MMIO_AREA 0x1000000 +#define KVM_VIRTIO_MMIO_AREA 0x2000000 + +#define VIRTIO_DEFAULT_TRANS VIRTIO_PCI + +struct spapr_phb; + +struct kvm_arch { + u64 sdr1; + u32 pvr; + unsigned long rtas_gra; + unsigned long rtas_size; + unsigned long fdt_gra; + unsigned long initrd_gra; + unsigned long initrd_size; + struct icp_state *icp; + struct spapr_phb *phb; +}; + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-config-arch.h b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..60f61de0296f --- /dev/null +++ b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h @@ -0,0 +1,7 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +struct kvm_config_arch { +}; + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..7520c049a948 --- /dev/null +++ 
b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,76 @@ +/* + * PPC64 cpu-specific definitions + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +/* Architecture-specific kvm_cpu definitions. */ + +#include <linux/kvm.h> /* for struct kvm_regs */ +#include <stdbool.h> +#include <pthread.h> + +#define MSR_SF (1UL<<63) +#define MSR_HV (1UL<<60) +#define MSR_VEC (1UL<<25) +#define MSR_VSX (1UL<<23) +#define MSR_POW (1UL<<18) +#define MSR_EE (1UL<<15) +#define MSR_PR (1UL<<14) +#define MSR_FP (1UL<<13) +#define MSR_ME (1UL<<12) +#define MSR_FE0 (1UL<<11) +#define MSR_SE (1UL<<10) +#define MSR_BE (1UL<<9) +#define MSR_FE1 (1UL<<8) +#define MSR_IR (1UL<<5) +#define MSR_DR (1UL<<4) +#define MSR_PMM (1UL<<2) +#define MSR_RI (1UL<<1) +#define MSR_LE (1UL<<0) + +#define POWER7_EXT_IRQ 0 + +struct kvm; + +struct kvm_cpu { + pthread_t thread; /* VCPU thread */ + + unsigned long cpu_id; + + struct kvm *kvm; /* parent KVM */ + int vcpu_fd; /* For VCPU ioctls() */ + struct kvm_run *kvm_run; + + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + u8 is_running; + u8 paused; + u8 needs_nmi; + /* + * Although PPC KVM doesn't yet support coalesced MMIO, generic code + * needs this in our kvm_cpu: + */ + struct kvm_coalesced_mmio_ring *ring; +}; + +void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level); + +/* This is never actually called on PPC. 
*/ +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + return false; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write); + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/powerpc/ioport.c b/tools/kvm/powerpc/ioport.c new file mode 100644 index 000000000000..264fb7e2d57d --- /dev/null +++ b/tools/kvm/powerpc/ioport.c @@ -0,0 +1,18 @@ +/* + * PPC64 ioport platform setup. There isn't any! :-) + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/ioport.h" + +#include <stdlib.h> + +void ioport__setup_arch(struct kvm *kvm) +{ + /* PPC has no legacy ioports to set up */ +} diff --git a/tools/kvm/powerpc/irq.c b/tools/kvm/powerpc/irq.c new file mode 100644 index 000000000000..ae9da507fb82 --- /dev/null +++ b/tools/kvm/powerpc/irq.c @@ -0,0 +1,50 @@ +/* + * PPC64 IRQ routines + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/devices.h" +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> +#include <sys/ioctl.h> + +#include <stddef.h> +#include <stdlib.h> + +#include "kvm/pci.h" + +#include "xics.h" +#include "spapr_pci.h" + +/* + * FIXME: The code in this file assumes an SPAPR guest, using XICS. Make + * generic & cope with multiple PPC platform types. + */ + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + *pin = 1; + /* + * Have I said how nasty I find this? 
Line should be dontcare... PHB + * should determine which CPU/XICS IRQ to fire. + */ + *line = xics_alloc_irqnum(); + return 0; +} + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + die(__FUNCTION__); + return 0; +} diff --git a/tools/kvm/powerpc/kvm-cpu.c b/tools/kvm/powerpc/kvm-cpu.c new file mode 100644 index 000000000000..8fce121705c8 --- /dev/null +++ b/tools/kvm/powerpc/kvm-cpu.c @@ -0,0 +1,290 @@ +/* + * PPC64 processor support + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include "spapr.h" +#include "spapr_pci.h" +#include "xics.h" + +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <assert.h> + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm) +{ + struct kvm_cpu *vcpu; + + vcpu = calloc(1, sizeof *vcpu); + if (!vcpu) + return NULL; + + vcpu->kvm = kvm; + + return vcpu; +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + free(vcpu); +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int mmap_size; + struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR }; + + vcpu = kvm_cpu__new(kvm); + if (!vcpu) + return NULL; + + vcpu->cpu_id = cpu_id; + + vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + die_perror("KVM_GET_VCPU_MMAP_SIZE 
ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0) + die("unable to enable PAPR capability"); + + /* + * We start all CPUs, directing non-primary threads into the kernel's + * secondary start point. When we come to support SLOF, we will start + * only one and SLOF will RTAS call us to ask for others to be + * started. (FIXME: make more generic & interface with whichever + * firmware a platform may be using.) + */ + vcpu->is_running = true; + + return vcpu; +} + +static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu) +{ + /* Don't have to do anything, there's no expected FPU state. */ +} + +static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu) +{ + /* + * FIXME: This assumes PPC64 and Linux guest. It doesn't use the + * OpenFirmware entry method, but instead the "embedded" entry which + * passes the FDT address directly. + */ + struct kvm_regs *r = &vcpu->regs; + + if (vcpu->cpu_id == 0) { + r->pc = KERNEL_START_ADDR; + r->gpr[3] = vcpu->kvm->arch.fdt_gra; + r->gpr[5] = 0; + } else { + r->pc = KERNEL_SECONDARY_START_ADDR; + r->gpr[3] = vcpu->cpu_id; + } + r->msr = 0x8000000000001000UL; /* 64bit, non-HV, ME */ + + if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0) + die_perror("KVM_SET_REGS failed"); +} + +static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu) +{ + /* + * Some sregs setup to initialise SDR1/PVR/HIOR on PPC64 SPAPR + * platforms using PR KVM. (Technically, this is all ignored on + * SPAPR HV KVM.) Different setup is required for non-PV non-SPAPR + * platforms! (FIXME.) 
+ */ + struct kvm_sregs sregs; + struct kvm_one_reg reg = {}; + u64 value; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_SREGS failed"); + + sregs.u.s.sdr1 = vcpu->kvm->arch.sdr1; + sregs.pvr = vcpu->kvm->arch.pvr; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0) + die("KVM_SET_SREGS failed"); + + reg.id = KVM_REG_PPC_HIOR; + value = 0; + reg.addr = (u64)&value; + if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, ®) < 0) + die("KVM_SET_ONE_REG failed"); +} + +/** + * kvm_cpu__reset_vcpu - reset virtual CPU to a known state + */ +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + kvm_cpu__setup_regs(vcpu); + kvm_cpu__setup_sregs(vcpu); + kvm_cpu__setup_fpu(vcpu); +} + +/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */ +void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level) +{ + unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET; + + /* FIXME: POWER-specific */ + if (pin != POWER7_EXT_IRQ) + return; + if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0) + pr_warning("Could not KVM_INTERRUPT."); +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + bool ret = true; + struct kvm_run *run = vcpu->kvm_run; + switch(run->exit_reason) { + case KVM_EXIT_PAPR_HCALL: + run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr, + (target_ulong*)run->papr_hcall.args); + break; + default: + ret = false; + } + return ret; +} + +bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + /* + * FIXME: This function will need to be split in order to support + * various PowerPC platforms/PHB types, etc. It currently assumes SPAPR + * PPC64 guest. + */ + bool ret = false; + + if ((phys_addr >= SPAPR_PCI_WIN_START) && + (phys_addr < SPAPR_PCI_WIN_END)) { + ret = spapr_phb_mmio(kvm, phys_addr, data, len, is_write); + } else { + pr_warning("MMIO %s unknown address %llx (size %d)!\n", + is_write ? 
"write to" : "read from", + phys_addr, len); + } + return ret; +} + +#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "") + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + struct kvm_regs regs; + struct kvm_sregs sregs; + int r; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, ®s) < 0) + die("KVM_GET_REGS failed"); + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_SREGS failed"); + + dprintf(debug_fd, "\n Registers:\n"); + dprintf(debug_fd, " NIP: %016llx MSR: %016llx " + "( %s%s%s%s%s%s%s%s%s%s%s%s)\n", + regs.pc, regs.msr, + CONDSTR_BIT(regs.msr, SF), + CONDSTR_BIT(regs.msr, HV), /* ! */ + CONDSTR_BIT(regs.msr, VEC), + CONDSTR_BIT(regs.msr, VSX), + CONDSTR_BIT(regs.msr, EE), + CONDSTR_BIT(regs.msr, PR), + CONDSTR_BIT(regs.msr, FP), + CONDSTR_BIT(regs.msr, ME), + CONDSTR_BIT(regs.msr, IR), + CONDSTR_BIT(regs.msr, DR), + CONDSTR_BIT(regs.msr, RI), + CONDSTR_BIT(regs.msr, LE)); + dprintf(debug_fd, " CTR: %016llx LR: %016llx CR: %08llx\n", + regs.ctr, regs.lr, regs.cr); + dprintf(debug_fd, " SRR0: %016llx SRR1: %016llx XER: %016llx\n", + regs.srr0, regs.srr1, regs.xer); + dprintf(debug_fd, " SPRG0: %016llx SPRG1: %016llx\n", + regs.sprg0, regs.sprg1); + dprintf(debug_fd, " SPRG2: %016llx SPRG3: %016llx\n", + regs.sprg2, regs.sprg3); + dprintf(debug_fd, " SPRG4: %016llx SPRG5: %016llx\n", + regs.sprg4, regs.sprg5); + dprintf(debug_fd, " SPRG6: %016llx SPRG7: %016llx\n", + regs.sprg6, regs.sprg7); + dprintf(debug_fd, " GPRs:\n "); + for (r = 0; r < 32; r++) { + dprintf(debug_fd, "%016llx ", regs.gpr[r]); + if ((r & 3) == 3) + dprintf(debug_fd, "\n "); + } + dprintf(debug_fd, "\n"); + + /* FIXME: Assumes SLB-based (book3s) guest */ + for (r = 0; r < 32; r++) { + dprintf(debug_fd, " SLB%02d %016llx %016llx\n", r, + sregs.u.s.ppc64.slb[r].slbe, + sregs.u.s.ppc64.slb[r].slbv); + } + dprintf(debug_fd, "----------\n"); +} + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0) + 
die("KVM_GET_REGS failed"); + + /* FIXME: Dump/disassemble some code...! */ + + dprintf(debug_fd, "\n Stack:\n"); + dprintf(debug_fd, " ------\n"); + /* Only works in real mode: */ + kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32); +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ + /* Does nothing yet */ +} diff --git a/tools/kvm/powerpc/kvm.c b/tools/kvm/powerpc/kvm.c new file mode 100644 index 000000000000..dc9f89d55500 --- /dev/null +++ b/tools/kvm/powerpc/kvm.c @@ -0,0 +1,529 @@ +/* + * PPC64 (SPAPR) platform support + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM + * Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/fdt.h" +#include "kvm/kvm.h" +#include "kvm/util.h" +#include "cpu_info.h" + +#include "spapr.h" +#include "spapr_hvcons.h" +#include "spapr_pci.h" + +#include <linux/kvm.h> + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> +#include <asm/unistd.h> +#include <errno.h> + +#include <linux/byteorder.h> + +#define HPT_ORDER 24 + +#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/" + +#define PHANDLE_XICP 0x00001111 + +static char kern_cmdline[2048]; + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) }, + { DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) }, + { 0, 0 } +}; + +static uint32_t mfpvr(void) +{ + uint32_t r; + asm volatile ("mfpvr %0" : "=r"(r)); + return r; +} + +bool kvm__arch_cpu_supports_vm(void) +{ + return true; +} + +void kvm__init_ram(struct kvm *kvm) +{ + u64 phys_start, phys_size; + void *host_mem; + + phys_start = 0; + phys_size = kvm->ram_size; + host_mem = 
kvm->ram_start; + + /* + * We put MMIO at PPC_MMIO_START, high up. Make sure that this doesn't + * crash into the end of RAM -- on PPC64 at least, this is so high + * (63TB!) that this is unlikely. + */ + if (phys_size >= PPC_MMIO_START) + die("Too much memory (%lld, what a nice problem): " + "overlaps MMIO!\n", + phys_size); + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); +} + +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ + /* We don't need anything unusual in here. */ +} + +/* Architecture-specific KVM init */ +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + int cap_ppc_rma; + unsigned long hpt; + + kvm->ram_size = ram_size; + + /* Map "default" hugetblfs path to the standard 16M mount point */ + if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default")) + hugetlbfs_path = HUGETLBFS_PATH; + + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size); + + if (kvm->ram_start == MAP_FAILED) + die("Couldn't map %lld bytes for RAM (%d)\n", + kvm->ram_size, errno); + + /* FDT goes at top of memory, RTAS just below */ + kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE; + /* FIXME: Not all PPC systems have RTAS */ + kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE; + madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE); + + /* FIXME: SPAPR-PR specific; allocate a guest HPT. */ + if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER))) + die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER)); + + kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18); + + kvm->arch.pvr = mfpvr(); + + /* FIXME: This is book3s-specific */ + cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA); + if (cap_ppc_rma == 2) + die("Need contiguous RMA allocation on this hardware, " + "which is not yet supported."); + + /* Do these before FDT setup, IRQ setup, etc. 
*/ + /* FIXME: SPAPR-specific */ + hypercall_init(); + register_core_rtas(); + /* Now that hypercalls are initialised, register a couple for the console: */ + spapr_hvcons_init(); + spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID, + SPAPR_PCI_MEM_WIN_ADDR, + SPAPR_PCI_MEM_WIN_SIZE, + SPAPR_PCI_IO_WIN_ADDR, + SPAPR_PCI_IO_WIN_SIZE); +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->ram_start, kvm->ram_size); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, 1); + kvm__irq_line(kvm, irq, 0); +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + /* FIXME: Should register callbacks to platform-specific polls */ + spapr_hvcons_poll(kvm); +} + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline) +{ + void *p; + void *k_start; + void *i_start; + int nr; + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start); + + if (fd_initrd != -1) { + if (lseek(fd_initrd, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (p-k_start > INITRD_LOAD_ADDR) + die("Kernel overlaps initrd!"); + + /* Round up kernel size to 8byte alignment, and load initrd right after. 
*/ + i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR); + + while (((nr = read(fd_initrd, p, 65536)) > 0) && + p < (kvm->ram_start + kvm->ram_size)) + p += nr; + + if (p >= (kvm->ram_start + kvm->ram_size)) + die("initrd too big to contain in guest RAM.\n"); + + pr_info("Loaded initrd to 0x%x (%ld bytes)", + INITRD_LOAD_ADDR, p-i_start); + kvm->arch.initrd_gra = INITRD_LOAD_ADDR; + kvm->arch.initrd_size = p-i_start; + } else { + kvm->arch.initrd_size = 0; + } + strncpy(kern_cmdline, kernel_cmdline, 2048); + kern_cmdline[2047] = '\0'; + + return true; +} + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + /* We don't support bzImages. */ + return false; +} + +struct fdt_prop { + void *value; + int size; +}; + +static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop) +{ + struct kvm_ppc_one_seg_page_size *sps; + int i, j, size; + u32 *p; + + for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &info->sps[i]; + + if (sps->page_shift == 0) + break; + + /* page shift, slb enc & count */ + size += 3; + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (info->sps[i].enc[j].page_shift == 0) + break; + + /* page shift & pte enc */ + size += 2; + } + } + + if (!size) { + prop->value = NULL; + prop->size = 0; + return; + } + + /* Convert size to bytes */ + prop->size = size * sizeof(u32); + + prop->value = malloc(prop->size); + if (!prop->value) + die_perror("malloc failed"); + + p = (u32 *)prop->value; + for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) { + sps = &info->sps[i]; + + if (sps->page_shift == 0) + break; + + *p++ = sps->page_shift; + *p++ = sps->slb_enc; + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) + if (!info->sps[i].enc[j].page_shift) + break; + + *p++ = j; /* count of enc */ + + for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) { + if (!info->sps[i].enc[j].page_shift) + break; + + *p++ = info->sps[i].enc[j].page_shift; + *p++ = 
info->sps[i].enc[j].pte_enc; + } + } +} + +#define SMT_THREADS 4 + +/* + * Set up the FDT for the kernel: This function is currently fairly SPAPR-heavy, + * and whilst most PPC targets will require CPU/memory nodes, others like RTAS + * should eventually be added separately. + */ +static int setup_fdt(struct kvm *kvm) +{ + uint64_t mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) }; + int smp_cpus = kvm->nrcpus; + uint32_t int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)}; + char hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0" + "hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0" + "hcall-splpar\0hcall-bulk"; + int i, j; + char cpu_name[30]; + u8 staging_fdt[FDT_MAX_SIZE]; + struct cpu_info *cpu_info = find_cpu_info(kvm); + struct fdt_prop segment_page_sizes; + u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff}; + + /* Generate an appropriate DT at kvm->arch.fdt_gra */ + void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra); + void *fdt = staging_fdt; + + _FDT(fdt_create(fdt, FDT_MAX_SIZE)); + _FDT(fdt_finish_reservemap(fdt)); + + _FDT(fdt_begin_node(fdt, "")); + + _FDT(fdt_property_string(fdt, "device_type", "chrp")); + _FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x2)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x2)); + + /* RTAS */ + _FDT(fdt_begin_node(fdt, "rtas")); + /* This is what the kernel uses to switch 'We're an LPAR'! 
*/ + _FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm, + sizeof(hypertas_prop_kvm))); + _FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra)); + _FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra)); + _FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size)); + /* Now add properties for all RTAS tokens: */ + if (spapr_rtas_fdt_setup(kvm, fdt)) + die("Couldn't create RTAS FDT properties\n"); + + _FDT(fdt_end_node(fdt)); + + /* /chosen */ + _FDT(fdt_begin_node(fdt, "chosen")); + /* cmdline */ + _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline)); + /* Initrd */ + if (kvm->arch.initrd_size != 0) { + uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra); + uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra + + kvm->arch.initrd_size); + _FDT(fdt_property(fdt, "linux,initrd-start", + &ird_st_prop, sizeof(ird_st_prop))); + _FDT(fdt_property(fdt, "linux,initrd-end", + &ird_end_prop, sizeof(ird_end_prop))); + } + + /* + * stdout-path: This is assuming we're using the HV console. Also, the + * address is hardwired until we do a VIO bus. + */ + _FDT(fdt_property_string(fdt, "linux,stdout-path", + "/vdevice/vty@30000000")); + _FDT(fdt_end_node(fdt)); + + /* + * Memory: We don't alloc. a separate RMA yet. If we ever need to + * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and + * another RMAsize->endOfMem. 
+ */ + _FDT(fdt_begin_node(fdt, "memory@0")); + _FDT(fdt_property_string(fdt, "device_type", "memory")); + _FDT(fdt_property(fdt, "reg", mem_reg_property, + sizeof(mem_reg_property))); + _FDT(fdt_end_node(fdt)); + + generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes); + + /* CPUs */ + _FDT(fdt_begin_node(fdt, "cpus")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + + for (i = 0; i < smp_cpus; i += SMT_THREADS) { + int32_t pft_size_prop[] = { 0, HPT_ORDER }; + uint32_t servers_prop[SMT_THREADS]; + uint32_t gservers_prop[SMT_THREADS * 2]; + int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS : + smp_cpus - i; + + sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i); + _FDT(fdt_begin_node(fdt, cpu_name)); + sprintf(cpu_name, "PowerPC,%s", cpu_info->name); + _FDT(fdt_property_string(fdt, "name", cpu_name)); + _FDT(fdt_property_string(fdt, "device_type", "cpu")); + + _FDT(fdt_property_cell(fdt, "reg", i)); + _FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr)); + + _FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize)); + _FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize)); + + _FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq)); + /* Lies, but safeish lies! */ + _FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200)); + + if (cpu_info->mmu_info.slb_size) + _FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size)); + + /* + * HPT size is hardwired; KVM currently fixes it at 16MB but the + * moment that changes we'll need to read it out of the kernel. 
+ */ + _FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop, + sizeof(pft_size_prop))); + + _FDT(fdt_property_string(fdt, "status", "okay")); + _FDT(fdt_property(fdt, "64-bit", NULL, 0)); + /* A server for each thread in this core */ + for (j = 0; j < SMT_THREADS; j++) { + servers_prop[j] = cpu_to_be32(i+j); + /* + * Hack borrowed from QEMU, direct the group queues back + * to cpu 0: + */ + gservers_prop[j*2] = cpu_to_be32(i+j); + gservers_prop[j*2 + 1] = 0; + } + _FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s", + servers_prop, threads * sizeof(uint32_t))); + _FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s", + gservers_prop, + threads * 2 * sizeof(uint32_t))); + + if (segment_page_sizes.value) + _FDT(fdt_property(fdt, "ibm,segment-page-sizes", + segment_page_sizes.value, + segment_page_sizes.size)); + + if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS) + _FDT(fdt_property(fdt, "ibm,processor-segment-sizes", + segment_sizes_1T, sizeof(segment_sizes_1T))); + + /* VSX / DFP options: */ + if (cpu_info->flags & CPUINFO_FLAG_VMX) + _FDT(fdt_property_cell(fdt, "ibm,vmx", + (cpu_info->flags & + CPUINFO_FLAG_VSX) ? 
2 : 1)); + if (cpu_info->flags & CPUINFO_FLAG_DFP) + _FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1)); + _FDT(fdt_end_node(fdt)); + } + _FDT(fdt_end_node(fdt)); + + /* IRQ controller */ + _FDT(fdt_begin_node(fdt, "interrupt-controller@0")); + + _FDT(fdt_property_string(fdt, "device_type", + "PowerPC-External-Interrupt-Presentation")); + _FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp")); + _FDT(fdt_property_cell(fdt, "reg", 0)); + _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0)); + _FDT(fdt_property(fdt, "ibm,interrupt-server-ranges", + int_server_ranges_prop, + sizeof(int_server_ranges_prop))); + _FDT(fdt_property_cell(fdt, "#interrupt-cells", 2)); + _FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP)); + _FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP)); + _FDT(fdt_end_node(fdt)); + + /* + * VIO: See comment in linux,stdout-path; we don't yet represent a VIO + * bus/address allocation so addresses are hardwired here. + */ + _FDT(fdt_begin_node(fdt, "vdevice")); + _FDT(fdt_property_cell(fdt, "#address-cells", 0x1)); + _FDT(fdt_property_cell(fdt, "#size-cells", 0x0)); + _FDT(fdt_property_string(fdt, "device_type", "vdevice")); + _FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice")); + _FDT(fdt_begin_node(fdt, "vty@30000000")); + _FDT(fdt_property_string(fdt, "name", "vty")); + _FDT(fdt_property_string(fdt, "device_type", "serial")); + _FDT(fdt_property_string(fdt, "compatible", "hvterm1")); + _FDT(fdt_property_cell(fdt, "reg", 0x30000000)); + _FDT(fdt_end_node(fdt)); + _FDT(fdt_end_node(fdt)); + + /* Finalise: */ + _FDT(fdt_end_node(fdt)); /* Root node */ + _FDT(fdt_finish(fdt)); + + _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE)); + + /* PCI */ + if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest)) + die("Fail populating PCI device nodes"); + + _FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size)); + _FDT(fdt_pack(fdt_dest)); + + free(segment_page_sizes.value); + + return 0; +} +firmware_init(setup_fdt); + 
+/** + * kvm__arch_setup_firmware + */ +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + /* + * Set up RTAS stub. All it is is a single hypercall: + * 0: 7c 64 1b 78 mr r4,r3 + * 4: 3c 60 00 00 lis r3,0 + * 8: 60 63 f0 00 ori r3,r3,61440 + * c: 44 00 00 22 sc 1 + * 10: 4e 80 00 20 blr + */ + uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra); + + rtas[0] = 0x7c641b78; + rtas[1] = 0x3c600000; + rtas[2] = 0x6063f000; + rtas[3] = 0x44000022; + rtas[4] = 0x4e800020; + kvm->arch.rtas_size = 20; + + pr_info("Set up %ld bytes of RTAS at 0x%lx\n", + kvm->arch.rtas_size, kvm->arch.rtas_gra); + + /* Load SLOF */ + + return 0; +} + +int kvm__arch_free_firmware(struct kvm *kvm) +{ + return 0; +} diff --git a/tools/kvm/powerpc/spapr.h b/tools/kvm/powerpc/spapr.h new file mode 100644 index 000000000000..0537f881c0e4 --- /dev/null +++ b/tools/kvm/powerpc/spapr.h @@ -0,0 +1,93 @@ +/* + * SPAPR definitions and declarations + * + * Borrowed heavily from QEMU's spapr.h, + * Copyright (c) 2010 David Gibson, IBM Corporation. + * + * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#if !defined(__HW_SPAPR_H__) +#define __HW_SPAPR_H__ + +#include <inttypes.h> + +/* We need some of the H_ hcall defs, but they're __KERNEL__ only. */ +#define __KERNEL__ +#include <asm/hvcall.h> +#undef __KERNEL__ + +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +typedef unsigned long target_ulong; +typedef uintptr_t target_phys_addr_t; + +/* + * The hcalls above are standardized in PAPR and implemented by pHyp + * as well. + * + * We also need some hcalls which are specific to qemu / KVM-on-POWER. + * So far we just need one for H_RTAS, but in future we'll need more + * for extensions like virtio. 
We put those into the 0xf000-0xfffc + * range which is reserved by PAPR for "platform-specific" hcalls. + */ +#define KVMPPC_HCALL_BASE 0xf000 +#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0) +#define KVMPPC_HCALL_MAX KVMPPC_H_RTAS + +#define DEBUG_SPAPR_HCALLS + +#ifdef DEBUG_SPAPR_HCALLS +#define hcall_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define hcall_dprintf(fmt, ...) \ + do { } while (0) +#endif + +typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu, + target_ulong opcode, + target_ulong *args); + +void hypercall_init(void); +void register_core_rtas(void); + +void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn); +target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode, + target_ulong *args); + +int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt); + +static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n) +{ + return *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)); +} + +static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val) +{ + *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = val; +} + +typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets); +void spapr_rtas_register(const char *name, spapr_rtas_fn fn); +target_ulong spapr_rtas_call(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets); + +#define SPAPR_PCI_BUID 0x800000020000001ULL +#define SPAPR_PCI_MEM_WIN_ADDR (KVM_MMIO_START + 0xA0000000) +#define SPAPR_PCI_MEM_WIN_SIZE 0x20000000 +#define SPAPR_PCI_IO_WIN_ADDR (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE) +#define SPAPR_PCI_IO_WIN_SIZE 0x2000000 + +#define SPAPR_PCI_WIN_START SPAPR_PCI_MEM_WIN_ADDR +#define SPAPR_PCI_WIN_END (SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE) + +#endif /* !defined (__HW_SPAPR_H__) */ diff --git a/tools/kvm/powerpc/spapr_hcall.c 
b/tools/kvm/powerpc/spapr_hcall.c new file mode 100644 index 000000000000..ff1d63ac37f1 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hcall.c @@ -0,0 +1,134 @@ +/* + * SPAPR hypercalls + * + * Borrowed heavily from QEMU's spapr_hcall.c, + * Copyright (c) 2010 David Gibson, IBM Corporation. + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +#include <stdio.h> +#include <assert.h> + +static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1]; +static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - + KVMPPC_HCALL_BASE + 1]; + +static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* FIXME: Implement this for -PR. (-HV does this in kernel.) */ + return H_HARDWARE; +} + +static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + target_ulong rtas_r3 = args[0]; + /* + * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just + * reference guest RAM directly. + */ + uint32_t token, nargs, nret; + + token = rtas_ld(vcpu->kvm, rtas_r3, 0); + nargs = rtas_ld(vcpu->kvm, rtas_r3, 1); + nret = rtas_ld(vcpu->kvm, rtas_r3, 2); + + return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12, + nret, rtas_r3 + 12 + 4*nargs); +} + +static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* SLOF will require these, though kernel doesn't. */ + die(__PRETTY_FUNCTION__); + return H_PARAMETER; +} + +static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* SLOF will require these, though kernel doesn't. 
*/ + die(__PRETTY_FUNCTION__); + return H_PARAMETER; +} + +static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* KVM will trap this in the kernel. Die if it misses. */ + die(__PRETTY_FUNCTION__); + return H_SUCCESS; +} + +static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args) +{ + /* KVM will trap this in the kernel. Die if it misses. */ + die(__PRETTY_FUNCTION__); + return H_SUCCESS; +} + +void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn) +{ + spapr_hcall_fn *slot; + + if (opcode <= MAX_HCALL_OPCODE) { + assert((opcode & 0x3) == 0); + + slot = &papr_hypercall_table[opcode / 4]; + } else { + assert((opcode >= KVMPPC_HCALL_BASE) && + (opcode <= KVMPPC_HCALL_MAX)); + + slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE]; + } + + assert(!(*slot) || (fn == *slot)); + *slot = fn; +} + +target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode, + target_ulong *args) +{ + if ((opcode <= MAX_HCALL_OPCODE) + && ((opcode & 0x3) == 0)) { + spapr_hcall_fn fn = papr_hypercall_table[opcode / 4]; + + if (fn) { + return fn(vcpu, opcode, args); + } + } else if ((opcode >= KVMPPC_HCALL_BASE) && + (opcode <= KVMPPC_HCALL_MAX)) { + spapr_hcall_fn fn = kvmppc_hypercall_table[opcode - + KVMPPC_HCALL_BASE]; + + if (fn) { + return fn(vcpu, opcode, args); + } + } + + hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode); + return H_FUNCTION; +} + +void hypercall_init(void) +{ + /* hcall-dabr */ + spapr_register_hypercall(H_SET_DABR, h_set_dabr); + + spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load); + spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store); + spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load); + spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store); + spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi); + spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf); + + /* KVM-PPC specific hcalls */ + 
spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas); +} diff --git a/tools/kvm/powerpc/spapr_hvcons.c b/tools/kvm/powerpc/spapr_hvcons.c new file mode 100644 index 000000000000..0bdf75ba3689 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hvcons.c @@ -0,0 +1,108 @@ +/* + * SPAPR HV console + * + * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson, + * IBM Corporation. + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "kvm/term.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" +#include "spapr.h" +#include "spapr_hvcons.h" + +#include <stdio.h> +#include <sys/uio.h> +#include <errno.h> + +#include <linux/byteorder.h> + +union hv_chario { + struct { + uint64_t char0_7; + uint64_t char8_15; + } a; + uint8_t buf[16]; +}; + +static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args) +{ + /* To do: Read register from args[0], and check it. */ + unsigned long len = args[1]; + union hv_chario data; + struct iovec iov; + + if (len > 16) { + return H_PARAMETER; + } + data.a.char0_7 = cpu_to_be64(args[2]); + data.a.char8_15 = cpu_to_be64(args[3]); + + iov.iov_base = data.buf; + iov.iov_len = len; + do { + int ret; + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV) + ret = term_putc_iov(&iov, 1, 0); + else + ret = 0; + if (ret < 0) { + die("term_putc_iov error %d!\n", errno); + } + iov.iov_base += ret; + iov.iov_len -= ret; + } while (iov.iov_len > 0); + + return H_SUCCESS; +} + + +static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args) +{ + /* To do: Read register from args[0], and check it. 
*/ + unsigned long *len = args + 0; + unsigned long *char0_7 = args + 1; + unsigned long *char8_15 = args + 2; + union hv_chario data; + struct iovec iov; + + if (vcpu->kvm->cfg.active_console != CONSOLE_HV) + return H_SUCCESS; + + if (term_readable(0)) { + iov.iov_base = data.buf; + iov.iov_len = 16; + + *len = term_getc_iov(vcpu->kvm, &iov, 1, 0); + *char0_7 = be64_to_cpu(data.a.char0_7); + *char8_15 = be64_to_cpu(data.a.char8_15); + } else { + *len = 0; + } + + return H_SUCCESS; +} + +void spapr_hvcons_poll(struct kvm *kvm) +{ + if (term_readable(0)) { + /* + * We can inject an IRQ to guest here if we want. The guest + * will happily poll, though, so not required. + */ + } +} + +void spapr_hvcons_init(void) +{ + spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char); + spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char); +} diff --git a/tools/kvm/powerpc/spapr_hvcons.h b/tools/kvm/powerpc/spapr_hvcons.h new file mode 100644 index 000000000000..d3e4414a2951 --- /dev/null +++ b/tools/kvm/powerpc/spapr_hvcons.h @@ -0,0 +1,19 @@ +/* + * SPAPR HV console + * + * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef spapr_hvcons_H +#define spapr_hvcons_H + +#include "kvm/kvm.h" + +void spapr_hvcons_init(void); +void spapr_hvcons_poll(struct kvm *kvm); + +#endif diff --git a/tools/kvm/powerpc/spapr_pci.c b/tools/kvm/powerpc/spapr_pci.c new file mode 100644 index 000000000000..ed4b9ab52a7c --- /dev/null +++ b/tools/kvm/powerpc/spapr_pci.c @@ -0,0 +1,427 @@ +/* + * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes + * for enumerated devices. + * + * Borrowed heavily from QEMU's spapr_pci.c, + * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation. + * Copyright (c) 2011 David Gibson, IBM Corporation. 
+ * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "spapr_pci.h" +#include "kvm/devices.h" +#include "kvm/fdt.h" +#include "kvm/util.h" +#include "kvm/pci.h" + +#include <linux/pci_regs.h> +#include <linux/byteorder.h> + + +/* #define DEBUG_PHB yes */ +#ifdef DEBUG_PHB +#define phb_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define phb_dprintf(fmt, ...) \ + do { } while (0) +#endif + +static const uint32_t bars[] = { + PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1, + PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3, + PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5 + /*, PCI_ROM_ADDRESS*/ +}; + +#define PCI_NUM_REGIONS 7 + +/* Macros to operate with address in OF binding to PCI */ +#define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p)) +#define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */ +#define b_p(x) b_x((x), 30, 1) /* 1 if prefetchable */ +#define b_t(x) b_x((x), 29, 1) /* 1 if the address is aliased */ +#define b_ss(x) b_x((x), 24, 2) /* the space code */ +#define b_bbbbbbbb(x) b_x((x), 16, 8) /* bus number */ +#define b_ddddd(x) b_x((x), 11, 5) /* device number */ +#define b_fff(x) b_x((x), 8, 3) /* function number */ +#define b_rrrrrrrr(x) b_x((x), 0, 8) /* register number */ + +#define SS_M64 3 +#define SS_M32 2 +#define SS_IO 1 +#define SS_CONFIG 0 + + +static struct spapr_phb phb; + + +static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint32_t val = 0; + uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2); + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = 
pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 3); + + if (buid != phb.buid || !dev || (size > 4)) { + phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n", + buid, addr.w, size); + + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + pci__config_rd(vcpu->kvm, addr, &val, size); + /* It appears this wants a byteswapped result... */ + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val>>16); + break; + case 1: + val = val >> 24; + break; + } + phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, val); +} + +static void rtas_read_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint32_t val; + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 1); + + if (!dev || (size > 4)) { + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + pci__config_rd(vcpu->kvm, addr, &val, size); + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val>>16); /* We're yuck-endian. 
*/ + break; + case 1: + val = val >> 24; + break; + } + phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val); + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, val); +} + +static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2); + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 3); + uint32_t val = rtas_ld(vcpu->kvm, args, 4); + + if (buid != phb.buid || !dev || (size > 4)) { + phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n", + buid, addr.w, size, val); + + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + switch (size) { + case 4: + val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val) << 16; + break; + case 1: + val = val >> 24; + break; + } + pci__config_wr(vcpu->kvm, addr, &val, size); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +static void rtas_write_pci_config(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) }; + struct pci_device_header *dev = pci__find_dev(addr.device_number); + uint32_t size = rtas_ld(vcpu->kvm, args, 1); + uint32_t val = rtas_ld(vcpu->kvm, args, 2); + + if (!dev || (size > 4)) { + rtas_st(vcpu->kvm, rets, 0, -1); + return; + } + + phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n", + addr.w, size, addr.bus_number, addr.device_number, addr.function_number, + addr.register_number, val); + switch (size) { + case 4: + 
val = le32_to_cpu(val); + break; + case 2: + val = le16_to_cpu(val) << 16; + break; + case 1: + val = val >> 24; + break; + } + pci__config_wr(vcpu->kvm, addr, &val, size); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +void spapr_create_phb(struct kvm *kvm, + const char *busname, uint64_t buid, + uint64_t mem_win_addr, uint64_t mem_win_size, + uint64_t io_win_addr, uint64_t io_win_size) +{ + /* + * Since kvmtool doesn't really have any concept of buses etc., + * there's nothing to register here. Just register RTAS. + */ + spapr_rtas_register("read-pci-config", rtas_read_pci_config); + spapr_rtas_register("write-pci-config", rtas_write_pci_config); + spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config); + spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config); + + phb.buid = buid; + phb.mem_addr = mem_win_addr; + phb.mem_size = mem_win_size; + phb.io_addr = io_win_addr; + phb.io_size = io_win_size; + + kvm->arch.phb = &phb; +} + +static uint32_t bar_to_ss(unsigned long bar) +{ + if ((bar & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_IO) + return SS_IO; + else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) + return SS_M64; + else + return SS_M32; +} + +static unsigned long bar_to_addr(unsigned long bar) +{ + if ((bar & PCI_BASE_ADDRESS_SPACE) == + PCI_BASE_ADDRESS_SPACE_IO) + return bar & PCI_BASE_ADDRESS_IO_MASK; + else + return bar & PCI_BASE_ADDRESS_MEM_MASK; +} + +int spapr_populate_pci_devices(struct kvm *kvm, + uint32_t xics_phandle, + void *fdt) +{ + int bus_off, node_off = 0, devid, fn, i, n, devices; + struct device_header *dev_hdr; + char nodename[256]; + struct { + uint32_t hi; + uint64_t addr; + uint64_t size; + } __attribute__((packed)) reg[PCI_NUM_REGIONS + 1], + assigned_addresses[PCI_NUM_REGIONS]; + uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) }; + struct { + uint32_t hi; + uint64_t child; + uint64_t parent; + uint64_t size; + } __attribute__((packed)) ranges[] = { + { + cpu_to_be32(b_ss(1)), cpu_to_be64(0), + 
cpu_to_be64(phb.io_addr), + cpu_to_be64(phb.io_size), + }, + { + cpu_to_be32(b_ss(2)), cpu_to_be64(0), + cpu_to_be64(phb.mem_addr), + cpu_to_be64(phb.mem_size), + }, + }; + uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 }; + uint32_t interrupt_map_mask[] = { + cpu_to_be32(b_ddddd(-1)|b_fff(-1)), 0x0, 0x0, 0x0}; + uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7]; + + /* Start populating the FDT */ + sprintf(nodename, "pci@%" PRIx64, phb.buid); + bus_off = fdt_add_subnode(fdt, 0, nodename); + if (bus_off < 0) { + die("error making bus subnode, %s\n", fdt_strerror(bus_off)); + return bus_off; + } + + /* Write PHB properties */ + _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci")); + _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB")); + _FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3)); + _FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2)); + _FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1)); + _FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0)); + _FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range))); + _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges))); + _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg))); + _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask", + &interrupt_map_mask, sizeof(interrupt_map_mask))); + + /* Populate PCI devices and allocate IRQs */ + devices = 0; + dev_hdr = device__first_dev(DEVICE_BUS_PCI); + while (dev_hdr) { + uint32_t *irqmap = interrupt_map[devices]; + struct pci_device_header *hdr = dev_hdr->data; + + if (!hdr) + continue; + + devid = dev_hdr->dev_num; + fn = 0; /* kvmtool doesn't yet do multifunction devices */ + + sprintf(nodename, "pci@%u,%u", devid, fn); + + /* Allocate interrupt from the map */ + if (devid > SPAPR_PCI_NUM_LSI) { + die("Unexpected behaviour in spapr_populate_pci_devices," + "wrong devid %u\n", devid); + } + irqmap[0] = cpu_to_be32(b_ddddd(devid)|b_fff(fn)); + irqmap[1] = 0; + irqmap[2] = 0; + 
irqmap[3] = 0; + irqmap[4] = cpu_to_be32(xics_phandle); + /* + * This is nasty; the PCI devs are set up such that their own + * header's irq_line indicates the direct XICS IRQ number to + * use. There REALLY needs to be a hierarchical system in place + * to 'raise' an IRQ on the bridge which indexes/looks up which + * XICS IRQ to fire. + */ + irqmap[5] = cpu_to_be32(hdr->irq_line); + irqmap[6] = cpu_to_be32(0x8); + + /* Add node to FDT */ + node_off = fdt_add_subnode(fdt, bus_off, nodename); + if (node_off < 0) { + die("error making node subnode, %s\n", fdt_strerror(bus_off)); + return node_off; + } + + _FDT(fdt_setprop_cell(fdt, node_off, "vendor-id", + le16_to_cpu(hdr->vendor_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "device-id", + le16_to_cpu(hdr->device_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "revision-id", + hdr->revision_id)); + _FDT(fdt_setprop_cell(fdt, node_off, "class-code", + hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16))); + _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id", + le16_to_cpu(hdr->subsys_id))); + _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id", + le16_to_cpu(hdr->subsys_vendor_id))); + + /* Config space region comes first */ + reg[0].hi = cpu_to_be32( + b_n(0) | + b_p(0) | + b_t(0) | + b_ss(SS_CONFIG) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn)); + reg[0].addr = 0; + reg[0].size = 0; + + n = 0; + /* Six BARs, no ROM supported, addresses are 32bit */ + for (i = 0; i < 6; ++i) { + if (0 == hdr->bar[i]) { + continue; + } + + reg[n+1].hi = cpu_to_be32( + b_n(0) | + b_p(0) | + b_t(0) | + b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn) | + b_rrrrrrrr(bars[i])); + reg[n+1].addr = 0; + reg[n+1].size = cpu_to_be64(hdr->bar_size[i]); + + assigned_addresses[n].hi = cpu_to_be32( + b_n(1) | + b_p(0) | + b_t(0) | + b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) | + b_bbbbbbbb(0) | + b_ddddd(devid) | + b_fff(fn) | + b_rrrrrrrr(bars[i])); + + /* + * Writing zeroes to 
assigned_addresses causes the guest kernel to + * reassign BARs + */ + assigned_addresses[n].addr = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i]))); + assigned_addresses[n].size = reg[n+1].size; + + ++n; + } + _FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1))); + _FDT(fdt_setprop(fdt, node_off, "assigned-addresses", + assigned_addresses, + sizeof(assigned_addresses[0])*(n))); + _FDT(fdt_setprop_cell(fdt, node_off, "interrupts", + hdr->irq_pin)); + + /* We don't set ibm,dma-window property as we don't have an IOMMU. */ + + ++devices; + dev_hdr = device__next_dev(dev_hdr); + } + + /* Write interrupt map */ + _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, + devices * sizeof(interrupt_map[0]))); + + return 0; +} diff --git a/tools/kvm/powerpc/spapr_pci.h b/tools/kvm/powerpc/spapr_pci.h new file mode 100644 index 000000000000..48b221c5dc73 --- /dev/null +++ b/tools/kvm/powerpc/spapr_pci.h @@ -0,0 +1,57 @@ +/* + * SPAPR PHB definitions + * + * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef SPAPR_PCI_H +#define SPAPR_PCI_H + +#include "kvm/kvm.h" +#include "spapr.h" +#include <inttypes.h> + +/* With XICS, we can easily accomodate 1 IRQ per PCI device. 
*/ + +#define SPAPR_PCI_NUM_LSI 256 + +struct spapr_phb { + uint64_t buid; + uint64_t mem_addr; + uint64_t mem_size; + uint64_t io_addr; + uint64_t io_size; +}; + +void spapr_create_phb(struct kvm *kvm, + const char *busname, uint64_t buid, + uint64_t mem_win_addr, uint64_t mem_win_size, + uint64_t io_win_addr, uint64_t io_win_size); + +int spapr_populate_pci_devices(struct kvm *kvm, + uint32_t xics_phandle, + void *fdt); + +static inline bool spapr_phb_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) && + (phys_addr < SPAPR_PCI_IO_WIN_ADDR + + SPAPR_PCI_IO_WIN_SIZE)) { + return kvm__emulate_io(kvm, phys_addr - SPAPR_PCI_IO_WIN_ADDR, + data, is_write ? KVM_EXIT_IO_OUT : + KVM_EXIT_IO_IN, + len, 1); + } else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) && + (phys_addr < SPAPR_PCI_MEM_WIN_ADDR + + SPAPR_PCI_MEM_WIN_SIZE)) { + return kvm__emulate_mmio(kvm, phys_addr - SPAPR_PCI_MEM_WIN_ADDR, + data, len, is_write); + } + return false; +} + +#endif diff --git a/tools/kvm/powerpc/spapr_rtas.c b/tools/kvm/powerpc/spapr_rtas.c new file mode 100644 index 000000000000..c81d82b3857c --- /dev/null +++ b/tools/kvm/powerpc/spapr_rtas.c @@ -0,0 +1,233 @@ +/* + * SPAPR base RTAS calls + * + * Borrowed heavily from QEMU's spapr_rtas.c + * Copyright (c) 2010-2011 David Gibson, IBM Corporation. + * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "libfdt.h" + +#include "spapr.h" + +#include <stdio.h> +#include <assert.h> + +#define TOKEN_BASE 0x2000 +#define TOKEN_MAX 0x100 + +#define RTAS_CONSOLE + +static struct rtas_call { + const char *name; + spapr_rtas_fn fn; +} rtas_table[TOKEN_MAX]; + +struct rtas_call *rtas_next = rtas_table; + + +static void rtas_display_character(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + char c = rtas_ld(vcpu->kvm, args, 0); + term_putc(&c, 1, 0); + rtas_st(vcpu->kvm, rets, 0, 0); +} + +#ifdef RTAS_CONSOLE +static void rtas_put_term_char(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + char c = rtas_ld(vcpu->kvm, args, 0); + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV) + term_putc(&c, 1, 0); + + rtas_st(vcpu->kvm, rets, 0, 0); +} + +static void rtas_get_term_char(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + int c; + + if (vcpu->kvm->cfg.active_console == CONSOLE_HV && term_readable(0) && + (c = term_getc(vcpu->kvm, 0)) >= 0) { + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, c); + } else { + rtas_st(vcpu->kvm, rets, 0, -2); + } +} +#endif + +static void rtas_get_time_of_day(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct tm tm; + time_t tnow; + + if (nret != 8) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + tnow = time(NULL); + /* Guest time is currently not offset in any way. 
*/ + gmtime_r(&tnow, &tm); + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ + rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900); + rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1); + rtas_st(vcpu->kvm, rets, 3, tm.tm_mday); + rtas_st(vcpu->kvm, rets, 4, tm.tm_hour); + rtas_st(vcpu->kvm, rets, 5, tm.tm_min); + rtas_st(vcpu->kvm, rets, 6, tm.tm_sec); + rtas_st(vcpu->kvm, rets, 7, 0); +} + +static void rtas_set_time_of_day(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + pr_warning("%s called; TOD set ignored.\n", __FUNCTION__); +} + +static void rtas_power_off(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + if (nargs != 2 || nret != 1) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + kvm_cpu__reboot(vcpu->kvm); +} + +static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + if (nargs != 1 || nret != 2) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* + * Can read id = rtas_ld(vcpu->kvm, args, 0), but + * we currently start all CPUs. So just return true. 
+ */ + rtas_st(vcpu->kvm, rets, 0, 0); + rtas_st(vcpu->kvm, rets, 1, 2); +} + +static void rtas_start_cpu(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, + target_ulong args, + uint32_t nret, target_ulong rets) +{ + die(__FUNCTION__); +} + +target_ulong spapr_rtas_call(struct kvm_cpu *vcpu, + uint32_t token, uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + if ((token >= TOKEN_BASE) + && ((token - TOKEN_BASE) < TOKEN_MAX)) { + struct rtas_call *call = rtas_table + (token - TOKEN_BASE); + + if (call->fn) { + call->fn(vcpu, token, nargs, args, nret, rets); + return H_SUCCESS; + } + } + + /* + * HACK: Some Linux early debug code uses RTAS display-character, + * but assumes the token value is 0xa (which it is on some real + * machines) without looking it up in the device tree. This + * special case makes this work + */ + if (token == 0xa) { + rtas_display_character(vcpu, 0xa, nargs, args, nret, rets); + return H_SUCCESS; + } + + hcall_dprintf("Unknown RTAS token 0x%x\n", token); + rtas_st(vcpu->kvm, rets, 0, -3); + return H_PARAMETER; +} + +void spapr_rtas_register(const char *name, spapr_rtas_fn fn) +{ + assert(rtas_next < (rtas_table + TOKEN_MAX)); + + rtas_next->name = name; + rtas_next->fn = fn; + + rtas_next++; +} + +/* + * This is called from the context of an open /rtas node, in order to add + * properties for the rtas call tokens. 
+ */ +int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt) +{ + int ret; + int i; + + for (i = 0; i < TOKEN_MAX; i++) { + struct rtas_call *call = &rtas_table[i]; + + if (!call->fn) { + continue; + } + + ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE); + + if (ret < 0) { + pr_warning("Couldn't add rtas token for %s: %s\n", + call->name, fdt_strerror(ret)); + return ret; + } + + } + return 0; +} + +void register_core_rtas(void) +{ + spapr_rtas_register("display-character", rtas_display_character); + spapr_rtas_register("get-time-of-day", rtas_get_time_of_day); + spapr_rtas_register("set-time-of-day", rtas_set_time_of_day); + spapr_rtas_register("power-off", rtas_power_off); + spapr_rtas_register("query-cpu-stopped-state", + rtas_query_cpu_stopped_state); + spapr_rtas_register("start-cpu", rtas_start_cpu); +#ifdef RTAS_CONSOLE + /* These are unused: We do console I/O via hcalls, not rtas. */ + spapr_rtas_register("put-term-char", rtas_put_term_char); + spapr_rtas_register("get-term-char", rtas_get_term_char); +#endif +} diff --git a/tools/kvm/powerpc/xics.c b/tools/kvm/powerpc/xics.c new file mode 100644 index 000000000000..d4b5caae8af7 --- /dev/null +++ b/tools/kvm/powerpc/xics.c @@ -0,0 +1,522 @@ +/* + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics + * + * Borrowed heavily from QEMU's xics.c, + * Copyright (c) 2010,2011 David Gibson, IBM Corporation. + * + * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "spapr.h" +#include "xics.h" +#include "kvm/util.h" + +#include <stdio.h> +#include <malloc.h> + +#define XICS_NUM_IRQS 1024 + + +/* #define DEBUG_XICS yes */ +#ifdef DEBUG_XICS +#define xics_dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define xics_dprintf(fmt, ...) 
\ + do { } while (0) +#endif + +/* + * ICP: Presentation layer + */ + +struct icp_server_state { + uint32_t xirr; + uint8_t pending_priority; + uint8_t mfrr; + struct kvm_cpu *cpu; +}; + +#define XICS_IRQ_OFFSET 16 +#define XISR_MASK 0x00ffffff +#define CPPR_MASK 0xff000000 + +#define XISR(ss) (((ss)->xirr) & XISR_MASK) +#define CPPR(ss) (((ss)->xirr) >> 24) + +struct ics_state; + +struct icp_state { + unsigned long nr_servers; + struct icp_server_state *ss; + struct ics_state *ics; +}; + +static void ics_reject(struct ics_state *ics, int nr); +static void ics_resend(struct ics_state *ics); +static void ics_eoi(struct ics_state *ics, int nr); + +static inline void cpu_irq_raise(struct kvm_cpu *vcpu) +{ + xics_dprintf("INT1[%p]\n", vcpu); + kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1); +} + +static inline void cpu_irq_lower(struct kvm_cpu *vcpu) +{ + xics_dprintf("INT0[%p]\n", vcpu); + kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0); +} + +static void icp_check_ipi(struct icp_state *icp, int server) +{ + struct icp_server_state *ss = icp->ss + server; + + if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) { + return; + } + + if (XISR(ss)) { + ics_reject(icp->ics, XISR(ss)); + } + + ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI; + ss->pending_priority = ss->mfrr; + cpu_irq_raise(ss->cpu); +} + +static void icp_resend(struct icp_state *icp, int server) +{ + struct icp_server_state *ss = icp->ss + server; + + if (ss->mfrr < CPPR(ss)) { + icp_check_ipi(icp, server); + } + ics_resend(icp->ics); +} + +static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr) +{ + struct icp_server_state *ss = icp->ss + server; + uint8_t old_cppr; + uint32_t old_xisr; + + old_cppr = CPPR(ss); + ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24); + + if (cppr < old_cppr) { + if (XISR(ss) && (cppr <= ss->pending_priority)) { + old_xisr = XISR(ss); + ss->xirr &= ~XISR_MASK; /* Clear XISR */ + cpu_irq_lower(ss->cpu); + ics_reject(icp->ics, old_xisr); + } + } else { + if (!XISR(ss)) { + 
icp_resend(icp, server); + } + } +} + +static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr) +{ + struct icp_server_state *ss = icp->ss + nr; + + ss->mfrr = mfrr; + if (mfrr < CPPR(ss)) { + icp_check_ipi(icp, nr); + } +} + +static uint32_t icp_accept(struct icp_server_state *ss) +{ + uint32_t xirr; + + cpu_irq_lower(ss->cpu); + xirr = ss->xirr; + ss->xirr = ss->pending_priority << 24; + return xirr; +} + +static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr) +{ + struct icp_server_state *ss = icp->ss + server; + + ics_eoi(icp->ics, xirr & XISR_MASK); + /* Send EOI -> ICS */ + ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK); + if (!XISR(ss)) { + icp_resend(icp, server); + } +} + +static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority) +{ + struct icp_server_state *ss = icp->ss + server; + xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority); + if ((priority >= CPPR(ss)) + || (XISR(ss) && (ss->pending_priority <= priority))) { + xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n", + nr, CPPR(ss), XISR(ss), ss->pending_priority, priority); + ics_reject(icp->ics, nr); + } else { + if (XISR(ss)) { + xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n", + nr, CPPR(ss), XISR(ss), ss->pending_priority, priority); + ics_reject(icp->ics, XISR(ss)); + } + ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK); + ss->pending_priority = priority; + cpu_irq_raise(ss->cpu); + } +} + +/* + * ICS: Source layer + */ + +struct ics_irq_state { + int server; + uint8_t priority; + uint8_t saved_priority; + int rejected:1; + int masked_pending:1; +}; + +struct ics_state { + unsigned int nr_irqs; + unsigned int offset; + struct ics_irq_state *irqs; + struct icp_state *icp; +}; + +static int ics_valid_irq(struct ics_state *ics, uint32_t nr) +{ + return (nr >= ics->offset) + && (nr < (ics->offset + ics->nr_irqs)); +} + +static void ics_set_irq_msi(struct ics_state 
*ics, int srcno, int val) +{ + struct ics_irq_state *irq = ics->irqs + srcno; + + if (val) { + if (irq->priority == 0xff) { + xics_dprintf(" irq pri ff, masked pending\n"); + irq->masked_pending = 1; + } else { + icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority); + } + } +} + +static void ics_reject_msi(struct ics_state *ics, int nr) +{ + struct ics_irq_state *irq = ics->irqs + nr - ics->offset; + + irq->rejected = 1; +} + +static void ics_resend_msi(struct ics_state *ics) +{ + unsigned int i; + + for (i = 0; i < ics->nr_irqs; i++) { + struct ics_irq_state *irq = ics->irqs + i; + + /* FIXME: filter by server#? */ + if (irq->rejected) { + irq->rejected = 0; + if (irq->priority != 0xff) { + icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority); + } + } + } +} + +static void ics_write_xive_msi(struct ics_state *ics, int nr, int server, + uint8_t priority) +{ + struct ics_irq_state *irq = ics->irqs + nr - ics->offset; + + irq->server = server; + irq->priority = priority; + xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority); + + if (!irq->masked_pending || (priority == 0xff)) { + return; + } + + irq->masked_pending = 0; + icp_irq(ics->icp, server, nr, priority); +} + +static void ics_reject(struct ics_state *ics, int nr) +{ + ics_reject_msi(ics, nr); +} + +static void ics_resend(struct ics_state *ics) +{ + ics_resend_msi(ics); +} + +static void ics_eoi(struct ics_state *ics, int nr) +{ +} + +/* + * Exported functions + */ + +static int allocated_irqnum = XICS_IRQ_OFFSET; + +/* + * xics_alloc_irqnum(): This is hacky. The problem boils down to the PCI device + * code which just calls kvm__irq_line( .. pcidev->pci_hdr.irq_line ..) at will. + * Each PCI device's IRQ line is allocated by irq__register_device() (which + * allocates an IRQ AND allocates a.. PCI device num..). 
+ * + * In future I'd like to at least mimic some kind of 'upstream IRQ controller' + * whereby PCI devices let their PHB know when they want to IRQ, and that + * percolates up. + * + * For now, allocate a REAL xics irq number and (via irq__register_device) push + * that into the config space. 8 bits only though! + */ +int xics_alloc_irqnum(void) +{ + int irq = allocated_irqnum++; + + if (irq > 255) + die("Huge numbers of IRQs aren't supported with the daft kvmtool IRQ system."); + + return irq; +} + +static target_ulong h_cppr(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong cppr = args[0]; + + xics_dprintf("h_cppr(%lx)\n", cppr); + icp_set_cppr(vcpu->kvm->arch.icp, vcpu->cpu_id, cppr); + return H_SUCCESS; +} + +static target_ulong h_ipi(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong server = args[0]; + target_ulong mfrr = args[1]; + + xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr); + if (server >= vcpu->kvm->arch.icp->nr_servers) { + return H_PARAMETER; + } + + icp_set_mfrr(vcpu->kvm->arch.icp, server, mfrr); + return H_SUCCESS; +} + +static target_ulong h_xirr(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + uint32_t xirr = icp_accept(vcpu->kvm->arch.icp->ss + vcpu->cpu_id); + + xics_dprintf("h_xirr() = %x\n", xirr); + args[0] = xirr; + return H_SUCCESS; +} + +static target_ulong h_eoi(struct kvm_cpu *vcpu, + target_ulong opcode, target_ulong *args) +{ + target_ulong xirr = args[0]; + + xics_dprintf("h_eoi(%lx)\n", xirr); + icp_eoi(vcpu->kvm->arch.icp, vcpu->cpu_id, xirr); + return H_SUCCESS; +} + +static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr, server, priority; + + if ((nargs != 3) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + server = rtas_ld(vcpu->kvm, 
args, 1); + priority = rtas_ld(vcpu->kvm, args, 2); + + xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority); + if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers) + || (priority > 0xff)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + ics_write_xive_msi(ics, nr, server, priority); + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 3)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ + rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server); + rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority); +} + +static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. */ + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token, + uint32_t nargs, target_ulong args, + uint32_t nret, target_ulong rets) +{ + struct ics_state *ics = vcpu->kvm->arch.icp->ics; + uint32_t nr; + + if ((nargs != 1) || (nret != 1)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + nr = rtas_ld(vcpu->kvm, args, 0); + + if (!ics_valid_irq(ics, nr)) { + rtas_st(vcpu->kvm, rets, 0, -3); + return; + } + + /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. 
*/ + + rtas_st(vcpu->kvm, rets, 0, 0); /* Success */ +} + +static int xics_init(struct kvm *kvm) +{ + int max_server_num; + unsigned int i; + struct icp_state *icp; + struct ics_state *ics; + int j; + + max_server_num = kvm->nrcpus; + + icp = malloc(sizeof(*icp)); + icp->nr_servers = max_server_num + 1; + icp->ss = malloc(icp->nr_servers * sizeof(struct icp_server_state)); + + for (i = 0; i < icp->nr_servers; i++) { + icp->ss[i].xirr = 0; + icp->ss[i].pending_priority = 0; + icp->ss[i].cpu = 0; + icp->ss[i].mfrr = 0xff; + } + + /* + * icp->ss[env->cpu_index].cpu is set by CPUs calling in to + * xics_cpu_register(). + */ + + ics = malloc(sizeof(*ics)); + ics->nr_irqs = XICS_NUM_IRQS; + ics->offset = XICS_IRQ_OFFSET; + ics->irqs = malloc(ics->nr_irqs * sizeof(struct ics_irq_state)); + + icp->ics = ics; + ics->icp = icp; + + for (i = 0; i < ics->nr_irqs; i++) { + ics->irqs[i].server = 0; + ics->irqs[i].priority = 0xff; + ics->irqs[i].saved_priority = 0xff; + ics->irqs[i].rejected = 0; + ics->irqs[i].masked_pending = 0; + } + + spapr_register_hypercall(H_CPPR, h_cppr); + spapr_register_hypercall(H_IPI, h_ipi); + spapr_register_hypercall(H_XIRR, h_xirr); + spapr_register_hypercall(H_EOI, h_eoi); + + spapr_rtas_register("ibm,set-xive", rtas_set_xive); + spapr_rtas_register("ibm,get-xive", rtas_get_xive); + spapr_rtas_register("ibm,int-off", rtas_int_off); + spapr_rtas_register("ibm,int-on", rtas_int_on); + + for (j = 0; j < kvm->nrcpus; j++) { + struct kvm_cpu *vcpu = kvm->cpus[j]; + + if (vcpu->cpu_id >= icp->nr_servers) + die("Invalid server number for cpuid %ld\n", vcpu->cpu_id); + + icp->ss[vcpu->cpu_id].cpu = vcpu; + } + + kvm->arch.icp = icp; + + return 0; +} +base_init(xics_init); + + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + /* + * Route event to ICS, which routes to ICP, which eventually does a + * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1) + */ + xics_dprintf("Raising IRQ %d -> %d\n", irq, level); + ics_set_irq_msi(kvm->arch.icp->ics, irq - 
kvm->arch.icp->ics->offset, level); +} diff --git a/tools/kvm/powerpc/xics.h b/tools/kvm/powerpc/xics.h new file mode 100644 index 000000000000..d5bc6f92fa82 --- /dev/null +++ b/tools/kvm/powerpc/xics.h @@ -0,0 +1,18 @@ +/* + * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics + * + * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#ifndef XICS_H +#define XICS_H + +#define XICS_IPI 0x2 + +int xics_alloc_irqnum(void); + +#endif diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c new file mode 100644 index 000000000000..07dd9d541065 --- /dev/null +++ b/tools/kvm/symbol.c @@ -0,0 +1,133 @@ +#include "kvm/symbol.h" + +#include "kvm/kvm.h" + +#include <linux/err.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <bfd.h> + +static bfd *abfd; + +int symbol_init(struct kvm *kvm) +{ + int ret = 0; + + if (!kvm->vmlinux) + return 0; + + bfd_init(); + + abfd = bfd_openr(kvm->vmlinux, NULL); + if (abfd == NULL) { + bfd_error_type err = bfd_get_error(); + + switch (err) { + case bfd_error_no_memory: + ret = -ENOMEM; + break; + case bfd_error_invalid_target: + ret = -EINVAL; + break; + default: + ret = -EFAULT; + break; + } + } + + return ret; +} +late_init(symbol_init); + +static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name) +{ + int i, ret; + + ret = -ENOENT; + + for (i = 0; i < nr_symbols; i++) { + asymbol *symbol = symbols[i]; + + if (!strcmp(bfd_asymbol_name(symbol), symbol_name)) + return symbol; + } + + return ERR_PTR(ret); +} + +char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size) +{ + const char *filename; + bfd_vma sym_offset; + bfd_vma sym_start; + asection *section; + unsigned int line; + const char *func; + long symtab_size; + asymbol *symbol; + asymbol **syms; + int 
nr_syms, ret; + + ret = -ENOENT; + if (!abfd) + goto not_found; + + if (!bfd_check_format(abfd, bfd_object)) + goto not_found; + + symtab_size = bfd_get_symtab_upper_bound(abfd); + if (!symtab_size) + goto not_found; + + ret = -ENOMEM; + syms = malloc(symtab_size); + if (!syms) + goto not_found; + + nr_syms = bfd_canonicalize_symtab(abfd, syms); + + ret = -ENOENT; + section = bfd_get_section_by_name(abfd, ".debug_aranges"); + if (!section) + goto not_found; + + if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line)) + goto not_found; + + if (!func) + goto not_found; + + symbol = lookup(syms, nr_syms, func); + if (IS_ERR(symbol)) + goto not_found; + + sym_start = bfd_asymbol_value(symbol); + + sym_offset = addr - sym_start; + + snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line); + + sym[size - 1] = '\0'; + + free(syms); + + return sym; + +not_found: + return ERR_PTR(ret); +} + +int symbol_exit(struct kvm *kvm) +{ + bfd_boolean ret = TRUE; + + if (abfd) + ret = bfd_close(abfd); + + if (ret == TRUE) + return 0; + + return -EFAULT; +} +late_exit(symbol_exit); diff --git a/tools/kvm/term.c b/tools/kvm/term.c new file mode 100644 index 000000000000..4413450f57d2 --- /dev/null +++ b/tools/kvm/term.c @@ -0,0 +1,171 @@ +#include <poll.h> +#include <stdbool.h> +#include <termios.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/uio.h> +#include <signal.h> +#include <pty.h> +#include <utmp.h> + +#include "kvm/read-write.h" +#include "kvm/term.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" + +#define TERM_FD_IN 0 +#define TERM_FD_OUT 1 + +static struct termios orig_term; + +int term_escape_char = 0x01; /* ctrl-a is used for escape */ +bool term_got_escape = false; + +int term_fds[4][2]; + +int term_getc(struct kvm *kvm, int term) +{ + unsigned char c; + + if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0) + return -1; + + if (term_got_escape) { + term_got_escape = false; + if (c 
== 'x') + kvm_cpu__reboot(kvm); + if (c == term_escape_char) + return c; + } + + if (c == term_escape_char) { + term_got_escape = true; + return -1; + } + + return c; +} + +int term_putc(char *addr, int cnt, int term) +{ + int ret; + + while (cnt--) { + ret = write(term_fds[term][TERM_FD_OUT], addr++, 1); + if (ret < 0) + return 0; + } + + return cnt; +} + +int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term) +{ + int c; + + c = term_getc(kvm, term); + + if (c < 0) + return 0; + + *((char *)iov[TERM_FD_IN].iov_base) = (char)c; + + return sizeof(char); +} + +int term_putc_iov(struct iovec *iov, int iovcnt, int term) +{ + return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt); +} + +bool term_readable(int term) +{ + struct pollfd pollfd = (struct pollfd) { + .fd = term_fds[term][TERM_FD_IN], + .events = POLLIN, + .revents = 0, + }; + + return poll(&pollfd, 1, 0) > 0; +} + +static void term_cleanup(void) +{ + int i; + + for (i = 0; i < 4; i++) + tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term); +} + +static void term_sig_cleanup(int sig) +{ + term_cleanup(); + signal(sig, SIG_DFL); + raise(sig); +} + +void term_set_tty(int term) +{ + struct termios orig_term; + int master, slave; + char new_pty[PATH_MAX]; + + if (tcgetattr(STDIN_FILENO, &orig_term) < 0) + die("unable to save initial standard input settings"); + + orig_term.c_lflag &= ~(ICANON | ECHO | ISIG); + + if (openpty(&master, &slave, new_pty, &orig_term, NULL) < 0) + return; + + close(slave); + + pr_info("Assigned terminal %d to pty %s\n", term, new_pty); + + term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master; +} + +int tty_parser(const struct option *opt, const char *arg, int unset) +{ + int tty = atoi(arg); + + term_set_tty(tty); + + return 0; +} + +int term_init(struct kvm *kvm) +{ + struct termios term; + int i, r; + + r = tcgetattr(STDIN_FILENO, &orig_term); + if (r < 0) { + pr_warning("unable to save initial standard input settings"); + return r; + } + + + 
term = orig_term; + term.c_lflag &= ~(ICANON | ECHO | ISIG); + tcsetattr(STDIN_FILENO, TCSANOW, &term); + + for (i = 0; i < 4; i++) + if (term_fds[i][TERM_FD_IN] == 0) { + term_fds[i][TERM_FD_IN] = STDIN_FILENO; + term_fds[i][TERM_FD_OUT] = STDOUT_FILENO; + } + + signal(SIGTERM, term_sig_cleanup); + atexit(term_cleanup); + + return 0; +} +dev_init(term_init); + +int term_exit(struct kvm *kvm) +{ + return 0; +} +dev_exit(term_exit); diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile new file mode 100644 index 000000000000..cad14ecbae1b --- /dev/null +++ b/tools/kvm/tests/Makefile @@ -0,0 +1,19 @@ +all: kernel pit boot + +kernel: + $(MAKE) -C kernel +.PHONY: kernel + +pit: + $(MAKE) -C pit +.PHONY: pit + +boot: + $(MAKE) -C boot +.PHONY: boot + +clean: + $(MAKE) -C kernel clean + $(MAKE) -C pit clean + $(MAKE) -C boot clean +.PHONY: clean diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile new file mode 100644 index 000000000000..40cba6847ccd --- /dev/null +++ b/tools/kvm/tests/boot/Makefile @@ -0,0 +1,13 @@ +NAME := init + +OBJ := $(NAME).o + +all: $(.o) + rm -rf rootfs + mkdir rootfs + gcc -static init.c -o rootfs/init + mkisofs rootfs > boot_test.iso + +clean: + rm -rf rootfs boot_test.iso +.PHONY: clean diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c new file mode 100644 index 000000000000..094f8ba37317 --- /dev/null +++ b/tools/kvm/tests/boot/init.c @@ -0,0 +1,11 @@ +#include <linux/reboot.h> +#include <unistd.h> + +int main(int argc, char *argv[]) +{ + puts("hello, KVM guest!\r"); + + reboot(LINUX_REBOOT_CMD_RESTART); + + return 0; +} diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore new file mode 100644 index 000000000000..d0cd209e5078 --- /dev/null +++ b/tools/kvm/tests/kernel/.gitignore @@ -0,0 +1,2 @@ +kernel.bin +kernel.elf diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile new file mode 100644 index 000000000000..c7dd8da33332 --- 
/dev/null +++ b/tools/kvm/tests/kernel/Makefile @@ -0,0 +1,20 @@ +NAME := kernel + +BIN := $(NAME).bin +ELF := $(NAME).elf +OBJ := $(NAME).o + +all: $(BIN) + +$(BIN): $(ELF) + objcopy -O binary $< $@ + +$(ELF): $(OBJ) + ld -Ttext=0x00 -nostdlib -static $< -o $@ + +%.o: %.S + gcc -nostdinc -c $< -o $@ + +clean: + rm -f $(BIN) $(ELF) $(OBJ) +.PHONY: clean diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README new file mode 100644 index 000000000000..2923777e6d65 --- /dev/null +++ b/tools/kvm/tests/kernel/README @@ -0,0 +1,16 @@ +Compiling +--------- + +You can simply type: + + $Â make + +to build a 16-bit binary that uses the i8086 instruction set. + +Disassembling +------------- + +Use the "-m i8086" command line option with objdump to make sure it knows we're +dealing with i8086 instruction set: + + $ objdump -d -m i8086 i8086.elf diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S new file mode 100644 index 000000000000..2824b64da657 --- /dev/null +++ b/tools/kvm/tests/kernel/kernel.S @@ -0,0 +1,8 @@ + .code16gcc + .text + .globl _start + .type _start, @function +_start: + # "This is probably the largest possible kernel that is bug free." 
-- Avi Kivity + 1: + jmp 1b diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore new file mode 100644 index 000000000000..43f0aa8d37f1 --- /dev/null +++ b/tools/kvm/tests/pit/.gitignore @@ -0,0 +1,2 @@ +*.bin +*.elf diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile new file mode 100644 index 000000000000..2fae9b2aec2f --- /dev/null +++ b/tools/kvm/tests/pit/Makefile @@ -0,0 +1,20 @@ +NAME := tick + +BIN := $(NAME).bin +ELF := $(NAME).elf +OBJ := $(NAME).o + +all: $(BIN) + +$(BIN): $(ELF) + objcopy -O binary $< $@ + +$(ELF): $(OBJ) + ld -Ttext=0x00 -nostdlib -static $< -o $@ + +%.o: %.S + gcc -nostdinc -c $< -o $@ + +clean: + rm -f $(BIN) $(ELF) $(OBJ) +.PHONY: clean diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README new file mode 100644 index 000000000000..2923777e6d65 --- /dev/null +++ b/tools/kvm/tests/pit/README @@ -0,0 +1,16 @@ +Compiling +--------- + +You can simply type: + + $Â make + +to build a 16-bit binary that uses the i8086 instruction set. 
+ +Disassembling +------------- + +Use the "-m i8086" command line option with objdump to make sure it knows we're +dealing with i8086 instruction set: + + $ objdump -d -m i8086 i8086.elf diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S new file mode 100644 index 000000000000..635dc8dd46bc --- /dev/null +++ b/tools/kvm/tests/pit/tick.S @@ -0,0 +1,101 @@ +#define IO_PIC 0x20 +#define IRQ_OFFSET 32 +#define IO_PIT 0x40 +#define TIMER_FREQ 1193182 +#define TIMER_DIV(x) ((TIMER_FREQ+(x)/2)/(x)) + +#define TEST_COUNT 0x0200 + + .code16gcc + .text + .globl _start + .type _start, @function +_start: +/* + * fill up noop handlers + */ + xorw %ax, %ax + xorw %di, %di + movw %ax, %es + movw $256, %cx +fill_noop_idt: + movw $noop_handler, %es:(%di) + movw %cs, %es:2(%di) + add $4, %di + loop fill_noop_idt + +set_idt: + movw $timer_isr, %es:(IRQ_OFFSET*4) + movw %cs, %es:(IRQ_OFFSET*4+2) + +set_pic: + # ICW1 + mov $0x11, %al + mov $(IO_PIC), %dx + out %al,%dx + # ICW2 + mov $(IRQ_OFFSET), %al + mov $(IO_PIC+1), %dx + out %al, %dx + # ICW3 + mov $0x00, %al + mov $(IO_PIC+1), %dx + out %al, %dx + # ICW4 + mov $0x3, %al + mov $(IO_PIC+1), %dx + out %al, %dx + +set_pit: + # set 8254 mode + mov $(IO_PIT+3), %dx + mov $0x34, %al + outb %al, %dx + # set 8254 freq 1KHz + mov $(IO_PIT), %dx + movb $(TIMER_DIV(1000) % 256), %al + outb %al, %dx + movb $(TIMER_DIV(1000) / 256), %al + outb %al, %dx + +enable_irq0: + mov $0xfe, %al + mov $(IO_PIC+1), %dx + out %al, %dx + sti +loop: + 1: + jmp 1b + +test_ok: + mov $0x3f8,%dx + cs lea msg2, %si + mov $(msg2_end-msg2), %cx + cs rep/outsb + + /* Reboot by using the i8042 reboot line */ + mov $0xfe, %al + outb %al, $0x64 + +timer_isr: + cli + pushaw + pushfw + mov $0x3f8,%dx + mov $0x2e, %al # . 
+ out %al,%dx + decw count + jz test_ok + popfw + popaw + iretw + +noop_handler: + iretw + +count: + .word TEST_COUNT + +msg2: + .asciz "\nTest OK\n" +msg2_end: diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c new file mode 100644 index 000000000000..9994490022b6 --- /dev/null +++ b/tools/kvm/ui/sdl.c @@ -0,0 +1,323 @@ +#include "kvm/sdl.h" + +#include "kvm/framebuffer.h" +#include "kvm/i8042.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/kvm-cpu.h" +#include "kvm/vesa.h" + +#include <SDL/SDL.h> +#include <pthread.h> +#include <signal.h> +#include <linux/err.h> + +#define FRAME_RATE 25 + +#define SCANCODE_UNKNOWN 0 +#define SCANCODE_NORMAL 1 +#define SCANCODE_ESCAPED 2 +#define SCANCODE_KEY_PAUSE 3 +#define SCANCODE_KEY_PRNTSCRN 4 + +struct set2_scancode { + u8 code; + u8 type; +}; + +#define DEFINE_SC(_code) {\ + .code = _code,\ + .type = SCANCODE_NORMAL,\ +} + +/* escaped scancodes */ +#define DEFINE_ESC(_code) {\ + .code = _code,\ + .type = SCANCODE_ESCAPED,\ +} + +static const struct set2_scancode const keymap[256] = { + [9] = DEFINE_SC(0x76), /* <esc> */ + [10] = DEFINE_SC(0x16), /* 1 */ + [11] = DEFINE_SC(0x1e), /* 2 */ + [12] = DEFINE_SC(0x26), /* 3 */ + [13] = DEFINE_SC(0x25), /* 4 */ + [14] = DEFINE_SC(0x2e), /* 5 */ + [15] = DEFINE_SC(0x36), /* 6 */ + [16] = DEFINE_SC(0x3d), /* 7 */ + [17] = DEFINE_SC(0x3e), /* 8 */ + [18] = DEFINE_SC(0x46), /* 9 */ + [19] = DEFINE_SC(0x45), /* 9 */ + [20] = DEFINE_SC(0x4e), /* - */ + [21] = DEFINE_SC(0x55), /* + */ + [22] = DEFINE_SC(0x66), /* <backspace> */ + [23] = DEFINE_SC(0x0d), /* <tab> */ + [24] = DEFINE_SC(0x15), /* q */ + [25] = DEFINE_SC(0x1d), /* w */ + [26] = DEFINE_SC(0x24), /* e */ + [27] = DEFINE_SC(0x2d), /* r */ + [28] = DEFINE_SC(0x2c), /* t */ + [29] = DEFINE_SC(0x35), /* y */ + [30] = DEFINE_SC(0x3c), /* u */ + [31] = DEFINE_SC(0x43), /* i */ + [32] = DEFINE_SC(0x44), /* o */ + [33] = DEFINE_SC(0x4d), /* p */ + [34] = DEFINE_SC(0x54), /* [ */ + [35] = DEFINE_SC(0x5b), /* ] */ + 
[36] = DEFINE_SC(0x5a), /* <enter> */ + [37] = DEFINE_SC(0x14), /* <left ctrl> */ + [38] = DEFINE_SC(0x1c), /* a */ + [39] = DEFINE_SC(0x1b), /* s */ + [40] = DEFINE_SC(0x23), /* d */ + [41] = DEFINE_SC(0x2b), /* f */ + [42] = DEFINE_SC(0x34), /* g */ + [43] = DEFINE_SC(0x33), /* h */ + [44] = DEFINE_SC(0x3b), /* j */ + [45] = DEFINE_SC(0x42), /* k */ + [46] = DEFINE_SC(0x4b), /* l */ + [47] = DEFINE_SC(0x4c), /* ; */ + [48] = DEFINE_SC(0x52), /* ' */ + [49] = DEFINE_SC(0x0e), /* ` */ + [50] = DEFINE_SC(0x12), /* <left shift> */ + [51] = DEFINE_SC(0x5d), /* \ */ + [52] = DEFINE_SC(0x1a), /* z */ + [53] = DEFINE_SC(0x22), /* x */ + [54] = DEFINE_SC(0x21), /* c */ + [55] = DEFINE_SC(0x2a), /* v */ + [56] = DEFINE_SC(0x32), /* b */ + [57] = DEFINE_SC(0x31), /* n */ + [58] = DEFINE_SC(0x3a), /* m */ + [59] = DEFINE_SC(0x41), /* < */ + [60] = DEFINE_SC(0x49), /* > */ + [61] = DEFINE_SC(0x4a), /* / */ + [62] = DEFINE_SC(0x59), /* <right shift> */ + [63] = DEFINE_SC(0x7c), /* keypad * */ + [64] = DEFINE_SC(0x11), /* <left alt> */ + [65] = DEFINE_SC(0x29), /* <space> */ + + [67] = DEFINE_SC(0x05), /* <F1> */ + [68] = DEFINE_SC(0x06), /* <F2> */ + [69] = DEFINE_SC(0x04), /* <F3> */ + [70] = DEFINE_SC(0x0c), /* <F4> */ + [71] = DEFINE_SC(0x03), /* <F5> */ + [72] = DEFINE_SC(0x0b), /* <F6> */ + [73] = DEFINE_SC(0x83), /* <F7> */ + [74] = DEFINE_SC(0x0a), /* <F8> */ + [75] = DEFINE_SC(0x01), /* <F9> */ + [76] = DEFINE_SC(0x09), /* <F10> */ + + [79] = DEFINE_SC(0x6c), /* keypad 7 */ + [80] = DEFINE_SC(0x75), /* keypad 8 */ + [81] = DEFINE_SC(0x7d), /* keypad 9 */ + [82] = DEFINE_SC(0x7b), /* keypad - */ + [83] = DEFINE_SC(0x6b), /* keypad 4 */ + [84] = DEFINE_SC(0x73), /* keypad 5 */ + [85] = DEFINE_SC(0x74), /* keypad 6 */ + [86] = DEFINE_SC(0x79), /* keypad + */ + [87] = DEFINE_SC(0x69), /* keypad 1 */ + [88] = DEFINE_SC(0x72), /* keypad 2 */ + [89] = DEFINE_SC(0x7a), /* keypad 3 */ + [90] = DEFINE_SC(0x70), /* keypad 0 */ + [91] = DEFINE_SC(0x71), /* keypad . 
*/ + + [94] = DEFINE_SC(0x61), /* <INT 1> */ + [95] = DEFINE_SC(0x78), /* <F11> */ + [96] = DEFINE_SC(0x07), /* <F12> */ + + [104] = DEFINE_ESC(0x5a), /* keypad <enter> */ + [105] = DEFINE_ESC(0x14), /* <right ctrl> */ + [106] = DEFINE_ESC(0x4a), /* keypad / */ + [108] = DEFINE_ESC(0x11), /* <right alt> */ + [110] = DEFINE_ESC(0x6c), /* <home> */ + [111] = DEFINE_ESC(0x75), /* <up> */ + [112] = DEFINE_ESC(0x7d), /* <pag up> */ + [113] = DEFINE_ESC(0x6b), /* <left> */ + [114] = DEFINE_ESC(0x74), /* <right> */ + [115] = DEFINE_ESC(0x69), /* <end> */ + [116] = DEFINE_ESC(0x72), /* <down> */ + [117] = DEFINE_ESC(0x7a), /* <pag down> */ + [118] = DEFINE_ESC(0x70), /* <ins> */ + [119] = DEFINE_ESC(0x71), /* <delete> */ +}; +static bool running, done; + +static const struct set2_scancode *to_code(u8 scancode) +{ + return &keymap[scancode]; +} + +static void key_press(const struct set2_scancode *sc) +{ + switch (sc->type) { + case SCANCODE_ESCAPED: + kbd_queue(0xe0); + /* fallthrough */ + case SCANCODE_NORMAL: + kbd_queue(sc->code); + break; + case SCANCODE_KEY_PAUSE: + kbd_queue(0xe1); + kbd_queue(0x14); + kbd_queue(0x77); + kbd_queue(0xe1); + kbd_queue(0xf0); + kbd_queue(0x14); + kbd_queue(0x77); + break; + case SCANCODE_KEY_PRNTSCRN: + kbd_queue(0xe0); + kbd_queue(0x12); + kbd_queue(0xe0); + kbd_queue(0x7c); + break; + } +} + +static void key_release(const struct set2_scancode *sc) +{ + switch (sc->type) { + case SCANCODE_ESCAPED: + kbd_queue(0xe0); + /* fallthrough */ + case SCANCODE_NORMAL: + kbd_queue(0xf0); + kbd_queue(sc->code); + break; + case SCANCODE_KEY_PAUSE: + /* nothing to do */ + break; + case SCANCODE_KEY_PRNTSCRN: + kbd_queue(0xe0); + kbd_queue(0xf0); + kbd_queue(0x7c); + kbd_queue(0xe0); + kbd_queue(0xf0); + kbd_queue(0x12); + break; + } +} + +static void *sdl__thread(void *p) +{ + Uint32 rmask, gmask, bmask, amask; + struct framebuffer *fb = p; + SDL_Surface *guest_screen; + SDL_Surface *screen; + SDL_Event ev; + Uint32 flags; + + 
kvm__set_thread_name("kvm-sdl-worker"); + + if (SDL_Init(SDL_INIT_VIDEO) != 0) + die("Unable to initialize SDL"); + + rmask = 0x000000ff; + gmask = 0x0000ff00; + bmask = 0x00ff0000; + amask = 0x00000000; + + guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask); + if (!guest_screen) + die("Unable to create SDL RBG surface"); + + flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF; + + SDL_WM_SetCaption("KVM tool", "KVM tool"); + + screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags); + if (!screen) + die("Unable to set SDL video mode"); + + SDL_EnableKeyRepeat(200, 50); + + while (running) { + SDL_BlitSurface(guest_screen, NULL, screen, NULL); + SDL_Flip(screen); + + while (SDL_PollEvent(&ev)) { + switch (ev.type) { + case SDL_KEYDOWN: { + const struct set2_scancode *sc = to_code(ev.key.keysym.scancode); + if (sc->type == SCANCODE_UNKNOWN) { + pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode); + break; + } + key_press(sc); + break; + } + case SDL_KEYUP: { + const struct set2_scancode *sc = to_code(ev.key.keysym.scancode); + if (sc->type == SCANCODE_UNKNOWN) + break; + key_release(sc); + break; + } + case SDL_QUIT: + goto exit; + } + } + + SDL_Delay(1000 / FRAME_RATE); + } + + if (running == false && done == false) { + done = true; + return NULL; + } +exit: + kvm_cpu__reboot(fb->kvm); + + return NULL; +} + +static int sdl__start(struct framebuffer *fb) +{ + pthread_t thread; + + running = true; + + if (pthread_create(&thread, NULL, sdl__thread, fb) != 0) + return -1; + + return 0; +} + +static int sdl__stop(struct framebuffer *fb) +{ + running = false; + while (done == false) + sleep(0); + + return 0; +} + +static struct fb_target_operations sdl_ops = { + .start = sdl__start, + .stop = sdl__stop, +}; + +int sdl__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + if (!kvm->cfg.sdl) + return 0; + + fb = vesa__init(kvm); + if 
(IS_ERR(fb)) { + pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb)); + return PTR_ERR(fb); + } + + return fb__attach(fb, &sdl_ops); +} +dev_init(sdl__init); + +int sdl__exit(struct kvm *kvm) +{ + if (kvm->cfg.sdl) + return sdl__stop(NULL); + + return 0; +} +dev_exit(sdl__exit); diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c new file mode 100644 index 000000000000..12e4bd53fe0d --- /dev/null +++ b/tools/kvm/ui/vnc.c @@ -0,0 +1,250 @@ +#include "kvm/vnc.h" + +#include "kvm/framebuffer.h" +#include "kvm/i8042.h" +#include "kvm/vesa.h" + +#include <linux/types.h> +#include <rfb/keysym.h> +#include <rfb/rfb.h> +#include <pthread.h> +#include <linux/err.h> + +#define VESA_QUEUE_SIZE 128 +#define VESA_IRQ 14 + +/* + * This "6000" value is pretty much the result of experimentation + * It seems that around this value, things update pretty smoothly + */ +#define VESA_UPDATE_TIME 6000 + +/* + * We can map the letters and numbers without a fuss, + * but the other characters not so much. + */ +static char letters[26] = { + 0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */ + 0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */ + 0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */ + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */ + 0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */ + 0x1a, +}; + +static rfbScreenInfoPtr server; +static char num[10] = { + 0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46, +}; + +/* + * This is called when the VNC server receives a key event + * The reason this function is such a beast is that we have + * to convert from ASCII characters (which is what VNC gets) + * to PC keyboard scancodes, which is what Linux expects to + * get from its keyboard. ASCII and the scancode set don't + * really seem to mesh in any good way beyond some basics with + * the letters and numbers. 
+ */ +static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl) +{ + char tosend = 0; + + if (key >= 0x41 && key <= 0x5a) + key += 0x20; /* convert to lowercase */ + + if (key >= 0x61 && key <= 0x7a) /* a-z */ + tosend = letters[key - 0x61]; + + if (key >= 0x30 && key <= 0x39) + tosend = num[key - 0x30]; + + switch (key) { + case XK_Insert: kbd_queue(0xe0); tosend = 0x70; break; + case XK_Delete: kbd_queue(0xe0); tosend = 0x71; break; + case XK_Up: kbd_queue(0xe0); tosend = 0x75; break; + case XK_Down: kbd_queue(0xe0); tosend = 0x72; break; + case XK_Left: kbd_queue(0xe0); tosend = 0x6b; break; + case XK_Right: kbd_queue(0xe0); tosend = 0x74; break; + case XK_Page_Up: kbd_queue(0xe0); tosend = 0x7d; break; + case XK_Page_Down: kbd_queue(0xe0); tosend = 0x7a; break; + case XK_Home: kbd_queue(0xe0); tosend = 0x6c; break; + case XK_BackSpace: tosend = 0x66; break; + case XK_Tab: tosend = 0x0d; break; + case XK_Return: tosend = 0x5a; break; + case XK_Escape: tosend = 0x76; break; + case XK_End: tosend = 0x69; break; + case XK_Shift_L: tosend = 0x12; break; + case XK_Shift_R: tosend = 0x59; break; + case XK_Control_R: kbd_queue(0xe0); + case XK_Control_L: tosend = 0x14; break; + case XK_Alt_R: kbd_queue(0xe0); + case XK_Alt_L: tosend = 0x11; break; + case XK_quoteleft: tosend = 0x0e; break; + case XK_minus: tosend = 0x4e; break; + case XK_equal: tosend = 0x55; break; + case XK_bracketleft: tosend = 0x54; break; + case XK_bracketright: tosend = 0x5b; break; + case XK_backslash: tosend = 0x5d; break; + case XK_Caps_Lock: tosend = 0x58; break; + case XK_semicolon: tosend = 0x4c; break; + case XK_quoteright: tosend = 0x52; break; + case XK_comma: tosend = 0x41; break; + case XK_period: tosend = 0x49; break; + case XK_slash: tosend = 0x4a; break; + case XK_space: tosend = 0x29; break; + + /* + * This is where I handle the shifted characters. 
+ * They don't really map nicely the way A-Z maps to a-z, + * so I'm doing it manually + */ + case XK_exclam: tosend = 0x16; break; + case XK_quotedbl: tosend = 0x52; break; + case XK_numbersign: tosend = 0x26; break; + case XK_dollar: tosend = 0x25; break; + case XK_percent: tosend = 0x2e; break; + case XK_ampersand: tosend = 0x3d; break; + case XK_parenleft: tosend = 0x46; break; + case XK_parenright: tosend = 0x45; break; + case XK_asterisk: tosend = 0x3e; break; + case XK_plus: tosend = 0x55; break; + case XK_colon: tosend = 0x4c; break; + case XK_less: tosend = 0x41; break; + case XK_greater: tosend = 0x49; break; + case XK_question: tosend = 0x4a; break; + case XK_at: tosend = 0x1e; break; + case XK_asciicircum: tosend = 0x36; break; + case XK_underscore: tosend = 0x4e; break; + case XK_braceleft: tosend = 0x54; break; + case XK_braceright: tosend = 0x5b; break; + case XK_bar: tosend = 0x5d; break; + case XK_asciitilde: tosend = 0x0e; break; + default: break; + } + + /* + * If this is a "key up" event (the user has released the key, we + * need to send 0xf0 first. + */ + if (!down && tosend != 0x0) + kbd_queue(0xf0); + + if (tosend) + kbd_queue(tosend); +} + +/* The previous X and Y coordinates of the mouse */ +static int xlast, ylast = -1; + +/* + * This function is called by the VNC server whenever a mouse event occurs. 
+ */ +static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl) +{ + int dx, dy; + char b1 = 0x8; + + /* The VNC mask and the PS/2 button encoding are the same */ + b1 |= buttonMask; + + if (xlast >= 0 && ylast >= 0) { + /* The PS/2 mouse sends deltas, not absolutes */ + dx = x - xlast; + dy = ylast - y; + + /* Set overflow bits if needed */ + if (dy > 255) + b1 |= 0x80; + if (dx > 255) + b1 |= 0x40; + + /* Set negative bits if needed */ + if (dy < 0) + b1 |= 0x20; + if (dx < 0) + b1 |= 0x10; + + mouse_queue(b1); + mouse_queue(dx); + mouse_queue(dy); + } + + xlast = x; + ylast = y; + rfbDefaultPtrAddEvent(buttonMask, x, y, cl); +} + +static void *vnc__thread(void *p) +{ + struct framebuffer *fb = p; + /* + * Make a fake argc and argv because the getscreen function + * seems to want it. + */ + char argv[1][1] = {{0}}; + int argc = 1; + + kvm__set_thread_name("kvm-vnc-worker"); + + server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4); + server->frameBuffer = fb->mem; + server->alwaysShared = TRUE; + server->kbdAddEvent = kbd_handle_key; + server->ptrAddEvent = kbd_handle_ptr; + rfbInitServer(server); + + while (rfbIsActive(server)) { + rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height); + rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME); + } + return NULL; +} + +static int vnc__start(struct framebuffer *fb) +{ + pthread_t thread; + + if (pthread_create(&thread, NULL, vnc__thread, fb) != 0) + return -1; + + return 0; +} + +static int vnc__stop(struct framebuffer *fb) +{ + rfbShutdownServer(server, TRUE); + + return 0; +} + +static struct fb_target_operations vnc_ops = { + .start = vnc__start, + .stop = vnc__stop, +}; + +int vnc__init(struct kvm *kvm) +{ + struct framebuffer *fb; + + if (!kvm->cfg.vnc) + return 0; + + fb = vesa__init(kvm); + if (IS_ERR(fb)) { + pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb)); + return PTR_ERR(fb); + } + + return fb__attach(fb, &vnc_ops); +} 
+dev_init(vnc__init); + +int vnc__exit(struct kvm *kvm) +{ + if (kvm->cfg.vnc) + return vnc__stop(NULL); + + return 0; +} +dev_exit(vnc__exit); diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN new file mode 100755 index 000000000000..1af9d6c26f2a --- /dev/null +++ b/tools/kvm/util/KVMTOOLS-VERSION-GEN @@ -0,0 +1,40 @@ +#!/bin/sh + +if [ $# -eq 1 ] ; then + OUTPUT=$1 +fi + +GVF=${OUTPUT}KVMTOOLS-VERSION-FILE + +LF=' +' + +# First check if there is a .git to get the version from git describe +# otherwise try to get the version from the kernel makefile +if test -d ../../.git -o -f ../../.git && + VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && + case "$VN" in + *$LF*) (exit 1) ;; + v[0-9]*) + git update-index -q --refresh + test -z "$(git diff-index --name-only HEAD --)" || + VN="$VN-dirty" ;; + esac +then + VN=$(echo "$VN" | sed -e 's/-/./g'); +else + VN=$(MAKEFLAGS= make -sC ../.. kernelversion) +fi + +VN=$(expr "$VN" : v*'\(.*\)') + +if test -r $GVF +then + VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF) +else + VC=unset +fi +test "$VN" = "$VC" || { + echo >&2 "KVMTOOLS_VERSION = $VN" + echo "KVMTOOLS_VERSION = $VN" >$GVF +} diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh new file mode 100755 index 000000000000..c8be0bd07b4a --- /dev/null +++ b/tools/kvm/util/generate-cmdlist.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +echo "/* Automatically generated by $0 */ +struct cmdname_help +{ + char name[16]; + char help[80]; +}; + +static struct cmdname_help common_cmds[] = {" + +sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt | +while read cmd +do + # TODO following sed command should be fixed + sed -n '/^NAME/,/^lkvm-'"$cmd"'/ { + /NAME/d + /--/d + s/.*kvm-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + p + }' "Documentation/kvm-$cmd.txt" +done +echo "};" diff --git a/tools/kvm/util/init.c b/tools/kvm/util/init.c new file mode 100644 index 000000000000..33a595268021 --- /dev/null +++ 
b/tools/kvm/util/init.c @@ -0,0 +1,69 @@ +#include <linux/list.h> +#include <linux/kernel.h> + +#include "kvm/kvm.h" +#include "kvm/util-init.h" + +#define PRIORITY_LISTS 10 + +static struct hlist_head init_lists[PRIORITY_LISTS]; +static struct hlist_head exit_lists[PRIORITY_LISTS]; + +int init_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name) +{ + t->init = init; + t->fn_name = name; + hlist_add_head(&t->n, &init_lists[priority]); + + return 0; +} + +int exit_list_add(struct init_item *t, int (*init)(struct kvm *), + int priority, const char *name) +{ + t->init = init; + t->fn_name = name; + hlist_add_head(&t->n, &exit_lists[priority]); + + return 0; +} + +int init_list__init(struct kvm *kvm) +{ + unsigned int i; + int r = 0; + struct hlist_node *n; + struct init_item *t; + + for (i = 0; i < ARRAY_SIZE(init_lists); i++) + hlist_for_each_entry(t, n, &init_lists[i], n) { + r = t->init(kvm); + if (r < 0) { + pr_warning("Failed init: %s\n", t->fn_name); + goto fail; + } + } + +fail: + return r; +} + +int init_list__exit(struct kvm *kvm) +{ + int i; + int r = 0; + struct hlist_node *n; + struct init_item *t; + + for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--) + hlist_for_each_entry(t, n, &exit_lists[i], n) { + r = t->init(kvm); + if (r < 0) { + pr_warning("%s failed.\n", t->fn_name); + goto fail; + } + } +fail: + return r; +} diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0 new file mode 100755 index 000000000000..a91c37f7ee00 --- /dev/null +++ b/tools/kvm/util/kvm-ifup-vbr0 @@ -0,0 +1,6 @@ +#!/bin/sh +switch=vbr0 +/sbin/ifconfig $1 0.0.0.0 up +/usr/sbin/brctl addif ${switch} $1 +/usr/sbin/brctl setfd ${switch} 0 +/usr/sbin/brctl stp ${switch} off diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c new file mode 100644 index 000000000000..9a1bbee6c271 --- /dev/null +++ b/tools/kvm/util/parse-options.c @@ -0,0 +1,577 @@ +#include <stdlib.h> +#include <stdio.h> +#include 
<string.h> +#include <unistd.h> + +#include <stdbool.h> + +/* user defined includes */ +#include <linux/types.h> +#include <kvm/util.h> +#include <kvm/parse-options.h> +#include <kvm/strbuf.h> + +#define OPT_SHORT 1 +#define OPT_UNSET 2 + +static int opterror(const struct option *opt, const char *reason, int flags) +{ + if (flags & OPT_SHORT) + return pr_err("switch `%c' %s", opt->short_name, reason); + if (flags & OPT_UNSET) + return pr_err("option `no-%s' %s", opt->long_name, reason); + return pr_err("option `%s' %s", opt->long_name, reason); +} + +static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, + int flags, const char **arg) +{ + if (p->opt) { + *arg = p->opt; + p->opt = NULL; + } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 || + **(p->argv + 1) == '-')) { + *arg = (const char *)opt->defval; + } else if (p->argc > 1) { + p->argc--; + *arg = *++p->argv; + } else + return opterror(opt, "requires a value", flags); + return 0; +} + +static int readnum(const struct option *opt, int flags, + const char *str, char **end) +{ + switch (opt->type) { + case OPTION_INTEGER: + *(int *)opt->value = strtol(str, end, 0); + break; + case OPTION_UINTEGER: + *(unsigned int *)opt->value = strtol(str, end, 0); + break; + case OPTION_LONG: + *(long *)opt->value = strtol(str, end, 0); + break; + case OPTION_U64: + *(u64 *)opt->value = strtoull(str, end, 0); + break; + default: + return opterror(opt, "invalid numeric conversion", flags); + } + + return 0; +} + +static int get_value(struct parse_opt_ctx_t *p, + const struct option *opt, int flags) +{ + const char *s, *arg = NULL; + const int unset = flags & OPT_UNSET; + + if (unset && p->opt) + return opterror(opt, "takes no value", flags); + if (unset && (opt->flags & PARSE_OPT_NONEG)) + return opterror(opt, "isn't available", flags); + + if (!(flags & OPT_SHORT) && p->opt) { + switch (opt->type) { + case OPTION_CALLBACK: + if (!(opt->flags & PARSE_OPT_NOARG)) + break; + /* FALLTHROUGH */ + 
case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_BIT: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + return opterror(opt, "takes no value", flags); + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + case OPTION_STRING: + case OPTION_INTEGER: + case OPTION_UINTEGER: + case OPTION_LONG: + case OPTION_U64: + default: + break; + } + } + + switch (opt->type) { + case OPTION_BIT: + if (unset) + *(int *)opt->value &= ~opt->defval; + else + *(int *)opt->value |= opt->defval; + return 0; + + case OPTION_BOOLEAN: + *(bool *)opt->value = unset ? false : true; + return 0; + + case OPTION_INCR: + *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; + return 0; + + case OPTION_SET_UINT: + *(unsigned int *)opt->value = unset ? 0 : opt->defval; + return 0; + + case OPTION_SET_PTR: + *(void **)opt->value = unset ? NULL : (void *)opt->defval; + return 0; + + case OPTION_STRING: + if (unset) + *(const char **)opt->value = NULL; + else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + *(const char **)opt->value = (const char *)opt->defval; + else + return get_arg(p, opt, flags, + (const char **)opt->value); + return 0; + + case OPTION_CALLBACK: + if (unset) + return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; + if (opt->flags & PARSE_OPT_NOARG) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (get_arg(p, opt, flags, &arg)) + return -1; + return (*opt->callback)(opt, arg, 0) ? 
(-1) : 0; + + case OPTION_INTEGER: + if (unset) { + *(int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_UINTEGER: + if (unset) { + *(unsigned int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(unsigned int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_LONG: + if (unset) { + *(long *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(long *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_U64: + if (unset) { + *(u64 *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(u64 *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + return readnum(opt, flags, arg, (char **)&s); + + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + default: + die("should not happen, someone must be hit on the forehead"); + } +} + +#define USAGE_OPTS_WIDTH 24 +#define USAGE_GAP 2 + +static int usage_with_options_internal(const char * const *usagestr, + const struct option *opts, int full) +{ + if (!usagestr) + return PARSE_OPT_HELP; + + fprintf(stderr, "\n usage: %s\n", *usagestr++); + while (*usagestr && **usagestr) + fprintf(stderr, " or: %s\n", *usagestr++); + while (*usagestr) { + fprintf(stderr, "%s%s\n", + **usagestr ? 
" " : "", + *usagestr); + usagestr++; + } + + if (opts->type != OPTION_GROUP) + fputc('\n', stderr); + + for (; opts->type != OPTION_END; opts++) { + size_t pos; + int pad; + + if (opts->type == OPTION_GROUP) { + fputc('\n', stderr); + if (*opts->help) + fprintf(stderr, "%s\n", opts->help); + continue; + } + if (!full && (opts->flags & PARSE_OPT_HIDDEN)) + continue; + + pos = fprintf(stderr, " "); + if (opts->short_name) + pos += fprintf(stderr, "-%c", opts->short_name); + else + pos += fprintf(stderr, " "); + + if (opts->long_name && opts->short_name) + pos += fprintf(stderr, ", "); + if (opts->long_name) + pos += fprintf(stderr, "--%s", opts->long_name); + + switch (opts->type) { + case OPTION_ARGUMENT: + break; + case OPTION_LONG: + case OPTION_U64: + case OPTION_INTEGER: + case OPTION_UINTEGER: + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=<n>]"); + else + pos += fprintf(stderr, "[<n>]"); + else + pos += fprintf(stderr, " <n>"); + break; + case OPTION_CALLBACK: + if (opts->flags & PARSE_OPT_NOARG) + break; + /* FALLTHROUGH */ + case OPTION_STRING: + if (opts->argh) { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=<%s>]", opts->argh); + else + pos += fprintf(stderr, "[<%s>]", opts->argh); + else + pos += fprintf(stderr, " <%s>", opts->argh); + } else { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=...]"); + else + pos += fprintf(stderr, "[...]"); + else + pos += fprintf(stderr, " ..."); + } + break; + default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */ + case OPTION_END: + case OPTION_GROUP: + case OPTION_BIT: + case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + break; + } + if (pos <= USAGE_OPTS_WIDTH) + pad = USAGE_OPTS_WIDTH - pos; + else { + fputc('\n', stderr); + pad = USAGE_OPTS_WIDTH; + } + fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); + } + fputc('\n', stderr); + + return 
PARSE_OPT_HELP; +} + +void usage_with_options(const char * const *usagestr, + const struct option *opts) +{ + usage_with_options_internal(usagestr, opts, 0); + exit(129); +} + +static void check_typos(const char *arg, const struct option *options) +{ + if (strlen(arg) < 3) + return; + + if (!prefixcmp(arg, "no-")) { + pr_err("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + + for (; options->type != OPTION_END; options++) { + if (!options->long_name) + continue; + if (!prefixcmp(options->long_name, arg)) { + pr_err("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + } +} + +static int parse_options_usage(const char * const *usagestr, + const struct option *opts) +{ + return usage_with_options_internal(usagestr, opts, 0); +} + +static int parse_short_opt(struct parse_opt_ctx_t *p, + const struct option *options) +{ + for (; options->type != OPTION_END; options++) { + if (options->short_name == *p->opt) { + p->opt = p->opt[1] ? p->opt + 1 : NULL; + return get_value(p, options, OPT_SHORT); + } + } + return -2; +} + +static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, + const struct option *options) +{ + const char *arg_end = strchr(arg, '='); + const struct option *abbrev_option = NULL, *ambiguous_option = NULL; + int abbrev_flags = 0, ambiguous_flags = 0; + + if (!arg_end) + arg_end = arg + strlen(arg); + + for (; options->type != OPTION_END; options++) { + const char *rest; + int flags = 0; + + if (!options->long_name) + continue; + + rest = skip_prefix(arg, options->long_name); + if (options->type == OPTION_ARGUMENT) { + if (!rest) + continue; + if (*rest == '=') + return opterror(options, "takes no value", + flags); + if (*rest) + continue; + p->out[p->cpidx++] = arg - 2; + return 0; + } + if (!rest) { + /* abbreviated? */ + if (!strncmp(options->long_name, arg, arg_end - arg)) { +is_abbreviated: + if (abbrev_option) { + /* + * If this is abbreviated, it is + * ambiguous. 
So when there is no + * exact match later, we need to + * error out. + */ + ambiguous_option = abbrev_option; + ambiguous_flags = abbrev_flags; + } + if (!(flags & OPT_UNSET) && *arg_end) + p->opt = arg_end + 1; + abbrev_option = options; + abbrev_flags = flags; + continue; + } + /* negated and abbreviated very much? */ + if (!prefixcmp("no-", arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + /* negated? */ + if (strncmp(arg, "no-", 3)) + continue; + flags |= OPT_UNSET; + rest = skip_prefix(arg + 3, options->long_name); + /* abbreviated and negated? */ + if (!rest && !prefixcmp(options->long_name, arg + 3)) + goto is_abbreviated; + if (!rest) + continue; + } + if (*rest) { + if (*rest != '=') + continue; + p->opt = rest + 1; + } + return get_value(p, options, flags); + } + + if (ambiguous_option) + return pr_err("Ambiguous option: %s " + "(could be --%s%s or --%s%s)", + arg, + (ambiguous_flags & OPT_UNSET) ? "no-" : "", + ambiguous_option->long_name, + (abbrev_flags & OPT_UNSET) ? 
"no-" : "", + abbrev_option->long_name); + if (abbrev_option) + return get_value(p, abbrev_option, abbrev_flags); + return -2; +} + + +static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc, + const char **argv, int flags) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->argc = argc; + ctx->argv = argv; + ctx->out = argv; + ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); + ctx->flags = flags; + if ((flags & PARSE_OPT_KEEP_UNKNOWN) && + (flags & PARSE_OPT_STOP_AT_NON_OPTION)) + die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); +} + +static int parse_options_end(struct parse_opt_ctx_t *ctx) +{ + memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); + ctx->out[ctx->cpidx + ctx->argc] = NULL; + return ctx->cpidx + ctx->argc; +} + + +static int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, const char * const usagestr[]) +{ + int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); + + /* we must reset ->opt, unknown short option leave it dangling */ + ctx->opt = NULL; + + for (; ctx->argc; ctx->argc--, ctx->argv++) { + const char *arg = ctx->argv[0]; + + if (*arg != '-' || !arg[1]) { + if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) + break; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + continue; + } + + if (arg[1] != '-') { + ctx->opt = arg + 1; + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + default: + break; + } + if (ctx->opt) + check_typos(arg + 1, options); + while (ctx->opt) { + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, + options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, + options); + case -2: + /* fake a short option thing to hide + * the fact that we may have + * started to parse aggregated stuff + * + * This is leaky, too 
bad. + */ + ctx->argv[0] = strdup(ctx->opt - 1); + *(char *)ctx->argv[0] = '-'; + goto unknown; + default: + break; + } + } + continue; + } + + if (!arg[2]) { /* "--" */ + if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) { + ctx->argc--; + ctx->argv++; + } + break; + } + + if (internal_help && !strcmp(arg + 2, "help-all")) + return usage_with_options_internal(usagestr, options, + 1); + if (internal_help && !strcmp(arg + 2, "help")) + return parse_options_usage(usagestr, options); + switch (parse_long_opt(ctx, arg + 2, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + default: + break; + } + continue; +unknown: + if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN)) + return PARSE_OPT_UNKNOWN; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + ctx->opt = NULL; + } + return PARSE_OPT_DONE; +} + +int parse_options(int argc, const char **argv, const struct option *options, + const char * const usagestr[], int flags) +{ + struct parse_opt_ctx_t ctx; + + parse_options_start(&ctx, argc, argv, flags); + switch (parse_options_step(&ctx, options, usagestr)) { + case PARSE_OPT_HELP: + exit(129); + case PARSE_OPT_DONE: + break; + default: /* PARSE_OPT_UNKNOWN */ + if (ctx.argv[0][1] == '-') { + pr_err("unknown option `%s'", ctx.argv[0] + 2); + } else { + pr_err("unknown switch `%c'", *ctx.opt); + } + usage_with_options(usagestr, options); + } + + return parse_options_end(&ctx); +} diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c new file mode 100644 index 000000000000..3630a6d80d6e --- /dev/null +++ b/tools/kvm/util/rbtree-interval.c @@ -0,0 +1,58 @@ +#include <kvm/rbtree-interval.h> +#include <stddef.h> +#include <errno.h> + +struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct rb_int_node *cur = rb_int(node); + + if (point < cur->low) + node = node->rb_left; + else if (cur->high <= point) + node = node->rb_right; + else + 
return cur; + } + + return NULL; +} + +struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high) +{ + struct rb_int_node *range; + + range = rb_int_search_single(root, low); + if (range == NULL) + return NULL; + + /* We simply verify that 'high' is smaller than the end of the range where 'low' is located */ + if (range->high < high) + return NULL; + + return range; +} + +int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node) +{ + struct rb_node **node = &root->rb_node, *parent = NULL; + + while (*node) { + struct rb_int_node *cur = rb_int(*node); + + parent = *node; + if (i_node->high <= cur->low) + node = &cur->node.rb_left; + else if (cur->high <= i_node->low) + node = &cur->node.rb_right; + else + return -EEXIST; + } + + rb_link_node(&i_node->node, parent, node); + rb_insert_color(&i_node->node, root); + + return 0; +} diff --git a/tools/kvm/util/read-write.c b/tools/kvm/util/read-write.c new file mode 100644 index 000000000000..44709dfd4353 --- /dev/null +++ b/tools/kvm/util/read-write.c @@ -0,0 +1,354 @@ +#include "kvm/read-write.h" + +#include <sys/types.h> +#include <sys/uio.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +/* Same as read(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xread(int fd, void *buf, size_t count) +{ + ssize_t nr; + +restart: + nr = read(fd, buf, count); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as write(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xwrite(int fd, const void *buf, size_t count) +{ + ssize_t nr; + +restart: + nr = write(fd, buf, count); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t read_in_full(int fd, void *buf, size_t count) +{ + ssize_t total = 0; + char *p = buf; + + while (count > 0) { + ssize_t nr; + + nr = xread(fd, p, count); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + count -= nr; + total += nr; + p += nr; + } + + return total; +} + +ssize_t write_in_full(int fd, const void *buf, size_t count) +{ + const char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t nr; + + nr = xwrite(fd, p, count); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + count -= nr; + total += nr; + p += nr; + } + + return total; +} + +/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xpread(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t nr; + +restart: + nr = pread(fd, buf, count, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset) +{ + ssize_t nr; + +restart: + nr = pwrite(fd, buf, count, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t total = 0; + char *p = buf; + + while (count > 0) { + ssize_t nr; + + nr = xpread(fd, p, count, offset); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + count -= nr; + total += nr; + p += nr; + offset += nr; + } + + return total; +} + +ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset) +{ + const char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t nr; + + nr = xpwrite(fd, p, count, offset); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + count -= nr; + total += nr; + p += nr; + offset += nr; + } + + return total; +} + +/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t nr; + +restart: + nr = readv(fd, iov, iovcnt); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as writev(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t nr; + +restart: + nr = writev(fd, iov, iovcnt); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt) +{ + size_t size = 0; + while (iovcnt--) + size += (iov++)->iov_len; + + return size; +} + +static inline void shift_iovec(const struct iovec **iov, int *iovcnt, + size_t nr, ssize_t *total, size_t *count, off_t *offset) +{ + while (nr >= (*iov)->iov_len) { + nr -= (*iov)->iov_len; + *total += (*iov)->iov_len; + *count -= (*iov)->iov_len; + if (offset) + *offset += (*iov)->iov_len; + (*iovcnt)--; + (*iov)++; + } +} + +ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xreadv(fd, iov, iovcnt); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL); + } + + return total; +} + +ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xwritev(fd, iov, iovcnt); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL); + } + + return total; +} + +/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */ +ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t nr; + +restart: + nr = preadv(fd, iov, iovcnt, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. 
*/ +ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t nr; + +restart: + nr = pwritev(fd, iov, iovcnt, offset); + if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR))) + goto restart; + + return nr; +} + +ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xpreadv(fd, iov, iovcnt, offset); + if (nr <= 0) { + if (total > 0) + return total; + + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset); + } + + return total; +} + +ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t total = 0; + size_t count = get_iov_size(iov, iovcnt); + + while (count > 0) { + ssize_t nr; + + nr = xpwritev(fd, iov, iovcnt, offset); + if (nr < 0) + return -1; + if (nr == 0) { + errno = ENOSPC; + return -1; + } + + shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset); + } + + return total; +} + +#ifdef CONFIG_HAS_AIO +int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param) +{ + struct iocb *ios[1] = { iocb }; + int ret; + + io_prep_pwritev(iocb, fd, iov, iovcnt, offset); + io_set_eventfd(iocb, ev); + iocb->data = param; + +restart: + ret = io_submit(ctx, 1, ios); + if (ret == -EAGAIN) + goto restart; + return ret; +} + +int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt, + off_t offset, int ev, void *param) +{ + struct iocb *ios[1] = { iocb }; + int ret; + + io_prep_preadv(iocb, fd, iov, iovcnt, offset); + io_set_eventfd(iocb, ev); + iocb->data = param; + +restart: + ret = io_submit(ctx, 1, ios); + if (ret == -EAGAIN) + goto restart; + return ret; +} +#endif diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh new file mode 100755 index 000000000000..49867ddca6a7 --- /dev/null +++ 
b/tools/kvm/util/set_private_br.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Author: Amos Kong <kongjianjun@gmail.com> +# Date: Apr 14, 2011 +# Description: this script is used to create/delete a private bridge, +# launch a dhcp server on the bridge by dnsmasq. +# +# @ ./set_private_br.sh $bridge_name $subnet_prefix +# @ ./set_private_br.sh vbr0 192.168.33 + +brname='vbr0' +subnet='192.168.33' + +add_br() +{ + echo "add new private bridge: $brname" + /usr/sbin/brctl addbr $brname + echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6 + echo 1 > /proc/sys/net/ipv4/ip_forward + /usr/sbin/brctl stp $brname on + /usr/sbin/brctl setfd $brname 0 + ifconfig $brname $subnet.1 + ifconfig $brname up + # Add forward rule, then guest can access public network + iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE + /etc/init.d/dnsmasq stop + /etc/init.d/tftpd-hpa stop 2>/dev/null + dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd +} + +del_br() +{ + echo "cleanup bridge setup" + kill -9 `pgrep dnsmasq|tail -1` + ifconfig $brname down + /usr/sbin/brctl delbr $brname + iptables -t nat -D POSTROUTING -s $subnet.254/24 ! 
-d $subnet.254/24 -j MASQUERADE +} + + +if [ $# = 0 ]; then + del_br 2>/dev/null + exit +fi +if [ $# > 1 ]; then + brname="$1" +fi +if [ $# = 2 ]; then + subnet="$2" +fi +add_br diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c new file mode 100644 index 000000000000..99d6b0c08fb4 --- /dev/null +++ b/tools/kvm/util/strbuf.c @@ -0,0 +1,62 @@ + +/* user defined headers */ +#include <kvm/util.h> +#include <kvm/strbuf.h> + +int prefixcmp(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) { + if (!*prefix) + return 0; + else if (*str != *prefix) + return (unsigned char)*prefix - (unsigned char)*str; + } +} + +/** + * strlcat - Append a length-limited, %NUL-terminated string to another + * @dest: The string to be appended to + * @src: The string to append to it + * @count: The size of the destination buffer. + */ +size_t strlcat(char *dest, const char *src, size_t count) +{ + size_t dsize = strlen(dest); + size_t len = strlen(src); + size_t res = dsize + len; + + DIE_IF(dsize >= count); + + dest += dsize; + count -= dsize; + if (len >= count) + len = count - 1; + + memcpy(dest, src, len); + dest[len] = 0; + + return res; +} + +/** + * strlcpy - Copy a %NUL terminated string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @size: size of destination buffer + * + * Compatible with *BSD: the result is always a valid + * NUL-terminated string that fits in the buffer (unless, + * of course, the buffer size is zero). It does not pad + * out the result like strncpy() does. + */ +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? 
size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} diff --git a/tools/kvm/util/threadpool.c b/tools/kvm/util/threadpool.c new file mode 100644 index 000000000000..e64aa26dada4 --- /dev/null +++ b/tools/kvm/util/threadpool.c @@ -0,0 +1,175 @@ +#include "kvm/threadpool.h" +#include "kvm/mutex.h" +#include "kvm/kvm.h" + +#include <linux/kernel.h> +#include <linux/list.h> +#include <pthread.h> +#include <stdbool.h> + +static DEFINE_MUTEX(job_mutex); +static DEFINE_MUTEX(thread_mutex); +static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER; + +static LIST_HEAD(head); + +static pthread_t *threads; +static long threadcount; +static bool running; + +static struct thread_pool__job *thread_pool__job_pop_locked(void) +{ + struct thread_pool__job *job; + + if (list_empty(&head)) + return NULL; + + job = list_first_entry(&head, struct thread_pool__job, queue); + list_del(&job->queue); + + return job; +} + +static void thread_pool__job_push_locked(struct thread_pool__job *job) +{ + list_add_tail(&job->queue, &head); +} + +static struct thread_pool__job *thread_pool__job_pop(void) +{ + struct thread_pool__job *job; + + mutex_lock(&job_mutex); + job = thread_pool__job_pop_locked(); + mutex_unlock(&job_mutex); + return job; +} + +static void thread_pool__job_push(struct thread_pool__job *job) +{ + mutex_lock(&job_mutex); + thread_pool__job_push_locked(job); + mutex_unlock(&job_mutex); +} + +static void thread_pool__handle_job(struct thread_pool__job *job) +{ + while (job) { + job->callback(job->kvm, job->data); + + mutex_lock(&job->mutex); + + if (--job->signalcount > 0) + /* If the job was signaled again while we were working */ + thread_pool__job_push(job); + + mutex_unlock(&job->mutex); + + job = thread_pool__job_pop(); + } +} + +static void thread_pool__threadfunc_cleanup(void *param) +{ + mutex_unlock(&job_mutex); +} + +static void *thread_pool__threadfunc(void *param) +{ + pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL); + + 
kvm__set_thread_name("threadpool-worker"); + + while (running) { + struct thread_pool__job *curjob = NULL; + + mutex_lock(&job_mutex); + while (running && (curjob = thread_pool__job_pop_locked()) == NULL) + pthread_cond_wait(&job_cond, &job_mutex.mutex); + mutex_unlock(&job_mutex); + + if (running) + thread_pool__handle_job(curjob); + } + + pthread_cleanup_pop(0); + + return NULL; +} + +static int thread_pool__addthread(void) +{ + int res; + void *newthreads; + + mutex_lock(&thread_mutex); + newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t)); + if (newthreads == NULL) { + mutex_unlock(&thread_mutex); + return -1; + } + + threads = newthreads; + + res = pthread_create(threads + threadcount, NULL, + thread_pool__threadfunc, NULL); + + if (res == 0) + threadcount++; + mutex_unlock(&thread_mutex); + + return res; +} + +int thread_pool__init(struct kvm *kvm) +{ + unsigned long i; + unsigned int thread_count = sysconf(_SC_NPROCESSORS_ONLN); + + running = true; + + for (i = 0; i < thread_count; i++) + if (thread_pool__addthread() < 0) + return i; + + return i; +} +late_init(thread_pool__init); + +int thread_pool__exit(struct kvm *kvm) +{ + int i; + void *NUL = NULL; + + running = false; + + for (i = 0; i < threadcount; i++) { + mutex_lock(&job_mutex); + pthread_cond_signal(&job_cond); + mutex_unlock(&job_mutex); + } + + for (i = 0; i < threadcount; i++) { + pthread_join(threads[i], NUL); + } + + return 0; +} +late_exit(thread_pool__exit); + +void thread_pool__do_job(struct thread_pool__job *job) +{ + struct thread_pool__job *jobinfo = job; + + if (jobinfo == NULL || jobinfo->callback == NULL) + return; + + mutex_lock(&jobinfo->mutex); + if (jobinfo->signalcount++ == 0) + thread_pool__job_push(job); + mutex_unlock(&jobinfo->mutex); + + mutex_lock(&job_mutex); + pthread_cond_signal(&job_cond); + mutex_unlock(&job_mutex); +} diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c new file mode 100644 index 000000000000..c11a15a304a5 --- /dev/null +++ 
b/tools/kvm/util/util.c @@ -0,0 +1,133 @@ +/* + * Taken from perf which in turn take it from GIT + */ + +#include "kvm/util.h" + +#include <kvm/kvm.h> +#include <linux/magic.h> /* For HUGETLBFS_MAGIC */ +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/statfs.h> + +static void report(const char *prefix, const char *err, va_list params) +{ + char msg[1024]; + vsnprintf(msg, sizeof(msg), err, params); + fprintf(stderr, " %s%s\n", prefix, msg); +} + +static NORETURN void die_builtin(const char *err, va_list params) +{ + report(" Fatal: ", err, params); + exit(128); +} + +static void error_builtin(const char *err, va_list params) +{ + report(" Error: ", err, params); +} + +static void warn_builtin(const char *warn, va_list params) +{ + report(" Warning: ", warn, params); +} + +static void info_builtin(const char *info, va_list params) +{ + report(" Info: ", info, params); +} + +void die(const char *err, ...) +{ + va_list params; + + va_start(params, err); + die_builtin(err, params); + va_end(params); +} + +int pr_err(const char *err, ...) +{ + va_list params; + + va_start(params, err); + error_builtin(err, params); + va_end(params); + return -1; +} + +void pr_warning(const char *warn, ...) +{ + va_list params; + + va_start(params, warn); + warn_builtin(warn, params); + va_end(params); +} + +void pr_info(const char *info, ...) 
+{ + va_list params; + + va_start(params, info); + info_builtin(info, params); + va_end(params); +} + +void die_perror(const char *s) +{ + perror(s); + exit(1); +} + +void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size) +{ + char mpath[PATH_MAX]; + int fd; + struct statfs sfs; + void *addr; + unsigned long blk_size; + + if (statfs(htlbfs_path, &sfs) < 0) + die("Can't stat %s\n", htlbfs_path); + + if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC) + die("%s is not hugetlbfs!\n", htlbfs_path); + + blk_size = (unsigned long)sfs.f_bsize; + if (sfs.f_bsize == 0 || blk_size > size) { + die("Can't use hugetlbfs pagesize %ld for mem size %lld\n", + blk_size, size); + } + + kvm->ram_pagesize = blk_size; + + snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path); + fd = mkstemp(mpath); + if (fd < 0) + die("Can't open %s for hugetlbfs map\n", mpath); + unlink(mpath); + if (ftruncate(fd, size) < 0) + die("Can't ftruncate for mem mapping size %lld\n", + size); + addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0); + close(fd); + + return addr; +} + +/* This function wraps the decision between hugetlbfs map (if requested) or normal mmap */ +void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size) +{ + if (hugetlbfs_path) + /* + * We don't /need/ to map guest RAM from hugetlbfs, but we do so + * if the user specifies a hugetlbfs path. 
+ */ + return mmap_hugetlbfs(kvm, hugetlbfs_path, size); + else { + kvm->ram_pagesize = getpagesize(); + return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0); + } +} diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c new file mode 100644 index 000000000000..b9ce8ce60f2f --- /dev/null +++ b/tools/kvm/virtio/9p-pdu.c @@ -0,0 +1,287 @@ +#include "kvm/util.h" +#include "kvm/virtio-9p.h" + +#include <endian.h> +#include <stdint.h> + +#include <linux/compiler.h> +#include <net/9p/9p.h> + +static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size) +{ + size_t len; + int i, copied = 0; + u16 iov_cnt = pdu->out_iov_cnt; + size_t offset = pdu->read_offset; + struct iovec *iov = pdu->out_iov; + + for (i = 0; i < iov_cnt && size; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } else { + len = MIN(iov[i].iov_len - offset, size); + memcpy(data, iov[i].iov_base + offset, len); + size -= len; + data += len; + offset = 0; + copied += len; + } + } + pdu->read_offset += copied; +} + +static void virtio_p9_pdu_write(struct p9_pdu *pdu, + const void *data, size_t size) +{ + size_t len; + int i, copied = 0; + u16 iov_cnt = pdu->in_iov_cnt; + size_t offset = pdu->write_offset; + struct iovec *iov = pdu->in_iov; + + for (i = 0; i < iov_cnt && size; i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } else { + len = MIN(iov[i].iov_len - offset, size); + memcpy(iov[i].iov_base + offset, data, len); + size -= len; + data += len; + offset = 0; + copied += len; + } + } + pdu->write_offset += copied; +} + +static void virtio_p9_wstat_free(struct p9_wstat *stbuf) +{ + free(stbuf->name); + free(stbuf->uid); + free(stbuf->gid); + free(stbuf->muid); +} + +static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap) +{ + int retval = 0; + const char *ptr; + + for (ptr = fmt; *ptr; ptr++) { + switch (*ptr) { + case 'b': + { + int8_t *val = va_arg(ap, int8_t *); + 
virtio_p9_pdu_read(pdu, val, sizeof(*val)); + } + break; + case 'w': + { + int16_t le_val; + int16_t *val = va_arg(ap, int16_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le16toh(le_val); + } + break; + case 'd': + { + int32_t le_val; + int32_t *val = va_arg(ap, int32_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le32toh(le_val); + } + break; + case 'q': + { + int64_t le_val; + int64_t *val = va_arg(ap, int64_t *); + virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val)); + *val = le64toh(le_val); + } + break; + case 's': + { + int16_t len; + char **str = va_arg(ap, char **); + + virtio_p9_pdu_readf(pdu, "w", &len); + *str = malloc(len + 1); + if (*str == NULL) { + retval = ENOMEM; + break; + } + virtio_p9_pdu_read(pdu, *str, len); + (*str)[len] = 0; + } + break; + case 'Q': + { + struct p9_qid *qid = va_arg(ap, struct p9_qid *); + retval = virtio_p9_pdu_readf(pdu, "bdq", + &qid->type, &qid->version, + &qid->path); + } + break; + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + memset(stbuf, 0, sizeof(struct p9_wstat)); + stbuf->n_uid = stbuf->n_gid = stbuf->n_muid = -1; + retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss", + &stbuf->size, &stbuf->type, + &stbuf->dev, &stbuf->qid, + &stbuf->mode, &stbuf->atime, + &stbuf->mtime, &stbuf->length, + &stbuf->name, &stbuf->uid, + &stbuf->gid, &stbuf->muid); + if (retval) + virtio_p9_wstat_free(stbuf); + } + break; + case 'I': + { + struct p9_iattr_dotl *p9attr = va_arg(ap, + struct p9_iattr_dotl *); + + retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq", + &p9attr->valid, + &p9attr->mode, + &p9attr->uid, + &p9attr->gid, + &p9attr->size, + &p9attr->atime_sec, + &p9attr->atime_nsec, + &p9attr->mtime_sec, + &p9attr->mtime_nsec); + } + break; + default: + retval = EINVAL; + break; + } + } + return retval; +} + +static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap) +{ + int retval = 0; + const char *ptr; + + for (ptr = fmt; *ptr; ptr++) { + 
switch (*ptr) { + case 'b': + { + int8_t val = va_arg(ap, int); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'w': + { + int16_t val = htole16(va_arg(ap, int)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'd': + { + int32_t val = htole32(va_arg(ap, int32_t)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 'q': + { + int64_t val = htole64(va_arg(ap, int64_t)); + virtio_p9_pdu_write(pdu, &val, sizeof(val)); + } + break; + case 's': + { + uint16_t len = 0; + const char *s = va_arg(ap, char *); + if (s) + len = MIN(strlen(s), USHRT_MAX); + virtio_p9_pdu_writef(pdu, "w", len); + virtio_p9_pdu_write(pdu, s, len); + } + break; + case 'Q': + { + struct p9_qid *qid = va_arg(ap, struct p9_qid *); + retval = virtio_p9_pdu_writef(pdu, "bdq", + qid->type, qid->version, + qid->path); + } + break; + case 'S': + { + struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *); + retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss", + stbuf->size, stbuf->type, + stbuf->dev, &stbuf->qid, + stbuf->mode, stbuf->atime, + stbuf->mtime, stbuf->length, + stbuf->name, stbuf->uid, + stbuf->gid, stbuf->muid); + } + break; + case 'A': + { + struct p9_stat_dotl *stbuf = va_arg(ap, + struct p9_stat_dotl *); + retval = virtio_p9_pdu_writef(pdu, + "qQdddqqqqqqqqqqqqqqq", + stbuf->st_result_mask, + &stbuf->qid, + stbuf->st_mode, + stbuf->st_uid, + stbuf->st_gid, + stbuf->st_nlink, + stbuf->st_rdev, + stbuf->st_size, + stbuf->st_blksize, + stbuf->st_blocks, + stbuf->st_atime_sec, + stbuf->st_atime_nsec, + stbuf->st_mtime_sec, + stbuf->st_mtime_nsec, + stbuf->st_ctime_sec, + stbuf->st_ctime_nsec, + stbuf->st_btime_sec, + stbuf->st_btime_nsec, + stbuf->st_gen, + stbuf->st_data_version); + } + break; + default: + retval = EINVAL; + break; + } + } + return retval; +} + +int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = virtio_p9_decode(pdu, fmt, ap); + va_end(ap); + + return ret; +} + +int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...) +{ + int ret; + va_list ap; + + va_start(ap, fmt); + ret = virtio_p9_pdu_encode(pdu, fmt, ap); + va_end(ap); + + return ret; +} diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c new file mode 100644 index 000000000000..60865dd0ca7c --- /dev/null +++ b/tools/kvm/virtio/9p.c @@ -0,0 +1,1441 @@ +#include "kvm/virtio-pci-dev.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/threadpool.h" +#include "kvm/irq.h" +#include "kvm/virtio-9p.h" +#include "kvm/guest_compat.h" +#include "kvm/builtin-setup.h" + +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/vfs.h> + +#include <linux/virtio_ring.h> +#include <linux/virtio_9p.h> +#include <net/9p/9p.h> + +static LIST_HEAD(devs); +static int compat_id = -1; + +static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid); +static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid) +{ + struct rb_node *node = dev->fids.rb_node; + struct p9_fid *pfid = NULL; + + while (node) { + struct p9_fid *cur = rb_entry(node, struct p9_fid, node); + + if (fid < cur->fid) { + node = node->rb_left; + } else if (fid > cur->fid) { + node = node->rb_right; + } else { + return cur; + } + } + + pfid = calloc(sizeof(*pfid), 1); + if (!pfid) + return NULL; + + pfid->fid = fid; + strcpy(pfid->abs_path, dev->root_dir); + pfid->path = pfid->abs_path + strlen(dev->root_dir); + + insert_new_fid(dev, pfid); + + return pfid; +} + +static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid) +{ + struct rb_node **node = &(dev->fids.rb_node), *parent = NULL; + + while (*node) { + int result = fid->fid - rb_entry(*node, struct p9_fid, node)->fid; + + parent = *node; + if (result < 0) + node = &((*node)->rb_left); + 
else if (result > 0) + node = &((*node)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&fid->node, parent, node); + rb_insert_color(&fid->node, &dev->fids); + return 0; +} + +static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid) +{ + struct p9_fid *new; + + new = find_or_create_fid(p9dev, fid); + + return new; +} + +/* Warning: Immediately use value returned from this function */ +static const char *rel_to_abs(struct p9_dev *p9dev, + const char *path, char *abs_path) +{ + sprintf(abs_path, "%s/%s", p9dev->root_dir, path); + + return abs_path; +} + +static void stat2qid(struct stat *st, struct p9_qid *qid) +{ + *qid = (struct p9_qid) { + .path = st->st_ino, + .version = st->st_mtime, + }; + + if (S_ISDIR(st->st_mode)) + qid->type |= P9_QTDIR; +} + +static void close_fid(struct p9_dev *p9dev, u32 fid) +{ + struct p9_fid *pfid = get_fid(p9dev, fid); + + if (pfid->fd > 0) + close(pfid->fd); + + if (pfid->dir) + closedir(pfid->dir); + + rb_erase(&pfid->node, &p9dev->fids); + free(pfid); +} + +static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size) +{ + u8 cmd; + u16 tag; + + pdu->read_offset = sizeof(u32); + virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag); + pdu->write_offset = 0; + /* cmd + 1 is the reply message */ + virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag); +} + +static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt) +{ + int i; + u32 total = 0; + for (i = 0; (i < iov_cnt) && (total < count); i++) { + if (total + iov[i].iov_len > count) { + /* we don't need this iov fully */ + iov[i].iov_len -= ((total + iov[i].iov_len) - count); + i++; + break; + } + total += iov[i].iov_len; + } + return i; +} + +static void virtio_p9_error_reply(struct p9_dev *p9dev, + struct p9_pdu *pdu, int err, u32 *outlen) +{ + u16 tag; + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", err); + *outlen = pdu->write_offset; + + /* read the tag from input */ + pdu->read_offset = sizeof(u32) + sizeof(u8); + 
virtio_p9_pdu_readf(pdu, "w", &tag); + + /* Update the header */ + pdu->write_offset = 0; + virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag); +} + +static void virtio_p9_version(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 msize; + char *version; + virtio_p9_pdu_readf(pdu, "ds", &msize, &version); + /* + * reply with the same msize the client sent us + * Error out if the request is not for 9P2000.L + */ + if (!strcmp(version, VIRTIO_9P_VERSION_DOTL)) + virtio_p9_pdu_writef(pdu, "ds", msize, version); + else + virtio_p9_pdu_writef(pdu, "ds", msize, "unknown"); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(version); + return; +} + +static void virtio_p9_clunk(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid; + + virtio_p9_pdu_readf(pdu, "d", &fid); + close_fid(p9dev, fid); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +} + +/* + * FIXME!! Need to map to protocol independent value. Upstream + * 9p also have the same BUG + */ +static int virtio_p9_openflags(int flags) +{ + flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT); + flags |= O_NOFOLLOW; + return flags; +} + +static bool is_dir(struct p9_fid *fid) +{ + struct stat st; + + stat(fid->abs_path, &st); + + return S_ISDIR(st.st_mode); +} + +static void virtio_p9_open(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid, flags; + struct stat st; + struct p9_qid qid; + struct p9_fid *new_fid; + + + virtio_p9_pdu_readf(pdu, "dd", &fid, &flags); + new_fid = get_fid(p9dev, fid); + + if (lstat(new_fid->abs_path, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + + if (is_dir(new_fid)) { + new_fid->dir = opendir(new_fid->abs_path); + if (!new_fid->dir) + goto err_out; + } else { + new_fid->fd = open(new_fid->abs_path, + virtio_p9_openflags(flags)); + if (new_fid->fd < 0) + goto err_out; + } + /* FIXME!! 
need ot send proper iounit */ + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_create(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int fd, ret; + char *name; + struct stat st; + struct p9_qid qid; + struct p9_fid *dfid; + char full_path[PATH_MAX]; + u32 dfid_val, flags, mode, gid; + + virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val, + &name, &flags, &mode, &gid); + dfid = get_fid(p9dev, dfid_val); + + flags = virtio_p9_openflags(flags); + + sprintf(full_path, "%s/%s", dfid->abs_path, name); + fd = open(full_path, flags | O_CREAT, mode); + if (fd < 0) + goto err_out; + dfid->fd = fd; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + sprintf(dfid->path, "%s/%s", dfid->path, name); + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(name); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_mkdir(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + struct stat st; + struct p9_qid qid; + struct p9_fid *dfid; + char full_path[PATH_MAX]; + u32 dfid_val, mode, gid; + + virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val, + &name, &mode, &gid); + dfid = get_fid(p9dev, dfid_val); + + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = mkdir(full_path, mode); + if (ret < 0) + goto err_out; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Qd", &qid, 0); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(name); + return; +err_out: + free(name); + 
virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_walk(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u8 i; + u16 nwqid; + u16 nwname; + struct p9_qid wqid; + struct p9_fid *new_fid, *old_fid; + u32 fid_val, newfid_val; + + + virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname); + new_fid = get_fid(p9dev, newfid_val); + + nwqid = 0; + if (nwname) { + struct p9_fid *fid = get_fid(p9dev, fid_val); + + strcpy(new_fid->path, fid->path); + /* skip the space for count */ + pdu->write_offset += sizeof(u16); + for (i = 0; i < nwname; i++) { + struct stat st; + char tmp[PATH_MAX] = {0}; + char full_path[PATH_MAX]; + char *str; + + virtio_p9_pdu_readf(pdu, "s", &str); + + /* Format the new path we're 'walk'ing into */ + sprintf(tmp, "%s/%s", new_fid->path, str); + + free(str); + + if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0) + goto err_out; + + stat2qid(&st, &wqid); + strcpy(new_fid->path, tmp); + new_fid->uid = fid->uid; + nwqid++; + virtio_p9_pdu_writef(pdu, "Q", &wqid); + } + } else { + /* + * update write_offset so our outlen get correct value + */ + pdu->write_offset += sizeof(u16); + old_fid = get_fid(p9dev, fid_val); + strcpy(new_fid->path, old_fid->path); + new_fid->uid = old_fid->uid; + } + *outlen = pdu->write_offset; + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", nwqid); + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_attach(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + char *uname; + char *aname; + struct stat st; + struct p9_qid qid; + struct p9_fid *fid; + u32 fid_val, afid, uid; + + virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid, + &uname, &aname, &uid); + + free(uname); + free(aname); + + if (lstat(p9dev->root_dir, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + + fid = get_fid(p9dev, fid_val); + fid->uid = uid; + strcpy(fid->path, 
"/"); + + virtio_p9_pdu_writef(pdu, "Q", &qid); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_fill_stat(struct p9_dev *p9dev, + struct stat *st, struct p9_stat_dotl *statl) +{ + memset(statl, 0, sizeof(*statl)); + statl->st_mode = st->st_mode; + statl->st_nlink = st->st_nlink; + statl->st_uid = st->st_uid; + statl->st_gid = st->st_gid; + statl->st_rdev = st->st_rdev; + statl->st_size = st->st_size; + statl->st_blksize = st->st_blksize; + statl->st_blocks = st->st_blocks; + statl->st_atime_sec = st->st_atime; + statl->st_atime_nsec = st->st_atim.tv_nsec; + statl->st_mtime_sec = st->st_mtime; + statl->st_mtime_nsec = st->st_mtim.tv_nsec; + statl->st_ctime_sec = st->st_ctime; + statl->st_ctime_nsec = st->st_ctim.tv_nsec; + /* Currently we only support BASIC fields in stat */ + statl->st_result_mask = P9_STATS_BASIC; + stat2qid(st, &statl->qid); +} + +static void virtio_p9_read(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u64 offset; + u32 fid_val; + u16 iov_cnt; + void *iov_base; + size_t iov_len; + u32 count, rcount; + struct p9_fid *fid; + + + rcount = 0; + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + iov_base = pdu->in_iov[0].iov_base; + iov_len = pdu->in_iov[0].iov_len; + iov_cnt = pdu->in_iov_cnt; + pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32); + pdu->in_iov[0].iov_len -= VIRTIO_9P_HDR_LEN + sizeof(u32); + pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov, + count, + pdu->in_iov_cnt); + rcount = preadv(fid->fd, pdu->in_iov, + pdu->in_iov_cnt, offset); + if (rcount > count) + rcount = count; + /* + * Update the iov_base back, so that rest of + * pdu_writef works correctly. 
+ */ + pdu->in_iov[0].iov_base = iov_base; + pdu->in_iov[0].iov_len = iov_len; + pdu->in_iov_cnt = iov_cnt; + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", rcount); + *outlen = pdu->write_offset + rcount; + virtio_p9_set_reply_header(pdu, *outlen); + return; +} + +static int virtio_p9_dentry_size(struct dirent *dent) +{ + /* + * Size of each dirent: + * qid(13) + offset(8) + type(1) + name_len(2) + name + */ + return 24 + strlen(dent->d_name); +} + +static void virtio_p9_readdir(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid_val; + u32 count, rcount; + struct stat st; + struct p9_fid *fid; + struct dirent *dent; + char full_path[PATH_MAX]; + u64 offset, old_offset; + + rcount = 0; + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + if (!is_dir(fid)) { + errno = EINVAL; + goto err_out; + } + + /* Move the offset specified */ + seekdir(fid->dir, offset); + + old_offset = offset; + /* If reading a dir, fill the buffer with p9_stat entries */ + dent = readdir(fid->dir); + + /* Skip the space for writing count */ + pdu->write_offset += sizeof(u32); + while (dent) { + u32 read; + struct p9_qid qid; + + if ((rcount + virtio_p9_dentry_size(dent)) > count) { + /* seek to the previous offset and return */ + seekdir(fid->dir, old_offset); + break; + } + old_offset = dent->d_off; + lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st); + stat2qid(&st, &qid); + read = pdu->write_offset; + virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off, + dent->d_type, dent->d_name); + rcount += pdu->write_offset - read; + dent = readdir(fid->dir); + } + + pdu->write_offset = VIRTIO_9P_HDR_LEN; + virtio_p9_pdu_writef(pdu, "d", rcount); + *outlen = pdu->write_offset + rcount; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + + +static void virtio_p9_getattr(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) 
+{ + u32 fid_val; + struct stat st; + u64 request_mask; + struct p9_fid *fid; + struct p9_stat_dotl statl; + + virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask); + fid = get_fid(p9dev, fid_val); + if (lstat(fid->abs_path, &st) < 0) + goto err_out; + + virtio_p9_fill_stat(p9dev, &st, &statl); + virtio_p9_pdu_writef(pdu, "A", &statl); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +/* FIXME!! from linux/fs.h */ +/* + * Attribute flags. These should be or-ed together to figure out what + * has been changed! + */ +#define ATTR_MODE (1 << 0) +#define ATTR_UID (1 << 1) +#define ATTR_GID (1 << 2) +#define ATTR_SIZE (1 << 3) +#define ATTR_ATIME (1 << 4) +#define ATTR_MTIME (1 << 5) +#define ATTR_CTIME (1 << 6) +#define ATTR_ATIME_SET (1 << 7) +#define ATTR_MTIME_SET (1 << 8) +#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG (1 << 10) +#define ATTR_KILL_SUID (1 << 11) +#define ATTR_KILL_SGID (1 << 12) +#define ATTR_FILE (1 << 13) +#define ATTR_KILL_PRIV (1 << 14) +#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ +#define ATTR_TIMES_SET (1 << 16) + +#define ATTR_MASK 127 + +static void virtio_p9_setattr(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret = 0; + u32 fid_val; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr); + fid = get_fid(p9dev, fid_val); + + if (p9attr.valid & ATTR_MODE) { + ret = chmod(fid->abs_path, p9attr.mode); + if (ret < 0) + goto err_out; + } + if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) { + struct timespec times[2]; + if (p9attr.valid & ATTR_ATIME) { + if (p9attr.valid & ATTR_ATIME_SET) { + times[0].tv_sec = p9attr.atime_sec; + times[0].tv_nsec = p9attr.atime_nsec; + } else { + times[0].tv_nsec = UTIME_NOW; + } + } else { + times[0].tv_nsec = UTIME_OMIT; + } + if (p9attr.valid & ATTR_MTIME) { + if 
(p9attr.valid & ATTR_MTIME_SET) { + times[1].tv_sec = p9attr.mtime_sec; + times[1].tv_nsec = p9attr.mtime_nsec; + } else { + times[1].tv_nsec = UTIME_NOW; + } + } else + times[1].tv_nsec = UTIME_OMIT; + + ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW); + if (ret < 0) + goto err_out; + } + /* + * If the only valid entry in iattr is ctime we can call + * chown(-1,-1) to update the ctime of the file + */ + if ((p9attr.valid & (ATTR_UID | ATTR_GID)) || + ((p9attr.valid & ATTR_CTIME) + && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) { + if (!(p9attr.valid & ATTR_UID)) + p9attr.uid = -1; + + if (!(p9attr.valid & ATTR_GID)) + p9attr.gid = -1; + + ret = lchown(fid->abs_path, p9attr.uid, p9attr.gid); + if (ret < 0) + goto err_out; + } + if (p9attr.valid & (ATTR_SIZE)) { + ret = truncate(fid->abs_path, p9attr.size); + if (ret < 0) + goto err_out; + } + *outlen = VIRTIO_9P_HDR_LEN; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_write(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + + u64 offset; + u32 fid_val; + u32 count; + ssize_t res; + u16 iov_cnt; + void *iov_base; + size_t iov_len; + struct p9_fid *fid; + /* u32 fid + u64 offset + u32 count */ + int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32); + + virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count); + fid = get_fid(p9dev, fid_val); + + iov_base = pdu->out_iov[0].iov_base; + iov_len = pdu->out_iov[0].iov_len; + iov_cnt = pdu->out_iov_cnt; + + /* Adjust the iovec to skip the header and meta data */ + pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size); + pdu->out_iov[0].iov_len -= (sizeof(struct p9_msg) + twrite_size); + pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count, + pdu->out_iov_cnt); + res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset); + /* + * Update the iov_base back, so that rest of + * pdu_readf works correctly. 
+ */ + pdu->out_iov[0].iov_base = iov_base; + pdu->out_iov[0].iov_len = iov_len; + pdu->out_iov_cnt = iov_cnt; + + if (res < 0) + goto err_out; + virtio_p9_pdu_writef(pdu, "d", res); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_remove(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val; + struct p9_fid *fid; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + ret = remove(fid->abs_path); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; + +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_rename(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val, new_fid_val; + struct p9_fid *fid, *new_fid; + char full_path[PATH_MAX], *new_name; + + virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name); + fid = get_fid(p9dev, fid_val); + new_fid = get_fid(p9dev, new_fid_val); + + sprintf(full_path, "%s/%s", new_fid->abs_path, new_name); + ret = rename(fid->abs_path, full_path); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; + +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_readlink(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u32 fid_val; + struct p9_fid *fid; + char target_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + memset(target_path, 0, PATH_MAX); + ret = readlink(fid->abs_path, target_path, PATH_MAX - 1); + if (ret < 0) + goto err_out; + + virtio_p9_pdu_writef(pdu, "s", target_path); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, 
outlen); + return; +} + +static void virtio_p9_statfs(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + u64 fsid; + u32 fid_val; + struct p9_fid *fid; + struct statfs stat_buf; + + virtio_p9_pdu_readf(pdu, "d", &fid_val); + fid = get_fid(p9dev, fid_val); + + ret = statfs(fid->abs_path, &stat_buf); + if (ret < 0) + goto err_out; + /* FIXME!! f_blocks needs update based on client msize */ + fsid = (unsigned int) stat_buf.f_fsid.__val[0] | + (unsigned long long)stat_buf.f_fsid.__val[1] << 32; + virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type, + stat_buf.f_bsize, stat_buf.f_blocks, + stat_buf.f_bfree, stat_buf.f_bavail, + stat_buf.f_files, stat_buf.f_ffree, + fsid, stat_buf.f_namelen); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_mknod(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + struct stat st; + struct p9_fid *dfid; + struct p9_qid qid; + char full_path[PATH_MAX]; + u32 fid_val, mode, major, minor, gid; + + virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode, + &major, &minor, &gid); + + dfid = get_fid(p9dev, fid_val); + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = mknod(full_path, mode, makedev(major, minor)); + if (ret < 0) + goto err_out; + + if (lstat(full_path, &st) < 0) + goto err_out; + + ret = chmod(full_path, mode & 0777); + if (ret < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Q", &qid); + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_fsync(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + struct p9_fid *fid; + u32 fid_val, datasync; + + virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync); + fid = get_fid(p9dev, 
fid_val); + + if (datasync) + ret = fdatasync(fid->fd); + else + ret = fsync(fid->fd); + if (ret < 0) + goto err_out; + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_symlink(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + struct stat st; + u32 fid_val, gid; + struct p9_qid qid; + struct p9_fid *dfid; + char new_name[PATH_MAX]; + char *old_path, *name; + + virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid); + + dfid = get_fid(p9dev, fid_val); + sprintf(new_name, "%s/%s", dfid->abs_path, name); + ret = symlink(old_path, new_name); + if (ret < 0) + goto err_out; + + if (lstat(new_name, &st) < 0) + goto err_out; + + stat2qid(&st, &qid); + virtio_p9_pdu_writef(pdu, "Q", &qid); + free(name); + free(old_path); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + free(old_path); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_link(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + u32 fid_val, dfid_val; + struct p9_fid *dfid, *fid; + char full_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name); + + dfid = get_fid(p9dev, dfid_val); + fid = get_fid(p9dev, fid_val); + sprintf(full_path, "%s/%s", dfid->abs_path, name); + ret = link(fid->abs_path, full_path); + if (ret < 0) + goto err_out; + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; + +} + +static void virtio_p9_lock(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u8 ret; + u32 fid_val; + struct p9_flock flock; + + virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type, + &flock.flags, &flock.start, &flock.length, + &flock.proc_id, 
&flock.client_id); + + /* Just return success */ + ret = P9_LOCK_SUCCESS; + virtio_p9_pdu_writef(pdu, "d", ret); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(flock.client_id); + return; +} + +static void virtio_p9_getlock(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u32 fid_val; + struct p9_getlock glock; + virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type, + &glock.start, &glock.length, &glock.proc_id, + &glock.client_id); + + /* Just return success */ + glock.type = F_UNLCK; + virtio_p9_pdu_writef(pdu, "bqqds", glock.type, + glock.start, glock.length, glock.proc_id, + glock.client_id); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + free(glock.client_id); + return; +} + +static int virtio_p9_ancestor(char *path, char *ancestor) +{ + int size = strlen(ancestor); + if (!strncmp(path, ancestor, size)) { + /* + * Now check whether ancestor is a full name or + * or directory component and not just part + * of a name. 
+ */ + if (path[size] == '\0' || path[size] == '/') + return 1; + } + return 0; +} + +static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name) +{ + char tmp_name[PATH_MAX]; + size_t rp_sz = strlen(old_name); + + if (rp_sz == strlen(fid_path)) { + /* replace the full name */ + strcpy(fid_path, new_name); + return; + } + /* save the trailing path details */ + strcpy(tmp_name, fid_path + rp_sz); + sprintf(fid_path, "%s%s", new_name, tmp_name); + return; +} + +static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name) +{ + struct rb_node *node = rb_first(&p9dev->fids); + + while (node) { + struct p9_fid *fid = rb_entry(node, struct p9_fid, node); + + if (fid->fid != P9_NOFID && virtio_p9_ancestor(fid->path, old_name)) { + virtio_p9_fix_path(fid->path, old_name, new_name); + } + node = rb_next(node); + } +} + +static void virtio_p9_renameat(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *old_name, *new_name; + u32 old_dfid_val, new_dfid_val; + struct p9_fid *old_dfid, *new_dfid; + char old_full_path[PATH_MAX], new_full_path[PATH_MAX]; + + + virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name, + &new_dfid_val, &new_name); + + old_dfid = get_fid(p9dev, old_dfid_val); + new_dfid = get_fid(p9dev, new_dfid_val); + + sprintf(old_full_path, "%s/%s", old_dfid->abs_path, old_name); + sprintf(new_full_path, "%s/%s", new_dfid->abs_path, new_name); + ret = rename(old_full_path, new_full_path); + if (ret < 0) + goto err_out; + /* + * Now fix path in other fids, if the renamed path is part of + * that. 
+ */ + rename_fids(p9dev, old_name, new_name); + free(old_name); + free(new_name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(old_name); + free(new_name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_unlinkat(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + int ret; + char *name; + u32 fid_val, flags; + struct p9_fid *fid; + char full_path[PATH_MAX]; + + virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags); + fid = get_fid(p9dev, fid_val); + + sprintf(full_path, "%s/%s", fid->abs_path, name); + ret = remove(full_path); + if (ret < 0) + goto err_out; + free(name); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + return; +err_out: + free(name); + virtio_p9_error_reply(p9dev, pdu, errno, outlen); + return; +} + +static void virtio_p9_flush(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + u16 tag, oldtag; + + virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag); + virtio_p9_pdu_writef(pdu, "w", tag); + *outlen = pdu->write_offset; + virtio_p9_set_reply_header(pdu, *outlen); + + return; +} + +static void virtio_p9_eopnotsupp(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen) +{ + return virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen); +} + +typedef void p9_handler(struct p9_dev *p9dev, + struct p9_pdu *pdu, u32 *outlen); + +/* FIXME should be removed when merging with latest linus tree */ +#define P9_TRENAMEAT 74 +#define P9_TUNLINKAT 76 + +static p9_handler *virtio_9p_dotl_handler [] = { + [P9_TREADDIR] = virtio_p9_readdir, + [P9_TSTATFS] = virtio_p9_statfs, + [P9_TGETATTR] = virtio_p9_getattr, + [P9_TSETATTR] = virtio_p9_setattr, + [P9_TXATTRWALK] = virtio_p9_eopnotsupp, + [P9_TXATTRCREATE] = virtio_p9_eopnotsupp, + [P9_TMKNOD] = virtio_p9_mknod, + [P9_TLOCK] = virtio_p9_lock, + [P9_TGETLOCK] = virtio_p9_getlock, + [P9_TRENAMEAT] = virtio_p9_renameat, + [P9_TREADLINK] = virtio_p9_readlink, + 
[P9_TUNLINKAT] = virtio_p9_unlinkat, + [P9_TMKDIR] = virtio_p9_mkdir, + [P9_TVERSION] = virtio_p9_version, + [P9_TLOPEN] = virtio_p9_open, + [P9_TATTACH] = virtio_p9_attach, + [P9_TWALK] = virtio_p9_walk, + [P9_TCLUNK] = virtio_p9_clunk, + [P9_TFSYNC] = virtio_p9_fsync, + [P9_TREAD] = virtio_p9_read, + [P9_TFLUSH] = virtio_p9_flush, + [P9_TLINK] = virtio_p9_link, + [P9_TSYMLINK] = virtio_p9_symlink, + [P9_TLCREATE] = virtio_p9_create, + [P9_TWRITE] = virtio_p9_write, + [P9_TREMOVE] = virtio_p9_remove, + [P9_TRENAME] = virtio_p9_rename, +}; + +static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq) +{ + struct p9_pdu *pdu = calloc(1, sizeof(*pdu)); + if (!pdu) + return NULL; + + /* skip the pdu header p9_msg */ + pdu->read_offset = VIRTIO_9P_HDR_LEN; + pdu->write_offset = VIRTIO_9P_HDR_LEN; + pdu->queue_head = virt_queue__get_inout_iov(kvm, vq, pdu->in_iov, + pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt); + return pdu; +} + +static u8 virtio_p9_get_cmd(struct p9_pdu *pdu) +{ + struct p9_msg *msg; + /* + * we can peek directly into pdu for a u8 + * value. 
The host endianess won't be an issue + */ + msg = pdu->out_iov[0].iov_base; + return msg->cmd; +} + +static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job) +{ + u8 cmd; + u32 len = 0; + p9_handler *handler; + struct p9_dev *p9dev; + struct virt_queue *vq; + struct p9_pdu *p9pdu; + + vq = job->vq; + p9dev = job->p9dev; + + p9pdu = virtio_p9_pdu_init(kvm, vq); + cmd = virtio_p9_get_cmd(p9pdu); + + if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) || + !virtio_9p_dotl_handler[cmd]) + handler = virtio_p9_eopnotsupp; + else + handler = virtio_9p_dotl_handler[cmd]; + + handler(p9dev, p9pdu, &len); + virt_queue__set_used_elem(vq, p9pdu->queue_head, len); + free(p9pdu); + return true; +} + +static void virtio_p9_do_io(struct kvm *kvm, void *param) +{ + struct p9_dev_job *job = (struct p9_dev_job *)param; + struct p9_dev *p9dev = job->p9dev; + struct virt_queue *vq = job->vq; + + while (virt_queue__available(vq)) { + virtio_p9_do_io_request(kvm, job); + p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs); + } +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct p9_dev *p9dev = dev; + + return ((u8 *)(p9dev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1 << VIRTIO_9P_MOUNT_TAG; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct p9_dev *p9dev = dev; + + p9dev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct p9_dev *p9dev = dev; + struct p9_dev_job *job; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &p9dev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + job = &p9dev->jobs[vq]; + + vring_init(&queue->vring, VIRTQUEUE_NUM, p, align); + + *job = (struct p9_dev_job) { + .vq = queue, + .p9dev = p9dev, + }; + thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job); + + return 0; +} + 
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct p9_dev *p9dev = dev; + + thread_pool__do_job(&p9dev->jobs[vq].job_id); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct p9_dev *p9dev = dev; + + return p9dev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTQUEUE_NUM; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +struct virtio_ops p9_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset) +{ + char *tag_name; + char tmp[PATH_MAX]; + struct kvm *kvm = opt->ptr; + + /* + * 9p dir can be of the form dirname,tag_name or + * just dirname. 
In the later case we use the + * default tag name + */ + tag_name = strstr(arg, ","); + if (tag_name) { + *tag_name = '\0'; + tag_name++; + } + if (realpath(arg, tmp)) { + if (virtio_9p__register(kvm, tmp, tag_name) < 0) + die("Unable to initialize virtio 9p"); + } else + die("Failed resolving 9p path"); + return 0; +} + +int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset) +{ + char path[PATH_MAX]; + struct stat st; + struct kvm *kvm = opt->ptr; + + if (stat(arg, &st) == 0 && + S_ISDIR(st.st_mode)) { + char tmp[PATH_MAX]; + + if (kvm->cfg.using_rootfs) + die("Please use only one rootfs directory atmost"); + + if (realpath(arg, tmp) == 0 || + virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + kvm->cfg.using_rootfs = 1; + return 0; + } + + snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg); + + if (stat(path, &st) == 0 && + S_ISDIR(st.st_mode)) { + char tmp[PATH_MAX]; + + if (kvm->cfg.using_rootfs) + die("Please use only one rootfs directory atmost"); + + if (realpath(path, tmp) == 0 || + virtio_9p__register(kvm, tmp, "/dev/root") < 0) + die("Unable to initialize virtio 9p"); + if (virtio_9p__register(kvm, "/", "hostfs") < 0) + die("Unable to initialize virtio 9p"); + kvm_setup_resolv(arg); + kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1; + kvm->cfg.custom_rootfs_name = arg; + return 0; + } + + return -1; +} + +int virtio_9p__init(struct kvm *kvm) +{ + struct p9_dev *p9dev; + + list_for_each_entry(p9dev, &devs, list) { + virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_9P, + VIRTIO_ID_9P, PCI_CLASS_9P); + } + + return 0; +} +virtio_dev_init(virtio_9p__init); + +int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name) +{ + struct p9_dev *p9dev; + int err = 0; + + p9dev = calloc(1, sizeof(*p9dev)); + if (!p9dev) + return -ENOMEM; + + if (!tag_name) + tag_name = VIRTIO_9P_DEFAULT_TAG; + + p9dev->config = calloc(1, 
sizeof(*p9dev->config) + strlen(tag_name) + 1); + if (p9dev->config == NULL) { + err = -ENOMEM; + goto free_p9dev; + } + + strcpy(p9dev->root_dir, root); + p9dev->config->tag_len = strlen(tag_name); + if (p9dev->config->tag_len > MAX_TAG_LEN) { + err = -EINVAL; + goto free_p9dev_config; + } + + memcpy(&p9dev->config->tag, tag_name, strlen(tag_name)); + + list_add(&p9dev->list, &devs); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO"); + + return err; + +free_p9dev_config: + free(p9dev->config); +free_p9dev: + free(p9dev); + return err; +} diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c new file mode 100644 index 000000000000..d1b64fabbc65 --- /dev/null +++ b/tools/kvm/virtio/balloon.c @@ -0,0 +1,279 @@ +#include "kvm/virtio-balloon.h" + +#include "kvm/virtio-pci-dev.h" + +#include "kvm/virtio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/guest_compat.h" +#include "kvm/kvm-ipc.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_balloon.h> + +#include <linux/kernel.h> +#include <linux/list.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <pthread.h> +#include <sys/eventfd.h> + +#define NUM_VIRT_QUEUES 3 +#define VIRTIO_BLN_QUEUE_SIZE 128 +#define VIRTIO_BLN_INFLATE 0 +#define VIRTIO_BLN_DEFLATE 1 +#define VIRTIO_BLN_STATS 2 + +struct bln_dev { + struct list_head list; + struct virtio_device vdev; + + u32 features; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct thread_pool__job jobs[NUM_VIRT_QUEUES]; + + struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + struct virtio_balloon_stat *cur_stat; + u32 cur_stat_head; + u16 stat_count; + int stat_waitfd; + + struct virtio_balloon_config config; +}; + +static struct bln_dev bdev; +static int compat_id = -1; + +static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct 
virt_queue *queue) +{ + struct iovec iov[VIRTIO_BLN_QUEUE_SIZE]; + unsigned int len = 0; + u16 out, in, head; + u32 *ptrs, i; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + ptrs = iov[0].iov_base; + len = iov[0].iov_len / sizeof(u32); + + for (i = 0 ; i < len ; i++) { + void *guest_ptr; + + guest_ptr = guest_flat_to_host(kvm, ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT); + if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) { + madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED); + bdev->config.actual++; + } else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) { + bdev->config.actual--; + } + } + + virt_queue__set_used_elem(queue, head, len); + + return true; +} + +static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue) +{ + struct iovec iov[VIRTIO_BLN_QUEUE_SIZE]; + u16 out, in, head; + struct virtio_balloon_stat *stat; + u64 wait_val = 1; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + stat = iov[0].iov_base; + + /* Initial empty stat buffer */ + if (bdev->cur_stat == NULL) { + bdev->cur_stat = stat; + bdev->cur_stat_head = head; + + return true; + } + + memcpy(bdev->stats, stat, iov[0].iov_len); + + bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat); + bdev->cur_stat = stat; + bdev->cur_stat_head = head; + + if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0) + return -EFAULT; + + return 1; +} + +static void virtio_bln_do_io(struct kvm *kvm, void *param) +{ + struct virt_queue *vq = param; + + if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) { + virtio_bln_do_stat_request(kvm, &bdev, vq); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS); + return; + } + + while (virt_queue__available(vq)) { + virtio_bln_do_io_request(kvm, &bdev, vq); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs); + } +} + +static int virtio_bln__collect_stats(struct kvm *kvm) +{ + u64 tmp; + + virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head, + 
sizeof(struct virtio_balloon_stat)); + bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS); + + if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0) + return -EFAULT; + + return 0; +} + +static void virtio_bln__print_stats(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int r; + + if (WARN_ON(type != KVM_IPC_STAT || len)) + return; + + if (virtio_bln__collect_stats(kvm) < 0) + return; + + r = write(fd, bdev.stats, sizeof(bdev.stats)); + if (r < 0) + pr_warning("Failed sending memory stats"); +} + +static void handle_mem(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg) +{ + int mem; + + if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int))) + return; + + mem = *(int *)msg; + if (mem > 0) { + bdev.config.num_pages += 256 * mem; + } else if (mem < 0) { + if (bdev.config.num_pages < (u32)(256 * (-mem))) + return; + + bdev.config.num_pages += 256 * mem; + } + + /* Notify that the configuration space has changed */ + bdev.vdev.ops->signal_config(kvm, &bdev.vdev); +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct bln_dev *bdev = dev; + + return ((u8 *)(&bdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1 << VIRTIO_BALLOON_F_STATS_VQ; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct bln_dev *bdev = dev; + + bdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct bln_dev *bdev = dev; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &bdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue); + vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct bln_dev *bdev = dev; + + thread_pool__do_job(&bdev->jobs[vq]); + + return 0; +} + +static 
int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct bln_dev *bdev = dev; + + return bdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_BLN_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_bln__init(struct kvm *kvm) +{ + if (!kvm->cfg.balloon) + return 0; + + kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem); + kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats); + + bdev.stat_waitfd = eventfd(0, 0); + memset(&bdev.config, 0, sizeof(struct virtio_balloon_config)); + + virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_BLN, + VIRTIO_ID_BALLOON, PCI_CLASS_BLN); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON"); + + return 0; +} +virtio_dev_init(virtio_bln__init); + +int virtio_bln__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_bln__exit); diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c new file mode 100644 index 000000000000..44ac44baffdb --- /dev/null +++ b/tools/kvm/virtio/blk.c @@ -0,0 +1,319 @@ +#include "kvm/virtio-blk.h" + +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/ioeventfd.h" +#include "kvm/guest_compat.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_blk.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include 
<linux/types.h> +#include <pthread.h> + +#define VIRTIO_BLK_MAX_DEV 4 + +/* + * the header and status consume too entries + */ +#define DISK_SEG_MAX (VIRTIO_BLK_QUEUE_SIZE - 2) +#define VIRTIO_BLK_QUEUE_SIZE 256 +#define NUM_VIRT_QUEUES 1 + +struct blk_dev_req { + struct virt_queue *vq; + struct blk_dev *bdev; + struct iovec iov[VIRTIO_BLK_QUEUE_SIZE]; + u16 out, in, head; + struct kvm *kvm; +}; + +struct blk_dev { + struct mutex mutex; + + struct list_head list; + + struct virtio_device vdev; + struct virtio_blk_config blk_config; + struct disk_image *disk; + u32 features; + + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct blk_dev_req reqs[VIRTIO_BLK_QUEUE_SIZE]; + + pthread_t io_thread; + int io_efd; + + struct kvm *kvm; +}; + +static LIST_HEAD(bdevs); +static int compat_id = -1; + +void virtio_blk_complete(void *param, long len) +{ + struct blk_dev_req *req = param; + struct blk_dev *bdev = req->bdev; + int queueid = req->vq - bdev->vqs; + u8 *status; + + /* status */ + status = req->iov[req->out + req->in - 1].iov_base; + *status = (len < 0) ? 
VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + + mutex_lock(&bdev->mutex); + virt_queue__set_used_elem(req->vq, req->head, len); + mutex_unlock(&bdev->mutex); + + if (virtio_queue__should_signal(&bdev->vqs[queueid])) + bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid); +} + +static void virtio_blk_do_io_request(struct kvm *kvm, struct blk_dev_req *req) +{ + struct virtio_blk_outhdr *req_hdr; + ssize_t block_cnt; + struct blk_dev *bdev; + struct iovec *iov; + u16 out, in; + + block_cnt = -1; + bdev = req->bdev; + iov = req->iov; + out = req->out; + in = req->in; + req_hdr = iov[0].iov_base; + + switch (req_hdr->type) { + case VIRTIO_BLK_T_IN: + block_cnt = disk_image__read(bdev->disk, req_hdr->sector, + iov + 1, in + out - 2, req); + break; + case VIRTIO_BLK_T_OUT: + block_cnt = disk_image__write(bdev->disk, req_hdr->sector, + iov + 1, in + out - 2, req); + break; + case VIRTIO_BLK_T_FLUSH: + block_cnt = disk_image__flush(bdev->disk); + virtio_blk_complete(req, block_cnt); + break; + case VIRTIO_BLK_T_GET_ID: + block_cnt = VIRTIO_BLK_ID_BYTES; + disk_image__get_serial(bdev->disk, + (iov + 1)->iov_base, &block_cnt); + virtio_blk_complete(req, block_cnt); + break; + default: + pr_warning("request type %d", req_hdr->type); + block_cnt = -1; + break; + } +} + +static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev) +{ + struct blk_dev_req *req; + u16 head; + + while (virt_queue__available(vq)) { + head = virt_queue__pop(vq); + req = &bdev->reqs[head]; + req->head = virt_queue__get_head_iov(vq, req->iov, &req->out, + &req->in, head, kvm); + req->vq = vq; + + virtio_blk_do_io_request(kvm, req); + } +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct blk_dev *bdev = dev; + + return ((u8 *)(&bdev->blk_config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_BLK_F_SEG_MAX + | 1UL << VIRTIO_BLK_F_FLUSH + | 1UL << VIRTIO_RING_F_EVENT_IDX + | 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + 
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct blk_dev *bdev = dev; + + bdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct blk_dev *bdev = dev; + struct virt_queue *queue; + void *p; + + compat__remove_message(compat_id); + + queue = &bdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, align); + + return 0; +} + +static void *virtio_blk_thread(void *dev) +{ + struct blk_dev *bdev = dev; + u64 data; + int r; + + kvm__set_thread_name("virtio-blk-io"); + + while (1) { + r = read(bdev->io_efd, &data, sizeof(u64)); + if (r < 0) + continue; + virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev); + } + + pthread_exit(NULL); + return NULL; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct blk_dev *bdev = dev; + u64 data = 1; + int r; + + r = write(bdev->io_efd, &data, sizeof(data)); + if (r < 0) + return r; + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct blk_dev *bdev = dev; + + return bdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + /* FIXME: dynamic */ + return VIRTIO_BLK_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops blk_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk) +{ + struct blk_dev *bdev; + unsigned int i; + + if (!disk) + return -EINVAL; + + bdev = calloc(1, sizeof(struct blk_dev)); + if (bdev == NULL) + return -ENOMEM; + 
+ *bdev = (struct blk_dev) { + .mutex = MUTEX_INITIALIZER, + .disk = disk, + .blk_config = (struct virtio_blk_config) { + .capacity = disk->size / SECTOR_SIZE, + .seg_max = DISK_SEG_MAX, + }, + .io_efd = eventfd(0, 0), + .kvm = kvm, + }; + + virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_BLK, + VIRTIO_ID_BLOCK, PCI_CLASS_BLK); + + list_add_tail(&bdev->list, &bdevs); + + for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) { + bdev->reqs[i].bdev = bdev; + bdev->reqs[i].kvm = kvm; + } + + disk_image__set_callback(bdev->disk, virtio_blk_complete); + + pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev); + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK"); + + return 0; +} + +static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev) +{ + list_del(&bdev->list); + free(bdev); + + return 0; +} + +int virtio_blk__init(struct kvm *kvm) +{ + int i, r = 0; + + for (i = 0; i < kvm->nr_disks; i++) { + if (kvm->disks[i]->wwpn) + continue; + r = virtio_blk__init_one(kvm, kvm->disks[i]); + if (r < 0) + goto cleanup; + } + + return 0; +cleanup: + return virtio_blk__exit(kvm); +} +virtio_dev_init(virtio_blk__init); + +int virtio_blk__exit(struct kvm *kvm) +{ + while (!list_empty(&bdevs)) { + struct blk_dev *bdev; + + bdev = list_first_entry(&bdevs, struct blk_dev, list); + virtio_blk__exit_one(kvm, bdev); + } + + return 0; +} +virtio_dev_exit(virtio_blk__exit); diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c new file mode 100644 index 000000000000..b18d3a925928 --- /dev/null +++ b/tools/kvm/virtio/console.c @@ -0,0 +1,212 @@ +#include "kvm/virtio-console.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/virtio.h" +#include "kvm/ioport.h" +#include "kvm/util.h" +#include "kvm/term.h" +#include "kvm/mutex.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/threadpool.h" +#include "kvm/irq.h" +#include 
"kvm/guest_compat.h" + +#include <linux/virtio_console.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_blk.h> + +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <termios.h> +#include <unistd.h> +#include <fcntl.h> + +#define VIRTIO_CONSOLE_QUEUE_SIZE 128 +#define VIRTIO_CONSOLE_NUM_QUEUES 2 +#define VIRTIO_CONSOLE_RX_QUEUE 0 +#define VIRTIO_CONSOLE_TX_QUEUE 1 + +struct con_dev { + struct mutex mutex; + + struct virtio_device vdev; + struct virt_queue vqs[VIRTIO_CONSOLE_NUM_QUEUES]; + struct virtio_console_config config; + u32 features; + + struct thread_pool__job jobs[VIRTIO_CONSOLE_NUM_QUEUES]; +}; + +static struct con_dev cdev = { + .mutex = MUTEX_INITIALIZER, + + .config = { + .cols = 80, + .rows = 24, + .max_nr_ports = 1, + }, +}; + +static int compat_id = -1; + +/* + * Interrupts are injected for hvc0 only. + */ +static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param) +{ + struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE]; + struct virt_queue *vq; + u16 out, in; + u16 head; + int len; + + if (kvm->cfg.active_console != CONSOLE_VIRTIO) + return; + + mutex_lock(&cdev.mutex); + + vq = param; + + if (term_readable(0) && virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = term_getc_iov(kvm, iov, in, 0); + virt_queue__set_used_elem(vq, head, len); + cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs); + } + + mutex_unlock(&cdev.mutex); +} + +void virtio_console__inject_interrupt(struct kvm *kvm) +{ + thread_pool__do_job(&cdev.jobs[VIRTIO_CONSOLE_RX_QUEUE]); +} + +static void virtio_console_handle_callback(struct kvm *kvm, void *param) +{ + struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE]; + struct virt_queue *vq; + u16 out, in; + u16 head; + u32 len; + + vq = param; + + /* + * The current Linux implementation polls for the buffer + * to be used, rather than waiting for an interrupt. + * So there is no need to inject an interrupt for the tx path. 
+ */ + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + if (kvm->cfg.active_console == CONSOLE_VIRTIO) + len = term_putc_iov(iov, out, 0); + else + len = 0; + virt_queue__set_used_elem(vq, head, len); + } + +} + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct con_dev *cdev = dev; + + return ((u8 *)(&cdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 0; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + /* Unused */ +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct virt_queue *queue; + void *p; + + BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES); + + compat__remove_message(compat_id); + + queue = &cdev.vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align); + + if (vq == VIRTIO_CONSOLE_TX_QUEUE) + thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue); + else if (vq == VIRTIO_CONSOLE_RX_QUEUE) + thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct con_dev *cdev = dev; + + thread_pool__do_job(&cdev->jobs[vq]); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct con_dev *cdev = dev; + + return cdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_CONSOLE_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops con_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + 
.get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_console__init(struct kvm *kvm) +{ + if (kvm->cfg.active_console != CONSOLE_VIRTIO) + return 0; + + virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_CONSOLE, + VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE); + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE"); + + return 0; +} +virtio_dev_init(virtio_console__init); + +int virtio_console__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_console__exit); diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c new file mode 100644 index 000000000000..2dfb828d177f --- /dev/null +++ b/tools/kvm/virtio/core.c @@ -0,0 +1,233 @@ +#include <linux/virtio_ring.h> +#include <linux/types.h> +#include <sys/uio.h> +#include <stdlib.h> + +#include "kvm/guest_compat.h" +#include "kvm/barrier.h" +#include "kvm/virtio.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio-mmio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + + +struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len) +{ + struct vring_used_elem *used_elem; + + used_elem = &queue->vring.used->ring[queue->vring.used->idx % queue->vring.num]; + used_elem->id = head; + used_elem->len = len; + + /* + * Use wmb to assure that used elem was updated with head and len. + * We need a wmb here since we can't advance idx unless we're ready + * to pass the used element to the guest. + */ + wmb(); + queue->vring.used->idx++; + + /* + * Use wmb to assure used idx has been increased before we signal the guest. + * Without a wmb here the guest may ignore the queue since it won't see + * an updated idx. + */ + wmb(); + + return used_elem; +} + +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->vring.num if we're + * at the end. 
+ */ +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; + + /* Check they're not leading us off end of descriptors. */ + next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); + + return next; +} + +u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm) +{ + struct vring_desc *desc; + u16 idx; + u16 max; + + idx = head; + *out = *in = 0; + max = vq->vring.num; + desc = vq->vring.desc; + + if (desc[idx].flags & VRING_DESC_F_INDIRECT) { + max = desc[idx].len / sizeof(struct vring_desc); + desc = guest_flat_to_host(kvm, desc[idx].addr); + idx = 0; + } + + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out + *in].iov_len = desc[idx].len; + iov[*out + *in].iov_base = guest_flat_to_host(kvm, desc[idx].addr); + /* If this is an input descriptor, increment that count. 
*/ + if (desc[idx].flags & VRING_DESC_F_WRITE) + (*in)++; + else + (*out)++; + } while ((idx = next_desc(desc, idx, max)) != max); + + return head; +} + +u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm) +{ + u16 head; + + head = virt_queue__pop(vq); + + return virt_queue__get_head_iov(vq, iov, out, in, head, kvm); +} + +/* in and out are relative to guest */ +u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue, + struct iovec in_iov[], struct iovec out_iov[], + u16 *in, u16 *out) +{ + struct vring_desc *desc; + u16 head, idx; + + idx = head = virt_queue__pop(queue); + *out = *in = 0; + do { + desc = virt_queue__get_desc(queue, idx); + if (desc->flags & VRING_DESC_F_WRITE) { + in_iov[*in].iov_base = guest_flat_to_host(kvm, + desc->addr); + in_iov[*in].iov_len = desc->len; + (*in)++; + } else { + out_iov[*out].iov_base = guest_flat_to_host(kvm, + desc->addr); + out_iov[*out].iov_len = desc->len; + (*out)++; + } + if (desc->flags & VRING_DESC_F_NEXT) + idx = desc->next; + else + break; + } while (1); + + return head; +} + +int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off) +{ + if (msix) { + if (offset < 4) + return VIRTIO_PCI_O_MSIX; + else + offset -= 4; + } + + *config_off = offset; + + return VIRTIO_PCI_O_CONFIG; +} + +bool virtio_queue__should_signal(struct virt_queue *vq) +{ + u16 old_idx, new_idx, event_idx; + + old_idx = vq->last_used_signalled; + new_idx = vq->vring.used->idx; + event_idx = vring_used_event(&vq->vring); + + if (vring_need_event(event_idx, new_idx, old_idx)) { + vq->last_used_signalled = new_idx; + return true; + } + + return false; +} + +int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + struct virtio_ops *ops, enum virtio_trans trans, + int device_id, int subsys_id, int class) +{ + void *virtio; + + switch (trans) { + case VIRTIO_PCI: + virtio = calloc(sizeof(struct virtio_pci), 1); + if (!virtio) + return -ENOMEM; + 
vdev->virtio = virtio; + vdev->ops = ops; + vdev->ops->signal_vq = virtio_pci__signal_vq; + vdev->ops->signal_config = virtio_pci__signal_config; + vdev->ops->init = virtio_pci__init; + vdev->ops->exit = virtio_pci__exit; + vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class); + break; + case VIRTIO_MMIO: + virtio = calloc(sizeof(struct virtio_mmio), 1); + if (!virtio) + return -ENOMEM; + vdev->virtio = virtio; + vdev->ops = ops; + vdev->ops->signal_vq = virtio_mmio_signal_vq; + vdev->ops->signal_config = virtio_mmio_signal_config; + vdev->ops->init = virtio_mmio_init; + vdev->ops->exit = virtio_mmio_exit; + vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class); + break; + default: + return -1; + }; + + return 0; +} + +int virtio_compat_add_message(const char *device, const char *config) +{ + int len = 1024; + int compat_id; + char *title; + char *desc; + + title = malloc(len); + if (!title) + return -ENOMEM; + + desc = malloc(len); + if (!desc) { + free(title); + return -ENOMEM; + } + + snprintf(title, len, "%s device was not detected.", device); + snprintf(desc, len, "While you have requested a %s device, " + "the guest kernel did not initialize it.\n" + "\tPlease make sure that the guest kernel was " + "compiled with %s=y enabled in .config.", + device, config); + + compat_id = compat__add_message(title, desc); + + free(desc); + free(title); + + return compat_id; +} diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c new file mode 100644 index 000000000000..bd30f375950c --- /dev/null +++ b/tools/kvm/virtio/mmio.c @@ -0,0 +1,271 @@ +#include "kvm/devices.h" +#include "kvm/virtio-mmio.h" +#include "kvm/ioeventfd.h" +#include "kvm/ioport.h" +#include "kvm/virtio.h" +#include "kvm/kvm.h" +#include "kvm/irq.h" + +#include <linux/virtio_mmio.h> +#include <string.h> + +static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA; + +static u32 virtio_mmio_get_io_space_block(u32 size) +{ + u32 block = virtio_mmio_io_space_blocks; + 
virtio_mmio_io_space_blocks += size; + + return block; +} + +static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param) +{ + struct virtio_mmio_ioevent_param *ioeventfd = param; + struct virtio_mmio *vmmio = ioeventfd->vdev->virtio; + + ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq); +} + +static int virtio_mmio_init_ioeventfd(struct kvm *kvm, + struct virtio_device *vdev, u32 vq) +{ + struct virtio_mmio *vmmio = vdev->virtio; + struct ioevent ioevent; + int err; + + vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) { + .vdev = vdev, + .vq = vq, + }; + + ioevent = (struct ioevent) { + .io_addr = vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, + .io_len = sizeof(u32), + .fn = virtio_mmio_ioevent_callback, + .fn_ptr = &vmmio->ioeventfds[vq], + .datamatch = vq, + .fn_kvm = kvm, + .fd = eventfd(0, 0), + }; + + if (vdev->use_vhost) + /* + * Vhost will poll the eventfd in host kernel side, + * no need to poll in userspace. + */ + err = ioeventfd__add_event(&ioevent, true, false); + else + /* Need to poll in userspace. 
*/ + err = ioeventfd__add_event(&ioevent, true, true); + if (err) + return err; + + if (vdev->ops->notify_vq_eventfd) + vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd); + + return 0; +} + +int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct virtio_mmio *vmmio = vdev->virtio; + + vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING; + kvm__irq_trigger(vmmio->kvm, vmmio->irq); + + return 0; +} + +int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + + vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG; + kvm__irq_trigger(vmmio->kvm, vmmio->irq); + + return 0; +} + +static void virtio_mmio_device_specific(u64 addr, u8 *data, u32 len, + u8 is_write, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 i; + + for (i = 0; i < len; i++) { + if (is_write) + vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] = + *(u8 *)data + i; + else + data[i] = vdev->ops->get_config(vmmio->kvm, + vmmio->dev)[addr + i]; + } +} + +static void virtio_mmio_config_in(u64 addr, void *data, u32 len, + struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 val = 0; + + switch (addr) { + case VIRTIO_MMIO_MAGIC_VALUE: + case VIRTIO_MMIO_VERSION: + case VIRTIO_MMIO_DEVICE_ID: + case VIRTIO_MMIO_VENDOR_ID: + case VIRTIO_MMIO_STATUS: + case VIRTIO_MMIO_INTERRUPT_STATUS: + ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr)); + break; + case VIRTIO_MMIO_HOST_FEATURES: + if (vmmio->hdr.host_features_sel == 0) + val = vdev->ops->get_host_features(vmmio->kvm, + vmmio->dev); + ioport__write32(data, val); + break; + case VIRTIO_MMIO_QUEUE_PFN: + val = vdev->ops->get_pfn_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel); + ioport__write32(data, val); + break; + case VIRTIO_MMIO_QUEUE_NUM_MAX: + val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel); + ioport__write32(data, val); + break; + 
default: + break; + } +} + +static void virtio_mmio_config_out(u64 addr, void *data, u32 len, + struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u32 val = 0; + + switch (addr) { + case VIRTIO_MMIO_HOST_FEATURES_SEL: + case VIRTIO_MMIO_GUEST_FEATURES_SEL: + case VIRTIO_MMIO_QUEUE_SEL: + case VIRTIO_MMIO_STATUS: + val = ioport__read32(data); + *(u32 *)(((void *)&vmmio->hdr) + addr) = val; + break; + case VIRTIO_MMIO_GUEST_FEATURES: + if (vmmio->hdr.guest_features_sel == 0) { + val = ioport__read32(data); + vdev->ops->set_guest_features(vmmio->kvm, + vmmio->dev, val); + } + break; + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + val = ioport__read32(data); + vmmio->hdr.guest_page_size = val; + break; + case VIRTIO_MMIO_QUEUE_NUM: + val = ioport__read32(data); + vmmio->hdr.queue_num = val; + vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel, val); + break; + case VIRTIO_MMIO_QUEUE_ALIGN: + val = ioport__read32(data); + vmmio->hdr.queue_align = val; + break; + case VIRTIO_MMIO_QUEUE_PFN: + val = ioport__read32(data); + virtio_mmio_init_ioeventfd(vmmio->kvm, vdev, vmmio->hdr.queue_sel); + vdev->ops->init_vq(vmmio->kvm, vmmio->dev, + vmmio->hdr.queue_sel, + vmmio->hdr.guest_page_size, + vmmio->hdr.queue_align, + val); + break; + case VIRTIO_MMIO_QUEUE_NOTIFY: + val = ioport__read32(data); + vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val); + break; + case VIRTIO_MMIO_INTERRUPT_ACK: + val = ioport__read32(data); + vmmio->hdr.interrupt_state &= ~val; + break; + default: + break; + }; +} + +static void virtio_mmio_mmio_callback(u64 addr, u8 *data, u32 len, + u8 is_write, void *ptr) +{ + struct virtio_device *vdev = ptr; + struct virtio_mmio *vmmio = vdev->virtio; + u32 offset = addr - vmmio->addr; + + if (offset >= VIRTIO_MMIO_CONFIG) { + offset -= VIRTIO_MMIO_CONFIG; + virtio_mmio_device_specific(offset, data, len, is_write, ptr); + return; + } + + if (is_write) + virtio_mmio_config_out(offset, data, len, ptr); + else + 
virtio_mmio_config_in(offset, data, len, ptr); +} + +int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class) +{ + struct virtio_mmio *vmmio = vdev->virtio; + u8 pin, line; + + vmmio->addr = virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE); + vmmio->kvm = kvm; + vmmio->dev = dev; + + kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE, + false, virtio_mmio_mmio_callback, vdev); + + vmmio->hdr = (struct virtio_mmio_hdr) { + .magic = {'v', 'i', 'r', 't'}, + .version = 1, + .device_id = subsys_id, + .vendor_id = 0x4d564b4c , /* 'LKVM' */ + .queue_num_max = 256, + }; + + if (irq__register_device(subsys_id, &pin, &line) < 0) + return -1; + vmmio->irq = line; + vmmio->dev_hdr = (struct device_header) { + .bus_type = DEVICE_BUS_MMIO, + .data = vmmio, + }; + + device__register(&vmmio->dev_hdr); + + /* + * Instantiate guest virtio-mmio devices using kernel command line + * (or module) parameter, e.g + * + * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6 + */ + pr_info("virtio-mmio.devices=0x%x@0x%x:%d\n", VIRTIO_MMIO_IO_SIZE, vmmio->addr, line); + + return 0; +} + +int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_mmio *vmmio = vdev->virtio; + int i; + + kvm__deregister_mmio(kvm, vmmio->addr); + + for (i = 0; i < VIRTIO_MMIO_MAX_VQ; i++) + ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, i); + + return 0; +} diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c new file mode 100644 index 000000000000..68bd107254a7 --- /dev/null +++ b/tools/kvm/virtio/net.c @@ -0,0 +1,674 @@ +#include "kvm/virtio-pci-dev.h" +#include "kvm/virtio-net.h" +#include "kvm/virtio.h" +#include "kvm/types.h" +#include "kvm/mutex.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/irq.h" +#include "kvm/uip.h" +#include "kvm/guest_compat.h" + +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/if_tun.h> +#include <linux/types.h> + 
+#include <arpa/inet.h> +#include <net/if.h> + +#include <unistd.h> +#include <fcntl.h> + +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/eventfd.h> + +#define VIRTIO_NET_QUEUE_SIZE 256 +#define VIRTIO_NET_NUM_QUEUES 2 +#define VIRTIO_NET_RX_QUEUE 0 +#define VIRTIO_NET_TX_QUEUE 1 + +struct net_dev; + +struct net_dev_operations { + int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev); + int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev); +}; + +struct net_dev { + struct mutex mutex; + struct virtio_device vdev; + struct list_head list; + + struct virt_queue vqs[VIRTIO_NET_NUM_QUEUES]; + struct virtio_net_config config; + u32 features; + + pthread_t io_rx_thread; + struct mutex io_rx_lock; + pthread_cond_t io_rx_cond; + + pthread_t io_tx_thread; + struct mutex io_tx_lock; + pthread_cond_t io_tx_cond; + + int vhost_fd; + int tap_fd; + char tap_name[IFNAMSIZ]; + + int mode; + + struct uip_info info; + struct net_dev_operations *ops; + struct kvm *kvm; +}; + +static LIST_HEAD(ndevs); +static int compat_id = -1; + +static void *virtio_net_rx_thread(void *p) +{ + struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; + struct virt_queue *vq; + struct kvm *kvm; + struct net_dev *ndev = p; + u16 out, in; + u16 head; + int len; + + kvm__set_thread_name("virtio-net-rx"); + + kvm = ndev->kvm; + vq = &ndev->vqs[VIRTIO_NET_RX_QUEUE]; + + while (1) { + mutex_lock(&ndev->io_rx_lock); + if (!virt_queue__available(vq)) + pthread_cond_wait(&ndev->io_rx_cond, &ndev->io_rx_lock.mutex); + mutex_unlock(&ndev->io_rx_lock); + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = ndev->ops->rx(iov, in, ndev); + virt_queue__set_used_elem(vq, head, len); + + /* We should interrupt guest right now, otherwise latency is huge. 
*/ + if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_RX_QUEUE])) + ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, + VIRTIO_NET_RX_QUEUE); + } + } + + pthread_exit(NULL); + return NULL; + +} + +static void *virtio_net_tx_thread(void *p) +{ + struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; + struct virt_queue *vq; + struct kvm *kvm; + struct net_dev *ndev = p; + u16 out, in; + u16 head; + int len; + + kvm__set_thread_name("virtio-net-tx"); + + kvm = ndev->kvm; + vq = &ndev->vqs[VIRTIO_NET_TX_QUEUE]; + + while (1) { + mutex_lock(&ndev->io_tx_lock); + if (!virt_queue__available(vq)) + pthread_cond_wait(&ndev->io_tx_cond, &ndev->io_tx_lock.mutex); + mutex_unlock(&ndev->io_tx_lock); + + while (virt_queue__available(vq)) { + head = virt_queue__get_iov(vq, iov, &out, &in, kvm); + len = ndev->ops->tx(iov, out, ndev); + virt_queue__set_used_elem(vq, head, len); + } + + if (virtio_queue__should_signal(&ndev->vqs[VIRTIO_NET_TX_QUEUE])) + ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, VIRTIO_NET_TX_QUEUE); + } + + pthread_exit(NULL); + + return NULL; + +} + +static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue) +{ + switch (queue) { + case VIRTIO_NET_TX_QUEUE: + mutex_lock(&ndev->io_tx_lock); + pthread_cond_signal(&ndev->io_tx_cond); + mutex_unlock(&ndev->io_tx_lock); + break; + case VIRTIO_NET_RX_QUEUE: + mutex_lock(&ndev->io_rx_lock); + pthread_cond_signal(&ndev->io_rx_cond); + mutex_unlock(&ndev->io_rx_lock); + break; + default: + pr_warning("Unknown queue index %u", queue); + } +} + +static bool virtio_net__tap_init(const struct virtio_net_params *params, + struct net_dev *ndev) +{ + int sock = socket(AF_INET, SOCK_STREAM, 0); + int pid, status, offload, hdr_len; + struct sockaddr_in sin = {0}; + struct ifreq ifr; + + /* Did the user already gave us the FD? 
*/ + if (params->fd) { + ndev->tap_fd = params->fd; + return 1; + } + + ndev->tap_fd = open("/dev/net/tun", O_RDWR); + if (ndev->tap_fd < 0) { + pr_warning("Unable to open /dev/net/tun"); + goto fail; + } + + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; + if (ioctl(ndev->tap_fd, TUNSETIFF, &ifr) < 0) { + pr_warning("Config tap device error. Are you root?"); + goto fail; + } + + strncpy(ndev->tap_name, ifr.ifr_name, sizeof(ndev->tap_name)); + + if (ioctl(ndev->tap_fd, TUNSETNOCSUM, 1) < 0) { + pr_warning("Config tap device TUNSETNOCSUM error"); + goto fail; + } + + hdr_len = sizeof(struct virtio_net_hdr); + if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0) + pr_warning("Config tap device TUNSETVNETHDRSZ error"); + + offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO; + if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) { + pr_warning("Config tap device TUNSETOFFLOAD error"); + goto fail; + } + + if (strcmp(params->script, "none")) { + pid = fork(); + if (pid == 0) { + execl(params->script, params->script, ndev->tap_name, NULL); + _exit(1); + } else { + waitpid(pid, &status, 0); + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + pr_warning("Fail to setup tap by %s", params->script); + goto fail; + } + } + } else { + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name)); + sin.sin_addr.s_addr = inet_addr(params->host_ip); + memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr)); + ifr.ifr_addr.sa_family = AF_INET; + if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) { + pr_warning("Could not set ip address on tap device"); + goto fail; + } + } + + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name)); + ioctl(sock, SIOCGIFFLAGS, &ifr); + ifr.ifr_flags |= IFF_UP | IFF_RUNNING; + if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) + pr_warning("Could not bring tap device up"); + + close(sock); + + return 1; + +fail: + if (sock >= 0) + close(sock); + if 
(ndev->tap_fd >= 0) + close(ndev->tap_fd); + + return 0; +} + +static void virtio_net__io_thread_init(struct kvm *kvm, struct net_dev *ndev) +{ + mutex_init(&ndev->io_tx_lock); + mutex_init(&ndev->io_rx_lock); + + pthread_cond_init(&ndev->io_tx_cond, NULL); + pthread_cond_init(&ndev->io_rx_cond, NULL); + + pthread_create(&ndev->io_tx_thread, NULL, virtio_net_tx_thread, ndev); + pthread_create(&ndev->io_rx_thread, NULL, virtio_net_rx_thread, ndev); +} + +static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) +{ + return writev(ndev->tap_fd, iov, out); +} + +static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) +{ + return readv(ndev->tap_fd, iov, in); +} + +static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) +{ + return uip_tx(iov, out, &ndev->info); +} + +static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) +{ + return uip_rx(iov, in, &ndev->info); +} + +static struct net_dev_operations tap_ops = { + .rx = tap_ops_rx, + .tx = tap_ops_tx, +}; + +static struct net_dev_operations uip_ops = { + .rx = uip_ops_rx, + .tx = uip_ops_tx, +}; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct net_dev *ndev = dev; + + return ((u8 *)(&ndev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_NET_F_MAC + | 1UL << VIRTIO_NET_F_CSUM + | 1UL << VIRTIO_NET_F_HOST_UFO + | 1UL << VIRTIO_NET_F_HOST_TSO4 + | 1UL << VIRTIO_NET_F_HOST_TSO6 + | 1UL << VIRTIO_NET_F_GUEST_UFO + | 1UL << VIRTIO_NET_F_GUEST_TSO4 + | 1UL << VIRTIO_NET_F_GUEST_TSO6 + | 1UL << VIRTIO_RING_F_EVENT_IDX + | 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct net_dev *ndev = dev; + + ndev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct vhost_vring_state state = { .index = vq }; + struct vhost_vring_addr 
addr; + struct net_dev *ndev = dev; + struct virt_queue *queue; + void *p; + int r; + + compat__remove_message(compat_id); + + queue = &ndev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align); + + if (ndev->vhost_fd == 0) + return 0; + + state.num = queue->vring.num; + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_NUM failed"); + state.num = 0; + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_BASE failed"); + + addr = (struct vhost_vring_addr) { + .index = vq, + .desc_user_addr = (u64)(unsigned long)queue->vring.desc, + .avail_user_addr = (u64)(unsigned long)queue->vring.avail, + .used_user_addr = (u64)(unsigned long)queue->vring.used, + }; + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) + die_perror("VHOST_SET_VRING_ADDR failed"); + + return 0; +} + +static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi) +{ + struct net_dev *ndev = dev; + struct kvm_irqfd irq; + struct vhost_vring_file file; + int r; + + if (ndev->vhost_fd == 0) + return; + + irq = (struct kvm_irqfd) { + .gsi = gsi, + .fd = eventfd(0, 0), + }; + file = (struct vhost_vring_file) { + .index = vq, + .fd = irq.fd, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq); + if (r < 0) + die_perror("KVM_IRQFD failed"); + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_CALL failed"); + file.fd = ndev->tap_fd; + r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (r != 0) + die("VHOST_NET_SET_BACKEND failed %d", errno); + +} + +static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd) +{ + struct net_dev *ndev = dev; + struct vhost_vring_file file = { + .index = vq, + .fd = efd, + }; + int r; + + if (ndev->vhost_fd == 0) + return; + + r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file); + if 
(r < 0) + die_perror("VHOST_SET_VRING_KICK failed"); +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct net_dev *ndev = dev; + + virtio_net_handle_callback(kvm, ndev, vq); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct net_dev *ndev = dev; + + return ndev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + /* FIXME: dynamic */ + return VIRTIO_NET_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops net_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, + .notify_vq = notify_vq, + .notify_vq_gsi = notify_vq_gsi, + .notify_vq_eventfd = notify_vq_eventfd, +}; + +static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev) +{ + u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX; + struct vhost_memory *mem; + int r; + + ndev->vhost_fd = open("/dev/vhost-net", O_RDWR); + if (ndev->vhost_fd < 0) + die_perror("Failed openning vhost-net device"); + + mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region)); + if (mem == NULL) + die("Failed allocating memory for vhost memory map"); + + mem->nregions = 1; + mem->regions[0] = (struct vhost_memory_region) { + .guest_phys_addr = 0, + .memory_size = kvm->ram_size, + .userspace_addr = (unsigned long)kvm->ram_start, + }; + + r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER); + if (r != 0) + die_perror("VHOST_SET_OWNER failed"); + + r = ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_SET_FEATURES failed"); + r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem); + if (r != 0) + die_perror("VHOST_SET_MEM_TABLE failed"); + + ndev->vdev.use_vhost = true; + + free(mem); +} + 
+static inline void str_to_mac(const char *str, char *mac) +{ + sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac, mac+1, mac+2, mac+3, mac+4, mac+5); +} +static int set_net_param(struct kvm *kvm, struct virtio_net_params *p, + const char *param, const char *val) +{ + if (strcmp(param, "guest_mac") == 0) { + str_to_mac(val, p->guest_mac); + } else if (strcmp(param, "mode") == 0) { + if (!strncmp(val, "user", 4)) { + int i; + + for (i = 0; i < kvm->cfg.num_net_devices; i++) + if (kvm->cfg.net_params[i].mode == NET_MODE_USER) + die("Only one usermode network device allowed at a time"); + p->mode = NET_MODE_USER; + } else if (!strncmp(val, "tap", 3)) { + p->mode = NET_MODE_TAP; + } else if (!strncmp(val, "none", 4)) { + kvm->cfg.no_net = 1; + return -1; + } else + die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network); + } else if (strcmp(param, "script") == 0) { + p->script = strdup(val); + } else if (strcmp(param, "guest_ip") == 0) { + p->guest_ip = strdup(val); + } else if (strcmp(param, "host_ip") == 0) { + p->host_ip = strdup(val); + } else if (strcmp(param, "trans") == 0) { + p->trans = strdup(val); + } else if (strcmp(param, "vhost") == 0) { + p->vhost = atoi(val); + } else if (strcmp(param, "fd") == 0) { + p->fd = atoi(val); + } else + die("Unknown network parameter %s", param); + + return 0; +} + +int netdev_parser(const struct option *opt, const char *arg, int unset) +{ + struct virtio_net_params p; + char *buf = NULL, *cmd = NULL, *cur = NULL; + bool on_cmd = true; + struct kvm *kvm = opt->ptr; + + if (arg) { + buf = strdup(arg); + if (buf == NULL) + die("Failed allocating new net buffer"); + cur = strtok(buf, ",="); + } + + p = (struct virtio_net_params) { + .guest_ip = DEFAULT_GUEST_ADDR, + .host_ip = DEFAULT_HOST_ADDR, + .script = DEFAULT_SCRIPT, + .mode = NET_MODE_TAP, + }; + + str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac); + p.guest_mac[5] += kvm->cfg.num_net_devices; + + while (cur) { + if (on_cmd) { + cmd = cur; + } else { + if 
(set_net_param(kvm, &p, cmd, cur) < 0) + goto done; + } + on_cmd = !on_cmd; + + cur = strtok(NULL, ",="); + }; + + kvm->cfg.num_net_devices++; + + kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params)); + if (kvm->cfg.net_params == NULL) + die("Failed adding new network device"); + + kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p; + +done: + free(buf); + return 0; +} + +static int virtio_net__init_one(struct virtio_net_params *params) +{ + int i; + struct net_dev *ndev; + + ndev = calloc(1, sizeof(struct net_dev)); + if (ndev == NULL) + return -ENOMEM; + + list_add_tail(&ndev->list, &ndevs); + + ndev->kvm = params->kvm; + + mutex_init(&ndev->mutex); + ndev->config.status = VIRTIO_NET_S_LINK_UP; + + for (i = 0 ; i < 6 ; i++) { + ndev->config.mac[i] = params->guest_mac[i]; + ndev->info.guest_mac.addr[i] = params->guest_mac[i]; + ndev->info.host_mac.addr[i] = params->host_mac[i]; + } + + ndev->mode = params->mode; + if (ndev->mode == NET_MODE_TAP) { + if (!virtio_net__tap_init(params, ndev)) + die_perror("You have requested a TAP device, but creation of one has failed because"); + ndev->ops = &tap_ops; + } else { + ndev->info.host_ip = ntohl(inet_addr(params->host_ip)); + ndev->info.guest_ip = ntohl(inet_addr(params->guest_ip)); + ndev->info.guest_netmask = ntohl(inet_addr("255.255.255.0")); + ndev->info.buf_nr = 20, + uip_init(&ndev->info); + ndev->ops = &uip_ops; + } + + if (params->trans && strcmp(params->trans, "mmio") == 0) + virtio_init(params->kvm, ndev, &ndev->vdev, &net_dev_virtio_ops, + VIRTIO_MMIO, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET); + else + virtio_init(params->kvm, ndev, &ndev->vdev, &net_dev_virtio_ops, + VIRTIO_PCI, PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET); + + if (params->vhost) + virtio_net__vhost_init(params->kvm, ndev); + else + virtio_net__io_thread_init(params->kvm, ndev); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-net", 
"CONFIG_VIRTIO_NET"); + + return 0; +} + +int virtio_net__init(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->cfg.num_net_devices; i++) { + kvm->cfg.net_params[i].kvm = kvm; + virtio_net__init_one(&kvm->cfg.net_params[i]); + } + + if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) { + struct virtio_net_params net_params; + + net_params = (struct virtio_net_params) { + .guest_ip = kvm->cfg.guest_ip, + .host_ip = kvm->cfg.host_ip, + .kvm = kvm, + .script = kvm->cfg.script, + .mode = NET_MODE_USER, + }; + str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac); + str_to_mac(kvm->cfg.host_mac, net_params.host_mac); + + virtio_net__init_one(&net_params); + } + + return 0; +} +virtio_dev_init(virtio_net__init); + +int virtio_net__exit(struct kvm *kvm) +{ + return 0; +} +virtio_dev_exit(virtio_net__exit); diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c new file mode 100644 index 000000000000..227d5674f3a6 --- /dev/null +++ b/tools/kvm/virtio/pci.c @@ -0,0 +1,410 @@ +#include "kvm/virtio-pci.h" + +#include "kvm/ioport.h" +#include "kvm/kvm.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/irq.h" +#include "kvm/virtio.h" +#include "kvm/ioeventfd.h" + +#include <sys/ioctl.h> +#include <linux/virtio_pci.h> +#include <linux/byteorder.h> +#include <string.h> + +static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param) +{ + struct virtio_pci_ioevent_param *ioeventfd = param; + struct virtio_pci *vpci = ioeventfd->vdev->virtio; + + ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq); +} + +static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct ioevent ioevent; + struct virtio_pci *vpci = vdev->virtio; + int r; + + vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) { + .vdev = vdev, + .vq = vq, + }; + + ioevent = (struct ioevent) { + .io_addr = vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY, + .io_len = sizeof(u16), + .fn = virtio_pci__ioevent_callback, + .fn_ptr = 
&vpci->ioeventfds[vq], + .datamatch = vq, + .fn_kvm = kvm, + .fd = eventfd(0, 0), + }; + + if (vdev->use_vhost) + /* + * Vhost will poll the eventfd in host kernel side, + * no need to poll in userspace. + */ + r = ioeventfd__add_event(&ioevent, true, false); + else + /* Need to poll in userspace. */ + r = ioeventfd__add_event(&ioevent, true, true); + if (r) + return r; + + if (vdev->ops->notify_vq_eventfd) + vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq, ioevent.fd); + + return 0; +} + +static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci) +{ + return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE); +} + +static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_device *vdev, u16 port, + void *data, int size, int offset) +{ + u32 config_offset; + struct virtio_pci *vpci = vdev->virtio; + int type = virtio__get_dev_specific_field(offset - 20, + virtio_pci__msix_enabled(vpci), + &config_offset); + if (type == VIRTIO_PCI_O_MSIX) { + switch (offset) { + case VIRTIO_MSI_CONFIG_VECTOR: + ioport__write16(data, vpci->config_vector); + break; + case VIRTIO_MSI_QUEUE_VECTOR: + ioport__write16(data, vpci->vq_vector[vpci->queue_selector]); + break; + }; + + return true; + } else if (type == VIRTIO_PCI_O_CONFIG) { + u8 cfg; + + cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset]; + ioport__write8(data, cfg); + return true; + } + + return false; +} + +static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + unsigned long offset; + bool ret = true; + struct virtio_device *vdev; + struct virtio_pci *vpci; + u32 val; + + vdev = ioport->priv; + vpci = vdev->virtio; + offset = port - vpci->base_addr; + + switch (offset) { + case VIRTIO_PCI_HOST_FEATURES: + val = vdev->ops->get_host_features(kvm, vpci->dev); + ioport__write32(data, val); + break; + case VIRTIO_PCI_QUEUE_PFN: + val = vdev->ops->get_pfn_vq(kvm, vpci->dev, vpci->queue_selector); + ioport__write32(data, val); + break; + 
case VIRTIO_PCI_QUEUE_NUM: + val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector); + ioport__write16(data, val); + break; + case VIRTIO_PCI_STATUS: + ioport__write8(data, vpci->status); + break; + case VIRTIO_PCI_ISR: + ioport__write8(data, vpci->isr); + kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW); + vpci->isr = VIRTIO_IRQ_LOW; + break; + default: + ret = virtio_pci__specific_io_in(kvm, vdev, port, data, size, offset); + break; + }; + + return ret; +} + +static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_device *vdev, u16 port, + void *data, int size, int offset) +{ + struct virtio_pci *vpci = vdev->virtio; + u32 config_offset, gsi, vec; + int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci), + &config_offset); + if (type == VIRTIO_PCI_O_MSIX) { + switch (offset) { + case VIRTIO_MSI_CONFIG_VECTOR: + vec = vpci->config_vector = ioport__read16(data); + if (vec == VIRTIO_MSI_NO_VECTOR) + break; + + gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg); + + vpci->config_gsi = gsi; + break; + case VIRTIO_MSI_QUEUE_VECTOR: + vec = vpci->vq_vector[vpci->queue_selector] = ioport__read16(data); + + if (vec == VIRTIO_MSI_NO_VECTOR) + break; + + gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg); + vpci->gsis[vpci->queue_selector] = gsi; + if (vdev->ops->notify_vq_gsi) + vdev->ops->notify_vq_gsi(kvm, vpci->dev, + vpci->queue_selector, gsi); + break; + }; + + return true; + } else if (type == VIRTIO_PCI_O_CONFIG) { + vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data; + + return true; + } + + return false; +} + +static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + unsigned long offset; + bool ret = true; + struct virtio_device *vdev; + struct virtio_pci *vpci; + u32 val; + + vdev = ioport->priv; + vpci = vdev->virtio; + offset = port - vpci->base_addr; + + switch (offset) { + case VIRTIO_PCI_GUEST_FEATURES: + val 
= ioport__read32(data); + vdev->ops->set_guest_features(kvm, vpci->dev, val); + break; + case VIRTIO_PCI_QUEUE_PFN: + val = ioport__read32(data); + virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector); + vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector, + 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT, + VIRTIO_PCI_VRING_ALIGN, val); + break; + case VIRTIO_PCI_QUEUE_SEL: + vpci->queue_selector = ioport__read16(data); + break; + case VIRTIO_PCI_QUEUE_NOTIFY: + val = ioport__read16(data); + vdev->ops->notify_vq(kvm, vpci->dev, val); + break; + case VIRTIO_PCI_STATUS: + vpci->status = ioport__read8(data); + break; + default: + ret = virtio_pci__specific_io_out(kvm, vdev, port, data, size, offset); + break; + }; + + return ret; +} + +static struct ioport_operations virtio_pci__io_ops = { + .io_in = virtio_pci__io_in, + .io_out = virtio_pci__io_out, +}; + +static void virtio_pci__mmio_callback(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) +{ + struct virtio_pci *vpci = ptr; + void *table; + u32 offset; + + if (addr > vpci->msix_io_block + PCI_IO_SIZE) { + table = &vpci->msix_pba; + offset = vpci->msix_io_block + PCI_IO_SIZE; + } else { + table = &vpci->msix_table; + offset = vpci->msix_io_block; + } + + if (is_write) + memcpy(table + addr - offset, data, len); + else + memcpy(data, table + addr - offset, len); +} + +static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci, int vec) +{ + struct kvm_msi msi = { + .address_lo = vpci->msix_table[vec].msg.address_lo, + .address_hi = vpci->msix_table[vec].msg.address_hi, + .data = vpci->msix_table[vec].msg.data, + }; + + ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, &msi); +} + +int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq) +{ + struct virtio_pci *vpci = vdev->virtio; + int tbl = vpci->vq_vector[vq]; + + if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) { + if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) || + vpci->msix_table[tbl].ctrl & 
cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) { + + vpci->msix_pba |= 1 << tbl; + return 0; + } + + if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI) + virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]); + else + kvm__irq_trigger(kvm, vpci->gsis[vq]); + } else { + vpci->isr = VIRTIO_IRQ_HIGH; + kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line); + } + return 0; +} + +int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_pci *vpci = vdev->virtio; + int tbl = vpci->config_vector; + + if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) { + if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) || + vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) { + + vpci->msix_pba |= 1 << tbl; + return 0; + } + + if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI) + virtio_pci__signal_msi(kvm, vpci, tbl); + else + kvm__irq_trigger(kvm, vpci->config_gsi); + } else { + vpci->isr = VIRTIO_PCI_ISR_CONFIG; + kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line); + } + + return 0; +} + +int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev, + int device_id, int subsys_id, int class) +{ + struct virtio_pci *vpci = vdev->virtio; + u8 pin, line; + int r; + + vpci->dev = dev; + vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2); + + r = ioport__register(kvm, IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vdev); + if (r < 0) + return r; + + vpci->base_addr = (u16)r; + r = kvm__register_mmio(kvm, vpci->msix_io_block, PCI_IO_SIZE, false, + virtio_pci__mmio_callback, vpci); + if (r < 0) + goto free_ioport; + + vpci->pci_hdr = (struct pci_device_header) { + .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET), + .device_id = cpu_to_le16(device_id), + .header_type = PCI_HEADER_TYPE_NORMAL, + .revision_id = 0, + .class[0] = class & 0xff, + .class[1] = (class >> 8) & 0xff, + .class[2] = (class >> 16) & 0xff, + .subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET), + .subsys_id = 
cpu_to_le16(subsys_id), + .bar[0] = cpu_to_le32(vpci->base_addr + | PCI_BASE_ADDRESS_SPACE_IO), + .bar[1] = cpu_to_le32(vpci->msix_io_block + | PCI_BASE_ADDRESS_SPACE_MEMORY), + .status = cpu_to_le16(PCI_STATUS_CAP_LIST), + .capabilities = (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr, + .bar_size[0] = IOPORT_SIZE, + .bar_size[1] = PCI_IO_SIZE, + .bar_size[3] = PCI_IO_SIZE, + }; + + vpci->dev_hdr = (struct device_header) { + .bus_type = DEVICE_BUS_PCI, + .data = &vpci->pci_hdr, + }; + + vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX; + vpci->pci_hdr.msix.next = 0; + /* + * We at most have VIRTIO_PCI_MAX_VQ entries for virt queue, + * VIRTIO_PCI_MAX_CONFIG entries for config. + * + * To quote the PCI spec: + * + * System software reads this field to determine the + * MSI-X Table Size N, which is encoded as N-1. + * For example, a returned value of "00000000011" + * indicates a table size of 4. + */ + vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1); + + /* + * Both table and PBA could be mapped on the same BAR, but for now + * we're not in short of BARs + */ + vpci->pci_hdr.msix.table_offset = cpu_to_le32(1); /* Use BAR 1 */ + vpci->pci_hdr.msix.pba_offset = cpu_to_le32(1 | PCI_IO_SIZE); /* Use BAR 3 */ + vpci->config_vector = 0; + + r = irq__register_device(subsys_id, &pin, &line); + if (r < 0) + goto free_mmio; + + if (kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI)) + vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI; + + vpci->pci_hdr.irq_pin = pin; + vpci->pci_hdr.irq_line = line; + r = device__register(&vpci->dev_hdr); + if (r < 0) + goto free_ioport; + + return 0; + +free_mmio: + kvm__deregister_mmio(kvm, vpci->msix_io_block); +free_ioport: + ioport__unregister(kvm, vpci->base_addr); + return r; +} + +int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev) +{ + struct virtio_pci *vpci = vdev->virtio; + int i; + + kvm__deregister_mmio(kvm, vpci->msix_io_block); + ioport__unregister(kvm, vpci->base_addr); + + for (i = 0; 
i < VIRTIO_PCI_MAX_VQ; i++) + ioeventfd__del_event(vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY, i); + + return 0; +} diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c new file mode 100644 index 000000000000..2ce8afdc89bf --- /dev/null +++ b/tools/kvm/virtio/rng.c @@ -0,0 +1,204 @@ +#include "kvm/virtio-rng.h" + +#include "kvm/virtio-pci-dev.h" + +#include "kvm/virtio.h" +#include "kvm/util.h" +#include "kvm/kvm.h" +#include "kvm/threadpool.h" +#include "kvm/guest_compat.h" + +#include <linux/virtio_ring.h> +#include <linux/virtio_rng.h> + +#include <linux/list.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <pthread.h> +#include <linux/kernel.h> + +#define NUM_VIRT_QUEUES 1 +#define VIRTIO_RNG_QUEUE_SIZE 128 + +struct rng_dev_job { + struct virt_queue *vq; + struct rng_dev *rdev; + struct thread_pool__job job_id; +}; + +struct rng_dev { + struct list_head list; + struct virtio_device vdev; + + int fd; + + /* virtio queue */ + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct rng_dev_job jobs[NUM_VIRT_QUEUES]; +}; + +static LIST_HEAD(rdevs); +static int compat_id = -1; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + /* Unused */ + return 0; +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + /* Unused */ + return 0; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + /* Unused */ +} + +static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue) +{ + struct iovec iov[VIRTIO_RNG_QUEUE_SIZE]; + ssize_t len = 0; + u16 out, in, head; + + head = virt_queue__get_iov(queue, iov, &out, &in, kvm); + len = readv(rdev->fd, iov, in); + if (len < 0 && errno == EAGAIN) + len = 0; + + virt_queue__set_used_elem(queue, head, len); + + return true; +} + +static void virtio_rng_do_io(struct kvm *kvm, void *param) +{ + struct rng_dev_job *job = param; + struct virt_queue *vq = job->vq; + struct rng_dev *rdev = job->rdev; + + while 
(virt_queue__available(vq)) + virtio_rng_do_io_request(kvm, rdev, vq); + + rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs); +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct rng_dev *rdev = dev; + struct virt_queue *queue; + struct rng_dev_job *job; + void *p; + + compat__remove_message(compat_id); + + queue = &rdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + job = &rdev->jobs[vq]; + + vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, align); + + *job = (struct rng_dev_job) { + .vq = queue, + .rdev = rdev, + }; + + thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job); + + return 0; +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct rng_dev *rdev = dev; + + thread_pool__do_job(&rdev->jobs[vq].job_id); + + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct rng_dev *rdev = dev; + + return rdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_RNG_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + /* FIXME: dynamic */ + return size; +} + +static struct virtio_ops rng_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .notify_vq = notify_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, +}; + +int virtio_rng__init(struct kvm *kvm) +{ + struct rng_dev *rdev; + int r; + + if (!kvm->cfg.virtio_rng) + return 0; + + rdev = malloc(sizeof(*rdev)); + if (rdev == NULL) + return -ENOMEM; + + rdev->fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + if (rdev->fd < 0) { + r = rdev->fd; + goto cleanup; + } + + r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_RNG, + VIRTIO_ID_RNG, 
PCI_CLASS_RNG); + if (r < 0) + goto cleanup; + + list_add_tail(&rdev->list, &rdevs); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO"); + return 0; +cleanup: + close(rdev->fd); + free(rdev); + + return r; +} +virtio_dev_init(virtio_rng__init); + +int virtio_rng__exit(struct kvm *kvm) +{ + struct rng_dev *rdev, *tmp; + + list_for_each_entry_safe(rdev, tmp, &rdevs, list) { + list_del(&rdev->list); + rdev->vdev.ops->exit(kvm, &rdev->vdev); + free(rdev); + } + + return 0; +} +virtio_dev_exit(virtio_rng__exit); diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c new file mode 100644 index 000000000000..05b2dc60844d --- /dev/null +++ b/tools/kvm/virtio/scsi.c @@ -0,0 +1,311 @@ +#include "kvm/virtio-scsi.h" +#include "kvm/virtio-pci-dev.h" +#include "kvm/disk-image.h" +#include "kvm/kvm.h" +#include "kvm/pci.h" +#include "kvm/ioeventfd.h" +#include "kvm/guest_compat.h" +#include "kvm/virtio-pci.h" +#include "kvm/virtio.h" + +#include <linux/kernel.h> +#include <linux/virtio_scsi.h> +#include <linux/vhost.h> + +#define VIRTIO_SCSI_QUEUE_SIZE 128 +#define NUM_VIRT_QUEUES 3 + +static LIST_HEAD(sdevs); +static int compat_id = -1; + +struct scsi_dev { + struct virt_queue vqs[NUM_VIRT_QUEUES]; + struct virtio_scsi_config config; + struct vhost_scsi_target target; + u32 features; + int vhost_fd; + struct virtio_device vdev; + struct list_head list; + struct kvm *kvm; +}; + +static u8 *get_config(struct kvm *kvm, void *dev) +{ + struct scsi_dev *sdev = dev; + + return ((u8 *)(&sdev->config)); +} + +static u32 get_host_features(struct kvm *kvm, void *dev) +{ + return 1UL << VIRTIO_RING_F_EVENT_IDX | + 1UL << VIRTIO_RING_F_INDIRECT_DESC; +} + +static void set_guest_features(struct kvm *kvm, void *dev, u32 features) +{ + struct scsi_dev *sdev = dev; + + sdev->features = features; +} + +static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align, + u32 pfn) +{ + struct vhost_vring_state state = { 
.index = vq }; + struct vhost_vring_addr addr; + struct scsi_dev *sdev = dev; + struct virt_queue *queue; + void *p; + int r; + + compat__remove_message(compat_id); + + queue = &sdev->vqs[vq]; + queue->pfn = pfn; + p = guest_flat_to_host(kvm, queue->pfn * page_size); + + vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, align); + + if (sdev->vhost_fd == 0) + return 0; + + state.num = queue->vring.num; + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_NUM failed"); + state.num = 0; + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state); + if (r < 0) + die_perror("VHOST_SET_VRING_BASE failed"); + + addr = (struct vhost_vring_addr) { + .index = vq, + .desc_user_addr = (u64)(unsigned long)queue->vring.desc, + .avail_user_addr = (u64)(unsigned long)queue->vring.avail, + .used_user_addr = (u64)(unsigned long)queue->vring.used, + }; + + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (r < 0) + die_perror("VHOST_SET_VRING_ADDR failed"); + + return 0; +} + +static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi) +{ + struct vhost_vring_file file; + struct scsi_dev *sdev = dev; + struct kvm_irqfd irq; + int r; + + if (sdev->vhost_fd == 0) + return; + + irq = (struct kvm_irqfd) { + .gsi = gsi, + .fd = eventfd(0, 0), + }; + file = (struct vhost_vring_file) { + .index = vq, + .fd = irq.fd, + }; + + r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq); + if (r < 0) + die_perror("KVM_IRQFD failed"); + + r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_CALL failed"); + + if (vq > 0) + return; + + r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target); + if (r != 0) + die("VHOST_SCSI_SET_ENDPOINT failed %d", errno); +} + +static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd) +{ + struct scsi_dev *sdev = dev; + struct vhost_vring_file file = { + .index = vq, + .fd = efd, + }; + int r; + + if (sdev->vhost_fd == 0) + return; + + r = 
ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file); + if (r < 0) + die_perror("VHOST_SET_VRING_KICK failed"); +} + +static int notify_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return 0; +} + +static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq) +{ + struct scsi_dev *sdev = dev; + + return sdev->vqs[vq].pfn; +} + +static int get_size_vq(struct kvm *kvm, void *dev, u32 vq) +{ + return VIRTIO_SCSI_QUEUE_SIZE; +} + +static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size) +{ + return size; +} + +static struct virtio_ops scsi_dev_virtio_ops = (struct virtio_ops) { + .get_config = get_config, + .get_host_features = get_host_features, + .set_guest_features = set_guest_features, + .init_vq = init_vq, + .get_pfn_vq = get_pfn_vq, + .get_size_vq = get_size_vq, + .set_size_vq = set_size_vq, + .notify_vq = notify_vq, + .notify_vq_gsi = notify_vq_gsi, + .notify_vq_eventfd = notify_vq_eventfd, +}; + +static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev) +{ + struct vhost_memory *mem; + u64 features; + int r; + + sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR); + if (sdev->vhost_fd < 0) + die_perror("Failed openning vhost-scsi device"); + + mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region)); + if (mem == NULL) + die("Failed allocating memory for vhost memory map"); + + mem->nregions = 1; + mem->regions[0] = (struct vhost_memory_region) { + .guest_phys_addr = 0, + .memory_size = kvm->ram_size, + .userspace_addr = (unsigned long)kvm->ram_start, + }; + + r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER); + if (r != 0) + die_perror("VHOST_SET_OWNER failed"); + + r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_GET_FEATURES failed"); + + r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features); + if (r != 0) + die_perror("VHOST_SET_FEATURES failed"); + r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem); + if (r != 0) + die_perror("VHOST_SET_MEM_TABLE failed"); + + sdev->vdev.use_vhost = true; 
+ + free(mem); +} + + +static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk) +{ + struct scsi_dev *sdev; + + if (!disk) + return -EINVAL; + + sdev = calloc(1, sizeof(struct scsi_dev)); + if (sdev == NULL) + return -ENOMEM; + + *sdev = (struct scsi_dev) { + .config = (struct virtio_scsi_config) { + .num_queues = NUM_VIRT_QUEUES - 2, + .seg_max = VIRTIO_SCSI_CDB_SIZE - 2, + .max_sectors = 65535, + .cmd_per_lun = 128, + .sense_size = VIRTIO_SCSI_SENSE_SIZE, + .cdb_size = VIRTIO_SCSI_CDB_SIZE, + .max_channel = 0, + .max_target = 0, + .max_lun = 16383, + .event_info_size = sizeof(struct virtio_scsi_event), + }, + .kvm = kvm, + }; + strncpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn)); + sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0); + + virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops, + VIRTIO_DEFAULT_TRANS, PCI_DEVICE_ID_VIRTIO_SCSI, + VIRTIO_ID_SCSI, PCI_CLASS_BLK); + + list_add_tail(&sdev->list, &sdevs); + + virtio_scsi_vhost_init(kvm, sdev); + + if (compat_id == -1) + compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI"); + + return 0; +} + +static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev) +{ + int r; + + r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target); + if (r != 0) + die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno); + + list_del(&sdev->list); + free(sdev); + + return 0; +} + +int virtio_scsi_init(struct kvm *kvm) +{ + int i, r = 0; + + for (i = 0; i < kvm->nr_disks; i++) { + if (!kvm->disks[i]->wwpn) + continue; + r = virtio_scsi_init_one(kvm, kvm->disks[i]); + if (r < 0) + goto cleanup; + } + + return 0; +cleanup: + return virtio_scsi_exit(kvm); +} +virtio_dev_init(virtio_scsi_init); + +int virtio_scsi_exit(struct kvm *kvm) +{ + while (!list_empty(&sdevs)) { + struct scsi_dev *sdev; + + sdev = list_first_entry(&sdevs, struct scsi_dev, list); + virtio_scsi_exit_one(kvm, sdev); + } + + return 0; +} 
+virtio_dev_exit(virtio_scsi_exit); diff --git a/tools/kvm/x86/bios.c b/tools/kvm/x86/bios.c new file mode 100644 index 000000000000..f05cc021f02c --- /dev/null +++ b/tools/kvm/x86/bios.c @@ -0,0 +1,174 @@ +#include "kvm/kvm.h" +#include "kvm/boot-protocol.h" +#include "kvm/e820.h" +#include "kvm/interrupt.h" +#include "kvm/util.h" + +#include <string.h> +#include <asm/e820.h> + +#include "bios/bios-rom.h" + +struct irq_handler { + unsigned long address; + unsigned int irq; + void *handler; + size_t size; +}; + +#define BIOS_IRQ_PA_ADDR(name) (MB_BIOS_BEGIN + BIOS_OFFSET__##name) +#define BIOS_IRQ_FUNC(name) ((char *)&bios_rom[BIOS_OFFSET__##name]) +#define BIOS_IRQ_SIZE(name) (BIOS_ENTRY_SIZE(BIOS_OFFSET__##name)) + +#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler) \ + { \ + .irq = _irq, \ + .address = BIOS_IRQ_PA_ADDR(_handler), \ + .handler = BIOS_IRQ_FUNC(_handler), \ + .size = BIOS_IRQ_SIZE(_handler), \ + } + +static struct irq_handler bios_irq_handlers[] = { + DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10), + DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15), +}; + +static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler) +{ + struct real_intr_desc intr_desc; + void *p; + + p = guest_flat_to_host(kvm, handler->address); + memcpy(p, handler->handler, handler->size); + + intr_desc = (struct real_intr_desc) { + .segment = REAL_SEGMENT(MB_BIOS_BEGIN), + .offset = handler->address - MB_BIOS_BEGIN, + }; + + DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL); + + interrupt_table__set(&kvm->arch.interrupt_table, &intr_desc, handler->irq); +} + +/** + * e820_setup - setup some simple E820 memory map + * @kvm - guest system descriptor + */ +static void e820_setup(struct kvm *kvm) +{ + struct e820map *e820; + struct e820entry *mem_map; + unsigned int i = 0; + + e820 = guest_flat_to_host(kvm, E820_MAP_START); + mem_map = e820->map; + + mem_map[i++] = (struct e820entry) { + .addr = REAL_MODE_IVT_BEGIN, + .size = EBDA_START - REAL_MODE_IVT_BEGIN, + .type = 
E820_RAM, + }; + mem_map[i++] = (struct e820entry) { + .addr = EBDA_START, + .size = VGA_RAM_BEGIN - EBDA_START, + .type = E820_RESERVED, + }; + mem_map[i++] = (struct e820entry) { + .addr = MB_BIOS_BEGIN, + .size = MB_BIOS_END - MB_BIOS_BEGIN, + .type = E820_RESERVED, + }; + if (kvm->ram_size < KVM_32BIT_GAP_START) { + mem_map[i++] = (struct e820entry) { + .addr = BZ_KERNEL_START, + .size = kvm->ram_size - BZ_KERNEL_START, + .type = E820_RAM, + }; + } else { + mem_map[i++] = (struct e820entry) { + .addr = BZ_KERNEL_START, + .size = KVM_32BIT_GAP_START - BZ_KERNEL_START, + .type = E820_RAM, + }; + mem_map[i++] = (struct e820entry) { + .addr = KVM_32BIT_MAX_MEM_SIZE, + .size = kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE, + .type = E820_RAM, + }; + } + + BUG_ON(i > E820_X_MAX); + + e820->nr_map = i; +} + +static void setup_vga_rom(struct kvm *kvm) +{ + u16 *mode; + void *p; + + p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING); + memset(p, 0, VGA_ROM_OEM_STRING_SIZE); + strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE); + + mode = guest_flat_to_host(kvm, VGA_ROM_MODES); + mode[0] = 0x0112; + mode[1] = 0xffff; +} + +/** + * setup_bios - inject BIOS into guest memory + * @kvm - guest system descriptor + */ +void setup_bios(struct kvm *kvm) +{ + unsigned long address = MB_BIOS_BEGIN; + struct real_intr_desc intr_desc; + unsigned int i; + void *p; + + /* + * before anything else -- clean some known areas + * we definitely don't want any trash here + */ + p = guest_flat_to_host(kvm, BDA_START); + memset(p, 0, BDA_END - BDA_START); + + p = guest_flat_to_host(kvm, EBDA_START); + memset(p, 0, EBDA_END - EBDA_START); + + p = guest_flat_to_host(kvm, MB_BIOS_BEGIN); + memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN); + + p = guest_flat_to_host(kvm, VGA_ROM_BEGIN); + memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN); + + /* just copy the bios rom into the place */ + p = guest_flat_to_host(kvm, MB_BIOS_BEGIN); + memcpy(p, bios_rom, bios_rom_size); + + /* E820 memory map must be present */ + 
e820_setup(kvm); + + /* VESA needs own tricks */ + setup_vga_rom(kvm); + + /* + * Setup a *fake* real mode vector table, it has only + * one real handler which does just iret + */ + address = BIOS_IRQ_PA_ADDR(bios_intfake); + intr_desc = (struct real_intr_desc) { + .segment = REAL_SEGMENT(MB_BIOS_BEGIN), + .offset = address - MB_BIOS_BEGIN, + }; + interrupt_table__setup(&kvm->arch.interrupt_table, &intr_desc); + + for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++) + setup_irq_handler(kvm, &bios_irq_handlers[i]); + + /* we almost done */ + p = guest_flat_to_host(kvm, 0); + interrupt_table__copy(&kvm->arch.interrupt_table, p, REAL_INTR_SIZE); +} diff --git a/tools/kvm/x86/bios/.gitignore b/tools/kvm/x86/bios/.gitignore new file mode 100644 index 000000000000..1f0080bcc5f2 --- /dev/null +++ b/tools/kvm/x86/bios/.gitignore @@ -0,0 +1,3 @@ +bios-rom.bin +bios-rom.bin.elf +bios-rom.h diff --git a/tools/kvm/x86/bios/bios-rom.S b/tools/kvm/x86/bios/bios-rom.S new file mode 100644 index 000000000000..3269ce9793ae --- /dev/null +++ b/tools/kvm/x86/bios/bios-rom.S @@ -0,0 +1,12 @@ +#include <kvm/assembly.h> + + .org 0 +#ifdef CONFIG_X86_64 + .code64 +#else + .code32 +#endif + +GLOBAL(bios_rom) + .incbin "x86/bios/bios.bin" +END(bios_rom) diff --git a/tools/kvm/x86/bios/e820.c b/tools/kvm/x86/bios/e820.c new file mode 100644 index 000000000000..a9bca29bff73 --- /dev/null +++ b/tools/kvm/x86/bios/e820.c @@ -0,0 +1,72 @@ +#include "kvm/e820.h" + +#include "kvm/segment.h" +#include "kvm/bios.h" + +#include <asm/processor-flags.h> +#include <asm/e820.h> + +static inline void set_fs(u16 seg) +{ + asm volatile("movw %0,%%fs" : : "rm" (seg)); +} + +static inline u8 rdfs8(unsigned long addr) +{ + u8 v; + + asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); + + return v; +} + +static inline u32 rdfs32(unsigned long addr) +{ + u32 v; + + asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr)); + + return v; +} + +bioscall void e820_query_map(struct 
biosregs *regs) +{ + struct e820map *e820; + u32 map_size; + u16 fs_seg; + u32 ndx; + + e820 = (struct e820map *)E820_MAP_START; + fs_seg = flat_to_seg16(E820_MAP_START); + set_fs(fs_seg); + + ndx = regs->ebx; + + map_size = rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg)); + + if (ndx < map_size) { + u32 start; + unsigned int i; + u8 *p; + + fs_seg = flat_to_seg16(E820_MAP_START); + set_fs(fs_seg); + + start = (u32)&e820->map[ndx]; + + p = (void *) regs->edi; + + for (i = 0; i < sizeof(struct e820entry); i++) + *p++ = rdfs8(flat_to_off16(start + i, fs_seg)); + } + + regs->eax = SMAP; + regs->ecx = sizeof(struct e820entry); + regs->ebx = ++ndx; + + /* Clear CF to indicate success. */ + regs->eflags &= ~X86_EFLAGS_CF; + + if (ndx >= map_size) + regs->ebx = 0; /* end of map */ +} diff --git a/tools/kvm/x86/bios/entry.S b/tools/kvm/x86/bios/entry.S new file mode 100644 index 000000000000..85056e9816c4 --- /dev/null +++ b/tools/kvm/x86/bios/entry.S @@ -0,0 +1,92 @@ +/* + * Our pretty trivial BIOS emulation + */ + +#include <kvm/bios.h> +#include <kvm/assembly.h> + + .org 0 + .code16gcc + +#define EFLAGS_CF (1 << 0) + +#include "macro.S" + +/* If you change these macros, remember to update 'struct biosregs' */ +.macro SAVE_BIOSREGS + pushl %fs + pushl %es + pushl %ds + pushl %edi + pushl %esi + pushl %ebp + pushl %esp + pushl %edx + pushl %ecx + pushl %ebx + pushl %eax +.endm + +.macro RESTORE_BIOSREGS + popl %eax + popl %ebx + popl %ecx + popl %edx + popl %esp + popl %ebp + popl %esi + popl %edi + popl %ds + popl %es + popl %fs +.endm + +/* + * fake interrupt handler, nothing can be faster ever + */ +ENTRY(bios_intfake) + /* + * Set CF to indicate failure. We don't want callers to think that the + * interrupt handler succeeded and then treat the return values in + * registers as valid data. 
+ */ + orl $EFLAGS_CF, 0x4(%esp) + + IRET +ENTRY_END(bios_intfake) + +/* + * int 10 - video - service + */ +ENTRY(bios_int10) + SAVE_BIOSREGS + + movl %esp, %eax + /* this is way easier than doing it in assembly */ + /* just push all the regs and jump to a C handler */ + call int10_handler + + RESTORE_BIOSREGS + + /* Clear CF to indicate success. */ + andl $~EFLAGS_CF, 0x4(%esp) + + IRET +ENTRY_END(bios_int10) + +ENTRY(bios_int15) + SAVE_BIOSREGS + + movl %esp, %eax + call int15_handler + + RESTORE_BIOSREGS + + IRET +ENTRY_END(bios_int15) + +GLOBAL(__locals) + +#include "local.S" + +END(__locals) diff --git a/tools/kvm/x86/bios/gen-offsets.sh b/tools/kvm/x86/bios/gen-offsets.sh new file mode 100644 index 000000000000..8771bbe0b1ea --- /dev/null +++ b/tools/kvm/x86/bios/gen-offsets.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +echo "/* Autogenerated file, don't edit */" +echo "#ifndef BIOS_OFFSETS_H" +echo "#define BIOS_OFFSETS_H" + +echo "" +echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)" +echo "" + +nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }' + +echo "" +echo "#endif" diff --git a/tools/kvm/x86/bios/int10.c b/tools/kvm/x86/bios/int10.c new file mode 100644 index 000000000000..7cc0b3f2162e --- /dev/null +++ b/tools/kvm/x86/bios/int10.c @@ -0,0 +1,110 @@ +#include "kvm/segment.h" +#include "kvm/bios.h" +#include "kvm/vesa.h" + +#include "bios/memcpy.h" + +#include <boot/vesa.h> + +static far_ptr gen_far_ptr(unsigned int pa) +{ + far_ptr ptr; + + ptr.seg = (pa >> 4); + ptr.off = pa - (ptr.seg << 4); + + return ptr; +} + +static inline void outb(unsigned short port, unsigned char val) +{ + asm volatile("outb %0, %1" : : "a"(val), "Nd"(port)); +} + +/* + * It's probably much more useful to make this print to the serial + * line rather than print to a non-displayed VGA memory + */ +static inline void int10_putchar(struct biosregs *args) +{ + u8 al = args->eax & 0xFF; + + outb(0x3f8, al); +} + +static void vbe_get_mode(struct 
biosregs *args) +{ + struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi; + + *info = (struct vesa_mode_info) { + .mode_attr = 0xd9, /* 11011011 */ + .logical_scan = VESA_WIDTH*4, + .h_res = VESA_WIDTH, + .v_res = VESA_HEIGHT, + .bpp = VESA_BPP, + .memory_layout = 6, + .memory_planes = 1, + .lfb_ptr = VESA_MEM_ADDR, + .rmask = 8, + .gmask = 8, + .bmask = 8, + .resv_mask = 8, + .resv_pos = 24, + .bpos = 16, + .gpos = 8, + }; +} + +static void vbe_get_info(struct biosregs *args) +{ + struct vesa_general_info *infop = (struct vesa_general_info *) args->edi; + struct vesa_general_info info; + + info = (struct vesa_general_info) { + .signature = VESA_MAGIC, + .version = 0x102, + .vendor_string = gen_far_ptr(VGA_ROM_BEGIN), + .capabilities = 0x10, + .video_mode_ptr = gen_far_ptr(VGA_ROM_MODES), + .total_memory = (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000, + }; + + memcpy16(args->es, infop, args->ds, &info, sizeof(info)); +} + +#define VBE_STATUS_OK 0x004F + +static void int10_vesa(struct biosregs *args) +{ + u8 al; + + al = args->eax & 0xff; + + switch (al) { + case 0x00: + vbe_get_info(args); + break; + case 0x01: + vbe_get_mode(args); + break; + } + + args->eax = VBE_STATUS_OK; +} + +bioscall void int10_handler(struct biosregs *args) +{ + u8 ah; + + ah = (args->eax & 0xff00) >> 8; + + switch (ah) { + case 0x0e: + int10_putchar(args); + break; + case 0x4f: + int10_vesa(args); + break; + } + +} diff --git a/tools/kvm/x86/bios/int15.c b/tools/kvm/x86/bios/int15.c new file mode 100644 index 000000000000..faf5343ea509 --- /dev/null +++ b/tools/kvm/x86/bios/int15.c @@ -0,0 +1,18 @@ +#include "kvm/bios.h" + +#include "kvm/e820.h" + +#include <asm/processor-flags.h> + +bioscall void int15_handler(struct biosregs *regs) +{ + switch (regs->eax) { + case 0xe820: + e820_query_map(regs); + break; + default: + /* Set CF to indicate failure. 
*/ + regs->eflags |= X86_EFLAGS_CF; + break; + } +} diff --git a/tools/kvm/x86/bios/local.S b/tools/kvm/x86/bios/local.S new file mode 100644 index 000000000000..f2cdbf4c3e1c --- /dev/null +++ b/tools/kvm/x86/bios/local.S @@ -0,0 +1,7 @@ +/* + * Local variables for almost every BIOS irq handler + * Must be put somewhere inside irq handler body + */ +__CALLER_SS: .int 0 +__CALLER_SP: .long 0 +__CALLER_CLOBBER: .long 0 diff --git a/tools/kvm/x86/bios/macro.S b/tools/kvm/x86/bios/macro.S new file mode 100644 index 000000000000..0d5e567e7cf1 --- /dev/null +++ b/tools/kvm/x86/bios/macro.S @@ -0,0 +1,25 @@ +/* + * handy BIOS macros + */ + +/* + * switch to BIOS stack + */ +.macro stack_swap + movw %ss, %cs:(__CALLER_SS) + movl %esp, %cs:(__CALLER_SP) + movl %edx, %cs:(__CALLER_CLOBBER) + movw $MB_BIOS_SS, %dx + movw %dx, %ss + movw $MB_BIOS_SP, %sp + movl %cs:(__CALLER_CLOBBER), %edx +.endm + +/* + * restore the original stack + */ +.macro stack_restore + movl %cs:(__CALLER_SP), %esp + movw %cs:(__CALLER_SS), %ss +.endm + diff --git a/tools/kvm/x86/bios/memcpy.c b/tools/kvm/x86/bios/memcpy.c new file mode 100644 index 000000000000..40b9b65fa9e4 --- /dev/null +++ b/tools/kvm/x86/bios/memcpy.c @@ -0,0 +1,23 @@ +#include "bios/memcpy.h" + +/* + * Copy memory area in 16-bit real mode. 
+ */ +void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len) +{ + __asm__ __volatile__ ( + "pushw %%ds \n" + "pushw %%es \n" + "movw %[src_seg], %%ds \n" + "movw %[dst_seg], %%es \n" + "rep movsb %%ds:(%%si), %%es:(%%di) \n" + "popw %%es \n" + "popw %%ds \n" + : + : "S"(src), + "D"(dst), + "c"(len), + [src_seg] "r"(src_seg), + [dst_seg] "r"(dst_seg) + : "cc", "memory"); +} diff --git a/tools/kvm/x86/bios/rom.ld.S b/tools/kvm/x86/bios/rom.ld.S new file mode 100644 index 000000000000..f4f183579327 --- /dev/null +++ b/tools/kvm/x86/bios/rom.ld.S @@ -0,0 +1,16 @@ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) + +SECTIONS { + .text 0 : { + *(.text) + } + + /DISCARD/ : { + *(.debug*) + *(.data) + *(.bss) + *(.eh_frame*) + } +} + diff --git a/tools/kvm/x86/boot.c b/tools/kvm/x86/boot.c new file mode 100644 index 000000000000..61535eb57bce --- /dev/null +++ b/tools/kvm/x86/boot.c @@ -0,0 +1,41 @@ +#include "kvm/kvm.h" + +#include "kvm/util.h" + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <fcntl.h> + +#define BIOS_SELECTOR 0xf000 +#define BIOS_IP 0xfff0 +#define BIOS_SP 0x8000 + +bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename) +{ + struct stat st; + void *p; + int fd; + int nr; + + fd = open(firmware_filename, O_RDONLY); + if (fd < 0) + return false; + + if (fstat(fd, &st)) + return false; + + if (st.st_size > MB_FIRMWARE_BIOS_SIZE) + die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024)); + + p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN); + + while ((nr = read(fd, p, st.st_size)) > 0) + p += nr; + + kvm->arch.boot_selector = BIOS_SELECTOR; + kvm->arch.boot_ip = BIOS_IP; + kvm->arch.boot_sp = BIOS_SP; + + return true; +} diff --git a/tools/kvm/x86/cpuid.c b/tools/kvm/x86/cpuid.c new file mode 100644 index 000000000000..4c140f0c57e6 --- /dev/null +++ b/tools/kvm/x86/cpuid.c @@ -0,0 +1,60 @@ +#include 
"kvm/kvm-cpu.h" + +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <sys/ioctl.h> +#include <stdlib.h> + +#define CPUID_FUNC_PERFMON 0x0A + +#define MAX_KVM_CPUID_ENTRIES 100 + +static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid) +{ + unsigned int i; + + /* + * Filter CPUID functions that are not supported by the hypervisor. + */ + for (i = 0; i < kvm_cpuid->nent; i++) { + struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i]; + + switch (entry->function) { + case 1: + /* Set X86_FEATURE_HYPERVISOR */ + if (entry->index == 0) + entry->ecx |= (1 << 31); + break; + case 6: + /* Clear X86_FEATURE_EPB */ + entry->ecx = entry->ecx & ~(1 << 3); + break; + case CPUID_FUNC_PERFMON: + entry->eax = 0x00; /* disable it */ + break; + default: + /* Keep the CPUID function as -is */ + break; + }; + } +} + +void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu) +{ + struct kvm_cpuid2 *kvm_cpuid; + + kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + + MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries)); + + kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES; + if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0) + die_perror("KVM_GET_SUPPORTED_CPUID failed"); + + filter_cpuid(kvm_cpuid); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0) + die_perror("KVM_SET_CPUID2 failed"); + + free(kvm_cpuid); +} diff --git a/tools/kvm/x86/include/kvm/assembly.h b/tools/kvm/x86/include/kvm/assembly.h new file mode 100644 index 000000000000..e70baab39cce --- /dev/null +++ b/tools/kvm/x86/include/kvm/assembly.h @@ -0,0 +1,24 @@ +#ifndef ASSEMBLY_H_ +#define ASSEMBLY_H_ + +#define __ALIGN .p2align 4, 0x90 +#define ENTRY(name) \ + __ALIGN; \ + .globl name; \ + name: + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY_END(name) GLOBAL(name##_end) +#define END(name) GLOBAL(name##_end) + +/* + * gas produces size override prefix with which + * we are unhappy, lets make it hardcoded for + * 16 bit mode + */ +#define IRET .byte 0xcf + +#endif /* ASSEMBLY_H_ */ diff --git 
a/tools/kvm/x86/include/kvm/barrier.h b/tools/kvm/x86/include/kvm/barrier.h new file mode 100644 index 000000000000..46d14f67b027 --- /dev/null +++ b/tools/kvm/x86/include/kvm/barrier.h @@ -0,0 +1,20 @@ +#ifndef _KVM_BARRIER_H_ +#define _KVM_BARRIER_H_ + +#define barrier() asm volatile("": : :"memory") + +#define mb() asm volatile ("mfence": : :"memory") +#define rmb() asm volatile ("lfence": : :"memory") +#define wmb() asm volatile ("sfence": : :"memory") + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#endif + +#endif /* _KVM_BARRIER_H_ */ diff --git a/tools/kvm/x86/include/kvm/bios-export.h b/tools/kvm/x86/include/kvm/bios-export.h new file mode 100644 index 000000000000..23825aab031d --- /dev/null +++ b/tools/kvm/x86/include/kvm/bios-export.h @@ -0,0 +1,13 @@ +#ifndef BIOS_EXPORT_H_ +#define BIOS_EXPORT_H_ + +struct kvm; + +extern char bios_rom[0]; +extern char bios_rom_end[0]; + +#define bios_rom_size (bios_rom_end - bios_rom) + +extern void setup_bios(struct kvm *kvm); + +#endif /* BIOS_EXPORT_H_ */ diff --git a/tools/kvm/x86/include/kvm/bios.h b/tools/kvm/x86/include/kvm/bios.h new file mode 100644 index 000000000000..ec7ed715e134 --- /dev/null +++ b/tools/kvm/x86/include/kvm/bios.h @@ -0,0 +1,93 @@ +#ifndef BIOS_H_ +#define BIOS_H_ + +/* + * X86-32 Memory Map (typical) + * start end + * Real Mode Interrupt Vector Table 0x00000000 0x000003FF + * BDA area 0x00000400 0x000004FF + * Conventional Low Memory 0x00000500 0x0009FBFF + * EBDA area 0x0009FC00 0x0009FFFF + * VIDEO RAM 0x000A0000 0x000BFFFF + * VIDEO ROM (BIOS) 0x000C0000 0x000C7FFF + * ROMs & unus. 
space (mapped hw & misc)0x000C8000 0x000EFFFF 160 KiB (typically) + * Motherboard BIOS 0x000F0000 0x000FFFFF + * Extended Memory 0x00100000 0xFEBFFFFF + * Reserved (configs, ACPI, PnP, etc) 0xFEC00000 0xFFFFFFFF + */ + +#define REAL_MODE_IVT_BEGIN 0x00000000 +#define REAL_MODE_IVT_END 0x000003ff + +#define BDA_START 0x00000400 +#define BDA_END 0x000004ff + +#define EBDA_START 0x0009fc00 +#define EBDA_END 0x0009ffff + +#define E820_MAP_START EBDA_START + +#define MB_BIOS_BEGIN 0x000f0000 +#define MB_FIRMWARE_BIOS_BEGIN 0x000e0000 +#define MB_BIOS_END 0x000fffff + +#define MB_BIOS_SIZE (MB_BIOS_END - MB_BIOS_BEGIN + 1) +#define MB_FIRMWARE_BIOS_SIZE (MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1) + +#define VGA_RAM_BEGIN 0x000a0000 +#define VGA_RAM_END 0x000bffff + +#define VGA_ROM_BEGIN 0x000c0000 +#define VGA_ROM_OEM_STRING VGA_ROM_BEGIN +#define VGA_ROM_OEM_STRING_SIZE 16 +#define VGA_ROM_MODES (VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE) +#define VGA_ROM_MODES_SIZE 32 +#define VGA_ROM_END 0x000c7fff + +/* we handle one page only */ +#define VGA_RAM_SEG (VGA_RAM_BEGIN >> 4) +#define VGA_PAGE_SIZE 0x007d0 /* 80x25 */ + +/* real mode interrupt vector table */ +#define REAL_INTR_BASE REAL_MODE_IVT_BEGIN +#define REAL_INTR_VECTORS 256 + +/* + * BIOS stack must be at absolute predefined memory address + * We reserve 64 bytes for BIOS stack + */ +#define MB_BIOS_SS 0xfff7 +#define MB_BIOS_SP 0x40 + +/* + * When interfere with assembler code we need to be sure how + * arguments are passed in real mode. 
+ */ +#define bioscall __attribute__((regparm(3))) + +#ifndef __ASSEMBLER__ + +#include <linux/types.h> + +struct biosregs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u32 ds; + u32 es; + u32 fs; + u32 eip; + u32 eflags; +}; + +extern bioscall void int10_handler(struct biosregs *regs); +extern bioscall void int15_handler(struct biosregs *regs); + +#endif + +#endif /* BIOS_H_ */ diff --git a/tools/kvm/x86/include/kvm/boot-protocol.h b/tools/kvm/x86/include/kvm/boot-protocol.h new file mode 100644 index 000000000000..85b637f585c2 --- /dev/null +++ b/tools/kvm/x86/include/kvm/boot-protocol.h @@ -0,0 +1,16 @@ +/* + * Linux boot protocol specifics + */ + +#ifndef BOOT_PROTOCOL_H_ +#define BOOT_PROTOCOL_H_ + +/* + * The protected mode kernel part of a modern bzImage is loaded + * at 1 MB by default. + */ +#define BZ_DEFAULT_SETUP_SECTS 4 +#define BZ_KERNEL_START 0x100000UL +#define INITRD_START 0x1000000UL + +#endif /* BOOT_PROTOCOL_H_ */ diff --git a/tools/kvm/x86/include/kvm/cpufeature.h b/tools/kvm/x86/include/kvm/cpufeature.h new file mode 100644 index 000000000000..bc4abbbb42f0 --- /dev/null +++ b/tools/kvm/x86/include/kvm/cpufeature.h @@ -0,0 +1,41 @@ +#ifndef KVM__CPUFEATURE_H +#define KVM__CPUFEATURE_H + +#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */ +#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */ +#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */ + +#define CPUID_VENDOR_AMD_1 0x68747541 /* "Auth" */ +#define CPUID_VENDOR_AMD_2 0x69746e65 /* "enti" */ +#define CPUID_VENDOR_AMD_3 0x444d4163 /* "cAMD" */ + +/* + * CPUID flags we need to deal with + */ +#define KVM__X86_FEATURE_VMX 5 /* Hardware virtualization */ +#define KVM__X86_FEATURE_SVM 2 /* Secure virtual machine */ +#define KVM__X86_FEATURE_XSAVE 26 /* XSAVE/XRSTOR/XSETBV/XGETBV */ + +#define cpu_feature_disable(reg, feature) \ + ((reg) & ~(1 << (feature))) +#define cpu_feature_enable(reg, feature) \ + ((reg) | (1 << (feature))) + +struct 
cpuid_regs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +static inline void host_cpuid(struct cpuid_regs *regs) +{ + asm volatile("cpuid" + : "=a" (regs->eax), + "=b" (regs->ebx), + "=c" (regs->ecx), + "=d" (regs->edx) + : "0" (regs->eax), "2" (regs->ecx)); +} + +#endif /* KVM__CPUFEATURE_H */ diff --git a/tools/kvm/x86/include/kvm/interrupt.h b/tools/kvm/x86/include/kvm/interrupt.h new file mode 100644 index 000000000000..00c7ed7dc3c1 --- /dev/null +++ b/tools/kvm/x86/include/kvm/interrupt.h @@ -0,0 +1,26 @@ +#ifndef KVM__INTERRUPT_H +#define KVM__INTERRUPT_H + +#include <linux/types.h> +#include "kvm/bios.h" +#include "kvm/bios-export.h" + +struct real_intr_desc { + u16 offset; + u16 segment; +} __attribute__((packed)); + +#define REAL_SEGMENT_SHIFT 4 +#define REAL_SEGMENT(addr) ((addr) >> REAL_SEGMENT_SHIFT) +#define REAL_OFFSET(addr) ((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1)) +#define REAL_INTR_SIZE (REAL_INTR_VECTORS * sizeof(struct real_intr_desc)) + +struct interrupt_table { + struct real_intr_desc entries[REAL_INTR_VECTORS]; +}; + +void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size); +void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry); +void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num); + +#endif /* KVM__INTERRUPT_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h b/tools/kvm/x86/include/kvm/kvm-arch.h new file mode 100644 index 000000000000..1e0949ed9506 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-arch.h @@ -0,0 +1,36 @@ +#ifndef KVM__KVM_ARCH_H +#define KVM__KVM_ARCH_H + +#include "kvm/interrupt.h" +#include "kvm/segment.h" + +#include <stdbool.h> +#include <linux/types.h> +#include <time.h> + +/* + * The hole includes VESA framebuffer and PCI memory. 
+ */ +#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32) +#define KVM_32BIT_GAP_SIZE (768 << 20) +#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE) + +#define KVM_MMIO_START KVM_32BIT_GAP_START + +/* This is the address that pci_get_io_space_block() starts allocating + * from. Note that this is a PCI bus address (though same on x86). + */ +#define KVM_PCI_MMIO_AREA (KVM_MMIO_START + 0x1000000) +#define KVM_VIRTIO_MMIO_AREA (KVM_MMIO_START + 0x2000000) + +#define VIRTIO_DEFAULT_TRANS VIRTIO_PCI + +struct kvm_arch { + u16 boot_selector; + u16 boot_ip; + u16 boot_sp; + + struct interrupt_table interrupt_table; +}; + +#endif /* KVM__KVM_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-config-arch.h b/tools/kvm/x86/include/kvm/kvm-config-arch.h new file mode 100644 index 000000000000..3eae8dbce0b8 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-config-arch.h @@ -0,0 +1,15 @@ +#ifndef KVM__KVM_CONFIG_ARCH_H +#define KVM__KVM_CONFIG_ARCH_H + +#include "kvm/parse-options.h" + +struct kvm_config_arch { + int vidmode; +}; + +#define OPT_ARCH_RUN(pfx, cfg) \ + pfx, \ + OPT_GROUP("BIOS options:"), \ + OPT_INTEGER('\0', "vidmode", &(cfg)->vidmode, "Video mode"), + +#endif /* KVM__KVM_CONFIG_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/kvm-cpu-arch.h b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h new file mode 100644 index 000000000000..198efe68a6f0 --- /dev/null +++ b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h @@ -0,0 +1,49 @@ +#ifndef KVM__KVM_CPU_ARCH_H +#define KVM__KVM_CPU_ARCH_H + +/* Architecture-specific kvm_cpu definitions. 
*/ + +#include <linux/kvm.h> /* for struct kvm_regs */ +#include "kvm/kvm.h" /* for kvm__emulate_{mm}io() */ +#include <stdbool.h> +#include <pthread.h> + +struct kvm; + +struct kvm_cpu { + pthread_t thread; /* VCPU thread */ + + unsigned long cpu_id; + + struct kvm *kvm; /* parent KVM */ + int vcpu_fd; /* For VCPU ioctls() */ + struct kvm_run *kvm_run; + + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + struct kvm_msrs *msrs; /* dynamically allocated */ + + u8 is_running; + u8 paused; + u8 needs_nmi; + + struct kvm_coalesced_mmio_ring *ring; +}; + +/* + * As these are such simple wrappers, let's have them in the header so they'll + * be cheaper to call: + */ +static inline bool kvm_cpu__emulate_io(struct kvm *kvm, u16 port, void *data, int direction, int size, u32 count) +{ + return kvm__emulate_io(kvm, port, data, direction, size, count); +} + +static inline bool kvm_cpu__emulate_mmio(struct kvm *kvm, u64 phys_addr, u8 *data, u32 len, u8 is_write) +{ + return kvm__emulate_mmio(kvm, phys_addr, data, len, is_write); +} + +#endif /* KVM__KVM_CPU_ARCH_H */ diff --git a/tools/kvm/x86/include/kvm/mptable.h b/tools/kvm/x86/include/kvm/mptable.h new file mode 100644 index 000000000000..9e3cfa6be1ce --- /dev/null +++ b/tools/kvm/x86/include/kvm/mptable.h @@ -0,0 +1,9 @@ +#ifndef KVM_MPTABLE_H_ +#define KVM_MPTABLE_H_ + +struct kvm; + +int mptable__init(struct kvm *kvm); +int mptable__exit(struct kvm *kvm); + +#endif /* KVM_MPTABLE_H_ */ diff --git a/tools/kvm/x86/interrupt.c b/tools/kvm/x86/interrupt.c new file mode 100644 index 000000000000..7d478690fb54 --- /dev/null +++ b/tools/kvm/x86/interrupt.c @@ -0,0 +1,28 @@ +#include "kvm/interrupt.h" + +#include "kvm/util.h" + +#include <string.h> + +void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size) +{ + if (size < sizeof(itable->entries)) + die("An attempt to overwrite host memory"); + + memcpy(dst, itable->entries, sizeof(itable->entries)); +} + +void 
interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry) +{ + unsigned int i; + + for (i = 0; i < REAL_INTR_VECTORS; i++) + itable->entries[i] = *entry; +} + +void interrupt_table__set(struct interrupt_table *itable, + struct real_intr_desc *entry, unsigned int num) +{ + if (num < REAL_INTR_VECTORS) + itable->entries[num] = *entry; +} diff --git a/tools/kvm/x86/ioport.c b/tools/kvm/x86/ioport.c new file mode 100644 index 000000000000..824ef257cdb5 --- /dev/null +++ b/tools/kvm/x86/ioport.c @@ -0,0 +1,99 @@ +#include "kvm/ioport.h" + +#include <stdlib.h> +#include <stdio.h> + +static bool debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return 0; +} + +static struct ioport_operations debug_ops = { + .io_out = debug_io_out, +}; + +static bool seabios_debug_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + char ch; + + ch = ioport__read8(data); + + putchar(ch); + + return true; +} + +static struct ioport_operations seabios_debug_ops = { + .io_out = seabios_debug_io_out, +}; + +static bool dummy_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static bool dummy_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + return true; +} + +static struct ioport_operations dummy_read_write_ioport_ops = { + .io_in = dummy_io_in, + .io_out = dummy_io_out, +}; + +static struct ioport_operations dummy_write_only_ioport_ops = { + .io_out = dummy_io_out, +}; + +/* + * The "fast A20 gate" + */ + +static bool ps2_control_a_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size) +{ + /* + * A20 is always enabled. 
+ */ + ioport__write8(data, 0x02); + + return true; +} + +static struct ioport_operations ps2_control_a_ops = { + .io_in = ps2_control_a_io_in, + .io_out = dummy_io_out, +}; + +void ioport__setup_arch(struct kvm *kvm) +{ + /* Legacy ioport setup */ + + /* 0x0020 - 0x003F - 8259A PIC 1 */ + ioport__register(kvm, 0x0020, &dummy_read_write_ioport_ops, 2, NULL); + + /* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */ + ioport__register(kvm, 0x0040, &dummy_read_write_ioport_ops, 4, NULL); + + /* 0092 - PS/2 system control port A */ + ioport__register(kvm, 0x0092, &ps2_control_a_ops, 1, NULL); + + /* 0x00A0 - 0x00AF - 8259A PIC 2 */ + ioport__register(kvm, 0x00A0, &dummy_read_write_ioport_ops, 2, NULL); + + /* PORT 00E0-00EF are 'motherboard specific' so we use them for our + internal debugging purposes. */ + ioport__register(kvm, IOPORT_DBG, &debug_ops, 1, NULL); + + /* PORT 00ED - DUMMY PORT FOR DELAY??? */ + ioport__register(kvm, 0x00ED, &dummy_write_only_ioport_ops, 1, NULL); + + /* 0x00F0 - 0x00FF - Math co-processor */ + ioport__register(kvm, 0x00F0, &dummy_write_only_ioport_ops, 2, NULL); + + /* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */ + ioport__register(kvm, 0x03D4, &dummy_read_write_ioport_ops, 1, NULL); + ioport__register(kvm, 0x03D5, &dummy_write_only_ioport_ops, 1, NULL); + + ioport__register(kvm, 0x402, &seabios_debug_ops, 1, NULL); +} diff --git a/tools/kvm/x86/irq.c b/tools/kvm/x86/irq.c new file mode 100644 index 000000000000..7447c6b7d7aa --- /dev/null +++ b/tools/kvm/x86/irq.c @@ -0,0 +1,222 @@ +#include "kvm/irq.h" +#include "kvm/kvm.h" +#include "kvm/util.h" + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/list.h> +#include <linux/kvm.h> +#include <sys/ioctl.h> + +#include <stddef.h> +#include <stdlib.h> + +#define IRQ_MAX_GSI 64 +#define IRQCHIP_MASTER 0 +#define IRQCHIP_SLAVE 1 +#define IRQCHIP_IOAPIC 2 + +static u8 next_line = 5; +static struct rb_root pci_tree = RB_ROOT; + +/* First 24 GSIs 
are routed between IRQCHIPs and IOAPICs */ +static u32 gsi = 24; + +struct kvm_irq_routing *irq_routing; + +static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin) +{ + if (gsi >= IRQ_MAX_GSI) + return -ENOSPC; + + irq_routing->entries[irq_routing->nr++] = + (struct kvm_irq_routing_entry) { + .gsi = gsi, + .type = type, + .u.irqchip.irqchip = irqchip, + .u.irqchip.pin = pin, + }; + + return 0; +} + +static struct pci_dev *search(struct rb_root *root, u32 id) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct pci_dev *data = rb_entry(node, struct pci_dev, node); + int result; + + result = id - data->id; + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static int insert(struct rb_root *root, struct pci_dev *data) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct pci_dev *this = container_of(*new, struct pci_dev, node); + int result = data->id - this->id; + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + + return 0; +} + +int irq__register_device(u32 dev, u8 *pin, u8 *line) +{ + struct pci_dev *node; + int r; + + node = search(&pci_tree, dev); + + if (!node) { + /* We haven't found a node - First device of it's kind */ + node = malloc(sizeof(*node)); + if (node == NULL) + return -ENOMEM; + + *node = (struct pci_dev) { + .id = dev, + /* + * PCI supports only INTA#,B#,C#,D# per device. + * A#,B#,C#,D# are allowed for multifunctional + * devices so stick with A# for our single + * function devices. 
+ */ + .pin = 1, + }; + + INIT_LIST_HEAD(&node->lines); + + r = insert(&pci_tree, node); + if (r) { + free(node); + return r; + } + } + + if (node) { + /* This device already has a pin assigned, give out a new line and device id */ + struct irq_line *new = malloc(sizeof(*new)); + if (new == NULL) + return -ENOMEM; + + new->line = next_line++; + *line = new->line; + *pin = node->pin; + + list_add(&new->node, &node->lines); + + return 0; + } + + return -EFAULT; +} + +int irq__init(struct kvm *kvm) +{ + int i, r; + + irq_routing = calloc(sizeof(struct kvm_irq_routing) + + IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry), 1); + if (irq_routing == NULL) + return -ENOMEM; + + /* Hook first 8 GSIs to master IRQCHIP */ + for (i = 0; i < 8; i++) + if (i != 2) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i); + + /* Hook next 8 GSIs to slave IRQCHIP */ + for (i = 8; i < 16; i++) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8); + + /* Last but not least, IOAPIC */ + for (i = 0; i < 24; i++) { + if (i == 0) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2); + else if (i != 2) + irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i); + } + + r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing); + if (r) { + free(irq_routing); + return errno; + } + + return 0; +} +dev_base_init(irq__init); + +int irq__exit(struct kvm *kvm) +{ + struct rb_node *ent; + + free(irq_routing); + + while ((ent = rb_first(&pci_tree))) { + struct pci_dev *dev; + struct irq_line *line; + + dev = rb_entry(ent, struct pci_dev, node); + while (!list_empty(&dev->lines)) { + line = list_first_entry(&dev->lines, struct irq_line, node); + list_del(&line->node); + free(line); + } + rb_erase(&dev->node, &pci_tree); + free(dev); + } + + return 0; +} +dev_base_exit(irq__exit); + +int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg) +{ + int r; + + irq_routing->entries[irq_routing->nr++] = + (struct kvm_irq_routing_entry) { + .gsi = gsi, + 
.type = KVM_IRQ_ROUTING_MSI, + .u.msi.address_hi = msg->address_hi, + .u.msi.address_lo = msg->address_lo, + .u.msi.data = msg->data, + }; + + r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing); + if (r) + return r; + + return gsi++; +} + +struct rb_node *irq__get_pci_tree(void) +{ + return rb_first(&pci_tree); +} diff --git a/tools/kvm/x86/kvm-cpu.c b/tools/kvm/x86/kvm-cpu.c new file mode 100644 index 000000000000..b6190ed31395 --- /dev/null +++ b/tools/kvm/x86/kvm-cpu.c @@ -0,0 +1,425 @@ +#include "kvm/kvm-cpu.h" + +#include "kvm/symbol.h" +#include "kvm/util.h" +#include "kvm/kvm.h" + +#include <asm/msr-index.h> +#include <asm/apicdef.h> +#include <linux/err.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +static int debug_fd; + +void kvm_cpu__set_debug_fd(int fd) +{ + debug_fd = fd; +} + +int kvm_cpu__get_debug_fd(void) +{ + return debug_fd; +} + +static inline bool is_in_protected_mode(struct kvm_cpu *vcpu) +{ + return vcpu->sregs.cr0 & 0x01; +} + +static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip) +{ + u64 cs; + + /* + * NOTE! We should take code segment base address into account here. + * Luckily it's usually zero because Linux uses flat memory model. + */ + if (is_in_protected_mode(vcpu)) + return ip; + + cs = vcpu->sregs.cs.selector; + + return ip + (cs << 4); +} + +static inline u32 selector_to_base(u16 selector) +{ + /* + * KVM on Intel requires 'base' to be 'selector * 16' in real mode. 
+ */ + return (u32)selector << 4; +} + +static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm) +{ + struct kvm_cpu *vcpu; + + vcpu = calloc(1, sizeof(*vcpu)); + if (!vcpu) + return NULL; + + vcpu->kvm = kvm; + + return vcpu; +} + +void kvm_cpu__delete(struct kvm_cpu *vcpu) +{ + if (vcpu->msrs) + free(vcpu->msrs); + + free(vcpu); +} + +static int kvm_cpu__set_lint(struct kvm_cpu *vcpu) +{ + struct local_apic lapic; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic)) + return -1; + + lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT; + lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI; + + return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic); +} + +struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id) +{ + struct kvm_cpu *vcpu; + int mmap_size; + int coalesced_offset; + + vcpu = kvm_cpu__new(kvm); + if (!vcpu) + return NULL; + + vcpu->cpu_id = cpu_id; + + vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id); + if (vcpu->vcpu_fd < 0) + die_perror("KVM_CREATE_VCPU ioctl"); + + mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) + die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl"); + + vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0); + if (vcpu->kvm_run == MAP_FAILED) + die("unable to mmap vcpu fd"); + + coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO); + if (coalesced_offset) + vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE); + + if (kvm_cpu__set_lint(vcpu)) + die_perror("KVM_SET_LAPIC failed"); + + vcpu->is_running = true; + + return vcpu; +} + +static struct kvm_msrs *kvm_msrs__new(size_t nmsrs) +{ + struct kvm_msrs *vcpu = calloc(1, sizeof(*vcpu) + (sizeof(struct kvm_msr_entry) * nmsrs)); + + if (!vcpu) + die("out of memory"); + + return vcpu; +} + +#define KVM_MSR_ENTRY(_index, _data) \ + (struct kvm_msr_entry) { .index = _index, .data = _data } + +static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu) +{ + unsigned long ndx = 0; + + 
vcpu->msrs = kvm_msrs__new(100); + + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP, 0x0); +#ifdef CONFIG_X86_64 + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR, 0x0); +#endif + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC, 0x0); + vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING); + + vcpu->msrs->nmsrs = ndx; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0) + die_perror("KVM_SET_MSRS failed"); +} + +static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu) +{ + vcpu->fpu = (struct kvm_fpu) { + .fcw = 0x37f, + .mxcsr = 0x1f80, + }; + + if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0) + die_perror("KVM_SET_FPU failed"); +} + +static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu) +{ + vcpu->regs = (struct kvm_regs) { + /* We start the guest in 16-bit real mode */ + .rflags = 0x0000000000000002ULL, + + .rip = vcpu->kvm->arch.boot_ip, + .rsp = vcpu->kvm->arch.boot_sp, + .rbp = vcpu->kvm->arch.boot_sp, + }; + + if (vcpu->regs.rip > USHRT_MAX) + die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0) + die_perror("KVM_SET_REGS failed"); +} + +static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu) +{ + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die_perror("KVM_GET_SREGS failed"); + + vcpu->sregs.cs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.cs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.ss.selector = vcpu->kvm->arch.boot_selector; + 
vcpu->sregs.ss.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.ds.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.ds.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.es.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.es.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.fs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.fs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + vcpu->sregs.gs.selector = vcpu->kvm->arch.boot_selector; + vcpu->sregs.gs.base = selector_to_base(vcpu->kvm->arch.boot_selector); + + if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0) + die_perror("KVM_SET_SREGS failed"); +} + +/** + * kvm_cpu__reset_vcpu - reset virtual CPU to a known state + */ +void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu) +{ + kvm_cpu__setup_cpuid(vcpu); + kvm_cpu__setup_sregs(vcpu); + kvm_cpu__setup_regs(vcpu); + kvm_cpu__setup_fpu(vcpu); + kvm_cpu__setup_msrs(vcpu); +} + +bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu) +{ + return false; +} + +static void print_dtable(const char *name, struct kvm_dtable *dtable) +{ + dprintf(debug_fd, " %s %016llx %08hx\n", + name, (u64) dtable->base, (u16) dtable->limit); +} + +static void print_segment(const char *name, struct kvm_segment *seg) +{ + dprintf(debug_fd, " %s %04hx %016llx %08x %02hhx %x %x %x %x %x %x %x\n", + name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit, + (u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl); +} + +void kvm_cpu__show_registers(struct kvm_cpu *vcpu) +{ + unsigned long cr0, cr2, cr3; + unsigned long cr4, cr8; + unsigned long rax, rbx, rcx; + unsigned long rdx, rsi, rdi; + unsigned long rbp, r8, r9; + unsigned long r10, r11, r12; + unsigned long r13, r14, r15; + unsigned long rip, rsp; + struct kvm_sregs sregs; + unsigned long rflags; + struct kvm_regs regs; + int i; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, ®s) < 0) + die("KVM_GET_REGS failed"); + + rflags = 
regs.rflags; + + rip = regs.rip; rsp = regs.rsp; + rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx; + rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi; + rbp = regs.rbp; r8 = regs.r8; r9 = regs.r9; + r10 = regs.r10; r11 = regs.r11; r12 = regs.r12; + r13 = regs.r13; r14 = regs.r14; r15 = regs.r15; + + dprintf(debug_fd, "\n Registers:\n"); + dprintf(debug_fd, " ----------\n"); + dprintf(debug_fd, " rip: %016lx rsp: %016lx flags: %016lx\n", rip, rsp, rflags); + dprintf(debug_fd, " rax: %016lx rbx: %016lx rcx: %016lx\n", rax, rbx, rcx); + dprintf(debug_fd, " rdx: %016lx rsi: %016lx rdi: %016lx\n", rdx, rsi, rdi); + dprintf(debug_fd, " rbp: %016lx r8: %016lx r9: %016lx\n", rbp, r8, r9); + dprintf(debug_fd, " r10: %016lx r11: %016lx r12: %016lx\n", r10, r11, r12); + dprintf(debug_fd, " r13: %016lx r14: %016lx r15: %016lx\n", r13, r14, r15); + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0) + die("KVM_GET_REGS failed"); + + cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3; + cr4 = sregs.cr4; cr8 = sregs.cr8; + + dprintf(debug_fd, " cr0: %016lx cr2: %016lx cr3: %016lx\n", cr0, cr2, cr3); + dprintf(debug_fd, " cr4: %016lx cr8: %016lx\n", cr4, cr8); + dprintf(debug_fd, "\n Segment registers:\n"); + dprintf(debug_fd, " ------------------\n"); + dprintf(debug_fd, " register selector base limit type p dpl db s l g avl\n"); + print_segment("cs ", &sregs.cs); + print_segment("ss ", &sregs.ss); + print_segment("ds ", &sregs.ds); + print_segment("es ", &sregs.es); + print_segment("fs ", &sregs.fs); + print_segment("gs ", &sregs.gs); + print_segment("tr ", &sregs.tr); + print_segment("ldt", &sregs.ldt); + print_dtable("gdt", &sregs.gdt); + print_dtable("idt", &sregs.idt); + + dprintf(debug_fd, "\n APIC:\n"); + dprintf(debug_fd, " -----\n"); + dprintf(debug_fd, " efer: %016llx apic base: %016llx nmi: %s\n", + (u64) sregs.efer, (u64) sregs.apic_base, + (vcpu->kvm->nmi_disabled ? 
"disabled" : "enabled")); + + dprintf(debug_fd, "\n Interrupt bitmap:\n"); + dprintf(debug_fd, " -----------------\n"); + for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) + dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]); + dprintf(debug_fd, "\n"); +} + +#define MAX_SYM_LEN 128 + +void kvm_cpu__show_code(struct kvm_cpu *vcpu) +{ + unsigned int code_bytes = 64; + unsigned int code_prologue = 43; + unsigned int code_len = code_bytes; + char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym; + unsigned char c; + unsigned int i; + u8 *ip; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0) + die("KVM_GET_REGS failed"); + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die("KVM_GET_SREGS failed"); + + ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue); + + dprintf(debug_fd, "\n Code:\n"); + dprintf(debug_fd, " -----\n"); + + psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN); + if (IS_ERR(psym)) + dprintf(debug_fd, + "Warning: symbol_lookup() failed to find symbol " + "with error: %ld\n", PTR_ERR(psym)); + + dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym); + + for (i = 0; i < code_len; i++, ip++) { + if (!host_ptr_in_ram(vcpu->kvm, ip)) + break; + + c = *ip; + + if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip))) + dprintf(debug_fd, " <%02x>", c); + else + dprintf(debug_fd, " %02x", c); + } + + dprintf(debug_fd, "\n"); + + dprintf(debug_fd, "\n Stack:\n"); + dprintf(debug_fd, " ------\n"); + kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32); +} + +void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu) +{ + u64 *pte1; + u64 *pte2; + u64 *pte3; + u64 *pte4; + + if (!is_in_protected_mode(vcpu)) + return; + + if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0) + die("KVM_GET_SREGS failed"); + + pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3); + if (!host_ptr_in_ram(vcpu->kvm, pte4)) + return; + + pte3 = 
guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte3)) + return; + + pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte2)) + return; + + pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff)); + if (!host_ptr_in_ram(vcpu->kvm, pte1)) + return; + + dprintf(debug_fd, "Page Tables:\n"); + if (*pte2 & (1 << 7)) + dprintf(debug_fd, " pte4: %016llx pte3: %016llx" + " pte2: %016llx\n", + *pte4, *pte3, *pte2); + else + dprintf(debug_fd, " pte4: %016llx pte3: %016llx pte2: %016" + "llx pte1: %016llx\n", + *pte4, *pte3, *pte2, *pte1); +} + +void kvm_cpu__arch_nmi(struct kvm_cpu *cpu) +{ + struct kvm_lapic_state klapic; + struct local_apic *lapic = (void *)&klapic; + + if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0) + return; + + if (lapic->lvt_lint1.mask) + return; + + if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI) + return; + + ioctl(cpu->vcpu_fd, KVM_NMI); +} diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c new file mode 100644 index 000000000000..687e6b7acd4e --- /dev/null +++ b/tools/kvm/x86/kvm.c @@ -0,0 +1,381 @@ +#include "kvm/kvm.h" +#include "kvm/boot-protocol.h" +#include "kvm/cpufeature.h" +#include "kvm/interrupt.h" +#include "kvm/mptable.h" +#include "kvm/util.h" +#include "kvm/8250-serial.h" +#include "kvm/virtio-console.h" + +#include <asm/bootparam.h> +#include <linux/kvm.h> + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> + +struct kvm_ext kvm_req_ext[] = { + { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) }, + { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) }, + { DEFINE_KVM_EXT(KVM_CAP_PIT2) }, + { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) }, + { DEFINE_KVM_EXT(KVM_CAP_HLT) }, + { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) }, + { 
DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) }, + { 0, 0 } +}; + +bool kvm__arch_cpu_supports_vm(void) +{ + struct cpuid_regs regs; + u32 eax_base; + int feature; + + regs = (struct cpuid_regs) { + .eax = 0x00, + }; + host_cpuid(®s); + + switch (regs.ebx) { + case CPUID_VENDOR_INTEL_1: + eax_base = 0x00; + feature = KVM__X86_FEATURE_VMX; + break; + + case CPUID_VENDOR_AMD_1: + eax_base = 0x80000000; + feature = KVM__X86_FEATURE_SVM; + break; + + default: + return false; + } + + regs = (struct cpuid_regs) { + .eax = eax_base, + }; + host_cpuid(®s); + + if (regs.eax < eax_base + 0x01) + return false; + + regs = (struct cpuid_regs) { + .eax = eax_base + 0x01 + }; + host_cpuid(®s); + + return regs.ecx & (1 << feature); +} + +/* + * Allocating RAM size bigger than 4GB requires us to leave a gap + * in the RAM which is used for PCI MMIO, hotplug, and unconfigured + * devices (see documentation of e820_setup_gap() for details). + * + * If we're required to initialize RAM bigger than 4GB, we will create + * a gap between 0xe0000000 and 0x100000000 in the guest virtual mem space. 
+ */ + +void kvm__init_ram(struct kvm *kvm) +{ + u64 phys_start, phys_size; + void *host_mem; + + if (kvm->ram_size < KVM_32BIT_GAP_START) { + /* Use a single block of RAM for 32bit RAM */ + + phys_start = 0; + phys_size = kvm->ram_size; + host_mem = kvm->ram_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + } else { + /* First RAM range from zero to the PCI gap: */ + + phys_start = 0; + phys_size = KVM_32BIT_GAP_START; + host_mem = kvm->ram_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + + /* Second RAM range from 4GB to the end of RAM: */ + + phys_start = KVM_32BIT_MAX_MEM_SIZE; + phys_size = kvm->ram_size - phys_start; + host_mem = kvm->ram_start + phys_start; + + kvm__register_mem(kvm, phys_start, phys_size, host_mem); + } +} + +/* Arch-specific commandline setup */ +void kvm__arch_set_cmdline(char *cmdline, bool video) +{ + strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 " + "i8042.dumbkbd=1 i8042.nopnp=1"); + if (video) + strcat(cmdline, " video=vesafb console=tty0"); + else + strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1"); +} + +/* Architecture-specific KVM init */ +void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size) +{ + struct kvm_pit_config pit_config = { .flags = 0, }; + int ret; + + ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000); + if (ret < 0) + die_perror("KVM_SET_TSS_ADDR ioctl"); + + ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config); + if (ret < 0) + die_perror("KVM_CREATE_PIT2 ioctl"); + + if (ram_size < KVM_32BIT_GAP_START) { + kvm->ram_size = ram_size; + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size); + } else { + kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE); + kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE; + if (kvm->ram_start != MAP_FAILED) + /* + * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that + * if we accidently write to 
it, we will know. + */ + mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE); + } + if (kvm->ram_start == MAP_FAILED) + die("out of memory"); + + madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE); + + ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP); + if (ret < 0) + die_perror("KVM_CREATE_IRQCHIP ioctl"); +} + +void kvm__arch_delete_ram(struct kvm *kvm) +{ + munmap(kvm->ram_start, kvm->ram_size); +} + +void kvm__irq_line(struct kvm *kvm, int irq, int level) +{ + struct kvm_irq_level irq_level; + + irq_level = (struct kvm_irq_level) { + { + .irq = irq, + }, + .level = level, + }; + + if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0) + die_perror("KVM_IRQ_LINE failed"); +} + +void kvm__irq_trigger(struct kvm *kvm, int irq) +{ + kvm__irq_line(kvm, irq, 1); + kvm__irq_line(kvm, irq, 0); +} + +#define BOOT_LOADER_SELECTOR 0x1000 +#define BOOT_LOADER_IP 0x0000 +#define BOOT_LOADER_SP 0x8000 +#define BOOT_CMDLINE_OFFSET 0x20000 + +#define BOOT_PROTOCOL_REQUIRED 0x206 +#define LOAD_HIGH 0x01 + +static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset) +{ + unsigned long flat = segment_to_flat(selector, offset); + + return guest_flat_to_host(kvm, flat); +} + +int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline) +{ + void *p; + int nr; + + /* + * Some architectures may support loading an initrd alongside the flat kernel, + * but we do not. 
+ */ + if (fd_initrd != -1) + pr_warning("Loading initrd with flat binary not supported."); + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + kvm->arch.boot_selector = BOOT_LOADER_SELECTOR; + kvm->arch.boot_ip = BOOT_LOADER_IP; + kvm->arch.boot_sp = BOOT_LOADER_SP; + + return true; +} + +static const char *BZIMAGE_MAGIC = "HdrS"; + +bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, + const char *kernel_cmdline) +{ + struct boot_params *kern_boot; + unsigned long setup_sects; + struct boot_params boot; + size_t cmdline_size; + ssize_t setup_size; + void *p; + int nr; + u16 vidmode; + + /* + * See Documentation/x86/boot.txt for details no bzImage on-disk and + * memory layout. + */ + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot)) + return false; + + if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC))) + return false; + + if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED) + die("Too old kernel"); + + if (lseek(fd_kernel, 0, SEEK_SET) < 0) + die_perror("lseek"); + + if (!boot.hdr.setup_sects) + boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS; + setup_sects = boot.hdr.setup_sects + 1; + + setup_size = setup_sects << 9; + p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP); + + /* copy setup.bin to mem*/ + if (read(fd_kernel, p, setup_size) != setup_size) + die_perror("read"); + + /* copy vmlinux.bin to BZ_KERNEL_START*/ + p = guest_flat_to_host(kvm, BZ_KERNEL_START); + + while ((nr = read(fd_kernel, p, 65536)) > 0) + p += nr; + + p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET); + if (kernel_cmdline) { + cmdline_size = strlen(kernel_cmdline) + 1; + if (cmdline_size > boot.hdr.cmdline_size) + cmdline_size = boot.hdr.cmdline_size; + + memset(p, 0, boot.hdr.cmdline_size); + memcpy(p, kernel_cmdline, 
cmdline_size - 1); + } + + if (!kvm->cfg.arch.vidmode) + vidmode = -1; + + /* vidmode should be either specified or set by default */ + if (kvm->cfg.vnc || kvm->cfg.sdl) { + if (vidmode == -1) + vidmode = 0x312; + } else { + vidmode = 0; + } + + kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00); + + kern_boot->hdr.cmd_line_ptr = BOOT_CMDLINE_OFFSET; + kern_boot->hdr.type_of_loader = 0xff; + kern_boot->hdr.heap_end_ptr = 0xfe00; + kern_boot->hdr.loadflags |= CAN_USE_HEAP; + kern_boot->hdr.vid_mode = vidmode; + + /* + * Read initrd image into guest memory + */ + if (fd_initrd >= 0) { + struct stat initrd_stat; + unsigned long addr; + + if (fstat(fd_initrd, &initrd_stat)) + die_perror("fstat"); + + addr = boot.hdr.initrd_addr_max & ~0xfffff; + for (;;) { + if (addr < BZ_KERNEL_START) + die("Not enough memory for initrd"); + else if (addr < (kvm->ram_size - initrd_stat.st_size)) + break; + addr -= 0x100000; + } + + p = guest_flat_to_host(kvm, addr); + nr = read(fd_initrd, p, initrd_stat.st_size); + if (nr != initrd_stat.st_size) + die("Failed to read initrd"); + + kern_boot->hdr.ramdisk_image = addr; + kern_boot->hdr.ramdisk_size = initrd_stat.st_size; + } + + kvm->arch.boot_selector = BOOT_LOADER_SELECTOR; + /* + * The real-mode setup code starts at offset 0x200 of a bzImage. See + * Documentation/x86/boot.txt for details. + */ + kvm->arch.boot_ip = BOOT_LOADER_IP + 0x200; + kvm->arch.boot_sp = BOOT_LOADER_SP; + + return true; +} + +/** + * kvm__arch_setup_firmware - inject BIOS into guest system memory + * @kvm - guest system descriptor + * + * This function is a main routine where we poke guest memory + * and install BIOS there. 
+ */ +int kvm__arch_setup_firmware(struct kvm *kvm) +{ + /* standart minimal configuration */ + setup_bios(kvm); + + /* FIXME: SMP, ACPI and friends here */ + + return 0; +} + +int kvm__arch_free_firmware(struct kvm *kvm) +{ + return 0; +} + +void kvm__arch_periodic_poll(struct kvm *kvm) +{ + serial8250__update_consoles(kvm); + virtio_console__inject_interrupt(kvm); +} diff --git a/tools/kvm/x86/mptable.c b/tools/kvm/x86/mptable.c new file mode 100644 index 000000000000..ea8c6e8c848f --- /dev/null +++ b/tools/kvm/x86/mptable.c @@ -0,0 +1,289 @@ +#include "kvm/kvm.h" +#include "kvm/bios.h" +#include "kvm/apic.h" +#include "kvm/mptable.h" +#include "kvm/util.h" +#include "kvm/irq.h" + +#include <linux/kernel.h> +#include <string.h> + +#include <asm/mpspec_def.h> +#include <linux/types.h> + +/* + * FIXME: please make sure the addresses borrowed + * for apic/ioapic never overlaped! We need a global + * tracker of system resources (including io, mmio, + * and friends). + */ + +static unsigned int mpf_checksum(unsigned char *mp, int len) +{ + unsigned int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu) +{ + /* sets enabled/disabled | BSP/AP processor */ + return ( (cpu < ncpu) ? CPU_ENABLED : 0) | + ((cpu == 0) ? CPU_BOOTPROCESSOR : 0x00); +} + +#define MPTABLE_SIG_FLOATING "_MP_" +#define MPTABLE_OEM "KVMCPU00" +#define MPTABLE_PRODUCTID "0.1 " +#define MPTABLE_PCIBUSTYPE "PCI " +#define MPTABLE_ISABUSTYPE "ISA " + +#define MPTABLE_STRNCPY(d, s) memcpy(d, s, sizeof(d)) + +/* It should be more than enough */ +#define MPTABLE_MAX_SIZE (32 << 20) + +/* + * Too many cpus will require x2apic mode + * and rather ACPI support so we limit it + * here for a while. 
+ */ +#define MPTABLE_MAX_CPUS 255 + +static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc, + u16 srcbusid, u16 srcbusirq, + u16 dstapic, u16 dstirq) +{ + *mpc_intsrc = (struct mpc_intsrc) { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQDIR_DEFAULT, + .srcbus = srcbusid, + .srcbusirq = srcbusirq, + .dstapic = dstapic, + .dstirq = dstirq + }; +} + +/** + * mptable_setup - create mptable and fill guest memory with it + */ +int mptable__init(struct kvm *kvm) +{ + unsigned long real_mpc_table, real_mpf_intel, size; + struct mpf_intel *mpf_intel; + struct mpc_table *mpc_table; + struct mpc_cpu *mpc_cpu; + struct mpc_bus *mpc_bus; + struct mpc_ioapic *mpc_ioapic; + struct mpc_intsrc *mpc_intsrc; + struct rb_node *pci_tree; + + const int pcibusid = 0; + const int isabusid = 1; + + unsigned int i, nentries = 0, ncpus = kvm->nrcpus; + unsigned int ioapicid; + void *last_addr; + + /* That is where MP table will be in guest memory */ + real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16); + + if (ncpus > MPTABLE_MAX_CPUS) { + pr_warning("Too many cpus: %d limited to %d", + ncpus, MPTABLE_MAX_CPUS); + ncpus = MPTABLE_MAX_CPUS; + } + + mpc_table = calloc(1, MPTABLE_MAX_SIZE); + if (!mpc_table) + return -ENOMEM; + + MPTABLE_STRNCPY(mpc_table->signature, MPC_SIGNATURE); + MPTABLE_STRNCPY(mpc_table->oem, MPTABLE_OEM); + MPTABLE_STRNCPY(mpc_table->productid, MPTABLE_PRODUCTID); + + mpc_table->spec = 4; + mpc_table->lapic = APIC_ADDR(0); + mpc_table->oemcount = ncpus; /* will be updated again at end */ + + /* + * CPUs enumeration. Technically speaking we should + * ask either host or HV for apic version supported + * but for a while we simply put some random value + * here. 
+ */ + mpc_cpu = (void *)&mpc_table[1]; + for (i = 0; i < ncpus; i++) { + mpc_cpu->type = MP_PROCESSOR; + mpc_cpu->apicid = i; + mpc_cpu->apicver = KVM_APIC_VERSION; + mpc_cpu->cpuflag = gen_cpu_flag(i, ncpus); + mpc_cpu->cpufeature = 0x600; /* some default value */ + mpc_cpu->featureflag = 0x201; /* some default value */ + mpc_cpu++; + } + + last_addr = (void *)mpc_cpu; + nentries += ncpus; + + /* + * PCI buses. + * FIXME: Some callback here to obtain real number + * of PCI buses present in system. + */ + mpc_bus = last_addr; + mpc_bus->type = MP_BUS; + mpc_bus->busid = pcibusid; + MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE); + + last_addr = (void *)&mpc_bus[1]; + nentries++; + + /* + * ISA bus. + * FIXME: Same issue as for PCI bus. + */ + mpc_bus = last_addr; + mpc_bus->type = MP_BUS; + mpc_bus->busid = isabusid; + MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE); + + last_addr = (void *)&mpc_bus[1]; + nentries++; + + /* + * IO-APIC chip. + */ + ioapicid = ncpus + 1; + mpc_ioapic = last_addr; + mpc_ioapic->type = MP_IOAPIC; + mpc_ioapic->apicid = ioapicid; + mpc_ioapic->apicver = KVM_APIC_VERSION; + mpc_ioapic->flags = MPC_APIC_USABLE; + mpc_ioapic->apicaddr = IOAPIC_ADDR(0); + + last_addr = (void *)&mpc_ioapic[1]; + nentries++; + + /* + * IRQ sources. + * + * FIXME: Same issue as with buses. We definitely + * need kind of collector routine which enumerate + * resources used first and pass them here. + * At moment we know we have only virtio block device + * and virtio console but this is g00berfish. + * + * Also note we use PCI irqs here, no for ISA bus yet. 
+ */ + + for (pci_tree = irq__get_pci_tree(); pci_tree; pci_tree = rb_next(pci_tree)) { + struct pci_dev *dev = rb_entry(pci_tree, struct pci_dev, node); + struct irq_line *irq_line; + + list_for_each_entry(irq_line, &dev->lines, node) { + unsigned char srcbusirq; + + srcbusirq = (dev->id << 2) | (dev->pin - 1); + + mpc_intsrc = last_addr; + + mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, irq_line->line); + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + } + } + + /* + * Local IRQs assignment (LINT0, LINT1) + */ + mpc_intsrc = last_addr; + mpc_intsrc->type = MP_LINTSRC; + mpc_intsrc->irqtype = mp_ExtINT; + mpc_intsrc->irqtype = mp_INT; + mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT; + mpc_intsrc->srcbus = isabusid; + mpc_intsrc->srcbusirq = 0; + mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */ + mpc_intsrc->dstirq = 0; /* LINT0 */ + + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + + mpc_intsrc = last_addr; + mpc_intsrc->type = MP_LINTSRC; + mpc_intsrc->irqtype = mp_NMI; + mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT; + mpc_intsrc->srcbus = isabusid; + mpc_intsrc->srcbusirq = 0; + mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */ + mpc_intsrc->dstirq = 1; /* LINT1 */ + + last_addr = (void *)&mpc_intsrc[1]; + nentries++; + + /* + * Floating MP table finally. + */ + real_mpf_intel = ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16); + mpf_intel = (void *)((unsigned long)mpc_table + real_mpf_intel); + + MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING); + mpf_intel->length = 1; + mpf_intel->specification= 4; + mpf_intel->physptr = (unsigned int)real_mpc_table; + mpf_intel->checksum = -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel)); + + /* + * No last_addr inclrement here please, we need last + * active position here to compute table size. + */ + + /* + * Don't forget to update header in fixed table. 
+ */ + mpc_table->oemcount = nentries; + mpc_table->length = last_addr - (void *)mpc_table; + mpc_table->checksum = -mpf_checksum((unsigned char *)mpc_table, mpc_table->length); + + + /* + * We will copy the whole table, no need to separate + * floating structure and table itkvm. + */ + size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table; + + /* + * The finial check -- never get out of system bios + * area. Lets also check for allocated memory overrun, + * in real it's late but still usefull. + */ + + if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) || + size > MPTABLE_MAX_SIZE) { + free(mpc_table); + pr_err("MP table is too big"); + + return -E2BIG; + } + + /* + * OK, it is time to move it to guest memory. + */ + memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size); + + free(mpc_table); + + return 0; +} +firmware_init(mptable__init); + +int mptable__exit(struct kvm *kvm) +{ + return 0; +} +firmware_exit(mptable__exit); diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 5a824e355d04..bb8b3db0e583 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -1463,7 +1462,8 @@ static int event_read_fields(struct event_format *event, struct format_field **f if (read_expect_type(EVENT_ITEM, &token)) goto fail; - /* add signed type */ + if (strtoul(token, NULL, 0)) + field->flags |= FIELD_IS_SIGNED; free_token(token); if (read_expected(EVENT_OP, ";") < 0) @@ -1785,6 +1785,8 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) strcmp(token, "/") == 0 || strcmp(token, "<") == 0 || strcmp(token, ">") == 0 || + strcmp(token, "<=") == 0 || + strcmp(token, ">=") == 0 || strcmp(token, "==") == 0 || strcmp(token, "!=") == 0) { @@ -2481,7 +2483,7 @@ process_dynamic_array(struct event_format *event, struct print_arg *arg, char ** free_token(token); arg = alloc_arg(); - if (!field) { + if (!arg) { do_warning("%s: not enough memory!", __func__); *tok = NULL; return EVENT_ERROR; diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 24a4bbabc5d5..7be7e89533e4 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/event-utils.h b/tools/lib/traceevent/event-utils.h index bc075006966e..e76c9acb92cd 100644 --- a/tools/lib/traceevent/event-utils.h +++ b/tools/lib/traceevent/event-utils.h @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 5ea4326ad11f..2500e75583fc 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/lib/traceevent/parse-utils.c b/tools/lib/traceevent/parse-utils.c index f023a133abb6..bba701cf10e6 100644 --- a/tools/lib/traceevent/parse-utils.c +++ b/tools/lib/traceevent/parse-utils.c @@ -1,3 +1,22 @@ +/* + * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c index b1ccc923e8a5..a57db805136a 100644 --- a/tools/lib/traceevent/trace-seq.c +++ b/tools/lib/traceevent/trace-seq.c @@ -13,8 +13,7 @@ * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * License along with this program; if not, see <http://www.gnu.org/licenses> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index ef6d22e879eb..eb30044a922a 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -222,10 +222,14 @@ install-pdf: pdf #install-html: html # '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) $(OUTPUT)PERF-VERSION-FILE -include $(OUTPUT)PERF-VERSION-FILE +endif +endif # # Determine "include::" file references in asciidoc files. diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt index c1057701a7dc..8e798baae0fd 100644 --- a/tools/perf/Documentation/perf-buildid-cache.txt +++ b/tools/perf/Documentation/perf-buildid-cache.txt @@ -24,6 +24,9 @@ OPTIONS -r:: --remove=:: Remove specified file from the cache. +-M:: +--missing=:: + List missing build ids in the cache for the specified file. 
-v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index 194f37d635df..5b3123d5721f 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -22,10 +22,6 @@ specified perf.data files. OPTIONS ------- --M:: ---displacement:: - Show position displacement relative to baseline. - -D:: --dump-raw-trace:: Dump raw trace in ASCII. diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f4d91bebd59d..848a0dcb6dfd 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -57,11 +57,44 @@ OPTIONS -s:: --sort=:: - Sort by key(s): pid, comm, dso, symbol, parent, srcline. + Sort histogram entries by given key(s) - multiple keys can be specified + in CSV format. Following sort keys are available: + pid, comm, dso, symbol, parent, cpu, srcline. + + Each key has following meaning: + + - comm: command (name) of the task which can be read via /proc/<pid>/comm + - pid: command and tid of the task + - dso: name of library or module executed at the time of sample + - symbol: name of function executed at the time of sample + - parent: name of function matched to the parent regex filter. Unmatched + entries are displayed as "[other]". + - cpu: cpu number the task ran at the time of sample + - srcline: filename and line number executed at the time of sample. The + DWARF debuggin info must be provided. + + By default, comm, dso and symbol keys are used. + (i.e. --sort comm,dso,symbol) + + If --branch-stack option is used, following sort keys are also + available: + dso_from, dso_to, symbol_from, symbol_to, mispredict. 
+ + - dso_from: name of library or module branched from + - dso_to: name of library or module branched to + - symbol_from: name of function branched from + - symbol_to: name of function branched to + - mispredict: "N" for predicted branch, "Y" for mispredicted branch + + And default sort keys are changed to comm, dso_from, symbol_from, dso_to + and symbol_to, see '--branch-stack'. -p:: --parent=<regex>:: - regex filter to identify parent, see: '--sort parent' + A regex filter to identify parent. The parent is a caller of this + function and searched through the callchain, thus it requires callchain + information recorded. The pattern is in the exteneded regex format and + defaults to "\^sys_|^do_page_fault", see '--sort parent'. -x:: --exclude-other:: @@ -74,7 +107,6 @@ OPTIONS -t:: --field-separator=:: - Use a special separator character and don't pad with spaces, replacing all occurrences of this separator in symbol names (and other output) with a '.' character, that thus it's the only non valid separator. 
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index a4027f221a53..9f1f054b8432 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -336,7 +336,6 @@ scripts listed by the 'perf script -l' command e.g.: ---- root@tropicana:~# perf script -l List of available trace scripts: - workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency rw-by-file <comm> r/w activity for a program, by file rw-by-pid system-wide r/w activity @@ -402,7 +401,6 @@ should show a new entry for your script: ---- root@tropicana:~# perf script -l List of available trace scripts: - workqueue-stats workqueue stats (ins/exe/create/destroy) wakeup-latency system-wide min/max/avg wakeup latency rw-by-file <comm> r/w activity for a program, by file rw-by-pid system-wide r/w activity diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index b24ac40fcd58..d1d3e5121f89 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -23,6 +23,10 @@ from 'perf test list'. OPTIONS ------- +-s:: +--skip:: + Tests to skip (comma separater numeric list). + -v:: --verbose:: Be more verbose. diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 5b80d84d6b4a..a414bc95fd52 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -60,7 +60,7 @@ Default is to monitor all CPUS. -i:: --inherit:: - Child tasks inherit counters, only makes sens with -p option. + Child tasks do not inherit counters. 
-k <path>:: --vmlinux=<path>:: diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 8ab05e543ef4..b62dbc0d974a 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -50,7 +50,6 @@ include config/utilities.mak $(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT) --include $(OUTPUT)PERF-VERSION-FILE uname_M := $(shell uname -m 2>/dev/null || echo not) @@ -104,7 +103,7 @@ ifdef PARSER_DEBUG endif CFLAGS = -fno-omit-frame-pointer -ggdb3 -funwind-tables -Wall -Wextra -std=gnu99 $(CFLAGS_WERROR) $(CFLAGS_OPTIMIZE) $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) $(PARSER_DEBUG_CFLAGS) -EXTLIBS = -lpthread -lrt -lelf -lm +EXTLIBS = -lpthread -lrt -lelf -lm -lnuma ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE ALL_LDFLAGS = $(LDFLAGS) STRIP ?= strip @@ -153,6 +152,8 @@ INSTALL = install # explicitly what architecture to check for. Fix this up for yours.. SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) -include config/feature-tests.mak ifeq ($(call try-cc,$(SOURCE_HELLO),$(CFLAGS) -Werror -fstack-protector-all,-fstack-protector-all),y) @@ -206,6 +207,8 @@ ifeq ($(call try-cc,$(SOURCE_BIONIC),$(CFLAGS),bionic),y) EXTLIBS := $(filter-out -lpthread,$(EXTLIBS)) BASIC_CFLAGS += -I. 
endif +endif # MAKECMDGOALS != tags +endif # MAKECMDGOALS != clean # Guard against environment variables BUILTIN_OBJS = @@ -230,11 +233,19 @@ endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a TE_LIB := -L$(TE_PATH) -ltraceevent +export LIBTRACEEVENT + +# python extension build directories +PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ +PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/ +PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/ +export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP + +python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so + PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources) PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py -export LIBTRACEEVENT - $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \ --quiet build_ext; \ @@ -378,8 +389,11 @@ LIB_H += util/rblist.h LIB_H += util/intlist.h LIB_H += util/perf_regs.h LIB_H += util/unwind.h -LIB_H += ui/helpline.h LIB_H += util/vdso.h +LIB_H += ui/helpline.h +LIB_H += ui/progress.h +LIB_H += ui/util.h +LIB_H += ui/ui.h LIB_OBJS += $(OUTPUT)util/abspath.o LIB_OBJS += $(OUTPUT)util/alias.o @@ -453,6 +467,7 @@ LIB_OBJS += $(OUTPUT)util/stat.o LIB_OBJS += $(OUTPUT)ui/setup.o LIB_OBJS += $(OUTPUT)ui/helpline.o LIB_OBJS += $(OUTPUT)ui/progress.o +LIB_OBJS += $(OUTPUT)ui/util.o LIB_OBJS += $(OUTPUT)ui/hist.o LIB_OBJS += $(OUTPUT)ui/stdio/hist.o @@ -471,11 +486,13 @@ LIB_OBJS += $(OUTPUT)tests/rdpmc.o LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o LIB_OBJS += $(OUTPUT)tests/pmu.o -LIB_OBJS += $(OUTPUT)tests/util.o +LIB_OBJS += $(OUTPUT)tests/hists_link.o +LIB_OBJS += $(OUTPUT)tests/python-use.o BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o BUILTIN_OBJS += $(OUTPUT)builtin-bench.o # Benchmark modules +BUILTIN_OBJS += $(OUTPUT)bench/numa.o BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o ifeq ($(RAW_ARCH),x86_64) @@ -510,14 
+527,13 @@ PERFLIBS = $(LIB_FILE) $(LIBTRACEEVENT) # # Platform specific tweaks # +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),tags) # We choose to avoid "if .. else if .. else .. endif endif" # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... --include config.mak.autogen --include config.mak - ifdef NO_LIBELF NO_DWARF := 1 NO_DEMANGLE := 1 @@ -646,7 +662,6 @@ ifndef NO_NEWT LIB_OBJS += $(OUTPUT)ui/browsers/hists.o LIB_OBJS += $(OUTPUT)ui/browsers/map.o LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o - LIB_OBJS += $(OUTPUT)ui/util.o LIB_OBJS += $(OUTPUT)ui/tui/setup.o LIB_OBJS += $(OUTPUT)ui/tui/util.o LIB_OBJS += $(OUTPUT)ui/tui/helpline.o @@ -655,9 +670,6 @@ ifndef NO_NEWT LIB_H += ui/browsers/map.h LIB_H += ui/keysyms.h LIB_H += ui/libslang.h - LIB_H += ui/progress.h - LIB_H += ui/util.h - LIB_H += ui/ui.h endif endif @@ -673,14 +685,11 @@ ifndef NO_GTK2 BASIC_CFLAGS += $(shell pkg-config --cflags gtk+-2.0 2>/dev/null) EXTLIBS += $(shell pkg-config --libs gtk+-2.0 2>/dev/null) LIB_OBJS += $(OUTPUT)ui/gtk/browser.o + LIB_OBJS += $(OUTPUT)ui/gtk/hists.o LIB_OBJS += $(OUTPUT)ui/gtk/setup.o LIB_OBJS += $(OUTPUT)ui/gtk/util.o LIB_OBJS += $(OUTPUT)ui/gtk/helpline.o LIB_OBJS += $(OUTPUT)ui/gtk/progress.o - # Make sure that it'd be included only once. 
- ifeq ($(findstring -DNEWT_SUPPORT,$(BASIC_CFLAGS)),) - LIB_OBJS += $(OUTPUT)ui/util.o - endif endif endif @@ -707,7 +716,7 @@ disable-python = $(eval $(disable-python_code)) define disable-python_code BASIC_CFLAGS += -DNO_LIBPYTHON $(if $(1),$(warning No $(1) was found)) - $(warning Python support won't be built) + $(warning Python support will not be built) endef override PYTHON := \ @@ -715,19 +724,10 @@ override PYTHON := \ ifndef PYTHON $(call disable-python,python interpreter) - python-clean := else PYTHON_WORD := $(call shell-wordify,$(PYTHON)) - # python extension build directories - PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ - PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/ - PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/ - export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP - - python-clean := rm -rf $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so - ifdef NO_LIBPYTHON $(call disable-python) else @@ -843,6 +843,9 @@ ifdef ASCIIDOC8 export ASCIIDOC8 endif +endif # MAKECMDGOALS != tags +endif # MAKECMDGOALS != clean + # Shell quote (do not use $(call) to accommodate ancient setups); ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) @@ -884,7 +887,7 @@ strip: $(PROGRAMS) $(OUTPUT)perf $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS - $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ + $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@ @@ -948,7 +951,13 @@ $(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS $(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \ - '-DBINDIR="$(bindir_SQ)"' \ + '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \ + $< + +$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS + $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \ + -DPYTHONPATH='"$(OUTPUT)python"' \ + -DPYTHON='"$(PYTHON_WORD)"' \ $< $(OUTPUT)util/config.o: 
util/config.c $(OUTPUT)PERF-CFLAGS @@ -1099,7 +1108,7 @@ perfexec_instdir = $(prefix)/$(perfexecdir) endif perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) -install: all try-install-man +install-bin: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace' @@ -1120,6 +1129,8 @@ install: all try-install-man $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' +install: install-bin try-install-man + install-python_ext: $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)' diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 8f89998eeaf4..a5223e6a7b43 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -1,6 +1,7 @@ #ifndef BENCH_H #define BENCH_H +extern int bench_numa(int argc, const char **argv, const char *prefix); extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); extern int bench_mem_memcpy(int argc, const char **argv, diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c new file mode 100644 index 000000000000..30d1c3225b46 --- /dev/null +++ b/tools/perf/bench/numa.c @@ -0,0 +1,1731 @@ +/* + * numa.c + * + * numa: Simulate NUMA-sensitive workload and measure their NUMA performance + */ + +#include "../perf.h" +#include "../builtin.h" +#include "../util/util.h" +#include "../util/parse-options.h" + +#include "bench.h" + +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <assert.h> +#include <malloc.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <sys/types.h> + 
+#include <numa.h> +#include <numaif.h> + +/* + * Regular printout to the terminal, supressed if -q is specified: + */ +#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) + +/* + * Debug printf: + */ +#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) + +struct thread_data { + int curr_cpu; + cpu_set_t bind_cpumask; + int bind_node; + u8 *process_data; + int process_nr; + int thread_nr; + int task_nr; + unsigned int loops_done; + u64 val; + u64 runtime_ns; + pthread_mutex_t *process_lock; +}; + +/* Parameters set by options: */ + +struct params { + /* Startup synchronization: */ + bool serialize_startup; + + /* Task hierarchy: */ + int nr_proc; + int nr_threads; + + /* Working set sizes: */ + const char *mb_global_str; + const char *mb_proc_str; + const char *mb_proc_locked_str; + const char *mb_thread_str; + + double mb_global; + double mb_proc; + double mb_proc_locked; + double mb_thread; + + /* Access patterns to the working set: */ + bool data_reads; + bool data_writes; + bool data_backwards; + bool data_zero_memset; + bool data_rand_walk; + u32 nr_loops; + u32 nr_secs; + u32 sleep_usecs; + + /* Working set initialization: */ + bool init_zero; + bool init_random; + bool init_cpu0; + + /* Misc options: */ + int show_details; + int run_all; + int thp; + + long bytes_global; + long bytes_process; + long bytes_process_locked; + long bytes_thread; + + int nr_tasks; + bool show_quiet; + + bool show_convergence; + bool measure_convergence; + + int perturb_secs; + int nr_cpus; + int nr_nodes; + + /* Affinity options -C and -N: */ + char *cpu_list_str; + char *node_list_str; +}; + + +/* Global, read-writable area, accessible to all processes and threads: */ + +struct global_info { + u8 *data; + + pthread_mutex_t startup_mutex; + int nr_tasks_started; + + pthread_mutex_t startup_done_mutex; + + pthread_mutex_t start_work_mutex; + int nr_tasks_working; + + pthread_mutex_t stop_work_mutex; + u64 bytes_done; + 
+ struct thread_data *threads; + + /* Convergence latency measurement: */ + bool all_converged; + bool stop_work; + + int print_once; + + struct params p; +}; + +static struct global_info *g = NULL; + +static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); +static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); + +struct params p0; + +static const struct option options[] = { + OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"), + OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"), + + OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"), + OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"), + OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), + OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), + + OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"), + OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"), + OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), + + OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), + OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), + OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), + OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), + OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"), + + + OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"), + OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"), + OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"), + 
OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"), + + OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), + OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), + OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), + OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), + OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), + OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"), + OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), + + /* Special option string parsing callbacks: */ + OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", + "bind the first N tasks to these specific cpus (the rest is unbound)", + parse_cpus_opt), + OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", + "bind the first N tasks to these specific memory nodes (the rest is unbound)", + parse_nodes_opt), + OPT_END() +}; + +static const char * const bench_numa_usage[] = { + "perf bench numa <options>", + NULL +}; + +static const char * const numa_usage[] = { + "perf bench numa mem [<options>]", + NULL +}; + +static cpu_set_t bind_to_cpu(int target_cpu) +{ + cpu_set_t orig_mask, mask; + int ret; + + ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_cpu == -1) { + int cpu; + + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); + CPU_SET(target_cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static cpu_set_t bind_to_node(int target_node) +{ + int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; + cpu_set_t orig_mask, mask; + int cpu; + int ret; + + BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); + BUG_ON(!cpus_per_node); + + ret = 
sched_getaffinity(0, sizeof(orig_mask), &orig_mask); + BUG_ON(ret); + + CPU_ZERO(&mask); + + if (target_node == -1) { + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &mask); + } else { + int cpu_start = (target_node + 0) * cpus_per_node; + int cpu_stop = (target_node + 1) * cpus_per_node; + + BUG_ON(cpu_stop > g->p.nr_cpus); + + for (cpu = cpu_start; cpu < cpu_stop; cpu++) + CPU_SET(cpu, &mask); + } + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); + + return orig_mask; +} + +static void bind_to_cpumask(cpu_set_t mask) +{ + int ret; + + ret = sched_setaffinity(0, sizeof(mask), &mask); + BUG_ON(ret); +} + +static void mempol_restore(void) +{ + int ret; + + ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); + + BUG_ON(ret); +} + +static void bind_to_memnode(int node) +{ + unsigned long nodemask; + int ret; + + if (node == -1) + return; + + BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); + nodemask = 1L << node; + + ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); + dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + + BUG_ON(ret); +} + +#define HPSIZE (2*1024*1024) + +#define set_taskname(fmt...) 
\ +do { \ + char name[20]; \ + \ + snprintf(name, 20, fmt); \ + prctl(PR_SET_NAME, name); \ +} while (0) + +static u8 *alloc_data(ssize_t bytes0, int map_flags, + int init_zero, int init_cpu0, int thp, int init_random) +{ + cpu_set_t orig_mask; + ssize_t bytes; + u8 *buf; + int ret; + + if (!bytes0) + return NULL; + + /* Allocate and initialize all memory on CPU#0: */ + if (init_cpu0) { + orig_mask = bind_to_node(0); + bind_to_memnode(0); + } + + bytes = bytes0 + HPSIZE; + + buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); + BUG_ON(buf == (void *)-1); + + if (map_flags == MAP_PRIVATE) { + if (thp > 0) { + ret = madvise(buf, bytes, MADV_HUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); + } + } + if (thp < 0) { + ret = madvise(buf, bytes, MADV_NOHUGEPAGE); + if (ret && !g->print_once) { + g->print_once = 1; + printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n"); + } + } + } + + if (init_zero) { + bzero(buf, bytes); + } else { + /* Initialize random contents, different in each word: */ + if (init_random) { + u64 *wbuf = (void *)buf; + long off = rand(); + long i; + + for (i = 0; i < bytes/8; i++) + wbuf[i] = i + off; + } + } + + /* Align to 2MB boundary: */ + buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); + + /* Restore affinity: */ + if (init_cpu0) { + bind_to_cpumask(orig_mask); + mempol_restore(); + } + + return buf; +} + +static void free_data(void *data, ssize_t bytes) +{ + int ret; + + if (!data) + return; + + ret = munmap(data, bytes); + BUG_ON(ret); +} + +/* + * Create a shared memory buffer that can be shared between processes, zeroed: + */ +static void * zalloc_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Create a shared memory buffer that can be shared between processes: + 
*/ +static void * setup_shared_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Allocate process-local memory - this will either be shared between + * threads of this process, or only be accessed by this thread: + */ +static void * setup_private_data(ssize_t bytes) +{ + return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); +} + +/* + * Return a process-shared (global) mutex: + */ +static void init_global_mutex(pthread_mutex_t *mutex) +{ + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(mutex, &attr); +} + +static int parse_cpu_list(const char *arg) +{ + p0.cpu_list_str = strdup(arg); + + dprintf("got CPU list: {%s}\n", p0.cpu_list_str); + + return 0; +} + +static void parse_setup_cpu_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.cpu_list_str) + return; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.cpu_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to CPUs:\n"); + tprintf("# "); + + while (true) { + int bind_cpu, bind_cpu_0, bind_cpu_1; + char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; + int bind_len; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single CPU specified: */ + bind_cpu_0 = bind_cpu_1 = atol(tok); + } else { + /* CPU range specified (for example: "5-11"): */ + bind_cpu_0 = atol(tok); + bind_cpu_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_cpus); + } + + /* + * Mask length. + * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', + * where the _4 means the next 4 CPUs are allowed. 
+ */ + bind_len = 1; + tok_len = strstr(tok, "_"); + if (tok_len) { + bind_len = atol(tok_len + 1); + BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); + + BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus); + BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus); + BUG_ON(bind_cpu_0 > bind_cpu_1); + + for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { + int i; + + for (i = 0; i < mul; i++) { + int cpu; + + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu); + goto out; + } + td = g->threads + t; + + if (t) + tprintf(","); + if (bind_len > 1) { + tprintf("%2d/%d", bind_cpu, bind_len); + } else { + tprintf("%2d", bind_cpu); + } + + CPU_ZERO(&td->bind_cpumask); + for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { + BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); + CPU_SET(cpu, &td->bind_cpumask); + } + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); +} + +static int parse_cpus_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_cpu_list(arg); +} + +static int parse_node_list(const char *arg) +{ + p0.node_list_str = strdup(arg); + + dprintf("got NODE list: {%s}\n", p0.node_list_str); + + return 0; +} + +static void parse_setup_node_list(void) +{ + struct thread_data *td; + char *str0, *str; + int t; + + if (!g->p.node_list_str) + return; + + dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + + str0 = str = strdup(g->p.node_list_str); + t = 0; + + BUG_ON(!str); + + tprintf("# binding tasks to NODEs:\n"); + tprintf("# "); + + while (true) { + 
int bind_node, bind_node_0, bind_node_1; + char *tok, *tok_end, *tok_step, *tok_mul; + int step; + int mul; + + tok = strsep(&str, ","); + if (!tok) + break; + + tok_end = strstr(tok, "-"); + + dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); + if (!tok_end) { + /* Single NODE specified: */ + bind_node_0 = bind_node_1 = atol(tok); + } else { + /* NODE range specified (for example: "5-11"): */ + bind_node_0 = atol(tok); + bind_node_1 = atol(tok_end + 1); + } + + step = 1; + tok_step = strstr(tok, "#"); + if (tok_step) { + step = atol(tok_step + 1); + BUG_ON(step <= 0 || step >= g->p.nr_nodes); + } + + /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ + mul = 1; + tok_mul = strstr(tok, "x"); + if (tok_mul) { + mul = atol(tok_mul + 1); + BUG_ON(mul <= 0); + } + + dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); + + BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes); + BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes); + BUG_ON(bind_node_0 > bind_node_1); + + for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { + int i; + + for (i = 0; i < mul; i++) { + if (t >= g->p.nr_tasks) { + printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node); + goto out; + } + td = g->threads + t; + + if (!t) + tprintf(" %2d", bind_node); + else + tprintf(",%2d", bind_node); + + td->bind_node = bind_node; + t++; + } + } + } +out: + + tprintf("\n"); + + if (t < g->p.nr_tasks) + printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); + + free(str0); +} + +static int parse_nodes_opt(const struct option *opt __maybe_unused, + const char *arg, int unset __maybe_unused) +{ + if (!arg) + return -1; + + return parse_node_list(arg); + + return 0; +} + +#define BIT(x) (1ul << x) + +static inline uint32_t lfsr_32(uint32_t lfsr) +{ + const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31); + return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps); +} + +/* + * Make sure there's real data 
dependency to RAM (when read + * accesses are enabled), so the compiler, the CPU and the + * kernel (KSM, zero page, etc.) cannot optimize away RAM + * accesses: + */ +static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) +{ + if (g->p.data_reads) + val += *data; + if (g->p.data_writes) + *data = val + 1; + return val; +} + +/* + * The worker process does two types of work, a forwards going + * loop and a backwards going loop. + * + * We do this so that on multiprocessor systems we do not create + * a 'train' of processing, with highly synchronized processes, + * skewing the whole benchmark. + */ +static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val) +{ + long words = bytes/sizeof(u64); + u64 *data = (void *)__data; + long chunk_0, chunk_1; + u64 *d0, *d, *d1; + long off; + long i; + + BUG_ON(!data && words); + BUG_ON(data && !words); + + if (!data) + return val; + + /* Very simple memset() work variant: */ + if (g->p.data_zero_memset && !g->p.data_rand_walk) { + bzero(data, bytes); + return val; + } + + /* Spread out by PID/TID nr and by loop nr: */ + chunk_0 = words/nr_max; + chunk_1 = words/g->p.nr_loops; + off = nr*chunk_0 + loop*chunk_1; + + while (off >= words) + off -= words; + + if (g->p.data_rand_walk) { + u32 lfsr = nr + loop + val; + int j; + + for (i = 0; i < words/1024; i++) { + long start, end; + + lfsr = lfsr_32(lfsr); + + start = lfsr % words; + end = min(start + 1024, words-1); + + if (g->p.data_zero_memset) { + bzero(data + start, (end-start) * sizeof(u64)); + } else { + for (j = start; j < end; j++) + val = access_data(data + j, val); + } + } + } else if (!g->p.data_backwards || (nr + loop) & 1) { + + d0 = data + off; + d = data + off + 1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d >= d1)) + d = data; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d++; + } + } else { + /* Process data backwards: */ + + d0 = data + off; + d = data + off - 
1; + d1 = data + words; + + /* Process data forwards: */ + for (;;) { + if (unlikely(d < data)) + d = data + words-1; + if (unlikely(d == d0)) + break; + + val = access_data(d, val); + + d--; + } + } + + return val; +} + +static void update_curr_cpu(int task_nr, unsigned long bytes_worked) +{ + unsigned int cpu; + + cpu = sched_getcpu(); + + g->threads[task_nr].curr_cpu = cpu; + prctl(0, bytes_worked); +} + +#define MAX_NR_NODES 64 + +/* + * Count the number of nodes a process's threads + * are spread out on. + * + * A count of 1 means that the process is compressed + * to a single node. A count of g->p.nr_nodes means it's + * spread out on the whole system. + */ +static int count_process_nodes(int process_nr) +{ + char node_present[MAX_NR_NODES] = { 0, }; + int nodes; + int n, t; + + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int node; + + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + node = numa_node_of_cpu(td->curr_cpu); + node_present[node] = 1; + } + + nodes = 0; + + for (n = 0; n < MAX_NR_NODES; n++) + nodes += node_present[n]; + + return nodes; +} + +/* + * Count the number of distinct process-threads a node contains. + * + * A count of 1 means that the node contains only a single + * process. If all nodes on the system contain at most one + * process then we are well-converged. 
+ */ +static int count_node_processes(int node) +{ + int processes = 0; + int t, p; + + for (p = 0; p < g->p.nr_proc; p++) { + for (t = 0; t < g->p.nr_threads; t++) { + struct thread_data *td; + int task_nr; + int n; + + task_nr = p*g->p.nr_threads + t; + td = g->threads + task_nr; + + n = numa_node_of_cpu(td->curr_cpu); + if (n == node) { + processes++; + break; + } + } + } + + return processes; +} + +static void calc_convergence_compression(int *strong) +{ + unsigned int nodes_min, nodes_max; + int p; + + nodes_min = -1; + nodes_max = 0; + + for (p = 0; p < g->p.nr_proc; p++) { + unsigned int nodes = count_process_nodes(p); + + nodes_min = min(nodes, nodes_min); + nodes_max = max(nodes, nodes_max); + } + + /* Strong convergence: all threads compress on a single node: */ + if (nodes_min == 1 && nodes_max == 1) { + *strong = 1; + } else { + *strong = 0; + tprintf(" {%d-%d}", nodes_min, nodes_max); + } +} + +static void calc_convergence(double runtime_ns_max, double *convergence) +{ + unsigned int loops_done_min, loops_done_max; + int process_groups; + int nodes[MAX_NR_NODES]; + int distance; + int nr_min; + int nr_max; + int strong; + int sum; + int nr; + int node; + int cpu; + int t; + + if (!g->p.show_convergence && !g->p.measure_convergence) + return; + + for (node = 0; node < g->p.nr_nodes; node++) + nodes[node] = 0; + + loops_done_min = -1; + loops_done_max = 0; + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + unsigned int loops_done; + + cpu = td->curr_cpu; + + /* Not all threads have written it yet: */ + if (cpu < 0) + continue; + + node = numa_node_of_cpu(cpu); + + nodes[node]++; + + loops_done = td->loops_done; + loops_done_min = min(loops_done, loops_done_min); + loops_done_max = max(loops_done, loops_done_max); + } + + nr_max = 0; + nr_min = g->p.nr_tasks; + sum = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + nr = nodes[node]; + nr_min = min(nr, nr_min); + nr_max = max(nr, nr_max); + sum += nr; + } + 
BUG_ON(nr_min > nr_max); + + BUG_ON(sum > g->p.nr_tasks); + + if (0 && (sum < g->p.nr_tasks)) + return; + + /* + * Count the number of distinct process groups present + * on nodes - when we are converged this will decrease + * to g->p.nr_proc: + */ + process_groups = 0; + + for (node = 0; node < g->p.nr_nodes; node++) { + int processes = count_node_processes(node); + + nr = nodes[node]; + tprintf(" %2d/%-2d", nr, processes); + + process_groups += processes; + } + + distance = nr_max - nr_min; + + tprintf(" [%2d/%-2d]", distance, process_groups); + + tprintf(" l:%3d-%-3d (%3d)", + loops_done_min, loops_done_max, loops_done_max-loops_done_min); + + if (loops_done_min && loops_done_max) { + double skew = 1.0 - (double)loops_done_min/loops_done_max; + + tprintf(" [%4.1f%%]", skew * 100.0); + } + + calc_convergence_compression(&strong); + + if (strong && process_groups == g->p.nr_proc) { + if (!*convergence) { + *convergence = runtime_ns_max; + tprintf(" (%6.1fs converged)\n", *convergence/1e9); + if (g->p.measure_convergence) { + g->all_converged = true; + g->stop_work = true; + } + } + } else { + if (*convergence) { + tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9); + *convergence = 0; + } + tprintf("\n"); + } +} + +static void show_summary(double runtime_ns_max, int l, double *convergence) +{ + tprintf("\r # %5.1f%% [%.1f mins]", + (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0); + + calc_convergence(runtime_ns_max, convergence); + + if (g->p.show_details >= 0) + fflush(stdout); +} + +static void *worker_thread(void *__tdata) +{ + struct thread_data *td = __tdata; + struct timeval start0, start, stop, diff; + int process_nr = td->process_nr; + int thread_nr = td->thread_nr; + unsigned long last_perturbance; + int task_nr = td->task_nr; + int details = g->p.show_details; + int first_task, last_task; + double convergence = 0; + u64 val = td->val; + double runtime_ns_max; + u8 *global_data; + u8 *process_data; + u8 *thread_data; + u64 bytes_done; + 
long work_done; + u32 l; + + bind_to_cpumask(td->bind_cpumask); + bind_to_memnode(td->bind_node); + + set_taskname("thread %d/%d", process_nr, thread_nr); + + global_data = g->data; + process_data = td->process_data; + thread_data = setup_private_data(g->p.bytes_thread); + + bytes_done = 0; + + last_task = 0; + if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) + last_task = 1; + + first_task = 0; + if (process_nr == 0 && thread_nr == 0) + first_task = 1; + + if (details >= 2) { + printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n", + process_nr, thread_nr, global_data, process_data, thread_data); + } + + if (g->p.serialize_startup) { + pthread_mutex_lock(&g->startup_mutex); + g->nr_tasks_started++; + pthread_mutex_unlock(&g->startup_mutex); + + /* Here we will wait for the main process to start us all at once: */ + pthread_mutex_lock(&g->start_work_mutex); + g->nr_tasks_working++; + + /* Last one wake the main process: */ + if (g->nr_tasks_working == g->p.nr_tasks) + pthread_mutex_unlock(&g->startup_done_mutex); + + pthread_mutex_unlock(&g->start_work_mutex); + } + + gettimeofday(&start0, NULL); + + start = stop = start0; + last_perturbance = start.tv_sec; + + for (l = 0; l < g->p.nr_loops; l++) { + start = stop; + + if (g->stop_work) + break; + + val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val); + val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val); + val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); + + if (g->p.sleep_usecs) { + pthread_mutex_lock(td->process_lock); + usleep(g->p.sleep_usecs); + pthread_mutex_unlock(td->process_lock); + } + /* + * Amount of work to be done under a process-global lock: + */ + if (g->p.bytes_process_locked) { + pthread_mutex_lock(td->process_lock); + val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); + pthread_mutex_unlock(td->process_lock); + } + + work_done = 
g->p.bytes_global + g->p.bytes_process + + g->p.bytes_process_locked + g->p.bytes_thread; + + update_curr_cpu(task_nr, work_done); + bytes_done += work_done; + + if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) + continue; + + td->loops_done = l; + + gettimeofday(&stop, NULL); + + /* Check whether our max runtime timed out: */ + if (g->p.nr_secs) { + timersub(&stop, &start0, &diff); + if (diff.tv_sec >= g->p.nr_secs) { + g->stop_work = true; + break; + } + } + + /* Update the summary at most once per second: */ + if (start.tv_sec == stop.tv_sec) + continue; + + /* + * Perturb the first task's equilibrium every g->p.perturb_secs seconds, + * by migrating to CPU#0: + */ + if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { + cpu_set_t orig_mask; + int target_cpu; + int this_cpu; + + last_perturbance = stop.tv_sec; + + /* + * Depending on where we are running, move into + * the other half of the system, to create some + * real disturbance: + */ + this_cpu = g->threads[task_nr].curr_cpu; + if (this_cpu < g->p.nr_cpus/2) + target_cpu = g->p.nr_cpus-1; + else + target_cpu = 0; + + orig_mask = bind_to_cpu(target_cpu); + + /* Here we are running on the target CPU already */ + if (details >= 1) + printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); + + bind_to_cpumask(orig_mask); + } + + if (details >= 3) { + timersub(&stop, &start, &diff); + runtime_ns_max = diff.tv_sec * 1000000000; + runtime_ns_max += diff.tv_usec * 1000; + + if (details >= 0) { + printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n", + process_nr, thread_nr, runtime_ns_max / bytes_done, val); + } + fflush(stdout); + } + if (!last_task) + continue; + + timersub(&stop, &start0, &diff); + runtime_ns_max = diff.tv_sec * 1000000000ULL; + runtime_ns_max += diff.tv_usec * 1000ULL; + + show_summary(runtime_ns_max, l, &convergence); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start0, &diff); + 
td->runtime_ns = diff.tv_sec * 1000000000ULL; + td->runtime_ns += diff.tv_usec * 1000ULL; + + free_data(thread_data, g->p.bytes_thread); + + pthread_mutex_lock(&g->stop_work_mutex); + g->bytes_done += bytes_done; + pthread_mutex_unlock(&g->stop_work_mutex); + + return NULL; +} + +/* + * A worker process starts a couple of threads: + */ +static void worker_process(int process_nr) +{ + pthread_mutex_t process_lock; + struct thread_data *td; + pthread_t *pthreads; + u8 *process_data; + int task_nr; + int ret; + int t; + + pthread_mutex_init(&process_lock, NULL); + set_taskname("process %d", process_nr); + + /* + * Pick up the memory policy and the CPU binding of our first thread, + * so that we initialize memory accordingly: + */ + task_nr = process_nr*g->p.nr_threads; + td = g->threads + task_nr; + + bind_to_memnode(td->bind_node); + bind_to_cpumask(td->bind_cpumask); + + pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); + process_data = setup_private_data(g->p.bytes_process); + + if (g->p.show_details >= 3) { + printf(" # process %2d global mem: %p, process mem: %p\n", + process_nr, g->data, process_data); + } + + for (t = 0; t < g->p.nr_threads; t++) { + task_nr = process_nr*g->p.nr_threads + t; + td = g->threads + task_nr; + + td->process_data = process_data; + td->process_nr = process_nr; + td->thread_nr = t; + td->task_nr = task_nr; + td->val = rand(); + td->curr_cpu = -1; + td->process_lock = &process_lock; + + ret = pthread_create(pthreads + t, NULL, worker_thread, td); + BUG_ON(ret); + } + + for (t = 0; t < g->p.nr_threads; t++) { + ret = pthread_join(pthreads[t], NULL); + BUG_ON(ret); + } + + free_data(process_data, g->p.bytes_process); + free(pthreads); +} + +static void print_summary(void) +{ + if (g->p.show_details < 0) + return; + + printf("\n ###\n"); + printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", + g->p.nr_tasks, g->p.nr_tasks == 1 ? 
"task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); + printf(" # %5dx %5ldMB global shared mem operations\n", + g->p.nr_loops, g->p.bytes_global/1024/1024); + printf(" # %5dx %5ldMB process shared mem operations\n", + g->p.nr_loops, g->p.bytes_process/1024/1024); + printf(" # %5dx %5ldMB thread local mem operations\n", + g->p.nr_loops, g->p.bytes_thread/1024/1024); + + printf(" ###\n"); + + printf("\n ###\n"); fflush(stdout); +} + +static void init_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + int t; + + g->threads = zalloc_shared_data(size); + + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + int cpu; + + /* Allow all nodes by default: */ + td->bind_node = -1; + + /* Allow all CPUs by default: */ + CPU_ZERO(&td->bind_cpumask); + for (cpu = 0; cpu < g->p.nr_cpus; cpu++) + CPU_SET(cpu, &td->bind_cpumask); + } +} + +static void deinit_thread_data(void) +{ + ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + + free_data(g->threads, size); +} + +static int init(void) +{ + g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); + + /* Copy over options: */ + g->p = p0; + + g->p.nr_cpus = numa_num_configured_cpus(); + + g->p.nr_nodes = numa_max_node() + 1; + + /* char array in count_process_nodes(): */ + BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + + if (g->p.show_quiet && !g->p.show_details) + g->p.show_details = -1; + + /* Some memory should be specified: */ + if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) + return -1; + + if (g->p.mb_global_str) { + g->p.mb_global = atof(g->p.mb_global_str); + BUG_ON(g->p.mb_global < 0); + } + + if (g->p.mb_proc_str) { + g->p.mb_proc = atof(g->p.mb_proc_str); + BUG_ON(g->p.mb_proc < 0); + } + + if (g->p.mb_proc_locked_str) { + g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); + BUG_ON(g->p.mb_proc_locked < 0); + BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); + } + + if (g->p.mb_thread_str) { + g->p.mb_thread = 
atof(g->p.mb_thread_str); + BUG_ON(g->p.mb_thread < 0); + } + + BUG_ON(g->p.nr_threads <= 0); + BUG_ON(g->p.nr_proc <= 0); + + g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; + + g->p.bytes_global = g->p.mb_global *1024L*1024L; + g->p.bytes_process = g->p.mb_proc *1024L*1024L; + g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; + g->p.bytes_thread = g->p.mb_thread *1024L*1024L; + + g->data = setup_shared_data(g->p.bytes_global); + + /* Startup serialization: */ + init_global_mutex(&g->start_work_mutex); + init_global_mutex(&g->startup_mutex); + init_global_mutex(&g->startup_done_mutex); + init_global_mutex(&g->stop_work_mutex); + + init_thread_data(); + + tprintf("#\n"); + parse_setup_cpu_list(); + parse_setup_node_list(); + tprintf("#\n"); + + print_summary(); + + return 0; +} + +static void deinit(void) +{ + free_data(g->data, g->p.bytes_global); + g->data = NULL; + + deinit_thread_data(); + + free_data(g, sizeof(*g)); + g = NULL; +} + +/* + * Print a short or long result, depending on the verbosity setting: + */ +static void print_res(const char *name, double val, + const char *txt_unit, const char *txt_short, const char *txt_long) +{ + if (!name) + name = "main,"; + + if (g->p.show_quiet) + printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); + else + printf(" %14.3f %s\n", val, txt_long); +} + +static int __bench_numa(const char *name) +{ + struct timeval start, stop, diff; + u64 runtime_ns_min, runtime_ns_sum; + pid_t *pids, pid, wpid; + double delta_runtime; + double runtime_avg; + double runtime_sec_max; + double runtime_sec_min; + int wait_stat; + double bytes; + int i, t; + + if (init()) + return -1; + + pids = zalloc(g->p.nr_proc * sizeof(*pids)); + pid = -1; + + /* All threads try to acquire it, this way we can wait for them to start up: */ + pthread_mutex_lock(&g->start_work_mutex); + + if (g->p.serialize_startup) { + tprintf(" #\n"); + tprintf(" # Startup synchronization: ..."); fflush(stdout); + } + + gettimeofday(&start, 
NULL); + + for (i = 0; i < g->p.nr_proc; i++) { + pid = fork(); + dprintf(" # process %2d: PID %d\n", i, pid); + + BUG_ON(pid < 0); + if (!pid) { + /* Child process: */ + worker_process(i); + + exit(0); + } + pids[i] = pid; + + } + /* Wait for all the threads to start up: */ + while (g->nr_tasks_started != g->p.nr_tasks) + usleep(1000); + + BUG_ON(g->nr_tasks_started != g->p.nr_tasks); + + if (g->p.serialize_startup) { + double startup_sec; + + pthread_mutex_lock(&g->startup_done_mutex); + + /* This will start all threads: */ + pthread_mutex_unlock(&g->start_work_mutex); + + /* This mutex is locked - the last started thread will wake us: */ + pthread_mutex_lock(&g->startup_done_mutex); + + gettimeofday(&stop, NULL); + + timersub(&stop, &start, &diff); + + startup_sec = diff.tv_sec * 1000000000.0; + startup_sec += diff.tv_usec * 1000.0; + startup_sec /= 1e9; + + tprintf(" threads initialized in %.6f seconds.\n", startup_sec); + tprintf(" #\n"); + + start = stop; + pthread_mutex_unlock(&g->startup_done_mutex); + } else { + gettimeofday(&start, NULL); + } + + /* Parent process: */ + + + for (i = 0; i < g->p.nr_proc; i++) { + wpid = waitpid(pids[i], &wait_stat, 0); + BUG_ON(wpid < 0); + BUG_ON(!WIFEXITED(wait_stat)); + + } + + runtime_ns_sum = 0; + runtime_ns_min = -1LL; + + for (t = 0; t < g->p.nr_tasks; t++) { + u64 thread_runtime_ns = g->threads[t].runtime_ns; + + runtime_ns_sum += thread_runtime_ns; + runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); + + tprintf("\n ###\n"); + tprintf("\n"); + + runtime_sec_max = diff.tv_sec * 1000000000.0; + runtime_sec_max += diff.tv_usec * 1000.0; + runtime_sec_max /= 1e9; + + runtime_sec_min = runtime_ns_min/1e9; + + bytes = g->bytes_done; + runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; + + if (g->p.measure_convergence) { + print_res(name, runtime_sec_max, + "secs,", 
"NUMA-convergence-latency", "secs latency to NUMA-converge"); + } + + print_res(name, runtime_sec_max, + "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime"); + + print_res(name, runtime_sec_min, + "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime"); + + print_res(name, runtime_avg, + "secs,", "runtime-avg/thread", "secs average thread-runtime"); + + delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; + print_res(name, delta_runtime / runtime_sec_max * 100.0, + "%,", "spread-runtime/thread", "% difference between max/avg runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9, + "GB,", "data/thread", "GB data processed, per thread"); + + print_res(name, bytes / 1e9, + "GB,", "data-total", "GB data processed, total"); + + print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), + "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); + + print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, + "GB/sec,", "thread-speed", "GB/sec/thread speed"); + + print_res(name, bytes / runtime_sec_max / 1e9, + "GB/sec,", "total-speed", "GB/sec total speed"); + + free(pids); + + deinit(); + + return 0; +} + +#define MAX_ARGS 50 + +static int command_size(const char **argv) +{ + int size = 0; + + while (*argv) { + size++; + argv++; + } + + BUG_ON(size >= MAX_ARGS); + + return size; +} + +static void init_params(struct params *p, const char *name, int argc, const char **argv) +{ + int i; + + printf("\n # Running %s \"perf bench numa", name); + + for (i = 0; i < argc; i++) + printf(" %s", argv[i]); + + printf("\"\n"); + + memset(p, 0, sizeof(*p)); + + /* Initialize nonzero defaults: */ + + p->serialize_startup = 1; + p->data_reads = true; + p->data_writes = true; + p->data_backwards = true; + p->data_rand_walk = true; + p->nr_loops = -1; + p->init_random = true; +} + +static int run_bench_numa(const char *name, const char **argv) +{ + int argc = command_size(argv); + + init_params(&p0, name, argc, argv); + argc = 
parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (__bench_numa(name)) + goto err; + + return 0; + +err: + usage_with_options(numa_usage, options); + return -1; +} + +#define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk" +#define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1" + +#define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1" +#define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1" + +#define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1" +#define OPT_BW_NOTHP OPT_BW, "--thp", "-1" + +/* + * The built-in test-suite executed by "perf bench numa -a". + * + * (A minimum of 4 nodes and 16 GB of RAM is recommended.) + */ +static const char *tests[][MAX_ARGS] = { + /* Basic single-stream NUMA bandwidth measurements: */ + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM }, + { "RAM-bw-local-NOTHP,", + "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + "-C" , "0", "-M", "1", OPT_BW_RAM }, + + /* 2-stream NUMA bandwidth measurements: */ + { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "0x2", OPT_BW_RAM }, + { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,2", "-M", "1x2", OPT_BW_RAM }, + + /* Cross-stream NUMA bandwidth measurement: */ + { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024", + "-C", "0,8", "-M", "1,0", OPT_BW_RAM }, + + /* Convergence latency measurements: */ + { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, + { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, + { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 4x4-convergence,", "mem", "-p", 
"4", "-t", "4", "-P", "512", OPT_CONV }, + { " 4x4-convergence-NOTHP,", + "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV }, + { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV }, + { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV }, + { " 8x4-convergence-NOTHP,", + "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP }, + { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV }, + { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV }, + { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV }, + { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV }, + { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV }, + + /* Various NUMA process/thread layout bandwidth measurements: */ + { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW }, + { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW }, + { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW }, + { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW }, + { " 8x1-bw-process-NOTHP,", + "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, + { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, + + { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + + { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", 
"512", OPT_BW }, + { " 4x8-bw-thread-NOTHP,", + "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, + { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + + { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, + { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, + { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, + { "numa01-bw-thread-NOTHP,", + "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP }, +}; + +static int bench_all(void) +{ + int nr = ARRAY_SIZE(tests); + int ret; + int i; + + ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); + BUG_ON(ret < 0); + + for (i = 0; i < nr; i++) { + if (run_bench_numa(tests[i][0], tests[i] + 1)) + return -1; + } + + printf("\n"); + + return 0; +} + +int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) +{ + init_params(&p0, "main,", argc, argv); + argc = parse_options(argc, argv, options, bench_numa_usage, 0); + if (argc) + goto err; + + if (p0.run_all) + return bench_all(); + + if (__bench_numa(NULL)) + goto err; + + return 0; + +err: + usage_with_options(numa_usage, options); + return -1; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index cae9a5fd2ecf..e5d514bf5365 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -35,6 +35,16 @@ struct bench_suite { /* sentinel: easy for help */ #define suite_all { "all", "Test all benchmark suites", NULL } +static struct bench_suite numa_suites[] = { + { "mem", + "Benchmark for NUMA workloads", + bench_numa }, + suite_all, + { NULL, + NULL, + NULL } +}; + static struct bench_suite sched_suites[] = { { "messaging", "Benchmark for scheduler and IPC mechanisms", 
@@ -68,6 +78,9 @@ struct bench_subsys { }; static struct bench_subsys subsystems[] = { + { "numa", + "NUMA scheduling and MM behavior", + numa_suites }, { "sched", "scheduler and IPC mechanism", sched_suites }, @@ -159,6 +172,7 @@ static void all_suite(struct bench_subsys *subsys) /* FROM HERE */ printf("# Running %s/%s benchmark...\n", subsys->name, suites[i].name); + fflush(stdout); argv[1] = suites[i].name; suites[i].fn(1, argv, NULL); @@ -225,6 +239,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused) printf("# Running %s/%s benchmark...\n", subsystems[i].name, subsystems[i].suites[j].name); + fflush(stdout); status = subsystems[i].suites[j].fn(argc - 1, argv + 1, prefix); goto end; diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index fae8b250b2ca..a336014e0286 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -14,6 +14,7 @@ #include "util/parse-options.h" #include "util/strlist.h" #include "util/build-id.h" +#include "util/session.h" #include "util/symbol.h" static int build_id_cache__add_file(const char *filename, const char *debugdir) @@ -58,19 +59,59 @@ static int build_id_cache__remove_file(const char *filename, return err; } +static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused) +{ + char filename[PATH_MAX]; + u8 build_id[BUILD_ID_SIZE]; + + if (dso__build_id_filename(dso, filename, sizeof(filename)) && + filename__read_build_id(filename, build_id, + sizeof(build_id)) != sizeof(build_id)) { + if (errno == ENOENT) + return false; + + pr_warning("Problems with %s file, consider removing it from the cache\n", + filename); + } else if (memcmp(dso->build_id, build_id, sizeof(dso->build_id))) { + pr_warning("Problems with %s file, consider removing it from the cache\n", + filename); + } + + return true; +} + +static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp) +{ + struct perf_session 
*session = perf_session__new(filename, O_RDONLY, + force, false, NULL); + if (session == NULL) + return -1; + + perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0); + perf_session__delete(session); + + return 0; +} + int cmd_buildid_cache(int argc, const char **argv, const char *prefix __maybe_unused) { struct strlist *list; struct str_node *pos; + int ret = 0; + bool force = false; char debugdir[PATH_MAX]; char const *add_name_list_str = NULL, - *remove_name_list_str = NULL; + *remove_name_list_str = NULL, + *missing_filename = NULL; const struct option buildid_cache_options[] = { OPT_STRING('a', "add", &add_name_list_str, "file list", "file(s) to add"), OPT_STRING('r', "remove", &remove_name_list_str, "file list", "file(s) to remove"), + OPT_STRING('M', "missing", &missing_filename, "file", + "to find missing build ids in the cache"), + OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_END() }; @@ -125,5 +166,8 @@ int cmd_buildid_cache(int argc, const char **argv, } } - return 0; + if (missing_filename) + ret = build_id_cache__fprintf_missing(missing_filename, force, stdout); + + return ret; } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index a82d99fec83e..e74366a13218 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -44,23 +44,26 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) return fprintf(fp, "%s\n", sbuild_id); } +static bool dso__skip_buildid(struct dso *dso, int with_hits) +{ + return with_hits && !dso->hit; +} + static int perf_session__list_build_ids(bool force, bool with_hits) { struct perf_session *session; symbol__elf_init(); - - session = perf_session__new(input_name, O_RDONLY, force, false, - &build_id__mark_dso_hit_ops); - if (session == NULL) - return -1; - /* * See if this is an ELF file first: */ - if (filename__fprintf_build_id(session->filename, 
stdout)) + if (filename__fprintf_build_id(input_name, stdout)) goto out; + session = perf_session__new(input_name, O_RDONLY, force, false, + &build_id__mark_dso_hit_ops); + if (session == NULL) + return -1; /* * in pipe-mode, the only way to get the buildids is to parse * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID @@ -68,9 +71,9 @@ static int perf_session__list_build_ids(bool force, bool with_hits) if (with_hits || session->fd_pipe) perf_session__process_events(session, &build_id__mark_dso_hit_ops); - perf_session__fprintf_dsos_buildid(session, stdout, with_hits); -out: + perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits); perf_session__delete(session); +out: return 0; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 93b852f8a5d5..4af0b580b046 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -23,7 +23,6 @@ static char const *input_old = "perf.data.old", *input_new = "perf.data"; static char diff__default_sort_order[] = "dso,symbol"; static bool force; -static bool show_displacement; static bool show_period; static bool show_formula; static bool show_baseline_only; @@ -146,58 +145,47 @@ static int setup_compute(const struct option *opt, const char *str, return -EINVAL; } -static double get_period_percent(struct hist_entry *he, u64 period) +double perf_diff__period_percent(struct hist_entry *he, u64 period) { u64 total = he->hists->stats.total_period; return (period * 100.0) / total; } -double perf_diff__compute_delta(struct hist_entry *he) +double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); - double new_percent = get_period_percent(he, he->stat.period); - double old_percent = pair ? 
get_period_percent(pair, pair->stat.period) : 0.0; + double new_percent = perf_diff__period_percent(he, he->stat.period); + double old_percent = perf_diff__period_percent(pair, pair->stat.period); he->diff.period_ratio_delta = new_percent - old_percent; he->diff.computed = true; return he->diff.period_ratio_delta; } -double perf_diff__compute_ratio(struct hist_entry *he) +double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); double new_period = he->stat.period; - double old_period = pair ? pair->stat.period : 0; + double old_period = pair->stat.period; he->diff.computed = true; - he->diff.period_ratio = pair ? (new_period / old_period) : 0; + he->diff.period_ratio = new_period / old_period; return he->diff.period_ratio; } -s64 perf_diff__compute_wdiff(struct hist_entry *he) +s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair) { - struct hist_entry *pair = hist_entry__next_pair(he); u64 new_period = he->stat.period; - u64 old_period = pair ? 
pair->stat.period : 0; + u64 old_period = pair->stat.period; he->diff.computed = true; - - if (!pair) - he->diff.wdiff = 0; - else - he->diff.wdiff = new_period * compute_wdiff_w2 - - old_period * compute_wdiff_w1; + he->diff.wdiff = new_period * compute_wdiff_w2 - + old_period * compute_wdiff_w1; return he->diff.wdiff; } -static int formula_delta(struct hist_entry *he, char *buf, size_t size) +static int formula_delta(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); - - if (!pair) - return -1; - return scnprintf(buf, size, "(%" PRIu64 " * 100 / %" PRIu64 ") - " "(%" PRIu64 " * 100 / %" PRIu64 ")", @@ -205,41 +193,36 @@ static int formula_delta(struct hist_entry *he, char *buf, size_t size) pair->stat.period, pair->hists->stats.total_period); } -static int formula_ratio(struct hist_entry *he, char *buf, size_t size) +static int formula_ratio(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); double new_period = he->stat.period; - double old_period = pair ? pair->stat.period : 0; - - if (!pair) - return -1; + double old_period = pair->stat.period; return scnprintf(buf, size, "%.0F / %.0F", new_period, old_period); } -static int formula_wdiff(struct hist_entry *he, char *buf, size_t size) +static int formula_wdiff(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { - struct hist_entry *pair = hist_entry__next_pair(he); u64 new_period = he->stat.period; - u64 old_period = pair ? 
pair->stat.period : 0; - - if (!pair) - return -1; + u64 old_period = pair->stat.period; return scnprintf(buf, size, "(%" PRIu64 " * " "%" PRId64 ") - (%" PRIu64 " * " "%" PRId64 ")", new_period, compute_wdiff_w2, old_period, compute_wdiff_w1); } -int perf_diff__formula(char *buf, size_t size, struct hist_entry *he) +int perf_diff__formula(struct hist_entry *he, struct hist_entry *pair, + char *buf, size_t size) { switch (compute) { case COMPUTE_DELTA: - return formula_delta(he, buf, size); + return formula_delta(he, pair, buf, size); case COMPUTE_RATIO: - return formula_ratio(he, buf, size); + return formula_ratio(he, pair, buf, size); case COMPUTE_WEIGHTED_DIFF: - return formula_wdiff(he, buf, size); + return formula_wdiff(he, pair, buf, size); default: BUG_ON(1); } @@ -292,48 +275,6 @@ static struct perf_tool tool = { .ordering_requires_timestamps = true, }; -static void insert_hist_entry_by_name(struct rb_root *root, - struct hist_entry *he) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct hist_entry *iter; - - while (*p != NULL) { - parent = *p; - iter = rb_entry(parent, struct hist_entry, rb_node); - if (hist_entry__cmp(he, iter) < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&he->rb_node, parent, p); - rb_insert_color(&he->rb_node, root); -} - -static void hists__name_resort(struct hists *self, bool sort) -{ - unsigned long position = 1; - struct rb_root tmp = RB_ROOT; - struct rb_node *next = rb_first(&self->entries); - - while (next != NULL) { - struct hist_entry *n = rb_entry(next, struct hist_entry, rb_node); - - next = rb_next(&n->rb_node); - n->position = position++; - - if (sort) { - rb_erase(&n->rb_node, &self->entries); - insert_hist_entry_by_name(&tmp, n); - } - } - - if (sort) - self->entries = tmp; -} - static struct perf_evsel *evsel_match(struct perf_evsel *evsel, struct perf_evlist *evlist) { @@ -346,34 +287,34 @@ static struct perf_evsel *evsel_match(struct perf_evsel *evsel, 
return NULL; } -static void perf_evlist__resort_hists(struct perf_evlist *evlist, bool name) +static void perf_evlist__collapse_resort(struct perf_evlist *evlist) { struct perf_evsel *evsel; list_for_each_entry(evsel, &evlist->entries, node) { struct hists *hists = &evsel->hists; - hists__output_resort(hists); - - /* - * The hists__name_resort only sets possition - * if name is false. - */ - if (name || ((!name) && show_displacement)) - hists__name_resort(hists, name); + hists__collapse_resort(hists); } } static void hists__baseline_only(struct hists *hists) { - struct rb_node *next = rb_first(&hists->entries); + struct rb_root *root; + struct rb_node *next; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + next = rb_first(root); while (next != NULL) { - struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node_in); - next = rb_next(&he->rb_node); + next = rb_next(&he->rb_node_in); if (!hist_entry__next_pair(he)) { - rb_erase(&he->rb_node, &hists->entries); + rb_erase(&he->rb_node_in, root); hist_entry__free(he); } } @@ -385,18 +326,21 @@ static void hists__precompute(struct hists *hists) while (next != NULL) { struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *pair = hist_entry__next_pair(he); next = rb_next(&he->rb_node); + if (!pair) + continue; switch (compute) { case COMPUTE_DELTA: - perf_diff__compute_delta(he); + perf_diff__compute_delta(he, pair); break; case COMPUTE_RATIO: - perf_diff__compute_ratio(he); + perf_diff__compute_ratio(he, pair); break; case COMPUTE_WEIGHTED_DIFF: - perf_diff__compute_wdiff(he); + perf_diff__compute_wdiff(he, pair); break; default: BUG_ON(1); @@ -470,19 +414,30 @@ static void insert_hist_entry_by_compute(struct rb_root *root, static void hists__compute_resort(struct hists *hists) { - struct rb_root tmp = RB_ROOT; - struct rb_node *next = 
rb_first(&hists->entries); + struct rb_root *root; + struct rb_node *next; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + hists->entries = RB_ROOT; + next = rb_first(root); + + hists->nr_entries = 0; + hists->stats.total_period = 0; + hists__reset_col_len(hists); while (next != NULL) { - struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node); + struct hist_entry *he; - next = rb_next(&he->rb_node); + he = rb_entry(next, struct hist_entry, rb_node_in); + next = rb_next(&he->rb_node_in); - rb_erase(&he->rb_node, &hists->entries); - insert_hist_entry_by_compute(&tmp, he, compute); + insert_hist_entry_by_compute(&hists->entries, he, compute); + hists__inc_nr_entries(hists, he); } - - hists->entries = tmp; } static void hists__process(struct hists *old, struct hists *new) @@ -497,6 +452,8 @@ static void hists__process(struct hists *old, struct hists *new) if (sort_compute) { hists__precompute(new); hists__compute_resort(new); + } else { + hists__output_resort(new); } hists__fprintf(new, true, 0, 0, stdout); @@ -528,8 +485,8 @@ static int __cmd_diff(void) evlist_old = older->evlist; evlist_new = newer->evlist; - perf_evlist__resort_hists(evlist_old, true); - perf_evlist__resort_hists(evlist_new, false); + perf_evlist__collapse_resort(evlist_old); + perf_evlist__collapse_resort(evlist_new); list_for_each_entry(evsel, &evlist_new->entries, node) { struct perf_evsel *evsel_old; @@ -562,8 +519,6 @@ static const char * const diff_usage[] = { static const struct option options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), - OPT_BOOLEAN('M', "displacement", &show_displacement, - "Show position displacement relative to baseline"), OPT_BOOLEAN('b', "baseline-only", &show_baseline_only, "Show only items with match in baseline"), OPT_CALLBACK('c', "compute", &compute, @@ -597,40 +552,32 @@ static const struct option options[] = { static void ui_init(void) { - 
perf_hpp__init(); - - /* No overhead column. */ - perf_hpp__column_enable(PERF_HPP__OVERHEAD, false); - /* - * Display baseline/delta/ratio/displacement/ + * Display baseline/delta/ratio * formula/periods columns. */ - perf_hpp__column_enable(PERF_HPP__BASELINE, true); + perf_hpp__column_enable(PERF_HPP__BASELINE); switch (compute) { case COMPUTE_DELTA: - perf_hpp__column_enable(PERF_HPP__DELTA, true); + perf_hpp__column_enable(PERF_HPP__DELTA); break; case COMPUTE_RATIO: - perf_hpp__column_enable(PERF_HPP__RATIO, true); + perf_hpp__column_enable(PERF_HPP__RATIO); break; case COMPUTE_WEIGHTED_DIFF: - perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF, true); + perf_hpp__column_enable(PERF_HPP__WEIGHTED_DIFF); break; default: BUG_ON(1); }; - if (show_displacement) - perf_hpp__column_enable(PERF_HPP__DISPL, true); - if (show_formula) - perf_hpp__column_enable(PERF_HPP__FORMULA, true); + perf_hpp__column_enable(PERF_HPP__FORMULA); if (show_period) { - perf_hpp__column_enable(PERF_HPP__PERIOD, true); - perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE, true); + perf_hpp__column_enable(PERF_HPP__PERIOD); + perf_hpp__column_enable(PERF_HPP__PERIOD_BASELINE); } } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index c20f1dcfb7e2..1312a5e03ec7 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -15,39 +15,6 @@ #include "util/parse-options.h" #include "util/session.h" -struct perf_attr_details { - bool freq; - bool verbose; -}; - -static int comma_printf(bool *first, const char *fmt, ...) 
-{ - va_list args; - int ret = 0; - - if (!*first) { - ret += printf(","); - } else { - ret += printf(":"); - *first = false; - } - - va_start(args, fmt); - ret += vprintf(fmt, args); - va_end(args); - return ret; -} - -static int __if_print(bool *first, const char *field, u64 value) -{ - if (value == 0) - return 0; - - return comma_printf(first, " %s: %" PRIu64, field, value); -} - -#define if_print(field) __if_print(&first, #field, pos->attr.field) - static int __cmd_evlist(const char *file_name, struct perf_attr_details *details) { struct perf_session *session; @@ -57,52 +24,8 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details if (session == NULL) return -ENOMEM; - list_for_each_entry(pos, &session->evlist->entries, node) { - bool first = true; - - printf("%s", perf_evsel__name(pos)); - - if (details->verbose || details->freq) { - comma_printf(&first, " sample_freq=%" PRIu64, - (u64)pos->attr.sample_freq); - } - - if (details->verbose) { - if_print(type); - if_print(config); - if_print(config1); - if_print(config2); - if_print(size); - if_print(sample_type); - if_print(read_format); - if_print(disabled); - if_print(inherit); - if_print(pinned); - if_print(exclusive); - if_print(exclude_user); - if_print(exclude_kernel); - if_print(exclude_hv); - if_print(exclude_idle); - if_print(mmap); - if_print(comm); - if_print(freq); - if_print(inherit_stat); - if_print(enable_on_exec); - if_print(task); - if_print(watermark); - if_print(precise_ip); - if_print(mmap_data); - if_print(sample_id_all); - if_print(exclude_host); - if_print(exclude_guest); - if_print(__reserved_1); - if_print(wakeup_events); - if_print(bp_type); - if_print(branch_sample_type); - } - - putchar('\n'); - } + list_for_each_entry(pos, &session->evlist->entries, node) + perf_evsel__fprintf(pos, details, stdout); perf_session__delete(session); return 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 0b4b796167be..c746108c5d48 100644 --- 
a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -340,7 +340,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session, int n_lines, int is_caller) { struct rb_node *next; - struct machine *machine; + struct machine *machine = &session->machines.host; printf("%.102s\n", graph_dotted_line); printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr"); @@ -349,11 +349,6 @@ static void __print_result(struct rb_root *root, struct perf_session *session, next = rb_first(root); - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("__print_result: couldn't find kernel information\n"); - return; - } while (next && n_lines--) { struct alloc_stat *data = rb_entry(next, struct alloc_stat, node); @@ -614,8 +609,7 @@ static struct sort_dimension *avail_sorts[] = { &pingpong_sort_dimension, }; -#define NUM_AVAIL_SORTS \ - (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *)) +#define NUM_AVAIL_SORTS ((int)ARRAY_SIZE(avail_sorts)) static int sort_dimension__add(const char *tok, struct list_head *list) { diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index ca3f80ebc100..37a769d7f9fe 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -973,8 +973,7 @@ __cmd_buildid_list(const char *file_name, int argc, const char **argv) int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused) { - const char *file_name; - + const char *file_name = NULL; const struct option kvm_options[] = { OPT_STRING('i', "input", &file_name, "file", "Input file name"), diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index f3151d3c70ce..2ac690cad411 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -224,130 +224,28 @@ static bool perf_evlist__equal(struct perf_evlist *evlist, static int perf_record__open(struct perf_record *rec) { + char msg[512]; struct perf_evsel *pos; struct perf_evlist *evlist = rec->evlist; struct perf_session *session 
= rec->session; struct perf_record_opts *opts = &rec->opts; int rc = 0; - /* - * Set the evsel leader links before we configure attributes, - * since some might depend on this info. - */ - if (opts->group) - perf_evlist__set_leader(evlist); - - perf_evlist__config_attrs(evlist, opts); + perf_evlist__config(evlist, opts); list_for_each_entry(pos, &evlist->entries, node) { - struct perf_event_attr *attr = &pos->attr; - /* - * Check if parse_single_tracepoint_event has already asked for - * PERF_SAMPLE_TIME. - * - * XXX this is kludgy but short term fix for problems introduced by - * eac23d1c that broke 'perf script' by having different sample_types - * when using multiple tracepoint events when we use a perf binary - * that tries to use sample_id_all on an older kernel. - * - * We need to move counter creation to perf_session, support - * different sample_types, etc. - */ - bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; - -fallback_missing_features: - if (opts->exclude_guest_missing) - attr->exclude_guest = attr->exclude_host = 0; -retry_sample_id: - attr->sample_id_all = opts->sample_id_all_missing ? 
0 : 1; try_again: if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) { - int err = errno; - - if (err == EPERM || err == EACCES) { - ui__error_paranoid(); - rc = -err; - goto out; - } else if (err == ENODEV && opts->target.cpu_list) { - pr_err("No such device - did you specify" - " an out-of-range profile CPU?\n"); - rc = -err; - goto out; - } else if (err == EINVAL) { - if (!opts->exclude_guest_missing && - (attr->exclude_guest || attr->exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - opts->exclude_guest_missing = true; - goto fallback_missing_features; - } else if (!opts->sample_id_all_missing) { - /* - * Old kernel, no attr->sample_id_type_all field - */ - opts->sample_id_all_missing = true; - if (!opts->sample_time && !opts->raw_samples && !time_needed) - attr->sample_type &= ~PERF_SAMPLE_TIME; - - goto retry_sample_id; - } - } - - /* - * If it's cycles then fall back to hrtimer - * based cpu-clock-tick sw counter, which - * is always available even if no PMU support. - * - * PPC returns ENXIO until 2.6.37 (behavior changed - * with commit b0a873e). - */ - if ((err == ENOENT || err == ENXIO) - && attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_HW_CPU_CYCLES) { - + if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) { if (verbose) - ui__warning("The cycles event is not supported, " - "trying to fall back to cpu-clock-ticks\n"); - attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_SW_CPU_CLOCK; - if (pos->name) { - free(pos->name); - pos->name = NULL; - } + ui__warning("%s\n", msg); goto try_again; } - if (err == ENOENT) { - ui__error("The %s event is not supported.\n", - perf_evsel__name(pos)); - rc = -err; - goto out; - } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) { - ui__error("\'precise\' request may not be supported. 
" - "Try removing 'p' modifier\n"); - rc = -err; - goto out; - } - - printf("\n"); - error("sys_perf_event_open() syscall returned with %d " - "(%s) for event %s. /bin/dmesg may provide " - "additional information.\n", - err, strerror(err), perf_evsel__name(pos)); - -#if defined(__i386__) || defined(__x86_64__) - if (attr->type == PERF_TYPE_HARDWARE && - err == EOPNOTSUPP) { - pr_err("No hardware sampling interrupt available." - " No APIC? If so then you can boot the kernel" - " with the \"lapic\" boot parameter to" - " force-enable it.\n"); - rc = -err; - goto out; - } -#endif - - pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - rc = -err; + rc = -errno; + perf_evsel__open_strerror(pos, &opts->target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); goto out; } } @@ -430,10 +328,6 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) { int err; struct perf_tool *tool = data; - - if (machine__is_host(machine)) - return; - /* *As for guest kernel when processing subcommand record&report, *we arrange module mmap prior to guest kernel mmap and trigger @@ -618,12 +512,7 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) rec->post_processing_offset = lseek(output, 0, SEEK_CUR); - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("Couldn't find native kernel information.\n"); - err = -1; - goto out_delete_session; - } + machine = &session->machines.host; if (opts->pipe_output) { err = perf_event__synthesize_attrs(tool, session, @@ -676,9 +565,10 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv) "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" "Check /proc/modules permission or run as root.\n"); - if (perf_guest) - perf_session__process_machines(session, tool, - perf_event__synthesize_guest_os); + if (perf_guest) { + machines__process_guests(&session->machines, + perf_event__synthesize_guest_os, tool); + } if (!opts->target.system_wide) err = perf_event__synthesize_thread_map(tool, evsel_list->threads, @@ -875,11 +765,10 @@ static int get_stack_size(char *str, unsigned long *_size) } #endif /* LIBUNWIND_SUPPORT */ -static int -parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, - int unset) +int record_parse_callchain_opt(const struct option *opt, + const char *arg, int unset) { - struct perf_record *rec = (struct perf_record *)opt->value; + struct perf_record_opts *opts = opt->value; char *tok, *name, *saveptr = NULL; char *buf; int ret = -1; @@ -905,7 +794,7 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, /* Framepointer style */ if (!strncmp(name, "fp", sizeof("fp"))) { if (!strtok_r(NULL, ",", &saveptr)) { - rec->opts.call_graph = CALLCHAIN_FP; + opts->call_graph = CALLCHAIN_FP; ret = 0; } else pr_err("callchain: No more arguments " @@ -918,20 +807,20 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, const unsigned long default_stack_dump_size = 8192; ret = 0; - rec->opts.call_graph = CALLCHAIN_DWARF; - rec->opts.stack_dump_size = default_stack_dump_size; + opts->call_graph = CALLCHAIN_DWARF; + opts->stack_dump_size = default_stack_dump_size; tok = strtok_r(NULL, ",", &saveptr); if (tok) { unsigned long size = 0; ret = get_stack_size(tok, &size); - rec->opts.stack_dump_size = size; + opts->stack_dump_size = size; } if (!ret) pr_debug("callchain: stack dump size %d\n", - rec->opts.stack_dump_size); + opts->stack_dump_size); #endif /* LIBUNWIND_SUPPORT */ } else { pr_err("callchain: Unknown -g option " @@ -944,7 +833,7 @@ parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg, 
free(buf); if (!ret) - pr_debug("callchain: type %d\n", rec->opts.call_graph); + pr_debug("callchain: type %d\n", opts->call_graph); return ret; } @@ -982,9 +871,9 @@ static struct perf_record record = { #define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: " #ifdef LIBUNWIND_SUPPORT -static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf"; +const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf"; #else -static const char callchain_help[] = CALLCHAIN_HELP "[fp]"; +const char record_callchain_help[] = CALLCHAIN_HELP "[fp]"; #endif /* @@ -1028,9 +917,9 @@ const struct option record_options[] = { "number of mmap data pages"), OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]", - callchain_help, &parse_callchain_opt, - "fp"), + OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts, + "mode[,dump_size]", record_callchain_help, + &record_parse_callchain_opt, "fp"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index fc251005dd3d..47a864478543 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -372,7 +372,7 @@ static int __cmd_report(struct perf_report *rep) if (ret) goto out_delete; - kernel_map = session->host_machine.vmlinux_maps[MAP__FUNCTION]; + kernel_map = session->machines.host.vmlinux_maps[MAP__FUNCTION]; kernel_kmap = map__kmap(kernel_map); if (kernel_map == NULL || (kernel_map->dso->hit && @@ -595,8 +595,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN(0, "stdio", &report.use_stdio, "Use the stdio interface"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", - "sort by key(s): pid, comm, dso, symbol, parent, dso_to," - " dso_from, symbol_to, symbol_from, mispredict"), + 
"sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline," + " dso_to, dso_from, symbol_to, symbol_from, mispredict"), OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, "Show sample percentage for different cpu modes"), OPT_STRING('p', "parent", &parent_pattern, "regex", @@ -692,6 +692,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) setup_browser(true); else { use_browser = 0; + perf_hpp__column_enable(PERF_HPP__OVERHEAD); perf_hpp__init(); } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cc28b85dabd5..138229439a93 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1475,9 +1475,9 @@ static int perf_sched__read_events(struct perf_sched *sched, bool destroy, goto out_delete; } - sched->nr_events = session->hists.stats.nr_events[0]; - sched->nr_lost_events = session->hists.stats.total_lost; - sched->nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST]; + sched->nr_events = session->stats.nr_events[0]; + sched->nr_lost_events = session->stats.total_lost; + sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST]; } if (destroy) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b363e7b292b2..92d4658f56fb 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -692,7 +692,7 @@ static int parse_output_fields(const struct option *opt __maybe_unused, const char *arg, int unset __maybe_unused) { char *tok; - int i, imax = sizeof(all_output_options) / sizeof(struct output_option); + int i, imax = ARRAY_SIZE(all_output_options); int j; int rc = 0; char *str = strdup(arg); @@ -909,18 +909,6 @@ static const char *ends_with(const char *str, const char *suffix) return NULL; } -static char *ltrim(char *str) -{ - int len = strlen(str); - - while (len && isspace(*str)) { - len--; - str++; - } - - return str; -} - static int read_script_info(struct script_desc *desc, const char *filename) { char 
line[BUFSIZ], *p; @@ -1487,7 +1475,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } - perf_session__fprintf_info(session, stdout, show_full_info); + if (!script_name && !generate_script_lang) + perf_session__fprintf_info(session, stdout, show_full_info); if (!no_callchain) symbol_conf.use_callchain = true; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c247faca7127..1c2ac148a7d5 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -132,8 +132,6 @@ static struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) { struct perf_event_attr *attr = &evsel->attr; - bool exclude_guest_missing = false; - int ret; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -141,38 +139,16 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->inherit = !no_inherit; -retry: - if (exclude_guest_missing) - evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; - - if (perf_target__has_cpu(&target)) { - ret = perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); - if (ret) - goto check_ret; - return 0; - } + if (perf_target__has_cpu(&target)) + return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); if (!perf_target__has_task(&target) && - !perf_evsel__is_group_member(evsel)) { + perf_evsel__is_group_leader(evsel)) { attr->disabled = 1; attr->enable_on_exec = 1; } - ret = perf_evsel__open_per_thread(evsel, evsel_list->threads); - if (!ret) - return 0; - /* fall through */ -check_ret: - if (ret && errno == EINVAL) { - if (!exclude_guest_missing && - (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - exclude_guest_missing = true; - goto retry; - } - } - return ret; + return perf_evsel__open_per_thread(evsel, evsel_list->threads); } /* @@ -271,6 +247,7 @@ static int read_counter(struct perf_evsel *counter) static int 
__run_perf_stat(int argc __maybe_unused, const char **argv) { + char msg[512]; unsigned long long t0, t1; struct perf_evsel *counter; int status = 0; @@ -348,20 +325,13 @@ static int __run_perf_stat(int argc __maybe_unused, const char **argv) continue; } - if (errno == EPERM || errno == EACCES) { - error("You may not have permission to collect %sstats.\n" - "\t Consider tweaking" - " /proc/sys/kernel/perf_event_paranoid or running as root.", - target.system_wide ? "system-wide " : ""); - } else { - error("open_counter returned with %d (%s). " - "/bin/dmesg may provide additional information.\n", - errno, strerror(errno)); - } + perf_evsel__open_strerror(counter, &target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); + if (child_pid != -1) kill(child_pid, SIGTERM); - pr_err("Not all events could be opened.\n"); return -1; } counter->supported = true; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c9ff3950cd4b..7978c8117b7f 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -68,28 +68,6 @@ #include <linux/unistd.h> #include <linux/types.h> -void get_term_dimensions(struct winsize *ws) -{ - char *s = getenv("LINES"); - - if (s != NULL) { - ws->ws_row = atoi(s); - s = getenv("COLUMNS"); - if (s != NULL) { - ws->ws_col = atoi(s); - if (ws->ws_row && ws->ws_col) - return; - } - } -#ifdef TIOCGWINSZ - if (ioctl(1, TIOCGWINSZ, ws) == 0 && - ws->ws_row && ws->ws_col) - return; -#endif - ws->ws_row = 25; - ws->ws_col = 80; -} - static void perf_top__update_print_entries(struct perf_top *top) { if (top->print_entries > 9) @@ -596,7 +574,7 @@ static void *display_thread_tui(void *arg) * via --uid. 
*/ list_for_each_entry(pos, &top->evlist->entries, node) - pos->hists.uid_filter_str = top->target.uid_str; + pos->hists.uid_filter_str = top->record_opts.target.uid_str; perf_evlist__tui_browse_hists(top->evlist, help, &hbt, &top->session->header.env); @@ -716,7 +694,7 @@ static void perf_event__process_sample(struct perf_tool *tool, static struct intlist *seen; if (!seen) - seen = intlist__new(); + seen = intlist__new(NULL); if (!intlist__has_entry(seen, event->ip.pid)) { pr_err("Can't find guest [%d]'s kernel information\n", @@ -727,8 +705,8 @@ static void perf_event__process_sample(struct perf_tool *tool, } if (!machine) { - pr_err("%u unprocessable samples recorded.", - top->session->hists.stats.nr_unprocessable_samples++); + pr_err("%u unprocessable samples recorded.\n", + top->session->stats.nr_unprocessable_samples++); return; } @@ -847,13 +825,13 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) ++top->us_samples; if (top->hide_user_symbols) continue; - machine = perf_session__find_host_machine(session); + machine = &session->machines.host; break; case PERF_RECORD_MISC_KERNEL: ++top->kernel_samples; if (top->hide_kernel_symbols) continue; - machine = perf_session__find_host_machine(session); + machine = &session->machines.host; break; case PERF_RECORD_MISC_GUEST_KERNEL: ++top->guest_kernel_samples; @@ -878,7 +856,7 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) hists__inc_nr_events(&evsel->hists, event->header.type); machine__process_event(machine, event); } else - ++session->hists.stats.nr_unknown_events; + ++session->stats.nr_unknown_events; } } @@ -892,111 +870,31 @@ static void perf_top__mmap_read(struct perf_top *top) static void perf_top__start_counters(struct perf_top *top) { + char msg[512]; struct perf_evsel *counter; struct perf_evlist *evlist = top->evlist; + struct perf_record_opts *opts = &top->record_opts; - if (top->group) - perf_evlist__set_leader(evlist); + perf_evlist__config(evlist, opts); 
list_for_each_entry(counter, &evlist->entries, node) { - struct perf_event_attr *attr = &counter->attr; - - attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - - if (top->freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; - attr->freq = 1; - attr->sample_freq = top->freq; - } - - if (evlist->nr_entries > 1) { - attr->sample_type |= PERF_SAMPLE_ID; - attr->read_format |= PERF_FORMAT_ID; - } - - if (perf_target__has_cpu(&top->target)) - attr->sample_type |= PERF_SAMPLE_CPU; - - if (symbol_conf.use_callchain) - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; - - attr->mmap = 1; - attr->comm = 1; - attr->inherit = top->inherit; -fallback_missing_features: - if (top->exclude_guest_missing) - attr->exclude_guest = attr->exclude_host = 0; -retry_sample_id: - attr->sample_id_all = top->sample_id_all_missing ? 0 : 1; try_again: if (perf_evsel__open(counter, top->evlist->cpus, top->evlist->threads) < 0) { - int err = errno; - - if (err == EPERM || err == EACCES) { - ui__error_paranoid(); - goto out_err; - } else if (err == EINVAL) { - if (!top->exclude_guest_missing && - (attr->exclude_guest || attr->exclude_host)) { - pr_debug("Old kernel, cannot exclude " - "guest or host samples.\n"); - top->exclude_guest_missing = true; - goto fallback_missing_features; - } else if (!top->sample_id_all_missing) { - /* - * Old kernel, no attr->sample_id_type_all field - */ - top->sample_id_all_missing = true; - goto retry_sample_id; - } - } - /* - * If it's cycles then fall back to hrtimer - * based cpu-clock-tick sw counter, which - * is always available even if no PMU support: - */ - if ((err == ENOENT || err == ENXIO) && - (attr->type == PERF_TYPE_HARDWARE) && - (attr->config == PERF_COUNT_HW_CPU_CYCLES)) { - + if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { if (verbose) - ui__warning("Cycles event not supported,\n" - "trying to fall back to cpu-clock-ticks\n"); - - attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_SW_CPU_CLOCK; - if (counter->name) { - 
free(counter->name); - counter->name = NULL; - } + ui__warning("%s\n", msg); goto try_again; } - if (err == ENOENT) { - ui__error("The %s event is not supported.\n", - perf_evsel__name(counter)); - goto out_err; - } else if (err == EMFILE) { - ui__error("Too many events are opened.\n" - "Try again after reducing the number of events\n"); - goto out_err; - } else if ((err == EOPNOTSUPP) && (attr->precise_ip)) { - ui__error("\'precise\' request may not be supported. " - "Try removing 'p' modifier\n"); - goto out_err; - } - - ui__error("The sys_perf_event_open() syscall " - "returned with %d (%s). /bin/dmesg " - "may provide additional information.\n" - "No CONFIG_PERF_EVENTS=y kernel support " - "configured?\n", err, strerror(err)); + perf_evsel__open_strerror(counter, &opts->target, + errno, msg, sizeof(msg)); + ui__error("%s\n", msg); goto out_err; } } - if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) { + if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) { ui__error("Failed to mmap with %d (%s)\n", errno, strerror(errno)); goto out_err; @@ -1016,7 +914,7 @@ static int perf_top__setup_sample_type(struct perf_top *top) ui__error("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) { + } else if (callchain_param.mode != CHAIN_NONE) { if (callchain_register_param(&callchain_param) < 0) { ui__error("Can't register callchain params.\n"); return -EINVAL; @@ -1028,6 +926,7 @@ static int perf_top__setup_sample_type(struct perf_top *top) static int __cmd_top(struct perf_top *top) { + struct perf_record_opts *opts = &top->record_opts; pthread_t thread; int ret; /* @@ -1042,17 +941,28 @@ static int __cmd_top(struct perf_top *top) if (ret) goto out_delete; - if (perf_target__has_task(&top->target)) + if (perf_target__has_task(&opts->target)) perf_event__synthesize_thread_map(&top->tool, top->evlist->threads, perf_event__process, - &top->session->host_machine); + 
&top->session->machines.host); else perf_event__synthesize_threads(&top->tool, perf_event__process, - &top->session->host_machine); + &top->session->machines.host); perf_top__start_counters(top); top->session->evlist = top->evlist; perf_session__set_id_hdr_size(top->session); + /* + * When perf is starting the traced process, all the events (apart from + * group members) have enable_on_exec=1 set, so don't spoil it by + * prematurely enabling them. + * + * XXX 'top' still doesn't start workloads like record, trace, but should, + * so leave the check here. + */ + if (!perf_target__none(&opts->target)) + perf_evlist__enable(top->evlist); + /* Wait for a minimal set of events before starting the snapshot */ poll(top->evlist->pollfd, top->evlist->nr_fds, 100); @@ -1093,116 +1003,56 @@ out_delete: static int parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct perf_top *top = (struct perf_top *)opt->value; - char *tok, *tok2; - char *endptr; - /* * --no-call-graph */ - if (unset) { - top->dont_use_callchains = true; + if (unset) return 0; - } symbol_conf.use_callchain = true; - if (!arg) - return 0; - - tok = strtok((char *)arg, ","); - if (!tok) - return -1; - - /* get the output mode */ - if (!strncmp(tok, "graph", strlen(arg))) - callchain_param.mode = CHAIN_GRAPH_ABS; - - else if (!strncmp(tok, "flat", strlen(arg))) - callchain_param.mode = CHAIN_FLAT; - - else if (!strncmp(tok, "fractal", strlen(arg))) - callchain_param.mode = CHAIN_GRAPH_REL; - - else if (!strncmp(tok, "none", strlen(arg))) { - callchain_param.mode = CHAIN_NONE; - symbol_conf.use_callchain = false; - - return 0; - } else - return -1; - - /* get the min percentage */ - tok = strtok(NULL, ","); - if (!tok) - goto setup; - - callchain_param.min_percent = strtod(tok, &endptr); - if (tok == endptr) - return -1; - - /* get the print limit */ - tok2 = strtok(NULL, ","); - if (!tok2) - goto setup; - - if (tok2[0] != 'c') { - callchain_param.print_limit = strtod(tok2, 
&endptr); - tok2 = strtok(NULL, ","); - if (!tok2) - goto setup; - } - - /* get the call chain order */ - if (!strcmp(tok2, "caller")) - callchain_param.order = ORDER_CALLER; - else if (!strcmp(tok2, "callee")) - callchain_param.order = ORDER_CALLEE; - else - return -1; -setup: - if (callchain_register_param(&callchain_param) < 0) { - fprintf(stderr, "Can't register callchain params\n"); - return -1; - } - return 0; + return record_parse_callchain_opt(opt, arg, unset); } int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) { - struct perf_evsel *pos; int status; char errbuf[BUFSIZ]; struct perf_top top = { .count_filter = 5, .delay_secs = 2, - .freq = 4000, /* 4 KHz */ - .mmap_pages = 128, - .sym_pcnt_filter = 5, - .target = { - .uses_mmap = true, + .record_opts = { + .mmap_pages = UINT_MAX, + .user_freq = UINT_MAX, + .user_interval = ULLONG_MAX, + .freq = 4000, /* 4 KHz */ + .target = { + .uses_mmap = true, + }, }, + .sym_pcnt_filter = 5, }; - char callchain_default_opt[] = "fractal,0.5,callee"; + struct perf_record_opts *opts = &top.record_opts; + struct perf_target *target = &opts->target; const struct option options[] = { OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. 
use 'perf list' to list available events", parse_events_option), - OPT_INTEGER('c', "count", &top.default_interval, - "event period to sample"), - OPT_STRING('p', "pid", &top.target.pid, "pid", + OPT_U64('c', "count", &opts->user_interval, "event period to sample"), + OPT_STRING('p', "pid", &target->pid, "pid", "profile events on existing process id"), - OPT_STRING('t', "tid", &top.target.tid, "tid", + OPT_STRING('t', "tid", &target->tid, "tid", "profile events on existing thread id"), - OPT_BOOLEAN('a', "all-cpus", &top.target.system_wide, + OPT_BOOLEAN('a', "all-cpus", &target->system_wide, "system-wide collection from all CPUs"), - OPT_STRING('C', "cpu", &top.target.cpu_list, "cpu", + OPT_STRING('C', "cpu", &target->cpu_list, "cpu", "list of cpus to monitor"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, "hide kernel symbols"), - OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"), + OPT_UINTEGER('m', "mmap-pages", &opts->mmap_pages, + "number of mmap data pages"), OPT_INTEGER('r', "realtime", &top.realtime_prio, "collect data with this RT SCHED_FIFO priority"), OPT_INTEGER('d', "delay", &top.delay_secs, @@ -1211,16 +1061,14 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "dump the symbol table used for profiling"), OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), - OPT_BOOLEAN('g', "group", &top.group, + OPT_BOOLEAN('g', "group", &opts->group, "put the counters into a counter group"), - OPT_BOOLEAN('i', "inherit", &top.inherit, - "child tasks inherit counters"), + OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit, + "child tasks do not inherit counters"), OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name", "symbol to annotate"), - OPT_BOOLEAN('z', "zero", &top.zero, - "zero history across updates"), - OPT_INTEGER('F', "freq", &top.freq, - 
"profile at this frequency"), + OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"), + OPT_UINTEGER('F', "freq", &opts->user_freq, "profile at this frequency"), OPT_INTEGER('E', "entries", &top.print_entries, "display this many functions"), OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, @@ -1233,10 +1081,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order", - "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. " - "Default: fractal,0.5,callee", &parse_callchain_opt, - callchain_default_opt), + OPT_CALLBACK_DEFAULT('G', "call-graph", &top.record_opts, + "mode[,dump_size]", record_callchain_help, + &parse_callchain_opt, "fp"), OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), OPT_STRING(0, "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", @@ -1251,7 +1098,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "Display raw encoding of assembly instructions (default)"), OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. 
-M intel for intel syntax)"), - OPT_STRING('u', "uid", &top.target.uid_str, "user", "user to profile"), + OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"), OPT_END() }; const char * const top_usage[] = { @@ -1281,27 +1128,27 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) setup_browser(false); - status = perf_target__validate(&top.target); + status = perf_target__validate(target); if (status) { - perf_target__strerror(&top.target, status, errbuf, BUFSIZ); + perf_target__strerror(target, status, errbuf, BUFSIZ); ui__warning("%s", errbuf); } - status = perf_target__parse_uid(&top.target); + status = perf_target__parse_uid(target); if (status) { int saved_errno = errno; - perf_target__strerror(&top.target, status, errbuf, BUFSIZ); + perf_target__strerror(target, status, errbuf, BUFSIZ); ui__error("%s", errbuf); status = -saved_errno; goto out_delete_evlist; } - if (perf_target__none(&top.target)) - top.target.system_wide = true; + if (perf_target__none(target)) + target->system_wide = true; - if (perf_evlist__create_maps(top.evlist, &top.target) < 0) + if (perf_evlist__create_maps(top.evlist, target) < 0) usage_with_options(top_usage, options); if (!top.evlist->nr_entries && @@ -1315,24 +1162,22 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) if (top.delay_secs < 1) top.delay_secs = 1; + if (opts->user_interval != ULLONG_MAX) + opts->default_interval = opts->user_interval; + if (opts->user_freq != UINT_MAX) + opts->freq = opts->user_freq; + /* * User specified count overrides default frequency. 
*/ - if (top.default_interval) - top.freq = 0; - else if (top.freq) { - top.default_interval = top.freq; + if (opts->default_interval) + opts->freq = 0; + else if (opts->freq) { + opts->default_interval = opts->freq; } else { ui__error("frequency and count are zero, aborting\n"); - exit(EXIT_FAILURE); - } - - list_for_each_entry(pos, &top.evlist->entries, node) { - /* - * Fill in the ones not specifically initialized via -c: - */ - if (!pos->attr.sample_period) - pos->attr.sample_period = top.default_interval; + status = -EINVAL; + goto out_delete_evlist; } top.sym_evsel = perf_evlist__first(top.evlist); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 7932ffa29889..d222d7fc7e96 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -455,7 +455,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config_attrs(evlist, &trace->opts); + perf_evlist__config(evlist, &trace->opts); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak index e5413125e6bb..8ef3bd30a549 100644 --- a/tools/perf/config/utilities.mak +++ b/tools/perf/config/utilities.mak @@ -13,7 +13,7 @@ newline := $(newline) # what should replace a newline when escaping # newlines; the default is a bizarre string. 
# -nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n) +nl-escape = $(if $(1),$(1),m822df3020w6a44id34bt574ctac44eb9f4n) # escape-nl # @@ -173,9 +173,9 @@ _ge-abspath = $(if $(is-executable),$(1)) # Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default) # define get-executable-or-default -$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) +$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2),$(1))) endef -_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2))) +_ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_err,$(2))) _gea_warn = $(warning The path '$(1)' is not executable.) _gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 0f661fbce6a8..095b88207cd3 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -328,14 +328,23 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode)) return 0; + status = 1; /* Check for ENOSPC and EIO errors.. 
*/ - if (fflush(stdout)) - die("write failure on standard output: %s", strerror(errno)); - if (ferror(stdout)) - die("unknown write failure on standard output"); - if (fclose(stdout)) - die("close failed on standard output: %s", strerror(errno)); - return 0; + if (fflush(stdout)) { + fprintf(stderr, "write failure on standard output: %s", strerror(errno)); + goto out; + } + if (ferror(stdout)) { + fprintf(stderr, "unknown write failure on standard output"); + goto out; + } + if (fclose(stdout)) { + fprintf(stderr, "close failed on standard output: %s", strerror(errno)); + goto out; + } + status = 0; +out: + return status; } static void handle_internal_command(int argc, const char **argv) @@ -467,7 +476,8 @@ int main(int argc, const char **argv) cmd += 5; argv[0] = cmd; handle_internal_command(argc, argv); - die("cannot handle %s internally", cmd); + fprintf(stderr, "cannot handle %s internally", cmd); + goto out; } /* Look for flags.. */ @@ -485,7 +495,7 @@ int main(int argc, const char **argv) printf("\n usage: %s\n\n", perf_usage_string); list_common_cmds_help(); printf("\n %s\n\n", perf_more_info_string); - exit(1); + goto out; } cmd = argv[0]; @@ -517,7 +527,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Expansion of alias '%s' failed; " "'%s' is not a perf-command\n", cmd, argv[0]); - exit(1); + goto out; } if (!done_help) { cmd = argv[0] = help_unknown_cmd(cmd); @@ -528,6 +538,6 @@ int main(int argc, const char **argv) fprintf(stderr, "Failed to run command '%s': %s\n", cmd, strerror(errno)); - +out: return 1; } diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2c340e7da458..8f3bf388e414 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -1,10 +1,6 @@ #ifndef _PERF_PERF_H #define _PERF_PERF_H -struct winsize; - -void get_term_dimensions(struct winsize *ws); - #include <asm/unistd.h> #if defined(__i386__) @@ -237,8 +233,6 @@ struct perf_record_opts { bool raw_samples; bool sample_address; bool sample_time; - bool 
sample_id_all_missing; - bool exclude_guest_missing; bool period; unsigned int freq; unsigned int mmap_pages; diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record deleted file mode 100644 index 8edda9078d5d..000000000000 --- a/tools/perf/scripts/perl/bin/workqueue-stats-record +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -perf record -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@ diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report deleted file mode 100644 index 6d91411d248c..000000000000 --- a/tools/perf/scripts/perl/bin/workqueue-stats-report +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -# description: workqueue stats (ins/exe/create/destroy) -perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl deleted file mode 100644 index a8eaff5119e0..000000000000 --- a/tools/perf/scripts/perl/workqueue-stats.pl +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/perl -w -# (c) 2009, Tom Zanussi <tzanussi@gmail.com> -# Licensed under the terms of the GNU GPL License version 2 - -# Displays workqueue stats -# -# Usage: -# -# perf record -c 1 -f -a -R -e workqueue:workqueue_creation -e -# workqueue:workqueue_destruction -e workqueue:workqueue_execution -# -e workqueue:workqueue_insertion -# -# perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl - -use 5.010000; -use strict; -use warnings; - -use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib"; -use lib "./Perf-Trace-Util/lib"; -use Perf::Trace::Core; -use Perf::Trace::Util; - -my @cpus; - -sub workqueue::workqueue_destruction -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid) = @_; - - 
$cpus[$common_cpu]{$thread_pid}{destroyed}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_creation -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $cpu) = @_; - - $cpus[$common_cpu]{$thread_pid}{created}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_execution -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $func) = @_; - - $cpus[$common_cpu]{$thread_pid}{executed}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub workqueue::workqueue_insertion -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm, - $thread_comm, $thread_pid, $func) = @_; - - $cpus[$common_cpu]{$thread_pid}{inserted}++; - $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm; -} - -sub trace_end -{ - print "workqueue work stats:\n\n"; - my $cpu = 0; - printf("%3s %6s %6s\t%-20s\n", "cpu", "ins", "exec", "name"); - printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----"); - foreach my $pidhash (@cpus) { - while ((my $pid, my $wqhash) = each %$pidhash) { - my $ins = $$wqhash{'inserted'} || 0; - my $exe = $$wqhash{'executed'} || 0; - my $comm = $$wqhash{'comm'} || ""; - if ($ins || $exe) { - printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm); - } - } - $cpu++; - } - - $cpu = 0; - print "\nworkqueue lifecycle stats:\n\n"; - printf("%3s %6s %6s\t%-20s\n", "cpu", "created", "destroyed", "name"); - printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----"); - foreach my $pidhash (@cpus) { - while ((my $pid, my $wqhash) = each %$pidhash) { - my $created = $$wqhash{'created'} || 0; - my $destroyed = $$wqhash{'destroyed'} || 0; - my $comm = $$wqhash{'comm'} || ""; - if ($created || $destroyed) { - printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed, - $comm); - } - } 
- $cpu++; - } - - print_unhandled(); -} - -my %unhandled; - -sub print_unhandled -{ - if ((scalar keys %unhandled) == 0) { - return; - } - - print "\nunhandled events:\n\n"; - - printf("%-40s %10s\n", "event", "count"); - printf("%-40s %10s\n", "----------------------------------------", - "-----------"); - - foreach my $event_name (keys %unhandled) { - printf("%-40s %10d\n", $event_name, $unhandled{$event_name}); - } -} - -sub trace_unhandled -{ - my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs, - $common_pid, $common_comm) = @_; - - $unhandled{$event_name}++; -} diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 25638a986257..f61dd3fb546b 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -33,8 +33,6 @@ extern int verbose; -bool test_attr__enabled; - static char *dir; void test_attr__init(void) @@ -146,7 +144,7 @@ static int run_dir(const char *d, const char *perf) { char cmd[3*PATH_MAX]; - snprintf(cmd, 3*PATH_MAX, "python %s/attr.py -d %s/attr/ -p %s %s", + snprintf(cmd, 3*PATH_MAX, PYTHON " %s/attr.py -d %s/attr/ -p %s %s", d, d, perf, verbose ? 
"-v" : ""); return system(cmd); diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py index e702b82dcb86..2f629ca485bc 100644 --- a/tools/perf/tests/attr.py +++ b/tools/perf/tests/attr.py @@ -68,7 +68,7 @@ class Event(dict): self[key] = val def __init__(self, name, data, base): - log.info(" Event %s" % name); + log.debug(" Event %s" % name); self.name = name; self.group = '' self.add(base) @@ -97,6 +97,14 @@ class Event(dict): return False return True + def diff(self, other): + for t in Event.terms: + if not self.has_key(t) or not other.has_key(t): + continue + if not self.compare_data(self[t], other[t]): + log.warning("expected %s=%s, got %s" % (t, self[t], other[t])) + + # Test file description needs to have following sections: # [config] # - just single instance in file @@ -113,7 +121,7 @@ class Test(object): parser = ConfigParser.SafeConfigParser() parser.read(path) - log.warning("running '%s'" % path) + log.debug("running '%s'" % path) self.path = path self.test_dir = options.test_dir @@ -128,7 +136,7 @@ class Test(object): self.expect = {} self.result = {} - log.info(" loading expected events"); + log.debug(" loading expected events"); self.load_events(path, self.expect) def is_event(self, name): @@ -164,7 +172,7 @@ class Test(object): self.perf, self.command, tempdir, self.args) ret = os.WEXITSTATUS(os.system(cmd)) - log.info(" running '%s' ret %d " % (cmd, ret)) + log.warning(" running '%s' ret %d " % (cmd, ret)) if ret != int(self.ret): raise Unsup(self) @@ -172,7 +180,7 @@ class Test(object): def compare(self, expect, result): match = {} - log.info(" compare"); + log.debug(" compare"); # For each expected event find all matching # events in result. Fail if there's not any. 
@@ -187,10 +195,11 @@ class Test(object): else: log.debug(" ->FAIL"); - log.info(" match: [%s] matches %s" % (exp_name, str(exp_list))) + log.debug(" match: [%s] matches %s" % (exp_name, str(exp_list))) # we did not any matching event - fail if (not exp_list): + exp_event.diff(res_event) raise Fail(self, 'match failure'); match[exp_name] = exp_list @@ -208,10 +217,10 @@ class Test(object): if res_group not in match[group]: raise Fail(self, 'group failure') - log.info(" group: [%s] matches group leader %s" % + log.debug(" group: [%s] matches group leader %s" % (exp_name, str(match[group]))) - log.info(" matched") + log.debug(" matched") def resolve_groups(self, events): for name, event in events.items(): @@ -233,7 +242,7 @@ class Test(object): self.run_cmd(tempdir); # load events expectation for the test - log.info(" loading result events"); + log.debug(" loading result events"); for f in glob.glob(tempdir + '/event*'): self.load_events(f, self.result); diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index f1485d8e6a0b..5bc3880f7be5 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -7,7 +7,7 @@ size=96 config=0 sample_period=4000 sample_type=263 -read_format=7 +read_format=0 disabled=1 inherit=1 pinned=0 diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group index a6599e9a19d3..57739cacdb2a 100644 --- a/tools/perf/tests/attr/test-record-group +++ b/tools/perf/tests/attr/test-record-group @@ -6,12 +6,14 @@ args = --group -e cycles,instructions kill >/dev/null 2>&1 fd=1 group_fd=-1 sample_type=327 +read_format=4 [event-2:base-record] fd=2 group_fd=1 config=1 sample_type=327 +read_format=4 mmap=0 comm=0 enable_on_exec=0 diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1 index 5a8359da38af..c5548d054aff 100644 --- a/tools/perf/tests/attr/test-record-group1 +++ b/tools/perf/tests/attr/test-record-group1 @@ -1,11 
+1,12 @@ [config] command = record -args = -e '{cycles,instructions}' kill >/tmp/krava 2>&1 +args = -e '{cycles,instructions}' kill >/dev/null 2>&1 [event-1:base-record] fd=1 group_fd=-1 sample_type=327 +read_format=4 [event-2:base-record] fd=2 @@ -13,6 +14,7 @@ group_fd=1 type=0 config=1 sample_type=327 +read_format=4 mmap=0 comm=0 enable_on_exec=0 diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 186f67535494..acb98e0e39f2 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -4,6 +4,7 @@ * Builtin regression testing command: ever growing number of sanity tests */ #include "builtin.h" +#include "intlist.h" #include "tests.h" #include "debug.h" #include "color.h" @@ -69,6 +70,14 @@ static struct test { .func = test__attr, }, { + .desc = "Test matching and linking mutliple hists", + .func = test__hists_link, + }, + { + .desc = "Try 'use perf' in python, checking link problems", + .func = test__python_use, + }, + { .func = NULL, }, }; @@ -97,7 +106,7 @@ static bool perf_test__matches(int curr, int argc, const char *argv[]) return false; } -static int __cmd_test(int argc, const char *argv[]) +static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) { int i = 0; int width = 0; @@ -118,13 +127,28 @@ static int __cmd_test(int argc, const char *argv[]) continue; pr_info("%2d: %-*s:", i, width, tests[curr].desc); + + if (intlist__find(skiplist, i)) { + color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip (user override)\n"); + continue; + } + pr_debug("\n--- start ---\n"); err = tests[curr].func(); pr_debug("---- end ----\n%s:", tests[curr].desc); - if (err) - color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); - else + + switch (err) { + case TEST_OK: pr_info(" Ok\n"); + break; + case TEST_SKIP: + color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip\n"); + break; + case TEST_FAIL: + default: + color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); + break; + } } return 0; @@ -152,11 +176,14 @@ 
int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused) "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]", NULL, }; + const char *skip = NULL; const struct option test_options[] = { + OPT_STRING('s', "skip", &skip, "tests", "tests to skip"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() }; + struct intlist *skiplist = NULL; argc = parse_options(argc, argv, test_options, test_usage, 0); if (argc >= 1 && !strcmp(argv[0], "list")) @@ -169,5 +196,8 @@ int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused) if (symbol__init() < 0) return -1; - return __cmd_test(argc, argv); + if (skip != NULL) + skiplist = intlist__new(skip); + + return __cmd_test(argc, argv, skiplist); } diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index e61fc828a158..0fd99a9adb91 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -22,7 +22,7 @@ static int perf_evsel__roundtrip_cache_name_test(void) for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { __perf_evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name)); - err = parse_events(evlist, name, 0); + err = parse_events(evlist, name); if (err) ret = err; } @@ -70,7 +70,7 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names) return -ENOMEM; for (i = 0; i < nr_names; ++i) { - err = parse_events(evlist, names[i], 0); + err = parse_events(evlist, names[i]); if (err) { pr_debug("failed to parse event '%s', err %d\n", names[i], err); diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c new file mode 100644 index 000000000000..0afd9223bde7 --- /dev/null +++ b/tools/perf/tests/hists_link.c @@ -0,0 +1,499 @@ +#include "perf.h" +#include "tests.h" +#include "debug.h" +#include "symbol.h" +#include "sort.h" +#include "evsel.h" +#include "evlist.h" +#include "machine.h" 
+#include "thread.h" +#include "parse-events.h" + +static struct { + u32 pid; + const char *comm; +} fake_threads[] = { + { 100, "perf" }, + { 200, "perf" }, + { 300, "bash" }, +}; + +static struct { + u32 pid; + u64 start; + const char *filename; +} fake_mmap_info[] = { + { 100, 0x40000, "perf" }, + { 100, 0x50000, "libc" }, + { 100, 0xf0000, "[kernel]" }, + { 200, 0x40000, "perf" }, + { 200, 0x50000, "libc" }, + { 200, 0xf0000, "[kernel]" }, + { 300, 0x40000, "bash" }, + { 300, 0x50000, "libc" }, + { 300, 0xf0000, "[kernel]" }, +}; + +struct fake_sym { + u64 start; + u64 length; + const char *name; +}; + +static struct fake_sym perf_syms[] = { + { 700, 100, "main" }, + { 800, 100, "run_command" }, + { 900, 100, "cmd_record" }, +}; + +static struct fake_sym bash_syms[] = { + { 700, 100, "main" }, + { 800, 100, "xmalloc" }, + { 900, 100, "xfree" }, +}; + +static struct fake_sym libc_syms[] = { + { 700, 100, "malloc" }, + { 800, 100, "free" }, + { 900, 100, "realloc" }, +}; + +static struct fake_sym kernel_syms[] = { + { 700, 100, "schedule" }, + { 800, 100, "page_fault" }, + { 900, 100, "sys_perf_event_open" }, +}; + +static struct { + const char *dso_name; + struct fake_sym *syms; + size_t nr_syms; +} fake_symbols[] = { + { "perf", perf_syms, ARRAY_SIZE(perf_syms) }, + { "bash", bash_syms, ARRAY_SIZE(bash_syms) }, + { "libc", libc_syms, ARRAY_SIZE(libc_syms) }, + { "[kernel]", kernel_syms, ARRAY_SIZE(kernel_syms) }, +}; + +static struct machine *setup_fake_machine(struct machines *machines) +{ + struct machine *machine = machines__find(machines, HOST_KERNEL_ID); + size_t i; + + if (machine == NULL) { + pr_debug("Not enough memory for machine setup\n"); + return NULL; + } + + for (i = 0; i < ARRAY_SIZE(fake_threads); i++) { + struct thread *thread; + + thread = machine__findnew_thread(machine, fake_threads[i].pid); + if (thread == NULL) + goto out; + + thread__set_comm(thread, fake_threads[i].comm); + } + + for (i = 0; i < ARRAY_SIZE(fake_mmap_info); i++) { + union 
perf_event fake_mmap_event = { + .mmap = { + .header = { .misc = PERF_RECORD_MISC_USER, }, + .pid = fake_mmap_info[i].pid, + .start = fake_mmap_info[i].start, + .len = 0x1000ULL, + .pgoff = 0ULL, + }, + }; + + strcpy(fake_mmap_event.mmap.filename, + fake_mmap_info[i].filename); + + machine__process_mmap_event(machine, &fake_mmap_event); + } + + for (i = 0; i < ARRAY_SIZE(fake_symbols); i++) { + size_t k; + struct dso *dso; + + dso = __dsos__findnew(&machine->user_dsos, + fake_symbols[i].dso_name); + if (dso == NULL) + goto out; + + /* emulate dso__load() */ + dso__set_loaded(dso, MAP__FUNCTION); + + for (k = 0; k < fake_symbols[i].nr_syms; k++) { + struct symbol *sym; + struct fake_sym *fsym = &fake_symbols[i].syms[k]; + + sym = symbol__new(fsym->start, fsym->length, + STB_GLOBAL, fsym->name); + if (sym == NULL) + goto out; + + symbols__insert(&dso->symbols[MAP__FUNCTION], sym); + } + } + + return machine; + +out: + pr_debug("Not enough memory for machine setup\n"); + machine__delete_threads(machine); + machine__delete(machine); + return NULL; +} + +struct sample { + u32 pid; + u64 ip; + struct thread *thread; + struct map *map; + struct symbol *sym; +}; + +static struct sample fake_common_samples[] = { + /* perf [kernel] schedule() */ + { .pid = 100, .ip = 0xf0000 + 700, }, + /* perf [perf] main() */ + { .pid = 200, .ip = 0x40000 + 700, }, + /* perf [perf] cmd_record() */ + { .pid = 200, .ip = 0x40000 + 900, }, + /* bash [bash] xmalloc() */ + { .pid = 300, .ip = 0x40000 + 800, }, + /* bash [libc] malloc() */ + { .pid = 300, .ip = 0x50000 + 700, }, +}; + +static struct sample fake_samples[][5] = { + { + /* perf [perf] run_command() */ + { .pid = 100, .ip = 0x40000 + 800, }, + /* perf [libc] malloc() */ + { .pid = 100, .ip = 0x50000 + 700, }, + /* perf [kernel] page_fault() */ + { .pid = 100, .ip = 0xf0000 + 800, }, + /* perf [kernel] sys_perf_event_open() */ + { .pid = 200, .ip = 0xf0000 + 900, }, + /* bash [libc] free() */ + { .pid = 300, .ip = 0x50000 + 800, }, + 
}, + { + /* perf [libc] free() */ + { .pid = 200, .ip = 0x50000 + 800, }, + /* bash [libc] malloc() */ + { .pid = 300, .ip = 0x50000 + 700, }, /* will be merged */ + /* bash [bash] xfee() */ + { .pid = 300, .ip = 0x40000 + 900, }, + /* bash [libc] realloc() */ + { .pid = 300, .ip = 0x50000 + 900, }, + /* bash [kernel] page_fault() */ + { .pid = 300, .ip = 0xf0000 + 800, }, + }, +}; + +static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine) +{ + struct perf_evsel *evsel; + struct addr_location al; + struct hist_entry *he; + struct perf_sample sample = { .cpu = 0, }; + size_t i = 0, k; + + /* + * each evsel will have 10 samples - 5 common and 5 distinct. + * However the second evsel also has a collapsed entry for + * "bash [libc] malloc" so total 9 entries will be in the tree. + */ + list_for_each_entry(evsel, &evlist->entries, node) { + for (k = 0; k < ARRAY_SIZE(fake_common_samples); k++) { + const union perf_event event = { + .ip = { + .header = { + .misc = PERF_RECORD_MISC_USER, + }, + .pid = fake_common_samples[k].pid, + .ip = fake_common_samples[k].ip, + }, + }; + + if (perf_event__preprocess_sample(&event, machine, &al, + &sample, 0) < 0) + goto out; + + he = __hists__add_entry(&evsel->hists, &al, NULL, 1); + if (he == NULL) + goto out; + + fake_common_samples[k].thread = al.thread; + fake_common_samples[k].map = al.map; + fake_common_samples[k].sym = al.sym; + } + + for (k = 0; k < ARRAY_SIZE(fake_samples[i]); k++) { + const union perf_event event = { + .ip = { + .header = { + .misc = PERF_RECORD_MISC_USER, + }, + .pid = fake_samples[i][k].pid, + .ip = fake_samples[i][k].ip, + }, + }; + + if (perf_event__preprocess_sample(&event, machine, &al, + &sample, 0) < 0) + goto out; + + he = __hists__add_entry(&evsel->hists, &al, NULL, 1); + if (he == NULL) + goto out; + + fake_samples[i][k].thread = al.thread; + fake_samples[i][k].map = al.map; + fake_samples[i][k].sym = al.sym; + } + i++; + } + + return 0; + +out: + pr_debug("Not enough 
memory for adding a hist entry\n"); + return -1; +} + +static int find_sample(struct sample *samples, size_t nr_samples, + struct thread *t, struct map *m, struct symbol *s) +{ + while (nr_samples--) { + if (samples->thread == t && samples->map == m && + samples->sym == s) + return 1; + samples++; + } + return 0; +} + +static int __validate_match(struct hists *hists) +{ + size_t count = 0; + struct rb_root *root; + struct rb_node *node; + + /* + * Only entries from fake_common_samples should have a pair. + */ + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + if (hist_entry__has_pairs(he)) { + if (find_sample(fake_common_samples, + ARRAY_SIZE(fake_common_samples), + he->thread, he->ms.map, he->ms.sym)) { + count++; + } else { + pr_debug("Can't find the matched entry\n"); + return -1; + } + } + + node = rb_next(node); + } + + if (count != ARRAY_SIZE(fake_common_samples)) { + pr_debug("Invalid count for matched entries: %zd of %zd\n", + count, ARRAY_SIZE(fake_common_samples)); + return -1; + } + + return 0; +} + +static int validate_match(struct hists *leader, struct hists *other) +{ + return __validate_match(leader) || __validate_match(other); +} + +static int __validate_link(struct hists *hists, int idx) +{ + size_t count = 0; + size_t count_pair = 0; + size_t count_dummy = 0; + struct rb_root *root; + struct rb_node *node; + + /* + * Leader hists (idx = 0) will have dummy entries from other, + * and some entries will have no pair. However every entry + * in other hists should have (dummy) pair. 
+ */ + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + if (hist_entry__has_pairs(he)) { + if (!find_sample(fake_common_samples, + ARRAY_SIZE(fake_common_samples), + he->thread, he->ms.map, he->ms.sym) && + !find_sample(fake_samples[idx], + ARRAY_SIZE(fake_samples[idx]), + he->thread, he->ms.map, he->ms.sym)) { + count_dummy++; + } + count_pair++; + } else if (idx) { + pr_debug("A entry from the other hists should have pair\n"); + return -1; + } + + count++; + node = rb_next(node); + } + + /* + * Note that we have a entry collapsed in the other (idx = 1) hists. + */ + if (idx == 0) { + if (count_dummy != ARRAY_SIZE(fake_samples[1]) - 1) { + pr_debug("Invalid count of dummy entries: %zd of %zd\n", + count_dummy, ARRAY_SIZE(fake_samples[1]) - 1); + return -1; + } + if (count != count_pair + ARRAY_SIZE(fake_samples[0])) { + pr_debug("Invalid count of total leader entries: %zd of %zd\n", + count, count_pair + ARRAY_SIZE(fake_samples[0])); + return -1; + } + } else { + if (count != count_pair) { + pr_debug("Invalid count of total other entries: %zd of %zd\n", + count, count_pair); + return -1; + } + if (count_dummy > 0) { + pr_debug("Other hists should not have dummy entries: %zd\n", + count_dummy); + return -1; + } + } + + return 0; +} + +static int validate_link(struct hists *leader, struct hists *other) +{ + return __validate_link(leader, 0) || __validate_link(other, 1); +} + +static void print_hists(struct hists *hists) +{ + int i = 0; + struct rb_root *root; + struct rb_node *node; + + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + pr_info("----- %s --------\n", __func__); + node = rb_first(root); + while (node) { + struct hist_entry *he; + + he = rb_entry(node, struct hist_entry, rb_node_in); + + pr_info("%2d: entry: %-8s [%-8s] %20s: 
period = %"PRIu64"\n", + i, he->thread->comm, he->ms.map->dso->short_name, + he->ms.sym->name, he->stat.period); + + i++; + node = rb_next(node); + } +} + +int test__hists_link(void) +{ + int err = -1; + struct machines machines; + struct machine *machine = NULL; + struct perf_evsel *evsel, *first; + struct perf_evlist *evlist = perf_evlist__new(NULL, NULL); + + if (evlist == NULL) + return -ENOMEM; + + err = parse_events(evlist, "cpu-clock"); + if (err) + goto out; + err = parse_events(evlist, "task-clock"); + if (err) + goto out; + + /* default sort order (comm,dso,sym) will be used */ + setup_sorting(NULL, NULL); + + machines__init(&machines); + + /* setup threads/dso/map/symbols also */ + machine = setup_fake_machine(&machines); + if (!machine) + goto out; + + if (verbose > 1) + machine__fprintf(machine, stderr); + + /* process sample events */ + err = add_hist_entries(evlist, machine); + if (err < 0) + goto out; + + list_for_each_entry(evsel, &evlist->entries, node) { + hists__collapse_resort(&evsel->hists); + + if (verbose > 2) + print_hists(&evsel->hists); + } + + first = perf_evlist__first(evlist); + evsel = perf_evlist__last(evlist); + + /* match common entries */ + hists__match(&first->hists, &evsel->hists); + err = validate_match(&first->hists, &evsel->hists); + if (err) + goto out; + + /* link common and/or dummy entries */ + hists__link(&first->hists, &evsel->hists); + err = validate_link(&first->hists, &evsel->hists); + if (err) + goto out; + + err = 0; + +out: + /* tear down everything */ + perf_evlist__delete(evlist); + machines__exit(&machines); + + return err; +} diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index e1746811e14b..cdd50755af51 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -22,36 +22,16 @@ int test__basic_mmap(void) struct thread_map *threads; struct cpu_map *cpus; struct perf_evlist *evlist; - struct perf_event_attr attr = { - .type = PERF_TYPE_TRACEPOINT, - 
.read_format = PERF_FORMAT_ID, - .sample_type = PERF_SAMPLE_ID, - .watermark = 0, - }; cpu_set_t cpu_set; const char *syscall_names[] = { "getsid", "getppid", "getpgrp", "getpgid", }; pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp, (void*)getpgid }; #define nsyscalls ARRAY_SIZE(syscall_names) - int ids[nsyscalls]; unsigned int nr_events[nsyscalls], expected_nr_events[nsyscalls], i, j; struct perf_evsel *evsels[nsyscalls], *evsel; - for (i = 0; i < nsyscalls; ++i) { - char name[64]; - - snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); - ids[i] = trace_event__id(name); - if (ids[i] < 0) { - pr_debug("Is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - nr_events[i] = 0; - expected_nr_events[i] = random() % 257; - } - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); @@ -79,18 +59,19 @@ int test__basic_mmap(void) goto out_free_cpus; } - /* anonymous union fields, can't be initialized above */ - attr.wakeup_events = 1; - attr.sample_period = 1; - for (i = 0; i < nsyscalls; ++i) { - attr.config = ids[i]; - evsels[i] = perf_evsel__new(&attr, i); + char name[64]; + + snprintf(name, sizeof(name), "sys_enter_%s", syscall_names[i]); + evsels[i] = perf_evsel__newtp("syscalls", name, i); if (evsels[i] == NULL) { pr_debug("perf_evsel__new\n"); goto out_free_evlist; } + evsels[i]->attr.wakeup_events = 1; + perf_evsel__set_sample_id(evsels[i]); + perf_evlist__add(evlist, evsels[i]); if (perf_evsel__open(evsels[i], cpus, threads) < 0) { @@ -99,6 +80,9 @@ int test__basic_mmap(void) strerror(errno)); goto out_close_fd; } + + nr_events[i] = 0; + expected_nr_events[i] = 1 + rand() % 127; } if (perf_evlist__mmap(evlist, 128, true) < 0) { @@ -128,6 +112,7 @@ int test__basic_mmap(void) goto out_munmap; } + err = -1; evsel = perf_evlist__id2evsel(evlist, sample.id); if (evsel == NULL) { pr_debug("event with id %" PRIu64 @@ -137,16 +122,17 @@ int test__basic_mmap(void) nr_events[evsel->idx]++; 
} + err = 0; list_for_each_entry(evsel, &evlist->entries, node) { if (nr_events[evsel->idx] != expected_nr_events[evsel->idx]) { pr_debug("expected %d %s events, got %d\n", expected_nr_events[evsel->idx], perf_evsel__name(evsel), nr_events[evsel->idx]); + err = -1; goto out_munmap; } } - err = 0; out_munmap: perf_evlist__munmap(evlist); out_close_fd: diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c index 31072aba0d54..9b920a0cce79 100644 --- a/tools/perf/tests/open-syscall-all-cpus.c +++ b/tools/perf/tests/open-syscall-all-cpus.c @@ -7,20 +7,12 @@ int test__open_syscall_event_on_all_cpus(void) { int err = -1, fd, cpu; - struct thread_map *threads; struct cpu_map *cpus; struct perf_evsel *evsel; - struct perf_event_attr attr; unsigned int nr_open_calls = 111, i; cpu_set_t cpu_set; - int id = trace_event__id("sys_enter_open"); + struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX); - if (id < 0) { - pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); return -1; @@ -32,15 +24,11 @@ int test__open_syscall_event_on_all_cpus(void) goto out_thread_map_delete; } - CPU_ZERO(&cpu_set); - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_TRACEPOINT; - attr.config = id; - evsel = perf_evsel__new(&attr, 0); + evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0); if (evsel == NULL) { - pr_debug("perf_evsel__new\n"); + pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); goto out_thread_map_delete; } diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c index 98be8b518b4f..befc0671f95d 100644 --- a/tools/perf/tests/open-syscall.c +++ b/tools/perf/tests/open-syscall.c @@ -6,29 +6,18 @@ int test__open_syscall_event(void) { int err = -1, fd; - struct thread_map *threads; struct perf_evsel *evsel; - struct perf_event_attr attr; unsigned int nr_open_calls 
= 111, i; - int id = trace_event__id("sys_enter_open"); + struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX); - if (id < 0) { - pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); - return -1; - } - - threads = thread_map__new(-1, getpid(), UINT_MAX); if (threads == NULL) { pr_debug("thread_map__new\n"); return -1; } - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_TRACEPOINT; - attr.config = id; - evsel = perf_evsel__new(&attr, 0); + evsel = perf_evsel__newtp("syscalls", "sys_enter_open", 0); if (evsel == NULL) { - pr_debug("perf_evsel__new\n"); + pr_debug("is debugfs mounted on /sys/kernel/debug?\n"); goto out_thread_map_delete; } diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 32ee478905eb..20acaff295d2 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -3,6 +3,7 @@ #include "evsel.h" #include "evlist.h" #include "sysfs.h" +#include "debugfs.h" #include "tests.h" #include <linux/hw_breakpoint.h> @@ -463,10 +464,10 @@ static int test__checkevent_pmu_events(struct perf_evlist *evlist) static int test__checkterms_simple(struct list_head *terms) { - struct parse_events__term *term; + struct parse_events_term *term; /* config=10 */ - term = list_entry(terms->next, struct parse_events__term, list); + term = list_entry(terms->next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG); TEST_ASSERT_VAL("wrong type val", @@ -475,7 +476,7 @@ static int test__checkterms_simple(struct list_head *terms) TEST_ASSERT_VAL("wrong config", !term->config); /* config1 */ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG1); TEST_ASSERT_VAL("wrong type val", @@ -484,7 +485,7 @@ static int test__checkterms_simple(struct list_head *terms) 
TEST_ASSERT_VAL("wrong config", !term->config); /* config2=3 */ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_CONFIG2); TEST_ASSERT_VAL("wrong type val", @@ -493,7 +494,7 @@ static int test__checkterms_simple(struct list_head *terms) TEST_ASSERT_VAL("wrong config", !term->config); /* umask=1*/ - term = list_entry(term->list.next, struct parse_events__term, list); + term = list_entry(term->list.next, struct parse_events_term, list); TEST_ASSERT_VAL("wrong type term", term->type_term == PARSE_EVENTS__TERM_TYPE_USER); TEST_ASSERT_VAL("wrong type val", @@ -521,7 +522,7 @@ static int test__group1(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* cycles:upp */ evsel = perf_evsel__next(evsel); @@ -557,7 +558,7 @@ static int test__group2(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* cache-references + :u modifier */ evsel = perf_evsel__next(evsel); @@ -583,7 +584,7 @@ static int test__group2(struct perf_evlist *evlist) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", 
!perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } @@ -606,7 +607,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group1")); @@ -636,7 +637,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); TEST_ASSERT_VAL("wrong group name", !strcmp(leader->group_name, "group2")); @@ -663,7 +664,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } @@ -687,7 +688,7 @@ static int test__group4(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip == 1); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions:kp 
+ p */ evsel = perf_evsel__next(evsel); @@ -724,7 +725,7 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions + G */ evsel = perf_evsel__next(evsel); @@ -751,7 +752,7 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); TEST_ASSERT_VAL("wrong group name", !evsel->group_name); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); /* instructions:G */ evsel = perf_evsel__next(evsel); @@ -777,18 +778,75 @@ static int test__group5(struct perf_evlist *evlist __maybe_unused) TEST_ASSERT_VAL("wrong exclude guest", evsel->attr.exclude_guest); TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); - TEST_ASSERT_VAL("wrong leader", !perf_evsel__is_group_member(evsel)); + TEST_ASSERT_VAL("wrong leader", perf_evsel__is_group_leader(evsel)); return 0; } -struct test__event_st { +static int count_tracepoints(void) +{ + char events_path[PATH_MAX]; + struct dirent *events_ent; + DIR *events_dir; + int cnt = 0; + + scnprintf(events_path, PATH_MAX, "%s/tracing/events", + debugfs_find_mountpoint()); + + events_dir = opendir(events_path); + + TEST_ASSERT_VAL("Can't open events dir", events_dir); + + while ((events_ent = readdir(events_dir))) { + char sys_path[PATH_MAX]; + struct dirent *sys_ent; + DIR *sys_dir; + + if (!strcmp(events_ent->d_name, ".") + || !strcmp(events_ent->d_name, "..") + || !strcmp(events_ent->d_name, "enable") 
+ || !strcmp(events_ent->d_name, "header_event") + || !strcmp(events_ent->d_name, "header_page")) + continue; + + scnprintf(sys_path, PATH_MAX, "%s/%s", + events_path, events_ent->d_name); + + sys_dir = opendir(sys_path); + TEST_ASSERT_VAL("Can't open sys dir", sys_dir); + + while ((sys_ent = readdir(sys_dir))) { + if (!strcmp(sys_ent->d_name, ".") + || !strcmp(sys_ent->d_name, "..") + || !strcmp(sys_ent->d_name, "enable") + || !strcmp(sys_ent->d_name, "filter")) + continue; + + cnt++; + } + + closedir(sys_dir); + } + + closedir(events_dir); + return cnt; +} + +static int test__all_tracepoints(struct perf_evlist *evlist) +{ + TEST_ASSERT_VAL("wrong events count", + count_tracepoints() == evlist->nr_entries); + + return test__checkevent_tracepoint_multi(evlist); +} + +struct evlist_test { const char *name; __u32 type; int (*check)(struct perf_evlist *evlist); }; -static struct test__event_st test__events[] = { +static struct evlist_test test__events[] = { [0] = { .name = "syscalls:sys_enter_open", .check = test__checkevent_tracepoint, @@ -921,9 +979,13 @@ static struct test__event_st test__events[] = { .name = "{cycles,instructions}:G,{cycles:G,instructions:G},cycles", .check = test__group5, }, + [33] = { + .name = "*:*", + .check = test__all_tracepoints, + }, }; -static struct test__event_st test__events_pmu[] = { +static struct evlist_test test__events_pmu[] = { [0] = { .name = "cpu/config=10,config1,config2=3,period=1000/u", .check = test__checkevent_pmu, @@ -934,20 +996,20 @@ static struct test__event_st test__events_pmu[] = { }, }; -struct test__term { +struct terms_test { const char *str; __u32 type; int (*check)(struct list_head *terms); }; -static struct test__term test__terms[] = { +static struct terms_test test__terms[] = { [0] = { .str = "config=10,config1,config2=3,umask=1", .check = test__checkterms_simple, }, }; -static int test_event(struct test__event_st *e) +static int test_event(struct evlist_test *e) { struct perf_evlist *evlist; int ret; @@ 
-956,7 +1018,7 @@ static int test_event(struct test__event_st *e) if (evlist == NULL) return -ENOMEM; - ret = parse_events(evlist, e->name, 0); + ret = parse_events(evlist, e->name); if (ret) { pr_debug("failed to parse event '%s', err %d\n", e->name, ret); @@ -969,13 +1031,13 @@ static int test_event(struct test__event_st *e) return ret; } -static int test_events(struct test__event_st *events, unsigned cnt) +static int test_events(struct evlist_test *events, unsigned cnt) { int ret1, ret2 = 0; unsigned i; for (i = 0; i < cnt; i++) { - struct test__event_st *e = &events[i]; + struct evlist_test *e = &events[i]; pr_debug("running test %d '%s'\n", i, e->name); ret1 = test_event(e); @@ -986,7 +1048,7 @@ static int test_events(struct test__event_st *events, unsigned cnt) return ret2; } -static int test_term(struct test__term *t) +static int test_term(struct terms_test *t) { struct list_head *terms; int ret; @@ -1010,13 +1072,13 @@ static int test_term(struct test__term *t) return ret; } -static int test_terms(struct test__term *terms, unsigned cnt) +static int test_terms(struct terms_test *terms, unsigned cnt) { int ret = 0; unsigned i; for (i = 0; i < cnt; i++) { - struct test__term *t = &terms[i]; + struct terms_test *t = &terms[i]; pr_debug("running test %d '%s'\n", i, t->str); ret = test_term(t); @@ -1067,7 +1129,7 @@ static int test_pmu_events(void) while (!ret && (ent = readdir(dir))) { #define MAX_NAME 100 - struct test__event_st e; + struct evlist_test e; char name[MAX_NAME]; if (!strcmp(ent->d_name, ".") || diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 70e0d4421df8..6ea66cf6791b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -103,10 +103,10 @@ int test__PERF_RECORD(void) * Config the evsels, setting attr->comm on the first one, etc. 
*/ evsel = perf_evlist__first(evlist); - evsel->attr.sample_type |= PERF_SAMPLE_CPU; - evsel->attr.sample_type |= PERF_SAMPLE_TID; - evsel->attr.sample_type |= PERF_SAMPLE_TIME; - perf_evlist__config_attrs(evlist, &opts); + perf_evsel__set_sample_bit(evsel, CPU); + perf_evsel__set_sample_bit(evsel, TID); + perf_evsel__set_sample_bit(evsel, TIME); + perf_evlist__config(evlist, &opts); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); if (err < 0) { diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index a5f379863b8f..12b322fa3475 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -19,10 +19,8 @@ static struct test_format { { "krava23", "config2:28-29,38\n", }, }; -#define TEST_FORMATS_CNT (sizeof(test_formats) / sizeof(struct test_format)) - /* Simulated users input. */ -static struct parse_events__term test_terms[] = { +static struct parse_events_term test_terms[] = { { .config = (char *) "krava01", .val.num = 15, @@ -78,7 +76,6 @@ static struct parse_events__term test_terms[] = { .type_term = PARSE_EVENTS__TERM_TYPE_USER, }, }; -#define TERMS_CNT (sizeof(test_terms) / sizeof(struct parse_events__term)) /* * Prepare format directory data, exported by kernel @@ -93,7 +90,7 @@ static char *test_format_dir_get(void) if (!mkdtemp(dir)) return NULL; - for (i = 0; i < TEST_FORMATS_CNT; i++) { + for (i = 0; i < ARRAY_SIZE(test_formats); i++) { static char name[PATH_MAX]; struct test_format *format = &test_formats[i]; FILE *file; @@ -130,14 +127,12 @@ static struct list_head *test_terms_list(void) static LIST_HEAD(terms); unsigned int i; - for (i = 0; i < TERMS_CNT; i++) + for (i = 0; i < ARRAY_SIZE(test_terms); i++) list_add_tail(&test_terms[i].list, &terms); return &terms; } -#undef TERMS_CNT - int test__pmu(void) { char *format = test_format_dir_get(); diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c new file mode 100644 index 000000000000..7760277c6def --- /dev/null +++ 
b/tools/perf/tests/python-use.c @@ -0,0 +1,23 @@ +/* + * Just test if we can load the python binding. + */ + +#include <stdio.h> +#include <stdlib.h> +#include "tests.h" + +extern int verbose; + +int test__python_use(void) +{ + char *cmd; + int ret; + + if (asprintf(&cmd, "echo \"import sys ; sys.path.append('%s'); import perf\" | %s %s", + PYTHONPATH, PYTHON, verbose ? "" : "2> /dev/null") < 0) + return -1; + + ret = system(cmd) ? -1 : 0; + free(cmd); + return ret; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index fc121edab016..5de0be1ff4b6 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -1,6 +1,12 @@ #ifndef TESTS_H #define TESTS_H +enum { + TEST_OK = 0, + TEST_FAIL = -1, + TEST_SKIP = -2, +}; + /* Tests */ int test__vmlinux_matches_kallsyms(void); int test__open_syscall_event(void); @@ -15,8 +21,7 @@ int test__pmu(void); int test__attr(void); int test__dso_data(void); int test__parse_events(void); - -/* Util */ -int trace_event__id(const char *evname); +int test__hists_link(void); +int test__python_use(void); #endif /* TESTS_H */ diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c deleted file mode 100644 index 748f2e8f6961..000000000000 --- a/tools/perf/tests/util.c +++ /dev/null @@ -1,30 +0,0 @@ -#include <stdio.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include "tests.h" -#include "debugfs.h" - -int trace_event__id(const char *evname) -{ - char *filename; - int err = -1, fd; - - if (asprintf(&filename, - "%s/syscalls/%s/id", - tracing_events_path, evname) < 0) - return -1; - - fd = open(filename, O_RDONLY); - if (fd >= 0) { - char id[16]; - if (read(fd, id, sizeof(id)) > 0) - err = atoi(id); - close(fd); - } - - free(filename); - return err; -} diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 0d1cdbee2f59..a1a8442829b4 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ 
b/tools/perf/tests/vmlinux-kallsyms.c @@ -101,7 +101,8 @@ int test__vmlinux_matches_kallsyms(void) */ if (machine__load_vmlinux_path(&vmlinux, type, vmlinux_matches_kallsyms_filter) <= 0) { - pr_debug("machine__load_vmlinux_path "); + pr_debug("Couldn't find a vmlinux that matches the kernel running on this machine, skipping test\n"); + err = TEST_SKIP; goto out; } diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index 4aeb7d5df939..588bcb2d008b 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -471,7 +471,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *browser) return row; } -static struct ui_browser__colorset { +static struct ui_browser_colorset { const char *name, *fg, *bg; int colorset; } ui_browser__colorsets[] = { @@ -706,7 +706,7 @@ void ui_browser__init(void) perf_config(ui_browser__color_config, NULL); while (ui_browser__colorsets[i].name) { - struct ui_browser__colorset *c = &ui_browser__colorsets[i++]; + struct ui_browser_colorset *c = &ui_browser__colorsets[i++]; sltt_set_color(c->colorset, c->name, c->fg, c->bg); } diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 5dab3ca96980..7dca1555c610 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -182,6 +182,16 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int ab->selection = dl; } +static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sym) +{ + if (!dl || !dl->ins || !ins__is_jump(dl->ins) + || !disasm_line__has_offset(dl) + || dl->ops.target.offset >= symbol__size(sym)) + return false; + + return true; +} + static void annotate_browser__draw_current_jump(struct ui_browser *browser) { struct annotate_browser *ab = container_of(browser, struct annotate_browser, b); @@ -195,8 +205,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) if (strstr(sym->name, "@plt")) return; - if (!cursor || 
!cursor->ins || !ins__is_jump(cursor->ins) || - !disasm_line__has_offset(cursor)) + if (!disasm_line__is_valid_jump(cursor, sym)) return; target = ab->offsets[cursor->ops.target.offset]; @@ -788,17 +797,9 @@ static void annotate_browser__mark_jump_targets(struct annotate_browser *browser struct disasm_line *dl = browser->offsets[offset], *dlt; struct browser_disasm_line *bdlt; - if (!dl || !dl->ins || !ins__is_jump(dl->ins) || - !disasm_line__has_offset(dl)) + if (!disasm_line__is_valid_jump(dl, sym)) continue; - if (dl->ops.target.offset >= size) { - ui__error("jump to after symbol!\n" - "size: %zx, jump target: %" PRIx64, - size, dl->ops.target.offset); - continue; - } - dlt = browser->offsets[dl->ops.target.offset]; /* * FIXME: Oops, no jump target? Buggy disassembler? Or do we @@ -921,11 +922,11 @@ out_free_offsets: #define ANNOTATE_CFG(n) \ { .name = #n, .value = &annotate_browser__opts.n, } - + /* * Keep the entries sorted, they are bsearch'ed */ -static struct annotate__config { +static struct annotate_config { const char *name; bool *value; } annotate__configs[] = { @@ -939,7 +940,7 @@ static struct annotate__config { static int annotate_config__cmp(const void *name, const void *cfgp) { - const struct annotate__config *cfg = cfgp; + const struct annotate_config *cfg = cfgp; return strcmp(name, cfg->name); } @@ -947,7 +948,7 @@ static int annotate_config__cmp(const void *name, const void *cfgp) static int annotate__config(const char *var, const char *value, void *data __maybe_unused) { - struct annotate__config *cfg; + struct annotate_config *cfg; const char *name; if (prefixcmp(var, "annotate.") != 0) @@ -955,7 +956,7 @@ static int annotate__config(const char *var, const char *value, name = var + 9; cfg = bsearch(name, annotate__configs, ARRAY_SIZE(annotate__configs), - sizeof(struct annotate__config), annotate_config__cmp); + sizeof(struct annotate_config), annotate_config__cmp); if (cfg == NULL) return -1; diff --git a/tools/perf/ui/browsers/hists.c 
b/tools/perf/ui/browsers/hists.c index ccc4bd161420..57b82c26cd05 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -587,6 +587,8 @@ HPP__COLOR_FN(overhead_guest_us, period_guest_us) void hist_browser__init_hpp(void) { + perf_hpp__column_enable(PERF_HPP__OVERHEAD); + perf_hpp__init(); perf_hpp__format[PERF_HPP__OVERHEAD].color = @@ -607,12 +609,13 @@ static int hist_browser__show_entry(struct hist_browser *browser, { char s[256]; double percent; - int i, printed = 0; + int printed = 0; int width = browser->b.width; char folded_sign = ' '; bool current_entry = ui_browser__is_current_entry(&browser->b, row); off_t row_offset = entry->row_offset; bool first = true; + struct perf_hpp_fmt *fmt; if (current_entry) { browser->he_selection = entry; @@ -629,12 +632,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, .buf = s, .size = sizeof(s), }; + int i = 0; ui_browser__gotorc(&browser->b, row, 0); - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; + perf_hpp__for_each_format(fmt) { if (!first) { slsmg_printf(" "); @@ -642,14 +644,14 @@ static int hist_browser__show_entry(struct hist_browser *browser, } first = false; - if (perf_hpp__format[i].color) { + if (fmt->color) { hpp.ptr = &percent; /* It will set percent for us. See HPP__COLOR_FN above. 
*/ - width -= perf_hpp__format[i].color(&hpp, entry); + width -= fmt->color(&hpp, entry); ui_browser__set_percent_color(&browser->b, percent, current_entry); - if (i == PERF_HPP__OVERHEAD && symbol_conf.use_callchain) { + if (!i && symbol_conf.use_callchain) { slsmg_printf("%c ", folded_sign); width -= 2; } @@ -659,9 +661,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, if (!current_entry || !browser->b.navkeypressed) ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL); } else { - width -= perf_hpp__format[i].entry(&hpp, entry); + width -= fmt->entry(&hpp, entry); slsmg_printf("%s", s); } + + i++; } /* The scroll bar isn't being used */ diff --git a/tools/perf/ui/gtk/browser.c b/tools/perf/ui/gtk/browser.c index 253b6219a39e..c95012cdb438 100644 --- a/tools/perf/ui/gtk/browser.c +++ b/tools/perf/ui/gtk/browser.c @@ -8,15 +8,13 @@ #include <signal.h> -#define MAX_COLUMNS 32 - -static void perf_gtk__signal(int sig) +void perf_gtk__signal(int sig) { perf_gtk__exit(false); psignal(sig, "perf"); } -static void perf_gtk__resize_window(GtkWidget *window) +void perf_gtk__resize_window(GtkWidget *window) { GdkRectangle rect; GdkScreen *screen; @@ -36,7 +34,7 @@ static void perf_gtk__resize_window(GtkWidget *window) gtk_window_resize(GTK_WINDOW(window), width, height); } -static const char *perf_gtk__get_percent_color(double percent) +const char *perf_gtk__get_percent_color(double percent) { if (percent >= MIN_RED) return "<span fgcolor='red'>"; @@ -45,155 +43,8 @@ static const char *perf_gtk__get_percent_color(double percent) return NULL; } -#define HPP__COLOR_FN(_name, _field) \ -static int perf_gtk__hpp_color_ ## _name(struct perf_hpp *hpp, \ - struct hist_entry *he) \ -{ \ - struct hists *hists = he->hists; \ - double percent = 100.0 * he->stat._field / hists->stats.total_period; \ - const char *markup; \ - int ret = 0; \ - \ - markup = perf_gtk__get_percent_color(percent); \ - if (markup) \ - ret += scnprintf(hpp->buf, hpp->size, "%s", markup); 
\ - ret += scnprintf(hpp->buf + ret, hpp->size - ret, "%6.2f%%", percent); \ - if (markup) \ - ret += scnprintf(hpp->buf + ret, hpp->size - ret, "</span>"); \ - \ - return ret; \ -} - -HPP__COLOR_FN(overhead, period) -HPP__COLOR_FN(overhead_sys, period_sys) -HPP__COLOR_FN(overhead_us, period_us) -HPP__COLOR_FN(overhead_guest_sys, period_guest_sys) -HPP__COLOR_FN(overhead_guest_us, period_guest_us) - -#undef HPP__COLOR_FN - -void perf_gtk__init_hpp(void) -{ - perf_hpp__init(); - - perf_hpp__format[PERF_HPP__OVERHEAD].color = - perf_gtk__hpp_color_overhead; - perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color = - perf_gtk__hpp_color_overhead_sys; - perf_hpp__format[PERF_HPP__OVERHEAD_US].color = - perf_gtk__hpp_color_overhead_us; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color = - perf_gtk__hpp_color_overhead_guest_sys; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color = - perf_gtk__hpp_color_overhead_guest_us; -} - -static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) -{ - GType col_types[MAX_COLUMNS]; - GtkCellRenderer *renderer; - struct sort_entry *se; - GtkListStore *store; - struct rb_node *nd; - GtkWidget *view; - int i, col_idx; - int nr_cols; - char s[512]; - - struct perf_hpp hpp = { - .buf = s, - .size = sizeof(s), - }; - - nr_cols = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - col_types[nr_cols++] = G_TYPE_STRING; - } - - list_for_each_entry(se, &hist_entry__sort_list, list) { - if (se->elide) - continue; - - col_types[nr_cols++] = G_TYPE_STRING; - } - - store = gtk_list_store_newv(nr_cols, col_types); - - view = gtk_tree_view_new(); - - renderer = gtk_cell_renderer_text_new(); - - col_idx = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - perf_hpp__format[i].header(&hpp); - - gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), - -1, s, - renderer, "markup", - col_idx++, NULL); - } - - list_for_each_entry(se, 
&hist_entry__sort_list, list) { - if (se->elide) - continue; - - gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), - -1, se->se_header, - renderer, "text", - col_idx++, NULL); - } - - gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store)); - - g_object_unref(GTK_TREE_MODEL(store)); - - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { - struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - GtkTreeIter iter; - - if (h->filtered) - continue; - - gtk_list_store_append(store, &iter); - - col_idx = 0; - - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; - - if (perf_hpp__format[i].color) - perf_hpp__format[i].color(&hpp, h); - else - perf_hpp__format[i].entry(&hpp, h); - - gtk_list_store_set(store, &iter, col_idx++, s, -1); - } - - list_for_each_entry(se, &hist_entry__sort_list, list) { - if (se->elide) - continue; - - se->se_snprintf(h, s, ARRAY_SIZE(s), - hists__col_len(hists, se->se_width_idx)); - - gtk_list_store_set(store, &iter, col_idx++, s, -1); - } - } - - gtk_container_add(GTK_CONTAINER(window), view); -} - #ifdef HAVE_GTK_INFO_BAR -static GtkWidget *perf_gtk__setup_info_bar(void) +GtkWidget *perf_gtk__setup_info_bar(void) { GtkWidget *info_bar; GtkWidget *label; @@ -220,7 +71,7 @@ static GtkWidget *perf_gtk__setup_info_bar(void) } #endif -static GtkWidget *perf_gtk__setup_statusbar(void) +GtkWidget *perf_gtk__setup_statusbar(void) { GtkWidget *stbar; unsigned ctxid; @@ -234,79 +85,3 @@ static GtkWidget *perf_gtk__setup_statusbar(void) return stbar; } - -int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, - const char *help, - struct hist_browser_timer *hbt __maybe_unused) -{ - struct perf_evsel *pos; - GtkWidget *vbox; - GtkWidget *notebook; - GtkWidget *info_bar; - GtkWidget *statbar; - GtkWidget *window; - - signal(SIGSEGV, perf_gtk__signal); - signal(SIGFPE, perf_gtk__signal); - signal(SIGINT, perf_gtk__signal); - signal(SIGQUIT, perf_gtk__signal); - 
signal(SIGTERM, perf_gtk__signal); - - window = gtk_window_new(GTK_WINDOW_TOPLEVEL); - - gtk_window_set_title(GTK_WINDOW(window), "perf report"); - - g_signal_connect(window, "delete_event", gtk_main_quit, NULL); - - pgctx = perf_gtk__activate_context(window); - if (!pgctx) - return -1; - - vbox = gtk_vbox_new(FALSE, 0); - - notebook = gtk_notebook_new(); - - list_for_each_entry(pos, &evlist->entries, node) { - struct hists *hists = &pos->hists; - const char *evname = perf_evsel__name(pos); - GtkWidget *scrolled_window; - GtkWidget *tab_label; - - scrolled_window = gtk_scrolled_window_new(NULL, NULL); - - gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), - GTK_POLICY_AUTOMATIC, - GTK_POLICY_AUTOMATIC); - - perf_gtk__show_hists(scrolled_window, hists); - - tab_label = gtk_label_new(evname); - - gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label); - } - - gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); - - info_bar = perf_gtk__setup_info_bar(); - if (info_bar) - gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0); - - statbar = perf_gtk__setup_statusbar(); - gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); - - gtk_container_add(GTK_CONTAINER(window), vbox); - - gtk_widget_show_all(window); - - perf_gtk__resize_window(window); - - gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER); - - ui_helpline__push(help); - - gtk_main(); - - perf_gtk__deactivate_context(&pgctx); - - return 0; -} diff --git a/tools/perf/ui/gtk/gtk.h b/tools/perf/ui/gtk/gtk.h index 856320e2cc05..5d3693754828 100644 --- a/tools/perf/ui/gtk/gtk.h +++ b/tools/perf/ui/gtk/gtk.h @@ -33,7 +33,14 @@ void perf_gtk__init_helpline(void); void perf_gtk__init_progress(void); void perf_gtk__init_hpp(void); -#ifndef HAVE_GTK_INFO_BAR +void perf_gtk__signal(int sig); +void perf_gtk__resize_window(GtkWidget *window); +const char *perf_gtk__get_percent_color(double percent); +GtkWidget *perf_gtk__setup_statusbar(void); + 
+#ifdef HAVE_GTK_INFO_BAR +GtkWidget *perf_gtk__setup_info_bar(void); +#else static inline GtkWidget *perf_gtk__setup_info_bar(void) { return NULL; diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c index 5db4432ff12a..3388cbd12186 100644 --- a/tools/perf/ui/gtk/helpline.c +++ b/tools/perf/ui/gtk/helpline.c @@ -24,17 +24,7 @@ static void gtk_helpline_push(const char *msg) pgctx->statbar_ctx_id, msg); } -static struct ui_helpline gtk_helpline_fns = { - .pop = gtk_helpline_pop, - .push = gtk_helpline_push, -}; - -void perf_gtk__init_helpline(void) -{ - helpline_fns = >k_helpline_fns; -} - -int perf_gtk__show_helpline(const char *fmt, va_list ap) +static int gtk_helpline_show(const char *fmt, va_list ap) { int ret; char *ptr; @@ -54,3 +44,14 @@ int perf_gtk__show_helpline(const char *fmt, va_list ap) return ret; } + +static struct ui_helpline gtk_helpline_fns = { + .pop = gtk_helpline_pop, + .push = gtk_helpline_push, + .show = gtk_helpline_show, +}; + +void perf_gtk__init_helpline(void) +{ + helpline_fns = >k_helpline_fns; +} diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c new file mode 100644 index 000000000000..c03da79d524f --- /dev/null +++ b/tools/perf/ui/gtk/hists.c @@ -0,0 +1,226 @@ +#include "../evlist.h" +#include "../cache.h" +#include "../evsel.h" +#include "../sort.h" +#include "../hist.h" +#include "../helpline.h" +#include "gtk.h" + +#define MAX_COLUMNS 32 + +#define HPP__COLOR_FN(_name, _field) \ +static int perf_gtk__hpp_color_ ## _name(struct perf_hpp *hpp, \ + struct hist_entry *he) \ +{ \ + struct hists *hists = he->hists; \ + double percent = 100.0 * he->stat._field / hists->stats.total_period; \ + const char *markup; \ + int ret = 0; \ + \ + markup = perf_gtk__get_percent_color(percent); \ + if (markup) \ + ret += scnprintf(hpp->buf, hpp->size, "%s", markup); \ + ret += scnprintf(hpp->buf + ret, hpp->size - ret, "%6.2f%%", percent); \ + if (markup) \ + ret += scnprintf(hpp->buf + ret, hpp->size - ret, 
"</span>"); \ + \ + return ret; \ +} + +HPP__COLOR_FN(overhead, period) +HPP__COLOR_FN(overhead_sys, period_sys) +HPP__COLOR_FN(overhead_us, period_us) +HPP__COLOR_FN(overhead_guest_sys, period_guest_sys) +HPP__COLOR_FN(overhead_guest_us, period_guest_us) + +#undef HPP__COLOR_FN + + +void perf_gtk__init_hpp(void) +{ + perf_hpp__column_enable(PERF_HPP__OVERHEAD); + + perf_hpp__init(); + + perf_hpp__format[PERF_HPP__OVERHEAD].color = + perf_gtk__hpp_color_overhead; + perf_hpp__format[PERF_HPP__OVERHEAD_SYS].color = + perf_gtk__hpp_color_overhead_sys; + perf_hpp__format[PERF_HPP__OVERHEAD_US].color = + perf_gtk__hpp_color_overhead_us; + perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].color = + perf_gtk__hpp_color_overhead_guest_sys; + perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].color = + perf_gtk__hpp_color_overhead_guest_us; +} + +static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists) +{ + struct perf_hpp_fmt *fmt; + GType col_types[MAX_COLUMNS]; + GtkCellRenderer *renderer; + struct sort_entry *se; + GtkListStore *store; + struct rb_node *nd; + GtkWidget *view; + int col_idx; + int nr_cols; + char s[512]; + + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + }; + + nr_cols = 0; + + perf_hpp__for_each_format(fmt) + col_types[nr_cols++] = G_TYPE_STRING; + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + col_types[nr_cols++] = G_TYPE_STRING; + } + + store = gtk_list_store_newv(nr_cols, col_types); + + view = gtk_tree_view_new(); + + renderer = gtk_cell_renderer_text_new(); + + col_idx = 0; + + perf_hpp__for_each_format(fmt) { + fmt->header(&hpp); + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, s, + renderer, "markup", + col_idx++, NULL); + } + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, se->se_header, + renderer, "text", + col_idx++, NULL); + } + + 
gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store)); + + g_object_unref(GTK_TREE_MODEL(store)); + + for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + GtkTreeIter iter; + + if (h->filtered) + continue; + + gtk_list_store_append(store, &iter); + + col_idx = 0; + + perf_hpp__for_each_format(fmt) { + if (fmt->color) + fmt->color(&hpp, h); + else + fmt->entry(&hpp, h); + + gtk_list_store_set(store, &iter, col_idx++, s, -1); + } + + list_for_each_entry(se, &hist_entry__sort_list, list) { + if (se->elide) + continue; + + se->se_snprintf(h, s, ARRAY_SIZE(s), + hists__col_len(hists, se->se_width_idx)); + + gtk_list_store_set(store, &iter, col_idx++, s, -1); + } + } + + gtk_container_add(GTK_CONTAINER(window), view); +} + +int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, + const char *help, + struct hist_browser_timer *hbt __maybe_unused) +{ + struct perf_evsel *pos; + GtkWidget *vbox; + GtkWidget *notebook; + GtkWidget *info_bar; + GtkWidget *statbar; + GtkWidget *window; + + signal(SIGSEGV, perf_gtk__signal); + signal(SIGFPE, perf_gtk__signal); + signal(SIGINT, perf_gtk__signal); + signal(SIGQUIT, perf_gtk__signal); + signal(SIGTERM, perf_gtk__signal); + + window = gtk_window_new(GTK_WINDOW_TOPLEVEL); + + gtk_window_set_title(GTK_WINDOW(window), "perf report"); + + g_signal_connect(window, "delete_event", gtk_main_quit, NULL); + + pgctx = perf_gtk__activate_context(window); + if (!pgctx) + return -1; + + vbox = gtk_vbox_new(FALSE, 0); + + notebook = gtk_notebook_new(); + + gtk_box_pack_start(GTK_BOX(vbox), notebook, TRUE, TRUE, 0); + + info_bar = perf_gtk__setup_info_bar(); + if (info_bar) + gtk_box_pack_start(GTK_BOX(vbox), info_bar, FALSE, FALSE, 0); + + statbar = perf_gtk__setup_statusbar(); + gtk_box_pack_start(GTK_BOX(vbox), statbar, FALSE, FALSE, 0); + + gtk_container_add(GTK_CONTAINER(window), vbox); + + list_for_each_entry(pos, &evlist->entries, node) { + 
struct hists *hists = &pos->hists; + const char *evname = perf_evsel__name(pos); + GtkWidget *scrolled_window; + GtkWidget *tab_label; + + scrolled_window = gtk_scrolled_window_new(NULL, NULL); + + gtk_scrolled_window_set_policy(GTK_SCROLLED_WINDOW(scrolled_window), + GTK_POLICY_AUTOMATIC, + GTK_POLICY_AUTOMATIC); + + perf_gtk__show_hists(scrolled_window, hists); + + tab_label = gtk_label_new(evname); + + gtk_notebook_append_page(GTK_NOTEBOOK(notebook), scrolled_window, tab_label); + } + + gtk_widget_show_all(window); + + perf_gtk__resize_window(window); + + gtk_window_set_position(GTK_WINDOW(window), GTK_WIN_POS_CENTER); + + ui_helpline__push(help); + + gtk_main(); + + perf_gtk__deactivate_context(&pgctx); + + return 0; +} diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c index a49bcf3c190b..700fb3cfa1c7 100644 --- a/tools/perf/ui/helpline.c +++ b/tools/perf/ui/helpline.c @@ -16,9 +16,16 @@ static void nop_helpline__push(const char *msg __maybe_unused) { } +static int nop_helpline__show(const char *fmt __maybe_unused, + va_list ap __maybe_unused) +{ + return 0; +} + static struct ui_helpline default_helpline_fns = { .pop = nop_helpline__pop, .push = nop_helpline__push, + .show = nop_helpline__show, }; struct ui_helpline *helpline_fns = &default_helpline_fns; @@ -59,3 +66,8 @@ void ui_helpline__puts(const char *msg) ui_helpline__pop(); ui_helpline__push(msg); } + +int ui_helpline__vshow(const char *fmt, va_list ap) +{ + return helpline_fns->show(fmt, ap); +} diff --git a/tools/perf/ui/helpline.h b/tools/perf/ui/helpline.h index baa28a4d16b9..46181f4fc07e 100644 --- a/tools/perf/ui/helpline.h +++ b/tools/perf/ui/helpline.h @@ -9,6 +9,7 @@ struct ui_helpline { void (*pop)(void); void (*push)(const char *msg); + int (*show)(const char *fmt, va_list ap); }; extern struct ui_helpline *helpline_fns; @@ -20,28 +21,9 @@ void ui_helpline__push(const char *msg); void ui_helpline__vpush(const char *fmt, va_list ap); void ui_helpline__fpush(const char *fmt, 
...); void ui_helpline__puts(const char *msg); +int ui_helpline__vshow(const char *fmt, va_list ap); extern char ui_helpline__current[512]; - -#ifdef NEWT_SUPPORT extern char ui_helpline__last_msg[]; -int ui_helpline__show_help(const char *format, va_list ap); -#else -static inline int ui_helpline__show_help(const char *format __maybe_unused, - va_list ap __maybe_unused) -{ - return 0; -} -#endif /* NEWT_SUPPORT */ - -#ifdef GTK2_SUPPORT -int perf_gtk__show_helpline(const char *format, va_list ap); -#else -static inline int perf_gtk__show_helpline(const char *format __maybe_unused, - va_list ap __maybe_unused) -{ - return 0; -} -#endif /* GTK2_SUPPORT */ #endif /* _PERF_UI_HELPLINE_H_ */ diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index aa84130024d5..1889c12ca81f 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -268,14 +268,18 @@ static int hpp__width_delta(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_delta(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%7.7s"; char buf[32] = " "; - double diff; + double diff = 0.0; - if (he->diff.computed) - diff = he->diff.period_ratio_delta; - else - diff = perf_diff__compute_delta(he); + if (pair) { + if (he->diff.computed) + diff = he->diff.period_ratio_delta; + else + diff = perf_diff__compute_delta(he, pair); + } else + diff = perf_diff__period_percent(he, he->stat.period); if (fabs(diff) >= 0.01) scnprintf(buf, sizeof(buf), "%+4.2F%%", diff); @@ -297,14 +301,17 @@ static int hpp__width_ratio(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_ratio(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? 
"%s" : "%14s"; char buf[32] = " "; - double ratio; + double ratio = 0.0; - if (he->diff.computed) - ratio = he->diff.period_ratio; - else - ratio = perf_diff__compute_ratio(he); + if (pair) { + if (he->diff.computed) + ratio = he->diff.period_ratio; + else + ratio = perf_diff__compute_ratio(he, pair); + } if (ratio > 0.0) scnprintf(buf, sizeof(buf), "%+14.6F", ratio); @@ -326,14 +333,17 @@ static int hpp__width_wdiff(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_wdiff(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%14s"; char buf[32] = " "; - s64 wdiff; + s64 wdiff = 0; - if (he->diff.computed) - wdiff = he->diff.wdiff; - else - wdiff = perf_diff__compute_wdiff(he); + if (pair) { + if (he->diff.computed) + wdiff = he->diff.wdiff; + else + wdiff = perf_diff__compute_wdiff(he, pair); + } if (wdiff != 0) scnprintf(buf, sizeof(buf), "%14ld", wdiff); @@ -341,30 +351,6 @@ static int hpp__entry_wdiff(struct perf_hpp *hpp, struct hist_entry *he) return scnprintf(hpp->buf, hpp->size, fmt, buf); } -static int hpp__header_displ(struct perf_hpp *hpp) -{ - return scnprintf(hpp->buf, hpp->size, "Displ."); -} - -static int hpp__width_displ(struct perf_hpp *hpp __maybe_unused) -{ - return 6; -} - -static int hpp__entry_displ(struct perf_hpp *hpp, - struct hist_entry *he) -{ - struct hist_entry *pair = hist_entry__next_pair(he); - long displacement = pair ? pair->position - he->position : 0; - const char *fmt = symbol_conf.field_sep ? "%s" : "%6.6s"; - char buf[32] = " "; - - if (displacement) - scnprintf(buf, sizeof(buf), "%+4ld", displacement); - - return scnprintf(hpp->buf, hpp->size, fmt, buf); -} - static int hpp__header_formula(struct perf_hpp *hpp) { const char *fmt = symbol_conf.field_sep ? 
"%s" : "%70s"; @@ -379,67 +365,80 @@ static int hpp__width_formula(struct perf_hpp *hpp __maybe_unused) static int hpp__entry_formula(struct perf_hpp *hpp, struct hist_entry *he) { + struct hist_entry *pair = hist_entry__next_pair(he); const char *fmt = symbol_conf.field_sep ? "%s" : "%-70s"; char buf[96] = " "; - perf_diff__formula(buf, sizeof(buf), he); + if (pair) + perf_diff__formula(he, pair, buf, sizeof(buf)); + return scnprintf(hpp->buf, hpp->size, fmt, buf); } -#define HPP__COLOR_PRINT_FNS(_name) \ - .header = hpp__header_ ## _name, \ - .width = hpp__width_ ## _name, \ - .color = hpp__color_ ## _name, \ - .entry = hpp__entry_ ## _name +#define HPP__COLOR_PRINT_FNS(_name) \ + { \ + .header = hpp__header_ ## _name, \ + .width = hpp__width_ ## _name, \ + .color = hpp__color_ ## _name, \ + .entry = hpp__entry_ ## _name \ + } -#define HPP__PRINT_FNS(_name) \ - .header = hpp__header_ ## _name, \ - .width = hpp__width_ ## _name, \ - .entry = hpp__entry_ ## _name +#define HPP__PRINT_FNS(_name) \ + { \ + .header = hpp__header_ ## _name, \ + .width = hpp__width_ ## _name, \ + .entry = hpp__entry_ ## _name \ + } struct perf_hpp_fmt perf_hpp__format[] = { - { .cond = false, HPP__COLOR_PRINT_FNS(baseline) }, - { .cond = true, HPP__COLOR_PRINT_FNS(overhead) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_sys) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_us) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_sys) }, - { .cond = false, HPP__COLOR_PRINT_FNS(overhead_guest_us) }, - { .cond = false, HPP__PRINT_FNS(samples) }, - { .cond = false, HPP__PRINT_FNS(period) }, - { .cond = false, HPP__PRINT_FNS(period_baseline) }, - { .cond = false, HPP__PRINT_FNS(delta) }, - { .cond = false, HPP__PRINT_FNS(ratio) }, - { .cond = false, HPP__PRINT_FNS(wdiff) }, - { .cond = false, HPP__PRINT_FNS(displ) }, - { .cond = false, HPP__PRINT_FNS(formula) } + HPP__COLOR_PRINT_FNS(baseline), + HPP__COLOR_PRINT_FNS(overhead), + HPP__COLOR_PRINT_FNS(overhead_sys), + 
HPP__COLOR_PRINT_FNS(overhead_us), + HPP__COLOR_PRINT_FNS(overhead_guest_sys), + HPP__COLOR_PRINT_FNS(overhead_guest_us), + HPP__PRINT_FNS(samples), + HPP__PRINT_FNS(period), + HPP__PRINT_FNS(period_baseline), + HPP__PRINT_FNS(delta), + HPP__PRINT_FNS(ratio), + HPP__PRINT_FNS(wdiff), + HPP__PRINT_FNS(formula) }; +LIST_HEAD(perf_hpp__list); + #undef HPP__COLOR_PRINT_FNS #undef HPP__PRINT_FNS void perf_hpp__init(void) { if (symbol_conf.show_cpu_utilization) { - perf_hpp__format[PERF_HPP__OVERHEAD_SYS].cond = true; - perf_hpp__format[PERF_HPP__OVERHEAD_US].cond = true; + perf_hpp__column_enable(PERF_HPP__OVERHEAD_SYS); + perf_hpp__column_enable(PERF_HPP__OVERHEAD_US); if (perf_guest) { - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_SYS].cond = true; - perf_hpp__format[PERF_HPP__OVERHEAD_GUEST_US].cond = true; + perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_SYS); + perf_hpp__column_enable(PERF_HPP__OVERHEAD_GUEST_US); } } if (symbol_conf.show_nr_samples) - perf_hpp__format[PERF_HPP__SAMPLES].cond = true; + perf_hpp__column_enable(PERF_HPP__SAMPLES); if (symbol_conf.show_total_period) - perf_hpp__format[PERF_HPP__PERIOD].cond = true; + perf_hpp__column_enable(PERF_HPP__PERIOD); +} + +void perf_hpp__column_register(struct perf_hpp_fmt *format) +{ + list_add_tail(&format->list, &perf_hpp__list); } -void perf_hpp__column_enable(unsigned col, bool enable) +void perf_hpp__column_enable(unsigned col) { BUG_ON(col >= PERF_HPP__MAX_INDEX); - perf_hpp__format[col].cond = enable; + perf_hpp__column_register(&perf_hpp__format[col]); } static inline void advance_hpp(struct perf_hpp *hpp, int inc) @@ -452,27 +451,29 @@ int hist_entry__period_snprintf(struct perf_hpp *hpp, struct hist_entry *he, bool color) { const char *sep = symbol_conf.field_sep; + struct perf_hpp_fmt *fmt; char *start = hpp->buf; - int i, ret; + int ret; bool first = true; if (symbol_conf.exclude_other && !he->parent) return 0; - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - 
continue; - + perf_hpp__for_each_format(fmt) { + /* + * If there's no field_sep, we still need + * to display initial ' '. + */ if (!sep || !first) { ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: " "); advance_hpp(hpp, ret); + } else first = false; - } - if (color && perf_hpp__format[i].color) - ret = perf_hpp__format[i].color(hpp, he); + if (color && fmt->color) + ret = fmt->color(hpp, he); else - ret = perf_hpp__format[i].entry(hpp, he); + ret = fmt->entry(hpp, he); advance_hpp(hpp, ret); } @@ -504,16 +505,15 @@ int hist_entry__sort_snprintf(struct hist_entry *he, char *s, size_t size, */ unsigned int hists__sort_list_width(struct hists *hists) { + struct perf_hpp_fmt *fmt; struct sort_entry *se; - int i, ret = 0; + int i = 0, ret = 0; - for (i = 0; i < PERF_HPP__MAX_INDEX; i++) { - if (!perf_hpp__format[i].cond) - continue; + perf_hpp__for_each_format(fmt) { if (i) ret += 2; - ret += perf_hpp__format[i].width(NULL); + ret += fmt->width(NULL); } list_for_each_entry(se, &hist_entry__sort_list, list) diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c index ebb4cc107876..166f13df3134 100644 --- a/tools/perf/ui/setup.c +++ b/tools/perf/ui/setup.c @@ -30,6 +30,7 @@ void setup_browser(bool fallback_to_pager) if (fallback_to_pager) setup_pager(); + perf_hpp__column_enable(PERF_HPP__OVERHEAD); perf_hpp__init(); break; } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index f0ee204f99bb..f9798298e3e0 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -335,13 +335,14 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, FILE *fp) { + struct perf_hpp_fmt *fmt; struct sort_entry *se; struct rb_node *nd; size_t ret = 0; unsigned int width; const char *sep = symbol_conf.field_sep; const char *col_width = symbol_conf.col_width_list_str; - int idx, nr_rows = 0; + int nr_rows = 0; char bf[96]; struct perf_hpp dummy_hpp 
= { .buf = bf, @@ -355,16 +356,14 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, goto print_entries; fprintf(fp, "# "); - for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) { - if (!perf_hpp__format[idx].cond) - continue; + perf_hpp__for_each_format(fmt) { if (!first) fprintf(fp, "%s", sep ?: " "); else first = false; - perf_hpp__format[idx].header(&dummy_hpp); + fmt->header(&dummy_hpp); fprintf(fp, "%s", bf); } @@ -400,18 +399,16 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, first = true; fprintf(fp, "# "); - for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) { - unsigned int i; - if (!perf_hpp__format[idx].cond) - continue; + perf_hpp__for_each_format(fmt) { + unsigned int i; if (!first) fprintf(fp, "%s", sep ?: " "); else first = false; - width = perf_hpp__format[idx].width(&dummy_hpp); + width = fmt->width(&dummy_hpp); for (i = 0; i < width; i++) fprintf(fp, "."); } @@ -462,7 +459,7 @@ out: return ret; } -size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp) { int i; size_t ret = 0; @@ -470,7 +467,7 @@ size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) { const char *name; - if (hists->stats.nr_events[i] == 0) + if (stats->nr_events[i] == 0) continue; name = perf_event__name(i); @@ -478,7 +475,7 @@ size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp) continue; ret += fprintf(fp, "%16s events: %10d\n", name, - hists->stats.nr_events[i]); + stats->nr_events[i]); } return ret; diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c index 2884d2f41e33..1c8b9afd5d6e 100644 --- a/tools/perf/ui/tui/helpline.c +++ b/tools/perf/ui/tui/helpline.c @@ -8,6 +8,8 @@ #include "../ui.h" #include "../libslang.h" +char ui_helpline__last_msg[1024]; + static void tui_helpline__pop(void) { } @@ -23,20 +25,7 @@ static void tui_helpline__push(const char *msg) 
strncpy(ui_helpline__current, msg, sz)[sz - 1] = '\0'; } -struct ui_helpline tui_helpline_fns = { - .pop = tui_helpline__pop, - .push = tui_helpline__push, -}; - -void ui_helpline__init(void) -{ - helpline_fns = &tui_helpline_fns; - ui_helpline__puts(" "); -} - -char ui_helpline__last_msg[1024]; - -int ui_helpline__show_help(const char *format, va_list ap) +static int tui_helpline__show(const char *format, va_list ap) { int ret; static int backlog; @@ -55,3 +44,15 @@ int ui_helpline__show_help(const char *format, va_list ap) return ret; } + +struct ui_helpline tui_helpline_fns = { + .pop = tui_helpline__pop, + .push = tui_helpline__push, + .show = tui_helpline__show, +}; + +void ui_helpline__init(void) +{ + helpline_fns = &tui_helpline_fns; + ui_helpline__puts(" "); +} diff --git a/tools/perf/ui/util.c b/tools/perf/ui/util.c index 4f989774c8c6..e3e0a963d03a 100644 --- a/tools/perf/ui/util.c +++ b/tools/perf/ui/util.c @@ -52,7 +52,6 @@ int ui__warning(const char *format, ...) return ret; } - /** * perf_error__register - Register error logging functions * @eops: The pointer to error logging function struct diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN index 6aa34e5afdcf..055fef34b6f6 100755 --- a/tools/perf/util/PERF-VERSION-GEN +++ b/tools/perf/util/PERF-VERSION-GEN @@ -26,13 +26,13 @@ VN=$(expr "$VN" : v*'\(.*\)') if test -r $GVF then - VC=$(sed -e 's/^PERF_VERSION = //' <$GVF) + VC=$(sed -e 's/^#define PERF_VERSION "\(.*\)"/\1/' <$GVF) else VC=unset fi test "$VN" = "$VC" || { echo >&2 "PERF_VERSION = $VN" - echo "PERF_VERSION = $VN" >$GVF + echo "#define PERF_VERSION \"$VN\"" >$GVF } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index eb340571e7d6..3ee9f67d5af0 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -143,4 +143,9 @@ static inline void callchain_cursor_advance(struct callchain_cursor *cursor) cursor->curr = cursor->curr->next; cursor->pos++; } + +struct option; + +int 
record_parse_callchain_opt(const struct option *opt, const char *arg, int unset); +extern const char record_callchain_help[]; #endif /* __PERF_CALLCHAIN_H */ diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 03f830b48148..399e74c34c1a 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -23,10 +23,8 @@ int eprintf(int level, const char *fmt, ...) if (verbose >= level) { va_start(args, fmt); - if (use_browser == 1) - ret = ui_helpline__show_help(fmt, args); - else if (use_browser == 2) - ret = perf_gtk__show_helpline(fmt, args); + if (use_browser >= 1) + ui_helpline__vshow(fmt, args); else ret = vfprintf(stderr, fmt, args); va_end(args); @@ -49,28 +47,6 @@ int dump_printf(const char *fmt, ...) return ret; } -#if !defined(NEWT_SUPPORT) && !defined(GTK2_SUPPORT) -int ui__warning(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - return 0; -} -#endif - -int ui__error_paranoid(void) -{ - return ui__error("Permission error - are you root?\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv\n"); -} - void trace_event(union perf_event *event) { unsigned char *raw_event = (void *)event; diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 83e8d234af6b..efbd98805ad0 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -5,6 +5,8 @@ #include <stdbool.h> #include "event.h" #include "../ui/helpline.h" +#include "../ui/progress.h" +#include "../ui/util.h" extern int verbose; extern bool quiet, dump_trace; @@ -12,39 +14,7 @@ extern bool quiet, dump_trace; int dump_printf(const char *fmt, ...) 
__attribute__((format(printf, 1, 2))); void trace_event(union perf_event *event); -struct ui_progress; -struct perf_error_ops; - -#if defined(NEWT_SUPPORT) || defined(GTK2_SUPPORT) - -#include "../ui/progress.h" int ui__error(const char *format, ...) __attribute__((format(printf, 1, 2))); -#include "../ui/util.h" - -#else - -static inline void ui_progress__update(u64 curr __maybe_unused, - u64 total __maybe_unused, - const char *title __maybe_unused) {} -static inline void ui_progress__finish(void) {} - -#define ui__error(format, arg...) ui__warning(format, ##arg) - -static inline int -perf_error__register(struct perf_error_ops *eops __maybe_unused) -{ - return 0; -} - -static inline int -perf_error__unregister(struct perf_error_ops *eops __maybe_unused) -{ - return 0; -} - -#endif /* NEWT_SUPPORT || GTK2_SUPPORT */ - int ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2))); -int ui__error_paranoid(void); #endif /* __PERF_DEBUG_H */ diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index d6d9a465acdb..6f7d5a9d6b05 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -539,13 +539,13 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name) } size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, - bool with_hits) + bool (skip)(struct dso *dso, int parm), int parm) { struct dso *pos; size_t ret = 0; list_for_each_entry(pos, head, node) { - if (with_hits && !pos->hit) + if (skip && skip(pos, parm)) continue; ret += dso__fprintf_buildid(pos, fp); ret += fprintf(fp, " %s\n", pos->long_name); @@ -583,7 +583,7 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp) if (dso->short_name != dso->long_name) ret += fprintf(fp, "%s, ", dso->long_name); ret += fprintf(fp, "%s, %sloaded, ", map_type__name[type], - dso->loaded ? "" : "NOT "); + dso__loaded(dso, type) ? 
"" : "NOT "); ret += dso__fprintf_buildid(dso, fp); ret += fprintf(fp, ")\n"); for (nd = rb_first(&dso->symbols[type]); nd; nd = rb_next(nd)) { diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index e03276940b99..450199ab51b5 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -138,7 +138,7 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name); bool __dsos__read_build_ids(struct list_head *head, bool with_hits); size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp, - bool with_hits); + bool (skip)(struct dso *dso, int parm), int parm); size_t __dsos__fprintf(struct list_head *head, FILE *fp); size_t dso__fprintf_buildid(struct dso *dso, FILE *fp); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 705293489e3c..dc8aee97a488 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -49,10 +49,16 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, return evlist; } -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts) +void perf_evlist__config(struct perf_evlist *evlist, + struct perf_record_opts *opts) { struct perf_evsel *evsel; + /* + * Set the evsel leader links before we configure attributes, + * since some might depend on this info. 
+ */ + if (opts->group) + perf_evlist__set_leader(evlist); if (evlist->cpus->map[0] < 0) opts->no_inherit = true; @@ -61,7 +67,7 @@ void perf_evlist__config_attrs(struct perf_evlist *evlist, perf_evsel__config(evsel, opts); if (evlist->nr_entries > 1) - evsel->attr.sample_type |= PERF_SAMPLE_ID; + perf_evsel__set_sample_id(evsel); } } @@ -111,7 +117,6 @@ void __perf_evlist__set_leader(struct list_head *list) struct perf_evsel *evsel, *leader; leader = list_entry(list->next, struct perf_evsel, node); - leader->leader = NULL; list_for_each_entry(evsel, list, node) { if (evsel != leader) @@ -222,7 +227,7 @@ void perf_evlist__disable(struct perf_evlist *evlist) for (cpu = 0; cpu < evlist->cpus->nr; cpu++) { list_for_each_entry(pos, &evlist->entries, node) { - if (perf_evsel__is_group_member(pos)) + if (!perf_evsel__is_group_leader(pos)) continue; for (thread = 0; thread < evlist->threads->nr; thread++) ioctl(FD(pos, cpu, thread), @@ -238,7 +243,7 @@ void perf_evlist__enable(struct perf_evlist *evlist) for (cpu = 0; cpu < cpu_map__nr(evlist->cpus); cpu++) { list_for_each_entry(pos, &evlist->entries, node) { - if (perf_evsel__is_group_member(pos)) + if (!perf_evsel__is_group_leader(pos)) continue; for (thread = 0; thread < evlist->threads->nr; thread++) ioctl(FD(pos, cpu, thread), diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 56003f779e60..457e2350d21d 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -76,8 +76,8 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx); int perf_evlist__open(struct perf_evlist *evlist); -void perf_evlist__config_attrs(struct perf_evlist *evlist, - struct perf_record_opts *opts); +void perf_evlist__config(struct perf_evlist *evlist, + struct perf_record_opts *opts); int perf_evlist__prepare_workload(struct perf_evlist *evlist, struct perf_record_opts *opts, diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1b16dd1edc8e..e45332d08a58 100644 --- 
a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -22,6 +22,11 @@ #include <linux/perf_event.h> #include "perf_regs.h" +static struct { + bool sample_id_all; + bool exclude_guest; +} perf_missing_features; + #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) static int __perf_evsel__sample_size(u64 sample_type) @@ -50,11 +55,36 @@ void hists__init(struct hists *hists) pthread_mutex_init(&hists->lock, NULL); } +void __perf_evsel__set_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit) +{ + if (!(evsel->attr.sample_type & bit)) { + evsel->attr.sample_type |= bit; + evsel->sample_size += sizeof(u64); + } +} + +void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit) +{ + if (evsel->attr.sample_type & bit) { + evsel->attr.sample_type &= ~bit; + evsel->sample_size -= sizeof(u64); + } +} + +void perf_evsel__set_sample_id(struct perf_evsel *evsel) +{ + perf_evsel__set_sample_bit(evsel, ID); + evsel->attr.read_format |= PERF_FORMAT_ID; +} + void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) { evsel->idx = idx; evsel->attr = *attr; + evsel->leader = evsel; INIT_LIST_HEAD(&evsel->node); hists__init(&evsel->hists); evsel->sample_size = __perf_evsel__sample_size(attr->sample_type); @@ -438,13 +468,11 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_event_attr *attr = &evsel->attr; int track = !evsel->idx; /* only the first counter needs these */ - attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1; + attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1; attr->inherit = !opts->no_inherit; - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING | - PERF_FORMAT_ID; - attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; + perf_evsel__set_sample_bit(evsel, IP); + perf_evsel__set_sample_bit(evsel, TID); /* * We default some events to a 1 default interval. 
But keep @@ -453,7 +481,7 @@ void perf_evsel__config(struct perf_evsel *evsel, if (!attr->sample_period || (opts->user_freq != UINT_MAX && opts->user_interval != ULLONG_MAX)) { if (opts->freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; + perf_evsel__set_sample_bit(evsel, PERIOD); attr->freq = 1; attr->sample_freq = opts->freq; } else { @@ -468,16 +496,16 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->inherit_stat = 1; if (opts->sample_address) { - attr->sample_type |= PERF_SAMPLE_ADDR; + perf_evsel__set_sample_bit(evsel, ADDR); attr->mmap_data = track; } if (opts->call_graph) { - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + perf_evsel__set_sample_bit(evsel, CALLCHAIN); if (opts->call_graph == CALLCHAIN_DWARF) { - attr->sample_type |= PERF_SAMPLE_REGS_USER | - PERF_SAMPLE_STACK_USER; + perf_evsel__set_sample_bit(evsel, REGS_USER); + perf_evsel__set_sample_bit(evsel, STACK_USER); attr->sample_regs_user = PERF_REGS_MASK; attr->sample_stack_user = opts->stack_dump_size; attr->exclude_callchain_user = 1; @@ -485,20 +513,20 @@ void perf_evsel__config(struct perf_evsel *evsel, } if (perf_target__has_cpu(&opts->target)) - attr->sample_type |= PERF_SAMPLE_CPU; + perf_evsel__set_sample_bit(evsel, CPU); if (opts->period) - attr->sample_type |= PERF_SAMPLE_PERIOD; + perf_evsel__set_sample_bit(evsel, PERIOD); - if (!opts->sample_id_all_missing && + if (!perf_missing_features.sample_id_all && (opts->sample_time || !opts->no_inherit || perf_target__has_cpu(&opts->target))) - attr->sample_type |= PERF_SAMPLE_TIME; + perf_evsel__set_sample_bit(evsel, TIME); if (opts->raw_samples) { - attr->sample_type |= PERF_SAMPLE_TIME; - attr->sample_type |= PERF_SAMPLE_RAW; - attr->sample_type |= PERF_SAMPLE_CPU; + perf_evsel__set_sample_bit(evsel, TIME); + perf_evsel__set_sample_bit(evsel, RAW); + perf_evsel__set_sample_bit(evsel, CPU); } if (opts->no_delay) { @@ -506,7 +534,7 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->wakeup_events = 1; } if (opts->branch_stack) 
{ - attr->sample_type |= PERF_SAMPLE_BRANCH_STACK; + perf_evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = opts->branch_stack; } @@ -519,14 +547,14 @@ void perf_evsel__config(struct perf_evsel *evsel, * Disabling only independent events or group leaders, * keeping group members enabled. */ - if (!perf_evsel__is_group_member(evsel)) + if (perf_evsel__is_group_leader(evsel)) attr->disabled = 1; /* * Setting enable_on_exec for independent events and * group leaders for traced executed by perf. */ - if (perf_target__none(&opts->target) && !perf_evsel__is_group_member(evsel)) + if (perf_target__none(&opts->target) && perf_evsel__is_group_leader(evsel)) attr->enable_on_exec = 1; } @@ -707,7 +735,7 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread) struct perf_evsel *leader = evsel->leader; int fd; - if (!perf_evsel__is_group_member(evsel)) + if (perf_evsel__is_group_leader(evsel)) return -1; /* @@ -738,6 +766,13 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, pid = evsel->cgrp->fd; } +fallback_missing_features: + if (perf_missing_features.exclude_guest) + evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; +retry_sample_id: + if (perf_missing_features.sample_id_all) + evsel->attr.sample_id_all = 0; + for (cpu = 0; cpu < cpus->nr; cpu++) { for (thread = 0; thread < threads->nr; thread++) { @@ -754,13 +789,26 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, group_fd, flags); if (FD(evsel, cpu, thread) < 0) { err = -errno; - goto out_close; + goto try_fallback; } } } return 0; +try_fallback: + if (err != -EINVAL || cpu > 0 || thread > 0) + goto out_close; + + if (!perf_missing_features.exclude_guest && + (evsel->attr.exclude_guest || evsel->attr.exclude_host)) { + perf_missing_features.exclude_guest = true; + goto fallback_missing_features; + } else if (!perf_missing_features.sample_id_all) { + perf_missing_features.sample_id_all = true; + goto 
retry_sample_id; + } + out_close: do { while (--thread >= 0) { @@ -1205,3 +1253,205 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, return 0; } + +static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) +{ + va_list args; + int ret = 0; + + if (!*first) { + ret += fprintf(fp, ","); + } else { + ret += fprintf(fp, ":"); + *first = false; + } + + va_start(args, fmt); + ret += vfprintf(fp, fmt, args); + va_end(args); + return ret; +} + +static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value) +{ + if (value == 0) + return 0; + + return comma_fprintf(fp, first, " %s: %" PRIu64, field, value); +} + +#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field) + +struct bit_names { + int bit; + const char *name; +}; + +static int bits__fprintf(FILE *fp, const char *field, u64 value, + struct bit_names *bits, bool *first) +{ + int i = 0, printed = comma_fprintf(fp, first, " %s: ", field); + bool first_bit = true; + + do { + if (value & bits[i].bit) { + printed += fprintf(fp, "%s%s", first_bit ? 
"" : "|", bits[i].name); + first_bit = false; + } + } while (bits[++i].name != NULL); + + return printed; +} + +static int sample_type__fprintf(FILE *fp, bool *first, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_##n, #n } + struct bit_names bits[] = { + bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), + bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), + bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), + bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), + { .name = NULL, } + }; +#undef bit_name + return bits__fprintf(fp, "sample_type", value, bits, first); +} + +static int read_format__fprintf(FILE *fp, bool *first, u64 value) +{ +#define bit_name(n) { PERF_FORMAT_##n, #n } + struct bit_names bits[] = { + bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), + bit_name(ID), bit_name(GROUP), + { .name = NULL, } + }; +#undef bit_name + return bits__fprintf(fp, "read_format", value, bits, first); +} + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp) +{ + bool first = true; + int printed = fprintf(fp, "%s", perf_evsel__name(evsel)); + + if (details->verbose || details->freq) { + printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64, + (u64)evsel->attr.sample_freq); + } + + if (details->verbose) { + if_print(type); + if_print(config); + if_print(config1); + if_print(config2); + if_print(size); + printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type); + if (evsel->attr.read_format) + printed += read_format__fprintf(fp, &first, evsel->attr.read_format); + if_print(disabled); + if_print(inherit); + if_print(pinned); + if_print(exclusive); + if_print(exclude_user); + if_print(exclude_kernel); + if_print(exclude_hv); + if_print(exclude_idle); + if_print(mmap); + if_print(comm); + if_print(freq); + if_print(inherit_stat); + if_print(enable_on_exec); + if_print(task); + if_print(watermark); + if_print(precise_ip); + if_print(mmap_data); + 
if_print(sample_id_all); + if_print(exclude_host); + if_print(exclude_guest); + if_print(__reserved_1); + if_print(wakeup_events); + if_print(bp_type); + if_print(branch_sample_type); + } + + fputc('\n', fp); + return ++printed; +} + +bool perf_evsel__fallback(struct perf_evsel *evsel, int err, + char *msg, size_t msgsize) +{ + if ((err == ENOENT || err == ENXIO) && + evsel->attr.type == PERF_TYPE_HARDWARE && + evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) { + /* + * If it's cycles then fall back to hrtimer based + * cpu-clock-tick sw counter, which is always available even if + * no PMU support. + * + * PPC returns ENXIO until 2.6.37 (behavior changed with commit + * b0a873e). + */ + scnprintf(msg, msgsize, "%s", +"The cycles event is not supported, trying to fall back to cpu-clock-ticks"); + + evsel->attr.type = PERF_TYPE_SOFTWARE; + evsel->attr.config = PERF_COUNT_SW_CPU_CLOCK; + + free(evsel->name); + evsel->name = NULL; + return true; + } + + return false; +} + +int perf_evsel__open_strerror(struct perf_evsel *evsel, + struct perf_target *target, + int err, char *msg, size_t size) +{ + switch (err) { + case EPERM: + case EACCES: + return scnprintf(msg, size, "%s", + "You may not have permission to collect %sstats.\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" + " -1 - Not paranoid at all\n" + " 0 - Disallow raw tracepoint access for unpriv\n" + " 1 - Disallow cpu events for unpriv\n" + " 2 - Disallow kernel profiling for unpriv", + target->system_wide ? 
"system-wide " : ""); + case ENOENT: + return scnprintf(msg, size, "The %s event is not supported.", + perf_evsel__name(evsel)); + case EMFILE: + return scnprintf(msg, size, "%s", + "Too many events are opened.\n" + "Try again after reducing the number of events."); + case ENODEV: + if (target->cpu_list) + return scnprintf(msg, size, "%s", + "No such device - did you specify an out-of-range profile CPU?\n"); + break; + case EOPNOTSUPP: + if (evsel->attr.precise_ip) + return scnprintf(msg, size, "%s", + "\'precise\' request may not be supported. Try removing 'p' modifier."); +#if defined(__i386__) || defined(__x86_64__) + if (evsel->attr.type == PERF_TYPE_HARDWARE) + return scnprintf(msg, size, "%s", + "No hardware sampling interrupt available.\n" + "No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it."); +#endif + break; + default: + break; + } + + return scnprintf(msg, size, + "The sys_perf_event_open() syscall returned with %d (%s) for event (%s). 
\n" + "/bin/dmesg may provide additional information.\n" + "No CONFIG_PERF_EVENTS=y kernel support configured?\n", + err, strerror(err), perf_evsel__name(evsel)); +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 3d2b8017438c..c68d1b82e843 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -118,6 +118,19 @@ void perf_evsel__free_fd(struct perf_evsel *evsel); void perf_evsel__free_id(struct perf_evsel *evsel); void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads); +void __perf_evsel__set_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit); +void __perf_evsel__reset_sample_bit(struct perf_evsel *evsel, + enum perf_event_sample_format bit); + +#define perf_evsel__set_sample_bit(evsel, bit) \ + __perf_evsel__set_sample_bit(evsel, PERF_SAMPLE_##bit) + +#define perf_evsel__reset_sample_bit(evsel, bit) \ + __perf_evsel__reset_sample_bit(evsel, PERF_SAMPLE_##bit) + +void perf_evsel__set_sample_id(struct perf_evsel *evsel); + int perf_evsel__set_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); @@ -226,8 +239,22 @@ static inline struct perf_evsel *perf_evsel__next(struct perf_evsel *evsel) return list_entry(evsel->node.next, struct perf_evsel, node); } -static inline bool perf_evsel__is_group_member(const struct perf_evsel *evsel) +static inline bool perf_evsel__is_group_leader(const struct perf_evsel *evsel) { - return evsel->leader != NULL; + return evsel->leader == evsel; } + +struct perf_attr_details { + bool freq; + bool verbose; +}; + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp); + +bool perf_evsel__fallback(struct perf_evsel *evsel, int err, + char *msg, size_t msgsize); +int perf_evsel__open_strerror(struct perf_evsel *evsel, + struct perf_target *target, + int err, char *msg, size_t size); #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 
b7da4634a047..fccd69dbbbb9 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -148,7 +148,7 @@ static char *do_read_string(int fd, struct perf_header *ph) u32 len; char *buf; - sz = read(fd, &len, sizeof(len)); + sz = readn(fd, &len, sizeof(len)); if (sz < (ssize_t)sizeof(len)) return NULL; @@ -159,7 +159,7 @@ static char *do_read_string(int fd, struct perf_header *ph) if (!buf) return NULL; - ret = read(fd, buf, len); + ret = readn(fd, buf, len); if (ret == (ssize_t)len) { /* * strings are padded by zeroes @@ -287,12 +287,12 @@ static int dsos__write_buildid_table(struct perf_header *header, int fd) struct perf_session *session = container_of(header, struct perf_session, header); struct rb_node *nd; - int err = machine__write_buildid_table(&session->host_machine, fd); + int err = machine__write_buildid_table(&session->machines.host, fd); if (err) return err; - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); err = machine__write_buildid_table(pos, fd); if (err) @@ -448,9 +448,9 @@ static int perf_session__cache_build_ids(struct perf_session *session) if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) return -1; - ret = machine__cache_build_ids(&session->host_machine, debugdir); + ret = machine__cache_build_ids(&session->machines.host, debugdir); - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__cache_build_ids(pos, debugdir); } @@ -467,9 +467,9 @@ static bool machine__read_build_ids(struct machine *machine, bool with_hits) static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) { struct rb_node *nd; - bool ret = machine__read_build_ids(&session->host_machine, with_hits); + bool ret = 
machine__read_build_ids(&session->machines.host, with_hits); - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&session->machines.guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, struct machine, rb_node); ret |= machine__read_build_ids(pos, with_hits); } @@ -1051,16 +1051,25 @@ static int write_pmu_mappings(int fd, struct perf_header *h __maybe_unused, struct perf_pmu *pmu = NULL; off_t offset = lseek(fd, 0, SEEK_CUR); __u32 pmu_num = 0; + int ret; /* write real pmu_num later */ - do_write(fd, &pmu_num, sizeof(pmu_num)); + ret = do_write(fd, &pmu_num, sizeof(pmu_num)); + if (ret < 0) + return ret; while ((pmu = perf_pmu__scan(pmu))) { if (!pmu->name) continue; pmu_num++; - do_write(fd, &pmu->type, sizeof(pmu->type)); - do_write_string(fd, pmu->name); + + ret = do_write(fd, &pmu->type, sizeof(pmu->type)); + if (ret < 0) + return ret; + + ret = do_write_string(fd, pmu->name); + if (ret < 0) + return ret; } if (pwrite(fd, &pmu_num, sizeof(pmu_num), offset) != sizeof(pmu_num)) { @@ -1209,14 +1218,14 @@ read_event_desc(struct perf_header *ph, int fd) size_t msz; /* number of events */ - ret = read(fd, &nre, sizeof(nre)); + ret = readn(fd, &nre, sizeof(nre)); if (ret != (ssize_t)sizeof(nre)) goto error; if (ph->needs_swap) nre = bswap_32(nre); - ret = read(fd, &sz, sizeof(sz)); + ret = readn(fd, &sz, sizeof(sz)); if (ret != (ssize_t)sizeof(sz)) goto error; @@ -1244,7 +1253,7 @@ read_event_desc(struct perf_header *ph, int fd) * must read entire on-file attr struct to * sync up with layout. 
*/ - ret = read(fd, buf, sz); + ret = readn(fd, buf, sz); if (ret != (ssize_t)sz) goto error; @@ -1253,7 +1262,7 @@ read_event_desc(struct perf_header *ph, int fd) memcpy(&evsel->attr, buf, msz); - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != (ssize_t)sizeof(nr)) goto error; @@ -1274,7 +1283,7 @@ read_event_desc(struct perf_header *ph, int fd) evsel->id = id; for (j = 0 ; j < nr; j++) { - ret = read(fd, id, sizeof(*id)); + ret = readn(fd, id, sizeof(*id)); if (ret != (ssize_t)sizeof(*id)) goto error; if (ph->needs_swap) @@ -1506,14 +1515,14 @@ static int perf_header__read_build_ids_abi_quirk(struct perf_header *header, while (offset < limit) { ssize_t len; - if (read(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev)) + if (readn(input, &old_bev, sizeof(old_bev)) != sizeof(old_bev)) return -1; if (header->needs_swap) perf_event_header__bswap(&old_bev.header); len = old_bev.header.size - sizeof(old_bev); - if (read(input, filename, len) != len) + if (readn(input, filename, len) != len) return -1; bev.header = old_bev.header; @@ -1548,14 +1557,14 @@ static int perf_header__read_build_ids(struct perf_header *header, while (offset < limit) { ssize_t len; - if (read(input, &bev, sizeof(bev)) != sizeof(bev)) + if (readn(input, &bev, sizeof(bev)) != sizeof(bev)) goto out; if (header->needs_swap) perf_event_header__bswap(&bev.header); len = bev.header.size - sizeof(bev); - if (read(input, filename, len) != len) + if (readn(input, filename, len) != len) goto out; /* * The a1645ce1 changeset: @@ -1641,7 +1650,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, size_t ret; u32 nr; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1650,7 +1659,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, ph->env.nr_cpus_online = nr; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; 
@@ -1684,7 +1693,7 @@ static int process_total_mem(struct perf_file_section *section __maybe_unused, uint64_t mem; size_t ret; - ret = read(fd, &mem, sizeof(mem)); + ret = readn(fd, &mem, sizeof(mem)); if (ret != sizeof(mem)) return -1; @@ -1756,7 +1765,7 @@ static int process_cmdline(struct perf_file_section *section __maybe_unused, u32 nr, i; struct strbuf sb; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1792,7 +1801,7 @@ static int process_cpu_topology(struct perf_file_section *section __maybe_unused char *str; struct strbuf sb; - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1813,7 +1822,7 @@ static int process_cpu_topology(struct perf_file_section *section __maybe_unused } ph->env.sibling_cores = strbuf_detach(&sb, NULL); - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) return -1; @@ -1850,7 +1859,7 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse struct strbuf sb; /* nr nodes */ - ret = read(fd, &nr, sizeof(nr)); + ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) goto error; @@ -1862,15 +1871,15 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse for (i = 0; i < nr; i++) { /* node number */ - ret = read(fd, &node, sizeof(node)); + ret = readn(fd, &node, sizeof(node)); if (ret != sizeof(node)) goto error; - ret = read(fd, &mem_total, sizeof(u64)); + ret = readn(fd, &mem_total, sizeof(u64)); if (ret != sizeof(u64)) goto error; - ret = read(fd, &mem_free, sizeof(u64)); + ret = readn(fd, &mem_free, sizeof(u64)); if (ret != sizeof(u64)) goto error; @@ -1909,7 +1918,7 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused u32 type; struct strbuf sb; - ret = read(fd, &pmu_num, sizeof(pmu_num)); + ret = readn(fd, &pmu_num, sizeof(pmu_num)); if (ret != sizeof(pmu_num)) return -1; @@ -1925,7 +1934,7 
@@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused strbuf_init(&sb, 128); while (pmu_num) { - if (read(fd, &type, sizeof(type)) != sizeof(type)) + if (readn(fd, &type, sizeof(type)) != sizeof(type)) goto error; if (ph->needs_swap) type = bswap_32(type); @@ -2912,7 +2921,7 @@ int perf_event__process_tracing_data(union perf_event *event, session->repipe); padding = PERF_ALIGN(size_read, sizeof(u64)) - size_read; - if (read(session->fd, buf, padding) < 0) + if (readn(session->fd, buf, padding) < 0) die("reading input file"); if (session->repipe) { int retw = write(STDOUT_FILENO, buf, padding); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index cb17e2a8c6ed..8170a3d11ffa 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -82,6 +82,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_DSO, len); } + if (h->parent) + hists__new_col_len(hists, HISTC_PARENT, h->parent->namelen); + if (h->branch_info) { int symlen; /* @@ -242,6 +245,14 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template) if (he->ms.map) he->ms.map->referenced = true; + + if (he->branch_info) { + if (he->branch_info->from.map) + he->branch_info->from.map->referenced = true; + if (he->branch_info->to.map) + he->branch_info->to.map->referenced = true; + } + if (symbol_conf.use_callchain) callchain_init(he->callchain); @@ -251,7 +262,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template) return he; } -static void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h) +void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h) { if (!h->filtered) { hists__calc_col_len(hists, h); @@ -285,7 +296,13 @@ static struct hist_entry *add_hist_entry(struct hists *hists, parent = *p; he = rb_entry(parent, struct hist_entry, rb_node_in); - cmp = hist_entry__cmp(entry, he); + /* + * Make sure that it receives arguments in a same order as + * 
hist_entry__collapse() so that we can use an appropriate + * function when searching an entry regardless which sort + * keys were used. + */ + cmp = hist_entry__cmp(he, entry); if (!cmp) { he_stat__add_period(&he->stat, period); @@ -711,25 +728,38 @@ int hist_entry__annotate(struct hist_entry *he, size_t privsize) return symbol__annotate(he->ms.sym, he->ms.map, privsize); } +void events_stats__inc(struct events_stats *stats, u32 type) +{ + ++stats->nr_events[0]; + ++stats->nr_events[type]; +} + void hists__inc_nr_events(struct hists *hists, u32 type) { - ++hists->stats.nr_events[0]; - ++hists->stats.nr_events[type]; + events_stats__inc(&hists->stats, type); } static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *pair) { - struct rb_node **p = &hists->entries.rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; struct hist_entry *he; int cmp; + if (sort__need_collapse) + root = &hists->entries_collapsed; + else + root = hists->entries_in; + + p = &root->rb_node; + while (*p != NULL) { parent = *p; - he = rb_entry(parent, struct hist_entry, rb_node); + he = rb_entry(parent, struct hist_entry, rb_node_in); - cmp = hist_entry__cmp(pair, he); + cmp = hist_entry__collapse(he, pair); if (!cmp) goto out; @@ -744,8 +774,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, if (he) { memset(&he->stat, 0, sizeof(he->stat)); he->hists = hists; - rb_link_node(&he->rb_node, parent, p); - rb_insert_color(&he->rb_node, &hists->entries); + rb_link_node(&he->rb_node_in, parent, p); + rb_insert_color(&he->rb_node_in, root); hists__inc_nr_entries(hists, he); } out: @@ -755,11 +785,16 @@ out: static struct hist_entry *hists__find_entry(struct hists *hists, struct hist_entry *he) { - struct rb_node *n = hists->entries.rb_node; + struct rb_node *n; + + if (sort__need_collapse) + n = hists->entries_collapsed.rb_node; + else + n = hists->entries_in->rb_node; while (n) { - struct hist_entry *iter = 
rb_entry(n, struct hist_entry, rb_node); - int64_t cmp = hist_entry__cmp(he, iter); + struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node_in); + int64_t cmp = hist_entry__collapse(iter, he); if (cmp < 0) n = n->rb_left; @@ -777,15 +812,21 @@ static struct hist_entry *hists__find_entry(struct hists *hists, */ void hists__match(struct hists *leader, struct hists *other) { + struct rb_root *root; struct rb_node *nd; struct hist_entry *pos, *pair; - for (nd = rb_first(&leader->entries); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct hist_entry, rb_node); + if (sort__need_collapse) + root = &leader->entries_collapsed; + else + root = leader->entries_in; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct hist_entry, rb_node_in); pair = hists__find_entry(other, pos); if (pair) - hist__entry_add_pair(pos, pair); + hist_entry__add_pair(pair, pos); } } @@ -796,17 +837,23 @@ void hists__match(struct hists *leader, struct hists *other) */ int hists__link(struct hists *leader, struct hists *other) { + struct rb_root *root; struct rb_node *nd; struct hist_entry *pos, *pair; - for (nd = rb_first(&other->entries); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct hist_entry, rb_node); + if (sort__need_collapse) + root = &other->entries_collapsed; + else + root = other->entries_in; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct hist_entry, rb_node_in); if (!hist_entry__has_pairs(pos)) { pair = hists__add_dummy_entry(leader, pos); if (pair == NULL) return -1; - hist__entry_add_pair(pair, pos); + hist_entry__add_pair(pos, pair); } } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 8b091a51e4a2..38624686ee9a 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -96,8 +96,10 @@ void hists__decay_entries_threaded(struct hists *hists, bool zap_user, bool zap_kernel); void hists__output_recalc_col_len(struct hists *hists, int max_rows); +void hists__inc_nr_entries(struct 
hists *hists, struct hist_entry *h); void hists__inc_nr_events(struct hists *self, u32 type); -size_t hists__fprintf_nr_events(struct hists *self, FILE *fp); +void events_stats__inc(struct events_stats *stats, u32 type); +size_t events_stats__fprintf(struct events_stats *stats, FILE *fp); size_t hists__fprintf(struct hists *self, bool show_header, int max_rows, int max_cols, FILE *fp); @@ -126,13 +128,19 @@ struct perf_hpp { }; struct perf_hpp_fmt { - bool cond; int (*header)(struct perf_hpp *hpp); int (*width)(struct perf_hpp *hpp); int (*color)(struct perf_hpp *hpp, struct hist_entry *he); int (*entry)(struct perf_hpp *hpp, struct hist_entry *he); + + struct list_head list; }; +extern struct list_head perf_hpp__list; + +#define perf_hpp__for_each_format(format) \ + list_for_each_entry(format, &perf_hpp__list, list) + extern struct perf_hpp_fmt perf_hpp__format[]; enum { @@ -148,14 +156,14 @@ enum { PERF_HPP__DELTA, PERF_HPP__RATIO, PERF_HPP__WEIGHTED_DIFF, - PERF_HPP__DISPL, PERF_HPP__FORMULA, PERF_HPP__MAX_INDEX }; void perf_hpp__init(void); -void perf_hpp__column_enable(unsigned col, bool enable); +void perf_hpp__column_register(struct perf_hpp_fmt *format); +void perf_hpp__column_enable(unsigned col); int hist_entry__period_snprintf(struct perf_hpp *hpp, struct hist_entry *he, bool color); @@ -219,8 +227,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist __maybe_unused, unsigned int hists__sort_list_width(struct hists *self); -double perf_diff__compute_delta(struct hist_entry *he); -double perf_diff__compute_ratio(struct hist_entry *he); -s64 perf_diff__compute_wdiff(struct hist_entry *he); -int perf_diff__formula(char *buf, size_t size, struct hist_entry *he); +double perf_diff__compute_delta(struct hist_entry *he, struct hist_entry *pair); +double perf_diff__compute_ratio(struct hist_entry *he, struct hist_entry *pair); +s64 perf_diff__compute_wdiff(struct hist_entry *he, struct hist_entry *pair); +int perf_diff__formula(struct hist_entry 
*he, struct hist_entry *pair, + char *buf, size_t size); +double perf_diff__period_percent(struct hist_entry *he, u64 period); #endif /* __PERF_HIST_H */ diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index a55d8cf083c9..45cf10a562bd 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -14,6 +14,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32)) +#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE) #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ diff --git a/tools/perf/util/intlist.c b/tools/perf/util/intlist.c index 9d0740024ba8..11a8d86f7fea 100644 --- a/tools/perf/util/intlist.c +++ b/tools/perf/util/intlist.c @@ -59,16 +59,40 @@ void intlist__remove(struct intlist *ilist, struct int_node *node) struct int_node *intlist__find(struct intlist *ilist, int i) { - struct int_node *node = NULL; - struct rb_node *rb_node = rblist__find(&ilist->rblist, (void *)((long)i)); + struct int_node *node; + struct rb_node *rb_node; + if (ilist == NULL) + return NULL; + + node = NULL; + rb_node = rblist__find(&ilist->rblist, (void *)((long)i)); if (rb_node) node = container_of(rb_node, struct int_node, rb_node); return node; } -struct intlist *intlist__new(void) +static int intlist__parse_list(struct intlist *ilist, const char *s) +{ + char *sep; + int err; + + do { + long value = strtol(s, &sep, 10); + err = -EINVAL; + if (*sep != ',' && *sep != '\0') + break; + err = intlist__add(ilist, value); + if (err) + break; + s = sep + 1; + } while (*sep != '\0'); + + return err; +} + +struct intlist *intlist__new(const char *slist) { struct intlist *ilist = malloc(sizeof(*ilist)); @@ -77,9 +101,15 @@ struct intlist *intlist__new(void) ilist->rblist.node_cmp = intlist__node_cmp; 
ilist->rblist.node_new = intlist__node_new; ilist->rblist.node_delete = intlist__node_delete; + + if (slist && intlist__parse_list(ilist, slist)) + goto out_delete; } return ilist; +out_delete: + intlist__delete(ilist); + return NULL; } void intlist__delete(struct intlist *ilist) diff --git a/tools/perf/util/intlist.h b/tools/perf/util/intlist.h index 6d63ab90db50..62351dad848f 100644 --- a/tools/perf/util/intlist.h +++ b/tools/perf/util/intlist.h @@ -15,7 +15,7 @@ struct intlist { struct rblist rblist; }; -struct intlist *intlist__new(void); +struct intlist *intlist__new(const char *slist); void intlist__delete(struct intlist *ilist); void intlist__remove(struct intlist *ilist, struct int_node *in); diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 1f09d0581e6b..efdb38e65a92 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1,10 +1,15 @@ +#include "callchain.h" #include "debug.h" #include "event.h" +#include "evsel.h" +#include "hist.h" #include "machine.h" #include "map.h" +#include "sort.h" #include "strlist.h" #include "thread.h" #include <stdbool.h> +#include "unwind.h" int machine__init(struct machine *machine, const char *root_dir, pid_t pid) { @@ -48,6 +53,29 @@ static void dsos__delete(struct list_head *dsos) } } +void machine__delete_dead_threads(struct machine *machine) +{ + struct thread *n, *t; + + list_for_each_entry_safe(t, n, &machine->dead_threads, node) { + list_del(&t->node); + thread__delete(t); + } +} + +void machine__delete_threads(struct machine *machine) +{ + struct rb_node *nd = rb_first(&machine->threads); + + while (nd) { + struct thread *t = rb_entry(nd, struct thread, rb_node); + + rb_erase(&t->rb_node, &machine->threads); + nd = rb_next(nd); + thread__delete(t); + } +} + void machine__exit(struct machine *machine) { map_groups__exit(&machine->kmaps); @@ -63,10 +91,22 @@ void machine__delete(struct machine *machine) free(machine); } -struct machine *machines__add(struct rb_root *machines, 
pid_t pid, +void machines__init(struct machines *machines) +{ + machine__init(&machines->host, "", HOST_KERNEL_ID); + machines->guests = RB_ROOT; +} + +void machines__exit(struct machines *machines) +{ + machine__exit(&machines->host); + /* XXX exit guest */ +} + +struct machine *machines__add(struct machines *machines, pid_t pid, const char *root_dir) { - struct rb_node **p = &machines->rb_node; + struct rb_node **p = &machines->guests.rb_node; struct rb_node *parent = NULL; struct machine *pos, *machine = malloc(sizeof(*machine)); @@ -88,18 +128,21 @@ struct machine *machines__add(struct rb_root *machines, pid_t pid, } rb_link_node(&machine->rb_node, parent, p); - rb_insert_color(&machine->rb_node, machines); + rb_insert_color(&machine->rb_node, &machines->guests); return machine; } -struct machine *machines__find(struct rb_root *machines, pid_t pid) +struct machine *machines__find(struct machines *machines, pid_t pid) { - struct rb_node **p = &machines->rb_node; + struct rb_node **p = &machines->guests.rb_node; struct rb_node *parent = NULL; struct machine *machine; struct machine *default_machine = NULL; + if (pid == HOST_KERNEL_ID) + return &machines->host; + while (*p != NULL) { parent = *p; machine = rb_entry(parent, struct machine, rb_node); @@ -116,7 +159,7 @@ struct machine *machines__find(struct rb_root *machines, pid_t pid) return default_machine; } -struct machine *machines__findnew(struct rb_root *machines, pid_t pid) +struct machine *machines__findnew(struct machines *machines, pid_t pid) { char path[PATH_MAX]; const char *root_dir = ""; @@ -150,12 +193,12 @@ out: return machine; } -void machines__process(struct rb_root *machines, - machine__process_t process, void *data) +void machines__process_guests(struct machines *machines, + machine__process_t process, void *data) { struct rb_node *nd; - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { struct machine *pos = rb_entry(nd, 
struct machine, rb_node); process(pos, data); } @@ -175,12 +218,14 @@ char *machine__mmap_name(struct machine *machine, char *bf, size_t size) return bf; } -void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size) +void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size) { struct rb_node *node; struct machine *machine; - for (node = rb_first(machines); node; node = rb_next(node)) { + machines->host.id_hdr_size = id_hdr_size; + + for (node = rb_first(&machines->guests); node; node = rb_next(node)) { machine = rb_entry(node, struct machine, rb_node); machine->id_hdr_size = id_hdr_size; } @@ -264,6 +309,537 @@ int machine__process_lost_event(struct machine *machine __maybe_unused, return 0; } +struct map *machine__new_module(struct machine *machine, u64 start, + const char *filename) +{ + struct map *map; + struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename); + + if (dso == NULL) + return NULL; + + map = map__new2(start, dso, MAP__FUNCTION); + if (map == NULL) + return NULL; + + if (machine__is_host(machine)) + dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; + else + dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; + map_groups__insert(&machine->kmaps, map); + return map; +} + +size_t machines__fprintf_dsos(struct machines *machines, FILE *fp) +{ + struct rb_node *nd; + size_t ret = __dsos__fprintf(&machines->host.kernel_dsos, fp) + + __dsos__fprintf(&machines->host.user_dsos, fp); + + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret += __dsos__fprintf(&pos->kernel_dsos, fp); + ret += __dsos__fprintf(&pos->user_dsos, fp); + } + + return ret; +} + +size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm) +{ + return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, skip, parm) + + __dsos__fprintf_buildid(&machine->user_dsos, fp, skip, parm); +} + +size_t 
machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm) +{ + struct rb_node *nd; + size_t ret = machine__fprintf_dsos_buildid(&machines->host, fp, skip, parm); + + for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret += machine__fprintf_dsos_buildid(pos, fp, skip, parm); + } + return ret; +} + +size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) +{ + int i; + size_t printed = 0; + struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso; + + if (kdso->has_build_id) { + char filename[PATH_MAX]; + if (dso__build_id_filename(kdso, filename, sizeof(filename))) + printed += fprintf(fp, "[0] %s\n", filename); + } + + for (i = 0; i < vmlinux_path__nr_entries; ++i) + printed += fprintf(fp, "[%d] %s\n", + i + kdso->has_build_id, vmlinux_path[i]); + + return printed; +} + +size_t machine__fprintf(struct machine *machine, FILE *fp) +{ + size_t ret = 0; + struct rb_node *nd; + + for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { + struct thread *pos = rb_entry(nd, struct thread, rb_node); + + ret += thread__fprintf(pos, fp); + } + + return ret; +} + +static struct dso *machine__get_kernel(struct machine *machine) +{ + const char *vmlinux_name = NULL; + struct dso *kernel; + + if (machine__is_host(machine)) { + vmlinux_name = symbol_conf.vmlinux_name; + if (!vmlinux_name) + vmlinux_name = "[kernel.kallsyms]"; + + kernel = dso__kernel_findnew(machine, vmlinux_name, + "[kernel]", + DSO_TYPE_KERNEL); + } else { + char bf[PATH_MAX]; + + if (machine__is_default_guest(machine)) + vmlinux_name = symbol_conf.default_guest_vmlinux_name; + if (!vmlinux_name) + vmlinux_name = machine__mmap_name(machine, bf, + sizeof(bf)); + + kernel = dso__kernel_findnew(machine, vmlinux_name, + "[guest.kernel]", + DSO_TYPE_GUEST_KERNEL); + } + + if (kernel != NULL && (!kernel->has_build_id)) + 
dso__read_running_kernel_build_id(kernel, machine); + + return kernel; +} + +struct process_args { + u64 start; +}; + +static int symbol__in_kernel(void *arg, const char *name, + char type __maybe_unused, u64 start) +{ + struct process_args *args = arg; + + if (strchr(name, '[')) + return 0; + + args->start = start; + return 1; +} + +/* Figure out the start address of kernel map from /proc/kallsyms */ +static u64 machine__get_kernel_start_addr(struct machine *machine) +{ + const char *filename; + char path[PATH_MAX]; + struct process_args args; + + if (machine__is_host(machine)) { + filename = "/proc/kallsyms"; + } else { + if (machine__is_default_guest(machine)) + filename = (char *)symbol_conf.default_guest_kallsyms; + else { + sprintf(path, "%s/proc/kallsyms", machine->root_dir); + filename = path; + } + } + + if (symbol__restricted_filename(filename, "/proc/kallsyms")) + return 0; + + if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0) + return 0; + + return args.start; +} + +int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) +{ + enum map_type type; + u64 start = machine__get_kernel_start_addr(machine); + + for (type = 0; type < MAP__NR_TYPES; ++type) { + struct kmap *kmap; + + machine->vmlinux_maps[type] = map__new2(start, kernel, type); + if (machine->vmlinux_maps[type] == NULL) + return -1; + + machine->vmlinux_maps[type]->map_ip = + machine->vmlinux_maps[type]->unmap_ip = + identity__map_ip; + kmap = map__kmap(machine->vmlinux_maps[type]); + kmap->kmaps = &machine->kmaps; + map_groups__insert(&machine->kmaps, + machine->vmlinux_maps[type]); + } + + return 0; +} + +void machine__destroy_kernel_maps(struct machine *machine) +{ + enum map_type type; + + for (type = 0; type < MAP__NR_TYPES; ++type) { + struct kmap *kmap; + + if (machine->vmlinux_maps[type] == NULL) + continue; + + kmap = map__kmap(machine->vmlinux_maps[type]); + map_groups__remove(&machine->kmaps, + machine->vmlinux_maps[type]); + if (kmap->ref_reloc_sym) 
{ + /* + * ref_reloc_sym is shared among all maps, so free just + * on one of them. + */ + if (type == MAP__FUNCTION) { + free((char *)kmap->ref_reloc_sym->name); + kmap->ref_reloc_sym->name = NULL; + free(kmap->ref_reloc_sym); + } + kmap->ref_reloc_sym = NULL; + } + + map__delete(machine->vmlinux_maps[type]); + machine->vmlinux_maps[type] = NULL; + } +} + +int machines__create_guest_kernel_maps(struct machines *machines) +{ + int ret = 0; + struct dirent **namelist = NULL; + int i, items = 0; + char path[PATH_MAX]; + pid_t pid; + char *endp; + + if (symbol_conf.default_guest_vmlinux_name || + symbol_conf.default_guest_modules || + symbol_conf.default_guest_kallsyms) { + machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID); + } + + if (symbol_conf.guestmount) { + items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL); + if (items <= 0) + return -ENOENT; + for (i = 0; i < items; i++) { + if (!isdigit(namelist[i]->d_name[0])) { + /* Filter out . and .. */ + continue; + } + pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10); + if ((*endp != '\0') || + (endp == namelist[i]->d_name) || + (errno == ERANGE)) { + pr_debug("invalid directory (%s). 
Skipping.\n", + namelist[i]->d_name); + continue; + } + sprintf(path, "%s/%s/proc/kallsyms", + symbol_conf.guestmount, + namelist[i]->d_name); + ret = access(path, R_OK); + if (ret) { + pr_debug("Can't access file %s\n", path); + goto failure; + } + machines__create_kernel_maps(machines, pid); + } +failure: + free(namelist); + } + + return ret; +} + +void machines__destroy_kernel_maps(struct machines *machines) +{ + struct rb_node *next = rb_first(&machines->guests); + + machine__destroy_kernel_maps(&machines->host); + + while (next) { + struct machine *pos = rb_entry(next, struct machine, rb_node); + + next = rb_next(&pos->rb_node); + rb_erase(&pos->rb_node, &machines->guests); + machine__delete(pos); + } +} + +int machines__create_kernel_maps(struct machines *machines, pid_t pid) +{ + struct machine *machine = machines__findnew(machines, pid); + + if (machine == NULL) + return -1; + + return machine__create_kernel_maps(machine); +} + +int machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, symbol_filter_t filter) +{ + struct map *map = machine->vmlinux_maps[type]; + int ret = dso__load_kallsyms(map->dso, filename, map, filter); + + if (ret > 0) { + dso__set_loaded(map->dso, type); + /* + * Since /proc/kallsyms will have multiple sessions for the + * kernel, with modules between them, fixup the end of all + * sections. 
+ */ + __map_groups__fixup_end(&machine->kmaps, type); + } + + return ret; +} + +int machine__load_vmlinux_path(struct machine *machine, enum map_type type, + symbol_filter_t filter) +{ + struct map *map = machine->vmlinux_maps[type]; + int ret = dso__load_vmlinux_path(map->dso, map, filter); + + if (ret > 0) { + dso__set_loaded(map->dso, type); + map__reloc_vmlinux(map); + } + + return ret; +} + +static void map_groups__fixup_end(struct map_groups *mg) +{ + int i; + for (i = 0; i < MAP__NR_TYPES; ++i) + __map_groups__fixup_end(mg, i); +} + +static char *get_kernel_version(const char *root_dir) +{ + char version[PATH_MAX]; + FILE *file; + char *name, *tmp; + const char *prefix = "Linux version "; + + sprintf(version, "%s/proc/version", root_dir); + file = fopen(version, "r"); + if (!file) + return NULL; + + version[0] = '\0'; + tmp = fgets(version, sizeof(version), file); + fclose(file); + + name = strstr(version, prefix); + if (!name) + return NULL; + name += strlen(prefix); + tmp = strchr(name, ' '); + if (tmp) + *tmp = '\0'; + + return strdup(name); +} + +static int map_groups__set_modules_path_dir(struct map_groups *mg, + const char *dir_name) +{ + struct dirent *dent; + DIR *dir = opendir(dir_name); + int ret = 0; + + if (!dir) { + pr_debug("%s: cannot open %s dir\n", __func__, dir_name); + return -1; + } + + while ((dent = readdir(dir)) != NULL) { + char path[PATH_MAX]; + struct stat st; + + /*sshfs might return bad dent->d_type, so we have to stat*/ + snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); + if (stat(path, &st)) + continue; + + if (S_ISDIR(st.st_mode)) { + if (!strcmp(dent->d_name, ".") || + !strcmp(dent->d_name, "..")) + continue; + + ret = map_groups__set_modules_path_dir(mg, path); + if (ret < 0) + goto out; + } else { + char *dot = strrchr(dent->d_name, '.'), + dso_name[PATH_MAX]; + struct map *map; + char *long_name; + + if (dot == NULL || strcmp(dot, ".ko")) + continue; + snprintf(dso_name, sizeof(dso_name), "[%.*s]", + 
(int)(dot - dent->d_name), dent->d_name); + + strxfrchar(dso_name, '-', '_'); + map = map_groups__find_by_name(mg, MAP__FUNCTION, + dso_name); + if (map == NULL) + continue; + + long_name = strdup(path); + if (long_name == NULL) { + ret = -1; + goto out; + } + dso__set_long_name(map->dso, long_name); + map->dso->lname_alloc = 1; + dso__kernel_module_get_build_id(map->dso, ""); + } + } + +out: + closedir(dir); + return ret; +} + +static int machine__set_modules_path(struct machine *machine) +{ + char *version; + char modules_path[PATH_MAX]; + + version = get_kernel_version(machine->root_dir); + if (!version) + return -1; + + snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel", + machine->root_dir, version); + free(version); + + return map_groups__set_modules_path_dir(&machine->kmaps, modules_path); +} + +static int machine__create_modules(struct machine *machine) +{ + char *line = NULL; + size_t n; + FILE *file; + struct map *map; + const char *modules; + char path[PATH_MAX]; + + if (machine__is_default_guest(machine)) + modules = symbol_conf.default_guest_modules; + else { + sprintf(path, "%s/proc/modules", machine->root_dir); + modules = path; + } + + if (symbol__restricted_filename(path, "/proc/modules")) + return -1; + + file = fopen(modules, "r"); + if (file == NULL) + return -1; + + while (!feof(file)) { + char name[PATH_MAX]; + u64 start; + char *sep; + int line_len; + + line_len = getline(&line, &n, file); + if (line_len < 0) + break; + + if (!line) + goto out_failure; + + line[--line_len] = '\0'; /* \n */ + + sep = strrchr(line, 'x'); + if (sep == NULL) + continue; + + hex2u64(sep + 1, &start); + + sep = strchr(line, ' '); + if (sep == NULL) + continue; + + *sep = '\0'; + + snprintf(name, sizeof(name), "[%s]", line); + map = machine__new_module(machine, start, name); + if (map == NULL) + goto out_delete_line; + dso__kernel_module_get_build_id(map->dso, machine->root_dir); + } + + free(line); + fclose(file); + + return 
machine__set_modules_path(machine); + +out_delete_line: + free(line); +out_failure: + return -1; +} + +int machine__create_kernel_maps(struct machine *machine) +{ + struct dso *kernel = machine__get_kernel(machine); + + if (kernel == NULL || + __machine__create_kernel_maps(machine, kernel) < 0) + return -1; + + if (symbol_conf.use_modules && machine__create_modules(machine) < 0) { + if (machine__is_host(machine)) + pr_debug("Problems creating module maps, " + "continuing anyway...\n"); + else + pr_debug("Problems creating module maps for guest %d, " + "continuing anyway...\n", machine->pid); + } + + /* + * Now that we have all the maps created, just set the ->end of them: + */ + map_groups__fixup_end(&machine->kmaps); + return 0; +} + static void machine__set_kernel_mmap_len(struct machine *machine, union perf_event *event) { @@ -462,3 +1038,189 @@ int machine__process_event(struct machine *machine, union perf_event *event) return ret; } + +void machine__remove_thread(struct machine *machine, struct thread *th) +{ + machine->last_match = NULL; + rb_erase(&th->rb_node, &machine->threads); + /* + * We may have references to this thread, for instance in some hist_entry + * instances, so just move them to a separate list. 
+ */ + list_add_tail(&th->node, &machine->dead_threads); +} + +static bool symbol__match_parent_regex(struct symbol *sym) +{ + if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0)) + return 1; + + return 0; +} + +static const u8 cpumodes[] = { + PERF_RECORD_MISC_USER, + PERF_RECORD_MISC_KERNEL, + PERF_RECORD_MISC_GUEST_USER, + PERF_RECORD_MISC_GUEST_KERNEL +}; +#define NCPUMODES (sizeof(cpumodes)/sizeof(u8)) + +static void ip__resolve_ams(struct machine *machine, struct thread *thread, + struct addr_map_symbol *ams, + u64 ip) +{ + struct addr_location al; + size_t i; + u8 m; + + memset(&al, 0, sizeof(al)); + + for (i = 0; i < NCPUMODES; i++) { + m = cpumodes[i]; + /* + * We cannot use the header.misc hint to determine whether a + * branch stack address is user, kernel, guest, hypervisor. + * Branches may straddle the kernel/user/hypervisor boundaries. + * Thus, we have to try consecutively until we find a match + * or else, the symbol is unknown + */ + thread__find_addr_location(thread, machine, m, MAP__FUNCTION, + ip, &al, NULL); + if (al.sym) + goto found; + } +found: + ams->addr = ip; + ams->al_addr = al.addr; + ams->sym = al.sym; + ams->map = al.map; +} + +struct branch_info *machine__resolve_bstack(struct machine *machine, + struct thread *thr, + struct branch_stack *bs) +{ + struct branch_info *bi; + unsigned int i; + + bi = calloc(bs->nr, sizeof(struct branch_info)); + if (!bi) + return NULL; + + for (i = 0; i < bs->nr; i++) { + ip__resolve_ams(machine, thr, &bi[i].to, bs->entries[i].to); + ip__resolve_ams(machine, thr, &bi[i].from, bs->entries[i].from); + bi[i].flags = bs->entries[i].flags; + } + return bi; +} + +static int machine__resolve_callchain_sample(struct machine *machine, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent) + +{ + u8 cpumode = PERF_RECORD_MISC_USER; + unsigned int i; + int err; + + callchain_cursor_reset(&callchain_cursor); + + if (chain->nr > PERF_MAX_STACK_DEPTH) { + pr_warning("corrupted 
callchain. skipping...\n"); + return 0; + } + + for (i = 0; i < chain->nr; i++) { + u64 ip; + struct addr_location al; + + if (callchain_param.order == ORDER_CALLEE) + ip = chain->ips[i]; + else + ip = chain->ips[chain->nr - i - 1]; + + if (ip >= PERF_CONTEXT_MAX) { + switch (ip) { + case PERF_CONTEXT_HV: + cpumode = PERF_RECORD_MISC_HYPERVISOR; + break; + case PERF_CONTEXT_KERNEL: + cpumode = PERF_RECORD_MISC_KERNEL; + break; + case PERF_CONTEXT_USER: + cpumode = PERF_RECORD_MISC_USER; + break; + default: + pr_debug("invalid callchain context: " + "%"PRId64"\n", (s64) ip); + /* + * It seems the callchain is corrupted. + * Discard all. + */ + callchain_cursor_reset(&callchain_cursor); + return 0; + } + continue; + } + + al.filtered = false; + thread__find_addr_location(thread, machine, cpumode, + MAP__FUNCTION, ip, &al, NULL); + if (al.sym != NULL) { + if (sort__has_parent && !*parent && + symbol__match_parent_regex(al.sym)) + *parent = al.sym; + if (!symbol_conf.use_callchain) + break; + } + + err = callchain_cursor_append(&callchain_cursor, + ip, al.map, al.sym); + if (err) + return err; + } + + return 0; +} + +static int unwind_entry(struct unwind_entry *entry, void *arg) +{ + struct callchain_cursor *cursor = arg; + return callchain_cursor_append(cursor, entry->ip, + entry->map, entry->sym); +} + +int machine__resolve_callchain(struct machine *machine, + struct perf_evsel *evsel, + struct thread *thread, + struct perf_sample *sample, + struct symbol **parent) + +{ + int ret; + + callchain_cursor_reset(&callchain_cursor); + + ret = machine__resolve_callchain_sample(machine, thread, + sample->callchain, parent); + if (ret) + return ret; + + /* Can we do dwarf post unwind? */ + if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && + (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) + return 0; + + /* Bail out if nothing was captured. 
*/ + if ((!sample->user_regs.regs) || + (!sample->user_stack.size)) + return 0; + + return unwind__get_entries(unwind_entry, &callchain_cursor, machine, + thread, evsel->attr.sample_regs_user, + sample); + +} diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index b7cde7467d55..5ac5892f2326 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -47,23 +47,32 @@ int machine__process_event(struct machine *machine, union perf_event *event); typedef void (*machine__process_t)(struct machine *machine, void *data); -void machines__process(struct rb_root *machines, - machine__process_t process, void *data); +struct machines { + struct machine host; + struct rb_root guests; +}; + +void machines__init(struct machines *machines); +void machines__exit(struct machines *machines); -struct machine *machines__add(struct rb_root *machines, pid_t pid, +void machines__process_guests(struct machines *machines, + machine__process_t process, void *data); + +struct machine *machines__add(struct machines *machines, pid_t pid, const char *root_dir); -struct machine *machines__find_host(struct rb_root *machines); -struct machine *machines__find(struct rb_root *machines, pid_t pid); -struct machine *machines__findnew(struct rb_root *machines, pid_t pid); +struct machine *machines__find_host(struct machines *machines); +struct machine *machines__find(struct machines *machines, pid_t pid); +struct machine *machines__findnew(struct machines *machines, pid_t pid); -void machines__set_id_hdr_size(struct rb_root *machines, u16 id_hdr_size); +void machines__set_id_hdr_size(struct machines *machines, u16 id_hdr_size); char *machine__mmap_name(struct machine *machine, char *bf, size_t size); int machine__init(struct machine *machine, const char *root_dir, pid_t pid); void machine__exit(struct machine *machine); +void machine__delete_dead_threads(struct machine *machine); +void machine__delete_threads(struct machine *machine); void machine__delete(struct machine 
*machine); - struct branch_info *machine__resolve_bstack(struct machine *machine, struct thread *thread, struct branch_stack *bs); @@ -129,19 +138,19 @@ int machine__load_kallsyms(struct machine *machine, const char *filename, int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter); -size_t machine__fprintf_dsos_buildid(struct machine *machine, - FILE *fp, bool with_hits); -size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp); -size_t machines__fprintf_dsos_buildid(struct rb_root *machines, - FILE *fp, bool with_hits); +size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm); +size_t machines__fprintf_dsos(struct machines *machines, FILE *fp); +size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp, + bool (skip)(struct dso *dso, int parm), int parm); void machine__destroy_kernel_maps(struct machine *machine); int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel); int machine__create_kernel_maps(struct machine *machine); -int machines__create_kernel_maps(struct rb_root *machines, pid_t pid); -int machines__create_guest_kernel_maps(struct rb_root *machines); -void machines__destroy_guest_kernel_maps(struct rb_root *machines); +int machines__create_kernel_maps(struct machines *machines, pid_t pid); +int machines__create_guest_kernel_maps(struct machines *machines); +void machines__destroy_kernel_maps(struct machines *machines); size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 0328d45c4f2a..ff94425779a2 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -19,7 +19,8 @@ const char *map_type__name[MAP__NR_TYPES] = { static inline int is_anon_memory(const char *filename) { - return strcmp(filename, "//anon") == 0; + return !strcmp(filename, "//anon") || + !strcmp(filename, "/anon_hugepage (deleted)"); } 
static inline int is_no_dso_memory(const char *filename) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2d8d53bec17e..02f6421f03a0 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -380,8 +380,8 @@ static int add_tracepoint(struct list_head **listp, int *idx, return 0; } -static int add_tracepoint_multi(struct list_head **list, int *idx, - char *sys_name, char *evt_name) +static int add_tracepoint_multi_event(struct list_head **list, int *idx, + char *sys_name, char *evt_name) { char evt_path[MAXPATHLEN]; struct dirent *evt_ent; @@ -408,6 +408,47 @@ static int add_tracepoint_multi(struct list_head **list, int *idx, ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name); } + closedir(evt_dir); + return ret; +} + +static int add_tracepoint_event(struct list_head **list, int *idx, + char *sys_name, char *evt_name) +{ + return strpbrk(evt_name, "*?") ? + add_tracepoint_multi_event(list, idx, sys_name, evt_name) : + add_tracepoint(list, idx, sys_name, evt_name); +} + +static int add_tracepoint_multi_sys(struct list_head **list, int *idx, + char *sys_name, char *evt_name) +{ + struct dirent *events_ent; + DIR *events_dir; + int ret = 0; + + events_dir = opendir(tracing_events_path); + if (!events_dir) { + perror("Can't open event dir"); + return -1; + } + + while (!ret && (events_ent = readdir(events_dir))) { + if (!strcmp(events_ent->d_name, ".") + || !strcmp(events_ent->d_name, "..") + || !strcmp(events_ent->d_name, "enable") + || !strcmp(events_ent->d_name, "header_event") + || !strcmp(events_ent->d_name, "header_page")) + continue; + + if (!strglobmatch(events_ent->d_name, sys_name)) + continue; + + ret = add_tracepoint_event(list, idx, events_ent->d_name, + evt_name); + } + + closedir(events_dir); return ret; } @@ -420,9 +461,10 @@ int parse_events_add_tracepoint(struct list_head **list, int *idx, if (ret) return ret; - return strpbrk(event, "*?") ? 
- add_tracepoint_multi(list, idx, sys, event) : - add_tracepoint(list, idx, sys, event); + if (strpbrk(sys, "*?")) + return add_tracepoint_multi_sys(list, idx, sys, event); + else + return add_tracepoint_event(list, idx, sys, event); } static int @@ -492,7 +534,7 @@ int parse_events_add_breakpoint(struct list_head **list, int *idx, } static int config_term(struct perf_event_attr *attr, - struct parse_events__term *term) + struct parse_events_term *term) { #define CHECK_TYPE_VAL(type) \ do { \ @@ -537,7 +579,7 @@ do { \ static int config_attr(struct perf_event_attr *attr, struct list_head *head, int fail) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head, list) if (config_term(attr, term) && fail) @@ -563,14 +605,14 @@ int parse_events_add_numeric(struct list_head **list, int *idx, return add_event(list, idx, &attr, NULL); } -static int parse_events__is_name_term(struct parse_events__term *term) +static int parse_events__is_name_term(struct parse_events_term *term) { return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME; } static char *pmu_event_name(struct list_head *head_terms) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head_terms, list) if (parse_events__is_name_term(term)) @@ -814,7 +856,7 @@ static int parse_events__scanner(const char *str, void *data, int start_token) */ int parse_events_terms(struct list_head *terms, const char *str) { - struct parse_events_data__terms data = { + struct parse_events_terms data = { .terms = NULL, }; int ret; @@ -830,10 +872,9 @@ int parse_events_terms(struct list_head *terms, const char *str) return ret; } -int parse_events(struct perf_evlist *evlist, const char *str, - int unset __maybe_unused) +int parse_events(struct perf_evlist *evlist, const char *str) { - struct parse_events_data__events data = { + struct parse_events_evlist data = { .list = LIST_HEAD_INIT(data.list), .idx = evlist->nr_entries, }; @@ -858,7 +899,7 
@@ int parse_events_option(const struct option *opt, const char *str, int unset __maybe_unused) { struct perf_evlist *evlist = *(struct perf_evlist **)opt->value; - int ret = parse_events(evlist, str, unset); + int ret = parse_events(evlist, str); if (ret) { fprintf(stderr, "invalid or unsupported event: '%s'\n", str); @@ -1121,16 +1162,16 @@ void print_events(const char *event_glob, bool name_only) print_tracepoint_events(NULL, NULL, name_only); } -int parse_events__is_hardcoded_term(struct parse_events__term *term) +int parse_events__is_hardcoded_term(struct parse_events_term *term) { return term->type_term != PARSE_EVENTS__TERM_TYPE_USER; } -static int new_term(struct parse_events__term **_term, int type_val, +static int new_term(struct parse_events_term **_term, int type_val, int type_term, char *config, char *str, u64 num) { - struct parse_events__term *term; + struct parse_events_term *term; term = zalloc(sizeof(*term)); if (!term) @@ -1156,21 +1197,21 @@ static int new_term(struct parse_events__term **_term, int type_val, return 0; } -int parse_events__term_num(struct parse_events__term **term, +int parse_events_term__num(struct parse_events_term **term, int type_term, char *config, u64 num) { return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term, config, NULL, num); } -int parse_events__term_str(struct parse_events__term **term, +int parse_events_term__str(struct parse_events_term **term, int type_term, char *config, char *str) { return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term, config, str, 0); } -int parse_events__term_sym_hw(struct parse_events__term **term, +int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx) { struct event_symbol *sym; @@ -1188,8 +1229,8 @@ int parse_events__term_sym_hw(struct parse_events__term **term, (char *) "event", (char *) sym->symbol, 0); } -int parse_events__term_clone(struct parse_events__term **new, - struct parse_events__term *term) +int 
parse_events_term__clone(struct parse_events_term **new, + struct parse_events_term *term) { return new_term(new, term->type_val, term->type_term, term->config, term->val.str, term->val.num); @@ -1197,7 +1238,7 @@ int parse_events__term_clone(struct parse_events__term **new, void parse_events__free_terms(struct list_head *terms) { - struct parse_events__term *term, *h; + struct parse_events_term *term, *h; list_for_each_entry_safe(term, h, terms, list) free(term); diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b7af80b8bdda..2cd2c42a69c5 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -29,8 +29,7 @@ const char *event_type(int type); extern int parse_events_option(const struct option *opt, const char *str, int unset); -extern int parse_events(struct perf_evlist *evlist, const char *str, - int unset); +extern int parse_events(struct perf_evlist *evlist, const char *str); extern int parse_events_terms(struct list_head *terms, const char *str); extern int parse_filter(const struct option *opt, const char *str, int unset); @@ -51,7 +50,7 @@ enum { PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE, }; -struct parse_events__term { +struct parse_events_term { char *config; union { char *str; @@ -62,24 +61,24 @@ struct parse_events__term { struct list_head list; }; -struct parse_events_data__events { +struct parse_events_evlist { struct list_head list; int idx; }; -struct parse_events_data__terms { +struct parse_events_terms { struct list_head *terms; }; -int parse_events__is_hardcoded_term(struct parse_events__term *term); -int parse_events__term_num(struct parse_events__term **_term, +int parse_events__is_hardcoded_term(struct parse_events_term *term); +int parse_events_term__num(struct parse_events_term **_term, int type_term, char *config, u64 num); -int parse_events__term_str(struct parse_events__term **_term, +int parse_events_term__str(struct parse_events_term **_term, int type_term, char *config, char 
*str); -int parse_events__term_sym_hw(struct parse_events__term **term, +int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx); -int parse_events__term_clone(struct parse_events__term **new, - struct parse_events__term *term); +int parse_events_term__clone(struct parse_events_term **new, + struct parse_events_term *term); void parse_events__free_terms(struct list_head *terms); int parse_events__modifier_event(struct list_head *list, char *str, bool add); int parse_events__modifier_group(struct list_head *list, char *event_mod); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 0f9914ae6bac..9d43c86176ff 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -68,7 +68,7 @@ do { \ char *str; u64 num; struct list_head *head; - struct parse_events__term *term; + struct parse_events_term *term; } %% @@ -79,7 +79,7 @@ PE_START_TERMS start_terms start_events: groups { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; parse_events_update_lists($1, &data->list); } @@ -186,7 +186,7 @@ event_def: event_pmu | event_pmu: PE_NAME '/' event_config '/' { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_pmu(&list, &data->idx, $1, $3)); @@ -202,7 +202,7 @@ PE_VALUE_SYM_SW event_legacy_symbol: value_sym '/' event_config '/' { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; @@ -215,7 +215,7 @@ value_sym '/' event_config '/' | value_sym sep_slash_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; int type = $1 >> 16; int config = $1 & 255; @@ -228,7 +228,7 @@ value_sym sep_slash_dc event_legacy_cache: PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT 
'-' PE_NAME_CACHE_OP_RESULT { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, $5)); @@ -237,7 +237,7 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT | PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, $3, NULL)); @@ -246,7 +246,7 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT | PE_NAME_CACHE_TYPE { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_cache(&list, &data->idx, $1, NULL, NULL)); @@ -256,7 +256,7 @@ PE_NAME_CACHE_TYPE event_legacy_mem: PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, @@ -266,7 +266,7 @@ PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc | PE_PREFIX_MEM PE_VALUE sep_dc { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_breakpoint(&list, &data->idx, @@ -277,7 +277,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc event_legacy_tracepoint: PE_NAME ':' PE_NAME { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_tracepoint(&list, &data->idx, $1, $3)); @@ -287,7 +287,7 @@ PE_NAME ':' PE_NAME event_legacy_numeric: PE_VALUE ':' PE_VALUE { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_numeric(&list, &data->idx, (u32)$1, $3, NULL)); @@ -297,7 
+297,7 @@ PE_VALUE ':' PE_VALUE event_legacy_raw: PE_RAW { - struct parse_events_data__events *data = _data; + struct parse_events_evlist *data = _data; struct list_head *list = NULL; ABORT_ON(parse_events_add_numeric(&list, &data->idx, @@ -307,7 +307,7 @@ PE_RAW start_terms: event_config { - struct parse_events_data__terms *data = _data; + struct parse_events_terms *data = _data; data->terms = $1; } @@ -315,7 +315,7 @@ event_config: event_config ',' event_term { struct list_head *head = $1; - struct parse_events__term *term = $3; + struct parse_events_term *term = $3; ABORT_ON(!head); list_add_tail(&term->list, head); @@ -325,7 +325,7 @@ event_config ',' event_term event_term { struct list_head *head = malloc(sizeof(*head)); - struct parse_events__term *term = $1; + struct parse_events_term *term = $1; ABORT_ON(!head); INIT_LIST_HEAD(head); @@ -336,70 +336,70 @@ event_term event_term: PE_NAME '=' PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_str(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, $3)); $$ = term; } | PE_NAME '=' PE_VALUE { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, $3)); $$ = term; } | PE_NAME '=' PE_VALUE_SYM_HW { - struct parse_events__term *term; + struct parse_events_term *term; int config = $3 & 255; - ABORT_ON(parse_events__term_sym_hw(&term, $1, config)); + ABORT_ON(parse_events_term__sym_hw(&term, $1, config)); $$ = term; } | PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, PARSE_EVENTS__TERM_TYPE_USER, + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, $1, 1)); $$ = term; } | PE_VALUE_SYM_HW { - struct parse_events__term *term; + struct parse_events_term 
*term; int config = $1 & 255; - ABORT_ON(parse_events__term_sym_hw(&term, NULL, config)); + ABORT_ON(parse_events_term__sym_hw(&term, NULL, config)); $$ = term; } | PE_TERM '=' PE_NAME { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_str(&term, (int)$1, NULL, $3)); + ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3)); $$ = term; } | PE_TERM '=' PE_VALUE { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, $3)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3)); $$ = term; } | PE_TERM { - struct parse_events__term *term; + struct parse_events_term *term; - ABORT_ON(parse_events__term_num(&term, (int)$1, NULL, 1)); + ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1)); $$ = term; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 9bdc60c6f138..4c6f9c490a8d 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1,4 +1,3 @@ - #include <linux/list.h> #include <sys/types.h> #include <sys/stat.h> @@ -11,6 +10,19 @@ #include "parse-events.h" #include "cpumap.h" +struct perf_pmu_alias { + char *name; + struct list_head terms; + struct list_head list; +}; + +struct perf_pmu_format { + char *name; + int value; + DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); + struct list_head list; +}; + #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/" int perf_pmu_parse(struct list_head *list, char *name); @@ -85,7 +97,7 @@ static int pmu_format(char *name, struct list_head *format) static int perf_pmu__new_alias(struct list_head *list, char *name, FILE *file) { - struct perf_pmu__alias *alias; + struct perf_pmu_alias *alias; char buf[256]; int ret; @@ -172,15 +184,15 @@ static int pmu_aliases(char *name, struct list_head *head) return 0; } -static int pmu_alias_terms(struct perf_pmu__alias *alias, +static int pmu_alias_terms(struct perf_pmu_alias *alias, struct list_head *terms) { - struct 
parse_events__term *term, *clone; + struct parse_events_term *term, *clone; LIST_HEAD(list); int ret; list_for_each_entry(term, &alias->terms, list) { - ret = parse_events__term_clone(&clone, term); + ret = parse_events_term__clone(&clone, term); if (ret) { parse_events__free_terms(&list); return ret; @@ -360,10 +372,10 @@ struct perf_pmu *perf_pmu__find(char *name) return pmu_lookup(name); } -static struct perf_pmu__format* +static struct perf_pmu_format * pmu_find_format(struct list_head *formats, char *name) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; list_for_each_entry(format, formats, list) if (!strcmp(format->name, name)) @@ -403,9 +415,9 @@ static __u64 pmu_format_value(unsigned long *format, __u64 value) */ static int pmu_config_term(struct list_head *formats, struct perf_event_attr *attr, - struct parse_events__term *term) + struct parse_events_term *term) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; __u64 *vp; /* @@ -450,7 +462,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct perf_event_attr *attr, struct list_head *head_terms) { - struct parse_events__term *term; + struct parse_events_term *term; list_for_each_entry(term, head_terms, list) if (pmu_config_term(formats, attr, term)) @@ -471,10 +483,10 @@ int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr, return perf_pmu__config_terms(&pmu->format, attr, head_terms); } -static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu, - struct parse_events__term *term) +static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu, + struct parse_events_term *term) { - struct perf_pmu__alias *alias; + struct perf_pmu_alias *alias; char *name; if (parse_events__is_hardcoded_term(term)) @@ -507,8 +519,8 @@ static struct perf_pmu__alias *pmu_find_alias(struct perf_pmu *pmu, */ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) { - struct parse_events__term *term, *h; - struct 
perf_pmu__alias *alias; + struct parse_events_term *term, *h; + struct perf_pmu_alias *alias; int ret; list_for_each_entry_safe(term, h, head_terms, list) { @@ -527,7 +539,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms) int perf_pmu__new_format(struct list_head *list, char *name, int config, unsigned long *bits) { - struct perf_pmu__format *format; + struct perf_pmu_format *format; format = zalloc(sizeof(*format)); if (!format) @@ -548,7 +560,7 @@ void perf_pmu__set_format(unsigned long *bits, long from, long to) if (!to) to = from; - memset(bits, 0, BITS_TO_LONGS(PERF_PMU_FORMAT_BITS)); + memset(bits, 0, BITS_TO_BYTES(PERF_PMU_FORMAT_BITS)); for (b = from; b <= to; b++) set_bit(b, bits); } diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index a313ed76a49a..32fe55b659fa 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -12,19 +12,6 @@ enum { #define PERF_PMU_FORMAT_BITS 64 -struct perf_pmu__format { - char *name; - int value; - DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS); - struct list_head list; -}; - -struct perf_pmu__alias { - char *name; - struct list_head terms; - struct list_head list; -}; - struct perf_pmu { char *name; __u32 type; @@ -42,7 +29,7 @@ int perf_pmu__config_terms(struct list_head *formats, struct list_head *head_terms); int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms); struct list_head *perf_pmu__alias(struct perf_pmu *pmu, - struct list_head *head_terms); + struct list_head *head_terms); int perf_pmu_wrap(void); void perf_pmu_error(struct list_head *list, char *name, char const *msg); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1daf5c14e751..be0329394d56 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -413,12 +413,12 @@ static int convert_variable_type(Dwarf_Die *vr_die, dwarf_diename(vr_die), dwarf_diename(&type)); return -EINVAL; } + if (die_get_real_type(&type, &type) == NULL) { 
+ pr_warning("Failed to get a type" + " information.\n"); + return -ENOENT; + } if (ret == DW_TAG_pointer_type) { - if (die_get_real_type(&type, &type) == NULL) { - pr_warning("Failed to get a type" - " information.\n"); - return -ENOENT; - } while (*ref_ptr) ref_ptr = &(*ref_ptr)->next; /* Add new reference with offset +0 */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index a2657fd96837..925e0c3e6d91 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -1045,3 +1045,12 @@ error: if (PyErr_Occurred()) PyErr_SetString(PyExc_ImportError, "perf: Init failed!"); } + +/* + * Dummy, to avoid dragging all the test_attr infrastructure in the python + * binding. + */ +void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu, + int fd, int group_fd, unsigned long flags) +{ +} diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index f80605eb1855..eacec859f299 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -292,6 +292,7 @@ static void perl_process_tracepoint(union perf_event *perf_event __maybe_unused, ns = nsecs - s * NSECS_PER_SEC; scripting_context->event_data = data; + scripting_context->pevent = evsel->tp_format->pevent; ENTER; SAVETMPS; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 14683dfca2ee..e87aa5d9696b 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -265,6 +265,7 @@ static void python_process_tracepoint(union perf_event *perf_event ns = nsecs - s * NSECS_PER_SEC; scripting_context->event_data = data; + scripting_context->pevent = evsel->tp_format->pevent; context = PyCObject_FromVoidPtr(scripting_context, NULL); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 
ce6f51162386..bd85280bb6e8 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -16,7 +16,6 @@ #include "cpumap.h" #include "event-parse.h" #include "perf_regs.h" -#include "unwind.h" #include "vdso.h" static int perf_session__open(struct perf_session *self, bool force) @@ -87,13 +86,12 @@ void perf_session__set_id_hdr_size(struct perf_session *session) { u16 id_hdr_size = perf_evlist__id_hdr_size(session->evlist); - session->host_machine.id_hdr_size = id_hdr_size; machines__set_id_hdr_size(&session->machines, id_hdr_size); } int perf_session__create_kernel_maps(struct perf_session *self) { - int ret = machine__create_kernel_maps(&self->host_machine); + int ret = machine__create_kernel_maps(&self->machines.host); if (ret >= 0) ret = machines__create_guest_kernel_maps(&self->machines); @@ -102,8 +100,7 @@ int perf_session__create_kernel_maps(struct perf_session *self) static void perf_session__destroy_kernel_maps(struct perf_session *self) { - machine__destroy_kernel_maps(&self->host_machine); - machines__destroy_guest_kernel_maps(&self->machines); + machines__destroy_kernel_maps(&self->machines); } struct perf_session *perf_session__new(const char *filename, int mode, @@ -128,22 +125,11 @@ struct perf_session *perf_session__new(const char *filename, int mode, goto out; memcpy(self->filename, filename, len); - /* - * On 64bit we can mmap the data file in one go. No need for tiny mmap - * slices. On 32bit we use 32MB. 
- */ -#if BITS_PER_LONG == 64 - self->mmap_window = ULLONG_MAX; -#else - self->mmap_window = 32 * 1024 * 1024ULL; -#endif - self->machines = RB_ROOT; self->repipe = repipe; INIT_LIST_HEAD(&self->ordered_samples.samples); INIT_LIST_HEAD(&self->ordered_samples.sample_cache); INIT_LIST_HEAD(&self->ordered_samples.to_free); - machine__init(&self->host_machine, "", HOST_KERNEL_ID); - hists__init(&self->hists); + machines__init(&self->machines); if (mode == O_RDONLY) { if (perf_session__open(self, force) < 0) @@ -171,37 +157,30 @@ out_delete: return NULL; } -static void machine__delete_dead_threads(struct machine *machine) -{ - struct thread *n, *t; - - list_for_each_entry_safe(t, n, &machine->dead_threads, node) { - list_del(&t->node); - thread__delete(t); - } -} - static void perf_session__delete_dead_threads(struct perf_session *session) { - machine__delete_dead_threads(&session->host_machine); + machine__delete_dead_threads(&session->machines.host); } -static void machine__delete_threads(struct machine *self) +static void perf_session__delete_threads(struct perf_session *session) { - struct rb_node *nd = rb_first(&self->threads); - - while (nd) { - struct thread *t = rb_entry(nd, struct thread, rb_node); - - rb_erase(&t->rb_node, &self->threads); - nd = rb_next(nd); - thread__delete(t); - } + machine__delete_threads(&session->machines.host); } -static void perf_session__delete_threads(struct perf_session *session) +static void perf_session_env__delete(struct perf_session_env *env) { - machine__delete_threads(&session->host_machine); + free(env->hostname); + free(env->os_release); + free(env->version); + free(env->arch); + free(env->cpu_desc); + free(env->cpuid); + + free(env->cmdline); + free(env->sibling_cores); + free(env->sibling_threads); + free(env->numa_nodes); + free(env->pmu_mappings); } void perf_session__delete(struct perf_session *self) @@ -209,198 +188,13 @@ void perf_session__delete(struct perf_session *self) perf_session__destroy_kernel_maps(self); 
perf_session__delete_dead_threads(self); perf_session__delete_threads(self); - machine__exit(&self->host_machine); + perf_session_env__delete(&self->header.env); + machines__exit(&self->machines); close(self->fd); free(self); vdso__exit(); } -void machine__remove_thread(struct machine *self, struct thread *th) -{ - self->last_match = NULL; - rb_erase(&th->rb_node, &self->threads); - /* - * We may have references to this thread, for instance in some hist_entry - * instances, so just move them to a separate list. - */ - list_add_tail(&th->node, &self->dead_threads); -} - -static bool symbol__match_parent_regex(struct symbol *sym) -{ - if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0)) - return 1; - - return 0; -} - -static const u8 cpumodes[] = { - PERF_RECORD_MISC_USER, - PERF_RECORD_MISC_KERNEL, - PERF_RECORD_MISC_GUEST_USER, - PERF_RECORD_MISC_GUEST_KERNEL -}; -#define NCPUMODES (sizeof(cpumodes)/sizeof(u8)) - -static void ip__resolve_ams(struct machine *self, struct thread *thread, - struct addr_map_symbol *ams, - u64 ip) -{ - struct addr_location al; - size_t i; - u8 m; - - memset(&al, 0, sizeof(al)); - - for (i = 0; i < NCPUMODES; i++) { - m = cpumodes[i]; - /* - * We cannot use the header.misc hint to determine whether a - * branch stack address is user, kernel, guest, hypervisor. - * Branches may straddle the kernel/user/hypervisor boundaries. 
- * Thus, we have to try consecutively until we find a match - * or else, the symbol is unknown - */ - thread__find_addr_location(thread, self, m, MAP__FUNCTION, - ip, &al, NULL); - if (al.sym) - goto found; - } -found: - ams->addr = ip; - ams->al_addr = al.addr; - ams->sym = al.sym; - ams->map = al.map; -} - -struct branch_info *machine__resolve_bstack(struct machine *self, - struct thread *thr, - struct branch_stack *bs) -{ - struct branch_info *bi; - unsigned int i; - - bi = calloc(bs->nr, sizeof(struct branch_info)); - if (!bi) - return NULL; - - for (i = 0; i < bs->nr; i++) { - ip__resolve_ams(self, thr, &bi[i].to, bs->entries[i].to); - ip__resolve_ams(self, thr, &bi[i].from, bs->entries[i].from); - bi[i].flags = bs->entries[i].flags; - } - return bi; -} - -static int machine__resolve_callchain_sample(struct machine *machine, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent) - -{ - u8 cpumode = PERF_RECORD_MISC_USER; - unsigned int i; - int err; - - callchain_cursor_reset(&callchain_cursor); - - if (chain->nr > PERF_MAX_STACK_DEPTH) { - pr_warning("corrupted callchain. skipping...\n"); - return 0; - } - - for (i = 0; i < chain->nr; i++) { - u64 ip; - struct addr_location al; - - if (callchain_param.order == ORDER_CALLEE) - ip = chain->ips[i]; - else - ip = chain->ips[chain->nr - i - 1]; - - if (ip >= PERF_CONTEXT_MAX) { - switch (ip) { - case PERF_CONTEXT_HV: - cpumode = PERF_RECORD_MISC_HYPERVISOR; - break; - case PERF_CONTEXT_KERNEL: - cpumode = PERF_RECORD_MISC_KERNEL; - break; - case PERF_CONTEXT_USER: - cpumode = PERF_RECORD_MISC_USER; - break; - default: - pr_debug("invalid callchain context: " - "%"PRId64"\n", (s64) ip); - /* - * It seems the callchain is corrupted. - * Discard all. 
- */ - callchain_cursor_reset(&callchain_cursor); - return 0; - } - continue; - } - - al.filtered = false; - thread__find_addr_location(thread, machine, cpumode, - MAP__FUNCTION, ip, &al, NULL); - if (al.sym != NULL) { - if (sort__has_parent && !*parent && - symbol__match_parent_regex(al.sym)) - *parent = al.sym; - if (!symbol_conf.use_callchain) - break; - } - - err = callchain_cursor_append(&callchain_cursor, - ip, al.map, al.sym); - if (err) - return err; - } - - return 0; -} - -static int unwind_entry(struct unwind_entry *entry, void *arg) -{ - struct callchain_cursor *cursor = arg; - return callchain_cursor_append(cursor, entry->ip, - entry->map, entry->sym); -} - -int machine__resolve_callchain(struct machine *machine, - struct perf_evsel *evsel, - struct thread *thread, - struct perf_sample *sample, - struct symbol **parent) - -{ - int ret; - - callchain_cursor_reset(&callchain_cursor); - - ret = machine__resolve_callchain_sample(machine, thread, - sample->callchain, parent); - if (ret) - return ret; - - /* Can we do dwarf post unwind? */ - if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && - (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) - return 0; - - /* Bail out if nothing was captured. 
*/ - if ((!sample->user_regs.regs) || - (!sample->user_stack.size)) - return 0; - - return unwind__get_entries(unwind_entry, &callchain_cursor, machine, - thread, evsel->attr.sample_regs_user, - sample); - -} - static int process_event_synth_tracing_data_stub(union perf_event *event __maybe_unused, struct perf_session *session @@ -1027,7 +821,7 @@ static struct machine * return perf_session__findnew_machine(session, pid); } - return perf_session__find_host_machine(session); + return &session->machines.host; } static int perf_session_deliver_event(struct perf_session *session, @@ -1065,11 +859,11 @@ static int perf_session_deliver_event(struct perf_session *session, case PERF_RECORD_SAMPLE: dump_sample(evsel, event, sample); if (evsel == NULL) { - ++session->hists.stats.nr_unknown_id; + ++session->stats.nr_unknown_id; return 0; } if (machine == NULL) { - ++session->hists.stats.nr_unprocessable_samples; + ++session->stats.nr_unprocessable_samples; return 0; } return tool->sample(tool, event, sample, evsel, machine); @@ -1083,7 +877,7 @@ static int perf_session_deliver_event(struct perf_session *session, return tool->exit(tool, event, sample, machine); case PERF_RECORD_LOST: if (tool->lost == perf_event__process_lost) - session->hists.stats.total_lost += event->lost.lost; + session->stats.total_lost += event->lost.lost; return tool->lost(tool, event, sample, machine); case PERF_RECORD_READ: return tool->read(tool, event, sample, evsel, machine); @@ -1092,7 +886,7 @@ static int perf_session_deliver_event(struct perf_session *session, case PERF_RECORD_UNTHROTTLE: return tool->unthrottle(tool, event, sample, machine); default: - ++session->hists.stats.nr_unknown_events; + ++session->stats.nr_unknown_events; return -1; } } @@ -1106,8 +900,8 @@ static int perf_session__preprocess_sample(struct perf_session *session, if (!ip_callchain__valid(sample->callchain, event)) { pr_debug("call-chain problem with event, skipping it.\n"); - ++session->hists.stats.nr_invalid_chains; - 
session->hists.stats.total_invalid_chains += sample->period; + ++session->stats.nr_invalid_chains; + session->stats.total_invalid_chains += sample->period; return -EINVAL; } return 0; @@ -1165,7 +959,7 @@ static int perf_session__process_event(struct perf_session *session, if (event->header.type >= PERF_RECORD_HEADER_MAX) return -EINVAL; - hists__inc_nr_events(&session->hists, event->header.type); + events_stats__inc(&session->stats, event->header.type); if (event->header.type >= PERF_RECORD_USER_TYPE_START) return perf_session__process_user_event(session, event, tool, file_offset); @@ -1201,7 +995,7 @@ void perf_event_header__bswap(struct perf_event_header *self) struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) { - return machine__findnew_thread(&session->host_machine, pid); + return machine__findnew_thread(&session->machines.host, pid); } static struct thread *perf_session__register_idle_thread(struct perf_session *self) @@ -1220,39 +1014,39 @@ static void perf_session__warn_about_errors(const struct perf_session *session, const struct perf_tool *tool) { if (tool->lost == perf_event__process_lost && - session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) { + session->stats.nr_events[PERF_RECORD_LOST] != 0) { ui__warning("Processed %d events and lost %d chunks!\n\n" "Check IO/CPU overload!\n\n", - session->hists.stats.nr_events[0], - session->hists.stats.nr_events[PERF_RECORD_LOST]); + session->stats.nr_events[0], + session->stats.nr_events[PERF_RECORD_LOST]); } - if (session->hists.stats.nr_unknown_events != 0) { + if (session->stats.nr_unknown_events != 0) { ui__warning("Found %u unknown events!\n\n" "Is this an older tool processing a perf.data " "file generated by a more recent tool?\n\n" "If that is not the case, consider " "reporting to linux-kernel@vger.kernel.org.\n\n", - session->hists.stats.nr_unknown_events); + session->stats.nr_unknown_events); } - if (session->hists.stats.nr_unknown_id != 0) { + if 
(session->stats.nr_unknown_id != 0) { ui__warning("%u samples with id not present in the header\n", - session->hists.stats.nr_unknown_id); + session->stats.nr_unknown_id); } - if (session->hists.stats.nr_invalid_chains != 0) { + if (session->stats.nr_invalid_chains != 0) { ui__warning("Found invalid callchains!\n\n" "%u out of %u events were discarded for this reason.\n\n" "Consider reporting to linux-kernel@vger.kernel.org.\n\n", - session->hists.stats.nr_invalid_chains, - session->hists.stats.nr_events[PERF_RECORD_SAMPLE]); + session->stats.nr_invalid_chains, + session->stats.nr_events[PERF_RECORD_SAMPLE]); } - if (session->hists.stats.nr_unprocessable_samples != 0) { + if (session->stats.nr_unprocessable_samples != 0) { ui__warning("%u unprocessable samples recorded.\n" "Do you have a KVM guest running and not using 'perf kvm'?\n", - session->hists.stats.nr_unprocessable_samples); + session->stats.nr_unprocessable_samples); } } @@ -1369,6 +1163,18 @@ fetch_mmaped_event(struct perf_session *session, return event; } +/* + * On 64bit we can mmap the data file in one go. No need for tiny mmap + * slices. On 32bit we use 32MB. 
+ */ +#if BITS_PER_LONG == 64 +#define MMAP_SIZE ULLONG_MAX +#define NUM_MMAPS 1 +#else +#define MMAP_SIZE (32 * 1024 * 1024ULL) +#define NUM_MMAPS 128 +#endif + int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, u64 file_size, struct perf_tool *tool) @@ -1376,7 +1182,7 @@ int __perf_session__process_events(struct perf_session *session, u64 head, page_offset, file_offset, file_pos, progress_next; int err, mmap_prot, mmap_flags, map_idx = 0; size_t mmap_size; - char *buf, *mmaps[8]; + char *buf, *mmaps[NUM_MMAPS]; union perf_event *event; uint32_t size; @@ -1391,7 +1197,7 @@ int __perf_session__process_events(struct perf_session *session, progress_next = file_size / 16; - mmap_size = session->mmap_window; + mmap_size = MMAP_SIZE; if (mmap_size > file_size) mmap_size = file_size; @@ -1526,16 +1332,13 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps, size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp) { - return __dsos__fprintf(&self->host_machine.kernel_dsos, fp) + - __dsos__fprintf(&self->host_machine.user_dsos, fp) + - machines__fprintf_dsos(&self->machines, fp); + return machines__fprintf_dsos(&self->machines, fp); } size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp, - bool with_hits) + bool (skip)(struct dso *dso, int parm), int parm) { - size_t ret = machine__fprintf_dsos_buildid(&self->host_machine, fp, with_hits); - return ret + machines__fprintf_dsos_buildid(&self->machines, fp, with_hits); + return machines__fprintf_dsos_buildid(&self->machines, fp, skip, parm); } size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) @@ -1543,11 +1346,11 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) struct perf_evsel *pos; size_t ret = fprintf(fp, "Aggregated stats:\n"); - ret += hists__fprintf_nr_events(&session->hists, fp); + ret += events_stats__fprintf(&session->stats, fp); list_for_each_entry(pos, 
&session->evlist->entries, node) { ret += fprintf(fp, "%s stats:\n", perf_evsel__name(pos)); - ret += hists__fprintf_nr_events(&pos->hists, fp); + ret += events_stats__fprintf(&pos->hists.stats, fp); } return ret; @@ -1559,7 +1362,7 @@ size_t perf_session__fprintf(struct perf_session *session, FILE *fp) * FIXME: Here we have to actually print all the machines in this * session, not just the host... */ - return machine__fprintf(&session->host_machine, fp); + return machine__fprintf(&session->machines.host, fp); } void perf_session__remove_thread(struct perf_session *session, @@ -1568,10 +1371,10 @@ void perf_session__remove_thread(struct perf_session *session, /* * FIXME: This one makes no sense, we need to remove the thread from * the machine it belongs to, perf_session can have many machines, so - * doing it always on ->host_machine is wrong. Fix when auditing all + * doing it always on ->machines.host is wrong. Fix when auditing all * the 'perf kvm' code. */ - machine__remove_thread(&session->host_machine, th); + machine__remove_thread(&session->machines.host, th); } struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index cea133a6bdf1..b5c0847edfa9 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -30,16 +30,10 @@ struct ordered_samples { struct perf_session { struct perf_header header; unsigned long size; - unsigned long mmap_window; - struct machine host_machine; - struct rb_root machines; + struct machines machines; struct perf_evlist *evlist; struct pevent *pevent; - /* - * FIXME: Need to split this up further, we need global - * stats + per event stats. 
- */ - struct hists hists; + struct events_stats stats; int fd; bool fd_pipe; bool repipe; @@ -54,7 +48,7 @@ struct perf_tool; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, struct perf_tool *tool); -void perf_session__delete(struct perf_session *self); +void perf_session__delete(struct perf_session *session); void perf_event_header__bswap(struct perf_event_header *self); @@ -81,43 +75,24 @@ void perf_session__set_id_hdr_size(struct perf_session *session); void perf_session__remove_thread(struct perf_session *self, struct thread *th); static inline -struct machine *perf_session__find_host_machine(struct perf_session *self) -{ - return &self->host_machine; -} - -static inline struct machine *perf_session__find_machine(struct perf_session *self, pid_t pid) { - if (pid == HOST_KERNEL_ID) - return &self->host_machine; return machines__find(&self->machines, pid); } static inline struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t pid) { - if (pid == HOST_KERNEL_ID) - return &self->host_machine; return machines__findnew(&self->machines, pid); } -static inline -void perf_session__process_machines(struct perf_session *self, - struct perf_tool *tool, - machine__process_t process) -{ - process(&self->host_machine, tool); - return machines__process(&self->machines, process, tool); -} - struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); size_t perf_session__fprintf(struct perf_session *self, FILE *fp); size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); -size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, - FILE *fp, bool with_hits); +size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp, + bool (fn)(struct dso *dso, int parm), int parm); size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index cfd1c0feb32d..7ad62393aa88 
100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -60,7 +60,7 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%*s:%5d", width, + return repsep_snprintf(bf, size, "%*s:%5d", width - 6, self->thread->comm ?: "", self->thread->pid); } @@ -97,6 +97,16 @@ static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf, return repsep_snprintf(bf, size, "%*s", width, self->thread->comm); } +struct sort_entry sort_comm = { + .se_header = "Command", + .se_cmp = sort__comm_cmp, + .se_collapse = sort__comm_collapse, + .se_snprintf = hist_entry__comm_snprintf, + .se_width_idx = HISTC_COMM, +}; + +/* --sort dso */ + static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) { struct dso *dso_l = map_l ? map_l->dso : NULL; @@ -117,22 +127,38 @@ static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r) return strcmp(dso_name_l, dso_name_r); } -struct sort_entry sort_comm = { - .se_header = "Command", - .se_cmp = sort__comm_cmp, - .se_collapse = sort__comm_collapse, - .se_snprintf = hist_entry__comm_snprintf, - .se_width_idx = HISTC_COMM, -}; - -/* --sort dso */ - static int64_t sort__dso_cmp(struct hist_entry *left, struct hist_entry *right) { return _sort__dso_cmp(left->ms.map, right->ms.map); } +static int _hist_entry__dso_snprintf(struct map *map, char *bf, + size_t size, unsigned int width) +{ + if (map && map->dso) { + const char *dso_name = !verbose ? 
map->dso->short_name : + map->dso->long_name; + return repsep_snprintf(bf, size, "%-*s", width, dso_name); + } + + return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); +} + +static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, + size_t size, unsigned int width) +{ + return _hist_entry__dso_snprintf(self->ms.map, bf, size, width); +} + +struct sort_entry sort_dso = { + .se_header = "Shared Object", + .se_cmp = sort__dso_cmp, + .se_snprintf = hist_entry__dso_snprintf, + .se_width_idx = HISTC_DSO, +}; + +/* --sort symbol */ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r, u64 ip_l, u64 ip_r) @@ -143,35 +169,35 @@ static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r, if (sym_l == sym_r) return 0; - if (sym_l) - ip_l = sym_l->start; - if (sym_r) - ip_r = sym_r->start; + ip_l = sym_l->start; + ip_r = sym_r->start; return (int64_t)(ip_r - ip_l); } -static int _hist_entry__dso_snprintf(struct map *map, char *bf, - size_t size, unsigned int width) +static int64_t +sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) { - if (map && map->dso) { - const char *dso_name = !verbose ? 
map->dso->short_name : - map->dso->long_name; - return repsep_snprintf(bf, size, "%-*s", width, dso_name); - } + u64 ip_l, ip_r; - return repsep_snprintf(bf, size, "%-*s", width, "[unknown]"); -} + if (!left->ms.sym && !right->ms.sym) + return right->level - left->level; -static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf, - size_t size, unsigned int width) -{ - return _hist_entry__dso_snprintf(self->ms.map, bf, size, width); + if (!left->ms.sym || !right->ms.sym) + return cmp_null(left->ms.sym, right->ms.sym); + + if (left->ms.sym == right->ms.sym) + return 0; + + ip_l = left->ms.sym->start; + ip_r = right->ms.sym->start; + + return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r); } static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, u64 ip, char level, char *bf, size_t size, - unsigned int width __maybe_unused) + unsigned int width) { size_t ret = 0; @@ -197,43 +223,13 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, return ret; } - -struct sort_entry sort_dso = { - .se_header = "Shared Object", - .se_cmp = sort__dso_cmp, - .se_snprintf = hist_entry__dso_snprintf, - .se_width_idx = HISTC_DSO, -}; - static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { return _hist_entry__sym_snprintf(self->ms.map, self->ms.sym, self->ip, self->level, bf, size, width); } -/* --sort symbol */ -static int64_t -sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) -{ - u64 ip_l, ip_r; - - if (!left->ms.sym && !right->ms.sym) - return right->level - left->level; - - if (!left->ms.sym || !right->ms.sym) - return cmp_null(left->ms.sym, right->ms.sym); - - if (left->ms.sym == right->ms.sym) - return 0; - - ip_l = left->ms.sym->start; - ip_r = right->ms.sym->start; - - return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r); -} - struct sort_entry sort_sym = { .se_header = "Symbol", .se_cmp 
= sort__sym_cmp, @@ -335,7 +331,7 @@ sort__cpu_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__cpu_snprintf(struct hist_entry *self, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*d", width, self->cpu); + return repsep_snprintf(bf, size, "%*d", width, self->cpu); } struct sort_entry sort_cpu = { @@ -345,6 +341,8 @@ struct sort_entry sort_cpu = { .se_width_idx = HISTC_CPU, }; +/* sort keys for branch stacks */ + static int64_t sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -359,13 +357,6 @@ static int hist_entry__dso_from_snprintf(struct hist_entry *self, char *bf, bf, size, width); } -struct sort_entry sort_dso_from = { - .se_header = "Source Shared Object", - .se_cmp = sort__dso_from_cmp, - .se_snprintf = hist_entry__dso_from_snprintf, - .se_width_idx = HISTC_DSO_FROM, -}; - static int64_t sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -406,8 +397,7 @@ sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right) } static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { struct addr_map_symbol *from = &self->branch_info->from; return _hist_entry__sym_snprintf(from->map, from->sym, from->addr, @@ -416,8 +406,7 @@ static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf, } static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf, - size_t size, - unsigned int width __maybe_unused) + size_t size, unsigned int width) { struct addr_map_symbol *to = &self->branch_info->to; return _hist_entry__sym_snprintf(to->map, to->sym, to->addr, @@ -425,6 +414,13 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf, } +struct sort_entry sort_dso_from = { + .se_header = "Source Shared Object", + .se_cmp = sort__dso_from_cmp, + .se_snprintf = hist_entry__dso_from_snprintf, + .se_width_idx = 
HISTC_DSO_FROM, +}; + struct sort_entry sort_dso_to = { .se_header = "Target Shared Object", .se_cmp = sort__dso_to_cmp, @@ -484,30 +480,40 @@ struct sort_dimension { #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) } -static struct sort_dimension sort_dimensions[] = { +static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_PID, "pid", sort_thread), DIM(SORT_COMM, "comm", sort_comm), DIM(SORT_DSO, "dso", sort_dso), - DIM(SORT_DSO_FROM, "dso_from", sort_dso_from), - DIM(SORT_DSO_TO, "dso_to", sort_dso_to), DIM(SORT_SYM, "symbol", sort_sym), - DIM(SORT_SYM_TO, "symbol_from", sort_sym_from), - DIM(SORT_SYM_FROM, "symbol_to", sort_sym_to), DIM(SORT_PARENT, "parent", sort_parent), DIM(SORT_CPU, "cpu", sort_cpu), - DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), DIM(SORT_SRCLINE, "srcline", sort_srcline), }; +#undef DIM + +#define DIM(d, n, func) [d - __SORT_BRANCH_STACK] = { .name = n, .entry = &(func) } + +static struct sort_dimension bstack_sort_dimensions[] = { + DIM(SORT_DSO_FROM, "dso_from", sort_dso_from), + DIM(SORT_DSO_TO, "dso_to", sort_dso_to), + DIM(SORT_SYM_FROM, "symbol_from", sort_sym_from), + DIM(SORT_SYM_TO, "symbol_to", sort_sym_to), + DIM(SORT_MISPREDICT, "mispredict", sort_mispredict), +}; + +#undef DIM + int sort_dimension__add(const char *tok) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) { - struct sort_dimension *sd = &sort_dimensions[i]; + for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) { + struct sort_dimension *sd = &common_sort_dimensions[i]; if (strncasecmp(tok, sd->name, strlen(tok))) continue; + if (sd->entry == &sort_parent) { int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED); if (ret) { @@ -518,9 +524,7 @@ int sort_dimension__add(const char *tok) return -EINVAL; } sort__has_parent = 1; - } else if (sd->entry == &sort_sym || - sd->entry == &sort_sym_from || - sd->entry == &sort_sym_to) { + } else if (sd->entry == &sort_sym) { sort__has_sym = 1; } @@ -530,36 +534,42 
@@ int sort_dimension__add(const char *tok) if (sd->entry->se_collapse) sort__need_collapse = 1; - if (list_empty(&hist_entry__sort_list)) { - if (!strcmp(sd->name, "pid")) - sort__first_dimension = SORT_PID; - else if (!strcmp(sd->name, "comm")) - sort__first_dimension = SORT_COMM; - else if (!strcmp(sd->name, "dso")) - sort__first_dimension = SORT_DSO; - else if (!strcmp(sd->name, "symbol")) - sort__first_dimension = SORT_SYM; - else if (!strcmp(sd->name, "parent")) - sort__first_dimension = SORT_PARENT; - else if (!strcmp(sd->name, "cpu")) - sort__first_dimension = SORT_CPU; - else if (!strcmp(sd->name, "symbol_from")) - sort__first_dimension = SORT_SYM_FROM; - else if (!strcmp(sd->name, "symbol_to")) - sort__first_dimension = SORT_SYM_TO; - else if (!strcmp(sd->name, "dso_from")) - sort__first_dimension = SORT_DSO_FROM; - else if (!strcmp(sd->name, "dso_to")) - sort__first_dimension = SORT_DSO_TO; - else if (!strcmp(sd->name, "mispredict")) - sort__first_dimension = SORT_MISPREDICT; - } + if (list_empty(&hist_entry__sort_list)) + sort__first_dimension = i; list_add_tail(&sd->entry->list, &hist_entry__sort_list); sd->taken = 1; return 0; } + + for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { + struct sort_dimension *sd = &bstack_sort_dimensions[i]; + + if (strncasecmp(tok, sd->name, strlen(tok))) + continue; + + if (sort__branch_mode != 1) + return -EINVAL; + + if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) + sort__has_sym = 1; + + if (sd->taken) + return 0; + + if (sd->entry->se_collapse) + sort__need_collapse = 1; + + if (list_empty(&hist_entry__sort_list)) + sort__first_dimension = i + __SORT_BRANCH_STACK; + + list_add_tail(&sd->entry->list, &hist_entry__sort_list); + sd->taken = 1; + + return 0; + } + return -ESRCH; } @@ -569,7 +579,11 @@ void setup_sorting(const char * const usagestr[], const struct option *opts) for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { - if (sort_dimension__add(tok) < 0) { + 
int ret = sort_dimension__add(tok); + if (ret == -EINVAL) { + error("Invalid --sort key: `%s'", tok); + usage_with_options(usagestr, opts); + } else if (ret == -ESRCH) { error("Unknown --sort key: `%s'", tok); usage_with_options(usagestr, opts); } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index b4e8c3ba559d..e994ad3e9897 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -55,9 +55,6 @@ struct he_stat { struct hist_entry_diff { bool computed; - /* PERF_HPP__DISPL */ - int displacement; - /* PERF_HPP__DELTA */ double period_ratio_delta; @@ -118,25 +115,29 @@ static inline struct hist_entry *hist_entry__next_pair(struct hist_entry *he) return NULL; } -static inline void hist__entry_add_pair(struct hist_entry *he, +static inline void hist_entry__add_pair(struct hist_entry *he, struct hist_entry *pair) { list_add_tail(&he->pairs.head, &pair->pairs.node); } enum sort_type { + /* common sort keys */ SORT_PID, SORT_COMM, SORT_DSO, SORT_SYM, SORT_PARENT, SORT_CPU, - SORT_DSO_FROM, + SORT_SRCLINE, + + /* branch stack specific sort keys */ + __SORT_BRANCH_STACK, + SORT_DSO_FROM = __SORT_BRANCH_STACK, SORT_DSO_TO, SORT_SYM_FROM, SORT_SYM_TO, SORT_MISPREDICT, - SORT_SRCLINE, }; /* diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c index 346707df04b9..29c7b2cb2521 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -332,6 +332,24 @@ char *strxfrchar(char *s, char from, char to) } /** + * ltrim - Removes leading whitespace from @s. + * @s: The string to be stripped. + * + * Return pointer to the first non-whitespace character in @s. + */ +char *ltrim(char *s) +{ + int len = strlen(s); + + while (len && isspace(*s)) { + len--; + s++; + } + + return s; +} + +/** * rtrim - Removes trailing whitespace from @s. * @s: The string to be stripped. 
* diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index db0cc92cf2ea..54efcb5659ac 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1,6 +1,3 @@ -#include <libelf.h> -#include <gelf.h> -#include <elf.h> #include <fcntl.h> #include <stdio.h> #include <errno.h> @@ -718,6 +715,17 @@ int dso__load_sym(struct dso *dso, struct map *map, sym.st_value); used_opd = true; } + /* + * When loading symbols in a data mapping, ABS symbols (which + * has a value of SHN_ABS in its st_shndx) failed at + * elf_getscn(). And it marks the loading as a failure so + * already loaded symbols cannot be fixed up. + * + * I'm not sure what should be done. Just ignore them for now. + * - Namhyung Kim + */ + if (sym.st_shndx == SHN_ABS) + continue; sec = elf_getscn(runtime_ss->elf, sym.st_shndx); if (!sec) diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 259f8f2ea9c9..a7390cde63bc 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -1,6 +1,5 @@ #include "symbol.h" -#include <elf.h> #include <stdio.h> #include <fcntl.h> #include <string.h> diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 295f8d4feedf..e6432d85b43d 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -28,8 +28,8 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); static int dso__load_guest_kernel_sym(struct dso *dso, struct map *map, symbol_filter_t filter); -static int vmlinux_path__nr_entries; -static char **vmlinux_path; +int vmlinux_path__nr_entries; +char **vmlinux_path; struct symbol_conf symbol_conf = { .exclude_other = true, @@ -202,13 +202,6 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type) curr->end = ~0ULL; } -static void map_groups__fixup_end(struct map_groups *mg) -{ - int i; - for (i = 0; i < MAP__NR_TYPES; ++i) - __map_groups__fixup_end(mg, i); -} - struct symbol *symbol__new(u64 
start, u64 len, u8 binding, const char *name) { size_t namelen = strlen(name) + 1; @@ -652,8 +645,8 @@ discard_symbol: rb_erase(&pos->rb_node, root); return count + moved; } -static bool symbol__restricted_filename(const char *filename, - const char *restricted_filename) +bool symbol__restricted_filename(const char *filename, + const char *restricted_filename) { bool restricted = false; @@ -775,10 +768,6 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) else machine = NULL; - name = malloc(PATH_MAX); - if (!name) - return -1; - dso->adjust_symbols = 0; if (strncmp(dso->name, "/tmp/perf-", 10) == 0) { @@ -802,6 +791,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) if (machine) root_dir = machine->root_dir; + name = malloc(PATH_MAX); + if (!name) + return -1; + /* Iterate over candidate debug images. * Keep track of "interesting" ones (those which have a symtab, dynsym, * and/or opd section) for processing. @@ -887,200 +880,6 @@ struct map *map_groups__find_by_name(struct map_groups *mg, return NULL; } -static int map_groups__set_modules_path_dir(struct map_groups *mg, - const char *dir_name) -{ - struct dirent *dent; - DIR *dir = opendir(dir_name); - int ret = 0; - - if (!dir) { - pr_debug("%s: cannot open %s dir\n", __func__, dir_name); - return -1; - } - - while ((dent = readdir(dir)) != NULL) { - char path[PATH_MAX]; - struct stat st; - - /*sshfs might return bad dent->d_type, so we have to stat*/ - snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); - if (stat(path, &st)) - continue; - - if (S_ISDIR(st.st_mode)) { - if (!strcmp(dent->d_name, ".") || - !strcmp(dent->d_name, "..")) - continue; - - ret = map_groups__set_modules_path_dir(mg, path); - if (ret < 0) - goto out; - } else { - char *dot = strrchr(dent->d_name, '.'), - dso_name[PATH_MAX]; - struct map *map; - char *long_name; - - if (dot == NULL || strcmp(dot, ".ko")) - continue; - snprintf(dso_name, sizeof(dso_name), "[%.*s]", - (int)(dot - 
dent->d_name), dent->d_name); - - strxfrchar(dso_name, '-', '_'); - map = map_groups__find_by_name(mg, MAP__FUNCTION, - dso_name); - if (map == NULL) - continue; - - long_name = strdup(path); - if (long_name == NULL) { - ret = -1; - goto out; - } - dso__set_long_name(map->dso, long_name); - map->dso->lname_alloc = 1; - dso__kernel_module_get_build_id(map->dso, ""); - } - } - -out: - closedir(dir); - return ret; -} - -static char *get_kernel_version(const char *root_dir) -{ - char version[PATH_MAX]; - FILE *file; - char *name, *tmp; - const char *prefix = "Linux version "; - - sprintf(version, "%s/proc/version", root_dir); - file = fopen(version, "r"); - if (!file) - return NULL; - - version[0] = '\0'; - tmp = fgets(version, sizeof(version), file); - fclose(file); - - name = strstr(version, prefix); - if (!name) - return NULL; - name += strlen(prefix); - tmp = strchr(name, ' '); - if (tmp) - *tmp = '\0'; - - return strdup(name); -} - -static int machine__set_modules_path(struct machine *machine) -{ - char *version; - char modules_path[PATH_MAX]; - - version = get_kernel_version(machine->root_dir); - if (!version) - return -1; - - snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel", - machine->root_dir, version); - free(version); - - return map_groups__set_modules_path_dir(&machine->kmaps, modules_path); -} - -struct map *machine__new_module(struct machine *machine, u64 start, - const char *filename) -{ - struct map *map; - struct dso *dso = __dsos__findnew(&machine->kernel_dsos, filename); - - if (dso == NULL) - return NULL; - - map = map__new2(start, dso, MAP__FUNCTION); - if (map == NULL) - return NULL; - - if (machine__is_host(machine)) - dso->symtab_type = DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE; - else - dso->symtab_type = DSO_BINARY_TYPE__GUEST_KMODULE; - map_groups__insert(&machine->kmaps, map); - return map; -} - -static int machine__create_modules(struct machine *machine) -{ - char *line = NULL; - size_t n; - FILE *file; - struct map *map; 
- const char *modules; - char path[PATH_MAX]; - - if (machine__is_default_guest(machine)) - modules = symbol_conf.default_guest_modules; - else { - sprintf(path, "%s/proc/modules", machine->root_dir); - modules = path; - } - - if (symbol__restricted_filename(path, "/proc/modules")) - return -1; - - file = fopen(modules, "r"); - if (file == NULL) - return -1; - - while (!feof(file)) { - char name[PATH_MAX]; - u64 start; - char *sep; - int line_len; - - line_len = getline(&line, &n, file); - if (line_len < 0) - break; - - if (!line) - goto out_failure; - - line[--line_len] = '\0'; /* \n */ - - sep = strrchr(line, 'x'); - if (sep == NULL) - continue; - - hex2u64(sep + 1, &start); - - sep = strchr(line, ' '); - if (sep == NULL) - continue; - - *sep = '\0'; - - snprintf(name, sizeof(name), "[%s]", line); - map = machine__new_module(machine, start, name); - if (map == NULL) - goto out_delete_line; - dso__kernel_module_get_build_id(map->dso, machine->root_dir); - } - - free(line); - fclose(file); - - return machine__set_modules_path(machine); - -out_delete_line: - free(line); -out_failure: - return -1; -} - int dso__load_vmlinux(struct dso *dso, struct map *map, const char *vmlinux, symbol_filter_t filter) { @@ -1124,8 +923,10 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map, filename = dso__build_id_filename(dso, NULL, 0); if (filename != NULL) { err = dso__load_vmlinux(dso, map, filename, filter); - if (err > 0) + if (err > 0) { + dso->lname_alloc = 1; goto out; + } free(filename); } @@ -1133,6 +934,7 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map, err = dso__load_vmlinux(dso, map, vmlinux_path[i], filter); if (err > 0) { dso__set_long_name(dso, strdup(vmlinux_path[i])); + dso->lname_alloc = 1; break; } } @@ -1172,6 +974,7 @@ static int dso__load_kernel_sym(struct dso *dso, struct map *map, if (err > 0) { dso__set_long_name(dso, strdup(symbol_conf.vmlinux_name)); + dso->lname_alloc = 1; goto out_fixup; } return err; @@ -1300,195 +1103,6 @@ 
out_try_fixup: return err; } -size_t machines__fprintf_dsos(struct rb_root *machines, FILE *fp) -{ - struct rb_node *nd; - size_t ret = 0; - - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += __dsos__fprintf(&pos->kernel_dsos, fp); - ret += __dsos__fprintf(&pos->user_dsos, fp); - } - - return ret; -} - -size_t machine__fprintf_dsos_buildid(struct machine *machine, FILE *fp, - bool with_hits) -{ - return __dsos__fprintf_buildid(&machine->kernel_dsos, fp, with_hits) + - __dsos__fprintf_buildid(&machine->user_dsos, fp, with_hits); -} - -size_t machines__fprintf_dsos_buildid(struct rb_root *machines, - FILE *fp, bool with_hits) -{ - struct rb_node *nd; - size_t ret = 0; - - for (nd = rb_first(machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret += machine__fprintf_dsos_buildid(pos, fp, with_hits); - } - return ret; -} - -static struct dso *machine__get_kernel(struct machine *machine) -{ - const char *vmlinux_name = NULL; - struct dso *kernel; - - if (machine__is_host(machine)) { - vmlinux_name = symbol_conf.vmlinux_name; - if (!vmlinux_name) - vmlinux_name = "[kernel.kallsyms]"; - - kernel = dso__kernel_findnew(machine, vmlinux_name, - "[kernel]", - DSO_TYPE_KERNEL); - } else { - char bf[PATH_MAX]; - - if (machine__is_default_guest(machine)) - vmlinux_name = symbol_conf.default_guest_vmlinux_name; - if (!vmlinux_name) - vmlinux_name = machine__mmap_name(machine, bf, - sizeof(bf)); - - kernel = dso__kernel_findnew(machine, vmlinux_name, - "[guest.kernel]", - DSO_TYPE_GUEST_KERNEL); - } - - if (kernel != NULL && (!kernel->has_build_id)) - dso__read_running_kernel_build_id(kernel, machine); - - return kernel; -} - -struct process_args { - u64 start; -}; - -static int symbol__in_kernel(void *arg, const char *name, - char type __maybe_unused, u64 start) -{ - struct process_args *args = arg; - - if (strchr(name, '[')) - return 0; - - args->start = 
start; - return 1; -} - -/* Figure out the start address of kernel map from /proc/kallsyms */ -static u64 machine__get_kernel_start_addr(struct machine *machine) -{ - const char *filename; - char path[PATH_MAX]; - struct process_args args; - - if (machine__is_host(machine)) { - filename = "/proc/kallsyms"; - } else { - if (machine__is_default_guest(machine)) - filename = (char *)symbol_conf.default_guest_kallsyms; - else { - sprintf(path, "%s/proc/kallsyms", machine->root_dir); - filename = path; - } - } - - if (symbol__restricted_filename(filename, "/proc/kallsyms")) - return 0; - - if (kallsyms__parse(filename, &args, symbol__in_kernel) <= 0) - return 0; - - return args.start; -} - -int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) -{ - enum map_type type; - u64 start = machine__get_kernel_start_addr(machine); - - for (type = 0; type < MAP__NR_TYPES; ++type) { - struct kmap *kmap; - - machine->vmlinux_maps[type] = map__new2(start, kernel, type); - if (machine->vmlinux_maps[type] == NULL) - return -1; - - machine->vmlinux_maps[type]->map_ip = - machine->vmlinux_maps[type]->unmap_ip = - identity__map_ip; - kmap = map__kmap(machine->vmlinux_maps[type]); - kmap->kmaps = &machine->kmaps; - map_groups__insert(&machine->kmaps, - machine->vmlinux_maps[type]); - } - - return 0; -} - -void machine__destroy_kernel_maps(struct machine *machine) -{ - enum map_type type; - - for (type = 0; type < MAP__NR_TYPES; ++type) { - struct kmap *kmap; - - if (machine->vmlinux_maps[type] == NULL) - continue; - - kmap = map__kmap(machine->vmlinux_maps[type]); - map_groups__remove(&machine->kmaps, - machine->vmlinux_maps[type]); - if (kmap->ref_reloc_sym) { - /* - * ref_reloc_sym is shared among all maps, so free just - * on one of them. 
- */ - if (type == MAP__FUNCTION) { - free((char *)kmap->ref_reloc_sym->name); - kmap->ref_reloc_sym->name = NULL; - free(kmap->ref_reloc_sym); - } - kmap->ref_reloc_sym = NULL; - } - - map__delete(machine->vmlinux_maps[type]); - machine->vmlinux_maps[type] = NULL; - } -} - -int machine__create_kernel_maps(struct machine *machine) -{ - struct dso *kernel = machine__get_kernel(machine); - - if (kernel == NULL || - __machine__create_kernel_maps(machine, kernel) < 0) - return -1; - - if (symbol_conf.use_modules && machine__create_modules(machine) < 0) { - if (machine__is_host(machine)) - pr_debug("Problems creating module maps, " - "continuing anyway...\n"); - else - pr_debug("Problems creating module maps for guest %d, " - "continuing anyway...\n", machine->pid); - } - - /* - * Now that we have all the maps created, just set the ->end of them: - */ - map_groups__fixup_end(&machine->kmaps); - return 0; -} - static void vmlinux_path__exit(void) { while (--vmlinux_path__nr_entries >= 0) { @@ -1549,25 +1163,6 @@ out_fail: return -1; } -size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) -{ - int i; - size_t printed = 0; - struct dso *kdso = machine->vmlinux_maps[MAP__FUNCTION]->dso; - - if (kdso->has_build_id) { - char filename[PATH_MAX]; - if (dso__build_id_filename(kdso, filename, sizeof(filename))) - printed += fprintf(fp, "[0] %s\n", filename); - } - - for (i = 0; i < vmlinux_path__nr_entries; ++i) - printed += fprintf(fp, "[%d] %s\n", - i + kdso->has_build_id, vmlinux_path[i]); - - return printed; -} - static int setup_list(struct strlist **list, const char *list_str, const char *list_name) { @@ -1671,108 +1266,3 @@ void symbol__exit(void) symbol_conf.sym_list = symbol_conf.dso_list = symbol_conf.comm_list = NULL; symbol_conf.initialized = false; } - -int machines__create_kernel_maps(struct rb_root *machines, pid_t pid) -{ - struct machine *machine = machines__findnew(machines, pid); - - if (machine == NULL) - return -1; - - return 
machine__create_kernel_maps(machine); -} - -int machines__create_guest_kernel_maps(struct rb_root *machines) -{ - int ret = 0; - struct dirent **namelist = NULL; - int i, items = 0; - char path[PATH_MAX]; - pid_t pid; - char *endp; - - if (symbol_conf.default_guest_vmlinux_name || - symbol_conf.default_guest_modules || - symbol_conf.default_guest_kallsyms) { - machines__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID); - } - - if (symbol_conf.guestmount) { - items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL); - if (items <= 0) - return -ENOENT; - for (i = 0; i < items; i++) { - if (!isdigit(namelist[i]->d_name[0])) { - /* Filter out . and .. */ - continue; - } - pid = (pid_t)strtol(namelist[i]->d_name, &endp, 10); - if ((*endp != '\0') || - (endp == namelist[i]->d_name) || - (errno == ERANGE)) { - pr_debug("invalid directory (%s). Skipping.\n", - namelist[i]->d_name); - continue; - } - sprintf(path, "%s/%s/proc/kallsyms", - symbol_conf.guestmount, - namelist[i]->d_name); - ret = access(path, R_OK); - if (ret) { - pr_debug("Can't access file %s\n", path); - goto failure; - } - machines__create_kernel_maps(machines, pid); - } -failure: - free(namelist); - } - - return ret; -} - -void machines__destroy_guest_kernel_maps(struct rb_root *machines) -{ - struct rb_node *next = rb_first(machines); - - while (next) { - struct machine *pos = rb_entry(next, struct machine, rb_node); - - next = rb_next(&pos->rb_node); - rb_erase(&pos->rb_node, machines); - machine__delete(pos); - } -} - -int machine__load_kallsyms(struct machine *machine, const char *filename, - enum map_type type, symbol_filter_t filter) -{ - struct map *map = machine->vmlinux_maps[type]; - int ret = dso__load_kallsyms(map->dso, filename, map, filter); - - if (ret > 0) { - dso__set_loaded(map->dso, type); - /* - * Since /proc/kallsyms will have multiple sessions for the - * kernel, with modules between them, fixup the end of all - * sections. 
- */ - __map_groups__fixup_end(&machine->kmaps, type); - } - - return ret; -} - -int machine__load_vmlinux_path(struct machine *machine, enum map_type type, - symbol_filter_t filter) -{ - struct map *map = machine->vmlinux_maps[type]; - int ret = dso__load_vmlinux_path(map->dso, map, filter); - - if (ret > 0) { - dso__set_loaded(map->dso, type); - map__reloc_vmlinux(map); - } - - return ret; -} diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index de68f98b236d..d97377ac2f16 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -16,8 +16,8 @@ #ifdef LIBELF_SUPPORT #include <libelf.h> #include <gelf.h> -#include <elf.h> #endif +#include <elf.h> #include "dso.h" @@ -120,6 +120,8 @@ struct symbol_conf { }; extern struct symbol_conf symbol_conf; +extern int vmlinux_path__nr_entries; +extern char **vmlinux_path; static inline void *symbol__priv(struct symbol *sym) { @@ -223,6 +225,8 @@ size_t symbol__fprintf_symname_offs(const struct symbol *sym, size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp); size_t symbol__fprintf(struct symbol *sym, FILE *fp); bool symbol_type__is_a(char symbol_type, enum map_type map_type); +bool symbol__restricted_filename(const char *filename, + const char *restricted_filename); int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, symbol_filter_t filter, diff --git a/tools/perf/util/sysfs.c b/tools/perf/util/sysfs.c index 48c6902e749f..f71e9eafe15a 100644 --- a/tools/perf/util/sysfs.c +++ b/tools/perf/util/sysfs.c @@ -8,7 +8,7 @@ static const char * const sysfs_known_mountpoints[] = { }; static int sysfs_found; -char sysfs_mountpoint[PATH_MAX]; +char sysfs_mountpoint[PATH_MAX + 1]; static int sysfs_valid_mountpoint(const char *sysfs) { diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index df59623ac763..632e40e5ceca 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -54,10 +54,10 @@ int 
thread__comm_len(struct thread *self) return self->comm_len; } -static size_t thread__fprintf(struct thread *self, FILE *fp) +size_t thread__fprintf(struct thread *thread, FILE *fp) { - return fprintf(fp, "Thread %d %s\n", self->pid, self->comm) + - map_groups__fprintf(&self->mg, verbose, fp); + return fprintf(fp, "Thread %d %s\n", thread->pid, thread->comm) + + map_groups__fprintf(&thread->mg, verbose, fp); } void thread__insert_map(struct thread *self, struct map *map) @@ -84,17 +84,3 @@ int thread__fork(struct thread *self, struct thread *parent) return -ENOMEM; return 0; } - -size_t machine__fprintf(struct machine *machine, FILE *fp) -{ - size_t ret = 0; - struct rb_node *nd; - - for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { - struct thread *pos = rb_entry(nd, struct thread, rb_node); - - ret += thread__fprintf(pos, fp); - } - - return ret; -} diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index f2fa17caa7d5..5ad266403098 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -30,6 +30,7 @@ int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); void thread__insert_map(struct thread *self, struct map *map); int thread__fork(struct thread *self, struct thread *parent); +size_t thread__fprintf(struct thread *thread, FILE *fp); static inline struct map *thread__find_map(struct thread *self, enum map_type type, u64 addr) diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 884dde9b9bc1..54d37a4753c5 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -26,6 +26,8 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) float samples_per_sec = top->samples / top->delay_secs; float ksamples_per_sec = top->kernel_samples / top->delay_secs; float esamples_percent = (100.0 * top->exact_samples) / top->samples; + struct perf_record_opts *opts = &top->record_opts; + struct perf_target *target = &opts->target; size_t ret = 0; 
if (!perf_guest) { @@ -61,31 +63,31 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) struct perf_evsel *first = perf_evlist__first(top->evlist); ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ", (uint64_t)first->attr.sample_period, - top->freq ? "Hz" : ""); + opts->freq ? "Hz" : ""); } ret += SNPRINTF(bf + ret, size - ret, "%s", perf_evsel__name(top->sym_evsel)); ret += SNPRINTF(bf + ret, size - ret, "], "); - if (top->target.pid) + if (target->pid) ret += SNPRINTF(bf + ret, size - ret, " (target_pid: %s", - top->target.pid); - else if (top->target.tid) + target->pid); + else if (target->tid) ret += SNPRINTF(bf + ret, size - ret, " (target_tid: %s", - top->target.tid); - else if (top->target.uid_str != NULL) + target->tid); + else if (target->uid_str != NULL) ret += SNPRINTF(bf + ret, size - ret, " (uid: %s", - top->target.uid_str); + target->uid_str); else ret += SNPRINTF(bf + ret, size - ret, " (all"); - if (top->target.cpu_list) + if (target->cpu_list) ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", top->evlist->cpus->nr > 1 ? "s" : "", - top->target.cpu_list); + target->cpu_list); else { - if (top->target.tid) + if (target->tid) ret += SNPRINTF(bf + ret, size - ret, ")"); else ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 86ff1b15059b..7ebf357dc9e1 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -14,7 +14,7 @@ struct perf_session; struct perf_top { struct perf_tool tool; struct perf_evlist *evlist; - struct perf_target target; + struct perf_record_opts record_opts; /* * Symbols will be added here in perf_event__process_sample and will * get out after decayed. 
@@ -24,24 +24,16 @@ struct perf_top { u64 exact_samples; u64 guest_us_samples, guest_kernel_samples; int print_entries, count_filter, delay_secs; - int freq; bool hide_kernel_symbols, hide_user_symbols, zero; bool use_tui, use_stdio; bool sort_has_symbols; - bool dont_use_callchains; bool kptr_restrict_warned; bool vmlinux_warned; - bool inherit; - bool group; - bool sample_id_all_missing; - bool exclude_guest_missing; bool dump_symtab; struct hist_entry *sym_filter_entry; struct perf_evsel *sym_evsel; struct perf_session *session; struct winsize winsize; - unsigned int mmap_pages; - int default_interval; int realtime_prio; int sym_pcnt_filter; const char *sym_filter; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 5906e8426cc7..805d1f52c5b4 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -12,6 +12,8 @@ */ unsigned int page_size; +bool test_attr__enabled; + bool perf_host = true; bool perf_guest = false; @@ -218,3 +220,25 @@ void dump_stack(void) #else void dump_stack(void) {} #endif + +void get_term_dimensions(struct winsize *ws) +{ + char *s = getenv("LINES"); + + if (s != NULL) { + ws->ws_row = atoi(s); + s = getenv("COLUMNS"); + if (s != NULL) { + ws->ws_col = atoi(s); + if (ws->ws_row && ws->ws_col) + return; + } + } +#ifdef TIOCGWINSZ + if (ioctl(1, TIOCGWINSZ, ws) == 0 && + ws->ws_row && ws->ws_col) + return; +#endif + ws->ws_row = 25; + ws->ws_col = 80; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index c2330918110c..09b4c26b71aa 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -265,10 +265,14 @@ bool is_power_of_2(unsigned long n) size_t hex_width(u64 v); int hex2u64(const char *ptr, u64 *val); +char *ltrim(char *s); char *rtrim(char *s); void dump_stack(void); extern unsigned int page_size; +struct winsize; +void get_term_dimensions(struct winsize *ws); + #endif |