Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 2
-rw-r--r--  arch/x86/Makefile | 4
-rw-r--r--  arch/x86/boot/compressed/Makefile | 4
-rw-r--r--  arch/x86/entry/entry_32.S | 109
-rw-r--r--  arch/x86/entry/syscall_x32.c | 7
-rw-r--r--  arch/x86/entry/syscalls/syscall_32.tbl | 6
-rw-r--r--  arch/x86/entry/syscalls/syscall_64.tbl | 6
-rw-r--r--  arch/x86/entry/vdso/Makefile | 4
-rw-r--r--  arch/x86/entry/vdso/vdso32/note.S | 30
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 1
-rw-r--r--  arch/x86/include/asm/device.h | 3
-rw-r--r--  arch/x86/include/asm/elf.h | 2
-rw-r--r--  arch/x86/include/asm/fixmap.h | 2
-rw-r--r--  arch/x86/include/asm/fpu/internal.h | 1
-rw-r--r--  arch/x86/include/asm/fpu/regset.h | 4
-rw-r--r--  arch/x86/include/asm/fpu/xstate.h | 4
-rw-r--r--  arch/x86/include/asm/hardirq.h | 4
-rw-r--r--  arch/x86/include/asm/idtentry.h | 4
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 95
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 3
-rw-r--r--  arch/x86/include/asm/kvm_types.h | 7
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 1
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 12
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 42
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 2
-rw-r--r--  arch/x86/include/asm/proto.h | 2
-rw-r--r--  arch/x86/include/asm/qspinlock.h | 1
-rw-r--r--  arch/x86/include/asm/segment.h | 2
-rw-r--r--  arch/x86/include/asm/smp.h | 10
-rw-r--r--  arch/x86/include/asm/tsc.h | 1
-rw-r--r--  arch/x86/include/asm/uaccess.h | 2
-rw-r--r--  arch/x86/include/asm/vdso/gettimeofday.h | 3
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 1
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 1
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 1
-rw-r--r--  arch/x86/kernel/apic/local.h | 1
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 7
-rw-r--r--  arch/x86/kernel/devicetree.c | 1
-rw-r--r--  arch/x86/kernel/early-quirks.c | 1
-rw-r--r--  arch/x86/kernel/fpu/regset.c | 55
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 13
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 164
-rw-r--r--  arch/x86/kernel/ftrace.c | 14
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 29
-rw-r--r--  arch/x86/kernel/head_32.S | 31
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/jailhouse.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 118
-rw-r--r--  arch/x86/kernel/mpparse.c | 3
-rw-r--r--  arch/x86/kernel/process_64.c | 2
-rw-r--r--  arch/x86/kernel/ptrace.c | 75
-rw-r--r--  arch/x86/kernel/setup.c | 3
-rw-r--r--  arch/x86/kernel/tls.c | 32
-rw-r--r--  arch/x86/kernel/tls.h | 2
-rw-r--r--  arch/x86/kernel/topology.c | 1
-rw-r--r--  arch/x86/kernel/traps.c | 1
-rw-r--r--  arch/x86/kernel/tsc_msr.c | 1
-rw-r--r--  arch/x86/kvm/cpuid.c | 118
-rw-r--r--  arch/x86/kvm/cpuid.h | 2
-rw-r--r--  arch/x86/kvm/hyperv.c | 1
-rw-r--r--  arch/x86/kvm/lapic.c | 11
-rw-r--r--  arch/x86/kvm/mmu.h | 34
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 461
-rw-r--r--  arch/x86/kvm/mmu/mmu_audit.c (renamed from arch/x86/kvm/mmu_audit.c) | 12
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 63
-rw-r--r--  arch/x86/kvm/mmu/mmutrace.h (renamed from arch/x86/kvm/mmutrace.h) | 2
-rw-r--r--  arch/x86/kvm/mmu/page_track.c | 2
-rw-r--r--  arch/x86/kvm/mmu/paging_tmpl.h | 19
-rw-r--r--  arch/x86/kvm/pmu.c | 5
-rw-r--r--  arch/x86/kvm/pmu.h | 2
-rw-r--r--  arch/x86/kvm/svm/avic.c | 2
-rw-r--r--  arch/x86/kvm/svm/nested.c | 142
-rw-r--r--  arch/x86/kvm/svm/sev.c | 47
-rw-r--r--  arch/x86/kvm/svm/svm.c | 262
-rw-r--r--  arch/x86/kvm/svm/svm.h | 32
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 2
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 149
-rw-r--r--  arch/x86/kvm/vmx/ops.h | 4
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 17
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 5
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 209
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 12
-rw-r--r--  arch/x86/kvm/x86.c | 259
-rw-r--r--  arch/x86/kvm/x86.h | 34
-rw-r--r--  arch/x86/lib/Makefile | 2
-rw-r--r--  arch/x86/math-emu/fpu_entry.c | 19
-rw-r--r--  arch/x86/mm/Makefile | 7
-rw-r--r--  arch/x86/mm/fault.c | 18
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 1
-rw-r--r--  arch/x86/mm/init_32.c | 3
-rw-r--r--  arch/x86/mm/init_64.c | 57
-rw-r--r--  arch/x86/mm/kaslr.c | 1
-rw-r--r--  arch/x86/mm/numa.c | 1
-rw-r--r--  arch/x86/mm/pgtable_32.c | 1
-rw-r--r--  arch/x86/mm/pti.c | 1
-rw-r--r--  arch/x86/pci/fixup.c | 4
-rw-r--r--  arch/x86/pci/xen.c | 2
-rw-r--r--  arch/x86/platform/uv/bios_uv.c | 1
-rw-r--r--  arch/x86/power/Makefile | 3
-rw-r--r--  arch/x86/power/hibernate.c | 2
-rw-r--r--  arch/x86/purgatory/Makefile | 2
-rw-r--r--  arch/x86/um/vdso/Makefile | 2
-rw-r--r--  arch/x86/xen/Kconfig | 3
-rw-r--r--  arch/x86/xen/Makefile | 8
-rw-r--r--  arch/x86/xen/apic.c | 19
-rw-r--r--  arch/x86/xen/enlighten_hvm.c | 1
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 78
-rw-r--r--  arch/x86/xen/mmu_pv.c | 492
-rw-r--r--  arch/x86/xen/p2m.c | 6
-rw-r--r--  arch/x86/xen/setup.c | 36
-rw-r--r--  arch/x86/xen/smp_hvm.c | 1
-rw-r--r--  arch/x86/xen/smp_pv.c | 19
-rw-r--r--  arch/x86/xen/spinlock.c | 4
-rw-r--r--  arch/x86/xen/suspend_pv.c | 4
-rw-r--r--  arch/x86/xen/vdso.h | 6
-rw-r--r--  arch/x86/xen/xen-asm.S | 194
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 185
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 192
-rw-r--r--  arch/x86/xen/xen-head.S | 6
-rw-r--r--  arch/x86/xen/xen-ops.h | 1
131 files changed, 1824 insertions, 2422 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fd03cefabd34..7101ac64bb20 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -209,6 +209,7 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
@@ -803,6 +804,7 @@ config KVM_GUEST
depends on PARAVIRT
select PARAVIRT_CLOCK
select ARCH_CPUIDLE_HALTPOLL
+ select X86_HV_CALLBACK_VECTOR
default y
help
This option enables various optimizations for running under the KVM
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1e634d7ee6eb..4346ffb2e39f 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -36,8 +36,8 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
+REALMODE_CFLAGS += -ffreestanding
+REALMODE_CFLAGS += -fno-stack-protector
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
export REALMODE_CFLAGS
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index c08714ae76ec..3962f592633d 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -35,8 +35,8 @@ cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse
-KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
-KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += -ffreestanding
+KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
KBUILD_CFLAGS += -Wno-pointer-sign
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 29b7d52143e9..df8c017e6161 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -449,8 +449,6 @@
.macro SWITCH_TO_KERNEL_STACK
- ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
-
BUG_IF_WRONG_CR3
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
@@ -599,8 +597,6 @@
*/
.macro SWITCH_TO_ENTRY_STACK
- ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
-
/* Bytes to copy */
movl $PTREGS_SIZE, %ecx
@@ -872,17 +868,6 @@ SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN_PV
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-SYM_CODE_START(xen_sysenter_target)
- addl $5*4, %esp /* remove xen-provided frame */
- jmp .Lsysenter_past_esp
-SYM_CODE_END(xen_sysenter_target)
-#endif
-
/*
* 32-bit SYSENTER entry.
*
@@ -965,9 +950,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl %esp, %eax
call do_SYSENTER_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ testl %eax, %eax
+ jz .Lsyscall_32_done
STACKLEAK_ERASE
@@ -1165,95 +1149,6 @@ SYM_FUNC_END(entry_INT80_32)
#endif
.endm
-#ifdef CONFIG_PARAVIRT
-SYM_CODE_START(native_iret)
- iret
- _ASM_EXTABLE(native_iret, asm_iret_error)
-SYM_CODE_END(native_iret)
-#endif
-
-#ifdef CONFIG_XEN_PV
-/*
- * See comment in entry_64.S for further explanation
- *
- * Note: This is not an actual IDT entry point. It's a XEN specific entry
- * point and therefore named to match the 64-bit trampoline counterpart.
- */
-SYM_FUNC_START(xen_asm_exc_xen_hypervisor_callback)
- /*
- * Check to see if we got the event in the critical
- * region in xen_iret_direct, after we've reenabled
- * events and checked for pending events. This simulates
- * iret instruction's behaviour where it delivers a
- * pending interrupt when enabling interrupts:
- */
- cmpl $xen_iret_start_crit, (%esp)
- jb 1f
- cmpl $xen_iret_end_crit, (%esp)
- jae 1f
- call xen_iret_crit_fixup
-1:
- pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
-
- mov %esp, %eax
- call xen_pv_evtchn_do_upcall
- jmp handle_exception_return
-SYM_FUNC_END(xen_asm_exc_xen_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we fix up by reattempting the load, and zeroing the segment
- * register if the load fails.
- * Category 2 we fix up by jumping to do_iret_error. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by maintaining a status value in EAX.
- */
-SYM_FUNC_START(xen_failsafe_callback)
- pushl %eax
- movl $1, %eax
-1: mov 4(%esp), %ds
-2: mov 8(%esp), %es
-3: mov 12(%esp), %fs
-4: mov 16(%esp), %gs
- /* EAX == 0 => Category 1 (Bad segment)
- EAX != 0 => Category 2 (Bad IRET) */
- testl %eax, %eax
- popl %eax
- lea 16(%esp), %esp
- jz 5f
- jmp asm_iret_error
-5: pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- jmp handle_exception_return
-
-.section .fixup, "ax"
-6: xorl %eax, %eax
- movl %eax, 4(%esp)
- jmp 1b
-7: xorl %eax, %eax
- movl %eax, 8(%esp)
- jmp 2b
-8: xorl %eax, %eax
- movl %eax, 12(%esp)
- jmp 3b
-9: xorl %eax, %eax
- movl %eax, 16(%esp)
- jmp 4b
-.previous
- _ASM_EXTABLE(1b, 6b)
- _ASM_EXTABLE(2b, 7b)
- _ASM_EXTABLE(3b, 8b)
- _ASM_EXTABLE(4b, 9b)
-SYM_FUNC_END(xen_failsafe_callback)
-#endif /* CONFIG_XEN_PV */
-
SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
/* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 3d8d70d3896c..1583831f61a9 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,6 +8,13 @@
#include <asm/unistd.h>
#include <asm/syscall.h>
+/*
+ * Reuse the 64-bit entry points for the x32 versions that occupy different
+ * slots in the syscall table.
+ */
+#define __x32_sys_getsockopt __x64_sys_getsockopt
+#define __x32_sys_setsockopt __x64_sys_setsockopt
+
#define __SYSCALL_64(nr, sym)
#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
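
For context on the two aliases added above: with them in place, the generated x32 declarations and table entries resolve straight to the native 64-bit entry points. A rough illustration of the preprocessor expansion (the slot number 542 comes from the syscall_64.tbl hunk further down; the expanded form is a sketch, not literal build output):

    /*
     *   __SYSCALL_X32(542, sys_getsockopt)
     * expands to
     *   extern long __x32_sys_getsockopt(const struct pt_regs *);
     * which, after "#define __x32_sys_getsockopt __x64_sys_getsockopt",
     * is simply a declaration of the existing 64-bit handler:
     */
    extern long __x64_sys_getsockopt(const struct pt_regs *);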
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3f0c6546b47c..9d1102873666 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -160,7 +160,7 @@
146 i386 writev sys_writev compat_sys_writev
147 i386 getsid sys_getsid
148 i386 fdatasync sys_fdatasync
-149 i386 _sysctl sys_sysctl compat_sys_sysctl
+149 i386 _sysctl sys_ni_syscall
150 i386 mlock sys_mlock
151 i386 munlock sys_munlock
152 i386 mlockall sys_mlockall
@@ -376,8 +376,8 @@
362 i386 connect sys_connect
363 i386 listen sys_listen
364 i386 accept4 sys_accept4
-365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
-366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
+365 i386 getsockopt sys_getsockopt sys_getsockopt
+366 i386 setsockopt sys_setsockopt sys_setsockopt
367 i386 getsockname sys_getsockname
368 i386 getpeername sys_getpeername
369 i386 sendto sys_sendto
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f8637c2c863d..f30d6ae9a688 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -164,7 +164,7 @@
153 common vhangup sys_vhangup
154 common modify_ldt sys_modify_ldt
155 common pivot_root sys_pivot_root
-156 64 _sysctl sys_sysctl
+156 64 _sysctl sys_ni_syscall
157 common prctl sys_prctl
158 common arch_prctl sys_arch_prctl
159 common adjtimex sys_adjtimex
@@ -397,8 +397,8 @@
538 x32 sendmmsg compat_sys_sendmmsg
539 x32 process_vm_readv compat_sys_process_vm_readv
540 x32 process_vm_writev compat_sys_process_vm_writev
-541 x32 setsockopt compat_sys_setsockopt
-542 x32 getsockopt compat_sys_getsockopt
+541 x32 setsockopt sys_setsockopt
+542 x32 getsockopt sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat compat_sys_execveat
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 04e65f0698f6..215376d975a2 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -82,7 +82,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
# optimize sibling calls.
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
@@ -151,7 +151,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
-KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
+KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
index e78047d119f6..2cbd39939dc6 100644
--- a/arch/x86/entry/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -16,33 +16,3 @@ ELFNOTE_START(Linux, 0, "a")
ELFNOTE_END
BUILD_SALT
-
-#ifdef CONFIG_XEN
-/*
- * Add a special note telling glibc's dynamic linker a fake hardware
- * flavor that it will use to choose the search path for libraries in the
- * same way it uses real hardware capabilities like "mmx".
- * We supply "nosegneg" as the fake capability, to indicate that we
- * do not like negative offsets in instructions using segment overrides,
- * since we implement those inefficiently. This makes it possible to
- * install libraries optimized to avoid those access patterns in someplace
- * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
- * corresponding to the bits here is needed to make ldconfig work right.
- * It should contain:
- * hwcap 1 nosegneg
- * to match the mapping of bit to name that we give here.
- *
- * At runtime, the fake hardware feature will be considered to be present
- * if its bit is set in the mask word. So, we start with the mask 0, and
- * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
- */
-
-#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
-
-ELFNOTE_START(GNU, 2, "a")
- .long 1 /* ncaps */
-VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
- .long 0 /* mask */
- .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
-ELFNOTE_END
-#endif
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 385d3d172ee1..ca8a657edf59 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -30,7 +30,6 @@
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/user32.h>
#include <asm/ia32.h>
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 49bd6cf3eec9..7c0a52ca2f4d 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -3,9 +3,6 @@
#define _ASM_X86_DEVICE_H
struct dev_archdata {
-#ifdef CONFIG_IOMMU_API
- void *iommu; /* hook for IOMMU specific extension */
-#endif
};
struct pdev_archdata {
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 452beed7892b..b9a5d488f1a5 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -21,8 +21,6 @@ typedef struct user_i387_struct elf_fpregset_t;
#ifdef __i386__
-typedef struct user_fxsr_struct elf_fpxregset_t;
-
#define R_386_NONE 0
#define R_386_32 1
#define R_386_PC32 2
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b9527a54db99..0f0dd645b594 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -26,9 +26,9 @@
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
-#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 6b10cdaa7c96..0a460f2a3f90 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -34,7 +34,6 @@ extern int fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
extern void fpu__clear_all(struct fpu *fpu);
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
-extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
/*
* Boot time FPU initialization functions:
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
index d5bdffb9d27f..4f928d6a367b 100644
--- a/arch/x86/include/asm/fpu/regset.h
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -8,8 +8,8 @@
#include <linux/regset.h>
extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
- xstateregs_get;
+extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+ xstateregs_get;
extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
xstateregs_set;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 1559554af931..14ab815132d4 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -104,8 +104,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
const void *get_xsave_field_ptr(int xfeature_nr);
int using_compacted_format(void);
int xfeature_size(int xfeature_nr);
-int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
-int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+struct membuf;
+void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
void copy_supervisor_to_kernel(struct xregs_state *xsave);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 07533795b8d2..275e7fd20310 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flush_l1d(void)
__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
}
-static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
{
__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
}
-static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
{
return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
}
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index ff198fc2495e..a43366191212 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -632,6 +632,10 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
#endif
+#ifdef CONFIG_KVM_GUEST
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+#endif
+
#undef X86_TRAP_OTHER
#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index be5363b21540..5ab3af7275d8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -193,8 +193,6 @@ struct x86_exception;
enum x86_intercept;
enum x86_intercept_stage;
-#define KVM_NR_MEM_OBJS 40
-
#define KVM_NR_DB_REGS 4
#define DR6_BD (1 << 13)
@@ -246,15 +244,6 @@ enum x86_intercept_stage;
struct kvm_kernel_irq_routing_entry;
/*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
-struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
-};
-
-/*
* the pages used as guest page table on soft mmu are tracked by
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
* by indirect shadow page can not be more than 15 bits.
@@ -322,43 +311,6 @@ struct kvm_rmap_head {
unsigned long val;
};
-struct kvm_mmu_page {
- struct list_head link;
- struct hlist_node hash_link;
- struct list_head lpage_disallowed_link;
-
- bool unsync;
- u8 mmu_valid_gen;
- bool mmio_cached;
- bool lpage_disallowed; /* Can't be replaced by an equiv large page */
-
- /*
- * The following two entries are used to key the shadow page in the
- * hash table.
- */
- union kvm_mmu_page_role role;
- gfn_t gfn;
-
- u64 *spt;
- /* hold the gfn of each spte inside spt */
- gfn_t *gfns;
- int root_count; /* Currently serving as active root */
- unsigned int unsync_children;
- struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
- DECLARE_BITMAP(unsync_child_bitmap, 512);
-
-#ifdef CONFIG_X86_32
- /*
- * Used out of the mmu-lock to avoid reading spte values while an
- * update is in progress; see the comments in __get_spte_lockless().
- */
- int clear_spte_count;
-#endif
-
- /* Number of writes since the last time traversal visited this page. */
- atomic_t write_flooding_count;
-};
-
struct kvm_pio_request {
unsigned long linear_rip;
unsigned long count;
@@ -384,6 +336,8 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+struct kvm_mmu_page;
+
/*
* x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
* and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
@@ -580,6 +534,7 @@ struct kvm_vcpu_arch {
unsigned long cr3;
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
+ unsigned long cr4_guest_rsvd_bits;
unsigned long cr8;
u32 host_pkru;
u32 pkru;
@@ -635,7 +590,8 @@ struct kvm_vcpu_arch {
struct kvm_mmu *walk_mmu;
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
- struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_shadow_page_cache;
+ struct kvm_mmu_memory_cache mmu_gfn_array_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
/*
@@ -683,7 +639,7 @@ struct kvm_vcpu_arch {
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
int maxphyaddr;
- int tdp_level;
+ int max_tdp_level;
/* emulate context */
@@ -827,6 +783,9 @@ struct kvm_vcpu_arch {
/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
bool l1tf_flush_l1d;
+ /* Host CPU on which VM-entry was most recently attempted */
+ unsigned int last_vmentry_cpu;
+
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
};
@@ -1083,7 +1042,7 @@ struct kvm_x86_ops {
void (*hardware_unsetup)(void);
bool (*cpu_has_accelerated_tpr)(void);
bool (*has_emulated_msr)(u32 index);
- void (*cpuid_update)(struct kvm_vcpu *vcpu);
+ void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
@@ -1098,7 +1057,7 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
+ void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -1174,10 +1133,10 @@ struct kvm_x86_ops {
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
- int (*get_tdp_level)(struct kvm_vcpu *vcpu);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
- void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
+ int pgd_level);
bool (*has_wbinvd_exit)(void);
@@ -1220,7 +1179,6 @@ struct kvm_x86_ops {
void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t offset, unsigned long mask);
- int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
@@ -1281,6 +1239,7 @@ struct kvm_x86_nested_ops {
struct kvm_nested_state __user *user_kvm_nested_state,
struct kvm_nested_state *kvm_state);
bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
@@ -1304,7 +1263,7 @@ struct kvm_arch_async_pf {
};
extern u64 __read_mostly host_efer;
-
+extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops;
#define __KVM_HAVE_ARCH_VM_ALLOC
@@ -1549,20 +1508,8 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
bool skip_mmu_sync);
-void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
-
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
- return gpa;
-}
-
-static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
+void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+ int tdp_huge_page_level);
static inline u16 kvm_read_ldt(void)
{
@@ -1636,7 +1583,15 @@ asmlinkage void kvm_spurious_fault(void);
insn "\n\t" \
"jmp 668f \n\t" \
"667: \n\t" \
+ "1: \n\t" \
+ ".pushsection .discard.instr_begin \n\t" \
+ ".long 1b - . \n\t" \
+ ".popsection \n\t" \
"call kvm_spurious_fault \n\t" \
+ "1: \n\t" \
+ ".pushsection .discard.instr_end \n\t" \
+ ".long 1b - . \n\t" \
+ ".popsection \n\t" \
"668: \n\t" \
_ASM_EXTABLE(666b, 667b)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 49d3a9edb06f..338119852512 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -4,6 +4,7 @@
#include <asm/processor.h>
#include <asm/alternative.h>
+#include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
@@ -18,7 +19,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
#endif /* CONFIG_KVM_GUEST */
#define KVM_HYPERCALL \
- ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
+ ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
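
For reference, the KVM_HYPERCALL macro changed above is consumed by the kvm_hypercallN() wrappers later in this header. A minimal sketch of a one-argument wrapper, assuming the usual x86 hypercall ABI (hypercall number in RAX, first argument in RBX); the _sketch name is illustrative, not from this patch:

    static inline long kvm_hypercall1_sketch(unsigned int nr, unsigned long p1)
    {
            long ret;

            /* ALTERNATIVE patches in vmcall (Intel) or vmmcall (AMD). */
            asm volatile(KVM_HYPERCALL
                         : "=a"(ret)
                         : "a"(nr), "b"(p1)
                         : "memory");
            return ret;
    }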
diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h
new file mode 100644
index 000000000000..08f1b57d3b62
--- /dev/null
+++ b/arch/x86/include/asm/kvm_types.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_TYPES_H
+#define _ASM_X86_KVM_TYPES_H
+
+#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
+
+#endif /* _ASM_X86_KVM_TYPES_H */
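
The new constant sizes the common kvm_mmu_memory_cache that replaces the arch-local struct removed from kvm_host.h earlier in this diff. A rough sketch of the generic structure it feeds, mirroring the deleted x86 version; the gfp_zero and kmem_cache fields are an assumption about the common-code layout:

    struct kvm_mmu_memory_cache {
            int nobjs;                     /* objects currently prefilled */
            gfp_t gfp_zero;                /* __GFP_ZERO when objects must be zeroed */
            struct kmem_cache *kmem_cache; /* NULL means plain page allocations */
            void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
    };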
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 47562147e70b..d98016b83755 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -9,7 +9,6 @@
#include <trace/events/tlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 60b944dd2df1..4f77b8f22e54 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -8,6 +8,7 @@
#include <asm/io.h>
#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -54,6 +55,17 @@ typedef int (*hyperv_fill_flush_list_func)(
vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
#define hv_get_raw_timer() rdtsc_ordered()
+/*
+ * Reference to pv_ops must be inline so objtool
+ * detection of noinstr violations can work correctly.
+ */
+static __always_inline void hv_setup_sched_clock(void *sched_clock)
+{
+#ifdef CONFIG_PARAVIRT
+ pv_ops.time.sched_clock = sched_clock;
+#endif
+}
+
void hyperv_vector_handler(struct pt_regs *regs);
static inline void hv_enable_stimer0_percpu_irq(int irq) {}
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 29aa7859bdee..62ad61d6fefc 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -7,7 +7,8 @@
#include <linux/pagemap.h>
#define __HAVE_ARCH_PTE_ALLOC_ONE
-#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
+#define __HAVE_ARCH_PGD_FREE
+#include <asm-generic/pgalloc.h>
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
@@ -86,30 +87,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#define pmd_pgtable(pmd) pmd_page(pmd)
#if CONFIG_PGTABLE_LEVELS > 2
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- struct page *page;
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
-
- if (mm == &init_mm)
- gfp &= ~__GFP_ACCOUNT;
- page = alloc_pages(gfp, 0);
- if (!page)
- return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_pages(page, 0);
- return NULL;
- }
- return (pmd_t *)page_address(page);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- pgtable_pmd_page_dtor(virt_to_page(pmd));
- free_page((unsigned long)pmd);
-}
-
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
@@ -147,21 +124,6 @@ static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pu
set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT;
-
- if (mm == &init_mm)
- gfp &= ~__GFP_ACCOUNT;
- return (pud_t *)get_zeroed_page(gfp);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
-}
-
extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
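
The pmd/pud allocators deleted above are not lost: the header now includes asm-generic/pgalloc.h in full (keeping only its own pgd_free via __HAVE_ARCH_PGD_FREE), so the generic helpers take over. As a hedged sketch, the generic pmd_alloc_one() looks roughly like the removed x86 copy; the GFP_PGTABLE_* names are an assumption about the generic header:

    static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
    {
            struct page *page;
            gfp_t gfp = GFP_PGTABLE_USER;   /* ~ GFP_KERNEL_ACCOUNT | __GFP_ZERO */

            if (mm == &init_mm)
                    gfp = GFP_PGTABLE_KERNEL;
            page = alloc_pages(gfp, 0);
            if (!page)
                    return NULL;
            if (!pgtable_pmd_page_ctor(page)) {
                    __free_pages(page, 0);
                    return NULL;
            }
            return (pmd_t *)page_address(page);
    }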
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f2240d..8f63efb2a2cc 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d;
#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 6e81788a30c1..28996fe19301 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -25,7 +25,7 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_INT80_compat(void);
-#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);
#endif
#endif
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 444d6fd9a6d8..d86ab942219c 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_init_lock_hash(void);
extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
#define queued_spin_unlock queued_spin_unlock
/**
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 6669164abadc..9646c300f128 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -301,7 +301,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
extern void early_ignore_irq(void);
-#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+#ifdef CONFIG_XEN_PV
extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE];
#endif
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index e15f364efbcc..c0538f82c9a2 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -5,16 +5,6 @@
#include <linux/cpumask.h>
#include <asm/percpu.h>
-/*
- * We need the APIC definitions automatically as part of 'smp.h'
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-# include <asm/mpspec.h>
-# include <asm/apic.h>
-# ifdef CONFIG_X86_IO_APIC
-# include <asm/io_apic.h>
-# endif
-#endif
#include <asm/thread_info.h>
#include <asm/cpumask.h>
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index b7b2624fba86..01a300a9700b 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -6,6 +6,7 @@
#define _ASM_X86_TSC_H
#include <asm/processor.h>
+#include <asm/cpufeature.h>
/*
* Standard way to access the cycle counter.
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2f3e8f2a958f..ecefaffd15d4 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -33,7 +33,7 @@ static inline void set_fs(mm_segment_t fs)
set_thread_flag(TIF_FSCHECK);
}
-#define segment_eq(a, b) ((a).seg == (b).seg)
+#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
#define user_addr_max() (current->thread.addr_limit.seg)
/*
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index fb81fea99093..df01d7349d79 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -241,7 +241,8 @@ static u64 vread_hvclock(void)
}
#endif
-static inline u64 __arch_get_hw_counter(s32 clock_mode)
+static inline u64 __arch_get_hw_counter(s32 clock_mode,
+ const struct vdso_data *vd)
{
if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
return (u64)rdtsc_ordered();
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ba4c1b15908b..454b20815f35 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -82,7 +82,7 @@ struct xen_dm_op_buf;
* - clobber the rest
*
* The result certainly isn't pretty, and it really shows up cpp's
- * weakness as as macro language. Sorry. (But let's just give thanks
+ * weakness as a macro language. Sorry. (But let's just give thanks
* there aren't more than 5 arguments...)
*/
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c826cddae157..d1175533d125 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
+#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e0e2f020ec02..5f943b938167 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -40,13 +40,13 @@
#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
#include <linux/atomic.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
#include <asm/proto.h>
#include <asm/traps.h>
#include <asm/apic.h>
+#include <asm/acpi.h>
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 98c9bb75d185..780c702969b7 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -10,6 +10,7 @@
* like self-ipi, etc...
*/
#include <linux/cpumask.h>
+#include <linux/thread_info.h>
#include <asm/apic.h>
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 38b5b51d42f6..98d015a4405a 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -9,6 +9,7 @@
#include <linux/smp.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include "local.h"
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index d1fc62a67320..34a992e275ef 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -9,6 +9,7 @@
* Bits copied from original nmi.c file
*
*/
+#include <linux/thread_info.h>
#include <asm/apic.h>
#include <asm/nmi.h>
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 6ca0f91372fd..387154e39e08 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -2,6 +2,7 @@
#include <linux/cpumask.h>
#include <linux/smp.h>
+#include <asm/io_apic.h>
#include "local.h"
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index 04797f05ce94..a997d849509a 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -10,6 +10,7 @@
#include <linux/jump_label.h>
+#include <asm/irq_vectors.h>
#include <asm/apic.h>
/* APIC flat 64 */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 67b33d67002f..7bda71def557 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -10,6 +10,7 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/acpi.h>
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 29f0e0984557..bd3835d6b535 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -8,6 +8,7 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
+#include <linux/thread_info.h>
#include <asm/apic.h>
#include "local.h"
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index dba6a83bc349..93792b457b81 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,8 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n
KCSAN_SANITIZE_common.o := n
# Make sure load_percpu_segment has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_common.o := $(nostackp)
+CFLAGS_common.o := -fno-stack-protector
obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index d4806eac9325..dcc3d943c68f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -15,6 +15,7 @@
#include <asm/cpu.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/pci-direct.h>
#include <asm/delay.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 965474d78cef..c5d6f17d9b9d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -45,6 +45,7 @@
#include <asm/mtrr.h>
#include <asm/hwcap2.h>
#include <linux/numa.h>
+#include <asm/numa.h>
#include <asm/asm.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 4e28c1fc8749..ac6c30e5801d 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -10,6 +10,7 @@
#include <asm/cpu.h>
#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b6b7b38dff5f..59a1e3ce3f14 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -23,6 +23,7 @@
#include <asm/cmdline.h>
#include <asm/traps.h>
#include <asm/resctrl.h>
+#include <asm/numa.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index af94f05a5c66..31125448b174 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -361,13 +361,6 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
-void hv_setup_sched_clock(void *sched_clock)
-{
-#ifdef CONFIG_PARAVIRT
- pv_ops.time.sched_clock = sched_clock;
-#endif
-}
-
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 8d85e00bb40a..a0e8fc7d85f1 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -20,6 +20,7 @@
#include <asm/irqdomain.h>
#include <asm/hpet.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/pci_x86.h>
#include <asm/setup.h>
#include <asm/i8259.h>
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 2f9ec14be3b1..a4b5af03dcc1 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -550,6 +550,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_ICL_11_IDS(&gen11_early_ops),
INTEL_EHL_IDS(&gen11_early_ops),
INTEL_TGL_12_IDS(&gen11_early_ops),
+ INTEL_RKL_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index bd1d0649f8ce..c413756ba89f 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -27,8 +27,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r
}
int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct fpu *fpu = &target->thread.fpu;
@@ -38,8 +37,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
fpu__prepare_read(fpu);
fpstate_sanitize_xstate(fpu);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fxsave, 0, -1);
+ return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state));
}
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -74,12 +72,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
}
int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct fpu *fpu = &target->thread.fpu;
struct xregs_state *xsave;
- int ret;
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -ENODEV;
@@ -89,10 +85,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
fpu__prepare_read(fpu);
if (using_compacted_format()) {
- if (kbuf)
- ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
- else
- ret = copy_xstate_to_user(ubuf, xsave, pos, count);
+ copy_xstate_to_kernel(to, xsave);
+ return 0;
} else {
fpstate_sanitize_xstate(fpu);
/*
@@ -105,9 +99,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
/*
* Copy the xstate memory layout.
*/
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ return membuf_write(&to, xsave, fpu_user_xstate_size);
}
- return ret;
}
int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -293,8 +286,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave,
}
int fpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct fpu *fpu = &target->thread.fpu;
struct user_i387_ia32_struct env;
@@ -302,23 +294,22 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
fpu__prepare_read(fpu);
if (!boot_cpu_has(X86_FEATURE_FPU))
- return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+ return fpregs_soft_get(target, regset, to);
- if (!boot_cpu_has(X86_FEATURE_FXSR))
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fsave, 0,
- -1);
+ if (!boot_cpu_has(X86_FEATURE_FXSR)) {
+ return membuf_write(&to, &fpu->state.fsave,
+ sizeof(struct fregs_state));
+ }
fpstate_sanitize_xstate(fpu);
- if (kbuf && pos == 0 && count == sizeof(env)) {
- convert_from_fxsr(kbuf, target);
+ if (to.left == sizeof(env)) {
+ convert_from_fxsr(to.p, target);
return 0;
}
convert_from_fxsr(&env, target);
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ return membuf_write(&to, &env, sizeof(env));
}
int fpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -356,20 +347,4 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
return ret;
}
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
-{
- struct task_struct *tsk = current;
-
- return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct),
- ufpu, NULL);
-}
-EXPORT_SYMBOL(dump_fpu);
-
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
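
The conversions above replace the pos/count/kbuf/ubuf regset interface with struct membuf. As a reference for how membuf_write() behaves in the new code, here is a minimal sketch of its semantics, assuming the { void *p; size_t left; } layout visible in the fpu/signal.c hunk below; the return convention shown is an assumption:

    static inline int membuf_write_sketch(struct membuf *s, const void *v, size_t size)
    {
            size = min(size, s->left);      /* clamp to the space that is left */
            memcpy(s->p, v, size);
            s->p += size;                   /* advance the cursor */
            s->left -= size;
            return s->left;                 /* space remaining after the write */
    }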
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 9393a445d73c..a4ec65317a7f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -170,14 +170,15 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ struct user_i387_ia32_struct fp;
+ fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
+ .left = sizeof(fp)});
+ return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0;
+ }
+
if (!access_ok(buf, size))
return -EACCES;
-
- if (!static_cpu_has(X86_FEATURE_FPU))
- return fpregs_soft_get(current, NULL, 0,
- sizeof(struct user_i387_ia32_struct), NULL,
- (struct _fpstate_32 __user *) buf) ? -1 : 1;
-
retry:
/*
* Load the FPU registers if they are not valid for the current task.
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index be2a68a09d19..7a2bf884fede 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1014,32 +1014,20 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
return true;
}
-static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count)
+static void fill_gap(struct membuf *to, unsigned *last, unsigned offset)
{
- if (*pos < to) {
- unsigned size = to - *pos;
-
- if (size > *count)
- size = *count;
- memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size);
- *kbuf += size;
- *pos += size;
- *count -= size;
- }
+ if (*last >= offset)
+ return;
+ membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last);
+ *last = offset;
}
-static void copy_part(unsigned offset, unsigned size, void *from,
- void **kbuf, unsigned *pos, unsigned *count)
+static void copy_part(struct membuf *to, unsigned *last, unsigned offset,
+ unsigned size, void *from)
{
- fill_gap(offset, kbuf, pos, count);
- if (size > *count)
- size = *count;
- if (size) {
- memcpy(*kbuf, from, size);
- *kbuf += size;
- *pos += size;
- *count -= size;
- }
+ fill_gap(to, last, offset);
+ membuf_write(to, from, size);
+ *last = offset + size;
}
/*
@@ -1049,20 +1037,15 @@ static void copy_part(unsigned offset, unsigned size, void *from,
* It supports partial copy but pos always starts from zero. This is called
* from xstateregs_get() and there we check the CPU has XSAVES.
*/
-int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave)
{
struct xstate_header header;
const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
- unsigned count = size_total;
+ unsigned size = to.left;
+ unsigned last = 0;
int i;
/*
- * Currently copy_regset_to_user() starts from pos 0:
- */
- if (unlikely(offset_start != 0))
- return -EFAULT;
-
- /*
* The destination is a ptrace buffer; we put in only user xstates:
*/
memset(&header, 0, sizeof(header));
@@ -1070,27 +1053,26 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
header.xfeatures &= xfeatures_mask_user();
if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(0, off_mxcsr,
- &xsave->i387, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, 0, off_mxcsr, &xsave->i387);
if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
- copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE,
- &xsave->i387.mxcsr, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, off_mxcsr,
+ MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr);
if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(offsetof(struct fxregs_state, st_space), 128,
- &xsave->i387.st_space, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, offsetof(struct fxregs_state, st_space),
+ 128, &xsave->i387.st_space);
if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(xstate_offsets[XFEATURE_SSE], 256,
- &xsave->i387.xmm_space, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, xstate_offsets[XFEATURE_SSE],
+ 256, &xsave->i387.xmm_space);
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
*/
- copy_part(offsetof(struct fxregs_state, sw_reserved), 48,
- xstate_fx_sw_bytes, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved),
+ 48, xstate_fx_sw_bytes);
/*
* Copy xregs_state->header:
*/
- copy_part(offsetof(struct xregs_state, header), sizeof(header),
- &header, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, offsetof(struct xregs_state, header),
+ sizeof(header), &header);
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
/*
@@ -1099,104 +1081,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
if ((header.xfeatures >> i) & 1) {
void *src = __raw_xsave_addr(xsave, i);
- copy_part(xstate_offsets[i], xstate_sizes[i],
- src, &kbuf, &offset_start, &count);
+ copy_part(&to, &last, xstate_offsets[i],
+ xstate_sizes[i], src);
}
}
- fill_gap(size_total, &kbuf, &offset_start, &count);
-
- return 0;
-}
-
-static inline int
-__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
-{
- if (!size)
- return 0;
-
- if (offset < size_total) {
- unsigned int copy = min(size, size_total - offset);
-
- if (__copy_to_user(ubuf + offset, data, copy))
- return -EFAULT;
- }
- return 0;
-}
-
-/*
- * Convert from kernel XSAVES compacted format to standard format and copy
- * to a user-space buffer. It supports partial copy but pos always starts from
- * zero. This is called from xstateregs_get() and there we check the CPU
- * has XSAVES.
- */
-int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
-{
- unsigned int offset, size;
- int ret, i;
- struct xstate_header header;
-
- /*
- * Currently copy_regset_to_user() starts from pos 0:
- */
- if (unlikely(offset_start != 0))
- return -EFAULT;
-
- /*
- * The destination is a ptrace buffer; we put in only user xstates:
- */
- memset(&header, 0, sizeof(header));
- header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= xfeatures_mask_user();
-
- /*
- * Copy xregs_state->header:
- */
- offset = offsetof(struct xregs_state, header);
- size = sizeof(header);
-
- ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
- if (ret)
- return ret;
-
- for (i = 0; i < XFEATURE_MAX; i++) {
- /*
- * Copy only in-use xstates:
- */
- if ((header.xfeatures >> i) & 1) {
- void *src = __raw_xsave_addr(xsave, i);
-
- offset = xstate_offsets[i];
- size = xstate_sizes[i];
-
- /* The next component has to fit fully into the output buffer: */
- if (offset + size > size_total)
- break;
-
- ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
- if (ret)
- return ret;
- }
-
- }
-
- if (xfeatures_mxcsr_quirk(header.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
- }
-
- /*
- * Fill xsave->i387.sw_reserved value for ptrace frame:
- */
- offset = offsetof(struct fxregs_state, sw_reserved);
- size = sizeof(xstate_fx_sw_bytes);
-
- ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
- if (ret)
- return ret;
-
- return 0;
+ fill_gap(&to, &last, size);
}
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..7edbd5ee5ed4 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -286,6 +286,7 @@ extern void ftrace_regs_caller_ret(void);
extern void ftrace_caller_end(void);
extern void ftrace_caller_op_ptr(void);
extern void ftrace_regs_caller_op_ptr(void);
+extern void ftrace_regs_caller_jmp(void);
/* movq function_trace_op(%rip), %rdx */
/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
@@ -316,6 +317,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset;
unsigned long op_offset;
unsigned long call_offset;
+ unsigned long jmp_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -333,11 +335,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
call_offset = (unsigned long)ftrace_regs_call;
+ jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_caller_end;
op_offset = (unsigned long)ftrace_caller_op_ptr;
call_offset = (unsigned long)ftrace_call;
+ jmp_offset = 0;
}
size = end_offset - start_offset;
@@ -367,10 +371,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
if (WARN_ON(ret < 0))
goto fail;
+ /* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
- ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
- ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
- if (WARN_ON(ret < 0))
+ /* NOP the jnz 1f; but make sure it's a 2 byte jnz */
+ ip = trampoline + (jmp_offset - start_offset);
+ if (WARN_ON(*(char *)ip != 0x75))
+ goto fail;
+ ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2);
+ if (ret < 0)
goto fail;
}
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 083a3da7bb73..ac3d5f22fe64 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -241,22 +241,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
*/
movq ORIG_RAX(%rsp), %rax
testq %rax, %rax
- jz 1f
+SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+ jnz 1f
- /* Swap the flags with orig_rax */
- movq MCOUNT_REG_SIZE(%rsp), %rdi
- movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
- movq %rax, MCOUNT_REG_SIZE(%rsp)
-
- restore_mcount_regs 8
- /* Restore flags */
- popfq
-
-SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
- UNWIND_HINT_RET_OFFSET
- jmp ftrace_epilogue
-
-1: restore_mcount_regs
+ restore_mcount_regs
/* Restore flags */
popfq
@@ -269,6 +257,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
+ /* Swap the flags with orig_rax */
+1: movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs 8
+ /* Restore flags */
+ popfq
+ UNWIND_HINT_RET_OFFSET
+ jmp ftrace_epilogue
+
SYM_FUNC_END(ftrace_regs_caller)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f66a6b90f954..7ed84c282233 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -134,38 +134,7 @@ SYM_CODE_START(startup_32)
movl %eax,pa(initial_page_table+0xffc)
#endif
-#ifdef CONFIG_PARAVIRT
- /* This is can only trip for a broken bootloader... */
- cmpw $0x207, pa(boot_params + BP_version)
- jb .Ldefault_entry
-
- /* Paravirt-compatible boot parameters. Look to see what architecture
- we're booting under. */
- movl pa(boot_params + BP_hardware_subarch), %eax
- cmpl $num_subarch_entries, %eax
- jae .Lbad_subarch
-
- movl pa(subarch_entries)(,%eax,4), %eax
- subl $__PAGE_OFFSET, %eax
- jmp *%eax
-
-.Lbad_subarch:
-SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK)
- /* Unknown implementation; there's really
- nothing we can do at this point. */
- ud2a
-
- __INITDATA
-
-subarch_entries:
- .long .Ldefault_entry /* normal x86/PC */
- .long xen_entry /* Xen hypervisor */
- .long .Ldefault_entry /* Moorestown MID */
-num_subarch_entries = (. - subarch_entries) / 4
-.previous
-#else
jmp .Ldefault_entry
-#endif /* CONFIG_PARAVIRT */
SYM_CODE_END(startup_32)
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index dd73135d7cee..beb1bada1b0a 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -22,6 +22,8 @@
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/i8259.h>
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 6eb8b50ea07e..4eb8f2d19a87 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -13,6 +13,8 @@
#include <linux/reboot.h>
#include <linux/serial_8250.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
#include <asm/cpu.h>
#include <asm/hypervisor.h>
#include <asm/i8259.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 233c77d056c9..08320b0b2b27 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -7,8 +7,11 @@
* Authors: Anthony Liguori <aliguori@us.ibm.com>
*/
+#define pr_fmt(fmt) "kvm-guest: " fmt
+
#include <linux/context_tracking.h>
#include <linux/init.h>
+#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
@@ -232,16 +235,11 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
- u32 reason = kvm_read_and_reset_apf_flags();
+ u32 flags = kvm_read_and_reset_apf_flags();
irqentry_state_t state;
- switch (reason) {
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
- case KVM_PV_REASON_PAGE_READY:
- break;
- default:
+ if (!flags)
return false;
- }
state = irqentry_enter(regs);
instrumentation_begin();
@@ -254,13 +252,13 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
panic("Host injected async #PF in interrupt disabled region\n");
- if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
if (unlikely(!(user_mode(regs))))
panic("Host injected async #PF in kernel mode\n");
/* Page is swapped out by the host. */
kvm_async_pf_task_wait_schedule(token);
} else {
- kvm_async_pf_task_wake(token);
+ WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
}
instrumentation_end();
@@ -268,6 +266,27 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return true;
}
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ u32 token;
+ irqentry_state_t state;
+
+ state = irqentry_enter(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (__this_cpu_read(apf_reason.enabled)) {
+ token = __this_cpu_read(apf_reason.token);
+ kvm_async_pf_task_wake(token);
+ __this_cpu_write(apf_reason.token, 0);
+ wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+ }
+
+ irqentry_exit(regs, state);
+ set_irq_regs(old_regs);
+}
+
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
@@ -289,8 +308,8 @@ static void kvm_register_steal_time(void)
return;
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
- pr_info("kvm-stealtime: cpu %d, msr %llx\n",
- cpu, (unsigned long long) slow_virt_to_phys(st));
+ pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+ (unsigned long long) slow_virt_to_phys(st));
}
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -311,17 +330,19 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa;
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
- pa |= KVM_ASYNC_PF_ENABLED;
+ pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+ wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
@@ -493,7 +514,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
- WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
min = max = apic_id;
ipi_bitmap = 0;
}
@@ -503,7 +525,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
if (ipi_bitmap) {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
- WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
}
local_irq_restore(flags);
@@ -533,7 +556,7 @@ static void kvm_setup_pv_ipi(void)
{
apic->send_IPI_mask = kvm_send_ipi_mask;
apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
- pr_info("KVM setup pv IPIs\n");
+ pr_info("setup PV IPIs\n");
}
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
@@ -551,13 +574,6 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
}
}
-static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
-{
- native_smp_prepare_cpus(max_cpus);
- if (kvm_para_has_hint(KVM_HINTS_REALTIME))
- static_branch_disable(&virt_spin_lock_key);
-}
-
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
@@ -646,19 +662,20 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
static_branch_enable(&kvm_async_pf_enabled);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
+ }
#ifdef CONFIG_SMP
- smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
- pr_info("KVM setup pv sched yield\n");
+ pr_info("setup PV sched yield\n");
}
if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
- pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
+ pr_err("failed to install cpu hotplug callbacks\n");
#else
sev_map_percpu_data();
kvm_guest_cpu_init();
@@ -854,16 +871,36 @@ asm(
*/
void __init kvm_spinlock_init(void)
{
- /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
- if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ /*
+ * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+ * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+ * preferred over native qspinlock when vCPU is preempted.
+ */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+ pr_info("PV spinlocks disabled, no host support\n");
return;
+ }
- if (kvm_para_has_hint(KVM_HINTS_REALTIME))
- return;
+ /*
+ * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+ * are available.
+ */
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
+ pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+ goto out;
+ }
- /* Don't use the pvqspinlock code if there is only 1 vCPU. */
- if (num_possible_cpus() == 1)
- return;
+ if (num_possible_cpus() == 1) {
+ pr_info("PV spinlocks disabled, single CPU\n");
+ goto out;
+ }
+
+ if (nopvspin) {
+ pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+ goto out;
+ }
+
+ pr_info("PV spinlocks enabled\n");
__pv_init_lock_hash();
pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
@@ -876,6 +913,13 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.vcpu_is_preempted =
PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
+ /*
+ * When PV spinlock is enabled which is preferred over
+ * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
+ * Just disable it anyway.
+ */
+out:
+ static_branch_disable(&virt_spin_lock_key);
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
@@ -895,8 +939,8 @@ static void kvm_enable_host_haltpoll(void *i)
void arch_haltpoll_enable(unsigned int cpu)
{
if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
- pr_err_once("kvm: host does not support poll control\n");
- pr_err_once("kvm: host upgrade recommended\n");
+ pr_err_once("host does not support poll control\n");
+ pr_err_once("host upgrade recommended\n");
return;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index afac7ccce72f..411af4aa7b51 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,10 +19,11 @@
#include <linux/smp.h>
#include <linux/pci.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d6f946707270..9afefe325acb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -390,7 +390,7 @@ unsigned long x86_fsgsbase_read_task(struct task_struct *task,
*/
mutex_lock(&task->mm->context.lock);
ldt = task->mm->context.ldt;
- if (unlikely(idx >= ldt->nr_entries))
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
base = 0;
else
base = get_desc_base(ldt->entries + idx);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 3f006489087f..5679aa3fdcb8 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -412,26 +412,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count >= sizeof(*k)) {
- *k++ = getreg(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count >= sizeof(*u)) {
- if (__put_user(getreg(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++)
+ membuf_store(&to, getreg(target, reg * sizeof(unsigned long)));
return 0;
}
@@ -695,16 +681,14 @@ static int ioperm_active(struct task_struct *target,
static int ioperm_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct io_bitmap *iobm = target->thread.io_bitmap;
if (!iobm)
return -ENXIO;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- iobm->bitmap, 0, IO_BITMAP_BYTES);
+ return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES);
}
/*
@@ -1007,28 +991,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
static int genregs32_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count >= sizeof(*k)) {
- getreg32(target, pos, k++);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count >= sizeof(*u)) {
- compat_ulong_t word;
- getreg32(target, pos, &word);
- if (__put_user(word, u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++) {
+ u32 val;
+ getreg32(target, reg * 4, &val);
+ membuf_store(&to, val);
+ }
return 0;
}
@@ -1238,25 +1209,25 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
.size = sizeof(long), .align = sizeof(long),
- .get = genregs_get, .set = genregs_set
+ .regset_get = genregs_get, .set = genregs_set
},
[REGSET_FP] = {
.core_note_type = NT_PRFPREG,
.n = sizeof(struct user_i387_struct) / sizeof(long),
.size = sizeof(long), .align = sizeof(long),
- .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
},
[REGSET_XSTATE] = {
.core_note_type = NT_X86_XSTATE,
.size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .get = xstateregs_get,
+ .active = xstateregs_active, .regset_get = xstateregs_get,
.set = xstateregs_set
},
[REGSET_IOPERM64] = {
.core_note_type = NT_386_IOPERM,
.n = IO_BITMAP_LONGS,
.size = sizeof(long), .align = sizeof(long),
- .active = ioperm_active, .get = ioperm_get
+ .active = ioperm_active, .regset_get = ioperm_get
},
};
@@ -1279,24 +1250,24 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
.size = sizeof(u32), .align = sizeof(u32),
- .get = genregs32_get, .set = genregs32_set
+ .regset_get = genregs32_get, .set = genregs32_set
},
[REGSET_FP] = {
.core_note_type = NT_PRFPREG,
.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
.size = sizeof(u32), .align = sizeof(u32),
- .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
+ .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set
},
[REGSET_XFP] = {
.core_note_type = NT_PRXFPREG,
.n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
.size = sizeof(u32), .align = sizeof(u32),
- .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
},
[REGSET_XSTATE] = {
.core_note_type = NT_X86_XSTATE,
.size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .get = xstateregs_get,
+ .active = xstateregs_active, .regset_get = xstateregs_get,
.set = xstateregs_set
},
[REGSET_TLS] = {
@@ -1305,13 +1276,13 @@ static struct user_regset x86_32_regsets[] __ro_after_init = {
.size = sizeof(struct user_desc),
.align = sizeof(struct user_desc),
.active = regset_tls_active,
- .get = regset_tls_get, .set = regset_tls_set
+ .regset_get = regset_tls_get, .set = regset_tls_set
},
[REGSET_IOPERM32] = {
.core_note_type = NT_386_IOPERM,
.n = IO_BITMAP_BYTES / sizeof(u32),
.size = sizeof(u32), .align = sizeof(u32),
- .active = ioperm_active, .get = ioperm_get
+ .active = ioperm_active, .regset_get = ioperm_get
},
};
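The regset conversions above replace the pos/count/kbuf/ubuf plumbing with a struct membuf that the ->regset_get() hook simply fills until to.left reaches zero. A minimal user-space analog of that pattern follows; the membuf type and helpers here are simplified stand-ins written for illustration, not the kernel's actual definitions.

#include <stdio.h>
#include <string.h>

struct membuf {
	void *p;	/* next byte to fill in the destination */
	size_t left;	/* bytes remaining in the destination */
};

static size_t membuf_write(struct membuf *s, const void *v, size_t size)
{
	size_t n = size < s->left ? size : s->left;

	memcpy(s->p, v, n);
	s->p = (char *)s->p + n;
	s->left -= n;
	return s->left;
}

#define membuf_store(s, v)						\
	({ typeof(v) __val = (v); membuf_write((s), &__val, sizeof(__val)); })

int main(void)
{
	unsigned long regs[4] = { 1, 2, 3, 4 };	/* pretend register file */
	unsigned long out[3];			/* smaller destination buffer */
	struct membuf to = { .p = out, .left = sizeof(out) };
	int reg;

	/* Same shape as the converted genregs_get(): store until full. */
	for (reg = 0; to.left; reg++)
		membuf_store(&to, regs[reg]);

	printf("%lu %lu %lu\n", out[0], out[1], out[2]);
	return 0;
}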
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a3767e74c758..3511736fbc74 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -25,6 +25,7 @@
#include <xen/xen.h>
#include <asm/apic.h>
+#include <asm/numa.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -870,8 +871,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 71d3fef1edc9..64a496a0687f 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -256,36 +256,16 @@ int regset_tls_active(struct task_struct *target,
}
int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
const struct desc_struct *tls;
+ struct user_desc v;
+ int pos;
- if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
- (pos % sizeof(struct user_desc)) != 0 ||
- (count % sizeof(struct user_desc)) != 0)
- return -EINVAL;
-
- pos /= sizeof(struct user_desc);
- count /= sizeof(struct user_desc);
-
- tls = &target->thread.tls_array[pos];
-
- if (kbuf) {
- struct user_desc *info = kbuf;
- while (count-- > 0)
- fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
- tls++);
- } else {
- struct user_desc __user *u_info = ubuf;
- while (count-- > 0) {
- struct user_desc info;
- fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
- if (__copy_to_user(u_info++, &info, sizeof(info)))
- return -EFAULT;
- }
+ for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) {
+ fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls);
+ membuf_write(&to, &v, sizeof(v));
}
-
return 0;
}
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
index 3a76e1d3535e..fc39447a0c1a 100644
--- a/arch/x86/kernel/tls.h
+++ b/arch/x86/kernel/tls.h
@@ -12,7 +12,7 @@
#include <linux/regset.h>
extern user_regset_active_fn regset_tls_active;
-extern user_regset_get_fn regset_tls_get;
+extern user_regset_get2_fn regset_tls_get;
extern user_regset_set_fn regset_tls_set;
#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index b8810ebbc8ae..0a2ec801b63f 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -31,6 +31,7 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/irq.h>
+#include <asm/io_apic.h>
#include <asm/cpu.h>
static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 438fc554d48d..1f66d2d1e998 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,7 +62,6 @@
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 4fec6f3a1858..46c72f2ec32f 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -7,6 +7,7 @@
*/
#include <linux/kernel.h>
+#include <linux/thread_info.h>
#include <asm/apic.h>
#include <asm/cpu_device_id.h>
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..3fd6eec202d7 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -54,28 +54,38 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
#define F feature_bit
-int kvm_update_cpuid(struct kvm_vcpu *vcpu)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
- struct kvm_lapic *apic = vcpu->arch.apic;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (!best)
- return 0;
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
- /* Update OSXSAVE bit */
- if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1)
- cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
+ return 0;
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (best) {
+ /* Update OSXSAVE bit */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
- cpuid_entry_change(best, X86_FEATURE_APIC,
+ cpuid_entry_change(best, X86_FEATURE_APIC,
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
-
- if (apic) {
- if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
- apic->lapic_timer.timer_mode_mask = 3 << 17;
- else
- apic->lapic_timer.timer_mode_mask = 1 << 17;
}
best = kvm_find_cpuid_entry(vcpu, 7, 0);
@@ -84,31 +94,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
- if (!best) {
- vcpu->arch.guest_supported_xcr0 = 0;
- } else {
- vcpu->arch.guest_supported_xcr0 =
- (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
- }
best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- /*
- * The existing code assumes virtual address is 48-bit or 57-bit in the
- * canonical address checks; exit if it is ever changed.
- */
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best) {
- int vaddr_bits = (best->eax & 0xff00) >> 8;
-
- if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
- return -EINVAL;
- }
-
best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
@@ -121,14 +114,39 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
+}
+
+static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *best;
+
+ kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (best && apic) {
+ if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+
+ kvm_apic_set_version(vcpu);
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ if (!best)
+ vcpu->arch.guest_supported_xcr0 = 0;
+ else
+ vcpu->arch.guest_supported_xcr0 =
+ (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
- /* Note, maxphyaddr must be updated before tdp_level. */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
kvm_mmu_reset_context(vcpu);
kvm_pmu_refresh(vcpu);
- return 0;
+ vcpu->arch.cr4_guest_rsvd_bits =
+ __cr4_reserved_bits(guest_cpuid_has, vcpu);
+ kvm_x86_ops.update_exception_bitmap(vcpu);
}
static int is_efer_nx(void)
@@ -203,10 +221,16 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_entries[i].padding[2] = 0;
}
vcpu->arch.cpuid_nent = cpuid->nent;
+ r = kvm_check_cpuid(vcpu);
+ if (r) {
+ vcpu->arch.cpuid_nent = 0;
+ kvfree(cpuid_entries);
+ goto out;
+ }
+
cpuid_fix_nx_cap(vcpu);
- kvm_apic_set_version(vcpu);
- kvm_x86_ops.cpuid_update(vcpu);
- r = kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
kvfree(cpuid_entries);
out:
@@ -227,9 +251,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
- kvm_apic_set_version(vcpu);
- kvm_x86_ops.cpuid_update(vcpu);
- r = kvm_update_cpuid(vcpu);
+ r = kvm_check_cpuid(vcpu);
+ if (r) {
+ vcpu->arch.cpuid_nent = 0;
+ goto out;
+ }
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
out:
return r;
}
@@ -341,7 +370,8 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_7_EDX,
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
- F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM)
+ F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
+ F(SERIALIZE)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -604,7 +634,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
eax.split.bit_width = cap.bit_width_gp;
eax.split.mask_length = cap.events_mask_len;
- edx.split.num_counters_fixed = cap.num_counters_fixed;
+ edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
edx.split.bit_width_fixed = cap.bit_width_fixed;
edx.split.reserved = 0;
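The kvm_check_cpuid() added above only accepts guest CPUID when leaf 0x80000008 reports 48-bit or 57-bit virtual addresses (or 0). A small user-space sketch of reading that leaf with the compiler-provided cpuid.h on x86 GCC/Clang; the EAX layout used here (bits 7:0 physical, bits 15:8 virtual) is the same one the check in the patch relies on.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.80000008H:EAX[7:0] = physical bits, EAX[15:8] = virtual bits. */
	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "leaf 0x80000008 not supported\n");
		return 1;
	}

	printf("physical address bits: %u\n", eax & 0xff);
	printf("virtual address bits:  %u\n", (eax >> 8) & 0xff);
	return 0;
}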
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 05434cd9342f..3a923ae15f2f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -9,7 +9,7 @@
extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
void kvm_set_cpu_caps(void);
-int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index af9cdb426dd2..814d3aee5cef 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -900,6 +900,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ synic->control = HV_SYNIC_CONTROL_ENABLE;
return 0;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4ce2ddd26c0b..5ccbee7165a2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -354,7 +354,6 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
if (!lapic_in_kernel(vcpu))
@@ -367,8 +366,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
* version first and level-triggered interrupts never get EOIed in
* IOAPIC.
*/
- feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
!ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
@@ -2068,7 +2066,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_TDCR: {
uint32_t old_divisor = apic->divide_count;
- kvm_lapic_set_reg(apic, APIC_TDCR, val);
+ kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
update_divide_count(apic);
if (apic->divide_count != old_divisor &&
apic->lapic_timer.period) {
@@ -2085,7 +2083,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_SELF_IPI:
if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ kvm_lapic_reg_write(apic, APIC_ICR,
+ APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
} else
ret = 1;
break;
@@ -2232,7 +2231,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
if (!apic)
return;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 444bb9c54548..5efc6081ca13 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#include "cpuid.h"
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -57,22 +58,14 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
+ gpa_t nested_cr3);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
-{
- if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
- return kvm->arch.n_max_mmu_pages -
- kvm->arch.n_used_mmu_pages;
-
- return 0;
-}
-
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE))
@@ -97,9 +90,13 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
- if (VALID_PAGE(vcpu->arch.mmu->root_hpa))
- kvm_x86_ops.load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa |
- kvm_get_active_pcid(vcpu));
+ u64 root_hpa = vcpu->arch.mmu->root_hpa;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
+ vcpu->arch.mmu->shadow_root_level);
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -158,6 +155,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
+static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
@@ -218,11 +220,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e2e2e5cc7dc6..4e03841f053d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -18,6 +18,7 @@
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
+#include "mmu_internal.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
@@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
*/
bool tdp_enabled = false;
-static int max_page_level __read_mostly;
+static int max_huge_page_level __read_mostly;
+static int max_tdp_level __read_mostly;
enum {
AUDIT_PRE_PAGE_FAULT,
@@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ /* Check if guest physical address doesn't exceed guest maximum */
+ if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ exception->error_code |= PFERR_RSVD_MASK;
+ return UNMAPPED_GVA;
+ }
+
+ return gpa;
+}
+
/*
* Sets the shadow PTE masks used by the MMU.
*
@@ -676,7 +690,7 @@ union split_spte {
static void count_spte_clear(u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
if (is_shadow_present_pte(spte))
return;
@@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
*/
static u64 __get_spte_lockless(u64 *sptep)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
union split_spte spte, *orig = (union split_spte *)sptep;
int count;
@@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
local_irq_enable();
}
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
- if (!obj)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
-{
- return cache->nobjs;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
- struct kmem_cache *cache)
-{
- while (mc->nobjs)
- kmem_cache_free(cache, mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- void *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = page;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ return r;
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
}
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+ return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_memory_cache *cache;
+ struct kvm_mmu_memory_cache *mc;
- cache = &vcpu->arch.mmu_pte_list_desc_cache;
- return mmu_memory_cache_free_objects(cache);
+ mc = &vcpu->arch.mmu_pte_list_desc_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(mc);
}
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmap_head);
@@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
__pte_list_remove(spte, rmap_head);
@@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
+ WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
--kvm->stat.lpages;
return true;
@@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (__drop_large_spte(vcpu->kvm, sptep)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
@@ -1738,21 +1698,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-/**
- * kvm_arch_write_log_dirty - emulate dirty page logging
- * @vcpu: Guest mode vcpu
- *
- * Emulate arch specific page modification logging for the
- * nested hypervisor
- */
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
-{
- if (kvm_x86_ops.write_log_dirty)
- return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa);
-
- return 0;
-}
-
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
@@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
@@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
{
struct kvm_mmu_page *sp;
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
/*
@@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte)
struct kvm_mmu_page *sp;
unsigned int index;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
index = spte - sp->spt;
if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
@@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = page_header(ent & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-
-#define for_each_valid_sp(_kvm, _sp, _gfn) \
- hlist_for_each_entry(_sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+#define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
static inline bool is_ept_sp(struct kvm_mmu_page *sp)
@@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
static void clear_sp_write_flooding_count(u64 *spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- __clear_sp_write_flooding_count(sp);
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int direct,
unsigned int access)
{
+ bool direct_mmu = vcpu->arch.mmu->direct_map;
union kvm_mmu_page_role role;
+ struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
@@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.gpte_is_8_bytes = true;
role.access = access;
- if (!vcpu->arch.mmu->direct_map
- && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_valid_sp(vcpu->kvm, sp, gfn) {
+
+ sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+ for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
collisions++;
continue;
@@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->role.word != role.word)
continue;
+ if (direct_mmu)
+ goto trace_get_page;
+
if (sp->unsync) {
/* The page is good, but __kvm_sync_page might still end
* up zapping it. If so, break in order to rebuild it.
@@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
__clear_sp_write_flooding_count(sp);
+
+trace_get_page:
trace_kvm_mmu_get_page(sp, false);
goto out;
}
@@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link,
- &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
/*
* we should do write protection before syncing pages
@@ -2548,7 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PG_LEVEL_4K && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@ -2657,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
if (child->role.access == direct_access)
return;
@@ -2679,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_large_pte(pte))
--kvm->stat.lpages;
} else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
return true;
@@ -2757,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
if (!sp->root_count) {
/* Count self */
(*nr_zapped)++;
- list_move(&sp->link, invalid_list);
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
/*
* Obsolete pages cannot be used on any vCPUs, see the comment
@@ -2812,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
}
}
-static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
- struct list_head *invalid_list)
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
{
- struct kvm_mmu_page *sp;
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
if (list_empty(&kvm->arch.active_mmu_pages))
- return false;
+ return 0;
+
+restart:
+ list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
- sp = list_last_entry(&kvm->arch.active_mmu_pages,
- struct kvm_mmu_page, link);
- return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+}
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
}
static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
- LIST_HEAD(invalid_list);
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
- if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
return 0;
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
- if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
- break;
-
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
@@ -2851,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- LIST_HEAD(invalid_list);
-
spin_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- /* Need to free some mmu pages to achieve the goal. */
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
- if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- break;
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -2999,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp_ad_disabled(sp))
spte |= SPTE_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
@@ -3102,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -3212,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
struct kvm_mmu_page *sp;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
/*
* Without accessed bits, there's no way to distinguish between
@@ -3274,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- max_level = min(max_level, max_page_level);
+ max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
linfo = lpage_info_slot(gfn, slot, max_level);
if (!linfo->disallow_lpage)
@@ -3520,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!is_shadow_present_pte(spte))
break;
- sp = page_header(__pa(iterator.sptep));
+ sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3607,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3668,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
@@ -3837,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root_hpa;
- sp = page_header(root);
+ sp = to_shadow_page(root);
/*
* Even if another CPU was marking the SP as unsync-ed
@@ -3871,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
+ sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
}
}
@@ -4045,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
@@ -4108,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- r = mmu_topup_memory_caches(vcpu);
+ if (fast_page_fault(vcpu, gpa, error_code))
+ return RET_PF_RETRY;
+
+ r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
if (lpage_disallowed)
max_level = PG_LEVEL_4K;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -4131,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
prefault, is_tdp && lpage_disallowed);
@@ -4156,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len)
{
int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
#ifndef CONFIG_X86_64
/* A 64-bit CR2 should be impossible on 32-bit KVM. */
@@ -4164,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
#endif
vcpu->arch.l1tf_flush_l1d = true;
- switch (vcpu->arch.apf.host_apf_flags) {
- default:
+ if (!flags) {
trace_kvm_page_fault(fault_address, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
- break;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
vcpu->arch.apf.host_apf_flags = 0;
local_irq_disable();
kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
- break;
- case KVM_PV_REASON_PAGE_READY:
- vcpu->arch.apf.host_apf_flags = 0;
- local_irq_disable();
- kvm_async_pf_task_wake(fault_address);
- local_irq_enable();
- break;
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
}
+
return r;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
@@ -4227,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && page_header(root->hpa) &&
- role.word == page_header(root->hpa)->role.word;
+ VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ role.word == to_shadow_page(root->hpa)->role.word;
}
/*
@@ -4277,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
mmu->root_level >= PT64_ROOT_4LEVEL)
- return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
- cached_root_available(vcpu, new_pgd, new_role);
+ return cached_root_available(vcpu, new_pgd, new_role);
return false;
}
@@ -4313,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
+ __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa));
}
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
@@ -4869,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
return role;
}
+static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+{
+ /* Use 5-level TDP if and only if it's useful/necessary. */
+ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ return 4;
+
+ return max_tdp_level;
+}
+
static union kvm_mmu_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
role.base.ad_disabled = (shadow_accessed_mask == 0);
- role.base.level = vcpu->arch.tdp_level;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
role.base.gpte_is_8_bytes = true;
@@ -4884,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_role new_role =
kvm_calc_tdp_mmu_root_page_role(vcpu, false);
@@ -4896,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = vcpu->arch.tdp_level;
+ context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
@@ -4931,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
@@ -4939,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
!is_write_protection(vcpu);
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
- role.base.direct = !is_paging(vcpu);
role.base.gpte_is_8_bytes = !!is_pae(vcpu);
+ return role;
+}
+
+static union kvm_mmu_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+{
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+
+ role.base.direct = !is_paging(vcpu);
+
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
else if (is_la57_mode(vcpu))
@@ -4952,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
return role;
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ u32 cr0, u32 cr4, u32 efer,
+ union kvm_mmu_role new_role)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
- union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
-
if (!(cr0 & X86_CR0_PG))
nonpaging_init_context(vcpu, context);
else if (efer & EFER_LMA)
@@ -4973,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
context->mmu_role.as_u64 = new_role.as_u64;
reset_shadow_zero_bits_mask(vcpu, context);
}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+}
+
+static union kvm_mmu_role
+kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, false);
+
+ role.base.direct = false;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
+
+ return role;
+}
+
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
+ gpa_t nested_cr3)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+
+ context->shadow_root_level = new_role.base.level;
+
+ __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
@@ -5007,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
union kvm_mmu_role new_role =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
@@ -5041,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
kvm_init_shadow_mmu(vcpu,
kvm_read_cr0_bits(vcpu, X86_CR0_PG),
@@ -5151,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- r = mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
r = mmu_alloc_roots(vcpu);
@@ -5345,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
* or not since pte prefetch is skipped if it does not have
* enough objects in the cache.
*/
- mmu_topup_memory_caches(vcpu);
+ mmu_topup_memory_caches(vcpu, true);
spin_lock(&vcpu->kvm->mmu_lock);
@@ -5553,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
-void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+ int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ max_tdp_level = tdp_max_root_level;
/*
- * max_page_level reflects the capabilities of KVM's MMU irrespective
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
* the kernel is not. But, KVM never creates a page size greater than
* what is used by the kernel for any given HVA, i.e. the kernel's
* capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
*/
if (tdp_enabled)
- max_page_level = tdp_page_level;
+ max_huge_page_level = tdp_huge_page_level;
else if (boot_cpu_has(X86_FEATURE_GBPAGES))
- max_page_level = PG_LEVEL_1G;
+ max_huge_page_level = PG_LEVEL_1G;
else
- max_page_level = PG_LEVEL_2M;
+ max_huge_page_level = PG_LEVEL_2M;
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
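As a reading aid for the kvm_configure_mmu() hunk above, here is a minimal, self-contained sketch of the huge-page-level selection it now performs. The names pick_max_huge_page_level and host_has_gbpages are illustrative stand-ins, not kernel symbols; the enum values mirror the kernel's PG_LEVEL_* constants.

    #include <stdbool.h>
    #include <stdio.h>

    enum { PG_LEVEL_4K = 1, PG_LEVEL_2M, PG_LEVEL_1G };

    /*
     * Mirrors the new logic: with TDP enabled the vendor module dictates the
     * maximum huge page level; without TDP it falls back to host 1G/2M support.
     */
    static int pick_max_huge_page_level(bool tdp_enabled, int tdp_huge_page_level,
                                        bool host_has_gbpages)
    {
        if (tdp_enabled)
            return tdp_huge_page_level;
        return host_has_gbpages ? PG_LEVEL_1G : PG_LEVEL_2M;
    }

    int main(void)
    {
        /* e.g. SVM with NPT passes PG_LEVEL_1G, see the svm.c hunk below */
        printf("%d\n", pick_max_huge_page_level(true, PG_LEVEL_1G, false));  /* 3 */
        printf("%d\n", pick_max_huge_page_level(false, 0, false));           /* 2 */
        return 0;
    }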
@@ -5665,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
* skip allocating the PDP table.
*/
- if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
@@ -5684,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
uint i;
int ret;
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -5732,12 +5770,11 @@ restart:
break;
/*
- * Skip invalid pages with a non-zero root count, zapping pages
- * with a non-zero root count will never succeed, i.e. the page
- * will get thrown back on active_mmu_pages and we'll get stuck
- * in an infinite loop.
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
*/
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
/*
@@ -5904,7 +5941,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
pfn = spte_to_pfn(*sptep);
/*
@@ -6015,7 +6052,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
@@ -6092,9 +6129,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
goto unlock;
}
- if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- freed++;
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
spin_unlock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index 9d2844f87f6d..c8d51a37e2ce 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -45,7 +45,7 @@ static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
!is_last_spte(ent[i], level)) {
struct kvm_mmu_page *child;
- child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK);
__mmu_spte_walk(vcpu, child, fn, level - 1);
}
}
@@ -62,7 +62,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root_hpa;
- sp = page_header(root);
+ sp = to_shadow_page(root);
__mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level);
return;
}
@@ -72,7 +72,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
+ sp = to_shadow_page(root);
__mmu_spte_walk(vcpu, sp, fn, 2);
}
}
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
kvm_pfn_t pfn;
hpa_t hpa;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp->unsync) {
if (level != PG_LEVEL_4K) {
@@ -132,7 +132,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
struct kvm_memory_slot *slot;
gfn_t gfn;
- rev_sp = page_header(__pa(sptep));
+ rev_sp = sptep_to_sp(sptep);
gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
@@ -165,7 +165,7 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
new file mode 100644
index 000000000000..3acf3b8eb469
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_INTERNAL_H
+#define __KVM_X86_MMU_INTERNAL_H
+
+#include <linux/types.h>
+
+#include <asm/kvm_host.h>
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+ struct list_head lpage_disallowed_link;
+
+ bool unsync;
+ u8 mmu_valid_gen;
+ bool mmio_cached;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ union kvm_mmu_page_role role;
+ gfn_t gfn;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ int root_count; /* Currently serving as active root */
+ unsigned int unsync_children;
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
+ int clear_spte_count;
+#endif
+
+ /* Number of writes since the last time traversal visited this page. */
+ atomic_t write_flooding_count;
+};
+
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn);
+
+#endif /* __KVM_X86_MMU_INTERNAL_H */
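For readers following the page_header() to to_shadow_page()/sptep_to_sp() substitutions in the hunks below, a small user-space model of the lookup these helpers perform. struct page, page_array, shadow_pages, and the address handling are all mocks for illustration, not kernel code.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define NPAGES     4

    /* Mocked-down struct page / struct kvm_mmu_page, just enough for the model. */
    struct kvm_mmu_page { int root_count; };
    struct page { unsigned long private; };

    static struct page page_array[NPAGES];            /* stands in for pfn_to_page() */
    static struct kvm_mmu_page shadow_pages[NPAGES];

    /*
     * to_shadow_page(): physical address of a shadow page table -> the
     * kvm_mmu_page that owns it, recovered from the backing page's private field.
     */
    static struct kvm_mmu_page *to_shadow_page(uint64_t shadow_page)
    {
        struct page *page = &page_array[shadow_page >> PAGE_SHIFT];

        return (struct kvm_mmu_page *)page->private;
    }

    /*
     * sptep_to_sp(): physical address of one SPTE -> the shadow page containing
     * it; the >> PAGE_SHIFT above discards the in-page offset. (The real helper
     * takes a kernel pointer and applies __pa() first.)
     */
    static struct kvm_mmu_page *sptep_to_sp(uint64_t spte_pa)
    {
        return to_shadow_page(spte_pa);
    }

    int main(void)
    {
        /* Pretend pfn 2 backs a shadow page table owned by shadow_pages[2]. */
        page_array[2].private = (unsigned long)&shadow_pages[2];

        uint64_t spte_pa = (2ULL << PAGE_SHIFT) + 37 * sizeof(uint64_t);
        printf("%d\n", sptep_to_sp(spte_pa) == &shadow_pages[2]);   /* prints 1 */
        return 0;
    }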
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ffcd96fc02d0..9d15bc0c535b 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -387,7 +387,7 @@ TRACE_EVENT(
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_PATH mmu
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE mmutrace
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index a7bcde34d1f2..a84a141a2ad2 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -16,7 +16,7 @@
#include <asm/kvm_page_track.h>
-#include "mmu.h"
+#include "mmu_internal.h"
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 275564a0ebdb..4dd6b1e5b8cf 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
- if (kvm_arch_write_log_dirty(vcpu, addr))
+ if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
return -EINVAL;
#endif
pte |= PT_GUEST_DIRTY_MASK;
@@ -596,7 +596,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *spte;
int i;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp->role.level > PG_LEVEL_4K)
return;
@@ -789,10 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
/*
* If PFEC.RSVD is set, this is a shadow page fault.
* The bit needs to be cleared before walking guest page tables.
@@ -820,6 +816,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
return RET_PF_EMULATE;
}
+ r = mmu_topup_memory_caches(vcpu, true);
+ if (r)
+ return r;
+
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
@@ -866,7 +866,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
- if (make_mmu_pages_available(vcpu) < 0)
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
map_writable, prefault, lpage_disallowed);
@@ -903,7 +904,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
* No need to check the return value here; rmap_can_add() can
* help us to skip pte prefetch later.
*/
- mmu_topup_memory_caches(vcpu);
+ mmu_topup_memory_caches(vcpu, true);
if (!VALID_PAGE(root_hpa)) {
WARN_ON(1);
@@ -915,7 +916,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
level = iterator.level;
sptep = iterator.sptep;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (is_last_spte(*sptep, level)) {
pt_element_t gpte;
gpa_t pte_gpa;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index b86346903f2e..67741d2a0308 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -372,6 +372,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (!pmc)
return 1;
+ if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
+ (kvm_x86_ops.get_cpl(vcpu) != 0) &&
+ (kvm_read_cr0(vcpu) & X86_CR0_PE))
+ return 1;
+
*data = pmc_read_counter(pmc) & mask;
return 0;
}
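The check added to kvm_pmu_rdpmc() above encodes the architectural RDPMC permission rule: in protected mode, RDPMC from CPL > 0 faults unless CR4.PCE is set. A self-contained sketch of that predicate follows; rdpmc_should_fault is an illustrative name, not KVM code.

    #include <stdbool.h>
    #include <stdio.h>

    #define X86_CR0_PE  (1u << 0)   /* protected mode enable */
    #define X86_CR4_PCE (1u << 8)   /* RDPMC allowed at any CPL */

    /* true -> the access is rejected, matching the new "return 1" above */
    static bool rdpmc_should_fault(unsigned long cr0, unsigned long cr4, int cpl)
    {
        return !(cr4 & X86_CR4_PCE) && cpl != 0 && (cr0 & X86_CR0_PE);
    }

    int main(void)
    {
        printf("%d\n", rdpmc_should_fault(X86_CR0_PE, 0, 3));           /* 1 */
        printf("%d\n", rdpmc_should_fault(X86_CR0_PE, X86_CR4_PCE, 3)); /* 0 */
        return 0;
    }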
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ab85eed8a6cc..067fef51760c 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -15,6 +15,8 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+#define MAX_FIXED_COUNTERS 3
+
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index e80daa98682f..ac830cd50830 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -665,7 +665,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
} else {
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
}
- mark_dirty(vmcb, VMCB_AVIC);
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
svm_set_pi_irte_mode(vcpu, activated);
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6bceafb19108..fb68467e6049 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -48,13 +48,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
svm->vmcb->control.exit_info_1 |= fault->error_code;
- /*
- * The present bit is always zero for page structure faults on real
- * hardware.
- */
- if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
- svm->vmcb->control.exit_info_1 &= ~1;
-
nested_svm_vmexit(svm);
}
@@ -87,11 +80,11 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer);
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
+ svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level;
reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
@@ -106,7 +99,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h, *g;
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
if (!is_guest_mode(&svm->vcpu))
return;
@@ -222,8 +215,9 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
return true;
}
-static bool nested_vmcb_checks(struct vmcb *vmcb)
+static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
{
+ bool nested_vmcb_lma;
if ((vmcb->save.efer & EFER_SVME) == 0)
return false;
@@ -231,6 +225,30 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
(vmcb->save.cr0 & X86_CR0_NW))
return false;
+ if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
+ return false;
+
+ nested_vmcb_lma =
+ (vmcb->save.efer & EFER_LME) &&
+ (vmcb->save.cr0 & X86_CR0_PG);
+
+ if (!nested_vmcb_lma) {
+ if (vmcb->save.cr4 & X86_CR4_PAE) {
+ if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
+ return false;
+ } else {
+ if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
+ return false;
+ }
+ } else {
+ if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
+ !(vmcb->save.cr0 & X86_CR0_PE) ||
+ (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
+ return false;
+ }
+ if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
+ return false;
+
return nested_vmcb_check_controls(&vmcb->control);
}
@@ -258,7 +276,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm)
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
- is_intercept(svm, INTERCEPT_VINTR)) {
+ svm_is_intercept(svm, INTERCEPT_VINTR)) {
/*
* In order to request an interrupt window, L0 is usurping
* svm->vmcb->control.int_ctl and possibly setting V_IRQ
@@ -310,6 +328,42 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
nested_vmcb->control.exit_int_info = exit_int_info;
}
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
+/*
+ * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
+ * if we are emulating VM-Entry into a guest with NPT enabled.
+ */
+static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_npt)
+{
+ if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
+ return -EINVAL;
+
+ if (!nested_npt && is_pae_paging(vcpu) &&
+ (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
+ if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return -EINVAL;
+ }
+
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync here and in
+ * kvm_init_shadow_npt_mmu().
+ */
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3, false, false);
+
+ vcpu->arch.cr3 = cr3;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+
+ kvm_init_mmu(vcpu, false);
+
+ return 0;
+}
+
static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
{
/* Load the nested guest state */
@@ -323,8 +377,6 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
@@ -342,13 +394,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
{
const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
- if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE)
- nested_svm_init_mmu_context(&svm->vcpu);
-
- /* Guest paging mode is active - reset mmu */
- kvm_mmu_reset_context(&svm->vcpu);
- svm_flush_tlb(&svm->vcpu);
+ if (nested_npt_enabled(svm))
+ nested_svm_init_mmu_context(&svm->vcpu);
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
@@ -375,18 +423,27 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
*/
recalc_intercepts(svm);
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
}
-void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
struct vmcb *nested_vmcb)
{
+ int ret;
+
svm->nested.vmcb = vmcb_gpa;
load_nested_vmcb_control(svm, &nested_vmcb->control);
nested_prepare_vmcb_save(svm, nested_vmcb);
nested_prepare_vmcb_control(svm);
+ ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
+ nested_npt_enabled(svm));
+ if (ret)
+ return ret;
+
svm_set_gif(svm, true);
+
+ return 0;
}
int nested_svm_vmrun(struct vcpu_svm *svm)
@@ -416,7 +473,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb = map.hva;
- if (!nested_vmcb_checks(nested_vmcb)) {
+ if (!nested_vmcb_checks(svm, nested_vmcb)) {
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
nested_vmcb->control.exit_code_hi = 0;
nested_vmcb->control.exit_info_1 = 0;
@@ -464,16 +521,22 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
copy_vmcb_control_area(&hsave->control, &vmcb->control);
svm->nested.nested_run_pending = 1;
- enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb);
- if (!nested_svm_vmrun_msrpm(svm)) {
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
+ if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
+ goto out_exit_err;
- nested_svm_vmexit(svm);
- }
+ if (nested_svm_vmrun_msrpm(svm))
+ goto out;
+
+out_exit_err:
+ svm->nested.nested_run_pending = 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
out:
kvm_vcpu_unmap(&svm->vcpu, &map, true);
@@ -585,12 +648,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = hsave->save.cr3;
- svm->vcpu.arch.cr3 = hsave->save.cr3;
- } else {
- (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
- }
kvm_rax_write(&svm->vcpu, hsave->save.rax);
kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
kvm_rip_write(&svm->vcpu, hsave->save.rip);
@@ -598,7 +655,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
nested_vmcb->control.exit_info_1,
@@ -610,8 +667,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_vcpu_unmap(&svm->vcpu, &map, true);
nested_svm_uninit_mmu_context(&svm->vcpu);
- kvm_mmu_reset_context(&svm->vcpu);
- kvm_mmu_load(&svm->vcpu);
+
+ rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
+ if (rc)
+ return 1;
+
+ if (npt_enabled)
+ svm->vmcb->save.cr3 = hsave->save.cr3;
/*
* Drop what we picked up for L2 via svm_complete_interrupts() so it
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 5573a97f1520..402dc4234e39 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -313,13 +313,15 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
int write)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- unsigned long npages, npinned, size;
+ unsigned long npages, size;
+ int npinned;
unsigned long locked, lock_limit;
struct page **pages;
unsigned long first, last;
+ int ret;
if (ulen == 0 || uaddr + ulen < uaddr)
- return NULL;
+ return ERR_PTR(-EINVAL);
/* Calculate number of pages. */
first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
@@ -330,9 +332,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
+ if (WARN_ON_ONCE(npages > INT_MAX))
+ return ERR_PTR(-EINVAL);
+
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
@@ -341,12 +346,13 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
- return NULL;
+ return ERR_PTR(-ENOMEM);
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
+ ret = -ENOMEM;
goto err;
}
@@ -357,10 +363,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
err:
if (npinned > 0)
- release_pages(pages, npinned);
+ unpin_user_pages(pages, npinned);
kvfree(pages);
- return NULL;
+ return ERR_PTR(ret);
}
static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
@@ -368,7 +374,7 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- release_pages(pages, npages);
+ unpin_user_pages(pages, npages);
kvfree(pages);
sev->pages_locked -= npages;
}
@@ -434,8 +440,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
- if (!inpages) {
- ret = -ENOMEM;
+ if (IS_ERR(inpages)) {
+ ret = PTR_ERR(inpages);
goto e_free;
}
@@ -789,13 +795,13 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
/* lock userspace source and destination page */
src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
- if (!src_p)
- return -EFAULT;
+ if (IS_ERR(src_p))
+ return PTR_ERR(src_p);
dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
- if (!dst_p) {
+ if (IS_ERR(dst_p)) {
sev_unpin_memory(kvm, src_p, n);
- return -EFAULT;
+ return PTR_ERR(dst_p);
}
/*
@@ -860,8 +866,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
- if (!pages)
- return -ENOMEM;
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
/*
* The secret must be copied into a contiguous memory region; let's verify
@@ -987,8 +993,8 @@ int svm_register_enc_region(struct kvm *kvm,
return -ENOMEM;
region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
- if (!region->pages) {
- ret = -ENOMEM;
+ if (IS_ERR(region->pages)) {
+ ret = PTR_ERR(region->pages);
goto e_free;
}
@@ -1180,11 +1186,10 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
* 2) or this VMCB was executed on a different host CPU in previous VMRUNs.
*/
if (sd->sev_vmcbs[asid] == svm->vmcb &&
- svm->last_cpu == cpu)
+ svm->vcpu.arch.last_vmentry_cpu == cpu)
return;
- svm->last_cpu = cpu;
sd->sev_vmcbs[asid] = svm->vmcb;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
- mark_dirty(svm->vmcb, VMCB_ASID);
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
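The sev.c hunks above switch sev_pin_memory() from returning NULL on failure to the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention, which is what lets the updated callers propagate a precise errno. A minimal user-space model of that convention; pin_memory and the stand-in macros are illustrative, not the kernel implementation.

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* User-space stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
    static inline void *ERR_PTR(long error)     { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *pin_memory(unsigned long uaddr, unsigned long ulen)
    {
        if (ulen == 0 || uaddr + ulen < uaddr)  /* same overflow check as above */
            return ERR_PTR(-EINVAL);
        /* ... pinning would happen here ... */
        return (void *)0x1000;                  /* pretend success */
    }

    int main(void)
    {
        void *pages = pin_memory(0x5000, 0);

        if (IS_ERR(pages))
            printf("error %ld\n", PTR_ERR(pages));  /* error -22 */
        return 0;
    }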
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5bbf76189afa..03dd7bac8034 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -254,7 +254,7 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}
-static int get_npt_level(struct kvm_vcpu *vcpu)
+static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
return PT64_ROOT_4LEVEL;
@@ -282,7 +282,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
}
svm->vmcb->save.efer = efer | EFER_SVME;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
@@ -713,7 +713,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
pause_filter_count_max);
if (control->pause_filter_count != old) {
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
trace_kvm_ple_window_update(vcpu->vcpu_id,
control->pause_filter_count, old);
}
@@ -731,7 +731,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
pause_filter_count_shrink,
pause_filter_count);
if (control->pause_filter_count != old) {
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
trace_kvm_ple_window_update(vcpu->vcpu_id,
control->pause_filter_count, old);
}
@@ -885,7 +885,7 @@ static __init int svm_hardware_setup(void)
if (npt_enabled && !npt)
npt_enabled = false;
- kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
+ kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
if (nrips) {
@@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void)
svm_set_cpu_caps();
+ /*
+ * It seems that on AMD processors the PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using the
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it.
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly.
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
return 0;
err:
@@ -966,7 +981,7 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
return svm->vmcb->control.tsc_offset;
}
@@ -1002,38 +1017,38 @@ static void init_vmcb(struct vcpu_svm *svm)
if (enable_vmware_backdoor)
set_exception_intercept(svm, GP_VECTOR);
- set_intercept(svm, INTERCEPT_INTR);
- set_intercept(svm, INTERCEPT_NMI);
- set_intercept(svm, INTERCEPT_SMI);
- set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- set_intercept(svm, INTERCEPT_RDPMC);
- set_intercept(svm, INTERCEPT_CPUID);
- set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_INVLPG);
- set_intercept(svm, INTERCEPT_INVLPGA);
- set_intercept(svm, INTERCEPT_IOIO_PROT);
- set_intercept(svm, INTERCEPT_MSR_PROT);
- set_intercept(svm, INTERCEPT_TASK_SWITCH);
- set_intercept(svm, INTERCEPT_SHUTDOWN);
- set_intercept(svm, INTERCEPT_VMRUN);
- set_intercept(svm, INTERCEPT_VMMCALL);
- set_intercept(svm, INTERCEPT_VMLOAD);
- set_intercept(svm, INTERCEPT_VMSAVE);
- set_intercept(svm, INTERCEPT_STGI);
- set_intercept(svm, INTERCEPT_CLGI);
- set_intercept(svm, INTERCEPT_SKINIT);
- set_intercept(svm, INTERCEPT_WBINVD);
- set_intercept(svm, INTERCEPT_XSETBV);
- set_intercept(svm, INTERCEPT_RDPRU);
- set_intercept(svm, INTERCEPT_RSM);
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+ svm_set_intercept(svm, INTERCEPT_SMI);
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
- set_intercept(svm, INTERCEPT_MONITOR);
- set_intercept(svm, INTERCEPT_MWAIT);
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
}
if (!kvm_hlt_in_guest(svm->vcpu.kvm))
- set_intercept(svm, INTERCEPT_HLT);
+ svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
@@ -1077,7 +1092,7 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
- clr_intercept(svm, INTERCEPT_INVLPG);
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
@@ -1094,9 +1109,9 @@ static void init_vmcb(struct vcpu_svm *svm)
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
- set_intercept(svm, INTERCEPT_PAUSE);
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
} else {
- clr_intercept(svm, INTERCEPT_PAUSE);
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
if (kvm_vcpu_apicv_active(&svm->vcpu))
@@ -1107,14 +1122,14 @@ static void init_vmcb(struct vcpu_svm *svm)
* in VMCB and clear intercepts to avoid #VMEXIT.
*/
if (vls) {
- clr_intercept(svm, INTERCEPT_VMLOAD);
- clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
if (vgif) {
- clr_intercept(svm, INTERCEPT_STGI);
- clr_intercept(svm, INTERCEPT_CLGI);
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
@@ -1123,7 +1138,7 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, UD_VECTOR);
}
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
enable_gif(svm);
@@ -1257,7 +1272,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
}
#ifdef CONFIG_X86_64
@@ -1356,7 +1371,7 @@ static void svm_set_vintr(struct vcpu_svm *svm)
/* The following fields are ignored when AVIC is enabled */
WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
- set_intercept(svm, INTERCEPT_VINTR);
+ svm_set_intercept(svm, INTERCEPT_VINTR);
/*
* This is just a dummy VINTR to actually cause a vmexit to happen.
@@ -1367,13 +1382,13 @@ static void svm_set_vintr(struct vcpu_svm *svm)
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
- mark_dirty(svm->vmcb, VMCB_INTR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
- clr_intercept(svm, INTERCEPT_VINTR);
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
@@ -1385,7 +1400,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
}
- mark_dirty(svm->vmcb, VMCB_INTR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1503,7 +1518,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1520,7 +1535,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
static void update_cr0_intercept(struct vcpu_svm *svm)
@@ -1531,7 +1546,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1572,7 +1587,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
@@ -1592,7 +1607,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
return 0;
}
@@ -1624,10 +1639,10 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
/* This is symmetric with svm_get_segment() */
svm->vmcb->save.cpl = (var->dpl & 3);
- mark_dirty(svm->vmcb, VMCB_SEG);
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1636,8 +1651,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
- } else
- vcpu->guest_debug = 0;
+ }
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1651,7 +1665,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
- mark_dirty(svm->vmcb, VMCB_ASID);
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
@@ -1660,7 +1674,7 @@ static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
if (unlikely(value != vmcb->save.dr6)) {
vmcb->save.dr6 = value;
- mark_dirty(vmcb, VMCB_DR);
+ vmcb_mark_dirty(vmcb, VMCB_DR);
}
}
@@ -1687,7 +1701,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
- mark_dirty(svm->vmcb, VMCB_DR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
@@ -2000,8 +2014,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* again while processing KVM_REQ_EVENT if needed.
*/
if (vgif_enabled(svm))
- clr_intercept(svm, INTERCEPT_STGI);
- if (is_intercept(svm, INTERCEPT_VINTR))
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
enable_gif(svm);
@@ -2162,7 +2176,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm_clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@ -2358,8 +2372,10 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
return 0;
@@ -2512,7 +2528,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 1;
vcpu->arch.pat = data;
svm->vmcb->save.g_pat = data;
- mark_dirty(svm->vmcb, VMCB_NPT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -2522,7 +2538,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
- if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
svm->spec_ctrl = data;
@@ -2617,7 +2633,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 1;
svm->vmcb->save.dbgctl = data;
- mark_dirty(svm->vmcb, VMCB_LBR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
@@ -2947,6 +2963,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
+ kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
dump_vmcb(vcpu);
return 0;
}
@@ -2970,8 +2987,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -2992,21 +3010,18 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
static void reload_tss(struct kvm_vcpu *vcpu)
{
- int cpu = raw_smp_processor_id();
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
sd->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
}
static void pre_svm_run(struct vcpu_svm *svm)
{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
if (sev_guest(svm->vcpu.kvm))
- return pre_sev_run(svm, cpu);
+ return pre_sev_run(svm, svm->vcpu.cpu);
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
@@ -3019,7 +3034,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3040,7 +3055,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@ -3096,10 +3111,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm_set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3179,7 +3194,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
if (!gif_set(svm)) {
if (vgif_enabled(svm))
- set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
}
@@ -3234,7 +3249,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
return;
if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
@@ -3248,7 +3263,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (svm_nested_virtualize_tpr(vcpu) ||
+ if (nested_svm_virtualize_tpr(vcpu) ||
kvm_vcpu_apicv_active(vcpu))
return;
@@ -3344,6 +3359,60 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state has
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+
+#ifdef CONFIG_X86_64
+ native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the code below, as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls out
+ * into instrumented code.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ guest_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
@@ -3399,16 +3468,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
-
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
+ svm_vcpu_enter_exit(vcpu, svm);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -3477,11 +3537,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
- mark_all_clean(svm->vmcb);
+ vmcb_mark_all_clean(svm->vmcb);
return exit_fastpath;
}
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+ int root_level)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long cr3;
@@ -3489,7 +3550,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
cr3 = __sme_set(root);
if (npt_enabled) {
svm->vmcb->control.nested_cr3 = cr3;
- mark_dirty(svm->vmcb, VMCB_NPT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
/* Loading L2's CR3 is handled by enter_svm_guest_mode. */
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
@@ -3498,7 +3559,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
}
svm->vmcb->save.cr3 = cr3;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
static int is_disabled(void)
@@ -3551,7 +3612,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return 0;
}
-static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3843,6 +3904,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
struct kvm_host_map map;
u64 guest;
u64 vmcb;
+ int ret = 0;
guest = GET_SMSTATE(u64, smstate, 0x7ed8);
vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
@@ -3851,10 +3913,11 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
return 1;
nested_vmcb = map.hva;
- enter_svm_guest_mode(svm, vmcb, nested_vmcb);
+ ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
}
- return 0;
+
+ return ret;
}
static void enable_smi_window(struct kvm_vcpu *vcpu)
@@ -3863,7 +3926,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu)
if (!gif_set(svm)) {
if (vgif_enabled(svm))
- set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */
} else {
/* We must be in SMM; RSM will cause a vmexit anyway. */
@@ -3992,7 +4055,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
- .update_bp_intercept = update_bp_intercept,
+ .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
@@ -4049,12 +4112,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_tss_addr = svm_set_tss_addr,
.set_identity_map_addr = svm_set_identity_map_addr,
- .get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
- .cpuid_update = svm_cpuid_update,
+ .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
.has_wbinvd_exit = svm_has_wbinvd_exit,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6ac4c00a5d82..a798e1731709 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -81,7 +81,7 @@ struct kvm_svm {
struct kvm_vcpu;
-struct nested_state {
+struct svm_nested_state {
struct vmcb *hsave;
u64 hsave_msr;
u64 vm_cr_msr;
@@ -133,7 +133,7 @@ struct vcpu_svm {
ulong nmi_iret_rip;
- struct nested_state nested;
+ struct svm_nested_state nested;
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
@@ -158,9 +158,6 @@ struct vcpu_svm {
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
-
- /* which host CPU was used for running this vcpu */
- unsigned int last_cpu;
};
struct svm_cpu_data {
@@ -188,18 +185,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
-static inline void mark_all_dirty(struct vmcb *vmcb)
+static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
}
-static inline void mark_all_clean(struct vmcb *vmcb)
+static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
{
vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
& ~VMCB_ALWAYS_DIRTY_MASK;
}
-static inline void mark_dirty(struct vmcb *vmcb, int bit)
+static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
}
@@ -293,7 +290,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
recalc_intercepts(svm);
}
-static inline void set_intercept(struct vcpu_svm *svm, int bit)
+static inline void svm_set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
@@ -302,7 +299,7 @@ static inline void set_intercept(struct vcpu_svm *svm, int bit)
recalc_intercepts(svm);
}
-static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
@@ -311,7 +308,7 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit)
recalc_intercepts(svm);
}
-static inline bool is_intercept(struct vcpu_svm *svm, int bit)
+static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
{
return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
}
@@ -346,7 +343,10 @@ static inline bool gif_set(struct vcpu_svm *svm)
}
/* svm.c */
-#define MSR_INVALID 0xffffffffU
+#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U
+#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U
+#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U
+#define MSR_INVALID 0xffffffffU
u32 svm_msrpm_offset(u32 msr);
void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -365,7 +365,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
-static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -387,8 +387,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI));
}
-void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb);
+int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb);
void svm_leave_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct vcpu_svm *svm);
void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
@@ -420,7 +420,7 @@ extern int avic;
static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
- mark_dirty(svm->vmcb, VMCB_AVIC);
+ vmcb_mark_dirty(svm->vmcb, VMCB_AVIC);
}
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index bf944334003a..1ec1ac40e328 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -27,7 +27,7 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
- .text
+.section .noinstr.text, "ax"
/**
* __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 11e4df560018..23b58c28a1c9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -171,15 +171,6 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
u32 vm_instruction_error)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done if there isn't a current VMCS.
- */
- if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
- return nested_vmx_failInvalid(vcpu);
-
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -192,6 +183,20 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
return kvm_skip_emulated_instruction(vcpu);
}
+static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ return nested_vmx_failInvalid(vcpu);
+
+ return nested_vmx_failValid(vcpu, vm_instruction_error);
+}
+
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
@@ -2157,7 +2162,8 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
* consistency checks.
*/
if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+ vmcs_write64(EPT_POINTER,
+ construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
@@ -2433,22 +2439,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
/*
* Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
+ * doesn't care about page faults then we should set all of these to
+ * L1's desires. However, if L0 does care about (some) page faults, it
+ * is not easy (if at all possible?) to merge L0 and L1's desires, so we
+ * simply ask to exit on each and every L2 page fault. This is done by
+ * setting MASK=MATCH=0 and (see below) EB.PF=1.
* Note that below we don't need special code to set EB.PF beyond the
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
+ if (vmx_need_pf_intercept(&vmx->vcpu)) {
+ /*
+ * TODO: if both L0 and L1 need the same MASK and MATCH,
+ * go ahead and use it?
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ } else {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+ }
if (cpu_has_vmx_apicv()) {
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
@@ -3205,6 +3217,43 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
return true;
}
+static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t dst;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return 0;
+
+ if (WARN_ON_ONCE(vmx->nested.pml_full))
+ return 1;
+
+ /*
+ * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+ * set is already checked as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa &= ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
+ return 0;
+
+ vmcs12->guest_pml_index--;
+
+ return 0;
+}
+
/*
* Intel's VMX Instruction Reference specifies a common set of prerequisites
* for running VMX instructions (except VMXON, whose prerequisites are
@@ -3456,19 +3505,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* when using the merged vmcs02.
*/
if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
if (vmcs12->launch_state == launch)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
if (nested_vmx_check_controls(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
if (nested_vmx_check_host_state(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/*
* We're finally done with prerequisite checking, and can start with
@@ -3517,7 +3565,7 @@ vmentry_failed:
if (status == NVMX_VMENTRY_VMEXIT)
return 1;
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
@@ -4460,7 +4508,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
* flag and the VM-instruction error field of the VMCS
* accordingly, and skip the emulated instruction.
*/
- (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/*
* Restore L1's host state to KVM's software model. We're here
@@ -4760,8 +4808,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
}
if (vmx->nested.vmxon)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
@@ -4852,12 +4899,10 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
/*
* When Enlightened VMEntry is enabled on the calling CPU we treat
@@ -4927,8 +4972,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
@@ -5031,8 +5075,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
* If the vCPU supports "VMWRITE to any supported field in the
@@ -5040,8 +5083,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
*/
if (vmcs_field_readonly(field) &&
!nested_cpu_has_vmwrite_any_field(vcpu))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
/*
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
@@ -5116,12 +5158,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
if (vmx->nested.hv_evmcs)
@@ -5138,7 +5178,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* given physical address won't match the required
* VMCS12_REVISION identifier.
*/
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -5148,7 +5188,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
kvm_vcpu_unmap(vcpu, &map, false);
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -5233,8 +5273,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
- VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* According to the Intel VMX instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
@@ -5255,7 +5294,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
switch (type) {
case VMX_EPT_EXTENT_CONTEXT:
if (!nested_vmx_check_eptp(vcpu, operand.eptp))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0;
@@ -5315,7 +5354,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* according to the intel vmx instruction reference, the memory
@@ -5329,7 +5368,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return vmx_handle_memory_failure(vcpu, r, &e);
if (operand.vpid >> 16)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid02 = nested_get_vpid02(vcpu);
@@ -5337,14 +5376,14 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
if (!operand.vpid ||
is_noncanonical_address(operand.gla, vcpu))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
if (!operand.vpid)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_context(vpid02);
break;
@@ -6333,7 +6372,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
/*
* secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ * depend on CPUID bits, they are added later by
+ * vmx_vcpu_after_set_cpuid.
*/
if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -6514,6 +6554,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
.get_vmcs12_pages = nested_get_vmcs12_pages,
+ .write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
};
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 5f1ac002b4b6..692b0c31c9c8 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -146,7 +146,9 @@ do { \
: : op1 : "cc" : error, fault); \
return; \
error: \
+ instrumentation_begin(); \
insn##_error(error_args); \
+ instrumentation_end(); \
return; \
fault: \
kvm_spurious_fault(); \
@@ -161,7 +163,9 @@ do { \
: : op1, op2 : "cc" : error, fault); \
return; \
error: \
+ instrumentation_begin(); \
insn##_error(error_args); \
+ instrumentation_end(); \
return; \
fault: \
kvm_spurious_fault(); \
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bdcce65c7a1d..a886a47daebd 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -180,9 +180,6 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
ret = pmu->version > 1;
break;
- case MSR_IA32_PERF_CAPABILITIES:
- ret = 1;
- break;
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
@@ -224,12 +221,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = pmu->global_ovf_ctrl;
return 0;
- case MSR_IA32_PERF_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- return 1;
- msr_info->data = vcpu->arch.perf_capabilities;
- return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -289,14 +280,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
break;
- case MSR_IA32_PERF_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ?
- (data & ~vmx_get_perf_capabilities()) : data)
- return 1;
- vcpu->arch.perf_capabilities = data;
- return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index e0a182cb3cdd..799db084a336 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -27,7 +27,7 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
- .text
+.section .noinstr.text, "ax"
/**
* vmx_vmenter - VM-Enter the current loaded VMCS
@@ -234,6 +234,9 @@ SYM_FUNC_START(__vmx_vcpu_run)
jmp 1b
SYM_FUNC_END(__vmx_vcpu_run)
+
+.section .text, "ax"
+
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 559634b59d2a..46ba2e03a892 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -781,7 +781,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
- if (enable_ept)
+ if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
@@ -1816,7 +1816,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
msr->data = vmx_get_perf_capabilities();
return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
}
@@ -2063,7 +2063,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
- if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
@@ -2934,14 +2934,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root_hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa));
+ ept_sync_context(construct_eptp(vcpu, root_hpa,
+ mmu->shadow_root_level));
else if (!is_guest_mode(vcpu))
vpid_sync_context(to_vmx(vcpu)->vpid);
else
@@ -3064,26 +3066,19 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = emulation_required(vcpu);
}
-static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
+static int vmx_get_max_tdp_level(void)
{
- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
-static int get_ept_level(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
- return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-
- return vmx_get_tdp_level(vcpu);
-}
-
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+ eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
@@ -3093,7 +3088,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
return eptp;
}
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+ int pgd_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3101,7 +3097,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd);
+ eptp = construct_eptp(vcpu, pgd, pgd_level);
vmcs_write64(EPT_POINTER, eptp);
if (kvm_x86_ops.tlb_remote_flush) {
@@ -4356,6 +4352,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ /*
+ * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+ * between guest and host. In that case we only care about present
+ * faults.
+ */
+ if (enable_ept) {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+ }
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -4782,18 +4788,25 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
- /* EPT won't cause page fault directly */
- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause a page fault only if we need to
+ * detect illegal GPAs.
+ */
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -5309,6 +5322,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
vcpu->arch.exit_qualification = exit_qualification;
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then the
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
@@ -6005,6 +6030,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6013,6 +6039,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6039,6 +6066,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
+ vcpu->run->internal.data[vcpu->run->internal.ndata++] =
+ vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6094,8 +6123,9 @@ unexpected_vmexit:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -6109,7 +6139,7 @@ unexpected_vmexit:
* information but as all relevant affected CPUs have 32KiB L1D cache size
* there is no point in doing so.
*/
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;
@@ -6142,7 +6172,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
@@ -6628,7 +6658,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
}
}
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@ -6650,6 +6680,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_vmx *vmx)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
+
+ vcpu->arch.cr2 = native_read_cr2();
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the code below, as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into instrumented code and more.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ guest_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
@@ -6724,19 +6811,8 @@ reenter_guest:
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
-
- if (vcpu->arch.cr2 != read_cr2())
- write_cr2(vcpu->arch.cr2);
-
- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
-
- vcpu->arch.cr2 = read_cr2();
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, vmx);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -7229,7 +7305,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7478,42 +7554,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm)
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- struct vmcs12 *vmcs12;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t dst;
-
- if (is_guest_mode(vcpu)) {
- WARN_ON_ONCE(vmx->nested.pml_full);
-
- /*
- * Check if PML is enabled for the nested guest.
- * Whether eptp bit 6 is set is already checked
- * as part of A/D emulation.
- */
- vmcs12 = get_vmcs12(vcpu);
- if (!nested_cpu_has_pml(vmcs12))
- return 0;
-
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
- vmx->nested.pml_full = true;
- return 1;
- }
-
- gpa &= ~0xFFFull;
- dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
- if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
- offset_in_page(dst), sizeof(gpa)))
- return 0;
-
- vmcs12->guest_pml_index--;
- }
-
- return 0;
-}
-
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
@@ -7858,7 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_bp_intercept = update_exception_bitmap,
+ .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
@@ -7918,12 +7958,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = vmx_get_tdp_level,
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
- .cpuid_update = vmx_cpuid_update,
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
@@ -7942,7 +7981,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
@@ -8070,7 +8108,7 @@ static __init int hardware_setup(void)
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, ept_lpage_level);
+ kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -8265,6 +8303,13 @@ static int __init vmx_init(void)
#endif
vmx_check_vmcs12_offsets();
+ /*
+ * Intel processors don't have problems with
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR, so enable
+ * it for VMX by default.
+ */
+ allow_smaller_maxphyaddr = true;
+
return 0;
}
module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 639798e4a6ca..26175a4759fa 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -11,6 +11,7 @@
#include "kvm_cache_regs.h"
#include "ops.h"
#include "vmcs.h"
+#include "cpuid.h"
extern const u32 vmx_msr_index[];
@@ -337,11 +338,11 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
-void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level);
void update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -536,8 +537,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
GFP_KERNEL_ACCOUNT);
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
-
static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
{
vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
@@ -550,6 +549,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+ return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+}
+
void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21d5e7a7ffd0..599d73206299 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -188,6 +188,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
+bool __read_mostly allow_smaller_maxphyaddr;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
static u64 __read_mostly host_xss;
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
@@ -244,6 +247,29 @@ static struct kmem_cache *x86_fpu_cache;
static struct kmem_cache *x86_emulator_cache;
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return 0 if we want to ignore/silence this failed msr access, or 1 if we want
+ * to fail the caller.
+ */
+static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
+{
+ const char *op = write ? "wrmsr" : "rdmsr";
+
+ if (ignore_msrs) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ /* Mask the error */
+ return 0;
+ } else {
+ vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return 1;
+ }
+}
+
static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
@@ -380,7 +406,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible void kvm_spurious_fault(void)
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
@@ -776,6 +802,7 @@ EXPORT_SYMBOL_GPL(pdptrs_changed);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -793,22 +820,22 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return 1;
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME)) {
- int cs_db, cs_l;
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
- if (!is_pae(vcpu))
- return 1;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
return 1;
}
+#endif
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ return 1;
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
@@ -917,7 +944,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
@@ -932,37 +959,17 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-#define __cr4_reserved_bits(__cpu_has, __c) \
-({ \
- u64 __reserved_bits = CR4_RESERVED_BITS; \
- \
- if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
- __reserved_bits |= X86_CR4_OSXSAVE; \
- if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
- __reserved_bits |= X86_CR4_SMEP; \
- if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
- __reserved_bits |= X86_CR4_SMAP; \
- if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
- __reserved_bits |= X86_CR4_FSGSBASE; \
- if (!__cpu_has(__c, X86_FEATURE_PKU)) \
- __reserved_bits |= X86_CR4_PKE; \
- if (!__cpu_has(__c, X86_FEATURE_LA57)) \
- __reserved_bits |= X86_CR4_LA57; \
- if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
- __reserved_bits |= X86_CR4_UMIP; \
- __reserved_bits; \
-})
-
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
return -EINVAL;
- if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1001,7 +1008,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
@@ -1111,7 +1118,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 4:
/* fall through */
case 6:
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr6_valid(val))
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
break;
@@ -1390,8 +1397,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- if (kvm_x86_ops.get_msr_feature(msr))
- return 1;
+ return kvm_x86_ops.get_msr_feature(msr);
}
return 0;
}
@@ -1403,6 +1409,13 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
msr.index = index;
r = kvm_get_msr_feature(&msr);
+
+ if (r == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear the output for simplicity */
+ *data = 0;
+ r = kvm_msr_ignored_check(vcpu, index, 0, false);
+ }
+
if (r)
return r;
@@ -1517,6 +1530,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
return kvm_x86_ops.set_msr(vcpu, &msr);
}
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID)
+ ret = kvm_msr_ignored_check(vcpu, index, data, true);
+
+ return ret;
+}
+
/*
* Read the MSR specified by @index into @data. Select MSR specific fault
* checks are bypassed if @host_initiated is %true.
@@ -1538,15 +1562,29 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
return ret;
}
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+{
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear *data for simplicity */
+ *data = 0;
+ ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+ }
+
+ return ret;
+}
+
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, false);
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
- return __kvm_set_msr(vcpu, index, data, false);
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
@@ -1666,12 +1704,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, true);
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
}
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_set_msr(vcpu, index, *data, true);
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
#ifdef CONFIG_X86_64
@@ -2823,6 +2861,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.arch_capabilities = data;
break;
+ case MSR_IA32_PERF_CAPABILITIES: {
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+
+ if (!msr_info->host_initiated)
+ return 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ return 1;
+ if (data & ~msr_ent.data)
+ return 1;
+
+ vcpu->arch.perf_capabilities = data;
+
+ return 0;
+ }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -2882,7 +2934,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
} else {
vcpu->arch.ia32_misc_enable_msr = data;
}
@@ -3067,17 +3119,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu,
- "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- break;
- }
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
@@ -3173,6 +3215,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vcpu->arch.arch_capabilities;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
@@ -3332,17 +3380,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
- msr_info->index);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
- msr_info->index);
- msr_info->data = 0;
- }
- break;
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
@@ -3477,6 +3515,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -3539,6 +3578,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
default:
break;
}
@@ -8155,7 +8197,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_x86_ops.set_efer(vcpu, 0);
#endif
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
kvm_mmu_reset_context(vcpu);
}
@@ -8507,7 +8549,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
- guest_enter_irqoff();
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -8549,6 +8590,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -8569,7 +8611,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_disable();
kvm_after_interrupt(vcpu);
- guest_exit_irqoff();
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
if (delta != S64_MIN) {
@@ -9174,7 +9215,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
(X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
if (cpuid_update_needed)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9278,7 +9319,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops.update_bp_intercept(vcpu);
+ kvm_x86_ops.update_exception_bitmap(vcpu);
r = 0;
@@ -9476,7 +9517,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -10627,11 +10667,17 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ int ret;
irqfd->producer = prod;
+ kvm_arch_start_assignment(irqfd->kvm);
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
- return kvm_x86_ops.update_pi_irte(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
+ if (ret)
+ kvm_arch_end_assignment(irqfd->kvm);
+
+ return ret;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
@@ -10654,6 +10700,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
+
+ kvm_arch_end_assignment(irqfd->kvm);
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
@@ -10673,28 +10721,53 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
-u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+
+int kvm_spec_ctrl_test_value(u64 value)
{
- uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+ /*
+ * Test that setting IA32_SPEC_CTRL to the given value
+ * is allowed by the host processor.
+ */
+
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
- bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
- !boot_cpu_has(X86_FEATURE_AMD_IBRS))
- bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
- bits &= ~SPEC_CTRL_SSBD;
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
- !boot_cpu_has(X86_FEATURE_AMD_SSBD))
- bits &= ~SPEC_CTRL_SSBD;
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+{
+ struct x86_exception fault;
- return bits;
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
-EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6eb62e97e59f..995ab696dcf0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
@@ -360,10 +361,41 @@ static inline bool kvm_dr7_valid(u64 data)
/* Bits [63:32] are reserved */
return !(data >> 32);
}
+static inline bool kvm_dr6_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
-u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
+int kvm_spec_ctrl_test_value(u64 value);
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
+#define KVM_MSR_RET_INVALID 2
+
+#define __cr4_reserved_bits(__cpu_has, __c) \
+({ \
+ u64 __reserved_bits = CR4_RESERVED_BITS; \
+ \
+ if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
+ __reserved_bits |= X86_CR4_OSXSAVE; \
+ if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
+ __reserved_bits |= X86_CR4_SMEP; \
+ if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
+ __reserved_bits |= X86_CR4_SMAP; \
+ if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
+ __reserved_bits |= X86_CR4_FSGSBASE; \
+ if (!__cpu_has(__c, X86_FEATURE_PKU)) \
+ __reserved_bits |= X86_CR4_PKE; \
+ if (!__cpu_has(__c, X86_FEATURE_LA57)) \
+ __reserved_bits |= X86_CR4_LA57; \
+ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
+ __reserved_bits |= X86_CR4_UMIP; \
+ if (!__cpu_has(__c, X86_FEATURE_VMX)) \
+ __reserved_bits |= X86_CR4_VMXE; \
+ __reserved_bits; \
+})
+
#endif
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 6110bce7237b..d46fff11f06f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_cmdline.o = -pg
endif
-CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector)
+CFLAGS_cmdline.o := -fno-stack-protector
endif
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index a873da6b46d6..8679a9d6c47f 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -689,12 +689,10 @@ int fpregs_soft_set(struct task_struct *target,
int fpregs_soft_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct swregs_state *s387 = &target->thread.fpu.state.soft;
const void *space = s387->st_space;
- int ret;
int offset = (S387->ftop & 7) * 10, other = 80 - offset;
RE_ENTRANT_CHECK_OFF;
@@ -709,18 +707,11 @@ int fpregs_soft_get(struct task_struct *target,
S387->fos |= 0xffff0000;
#endif /* PECULIAR_486 */
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
- offsetof(struct swregs_state, st_space));
-
- /* Copy all registers in stack order. */
- if (!ret)
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- space + offset, 0, other);
- if (!ret)
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- space, 0, offset);
+ membuf_write(&to, s387, offsetof(struct swregs_state, st_space));
+ membuf_write(&to, space + offset, other);
+ membuf_write(&to, space, offset);
RE_ENTRANT_CHECK_ON;
- return ret;
+ return 0;
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f7fd0e868c9c..5864219221ca 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -22,10 +22,9 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
obj-y += pat/
# Make sure __phys_addr has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_physaddr.o := $(nostackp)
-CFLAGS_setup_nx.o := $(nostackp)
-CFLAGS_mem_encrypt_identity.o := $(nostackp)
+CFLAGS_physaddr.o := -fno-stack-protector
+CFLAGS_setup_nx.o := -fno-stack-protector
+CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5e5edd2ec893..35f1498e9832 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -21,7 +21,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
-#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -1140,7 +1139,7 @@ void do_user_addr_fault(struct pt_regs *regs,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- vm_fault_t fault, major = 0;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
tsk = current;
@@ -1292,8 +1291,7 @@ good_area:
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
*/
- fault = handle_mm_fault(vma, address, flags);
- major |= fault & VM_FAULT_MAJOR;
+ fault = handle_mm_fault(vma, address, flags, regs);
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
@@ -1320,18 +1318,6 @@ good_area:
return;
}
- /*
- * Major/minor page fault accounting. If any of the events
- * returned VM_FAULT_MAJOR, we account it as a major fault.
- */
- if (major) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
- }
-
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index cf5781142716..a0d023cb4292 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -17,7 +17,6 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
#if 0 /* This is just for testing */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8b4afad84f4a..7c055259de3a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,6 +52,7 @@
#include <asm/cpu_entry_area.h>
#include <asm/init.h>
#include <asm/pgtable_areas.h>
+#include <asm/numa.h>
#include "mm_internal.h"
@@ -678,7 +679,6 @@ void __init initmem_init(void)
#endif
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
- sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
@@ -718,7 +718,6 @@ void __init paging_init(void)
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e65b96f381a7..a4ac13cc3fdc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -217,6 +217,11 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
sync_global_pgds_l4(start, end);
}
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ sync_global_pgds(start, end);
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -812,7 +817,6 @@ void __init initmem_init(void)
void __init paging_init(void)
{
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
/*
@@ -1248,28 +1252,23 @@ static void __init preallocate_vmalloc_pages(void)
p4d_t *p4d;
pud_t *pud;
- p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d)) {
- /* Can only happen with 5-level paging */
- p4d = p4d_alloc(&init_mm, pgd, addr);
- if (!p4d) {
- lvl = "p4d";
- goto failed;
- }
- }
+ lvl = "p4d";
+ p4d = p4d_alloc(&init_mm, pgd, addr);
+ if (!p4d)
+ goto failed;
+ /*
+ * With 5-level paging the P4D level is not folded. So the PGDs
+ * are now populated and there is no need to walk down to the
+ * PUD level.
+ */
if (pgtable_l5_enabled())
continue;
- pud = pud_offset(p4d, addr);
- if (pud_none(*pud)) {
- /* Ends up here only with 4-level paging */
- pud = pud_alloc(&init_mm, p4d, addr);
- if (!pud) {
- lvl = "pud";
- goto failed;
- }
- }
+ lvl = "pud";
+ pud = pud_alloc(&init_mm, p4d, addr);
+ if (!pud)
+ goto failed;
}
return;
@@ -1453,6 +1452,15 @@ static unsigned long probe_memory_block_size(void)
goto done;
}
+ /*
+ * Use max block size to minimize overhead on bare metal, where
+ * alignment for memory hotplug isn't a concern.
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ bz = MAX_BLOCK_SIZE;
+ goto done;
+ }
+
/* Find the largest allowed block size that aligns to memory end */
for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
if (IS_ALIGNED(boot_mem_end, bz))
@@ -1510,10 +1518,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (pmd_none(*pmd)) {
void *p;
- if (altmap)
- p = altmap_alloc_block_buf(PMD_SIZE, altmap);
- else
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
pte_t entry;
@@ -1540,7 +1545,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- if (vmemmap_populate_basepages(addr, next, node))
+ if (vmemmap_populate_basepages(addr, next, node, NULL))
return -ENOMEM;
}
return 0;
@@ -1552,7 +1557,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
int err;
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
err = vmemmap_populate_hugepages(start, end, node, altmap);
else if (altmap) {
@@ -1560,7 +1565,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
__func__);
err = -ENOMEM;
} else
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
if (!err)
sync_global_pgds(start, end - 1);
return err;
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index fb620fd9dae9..6e6b39710e5f 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -26,7 +26,6 @@
#include <linux/memblock.h>
#include <linux/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/kaslr.h>
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b05f45e5e8e2..aa76ec2d359b 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -929,5 +929,4 @@ int memory_add_physaddr_to_nid(u64 start)
nid = numa_meminfo.blk[0].nid;
return nid;
}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 1953685c2ddf..c234634e26ba 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -11,7 +11,6 @@
#include <linux/spinlock.h>
#include <asm/cpu_entry_area.h>
-#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/tlb.h>
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a8a924b3c335..1aab92930569 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -34,7 +34,6 @@
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 0c67a5a94de3..b8c9a5b87f37 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -557,12 +557,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_z
* Device [8086:2fc0]
* Erratum HSE43
* CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset
- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html
+ * https://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html
*
* Devices [8086:6f60,6fa0,6fc0]
* Erratum BDF2
* PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration
- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html
+ * https://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html
*/
static void pci_invalid_bar(struct pci_dev *dev)
{
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index e3f1ca316068..9f9aad42ccff 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -62,7 +62,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
#ifdef CONFIG_ACPI
static int xen_register_pirq(u32 gsi, int triggering, bool set_pirq)
{
- int rc, pirq = -1, irq = -1;
+ int rc, pirq = -1, irq;
struct physdev_map_pirq map_irq;
int shareable = 0;
char *name;
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index a6e5f2c1805d..a2f447dffea6 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <asm/efi.h>
#include <linux/io.h>
+#include <asm/pgalloc.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv_hub.h>
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 37923d715741..6907b523e856 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -3,8 +3,7 @@ OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_cpu.o := $(nostackp)
+CFLAGS_cpu.o := -fno-stack-protector
obj-$(CONFIG_PM_SLEEP) += cpu.o
obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index d147f1b2c925..cd3914fc9f3d 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -98,7 +98,7 @@ static int get_e820_md5(struct e820_table *table, void *buf)
if (crypto_shash_digest(desc, (u8 *)table, size, buf))
ret = -EINVAL;
- kzfree(desc);
+ kfree_sensitive(desc);
free_tfm:
crypto_free_shash(tfm);
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 088bd764e0b7..183ac60e5990 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -34,7 +34,7 @@ KCOV_INSTRUMENT := n
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
-PURGATORY_CFLAGS += $(call cc-option,-fno-stack-protector)
+PURGATORY_CFLAGS += -fno-stack-protector
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
# in turn leaves some undefined symbols like __fentry__ in purgatory and not
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 0caddd6acb22..5943387e3f35 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -42,7 +42,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
# optimize sibling calls.
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
-fno-omit-frame-pointer -foptimize-sibling-calls
$(vobjs): KBUILD_CFLAGS += $(CFL)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1aded63a95cb..218acbd5c7a0 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -19,6 +19,7 @@ config XEN_PV
bool "Xen PV guest support"
default y
depends on XEN
+ depends on X86_64
select PARAVIRT_XXL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
@@ -50,7 +51,7 @@ config XEN_PVHVM_SMP
config XEN_512GB
bool "Limit Xen pv-domain memory to 512GB"
- depends on XEN_PV && X86_64
+ depends on XEN_PV
default y
help
Limit paravirtualized user domains to 512GB of RAM.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 084de77a109e..fc5c5ba4aacb 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-asm.o := y
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
@@ -9,9 +9,8 @@ CFLAGS_REMOVE_irq.o = -pg
endif
# Make sure early boot has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_enlighten_pv.o := $(nostackp)
-CFLAGS_mmu_pv.o := $(nostackp)
+CFLAGS_enlighten_pv.o := -fno-stack-protector
+CFLAGS_mmu_pv.o := -fno-stack-protector
obj-y += enlighten.o
obj-y += mmu.o
@@ -34,7 +33,6 @@ obj-$(CONFIG_XEN_PV) += mmu_pv.o
obj-$(CONFIG_XEN_PV) += irq.o
obj-$(CONFIG_XEN_PV) += multicalls.o
obj-$(CONFIG_XEN_PV) += xen-asm.o
-obj-$(CONFIG_XEN_PV) += xen-asm_$(BITS).o
obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 5e53bfbe5823..e82fd1910dae 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -1,8 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
+#include <linux/thread_info.h>
#include <asm/x86_init.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
@@ -58,10 +60,6 @@ static u32 xen_apic_read(u32 reg)
if (reg == APIC_LVR)
return 0x14;
-#ifdef CONFIG_X86_32
- if (reg == APIC_LDR)
- return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
-#endif
if (reg != APIC_ID)
return 0;
@@ -127,14 +125,6 @@ static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
-#ifdef CONFIG_X86_32
-static int xen_x86_32_early_logical_apicid(int cpu)
-{
- /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
- return 1 << cpu;
-}
-#endif
-
static void xen_noop(void)
{
}
@@ -197,11 +187,6 @@ static struct apic xen_pv_apic = {
.icr_write = xen_apic_icr_write,
.wait_icr_idle = xen_noop,
.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
-
-#ifdef CONFIG_X86_32
- /* generic_processor_info and setup_local_APIC. */
- .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid,
-#endif
};
static void __init xen_apic_check(void)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 3e89b0067ff0..9e87ab010c82 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -11,6 +11,7 @@
#include <asm/cpu.h>
#include <asm/smp.h>
+#include <asm/io_apic.h>
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/idtentry.h>
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 2aab43a13a8c..22e741e0b10c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -119,14 +119,6 @@ static void __init xen_banner(void)
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
-
-#ifdef CONFIG_X86_32
- pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"
- "Support for running as 32-bit PV-guest under Xen will soon be removed\n"
- "from the Linux kernel!\n"
- "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n"
- "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n");
-#endif
}
static void __init xen_pv_init_platform(void)
@@ -353,15 +345,13 @@ static void set_aliased_prot(void *v, pgprot_t prot)
pte_t *ptep;
pte_t pte;
unsigned long pfn;
- struct page *page;
unsigned char dummy;
+ void *va;
ptep = lookup_address((unsigned long)v, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
- page = pfn_to_page(pfn);
-
pte = pfn_pte(pfn, prot);
/*
@@ -391,14 +381,10 @@ static void set_aliased_prot(void *v, pgprot_t prot)
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();
- if (!PageHighMem(page)) {
- void *av = __va(PFN_PHYS(pfn));
+ va = __va(PFN_PHYS(pfn));
- if (av != v)
- if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
- BUG();
- } else
- kmap_flush_unused();
+ if (va != v && HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
preempt_enable();
}
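With 32-bit (and thus highmem) support gone, every pagetable page has a directmap alias, so the PageHighMem()/kmap_flush_unused() special case above disappears. A condensed sketch of the resulting flow, reusing the names from the hunk (illustrative only, not the full function):

	static void update_alias_prot(void *v, pgprot_t prot)
	{
		unsigned int level;
		pte_t *ptep = lookup_address((unsigned long)v, &level);
		unsigned long pfn = pte_pfn(*ptep);
		pte_t pte = pfn_pte(pfn, prot);
		void *va = __va(PFN_PHYS(pfn));	/* directmap alias always exists */

		/* Update the original mapping, then the alias if it differs. */
		if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
			BUG();
		if (va != v && HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
			BUG();
	}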
@@ -538,30 +524,12 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone
- * and lazy gs handling is enabled, it means we're in a
- * context switch, and %gs has just been saved. This means we
- * can zero it out to prevent faults on exit from the
- * hypervisor if the next process has no %gs. Either way, it
- * has been saved, and the new value will get loaded properly.
- * This will go away as soon as Xen has been modified to not
- * save/restore %gs for normal hypercalls.
- *
- * On x86_64, this hack is not used for %gs, because gs points
- * to KERNEL_GS_BASE (and uses it for PDA references), so we
- * must not zero %gs on x86_64
- *
- * For x86_64, we need to zero %fs, otherwise we may get an
+ * In lazy mode we need to zero %fs, otherwise we may get an
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-#ifdef CONFIG_X86_32
- lazy_load_gs(0);
-#else
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
loadsegment(fs, 0);
-#endif
- }
xen_mc_batch();
@@ -572,13 +540,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
-#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
BUG();
}
-#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
@@ -597,7 +563,6 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
preempt_enable();
}
-#ifdef CONFIG_X86_64
void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
@@ -697,7 +662,6 @@ static bool __ref get_trap_addr(void **addr, unsigned int ist)
return true;
}
-#endif
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
@@ -710,10 +674,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
info->vector = vector;
addr = gate_offset(val);
-#ifdef CONFIG_X86_64
if (!get_trap_addr((void **)&addr, val->bits.ist))
return 0;
-#endif /* CONFIG_X86_64 */
info->address = addr;
info->cs = gate_segment(val);
@@ -958,15 +920,12 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
-#ifdef CONFIG_X86_64
unsigned int which;
u64 base;
-#endif
ret = 0;
switch (msr) {
-#ifdef CONFIG_X86_64
case MSR_FS_BASE: which = SEGBASE_FS; goto set;
case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
@@ -976,7 +935,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
if (HYPERVISOR_set_segment_base(which, base) != 0)
ret = -EIO;
break;
-#endif
case MSR_STAR:
case MSR_CSTAR:
@@ -1058,9 +1016,7 @@ void __init xen_setup_vcpu_info_placement(void)
static const struct pv_info xen_info __initconst = {
.shared_kernel_pmd = 0,
-#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
-#endif
.name = "Xen",
};
@@ -1086,18 +1042,14 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
-#ifdef CONFIG_X86_64
.usergs_sysret64 = xen_sysret64,
-#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
-#ifdef CONFIG_X86_64
.load_gs_index = xen_load_gs_index,
-#endif
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
@@ -1364,15 +1316,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* keep using Xen gdt for now; no urgent need to change it */
-#ifdef CONFIG_X86_32
- pv_info.kernel_rpl = 1;
- if (xen_feature(XENFEAT_supervisor_mode_kernel))
- pv_info.kernel_rpl = 0;
-#else
pv_info.kernel_rpl = 0;
-#endif
- /* set the limit of our address space */
- xen_reserve_top();
/*
* We used to do this in xen_arch_setup, but that is too late
@@ -1384,12 +1328,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (rc != 0)
xen_raw_printk("physdev_op failed %d\n", rc);
-#ifdef CONFIG_X86_32
- /* set up basic CPUID stuff */
- cpu_detect(&new_cpu_data);
- set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
- new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-#endif
if (xen_start_info->mod_start) {
if (xen_start_info->flags & SIF_MOD_START_PFN)
@@ -1458,12 +1396,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_efi_init(&boot_params);
/* Start the world */
-#ifdef CONFIG_X86_32
- i386_start_kernel();
-#else
cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
-#endif
}
static int xen_cpu_up_prepare_pv(unsigned int cpu)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index a58d9c69807a..3273c985d3dd 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -86,19 +86,8 @@
#include "mmu.h"
#include "debugfs.h"
-#ifdef CONFIG_X86_32
-/*
- * Identity map, in addition to plain kernel map. This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
-static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
-#endif
-#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
/*
* Protects atomic reservation decrease/increase against concurrent increases.
@@ -280,10 +269,7 @@ static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
if (!xen_batched_set_pte(ptep, pteval)) {
/*
* Could call native_set_pte() here and trap and
- * emulate the PTE write but with 32-bit guests this
- * needs two traps (one for each of the two 32-bit
- * words in the PTE) so do one hypercall directly
- * instead.
+ * emulate the PTE write, but a hypercall is much cheaper.
*/
struct mmu_update u;
@@ -439,26 +425,6 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
xen_set_pud_hyper(ptr, val);
}
-#ifdef CONFIG_X86_PAE
-static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- trace_xen_mmu_set_pte_atomic(ptep, pte);
- __xen_set_pte(ptep, pte);
-}
-
-static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- trace_xen_mmu_pte_clear(mm, addr, ptep);
- __xen_set_pte(ptep, native_make_pte(0));
-}
-
-static void xen_pmd_clear(pmd_t *pmdp)
-{
- trace_xen_mmu_pmd_clear(pmdp);
- set_pmd(pmdp, __pmd(0));
-}
-#endif /* CONFIG_X86_PAE */
-
__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
@@ -466,7 +432,6 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#ifdef CONFIG_X86_64
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -571,27 +536,27 @@ __visible p4d_t xen_make_p4d(p4dval_t p4d)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
-#endif /* CONFIG_X86_64 */
-static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
- int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
- bool last, unsigned long limit)
+static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
+ void (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ bool last, unsigned long limit)
{
- int i, nr, flush = 0;
+ int i, nr;
nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
for (i = 0; i < nr; i++) {
if (!pmd_none(pmd[i]))
- flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
+ (*func)(mm, pmd_page(pmd[i]), PT_PTE);
}
- return flush;
}
-static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
- int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
- bool last, unsigned long limit)
+static void xen_pud_walk(struct mm_struct *mm, pud_t *pud,
+ void (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ bool last, unsigned long limit)
{
- int i, nr, flush = 0;
+ int i, nr;
nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
for (i = 0; i < nr; i++) {
@@ -602,29 +567,26 @@ static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(&pud[i], 0);
if (PTRS_PER_PMD > 1)
- flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
- flush |= xen_pmd_walk(mm, pmd, func,
- last && i == nr - 1, limit);
+ (*func)(mm, virt_to_page(pmd), PT_PMD);
+ xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
}
- return flush;
}
-static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
- int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
- bool last, unsigned long limit)
+static void xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
+ void (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ bool last, unsigned long limit)
{
- int flush = 0;
pud_t *pud;
if (p4d_none(*p4d))
- return flush;
+ return;
pud = pud_offset(p4d, 0);
if (PTRS_PER_PUD > 1)
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- flush |= xen_pud_walk(mm, pud, func, last, limit);
- return flush;
+ (*func)(mm, virt_to_page(pud), PT_PUD);
+ xen_pud_walk(mm, pud, func, last, limit);
}
/*
@@ -636,32 +598,27 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
* will be STACK_TOP_MAX, but at boot we need to pin up to
* FIXADDR_TOP.
*
- * For 32-bit the important bit is that we don't pin beyond there,
- * because then we start getting into Xen's ptes.
- *
- * For 64-bit, we must skip the Xen hole in the middle of the address
- * space, just after the big x86-64 virtual hole.
+ * We must skip the Xen hole in the middle of the address space, just after
+ * the big x86-64 virtual hole.
*/
-static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
- int (*func)(struct mm_struct *mm, struct page *,
- enum pt_level),
- unsigned long limit)
+static void __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
+ void (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
{
- int i, nr, flush = 0;
+ int i, nr;
unsigned hole_low = 0, hole_high = 0;
/* The limit is the last byte to be touched */
limit--;
BUG_ON(limit >= FIXADDR_TOP);
-#ifdef CONFIG_X86_64
/*
* 64-bit has a great big hole in the middle of the address
* space, which contains the Xen mappings.
*/
hole_low = pgd_index(GUARD_HOLE_BASE_ADDR);
hole_high = pgd_index(GUARD_HOLE_END_ADDR);
-#endif
nr = pgd_index(limit) + 1;
for (i = 0; i < nr; i++) {
@@ -674,22 +631,20 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
continue;
p4d = p4d_offset(&pgd[i], 0);
- flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
+ xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
/* Do the top level last, so that the callbacks can use it as
a cue to do final things like tlb flushes. */
- flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
-
- return flush;
+ (*func)(mm, virt_to_page(pgd), PT_PGD);
}
-static int xen_pgd_walk(struct mm_struct *mm,
- int (*func)(struct mm_struct *mm, struct page *,
- enum pt_level),
- unsigned long limit)
+static void xen_pgd_walk(struct mm_struct *mm,
+ void (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
{
- return __xen_pgd_walk(mm, mm->pgd, func, limit);
+ __xen_pgd_walk(mm, mm->pgd, func, limit);
}
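Because no flush decision has to be propagated anymore, a walker callback under the new contract is just a void visitor. A minimal example, mirroring xen_mark_pinned() further down in this file:

	static void mark_pt_page(struct mm_struct *mm, struct page *page,
				 enum pt_level level)
	{
		/* Nothing to return: the caller no longer collects flush flags. */
		SetPagePinned(page);
	}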
/* If we're using split pte locks, then take the page's lock and
@@ -722,26 +677,17 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
xen_extend_mmuext_op(&op);
}
-static int xen_pin_page(struct mm_struct *mm, struct page *page,
- enum pt_level level)
+static void xen_pin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
unsigned pgfl = TestSetPagePinned(page);
- int flush;
-
- if (pgfl)
- flush = 0; /* already pinned */
- else if (PageHighMem(page))
- /* kmaps need flushing if we found an unpinned
- highpage */
- flush = 1;
- else {
+
+ if (!pgfl) {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0);
spinlock_t *ptl;
- flush = 0;
-
/*
* We need to hold the pagetable lock between the time
* we make the pagetable RO and when we actually pin
@@ -778,8 +724,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
xen_mc_callback(xen_pte_unlock, ptl);
}
}
-
- return flush;
}
/* This is called just after a mm has been created, but it has not
@@ -787,39 +731,22 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
trace_xen_mmu_pgd_pin(mm, pgd);
xen_mc_batch();
- if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
- /* re-enable interrupts for flushing */
- xen_mc_issue(0);
+ __xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
- kmap_flush_unused();
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
- xen_mc_batch();
+ if (user_pgd) {
+ xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
}
-#ifdef CONFIG_X86_64
- {
- pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
- xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
-
- if (user_pgd) {
- xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
- xen_do_pin(MMUEXT_PIN_L4_TABLE,
- PFN_DOWN(__pa(user_pgd)));
- }
- }
-#else /* CONFIG_X86_32 */
-#ifdef CONFIG_X86_PAE
- /* Need to make sure unshared kernel PMD is pinnable */
- xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
- PT_PMD);
-#endif
- xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
-#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
@@ -854,11 +781,10 @@ void xen_mm_pin_all(void)
spin_unlock(&pgd_lock);
}
-static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
- enum pt_level level)
+static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
SetPagePinned(page);
- return 0;
}
/*
@@ -870,18 +796,16 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
static void __init xen_after_bootmem(void)
{
static_branch_enable(&xen_struct_pages_ready);
-#ifdef CONFIG_X86_64
SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
-static int xen_unpin_page(struct mm_struct *mm, struct page *page,
- enum pt_level level)
+static void xen_unpin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
{
unsigned pgfl = TestClearPagePinned(page);
- if (pgfl && !PageHighMem(page)) {
+ if (pgfl) {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
spinlock_t *ptl = NULL;
@@ -912,36 +836,24 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
xen_mc_callback(xen_pte_unlock, ptl);
}
}
-
- return 0; /* never need to flush on unpin */
}
/* Release a pagetables pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
trace_xen_mmu_pgd_unpin(mm, pgd);
xen_mc_batch();
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-#ifdef CONFIG_X86_64
- {
- pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
- if (user_pgd) {
- xen_do_pin(MMUEXT_UNPIN_TABLE,
- PFN_DOWN(__pa(user_pgd)));
- xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
- }
+ if (user_pgd) {
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
+ xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
}
-#endif
-
-#ifdef CONFIG_X86_PAE
- /* Need to make sure unshared kernel PMD is unpinned */
- xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
- PT_PMD);
-#endif
__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
@@ -1089,7 +1001,6 @@ static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
BUG();
}
-#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
{
@@ -1273,17 +1184,15 @@ static void __init xen_pagetable_cleanhighmap(void)
xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2));
xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
}
-#endif
static void __init xen_pagetable_p2m_setup(void)
{
xen_vmalloc_p2m_tree();
-#ifdef CONFIG_X86_64
xen_pagetable_p2m_free();
xen_pagetable_cleanhighmap();
-#endif
+
/* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
}
@@ -1420,6 +1329,8 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
}
static void xen_write_cr3(unsigned long cr3)
{
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+
BUG_ON(preemptible());
xen_mc_batch(); /* disables interrupts */
@@ -1430,20 +1341,14 @@ static void xen_write_cr3(unsigned long cr3)
__xen_write_cr3(true, cr3);
-#ifdef CONFIG_X86_64
- {
- pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
- if (user_pgd)
- __xen_write_cr3(false, __pa(user_pgd));
- else
- __xen_write_cr3(false, 0);
- }
-#endif
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
-#ifdef CONFIG_X86_64
/*
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
@@ -1478,49 +1383,39 @@ static void __init xen_write_cr3_init(unsigned long cr3)
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
-#endif
static int xen_pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd = mm->pgd;
- int ret = 0;
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+ int ret = -ENOMEM;
BUG_ON(PagePinned(virt_to_page(pgd)));
+ BUG_ON(page->private != 0);
-#ifdef CONFIG_X86_64
- {
- struct page *page = virt_to_page(pgd);
- pgd_t *user_pgd;
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
- BUG_ON(page->private != 0);
-
- ret = -ENOMEM;
-
- user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- page->private = (unsigned long)user_pgd;
-
- if (user_pgd != NULL) {
+ if (user_pgd != NULL) {
#ifdef CONFIG_X86_VSYSCALL_EMULATION
- user_pgd[pgd_index(VSYSCALL_ADDR)] =
- __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
#endif
- ret = 0;
- }
-
- BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ ret = 0;
}
-#endif
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+
return ret;
}
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
-#ifdef CONFIG_X86_64
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd)
free_page((unsigned long)user_pgd);
-#endif
}
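The user pgd pointer round-trips through the kernel pgd's struct page: xen_pgd_alloc() stores it in page->private and xen_get_user_pgd() reads it back. A simplified, hypothetical form of that lookup (the real helper also validates the pgd offset against USER_LIMIT):

	static pgd_t *user_pgd_of(pgd_t *pgd)
	{
		/* Set by xen_pgd_alloc(); NULL if the allocation failed. */
		return (pgd_t *)virt_to_page(pgd)->private;
	}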
/*
@@ -1539,7 +1434,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
*/
__visible pte_t xen_make_pte_init(pteval_t pte)
{
-#ifdef CONFIG_X86_64
unsigned long pfn;
/*
@@ -1553,7 +1447,7 @@ __visible pte_t xen_make_pte_init(pteval_t pte)
pfn >= xen_start_info->first_p2m_pfn &&
pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
pte &= ~_PAGE_RW;
-#endif
+
pte = pte_pfn_to_mfn(pte);
return native_make_pte(pte);
}
@@ -1561,13 +1455,6 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
-#ifdef CONFIG_X86_32
- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
- if (pte_mfn(pte) != INVALID_P2M_ENTRY
- && pte_val_ma(*ptep) & _PAGE_PRESENT)
- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
- pte_val_ma(pte));
-#endif
__xen_set_pte(ptep, pte);
}
@@ -1642,20 +1529,14 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
if (static_branch_likely(&xen_struct_pages_ready))
SetPagePinned(page);
- if (!PageHighMem(page)) {
- xen_mc_batch();
+ xen_mc_batch();
- __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+ __set_pfn_prot(pfn, PAGE_KERNEL_RO);
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
- __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- } else {
- /* make sure there are no stray mappings of
- this page */
- kmap_flush_unused();
- }
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
}
}
@@ -1678,16 +1559,15 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
trace_xen_mmu_release_ptpage(pfn, level, pinned);
if (pinned) {
- if (!PageHighMem(page)) {
- xen_mc_batch();
+ xen_mc_batch();
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
- __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
- __set_pfn_prot(pfn, PAGE_KERNEL);
+ __set_pfn_prot(pfn, PAGE_KERNEL);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- }
ClearPagePinned(page);
}
}
@@ -1702,7 +1582,6 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#ifdef CONFIG_X86_64
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -1712,20 +1591,6 @@ static void xen_release_pud(unsigned long pfn)
{
xen_release_ptpage(pfn, PT_PUD);
}
-#endif
-
-void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
- unsigned long top = HYPERVISOR_VIRT_START;
- struct xen_platform_parameters pp;
-
- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
- top = pp.virt_start;
-
- reserve_top_address(-top);
-#endif /* CONFIG_X86_32 */
-}
/*
* Like __va(), but returns address in the kernel mapping (which is
@@ -1733,11 +1598,7 @@ void __init xen_reserve_top(void)
*/
static void * __init __ka(phys_addr_t paddr)
{
-#ifdef CONFIG_X86_64
return (void *)(paddr + __START_KERNEL_map);
-#else
- return __va(paddr);
-#endif
}
/* Convert a machine address to physical address */
@@ -1771,56 +1632,7 @@ static void __init set_page_prot(void *addr, pgprot_t prot)
{
return set_page_prot_flags(addr, prot, UVMF_NONE);
}
-#ifdef CONFIG_X86_32
-static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
- unsigned pmdidx, pteidx;
- unsigned ident_pte;
- unsigned long pfn;
-
- level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
- PAGE_SIZE);
-
- ident_pte = 0;
- pfn = 0;
- for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
- pte_t *pte_page;
-
- /* Reuse or allocate a page of ptes */
- if (pmd_present(pmd[pmdidx]))
- pte_page = m2v(pmd[pmdidx].pmd);
- else {
- /* Check for free pte pages */
- if (ident_pte == LEVEL1_IDENT_ENTRIES)
- break;
-
- pte_page = &level1_ident_pgt[ident_pte];
- ident_pte += PTRS_PER_PTE;
-
- pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
- }
-
- /* Install mappings */
- for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
- pte_t pte;
- if (pfn > max_pfn_mapped)
- max_pfn_mapped = pfn;
-
- if (!pte_none(pte_page[pteidx]))
- continue;
-
- pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
- pte_page[pteidx] = pte;
- }
- }
-
- for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
- set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
- set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-#endif
void __init xen_setup_machphys_mapping(void)
{
struct xen_machphys_mapping mapping;
@@ -1831,13 +1643,8 @@ void __init xen_setup_machphys_mapping(void)
} else {
machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
}
-#ifdef CONFIG_X86_32
- WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
- < machine_to_phys_mapping);
-#endif
}
-#ifdef CONFIG_X86_64
static void __init convert_pfn_mfn(void *v)
{
pte_t *pte = v;
@@ -2168,105 +1975,6 @@ void __init xen_relocate_p2m(void)
xen_start_info->nr_p2m_frames = n_frames;
}
-#else /* !CONFIG_X86_64 */
-static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
-static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
-RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE);
-RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE);
-
-static void __init xen_write_cr3_init(unsigned long cr3)
-{
- unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
-
- BUG_ON(read_cr3_pa() != __pa(initial_page_table));
- BUG_ON(cr3 != __pa(swapper_pg_dir));
-
- /*
- * We are switching to swapper_pg_dir for the first time (from
- * initial_page_table) and therefore need to mark that page
- * read-only and then pin it.
- *
- * Xen disallows sharing of kernel PMDs for PAE
- * guests. Therefore we must copy the kernel PMD from
- * initial_page_table into a new kernel PMD to be used in
- * swapper_pg_dir.
- */
- swapper_kernel_pmd =
- extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- copy_page(swapper_kernel_pmd, initial_kernel_pmd);
- swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
- __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
- set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
-
- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
- xen_write_cr3(cr3);
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
-
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
- PFN_DOWN(__pa(initial_page_table)));
- set_page_prot(initial_page_table, PAGE_KERNEL);
- set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
-
- pv_ops.mmu.write_cr3 = &xen_write_cr3;
-}
-
-/*
- * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
- * not the first page table in the page table pool.
- * Iterate through the initial page tables to find the real page table base.
- */
-static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
-{
- phys_addr_t pt_base, paddr;
- unsigned pmdidx;
-
- pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
-
- for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
- if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
- paddr = m2p(pmd[pmdidx].pmd);
- pt_base = min(pt_base, paddr);
- }
-
- return pt_base;
-}
-
-void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
-{
- pmd_t *kernel_pmd;
-
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-
- xen_pt_base = xen_find_pt_base(kernel_pmd);
- xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
-
- initial_kernel_pmd =
- extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
-
- max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
-
- copy_page(initial_kernel_pmd, kernel_pmd);
-
- xen_map_identity_early(initial_kernel_pmd, max_pfn);
-
- copy_page(initial_page_table, pgd);
- initial_page_table[KERNEL_PGD_BOUNDARY] =
- __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
-
- set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
- set_page_prot(initial_page_table, PAGE_KERNEL_RO);
- set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
- PFN_DOWN(__pa(initial_page_table)));
- xen_write_cr3(__pa(initial_page_table));
-
- memblock_reserve(xen_pt_base, xen_pt_size);
-}
-#endif /* CONFIG_X86_64 */
-
void __init xen_reserve_special_pages(void)
{
phys_addr_t paddr;
@@ -2300,12 +2008,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_32
- case FIX_WP_TEST:
-# ifdef CONFIG_HIGHMEM
- case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
case VSYSCALL_PAGE:
#endif
/* All local page mappings */
@@ -2357,9 +2060,7 @@ static void __init xen_post_allocator_init(void)
pv_ops.mmu.set_pte = xen_set_pte;
pv_ops.mmu.set_pmd = xen_set_pmd;
pv_ops.mmu.set_pud = xen_set_pud;
-#ifdef CONFIG_X86_64
pv_ops.mmu.set_p4d = xen_set_p4d;
-#endif
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
@@ -2367,15 +2068,11 @@ static void __init xen_post_allocator_init(void)
pv_ops.mmu.alloc_pmd = xen_alloc_pmd;
pv_ops.mmu.release_pte = xen_release_pte;
pv_ops.mmu.release_pmd = xen_release_pmd;
-#ifdef CONFIG_X86_64
pv_ops.mmu.alloc_pud = xen_alloc_pud;
pv_ops.mmu.release_pud = xen_release_pud;
-#endif
pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte);
-#ifdef CONFIG_X86_64
pv_ops.mmu.write_cr3 = &xen_write_cr3;
-#endif
}
static void xen_leave_lazy_mmu(void)
@@ -2420,17 +2117,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = xen_set_pte_atomic,
- .pte_clear = xen_pte_clear,
- .pmd_clear = xen_pmd_clear,
-#endif /* CONFIG_X86_PAE */
.set_pud = xen_set_pud_hyper,
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#ifdef CONFIG_X86_64
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_p4d = xen_set_p4d_hyper,
@@ -2442,7 +2133,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
-#endif /* CONFIG_X86_64 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 0acba2c712ab..be4151f42611 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -379,12 +379,8 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m)
if (type == P2M_TYPE_PFN || i < chunk) {
/* Use initial p2m page contents. */
-#ifdef CONFIG_X86_64
mfns = alloc_p2m_page();
copy_page(mfns, xen_p2m_addr + pfn);
-#else
- mfns = xen_p2m_addr + pfn;
-#endif
ptep = populate_extra_pte((unsigned long)(p2m + pfn));
set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
@@ -467,7 +463,7 @@ EXPORT_SYMBOL_GPL(get_phys_to_machine);
* Allocate new pmd(s). It is checked whether the old pmd is still in place.
* If not, nothing is changed. This is okay as the only reason for allocating
 * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual
- * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
+ * pmd.
*/
static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
{
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 3566e37241d7..7eab14d56369 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -32,7 +32,6 @@
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "vdso.h"
#include "mmu.h"
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
@@ -545,13 +544,10 @@ static unsigned long __init xen_get_pages_limit(void)
{
unsigned long limit;
-#ifdef CONFIG_X86_32
- limit = GB(64) / PAGE_SIZE;
-#else
limit = MAXMEM / PAGE_SIZE;
if (!xen_initial_domain() && xen_512gb_limit)
limit = GB(512) / PAGE_SIZE;
-#endif
+
return limit;
}
@@ -722,17 +718,8 @@ static void __init xen_reserve_xen_mfnlist(void)
if (!xen_is_e820_reserved(start, size))
return;
-#ifdef CONFIG_X86_32
- /*
- * Relocating the p2m on 32 bit system to an arbitrary virtual address
- * is not supported, so just give up.
- */
- xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
- BUG();
-#else
xen_relocate_p2m();
memblock_free(start, size);
-#endif
}
/**
@@ -921,20 +908,6 @@ char * __init xen_memory_setup(void)
return "Xen";
}
-/*
- * Set the bit indicating "nosegneg" library variants should be used.
- * We only need to bother in pure 32-bit mode; compat 32-bit processes
- * can have un-truncated segments, so wrapping around is allowed.
- */
-static void __init fiddle_vdso(void)
-{
-#ifdef CONFIG_X86_32
- u32 *mask = vdso_image_32.data +
- vdso_image_32.sym_VDSO32_NOTE_MASK;
- *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
-#endif
-}
-
static int register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
@@ -951,11 +924,7 @@ void xen_enable_sysenter(void)
int ret;
unsigned sysenter_feature;
-#ifdef CONFIG_X86_32
- sysenter_feature = X86_FEATURE_SEP;
-#else
sysenter_feature = X86_FEATURE_SYSENTER32;
-#endif
if (!boot_cpu_has(sysenter_feature))
return;
@@ -967,7 +936,6 @@ void xen_enable_sysenter(void)
void xen_enable_syscall(void)
{
-#ifdef CONFIG_X86_64
int ret;
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
@@ -983,7 +951,6 @@ void xen_enable_syscall(void)
if (ret != 0)
setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
-#endif /* CONFIG_X86_64 */
}
static void __init xen_pvmmu_arch_setup(void)
@@ -1024,7 +991,6 @@ void __init xen_arch_setup(void)
disable_cpuidle();
disable_cpufreq();
WARN_ON(xen_set_default_idle());
- fiddle_vdso();
#ifdef CONFIG_NUMA
numa_off = 1;
#endif
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index f8d39440b292..f5e7db4f82ab 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/thread_info.h>
#include <asm/smp.h>
#include <xen/events.h>
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 9ea598dcc132..c2ac319f11a4 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -29,6 +29,7 @@
#include <asm/idtentry.h>
#include <asm/desc.h>
#include <asm/cpu.h>
+#include <asm/io_apic.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
@@ -210,15 +211,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void)
* sure the old memory can be recycled. */
make_lowmem_page_readwrite(xen_initial_gdt);
-#ifdef CONFIG_X86_32
- /*
- * Xen starts us with XEN_FLAT_RING1_DS, but linux code
- * expects __USER_DS
- */
- loadsegment(ds, __USER_DS);
- loadsegment(es, __USER_DS);
-#endif
-
xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
@@ -299,10 +291,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
gdt = get_cpu_gdt_rw(cpu);
-#ifdef CONFIG_X86_32
- ctxt->user_regs.fs = __KERNEL_PERCPU;
- ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
-#endif
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
/*
@@ -340,12 +328,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = task_top_of_stack(idle);
-#ifdef CONFIG_X86_32
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
-#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
ctxt->event_callback_eip =
(unsigned long)xen_asm_exc_xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 6deb49094c60..799f4eba0a62 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,9 +114,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
*/
void __init xen_init_spinlocks(void)
{
-
/* Don't need to use pvqspinlock code if there is only 1 vCPU. */
- if (num_possible_cpus() == 1)
+ if (num_possible_cpus() == 1 || nopvspin)
xen_pvspin = false;
if (!xen_pvspin) {
@@ -137,6 +136,7 @@ void __init xen_init_spinlocks(void)
static __init int xen_parse_nopvspin(char *arg)
{
+ pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n");
xen_pvspin = false;
return 0;
}
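xen_init_spinlocks() now also honors the generic "nopvspin" switch; the Xen-specific parameter remains only for compatibility and warns on use. Its handler stays registered roughly as follows (assumed registration, shown for context):

	early_param("xen_nopvspin", xen_parse_nopvspin);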
diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c
index 8303b58c79a9..cae9660f4c67 100644
--- a/arch/x86/xen/suspend_pv.c
+++ b/arch/x86/xen/suspend_pv.c
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
-#include <asm/fixmap.h>
-
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
+#include <asm/fixmap.h>
+
#include "xen-ops.h"
void xen_pv_pre_suspend(void)
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
deleted file mode 100644
index 873c54c488fe..000000000000
--- a/arch/x86/xen/vdso.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/* Bit used for the pseudo-hwcap for non-negative segments. We use
- bit 1 to avoid bugs in some versions of glibc when bit 0 is
- used; the choice is otherwise arbitrary. */
-#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 508fe204520b..1cb0e84b9161 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -6,12 +6,18 @@
* operations here; the indirect forms are better handled in C.
*/
+#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
-#include <asm/frame.h>
+#include <asm/segment.h>
+#include <asm/thread_info.h>
#include <asm/asm.h>
+#include <asm/frame.h>
+#include <xen/interface/xen.h>
+
+#include <linux/init.h>
#include <linux/linkage.h>
/*
@@ -76,11 +82,7 @@ SYM_FUNC_END(xen_save_fl_direct)
*/
SYM_FUNC_START(xen_restore_fl_direct)
FRAME_BEGIN
-#ifdef CONFIG_X86_64
testw $X86_EFLAGS_IF, %di
-#else
- testb $X86_EFLAGS_IF>>8, %ah
-#endif
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
@@ -104,15 +106,6 @@ SYM_FUNC_END(xen_restore_fl_direct)
*/
SYM_FUNC_START(check_events)
FRAME_BEGIN
-#ifdef CONFIG_X86_32
- push %eax
- push %ecx
- push %edx
- call xen_force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
-#else
push %rax
push %rcx
push %rdx
@@ -132,7 +125,6 @@ SYM_FUNC_START(check_events)
pop %rdx
pop %rcx
pop %rax
-#endif
FRAME_END
ret
SYM_FUNC_END(check_events)
@@ -151,3 +143,175 @@ SYM_FUNC_START(xen_read_cr2_direct)
FRAME_END
ret
SYM_FUNC_END(xen_read_cr2_direct);
+
+.macro xen_pv_trap name
+SYM_CODE_START(xen_\name)
+ pop %rcx
+ pop %r11
+ jmp \name
+SYM_CODE_END(xen_\name)
+_ASM_NOKPROBE(xen_\name)
+.endm
+
+xen_pv_trap asm_exc_divide_error
+xen_pv_trap asm_xenpv_exc_debug
+xen_pv_trap asm_exc_int3
+xen_pv_trap asm_xenpv_exc_nmi
+xen_pv_trap asm_exc_overflow
+xen_pv_trap asm_exc_bounds
+xen_pv_trap asm_exc_invalid_op
+xen_pv_trap asm_exc_device_not_available
+xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_exc_coproc_segment_overrun
+xen_pv_trap asm_exc_invalid_tss
+xen_pv_trap asm_exc_segment_not_present
+xen_pv_trap asm_exc_stack_segment
+xen_pv_trap asm_exc_general_protection
+xen_pv_trap asm_exc_page_fault
+xen_pv_trap asm_exc_spurious_interrupt_bug
+xen_pv_trap asm_exc_coprocessor_error
+xen_pv_trap asm_exc_alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap asm_exc_machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap asm_exc_simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+xen_pv_trap asm_exc_xen_hypervisor_callback
+
+ __INIT
+SYM_CODE_START(xen_early_idt_handler_array)
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ pop %rcx
+ pop %r11
+ jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
+ i = i + 1
+ .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
+SYM_CODE_END(xen_early_idt_handler_array)
+ __FINIT
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
+ */
+SYM_CODE_START(xen_iret)
+ pushq $0
+ jmp hypercall_iret
+SYM_CODE_END(xen_iret)
+
+SYM_CODE_START(xen_sysret64)
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back.
+ *
+ * tss.sp2 is scratch space.
+ */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ pushq $__USER_DS
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ pushq %r11
+ pushq $__USER_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+ jmp hypercall_iret
+SYM_CODE_END(xen_sysret64)
+
+/*
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ */
+
+/* Normal 64-bit system call target */
+SYM_FUNC_START(xen_syscall_target)
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
+SYM_FUNC_END(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+SYM_FUNC_START(xen_syscall32_target)
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
+SYM_FUNC_END(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+SYM_FUNC_START(xen_sysenter_target)
+ /*
+ * NB: Xen is polite and clears TF from EFLAGS for us. This means
+ * that we don't need to guard against single step exceptions here.
+ */
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSENTER_compat_after_hwframe
+SYM_FUNC_END(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+SYM_FUNC_START_ALIAS(xen_syscall32_target)
+SYM_FUNC_START(xen_sysenter_target)
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
+ mov $-ENOSYS, %rax
+ pushq $0
+ jmp hypercall_iret
+SYM_FUNC_END(xen_sysenter_target)
+SYM_FUNC_END_ALIAS(xen_syscall32_target)
+
+#endif /* CONFIG_IA32_EMULATION */
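The "Xen64 iret frame" comment in the consolidated xen-asm.S above describes the stack that xen_iret hands to the hypercall page. A hypothetical C overlay of that frame, lowest address first from %rsp (this struct does not exist in the kernel sources; it only restates the layout from the comment):

	struct xen_iret_frame {
		unsigned long rax;	/* %rsp points here */
		unsigned long r11;
		unsigned long rcx;	/* rax/r11/rcx pushed by the hypercall page */
		unsigned long flags;	/* e.g. VGCF_in_syscall, or 0 from xen_iret */
		unsigned long rip;	/* standard iret frame starts here */
		unsigned long cs;
		unsigned long rflags;
		unsigned long rsp;
		unsigned long ss;
	};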
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
deleted file mode 100644
index 4757cec33abe..000000000000
--- a/arch/x86/xen/xen-asm_32.S
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Asm versions of Xen pv-ops, suitable for direct use.
- *
- * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C.
- */
-
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-#include <asm/asm.h>
-
-#include <xen/interface/xen.h>
-
-#include <linux/linkage.h>
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
- * This is run where a normal iret would be run, with the same stack setup:
- * 8: eflags
- * 4: cs
- * esp-> 0: eip
- *
- * This attempts to make sure that any pending events are dealt with
- * on return to usermode, but there is a small window in which an
- * event can happen just before entering usermode. If the nested
- * interrupt ends up setting one of the TIF_WORK_MASK pending work
- * flags, they will not be tested again before returning to
- * usermode. This means that a process can end up with pending work,
- * which will be unprocessed until the process enters and leaves the
- * kernel again, which could be an unbounded amount of time. This
- * means that a pending signal or reschedule event could be
- * indefinitely delayed.
- *
- * The fix is to notice a nested interrupt in the critical window, and
- * if one occurs, then fold the nested interrupt into the current
- * interrupt stack frame, and re-process it iteratively rather than
- * recursively. This means that it will exit via the normal path, and
- * all pending work will be dealt with appropriately.
- *
- * Because the nested interrupt handler needs to deal with the current
- * stack state in whatever form its in, we keep things simple by only
- * using a single register which is pushed/popped on the stack.
- */
-
-.macro POP_FS
-1:
- popw %fs
-.pushsection .fixup, "ax"
-2: movw $0, (%esp)
- jmp 1b
-.popsection
- _ASM_EXTABLE(1b,2b)
-.endm
-
-SYM_CODE_START(xen_iret)
- /* test eflags for special cases */
- testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
- jnz hyper_iret
-
- push %eax
- ESP_OFFSET=4 # bytes pushed onto stack
-
- /* Store vcpu_info pointer for easy access */
-#ifdef CONFIG_SMP
- pushw %fs
- movl $(__KERNEL_PERCPU), %eax
- movl %eax, %fs
- movl %fs:xen_vcpu, %eax
- POP_FS
-#else
- movl %ss:xen_vcpu, %eax
-#endif
-
- /* check IF state we're restoring */
- testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
- /*
- * Maybe enable events. Once this happens we could get a
- * recursive event, so the critical region starts immediately
- * afterwards. However, if that happens we don't end up
- * resuming the code, so we don't have to be worried about
- * being preempted to another CPU.
- */
- setz %ss:XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
- /* check for unmasked and pending */
- cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
-
- /*
- * If there's something pending, mask events again so we can
- * jump back into exc_xen_hypervisor_callback. Otherwise do not
- * touch XEN_vcpu_info_mask.
- */
- jne 1f
- movb $1, %ss:XEN_vcpu_info_mask(%eax)
-
-1: popl %eax
-
- /*
- * From this point on the registers are restored and the stack
- * updated, so we don't need to worry about it if we're
- * preempted
- */
-iret_restore_end:
-
- /*
- * Jump to hypervisor_callback after fixing up the stack.
- * Events are masked, so jumping out of the critical region is
- * OK.
- */
- je xen_asm_exc_xen_hypervisor_callback
-
-1: iret
-xen_iret_end_crit:
- _ASM_EXTABLE(1b, asm_iret_error)
-
-hyper_iret:
- /* put this out of line since its very rarely used */
- jmp hypercall_page + __HYPERVISOR_iret * 32
-SYM_CODE_END(xen_iret)
-
- .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
- * This is called by xen_asm_exc_xen_hypervisor_callback in entry_32.S when it sees
- * that the EIP at the time of interrupt was between
- * xen_iret_start_crit and xen_iret_end_crit.
- *
- * The stack format at this point is:
- * ----------------
- * ss : (ss/esp may be present if we came from usermode)
- * esp :
- * eflags } outer exception info
- * cs }
- * eip }
- * ----------------
- * eax : outer eax if it hasn't been restored
- * ----------------
- * eflags }
- * cs } nested exception info
- * eip }
- * return address : (into xen_asm_exc_xen_hypervisor_callback)
- *
- * In order to deliver the nested exception properly, we need to discard the
- * nested exception frame such that when we handle the exception, we do it
- * in the context of the outer exception rather than starting a new one.
- *
- * The only caveat is that if the outer eax hasn't been restored yet (i.e.
- * it's still on stack), we need to restore its value here.
-*/
-.pushsection .noinstr.text, "ax"
-SYM_CODE_START(xen_iret_crit_fixup)
- /*
- * Paranoia: Make sure we're really coming from kernel space.
- * One could imagine a case where userspace jumps into the
- * critical range address, but just before the CPU delivers a
- * PF, it decides to deliver an interrupt instead. Unlikely?
- * Definitely. Easy to avoid? Yes.
- */
- testb $2, 2*4(%esp) /* nested CS */
- jnz 2f
-
- /*
- * If eip is before iret_restore_end then stack
- * hasn't been restored yet.
- */
- cmpl $iret_restore_end, 1*4(%esp)
- jae 1f
-
- movl 4*4(%esp), %eax /* load outer EAX */
- ret $4*4 /* discard nested EIP, CS, and EFLAGS as
- * well as the just restored EAX */
-
-1:
- ret $3*4 /* discard nested EIP, CS, and EFLAGS */
-
-2:
- ret
-SYM_CODE_END(xen_iret_crit_fixup)
-.popsection
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
deleted file mode 100644
index aab1d99b2b48..000000000000
--- a/arch/x86/xen/xen-asm_64.S
+++ /dev/null
@@ -1,192 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Asm versions of Xen pv-ops, suitable for direct use.
- *
- * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C.
- */
-
-#include <asm/errno.h>
-#include <asm/percpu.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-
-#include <xen/interface/xen.h>
-
-#include <linux/init.h>
-#include <linux/linkage.h>
-
-.macro xen_pv_trap name
-SYM_CODE_START(xen_\name)
- pop %rcx
- pop %r11
- jmp \name
-SYM_CODE_END(xen_\name)
-_ASM_NOKPROBE(xen_\name)
-.endm
-
-xen_pv_trap asm_exc_divide_error
-xen_pv_trap asm_xenpv_exc_debug
-xen_pv_trap asm_exc_int3
-xen_pv_trap asm_xenpv_exc_nmi
-xen_pv_trap asm_exc_overflow
-xen_pv_trap asm_exc_bounds
-xen_pv_trap asm_exc_invalid_op
-xen_pv_trap asm_exc_device_not_available
-xen_pv_trap asm_exc_double_fault
-xen_pv_trap asm_exc_coproc_segment_overrun
-xen_pv_trap asm_exc_invalid_tss
-xen_pv_trap asm_exc_segment_not_present
-xen_pv_trap asm_exc_stack_segment
-xen_pv_trap asm_exc_general_protection
-xen_pv_trap asm_exc_page_fault
-xen_pv_trap asm_exc_spurious_interrupt_bug
-xen_pv_trap asm_exc_coprocessor_error
-xen_pv_trap asm_exc_alignment_check
-#ifdef CONFIG_X86_MCE
-xen_pv_trap asm_exc_machine_check
-#endif /* CONFIG_X86_MCE */
-xen_pv_trap asm_exc_simd_coprocessor_error
-#ifdef CONFIG_IA32_EMULATION
-xen_pv_trap entry_INT80_compat
-#endif
-xen_pv_trap asm_exc_xen_hypervisor_callback
-
- __INIT
-SYM_CODE_START(xen_early_idt_handler_array)
- i = 0
- .rept NUM_EXCEPTION_VECTORS
- pop %rcx
- pop %r11
- jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
- i = i + 1
- .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
- .endr
-SYM_CODE_END(xen_early_idt_handler_array)
- __FINIT
-
-hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
-/*
- * Xen64 iret frame:
- *
- * ss
- * rsp
- * rflags
- * cs
- * rip <-- standard iret frame
- *
- * flags
- *
- * rcx }
- * r11 }<-- pushed by hypercall page
- * rsp->rax }
- */
-SYM_CODE_START(xen_iret)
- pushq $0
- jmp hypercall_iret
-SYM_CODE_END(xen_iret)
-
-SYM_CODE_START(xen_sysret64)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back.
- *
- * tss.sp2 is scratch space.
- */
- movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER_DS
- pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- pushq %r11
- pushq $__USER_CS
- pushq %rcx
-
- pushq $VGCF_in_syscall
- jmp hypercall_iret
-SYM_CODE_END(xen_sysret64)
-
-/*
- * Xen handles syscall callbacks much like ordinary exceptions, which
- * means we have:
- * - kernel gs
- * - kernel rsp
- * - an iret-like stack frame on the stack (including rcx and r11):
- * ss
- * rsp
- * rflags
- * cs
- * rip
- * r11
- * rsp->rcx
- */
-
-/* Normal 64-bit system call target */
-SYM_FUNC_START(xen_syscall_target)
- popq %rcx
- popq %r11
-
- /*
- * Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER_DS and __USER_CS, so
- * report those values even though Xen will guess its own values.
- */
- movq $__USER_DS, 4*8(%rsp)
- movq $__USER_CS, 1*8(%rsp)
-
- jmp entry_SYSCALL_64_after_hwframe
-SYM_FUNC_END(xen_syscall_target)
-
-#ifdef CONFIG_IA32_EMULATION
-
-/* 32-bit compat syscall target */
-SYM_FUNC_START(xen_syscall32_target)
- popq %rcx
- popq %r11
-
- /*
- * Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
- * report those values even though Xen will guess its own values.
- */
- movq $__USER32_DS, 4*8(%rsp)
- movq $__USER32_CS, 1*8(%rsp)
-
- jmp entry_SYSCALL_compat_after_hwframe
-SYM_FUNC_END(xen_syscall32_target)
-
-/* 32-bit compat sysenter target */
-SYM_FUNC_START(xen_sysenter_target)
- /*
- * NB: Xen is polite and clears TF from EFLAGS for us. This means
- * that we don't need to guard against single step exceptions here.
- */
- popq %rcx
- popq %r11
-
- /*
- * Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
- * report those values even though Xen will guess its own values.
- */
- movq $__USER32_DS, 4*8(%rsp)
- movq $__USER32_CS, 1*8(%rsp)
-
- jmp entry_SYSENTER_compat_after_hwframe
-SYM_FUNC_END(xen_sysenter_target)
-
-#else /* !CONFIG_IA32_EMULATION */
-
-SYM_FUNC_START_ALIAS(xen_syscall32_target)
-SYM_FUNC_START(xen_sysenter_target)
- lea 16(%rsp), %rsp /* strip %rcx, %r11 */
- mov $-ENOSYS, %rax
- pushq $0
- jmp hypercall_iret
-SYM_FUNC_END(xen_sysenter_target)
-SYM_FUNC_END_ALIAS(xen_syscall32_target)
-
-#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1ba601df3a37..2d7c8f34f56c 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -35,13 +35,8 @@ SYM_CODE_START(startup_xen)
rep __ASM_SIZE(stos)
mov %_ASM_SI, xen_start_info
-#ifdef CONFIG_X86_64
mov initial_stack(%rip), %rsp
-#else
- mov initial_stack, %esp
-#endif
-#ifdef CONFIG_X86_64
/* Set up %gs.
*
* The base of %gs always points to fixed_percpu_data. If the
@@ -53,7 +48,6 @@ SYM_CODE_START(startup_xen)
movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax
cdq
wrmsr
-#endif
call xen_start_kernel
SYM_CODE_END(startup_xen)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 53b224fd6177..45d556f71858 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -33,7 +33,6 @@ void xen_setup_mfn_list_list(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
-void xen_reserve_top(void);
void __init xen_reserve_special_pages(void);
void __init xen_pt_check_e820(void);