Diffstat (limited to 'arch/x86')
76 files changed, 743 insertions, 591 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd18994a9555..4ccfacc7232a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -360,7 +360,7 @@ config SMP Management" code will be disabled if you say Y here. See also <file:Documentation/x86/i386/IO-APIC.txt>, - <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at + <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. If you don't know what to do here, say N. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4430dd489620..bf240b920473 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -159,7 +159,7 @@ ifdef CONFIG_FUNCTION_GRAPH_TRACER # If '-Os' is enabled, disable it and print a warning. ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE undefine CONFIG_CC_OPTIMIZE_FOR_SIZE - $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) + $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.) endif endif @@ -179,7 +179,8 @@ ifdef CONFIG_JUMP_LABEL endif ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) - KBUILD_CFLAGS += -maccumulate-outgoing-args + # This compiler flag is not supported by Clang: + KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif # Stackpointer is addressed different for 32 bit and 64 bit x86 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 44163e8c3868..2c860ad4fe06 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -94,7 +94,7 @@ vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o quiet_cmd_check_data_rel = DATAREL $@ define cmd_check_data_rel for obj in $(filter %.o,$^); do \ - readelf -S $$obj | grep -qF .rel.local && { \ + ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \ echo "error: $$obj has data relocations!" >&2; \ exit 1; \ } || true; \ diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 2e59dac07f9e..d732e608e3af 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -1,7 +1,9 @@ #ifndef BOOT_COMPRESSED_ERROR_H #define BOOT_COMPRESSED_ERROR_H +#include <linux/compiler.h> + void warn(char *m); -void error(char *m); +void error(char *m) __noreturn; #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 56589d0a804b..1d78f1739087 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -70,7 +70,7 @@ static unsigned long level4p; * Due to relocation, pointers must be assigned at run time not build time. */ static struct x86_mapping_info mapping_info = { - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, }; /* Locates and clears a region for a new top level page table. */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 50bc26949e9e..48ef7bb32c42 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -252,6 +252,23 @@ ENTRY(__switch_to_asm) END(__switch_to_asm) /* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. 
This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) +/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -259,24 +276,15 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ - - /* - * schedule_tail() is asmlinkage so we have to put its 'prev' argument - * on the stack. - */ - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - leal FRAME_OFFSET(%esp), %eax + movl %esp, %eax call syscall_return_slowpath - FRAME_END jmp restore_all /* kernel thread */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 607d72c4a485..4a4c0834f965 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,7 +36,6 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> -#include <asm/frame.h> #include <linux/err.h> .code64 @@ -406,19 +405,17 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ + movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - FRAME_END jmp restore_regs_and_iret 1: diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 9d05c7e67f60..a45e2114a846 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -761,7 +761,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7acb51c49fec..7a9df3beb89b 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -32,6 +32,7 @@ #define _ASM_ADD __ASM_SIZE(add) #define _ASM_SUB __ASM_SIZE(sub) #define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) #define _ASM_AX __ASM_REG(ax) #define _ASM_BX __ASM_REG(bx) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e7e1942edff7..8b4140f6724f 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,93 +5,8 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -/* - * The set_memory_* API can be used to change various attributes of a virtual - * address range. 
The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable - * Read/Write : ReadOnly, ReadWrite - * Presence : NotPresent - * - * Within a category, the attributes are mutually exclusive. - * - * The implementation of this API will take care of various aspects that - * are associated with changing such attributes, such as: - * - Flushing TLBs - * - Flushing CPU caches - * - Making sure aliases of the memory behind the mapping don't violate - * coherency rules as defined by the CPU in the system. - * - * What this API does not do: - * - Provide exclusion between various callers - including callers that - * operation on other mappings of the same physical page - * - Restore default attributes when a page is freed - * - Guarantee that mappings other than the requested one are - * in any state, other than that these do not violate rules for - * the CPU you have. Do not depend on any effects on other mappings, - * CPUs other than the one you have may have more relaxed rules. - * The caller is required to take care of these. - */ - -int _set_memory_uc(unsigned long addr, int numpages); -int _set_memory_wc(unsigned long addr, int numpages); -int _set_memory_wt(unsigned long addr, int numpages); -int _set_memory_wb(unsigned long addr, int numpages); -int set_memory_uc(unsigned long addr, int numpages); -int set_memory_wc(unsigned long addr, int numpages); -int set_memory_wt(unsigned long addr, int numpages); -int set_memory_wb(unsigned long addr, int numpages); -int set_memory_x(unsigned long addr, int numpages); -int set_memory_nx(unsigned long addr, int numpages); -int set_memory_ro(unsigned long addr, int numpages); -int set_memory_rw(unsigned long addr, int numpages); -int set_memory_np(unsigned long addr, int numpages); -int set_memory_4k(unsigned long addr, int numpages); - -int set_memory_array_uc(unsigned long *addr, int addrinarray); -int set_memory_array_wc(unsigned long *addr, int addrinarray); -int set_memory_array_wt(unsigned long *addr, int addrinarray); -int set_memory_array_wb(unsigned long *addr, int addrinarray); - -int set_pages_array_uc(struct page **pages, int addrinarray); -int set_pages_array_wc(struct page **pages, int addrinarray); -int set_pages_array_wt(struct page **pages, int addrinarray); -int set_pages_array_wb(struct page **pages, int addrinarray); - -/* - * For legacy compatibility with the old APIs, a few functions - * are provided that work on a "struct page". - * These functions operate ONLY on the 1:1 kernel mapping of the - * memory that the struct page represents, and internally just - * call the set_memory_* function. See the description of the - * set_memory_* function for more details on conventions. - * - * These APIs should be considered *deprecated* and are likely going to - * be removed in the future. - * The reason for this is the implicit operation on the 1:1 mapping only, - * making this not a generally useful API. - * - * Specifically, many users of the old APIs had a virtual address, - * called virt_to_page() or vmalloc_to_page() on that address to - * get a struct page* that the old API required. - * To convert these cases, use set_memory_*() on the original - * virtual address, do not use these functions. 
- */ - -int set_pages_uc(struct page *page, int numpages); -int set_pages_wb(struct page *page, int numpages); -int set_pages_x(struct page *page, int numpages); -int set_pages_nx(struct page *page, int numpages); -int set_pages_ro(struct page *page, int numpages); -int set_pages_rw(struct page *page, int numpages); - - void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern int kernel_set_to_readonly; -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 737da62bfeb0..474eb8c66fee 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -4,8 +4,9 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ - unsigned long pmd_flag; /* page flag for PMD entry */ + unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ + bool direct_gbpages; /* PUD level 1GB page support */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f5bddf92faba..695605eb1dfb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,7 +43,7 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_HALT_POLL_NS_DEFAULT 400000 +#define KVM_HALT_POLL_NS_DEFAULT 200000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -1020,6 +1020,8 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int (*write_log_dirty)(struct kvm_vcpu *vcpu); + /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4fd5195deed0..3f9a3d2a5209 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -266,6 +266,7 @@ static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *s #endif int mce_available(struct cpuinfo_x86 *c); +bool mce_is_memory_error(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1411dbed5e5e..f513cc231151 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include <linux/string.h> #include <linux/scatterlist.h> #include <asm/io.h> +#include <asm/pat.h> #include <asm/x86_init.h> #ifdef __KERNEL__ @@ -102,10 +103,8 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); - +#define arch_can_pci_mmap_wc() pat_enabled() +#define ARCH_GENERIC_PCI_MMAP_RESOURCE #ifdef CONFIG_PCI extern void early_quirks(void); diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index d5a22bac9988..0ff8fe71b255 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -98,7 +98,7 @@ static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, if (bytes < 8) { if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, 1); + arch_wb_cache_pmem(addr, bytes); } else { if 
(!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h new file mode 100644 index 000000000000..eaec6c364e42 --- /dev/null +++ b/arch/x86/include/asm/set_memory.h @@ -0,0 +1,87 @@ +#ifndef _ASM_X86_SET_MEMORY_H +#define _ASM_X86_SET_MEMORY_H + +#include <asm/page.h> +#include <asm-generic/set_memory.h> + +/* + * The set_memory_* API can be used to change various attributes of a virtual + * address range. The attributes include: + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXeutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operation on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules. + * The caller is required to take care of these. + */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); +int set_pages_array_wb(struct page **pages, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". + * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. + * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. 
+ * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. + */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + +extern int kernel_set_to_readonly; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); + +#endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 68766b276d9e..a059aac9e937 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -319,10 +319,10 @@ do { \ #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %2,%%eax\n" \ "2: movl %3,%%edx\n" \ - "3: " ASM_CLAC "\n" \ + "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %4,%0\n" \ " xorl %%eax,%%eax\n" \ @@ -331,7 +331,7 @@ do { \ ".previous\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - : "=r" (retval), "=A"(x) \ + : "=r" (retval), "=&A"(x) \ : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) @@ -703,14 +703,15 @@ extern struct movsl_mask { #define unsafe_put_user(x, ptr, err_label) \ do { \ int __pu_err; \ - __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __typeof__(*(ptr)) __pu_val = (x); \ + __put_user_size(__pu_val, (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ if (unlikely(__pu_err)) goto err_label; \ } while (0) #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ - unsigned long __gu_val; \ + __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 3dec769cadf7..83b6e9a0dce4 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -4,62 +4,3 @@ include include/uapi/asm-generic/Kbuild.asm genhdr-y += unistd_32.h genhdr-y += unistd_64.h genhdr-y += unistd_x32.h -header-y += a.out.h -header-y += auxvec.h -header-y += bitsperlong.h -header-y += boot.h -header-y += bootparam.h -header-y += byteorder.h -header-y += debugreg.h -header-y += e820.h -header-y += errno.h -header-y += fcntl.h -header-y += hw_breakpoint.h -header-y += hyperv.h -header-y += ioctl.h -header-y += ioctls.h -header-y += ipcbuf.h -header-y += ist.h -header-y += kvm.h -header-y += kvm_para.h -header-y += kvm_perf.h -header-y += ldt.h -header-y += mce.h -header-y += mman.h -header-y += msgbuf.h -header-y += msr-index.h -header-y += msr.h -header-y += mtrr.h -header-y += param.h -header-y += perf_regs.h -header-y += poll.h -header-y += posix_types.h -header-y += posix_types_32.h -header-y += posix_types_64.h -header-y += posix_types_x32.h -header-y += prctl.h -header-y += processor-flags.h -header-y += ptrace-abi.h -header-y += ptrace.h -header-y += resource.h -header-y += sembuf.h -header-y += setup.h -header-y += shmbuf.h -header-y += sigcontext.h -header-y += sigcontext32.h -header-y += siginfo.h -header-y += signal.h -header-y += socket.h -header-y += sockios.h -header-y += stat.h -header-y += statfs.h -header-y += svm.h -header-y += swab.h -header-y += termbits.h -header-y += termios.h -header-y += types.h -header-y += ucontext.h 
-header-y += unistd.h -header-y += vm86.h -header-y += vmx.h -header-y += vsyscall.h diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c5b8f760473c..32e14d137416 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; - /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) { + /* + * 0xe8 is a relative jump; fix the offset. + * + * Instruction length is checked before the opcode to avoid + * accessing uninitialized bytes for zero-length replacements. + */ + if (a->replacementlen == 5 && *insnbuf == 0xe8) { *(s32 *)(insnbuf + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", *(s32 *)(insnbuf + 1), diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index df083efe6ee0..815dd63f49d0 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -36,7 +36,7 @@ #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c36140d788fe..bb5abe8f5fd4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/cacheflush.h> +# include <asm/set_memory.h> #endif #include "cpu.h" @@ -799,8 +799,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); - /* AMD CPUs don't reset SS attributes on SYSRET */ - set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. 
*/ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a44ef52184df..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,7 +17,7 @@ #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> void __init check_bugs(void) { diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index a70fd61095f8..6f077445647a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) break; case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ + case 11: /* GX1 with inverted Device ID */ #ifdef CONFIG_PCI { u32 vendor, device; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5abd4bf73d6e..5cfbaeb6529a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m) return 1; } -static bool memory_error(struct mce *m) +bool mce_is_memory_error(struct mce *m) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -529,6 +527,7 @@ static bool memory_error(struct mce *m) return false; } +EXPORT_SYMBOL_GPL(mce_is_memory_error); static bool cec_add_mce(struct mce *m) { @@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m) return false; /* We eat only correctable DRAM errors with usable addresses. */ - if (memory_error(m) && + if (mce_is_memory_error(m) && !(m->status & MCI_STATUS_UC) && mce_usable_address(m)) if (!cec_add_elem(m->addr >> PAGE_SHIFT)) @@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) if (m.status & MCI_STATUS_ADDRV) m.severity = severity; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 7889ae492af0..e9f4d762aa5b 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -10,7 +10,7 @@ * Author: Peter Oruba <peter.oruba@amd.com> * * Based on work by: - * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Tigran Aivazian <aivazian.tigran@gmail.com> * * early loader: * Copyright (C) 2013 Advanced Micro Devices, Inc. 
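The mce_is_memory_error() change above keys off the vendor stored in the MCE record itself (m->cpuvendor) instead of boot_cpu_data, so a record can presumably be classified correctly even when it is decoded somewhere other than the CPU that logged it. As a minimal standalone sketch of the AMD branch shown in that hunk (plain C for illustration, not kernel code):

    #include <stdbool.h>
    #include <stdint.h>

    /* ErrCodeExt lives in MCi_STATUS[20:16]; per the hunk above,
     * extended error codes 0x0 and 0x8 indicate memory errors. */
    static bool amd_xec_is_memory_error(uint64_t mci_status)
    {
            uint8_t xec = (mci_status >> 16) & 0x1f;

            return xec == 0x0 || xec == 0x8;
    }
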
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) } static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax), - desc.data, desc.size); + ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); if (ret != UCODE_OK) return -EINVAL; @@ -352,8 +351,6 @@ void reload_ucode_amd(void) u32 rev, dummy; mc = (struct microcode_amd *)amd_ucode_patch; - if (!mc) - return; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -677,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, } static enum ucode_state -load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -691,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) #ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (save) { + struct ucode_patch *p = find_patch(0); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -724,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + if (!refresh_fw || !bsp) return UCODE_OK; if (c->x86 >= 0x15) @@ -745,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); + ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b4a4cd39b358..e53d3c909840 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -1,7 +1,7 @@ /* * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * 2006 Shaohua Li <shaohua.li@intel.com> * 2013-2016 Borislav Petkov <bp@alien8.de> * diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8325d8a09ab0..f522415bf9e5 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1,7 +1,7 @@ /* * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * 2006 Shaohua Li <shaohua.li@intel.com> * * Intel CPU microcode early update for Linux @@ -619,6 +619,9 @@ int __init save_microcode_in_initrd_intel(void) show_saved_mc(); + /* initrd is going away, clear patch ptr. 
*/ + intel_ucode_patch = NULL; + return 0; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index c2f8dde3255c..d5d44c452624 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -90,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8ee76dce9140..9bef1bbeba63 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -24,7 +24,7 @@ #include <trace/syscall.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> @@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp) +static inline void tramp_free(void *tramp, int size) { + int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; + + set_memory_nx((unsigned long)tramp, npages); + set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp) { } +static inline void tramp_free(void *tramp, int size) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); if (WARN_ON(ret < 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* Are we pointing to the reference? 
*/ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { - tramp_free(trampoline); + tramp_free(trampoline, *tramp_size); return 0; } @@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) unsigned long offset; unsigned long ip; unsigned int size; - int ret; + int ret, npages; if (ops->trampoline) { /* @@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) */ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; + npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT; + set_memory_rw(ops->trampoline, npages); } else { ops->trampoline = create_trampoline(ops, &size); if (!ops->trampoline) return; ops->trampoline_size = size; + npages = PAGE_ALIGN(size) >> PAGE_SHIFT; } offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); @@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); ret = update_ftrace_func(ip, new); + set_memory_ro(ops->trampoline, npages); /* The update should never fail */ WARN_ON(ret); @@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline); + tramp_free((void *)ops->trampoline, ops->trampoline_size); ops->trampoline = 0; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index be22f5a2192e..4e3b8a587c88 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -418,6 +418,7 @@ struct legacy_pic default_legacy_pic = { }; struct legacy_pic *legacy_pic = &default_legacy_pic; +EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 19e1f2a6d7b0..6b877807598b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -52,6 +52,7 @@ #include <linux/ftrace.h> #include <linux/frame.h> #include <linux/kasan.h> +#include <linux/moduleloader.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -61,6 +62,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" @@ -416,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn) } } +/* Recover page to RW mode before releasing it */ +void free_insn_page(void *page) +{ + set_memory_nx((unsigned long)page & PAGE_MASK, 1); + set_memory_rw((unsigned long)page & PAGE_MASK, 1); + module_memfree(page); +} + static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9aadff3d0902..901c640d152f 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -37,6 +37,7 @@ #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> +#include <asm/set_memory.h> #include "common.h" diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index da5c09789984..43e10d6fdbed 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token) */ rcu_irq_exit(); native_safe_halt(); - rcu_irq_enter(); local_irq_disable(); + rcu_irq_enter(); } } if (!n.halted) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5f43cec296c5..8c53c5d7a1bc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,7 @@ #include <asm/io_apic.h> #include 
<asm/cpufeature.h> #include <asm/desc.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 085c3b300d32..6f5ca4ebe6e5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> +#include <asm/set_memory.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -113,7 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .context = image, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -122,6 +123,10 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + + if (direct_gbpages) + info.direct_gbpages = true; + for (i = 0; i < nr_pfn_mapped; i++) { mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..f67bd3205df7 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ff40e74c9181..ffeae818aa7a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, - smp_processor_id()); + raw_smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 603a1669a2ec..f81823695014 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p) */ x86_configure_nx(); - simple_udelay_calibration(); - parse_early_param(); #ifdef CONFIG_MEMORY_HOTPLUG @@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + simple_udelay_calibration(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1225,6 +1225,21 @@ void __init setup_arch(char **cmdline_p) kasan_init(); +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. 
+ */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif + tboot_probe(); map_vsyscall(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bb1e8cc0bc84..10edd1e69a68 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -291,11 +291,11 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_32 /* - * Sync back kernel address range. We want to make sure that - * all kernel mappings, including percpu mappings, are available - * in the smpboot asm. We can't reliably pick up percpu - * mappings using vmalloc_fault(), because exception dispatch - * needs percpu data. + * Sync back kernel address range again. We already did this in + * setup_arch(), but percpu data also needs to be available in + * the smpboot asm. We can't reliably pick up percpu mappings + * using vmalloc_fault(), because exception dispatch needs + * percpu data. */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index d4c8011a2293..4b1724059909 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,6 +514,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 82c6d7f1fd73..b9389d72b2f7 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state) return (unsigned long *)task_pt_regs(state->task) - 2; } +static bool is_last_frame(struct unwind_state *state) +{ + return state->bp == last_frame(state); +} + #ifdef CONFIG_X86_32 #define GCC_REALIGN_WORDS 3 #else @@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state) return last_frame(state) - GCC_REALIGN_WORDS; } -static bool is_last_task_frame(struct unwind_state *state) +static bool is_last_aligned_frame(struct unwind_state *state) { unsigned long *last_bp = last_frame(state); unsigned long *aligned_bp = last_aligned_frame(state); /* - * We have to check for the last task frame at two different locations - * because gcc can occasionally decide to realign the stack pointer and - * change the offset of the stack frame in the prologue of a function - * called by head/entry code. Examples: + * GCC can occasionally decide to realign the stack pointer and change + * the offset of the stack frame in the prologue of a function called + * by head/entry code. Examples: * * <start_secondary>: * push %edi @@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state) * push %rbp * mov %rsp,%rbp * - * Note that after aligning the stack, it pushes a duplicate copy of - * the return address before pushing the frame pointer. + * After aligning the stack, it pushes a duplicate copy of the return + * address before pushing the frame pointer. 
+ */ + return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1)); +} + +static bool is_last_ftrace_frame(struct unwind_state *state) +{ + unsigned long *last_bp = last_frame(state); + unsigned long *last_ftrace_bp = last_bp - 3; + + /* + * When unwinding from an ftrace handler of a function called by entry + * code, the stack layout of the last frame is: + * + * bp + * parent ret addr + * bp + * function ret addr + * parent ret addr + * pt_regs + * ----------------- */ - return (state->bp == last_bp || - (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1))); + return (state->bp == last_ftrace_bp && + *state->bp == *(state->bp + 2) && + *(state->bp + 1) == *(state->bp + 4)); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + return is_last_frame(state) || is_last_aligned_frame(state) || + is_last_ftrace_frame(state); } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a181ae76c71c..59ca2eea522c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -780,18 +780,20 @@ out: static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; + struct kvm_cpuid_entry2 *ej; + int j = i; + int nent = vcpu->arch.cpuid_nent; e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ + do { + j = (j + 1) % nent; + ej = &vcpu->arch.cpuid_entries[j]; + } while (ej->function != e->function); + + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + + return j; } /* find an entry with matching function, matching index (if needed), and that diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c25cfaf584e7..0816ab2e8adc 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4173,7 +4173,7 @@ static int check_dr_write(struct x86_emulate_ctxt *ctxt) static int check_svme(struct x86_emulate_ctxt *ctxt) { - u64 efer; + u64 efer = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9fa5b8164961..d24c8742d9b0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; @@ -1495,8 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; + preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) @@ -1934,7 +1936,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < KVM_APIC_LVT_NUM; i++) kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + if (kvm_vcpu_is_reset_bsp(vcpu) && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) 
kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 558676538fca..cb8225969255 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1498,6 +1498,21 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +/** + * kvm_arch_write_log_dirty - emulate dirty page logging + * @vcpu: Guest mode vcpu + * + * Emulate arch specific page modification logging for the + * nested hypervisor + */ +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->write_log_dirty) + return kvm_x86_ops->write_log_dirty(vcpu); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -3683,12 +3698,15 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool can_do_async_pf(struct kvm_vcpu *vcpu) +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; + if (is_guest_mode(vcpu)) + return false; + return kvm_x86_ops->interrupt_allowed(vcpu); } @@ -3704,7 +3722,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!async) return false; /* *pfn has correct page already */ - if (!prefault && can_do_async_pf(vcpu)) { + if (!prefault && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(gva, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(gva, gfn); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d8ccb32f7308..330bf3a811fb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -76,6 +76,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -202,4 +203,5 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 60168cdd0546..ea67dc876316 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 314d2071b337..b0454c7e4cff 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -226,6 +226,10 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); +#if PTTYPE == PTTYPE_EPT + if (kvm_arch_write_log_dirty(vcpu)) + return -EINVAL; +#endif pte |= 
PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) @@ -279,11 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + u64 pt_access, pte_access; + unsigned index, accessed_dirty, pte_pkey; unsigned nested_access; gpa_t pte_gpa; bool have_ad; int offset; + u64 walk_nx_mask = 0; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; @@ -298,6 +304,7 @@ retry_walk: have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 + walk_nx_mask = 1ULL << PT64_NX_SHIFT; if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); @@ -309,8 +316,6 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; - /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging * by the MOV to CR instruction are treated as reads and do not cause the @@ -318,14 +323,14 @@ retry_walk: */ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; - pt_access = pte_access = ACC_ALL; + pte_access = ~0; ++walker->level; do { gfn_t real_gfn; unsigned long host_addr; - pt_access &= pte_access; + pt_access = pte_access; --walker->level; index = PT_INDEX(addr, walker->level); @@ -367,6 +372,12 @@ retry_walk: trace_kvm_mmu_paging_element(pte, walker->level); + /* + * Inverting the NX it lets us AND it like other + * permission bits. + */ + pte_access = pt_access & (pte ^ walk_nx_mask); + if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; @@ -375,14 +386,16 @@ retry_walk: goto error; } - accessed_dirty &= pte; - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); - errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -399,7 +412,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(mmu, &pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. 
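The access accumulation above depends on bit polarity: writable and user are "1 = allowed" while NX is "1 = disallowed", so the walker XORs each entry with walk_nx_mask to flip NX into "1 = executable" before AND-ing, and XORs back at the end. A standalone illustration of the trick (plain C, not kernel code; the macro names here are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define PT_WRITABLE (1ULL << 1)    /* 1 = writable */
    #define PT_USER     (1ULL << 2)    /* 1 = user */
    #define PT_NX       (1ULL << 63)   /* 1 = NO-execute */

    /* XOR with PT_NX gives every bit the same "1 = allowed" polarity,
     * so permissions can be accumulated across levels with a plain AND;
     * a final XOR restores the architectural encoding. */
    static uint64_t effective_access(const uint64_t *ptes, int levels)
    {
            uint64_t access = ~0ULL;   /* start fully permissive */
            int i;

            for (i = 0; i < levels; i++)
                    access &= ptes[i] ^ PT_NX;

            return access ^ PT_NX;
    }

    int main(void)
    {
            /* One level allows execution, the other sets NX: result is NX. */
            uint64_t ptes[2] = { PT_WRITABLE | PT_USER, PT_USER | PT_NX };
            uint64_t acc = effective_access(ptes, 2);

            printf("w=%d u=%d nx=%d\n", !!(acc & PT_WRITABLE),
                   !!(acc & PT_USER), !!(acc & PT_NX));
            return 0;
    }

This prints w=0 u=1 nx=1: the combined mapping is user-accessible but neither writable nor executable.
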
@@ -417,10 +430,8 @@ retry_walk: goto retry_walk; } - walker->pt_access = pt_access; - walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pte_access, pt_access); + __func__, (u64)pte, walker->pte_access, walker->pt_access); return 1; error: @@ -448,7 +459,7 @@ error: */ if (!(errcode & PFERR_RSVD_MASK)) { vcpu->arch.exit_qualification &= 0x187; - vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3; + vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; } #endif walker->fault.address = addr; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index 9d4a8504a95a..5ab4a364348e 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -294,7 +294,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c27ac6923a18..ba9891ac5c56 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1272,7 +1272,8 @@ static void init_vmcb(struct vcpu_svm *svm) } -static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, + unsigned int index) { u64 *avic_physical_id_table; struct kvm_arch *vm_data = &vcpu->kvm->arch; @@ -1806,7 +1807,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, * AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ - var->unusable = !var->present || (var->type == 0); + var->unusable = !var->present; switch (seg) { case VCPU_SREG_TR: @@ -1839,6 +1840,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + /* This is symmetric with svm_set_segment() */ var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } @@ -1979,18 +1981,14 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->base = var->base; s->limit = var->limit; s->selector = var->selector; - if (var->unusable) - s->attrib = 0; - else { - s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); - s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; - s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; - s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; - s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; - s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; - s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; - s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; - } + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; /* * This is always accurate, except if SYSRET returned to a segment @@ -1999,7 +1997,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, * would entail passing the CPL to userspace and back. 
*/ if (seg == VCPU_SREG_SS) - svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + /* This is symmetric with svm_get_segment() */ + svm->vmcb->save.cpl = (var->dpl & 3); mark_dirty(svm->vmcb, VMCB_SEG); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c5fd459c4043..ca5d2b93385c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -248,6 +248,7 @@ struct __packed vmcs12 { u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; + u64 pml_address; u64 guest_ia32_debugctl; u64 guest_ia32_pat; u64 guest_ia32_efer; @@ -369,6 +370,7 @@ struct __packed vmcs12 { u16 guest_ldtr_selector; u16 guest_tr_selector; u16 guest_intr_status; + u16 guest_pml_index; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -407,6 +409,7 @@ struct nested_vmx { /* Has the level1 guest done vmxon? */ bool vmxon; gpa_t vmxon_ptr; + bool pml_full; /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; @@ -742,6 +745,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), FIELD(GUEST_INTR_STATUS, guest_intr_status), + FIELD(GUEST_PML_INDEX, guest_pml_index), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -767,6 +771,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), + FIELD64(PML_ADDRESS, pml_address), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), FIELD64(GUEST_IA32_PAT, guest_ia32_pat), FIELD64(GUEST_IA32_EFER, guest_ia32_efer), @@ -1314,6 +1319,11 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } +static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) +{ + return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1348,6 +1358,11 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); +} + static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -2410,7 +2425,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); return 1; @@ -2751,8 +2766,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_1GB_PAGE_BIT; - if (enable_ept_ad_bits) + if (enable_ept_ad_bits) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_PML; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + } } else vmx->nested.nested_vmx_ept_caps = 0; @@ -6486,7 +6504,7 @@ static __init int hardware_setup(void) enable_ept_ad_bits = 0; } - if (!cpu_has_vmx_ept_ad_bits()) + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; if (!cpu_has_vmx_unrestricted_guest()) 
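The hunk below collapses nested_vmx_check_vmptr() into a plain nested_vmx_get_vmptr() read and moves the per-instruction failure handling to the callers; the validity test itself is unchanged: the pointer must be 4KB aligned and must not set bits beyond the physical address width (Intel SDM Section 30.3). A standalone sketch of that test (plain C, not kernel code; maxphyaddr would typically come from CPUID leaf 0x80000008 in real code):

    #include <stdbool.h>
    #include <stdint.h>

    static bool vmptr_is_valid(uint64_t vmptr, unsigned int maxphyaddr)
    {
            if (vmptr & 0xfffULL)        /* not 4KB aligned */
                    return false;
            if (vmptr >> maxphyaddr)     /* bits above the PA width set */
                    return false;
            return true;
    }
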
@@ -6896,97 +6914,21 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, return 0; } -/* - * This function performs the various checks including - * - if it's 4KB aligned - * - No bits beyond the physical address width are set - * - Returns 0 on success or else 1 - * (Intel SDM Section 30.3) - */ -static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, - gpa_t *vmpointer) +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; - gpa_t vmptr; struct x86_exception e; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer, + sizeof(*vmpointer), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } - switch (exit_reason) { - case EXIT_REASON_VMON: - /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 - * for the nested case; - * which replaces physical address width with 32 - * - */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - kunmap(page); - nested_release_page_clean(page); - vmx->nested.vmxon_ptr = vmptr; - break; - case EXIT_REASON_VMCLEAR: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - case EXIT_REASON_VMPTRLD: - if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } - - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } - break; - default: - return 1; /* shouldn't happen */ - } - - if (vmpointer) - *vmpointer = vmptr; return 0; } @@ -7048,6 +6990,8 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; + gpa_t vmptr; + struct page *page; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -7077,9 +7021,37 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - + + /* + * SDM 3: 24.11.5 + * The first 4 bytes of VMXON region contain the supported + * VMCS revision identifier + * + * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; + * which replaces physical address width with 32 + */ + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + + page = nested_get_page(vcpu, vmptr); + if 
(page == NULL) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + if (*(u32 *)kmap(page) != VMCS12_REVISION) { + kunmap(page); + nested_release_page_clean(page); + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + kunmap(page); + nested_release_page_clean(page); + + vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); if (ret) return ret; @@ -7195,9 +7167,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); @@ -7527,9 +7509,19 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) + if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); + return kvm_skip_emulated_instruction(vcpu); + } + + if (vmptr == vmx->nested.vmxon_ptr) { + nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); + return kvm_skip_emulated_instruction(vcpu); + } + if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; @@ -7895,11 +7887,13 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; - int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_readl(vcpu, reg); + int reg; + unsigned long val; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ + reg = (exit_qualification >> 8) & 15; + val = kvm_register_readl(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask & @@ -7954,6 +7948,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit. */ + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe & (val ^ vmcs12->cr0_read_shadow)) return true; @@ -8114,7 +8109,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_PREEMPTION_TIMER: return false; case EXIT_REASON_PML_FULL: - /* We don't expose PML support to L1. */ + /* We emulate PML support to L1. 
*/ return false; default: return true; @@ -9364,13 +9359,20 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason; + unsigned long exit_qualification = vcpu->arch.exit_qualification; - if (fault->error_code & PFERR_RSVD_MASK) + if (vmx->nested.pml_full) { + exit_reason = EXIT_REASON_PML_FULL; + vmx->nested.pml_full = false; + exit_qualification &= INTR_INFO_UNBLOCK_NMI; + } else if (fault->error_code & PFERR_RSVD_MASK) exit_reason = EXIT_REASON_EPT_MISCONFIG; else exit_reason = EXIT_REASON_EPT_VIOLATION; - nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); + + nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; } @@ -9713,6 +9715,22 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u64 address = vmcs12->pml_address; + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { + if (!nested_cpu_has_ept(vmcs12) || + !IS_ALIGNED(address, 4096) || + address >> maxphyaddr) + return -EINVAL; + } + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -9886,7 +9904,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -10017,8 +10035,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - exec_control |= vmcs12->secondary_vm_exec_control; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { + vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & + ~SECONDARY_EXEC_ENABLE_PML; + exec_control |= vmcs12_exec_ctrl; + } if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, @@ -10248,6 +10269,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_pml_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || @@ -10266,6 +10290,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.nested_vmx_entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || !nested_cr3_valid(vcpu, vmcs12->host_cr3)) @@ -11143,6 +11170,46 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t gpa; + struct page *page = NULL; + u64 *pml_address; + + if (is_guest_mode(vcpu)) { + 
WARN_ON_ONCE(vmx->nested.pml_full); + + /* + * Check if PML is enabled for the nested guest. + * Whether eptp bit 6 is set is already checked + * as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + + page = nested_get_page(vcpu, vmcs12->pml_address); + if (!page) + return 0; + + pml_address = kmap(page); + pml_address[vmcs12->guest_pml_index--] = gpa; + kunmap(page); + nested_release_page_clean(page); + } + + return 0; +} + static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -11502,6 +11569,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b38a302858a0..87d3cb901935 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1763,6 +1763,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + u64 ret; spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { @@ -1774,10 +1775,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; spin_unlock(&ka->pvclock_gtod_sync_lock); + /* both __this_cpu_read() and rdtsc() should be on the same cpu */ + get_cpu(); + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - return __pvclock_read_cycles(&hv_clock, rdtsc()); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + + put_cpu(); + + return ret; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v) @@ -3288,11 +3296,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, } } +#define XSAVE_MXCSR_OFFSET 24 + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* @@ -3300,11 +3311,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. 
*/ - if (xstate_bv & ~kvm_supported_xcr0()) + if (xstate_bv & ~kvm_supported_xcr0() || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE || + mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); @@ -4818,16 +4831,20 @@ emul_write: static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { - /* TODO: String I/O for in kernel device */ - int r; + int r = 0, i; - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); + for (i = 0; i < vcpu->arch.pio.count; i++) { + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + if (r) + break; + pd += vcpu->arch.pio.size; + } return r; } @@ -4865,6 +4882,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (vcpu->arch.pio.count) goto data_avail; + memset(vcpu->arch.pio_data, 0, size * count); + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); if (ret) { data_avail: @@ -5048,6 +5067,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, if (var.unusable) { memset(desc, 0, sizeof(*desc)); + if (base3) + *base3 = 0; return false; } @@ -8190,13 +8211,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; @@ -8373,10 +8394,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; - if (atomic_read(&vcpu->arch.nmi_queued)) + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_ops->nmi_allowed(vcpu))) return true; - if (kvm_test_request(KVM_REQ_SMI, vcpu)) + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && !is_smm(vcpu))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -8583,8 +8607,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_can_do_async_pf(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 7e48807b2fa1..45a53dfe1859 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -55,7 +55,7 @@ ENTRY(csum_partial_copy_generic) movq %r12, 3*8(%rsp) movq %r14, 4*8(%rsp) movq %r13, 5*8(%rsp) - movq %rbp, 6*8(%rsp) + movq %r15, 6*8(%rsp) movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -74,7 +74,7 @@ ENTRY(csum_partial_copy_generic) /* main loop. 
clear in 64 byte blocks */ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */ .p2align 4 .Lloop: source @@ -89,7 +89,7 @@ ENTRY(csum_partial_copy_generic) source movq 32(%rdi), %r10 source - movq 40(%rdi), %rbp + movq 40(%rdi), %r15 source movq 48(%rdi), %r14 source @@ -103,7 +103,7 @@ ENTRY(csum_partial_copy_generic) adcq %r11, %rax adcq %rdx, %rax adcq %r10, %rax - adcq %rbp, %rax + adcq %r15, %rax adcq %r14, %rax adcq %r13, %rax @@ -121,7 +121,7 @@ ENTRY(csum_partial_copy_generic) dest movq %r10, 32(%rsi) dest - movq %rbp, 40(%rsi) + movq %r15, 40(%rsi) dest movq %r14, 48(%rsi) dest @@ -203,7 +203,7 @@ ENTRY(csum_partial_copy_generic) movq 3*8(%rsp), %r12 movq 4*8(%rsp), %r14 movq 5*8(%rsp), %r13 - movq 6*8(%rsp), %rbp + movq 6*8(%rsp), %r15 addq $7*8, %rsp ret diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index 5761a4f19455..ab2d1d73e9e7 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -5,6 +5,7 @@ * kernel starts. This file is included in the compressed kernel and * normally linked in the regular. */ +#include <asm/asm.h> #include <asm/kaslr.h> #include <asm/msr.h> #include <asm/archrandom.h> @@ -79,7 +80,7 @@ unsigned long kaslr_get_random_long(const char *purpose) } /* Circular multiply for better bit diffusion */ - asm("mul %3" + asm(_ASM_MUL "%3" : "=a" (random), "=d" (raw) : "a" (random), "rm" (mix_const)); random += raw; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 04210a29dd60..adab1595f4bd 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -13,7 +13,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, if (pmd_present(*pmd)) continue; - set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); + set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag)); } } @@ -30,6 +30,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (next > end) next = end; + if (info->direct_gbpages) { + pud_t pudval; + + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; + pudval = __pud((addr - info->offset) | info->page_flag); + set_pud(pud, pudval); + continue; + } + if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); ident_pmd_init(info, pmd, addr, next); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 138bad2fb6bc..cbc87ea98751 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,7 +5,7 @@ #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f34d275ee201..99fb83819a5f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -48,7 +48,7 @@ #include <asm/sections.h> #include <asm/paravirt.h> #include <asm/setup.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/page_types.h> #include <asm/init.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 745e5e183169..95651dc58e09 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,7 +50,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/numa.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/init.h> #include <asm/uv/uv.h> #include <asm/setup.h> @@ -94,10 +94,10 @@ __setup("noexec32=", nonx32_setup); */ void 
sync_global_pgds(unsigned long start, unsigned long end) { - unsigned long address; + unsigned long addr; - for (address = start; address <= end; address += PGDIR_SIZE) { - pgd_t *pgd_ref = pgd_offset_k(address); + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd_ref = pgd_offset_k(addr); const p4d_t *p4d_ref; struct page *page; @@ -106,7 +106,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) * handle synchronization on p4d level. */ BUILD_BUG_ON(pgd_none(*pgd_ref)); - p4d_ref = p4d_offset(pgd_ref, address); + p4d_ref = p4d_offset(pgd_ref, addr); if (p4d_none(*p4d_ref)) continue; @@ -117,8 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end) p4d_t *p4d; spinlock_t *pgt_lock; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - p4d = p4d_offset(pgd, address); + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d = p4d_offset(pgd, addr); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e4f7b25df18e..bbc558b88a88 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -14,7 +14,7 @@ #include <linux/vmalloc.h> #include <linux/mmiotrace.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/fixmap.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 6b7ce6279133..aca6295350f3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -100,5 +100,6 @@ void __init initmem_init(void) printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); + __vmalloc_start_set = true; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 56b22fa504df..c8520b2c62d2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -24,6 +24,7 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pat.h> +#include <asm/set_memory.h> /* * The current flushing context - we pass it instead of 5 arguments: @@ -185,7 +186,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - BUG_ON(irqs_disabled()); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); on_each_cpu(__cpa_flush_range, NULL, 1); diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 38868adf07ea..f6ae6830b341 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -9,7 +9,7 @@ #include <linux/mmiotrace.h> static unsigned long mmio_address; -module_param(mmio_address, ulong, 0); +module_param_hw(mmio_address, ulong, iomem, 0); MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " "(or 8 MB if read_far is non-zero)."); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 14f840df1d95..f58939393eef 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <linux/bpf.h> int bpf_jit_enable __read_mostly; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 6fa84d531f4f..7b4307163eac 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -406,50 +406,3 @@ void __init pcibios_resource_survey(void) */ ioapic_insert_resources(); } - -static const struct vm_operations_struct pci_mmap_ops = { - .access =
generic_access_phys, -}; - -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine) -{ - unsigned long prot; - - /* I/O space cannot be accessed via normal processor loads and - * stores on this platform. - */ - if (mmap_state == pci_mmap_io) - return -EINVAL; - - prot = pgprot_val(vma->vm_page_prot); - - /* - * Return error if pat is not enabled and write_combine is requested. - * Caller can followup with UC MINUS request and add a WC mtrr if there - * is a free mtrr slot. - */ - if (!pat_enabled() && write_combine) - return -EINVAL; - - if (pat_enabled() && write_combine) - prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled() || boot_cpu_data.x86 > 3) - /* - * ioremap() and ioremap_nocache() defaults to UC MINUS for now. - * To avoid attribute conflicts, request UC MINUS here - * as well. - */ - prot |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); - - vma->vm_page_prot = __pgprot(prot); - - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - - vma->vm_ops = &pci_mmap_ops; - - return 0; -} diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 29e9ba6ace9d..c1bdb9edcae7 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -11,7 +11,7 @@ #include <asm/pci_x86.h> #include <asm/e820/types.h> #include <asm/pci-functions.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a15cf815ac4e..43b96f5f78ba 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -49,7 +49,7 @@ #include <asm/efi.h> #include <asm/e820/api.h> #include <asm/time.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/uv/uv.h> @@ -828,9 +828,11 @@ static void __init kexec_enter_virtual_mode(void) /* * We don't do virtual mode, since we don't do runtime services, on - * non-native EFI + * non-native EFI. With efi=old_map, we don't do runtime services in + * kexec kernel because in the initial boot something else might + * have been mapped at these virtual addresses. */ - if (!efi_is_native()) { + if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) { efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c488625c9712..eb8dff15a7f6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -71,11 +71,13 @@ static void __init early_code_mapping_set_exec(int executable) pgd_t * __init efi_call_phys_prolog(void) { - unsigned long vaddress; - pgd_t *save_pgd; + unsigned long vaddr, addr_pgd, addr_p4d, addr_pud; + pgd_t *save_pgd, *pgd_k, *pgd_efi; + p4d_t *p4d, *p4d_k, *p4d_efi; + pud_t *pud; int pgd; - int n_pgds; + int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { save_pgd = (pgd_t *)read_cr3(); @@ -88,10 +90,49 @@ pgd_t * __init efi_call_phys_prolog(void) n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); + /* + * Build 1:1 identity mapping for efi=old_map usage. Note that + * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while + * it is PUD_SIZE ALIGNED with KASLR enabled. 
So for a given physical + * address X, the pud_index(X) != pud_index(__va(X)), we can only copy + * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping. + * This means here we can only reuse the PMD tables of the direct mapping. + */ for (pgd = 0; pgd < n_pgds; pgd++) { - save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); - vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); - set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); + addr_pgd = (unsigned long)(pgd * PGDIR_SIZE); + vaddr = (unsigned long)__va(pgd * PGDIR_SIZE); + pgd_efi = pgd_offset_k(addr_pgd); + save_pgd[pgd] = *pgd_efi; + + p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd); + if (!p4d) { + pr_err("Failed to allocate p4d table!\n"); + goto out; + } + + for (i = 0; i < PTRS_PER_P4D; i++) { + addr_p4d = addr_pgd + i * P4D_SIZE; + p4d_efi = p4d + p4d_index(addr_p4d); + + pud = pud_alloc(&init_mm, p4d_efi, addr_p4d); + if (!pud) { + pr_err("Failed to allocate pud table!\n"); + goto out; + } + + for (j = 0; j < PTRS_PER_PUD; j++) { + addr_pud = addr_p4d + j * PUD_SIZE; + + if (addr_pud > (max_pfn << PAGE_SHIFT)) + break; + + vaddr = (unsigned long)__va(addr_pud); + + pgd_k = pgd_offset_k(vaddr); + p4d_k = p4d_offset(pgd_k, vaddr); + pud[j] = *pud_offset(p4d_k, vaddr); + } + } } out: __flush_tlb_all(); @@ -104,8 +145,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) /* * After the lock is released, the original page table is restored. */ - int pgd_idx; + int pgd_idx, i; int nr_pgds; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; if (!efi_enabled(EFI_OLD_MEMMAP)) { write_cr3((unsigned long)save_pgd); @@ -115,9 +159,28 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); - for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) + for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) { + pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE); set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + continue; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_offset(pgd, + pgd_idx * PGDIR_SIZE + i * P4D_SIZE); + + if (!(p4d_val(*p4d) & _PAGE_PRESENT)) + continue; + + pud = (pud_t *)p4d_page_vaddr(*p4d); + pud_free(&init_mm, pud); + } + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + p4d_free(&init_mm, p4d); + } + kfree(save_pgd); __flush_tlb_all(); diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 26615991d69c..e0cf95a83f3f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -360,6 +360,9 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } + if (!num_entries) + return; + new_size = efi.memmap.desc_size * num_entries; new_phys = efi_memmap_alloc(num_entries); if (!new_phys) { diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 6a61194ffd58..a6e21fee22ea 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -104,7 +104,7 @@ static int set_up_temporary_mappings(void) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, - .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 5db706f14111..a163a90af4aa 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -2,7 +2,7 @@ #include <linux/slab.h> #include <linux/memblock.h> -#include <asm/cacheflush.h> +#include <asm/set_memory.h> #include <asm/pgtable.h> #include 
<asm/realmode.h> #include <asm/tlbflush.h> diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index a5c9910d234f..09a085bde0d4 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -125,7 +125,7 @@ int poke_user(struct task_struct *child, long addr, long data) else if ((addr >= offsetof(struct user, u_debugreg[0])) && (addr <= offsetof(struct user, u_debugreg[7]))) { addr -= offsetof(struct user, u_debugreg[0]); - addr = addr >> 2; + addr = addr >> 3; if ((addr == 4) || (addr == 5)) return -EIO; child->thread.arch.debugregs[addr] = data; diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 46a9df99f3c5..7e1d35b6ad5c 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -2,16 +2,9 @@ #include <linux/sched.h> #include <linux/elf.h> #include <linux/crypto.h> +#include <linux/kbuild.h> #include <asm/mman.h> -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -#define OFFSET(sym, str, mem) \ - DEFINE(sym, offsetof(struct str, mem)); - void foo(void) { #include <common-offsets.h> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index a732bc2b9dfc..f33eef4ebd12 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -142,9 +142,7 @@ static void __init xen_banner(void) struct xen_extraversion extra; HYPERVISOR_xen_version(XENVER_extraversion, &extra); - pr_info("Booting paravirtualized kernel %son %s\n", - xen_feature(XENFEAT_auto_translated_physmap) ? - "with PVH extensions " : "", pv_info.name); + pr_info("Booting paravirtualized kernel on %s\n", pv_info.name); printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); @@ -277,31 +275,19 @@ static bool __init xen_check_mwait(void) static bool __init xen_check_xsave(void) { - unsigned int err, eax, edx; + unsigned int cx, xsave_mask; - /* - * Xen 4.0 and older accidentally leaked the host XSAVE flag into guest - * view, despite not being able to support guests using the - * functionality. Probe for the actual availability of XSAVE by seeing - * whether xgetbv executes successfully or raises #UD. 
- */ - asm volatile("1: .byte 0x0f,0x01,0xd0\n\t" /* xgetbv */ - "xor %[err], %[err]\n" - "2:\n\t" - ".pushsection .fixup,\"ax\"\n\t" - "3: movl $1,%[err]\n\t" - "jmp 2b\n\t" - ".popsection\n\t" - _ASM_EXTABLE(1b, 3b) - : [err] "=r" (err), "=a" (eax), "=d" (edx) - : "c" (0)); - - return err == 0; + cx = cpuid_ecx(1); + + xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); + + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + return (cx & xsave_mask) == xsave_mask; } static void __init xen_init_capabilities(void) { - setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS); setup_force_cpu_cap(X86_FEATURE_XENPV); setup_clear_cpu_cap(X86_FEATURE_DCA); setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); @@ -317,10 +303,7 @@ static void __init xen_init_capabilities(void) else setup_clear_cpu_cap(X86_FEATURE_MWAIT); - if (xen_check_xsave()) { - setup_force_cpu_cap(X86_FEATURE_XSAVE); - setup_force_cpu_cap(X86_FEATURE_OSXSAVE); - } else { + if (!xen_check_xsave()) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_OSXSAVE); } @@ -972,15 +955,10 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) void xen_setup_shared_info(void) { - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - set_fixmap(FIX_PARAVIRT_BOOTMAP, - xen_start_info->shared_info); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - } else - HYPERVISOR_shared_info = - (struct shared_info *)__va(xen_start_info->shared_info); + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); #ifndef CONFIG_SMP /* In UP this is as good a place as any to set up shared info */ @@ -988,6 +966,13 @@ void xen_setup_shared_info(void) #endif xen_setup_mfn_list_list(); + + /* + * Now that shared info is set up we can start using routines that + * point to pvclock area. + */ + if (system_state == SYSTEM_BOOTING) + xen_init_time_ops(); } /* This is called once we have the cpu_possible_mask */ @@ -1286,8 +1271,6 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; - xen_init_time_ops(); - /* * Set up some pagetable state before starting to set any ptes. */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e375a5e815f..3be06f3caf3c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,7 +42,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr) } EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); -void xen_flush_tlb_all(void) +static void xen_flush_tlb_all(void) { struct mmuext_op *op; struct multicall_space mcs; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 9d9ae6650aa1..1f386d7fdf70 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -355,10 +355,8 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) pteval_t flags = val & PTE_FLAGS_MASK; unsigned long mfn; - if (!xen_feature(XENFEAT_auto_translated_physmap)) - mfn = __pfn_to_mfn(pfn); - else - mfn = pfn; + mfn = __pfn_to_mfn(pfn); + /* * If there's no mfn for the pfn, then just create an * empty non-present pte. Unfortunately this loses @@ -647,9 +645,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, limit--; BUG_ON(limit >= FIXADDR_TOP); - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - /* * 64-bit has a great big hole in the middle of the address * space, which contains the Xen mappings. 
On 32-bit these @@ -1289,9 +1284,6 @@ static void __init xen_pagetable_cleanhighmap(void) static void __init xen_pagetable_p2m_setup(void) { - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - xen_vmalloc_p2m_tree(); #ifdef CONFIG_X86_64 @@ -1314,8 +1306,7 @@ static void __init xen_pagetable_init(void) xen_build_mfn_list_list(); /* Remap memory freed due to conflicts with E820 map */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_remap_memory(); + xen_remap_memory(); xen_setup_shared_info(); } @@ -1925,21 +1916,20 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt */ + /* L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_k[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt */ + /* L3_k[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + + /* L3_k[511][506] -> level1_fixmap_pgt */ + convert_pfn_mfn(level2_fixmap_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ - convert_pfn_mfn(level2_fixmap_pgt); - } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); @@ -1962,34 +1952,30 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) if (i && i < pgd_index(__START_KERNEL_map)) init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. 
- */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } else - native_write_cr3(__pa(init_level4_pgt)); + /* + * At this stage there can be no user pgd, and no page structure to + * attach it to, so make sure we just set kernel pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -2025,7 +2011,8 @@ static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) /* * Translate a virtual address to a physical one without relying on mapped - * page tables. + * page tables. Don't rely on big pages being aligned in (guest) physical + * space! */ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) { @@ -2046,7 +2033,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) sizeof(pud))); if (!pud_present(pud)) return 0; - pa = pud_pfn(pud) << PAGE_SHIFT; + pa = pud_val(pud) & PTE_PFN_MASK; if (pud_large(pud)) return pa + (vaddr & ~PUD_MASK); @@ -2054,7 +2041,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) sizeof(pmd))); if (!pmd_present(pmd)) return 0; - pa = pmd_pfn(pmd) << PAGE_SHIFT; + pa = pmd_val(pmd) & PTE_PFN_MASK; if (pmd_large(pmd)) return pa + (vaddr & ~PMD_MASK); @@ -2402,9 +2389,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void __init xen_post_allocator_init(void) { - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; @@ -2510,9 +2494,6 @@ void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2649,9 +2630,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, * this function are redundant and can be ignored. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - if (unlikely(order > MAX_CONTIG_ORDER)) return -ENOMEM; @@ -2688,9 +2666,6 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) int success; unsigned long vstart; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - if (unlikely(order > MAX_CONTIG_ORDER)) return; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 090c7eb4dca9..a1895a8e85c1 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -406,7 +406,7 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __init xen_init_time_ops(void) +void __ref xen_init_time_ops(void) { pv_time_ops = xen_time_ops;
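The vmx_write_pml_buffer() callback added above emulates Page Modification Logging for a nested guest: the PML buffer is a 4 KiB page holding up to PML_ENTITY_NUM (512) guest-physical addresses, and the index counts down; once it has run past entry 0 the buffer is full and a PML_FULL exit is synthesized for L1. A minimal sketch of that buffer discipline, with illustrative names (log_dirty_gpa() is not a kernel function):

#include <stdint.h>

#define PML_ENTITY_NUM 512      /* 4 KiB buffer of 8-byte GPA entries */

/* Returns 0 if the GPA was logged, 1 if the buffer is full and a
 * PML-full exit should be delivered. The 16-bit index wraps to 0xffff
 * after entry 0 is consumed, which is what the >= test catches. */
static int log_dirty_gpa(uint64_t *buf, uint16_t *index, uint64_t gpa)
{
        if (*index >= PML_ENTITY_NUM)
                return 1;
        buf[(*index)--] = gpa;  /* write the entry, then decrement */
        return 0;
}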
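The get_kvmclock_ns() change above is a general pattern worth noting: __this_cpu_read(cpu_tsc_khz) and rdtsc() only describe the same clock if both execute on the same CPU, so the pair is now bracketed with get_cpu()/put_cpu() to hold off preemption. Reduced to a kernel-context sketch (scale_to_ns() is a stand-in for the kvm_get_time_scale()/__pvclock_read_cycles() pair):

static u64 sample_kvmclock(void)
{
        u64 ns;

        get_cpu();      /* disable preemption: stay on this CPU */
        ns = scale_to_ns(__this_cpu_read(cpu_tsc_khz), rdtsc());
        put_cpu();

        return ns;
}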
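The kernel_pio() rework above resolves the old "String I/O for in kernel device" TODO: rather than issuing a single bus access, it now performs one access per element of the string operation, advancing the data pointer by the port size and stopping at the first failure. The loop shape, stripped of KVM specifics (bus_access() is a placeholder for kvm_io_bus_read()/kvm_io_bus_write()):

static int pio_all(void *pd, int count, int size, bool in)
{
        int r = 0, i;

        for (i = 0; i < count; i++) {
                r = bus_access(pd, size, in);   /* placeholder helper */
                if (r)
                        break;  /* report the partial transfer */
                pd += size;     /* next element (GNU void * arithmetic) */
        }
        return r;
}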
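The sync_global_pgds() loop above steps with ALIGN(addr + 1, PGDIR_SIZE) instead of addr += PGDIR_SIZE. The difference shows up when start is not PGDIR_SIZE aligned: a fixed stride keeps the misalignment and can step past end even though the range still covers one more PGD entry, whereas snapping to the next boundary visits every covered entry. A sketch with hypothetical numbers (1 GiB entries for brevity):

/* start = 0x7fff0000, end = 0x80000000, entry size = 0x40000000:
 *
 *   addr += 0x40000000          -> 0x7fff0000, then 0xbfff0000 > end:
 *                                  the entry at 0x80000000 is skipped.
 *   addr = ALIGN(addr + 1, ...) -> 0x7fff0000, then 0x80000000 <= end:
 *                                  both covered entries are visited.
 */
for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE))
        sync_one_entry(addr);   /* hypothetical per-entry body */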
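The efi_call_phys_prolog() rewrite above copies the direct mapping into the 1:1 map one PUD entry at a time because, as its comment notes, KASLR only guarantees PUD_SIZE alignment of PAGE_OFFSET, so a physical address and its direct-map alias can land in different slots of their respective tables. A small self-contained demonstration of that index mismatch (the PAGE_OFFSET value is hypothetical):

#include <stdint.h>
#include <stdio.h>

#define PUD_SHIFT       30
#define PTRS_PER_PUD    512

static unsigned int pud_index(uint64_t addr)
{
        return (addr >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}

int main(void)
{
        /* A KASLR base that is PUD- but not PGDIR-aligned. */
        uint64_t page_offset = 0xffff888040000000ULL;
        uint64_t x = 0x100000000ULL;    /* some physical address */

        /* Prints "4 vs 5": PGD-granular copying would fill the wrong
         * PUD slots, hence the explicit per-PUD copy in the patch. */
        printf("%u vs %u\n", pud_index(x), pud_index(page_offset + x));
        return 0;
}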
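The xen_early_virt_to_phys() fix above follows from the warning added to its comment: a big page mapped by Xen's initial page tables need not be naturally aligned in guest-physical space. pud_pfn()/pmd_pfn() mask large entries at huge-page granularity, silently dropping the unaligned low bits of the base, while masking the raw entry with PTE_PFN_MASK keeps the base at 4 KiB granularity. The PUD-level case, annotated (the base address is a made-up example):

/* Suppose the entry maps a "1 GiB" page whose base is 0x40200000.
 * pud_pfn(pud) << PAGE_SHIFT would round the base down to 0x40000000,
 * making every translation through it off by 0x200000. Masking with
 * PTE_PFN_MASK (bits 12 and up) preserves the real base. */
pa = pud_val(pud) & PTE_PFN_MASK;
if (pud_large(pud))
        return pa + (vaddr & ~PUD_MASK);        /* offset in the big page */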