diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-06-26 13:55:59 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-06-26 13:55:59 +1000 |
commit | 0a705313c1caa31a5f597a4688afc800e2306d9a (patch) | |
tree | 85507ace9c09cc48b39f2282e281038a9f93aa8e /arch | |
parent | a57c9190fc1b8d642ad24c8a277603a4382c3983 (diff) | |
parent | 7e57c8c2c787c5283d66e2bda1589175c0cecbc8 (diff) |
Merge commit 'tip/auto-latest'
Conflicts:
arch/x86/include/asm/termios.h
Diffstat (limited to 'arch')
38 files changed, 1309 insertions, 1851 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f9..4b3388447498 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1247,6 +1247,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4f..2e7529295f55 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -26,6 +26,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void __math_state_restore(void); extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06e..ec34c760665e 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,94 +1 @@ -#ifndef _ASM_X86_IOCTLS_H -#define _ASM_X86_IOCTLS_H - -#include <asm/ioctl.h> - -/* 0x54 is just a magic number to make these relatively unique ('T') */ - -#define TCGETS 0x5401 -#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ -#define TCSETSW 0x5403 -#define TCSETSF 0x5404 -#define TCGETA 0x5405 -#define TCSETA 0x5406 -#define TCSETAW 0x5407 -#define TCSETAF 0x5408 -#define TCSBRK 0x5409 -#define TCXONC 0x540A -#define TCFLSH 0x540B -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E -#define TIOCGPGRP 0x540F -#define TIOCSPGRP 0x5410 -#define TIOCOUTQ 0x5411 -#define TIOCSTI 0x5412 -#define TIOCGWINSZ 0x5413 -#define TIOCSWINSZ 0x5414 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define FIONREAD 0x541B -#define TIOCINQ FIONREAD -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -#define FIONBIO 0x5421 -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TCGETS2 _IOR('T', 0x2A, struct termios2) -#define TCSETS2 _IOW('T', 0x2B, struct termios2) -#define TCSETSW2 _IOW('T', 0x2C, struct termios2) -#define TCSETSF2 _IOW('T', 0x2D, struct termios2) -#define TIOCGRS485 0x542E -#define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) - /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ -#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ -#define TCSETX 0x5433 -#define TCSETXF 0x5434 -#define TCSETXW 0x5435 - -#define FIONCLEX 0x5450 -#define FIOCLEX 0x5451 -#define FIOASYNC 0x5452 -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ -#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -#define FIOQSIZE 0x5460 - -/* Used for packet mode */ -#define TIOCPKT_DATA 0 -#define TIOCPKT_FLUSHREAD 1 -#define TIOCPKT_FLUSHWRITE 2 -#define TIOCPKT_STOP 4 -#define TIOCPKT_START 8 -#define TIOCPKT_NOSTOP 16 -#define TIOCPKT_DOSTOP 32 - -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -#endif /* _ASM_X86_IOCTLS_H */ +#include <asm-generic/ioctls.h> diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd51594..84c7e51cb6d0 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,28 +1 @@ -#ifndef _ASM_X86_IPCBUF_H -#define _ASM_X86_IPCBUF_H - -/* - * The ipc64_perm structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 32-bit mode_t and seq - * - 2 miscellaneous 32-bit values - */ - -struct ipc64_perm { - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - __kernel_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* _ASM_X86_IPCBUF_H */ +#include <asm-generic/ipcbuf.h> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec9..b50b9e9042c4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed9..593e51d4643f 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include <asm-generic/mman-common.h> - #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d5..555bc12bdcd6 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,18 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; - -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr -#else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr -#endif +#include <asm-generic/module.h> #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51c..809134c644a6 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,39 +1 @@ -#ifndef _ASM_X86_MSGBUF_H -#define _ASM_X86_MSGBUF_H - -/* - * The msqid64_ds structure for i386 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on i386 is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on x8664 is left for: - * - 2 miscellaneous 64-bit values - */ -struct msqid64_ds { - struct ipc64_perm msg_perm; - __kernel_time_t msg_stime; /* last msgsnd time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t msg_rtime; /* last msgrcv time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t msg_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused4; - unsigned long __unused5; -}; - -#endif /* _ASM_X86_MSGBUF_H */ +#include <asm-generic/msgbuf.h> diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4ca..965d45427975 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,22 +1 @@ -#ifndef _ASM_X86_PARAM_H -#define _ASM_X86_PARAM_H - -#ifdef __KERNEL__ -# define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ -# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif /* _ASM_X86_PARAM_H */ +#include <asm-generic/param.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a0832..6a07af432c81 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,689 +7,11 @@ #include <asm/pgtable_types.h> #include <asm/asm.h> -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - -#include <asm/desc_defs.h> -#endif /* X86_64 */ - -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) +#include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/cpumask.h> -#include <asm/kmap_types.h> -#include <asm/desc_defs.h> - -struct page; -struct thread_struct; -struct desc_ptr; -struct tss_struct; -struct mm_struct; -struct desc_struct; -struct task_struct; - -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - -/* general info */ -struct pv_info { - unsigned int kernel_rpl; - int shared_kernel_pmd; - int paravirt_enabled; - const char *name; -}; - -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, - unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); -}; - - -struct pv_lazy_ops { - /* Set deferred update mode, used for batching operations. */ - void (*enter)(void); - void (*leave)(void); -}; - -struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - - unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); -}; - -struct pv_cpu_ops { - /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); - - void (*clts)(void); - - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - - /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); - void (*store_gdt)(struct desc_ptr *); - void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 - void (*load_gs_index)(unsigned int idx); -#endif - void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, - const void *desc); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, const void *desc, int size); - void (*write_idt_entry)(gate_desc *, - int entrynum, const gate_desc *gate); - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); - - void (*set_iopl_mask)(unsigned mask); - - void (*wbinvd)(void); - void (*io_delay)(void); - - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); - - /* - * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) - */ - void (*irq_enable_sysexit)(void); - - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - - void (*swapgs)(void); - - void (*start_context_switch)(struct task_struct *prev); - void (*end_context_switch)(struct task_struct *next); -}; - -struct pv_irq_ops { - void (*init_IRQ)(void); - - /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. - */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - - void (*safe_halt)(void); - void (*halt)(void); - -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; - -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - -struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. - */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long va); - - /* Hooks for allocating and freeing a pagetable top-level */ - int (*pgd_alloc)(struct mm_struct *mm); - void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); - - /* - * Hooks for allocating/releasing pagetable pages when they're - * attached to a pagetable - */ - void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); - void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); - void (*release_pte)(unsigned long pfn); - void (*release_pmd)(unsigned long pfn); - void (*release_pud)(unsigned long pfn); - - /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); - - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; - - struct paravirt_callee_save pgd_val; - struct paravirt_callee_save make_pgd; - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - - void (*set_pud)(pud_t *pudp, pud_t pudval); - - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; - -#if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; - - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ - -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - - struct pv_lazy_ops lazy_mode; - - /* dom0 ops */ - - /* Sometimes the physical address is a pfn, and sometimes its - an mfn. We can tell which is which from the index. */ - void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - phys_addr_t phys, pgprot_t flags); -}; - -struct raw_spinlock; -struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); -}; - -/* This contains all the paravirt structures: we get a convenient - * number for each function using the offset which we use to indicate - * what to patch. */ -struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; -}; - -extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; - -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - -unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); - -int paravirt_disable_iospace(void); - -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" - -/* - * These macros are intended to wrap calls through one of the paravirt - * ops structs, so that they can be later identified and patched at - * runtime. - * - * Normally, a call to a pv_op function is a simple indirect call: - * (pv_op_struct.operations)(args...). - * - * Unfortunately, this is a relatively slow operation for modern CPUs, - * because it cannot necessarily determine what the destination - * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or - * ideally, patch an inline implementation into the callsite. (Direct - * calls are essentially free, because the call and return addresses - * are completely predictable.) - * - * For i386, these macros rely on the standard gcc "regparm(3)" calling - * convention, in which the first three arguments are placed in %eax, - * %edx, %ecx (in that order), and the remaining arguments are placed - * on the stack. All caller-save registers (eax,edx,ecx) are expected - * to be modified (either clobbered or used for return values). - * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, - * %rdx, and %rcx. Note that for this reason, x86_64 does not need any - * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which - * unfortunately, are quite a bit (r8 - r11) - * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * - * Unfortunately there's no way to get gcc to generate the args setup - * for the call, and then allow the call itself to be generated by an - * inline asm. Because of this, we must do the complete arg setup and - * return value handling from within these macros. This is fairly - * cumbersome. - * - * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. - * It could be extended to more arguments, but there would be little - * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. - * - * When there is a return value, the invoker of the macro must specify - * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return - * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of - * the return value size. - * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments - * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments - * in low,high order - * - * Small structures are passed and returned in registers. The macro - * calling convention can't directly deal with this, so the wrapper - * functions must do this. - * - * These PVOP_* macros are only defined within this header. This - * means that all uses must be wrapped in inline functions. This also - * makes sure the incoming and outgoing types are always correct. - */ -#ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS -#define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ - "=S" (__esi), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" -#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) -#else -#define PVOP_TEST_NULL(op) ((void)op) -#endif - -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ - ({ \ - rettype __ret; \ - PVOP_CALL_ARGS; \ - PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ - } \ - __ret; \ - }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ - ({ \ - PVOP_VCALL_ARGS; \ - PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - }) - -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - -#define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") -#define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") - -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - -#define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - - -#define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - - -#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -#define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) - -/* This is the only difference in x86_64. We can make it much simpler */ -#ifdef CONFIG_X86_32 -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif static inline int paravirt_enabled(void) { @@ -1393,20 +715,6 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); - #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { @@ -1437,12 +745,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, pv_mmu_ops.set_fixmap(idx, phys, flags); } -void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - -#define paravirt_nop ((void *)_paravirt_nop) - #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static inline int __raw_spin_is_locked(struct raw_spinlock *lock) @@ -1479,17 +781,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) #endif -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ - u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ -}; - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" #define PV_RESTORE_REGS "popl %edx; popl %ecx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 000000000000..2b3371bae295 --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h @@ -0,0 +1,720 @@ +#ifndef _ASM_X86_PARAVIRT_TYPES_H +#define _ASM_X86_PARAVIRT_TYPES_H + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) + +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) +#define CLBR_SCRATCH (0) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) + +#define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + +#endif /* X86_64 */ + +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + +#ifndef __ASSEMBLY__ + +#include <asm/desc_defs.h> +#include <asm/kmap_types.h> + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; +struct task_struct; +struct cpumask; + +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. + */ + unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned long addr, unsigned len); + + /* Basic arch-specific setup */ + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*post_allocator_init)(void); + + /* Print a banner to identify the environment */ + void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. */ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); + + /* Set and set time of day */ + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + + unsigned long long (*sched_clock)(void); + unsigned long (*get_tsc_khz)(void); +}; + +struct pv_cpu_ops { + /* hooks for various privileged instructions */ + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); + + void (*clts)(void); + + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + +#ifdef CONFIG_X86_64 + unsigned long (*read_cr8)(void); + void (*write_cr8)(unsigned long); +#endif + + /* Segment descriptor handling */ + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif + void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, + const void *desc); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, const void *desc, int size); + void (*write_idt_entry)(gate_desc *, + int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); + void (*io_delay)(void); + + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr_amd)(unsigned int msr, int *err); + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(int counter); + unsigned long long (*read_tscp)(unsigned int *aux); + + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ + void (*irq_enable_sysexit)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. + */ + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; + + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; + +#if PAGETABLE_LEVELS == 4 + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. */ +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len); + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len); + +int paravirt_disable_iospace(void); + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + +/* + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (pv_op_struct.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to e a simple direct call, or + * ideally, patch an inline implementation into the callsite. (Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * For i386, these macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * X86_64, on the other hand, already specifies a register-based calling + * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * %rdx, and %rcx. Note that for this reason, x86_64 does not need any + * special handling for dealing with 4 arguments, unlike i386. + * However, x86_64 also have to clobber all caller saved registers, which + * unfortunately, are quite a bit (r8 - r11) + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend pv_init_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. + * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + }) + +#define __PVOP_VCALL(op, pre, post, ...) \ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + +/* This is the only difference in x86_64. We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#endif + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); + +void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + +#define paravirt_nop ((void *)_paravirt_nop) + +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a3..103f1ddb0d85 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include <linux/kernel.h> #include <linux/stringify.h> #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2eef..75af592677ec 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,33 +1,8 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#include <asm/types.h> - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -#define ARCH_HAS_SG_CHAIN #define ISA_DMA_THRESHOLD (0x00ffffff) -/* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) -#else -# define sg_dma_len(sg) ((sg)->dma_length) -#endif +#include <asm-generic/scatterlist.h> #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b74971..83c05fc2de38 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,51 +1 @@ -#ifndef _ASM_X86_SHMBUF_H -#define _ASM_X86_SHMBUF_H - -/* - * The shmid64_ds structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on 32 bit is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on 64 bit is left for: - * - 2 miscellaneous 64-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t shm_dtime; /* last detach time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t shm_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. of current attaches */ - unsigned long __unused4; - unsigned long __unused5; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* _ASM_X86_SHMBUF_H */ +#include <asm-generic/shmbuf.h> diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index ca8bf2cd0ba9..6b71384b9d8b 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,60 +1 @@ -#ifndef _ASM_X86_SOCKET_H -#define _ASM_X86_SOCKET_H - -#include <asm/sockios.h> - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ -#define SO_PASSCRED 16 -#define SO_PEERCRED 17 -#define SO_RCVLOWAT 18 -#define SO_SNDLOWAT 19 -#define SO_RCVTIMEO 20 -#define SO_SNDTIMEO 21 - -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#endif /* _ASM_X86_SOCKET_H */ +#include <asm-generic/socket.h> diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c9..def6d4746ee7 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,13 +1 @@ -#ifndef _ASM_X86_SOCKIOS_H -#define _ASM_X86_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_X86_SOCKIOS_H */ +#include <asm-generic/sockios.h> diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440f..3935b106de79 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,198 +1 @@ -#ifndef _ASM_X86_TERMBITS_H -#define _ASM_X86_TERMBITS_H - -#include <linux/posix_types.h> - -typedef unsigned char cc_t; -typedef unsigned int speed_t; -typedef unsigned int tcflag_t; - -#define NCCS 19 -struct termios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ -}; - -struct termios2 { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -struct ktermios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -/* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - -/* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 - -/* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 - -/* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 -#define EXTA B19200 -#define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 /* non standard rate */ -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - -/* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - -/* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 - -#endif /* _ASM_X86_TERMBITS_H */ +#include <asm-generic/termbits.h> diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index ee6f0c859244..280d78a9d966 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,113 +1 @@ -#ifndef _ASM_X86_TERMIOS_H -#define _ASM_X86_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -#ifdef __KERNEL__ - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - -#include <asm/uaccess.h> - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -/* - * Translate a "termio" structure into a "termios". Ugh. - */ -#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ - unsigned short __tmp; \ - get_user(__tmp,&(termio)->x); \ - *(unsigned short *) &(termios)->x = __tmp; \ -} - -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) -{ - SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); - get_user(termios->c_line, &termio->c_line); - return copy_from_user(termios->c_cc, termio->c_cc, NCC); -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - put_user((termios)->c_iflag, &(termio)->c_iflag); - put_user((termios)->c_oflag, &(termio)->c_oflag); - put_user((termios)->c_cflag, &(termio)->c_cflag); - put_user((termios)->c_lflag, &(termio)->c_lflag); - put_user((termios)->c_line, &(termio)->c_line); - return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); -} - -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios2 __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios2)); -} - -static inline int kernel_termios_to_user_termios(struct termios2 __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios2)); -} - -static inline int user_termios_to_kernel_termios_1(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios_1(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_TERMIOS_H */ +#include <asm-generic/termios.h> diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772f..df1da20f4534 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,19 +1,11 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#include <asm-generic/int-ll64.h> +#define dma_addr_t dma_addr_t -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; +#include <asm-generic/types.h> -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d9..b7c29c8017f2 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,12 +7,6 @@ * sigcontext struct (uc_mcontext). */ -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; +#include <asm-generic/ucontext.h> #endif /* _ASM_X86_UCONTEXT_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e5b27d8f1b47..28e5f5956042 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -258,13 +258,15 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; + int cpu = smp_processor_id(); bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; #endif } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968bc..7da8fec9ca88 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f789..a310d19faca3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -401,7 +401,7 @@ static const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */ + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { @@ -969,13 +969,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - /* - * Quirk, IA32_FIXED_CTRs do not work on current Atom processors: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86_model == 28) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) @@ -1428,8 +1421,6 @@ static int intel_pmu_init(void) */ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - /* * Install the hw-cache-events table: */ @@ -1514,6 +1505,7 @@ void __init init_hw_perf_counters(void) perf_counter_mask |= ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_counter_mask; perf_counters_lapic_init(); register_die_notifier(&perf_counter_nmi_notifier); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246b..f82706a3901d 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe1386038..ef42a038f1a6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984af..a80eddd41658 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -350,14 +350,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +405,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +419,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9d..a28279dbb07c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,9 +386,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -419,6 +427,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +474,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ @@ -480,15 +492,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 36cd891c49da..d1d9b0066a4a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -289,6 +289,20 @@ void * __init extend_brk(size_t size, size_t align) return ret; } +#ifdef CONFIG_X86_64 +static void __init init_gbpages(void) +{ + if (direct_gbpages && cpu_has_gbpages) + printk(KERN_INFO "Using GB pages for direct mapping\n"); + else + direct_gbpages = 0; +} +#else +static inline void init_gbpages(void) +{ +} +#endif + static void __init reserve_brk(void) { if (_brk_end > _brk_start) @@ -871,6 +885,8 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + init_gbpages(); + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6aa..29a3eef7cf4a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,105 +137,185 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. - */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = num_possible_cpus() * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); return -EINVAL; + } /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); - return ret; + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -249,7 +329,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -258,7 +338,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. */ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -308,8 +388,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -333,6 +416,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -346,11 +439,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -367,9 +455,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a0f48f5671c0..82d27f6dbadb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,6 +813,28 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) } /* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} + +/* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task * @@ -843,17 +865,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f53b57e4086f..47ce9a2ce5e7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -177,20 +177,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, return nr_range; } -#ifdef CONFIG_X86_64 -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} -#else -static inline void init_gbpages(void) -{ -} -#endif - /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -210,9 +196,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - if (!after_bootmem) - init_gbpages(); - #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ced8a4c..1b734d7a8966 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/pfn.h> #include <asm/e820.h> #include <asm/processor.h> @@ -681,8 +682,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr, remapped; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -706,42 +708,55 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif /* - * The high mapping range is imprecise, so ignore the return value. + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. */ - __change_page_attr_set_clr(&alias_cpa, 0); -#endif - return ret; + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b07dd8d0b321..28ee490c1b80 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -31,6 +31,26 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + event &= model->event_mask ? model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (event & 0x0F00) << 24; + + return val; +} + + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -52,26 +72,18 @@ static int profile_exceptions_notify(struct notifier_block *self, static void nmi_cpu_save_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); } - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); } } @@ -126,7 +138,7 @@ static void nmi_cpu_setup(void *dummy) int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); spin_lock(&oprofilefs_lock); - model->setup_ctrs(msrs); + model->setup_ctrs(model, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -178,26 +190,18 @@ static int nmi_setup(void) static void nmi_restore_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); } - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); } } @@ -402,6 +406,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec const *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; @@ -428,7 +433,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - arch_perfmon_setup_counters(); + spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -439,17 +444,7 @@ static int __init ppro_init(char **cpu_type) return 0; } - model = &op_ppro_spec; - return 1; -} - -static int __init arch_perfmon_init(char **cpu_type) -{ - if (!cpu_has_arch_perfmon) - return 0; - *cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - arch_perfmon_setup_counters(); + model = spec; return 1; } @@ -471,27 +466,26 @@ int __init op_nmi_init(struct oprofile_operations *ops) /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { - default: - return -ENODEV; case 6: - model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_amd_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; + default: + return -ENODEV; } + model = &op_amd_spec; break; case X86_VENDOR_INTEL: @@ -510,8 +504,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) break; } - if (!cpu_type && !arch_perfmon_init(&cpu_type)) + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; break; default: diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf9..cc930467575d 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -25,43 +25,28 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) static unsigned long reset_value[NUM_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL /*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 static int has_ibs; /* AMD Family10h and later */ @@ -99,49 +84,38 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) } } - -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; - CTR_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); + wrmsrl(msrs->counters[i].addr, + -(s64)counter_config[i].count); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -154,93 +128,116 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - u32 low, high; - u64 msr; + u64 val, ctl; struct op_entry entry; if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - oprofile_add_data(&entry, low); - oprofile_add_data(&entry, high); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; + ctl |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } return 1; } +static inline void op_amd_start_ibs(void) +{ + u64 val; + if (has_ibs && ibs_config.fetch_enabled) { + val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); + } + + if (has_ibs && ibs_config.op_enabled) { + val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, val); + } +} + +static void op_amd_stop_ibs(void) +{ + if (has_ibs && ibs_config.fetch_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + + if (has_ibs && ibs_config.op_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSOPCTL, 0); +} + +#else + +static inline int op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { } +static inline void op_amd_start_ibs(void) { } +static inline void op_amd_stop_ibs(void) { } + #endif static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -(s64)reset_value[i]); } -#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); -#endif /* See op_model_ppro.c */ return 1; @@ -248,38 +245,22 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, static void op_amd_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_start_ibs(); } - static void op_amd_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* @@ -289,26 +270,12 @@ static void op_amd_stop(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } - if (has_ibs && ibs_config.op_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_stop_ibs(); } static void op_amd_shutdown(struct op_msrs const * const msrs) @@ -316,11 +283,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } @@ -491,14 +458,16 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .shutdown = &op_amd_shutdown, }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd752..f01e53b118fa 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -32,6 +32,8 @@ #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) +#define OP_CTR_OVERFLOW (1ULL<<31) + static unsigned int num_counters = NUM_COUNTERS_NON_HT; static unsigned int num_controls = NUM_CONTROLS_NON_HT; @@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) -#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) -#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) -#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) - /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" @@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ - ESCR_READ(escr, high, ev, i); + rdmsr(ev->bindings[i].escr_address, escr, high); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); @@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - ESCR_WRITE(escr, high, ev, i); + wrmsr(ev->bindings[i].escr_address, escr, high); /* modify CCCR */ - CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); @@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) CCCR_SET_PMI_OVF_0(cccr); else CCCR_SET_PMI_OVF_1(cccr); - CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); return; } } @@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } -static void p4_setup_ctrs(struct op_msrs const * const msrs) +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; @@ -564,7 +559,7 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(s64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, real = VIRT_CTR(stag, i); - CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); - if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); CCCR_CLEAR_OVF(low); - CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); } } @@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_ENABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_DISABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -681,7 +679,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } /* @@ -690,7 +688,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) * This saves a few bits. */ for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4da7230b3d17..cd72d5c73b49 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -10,6 +10,7 @@ * @author Philippe Elie * @author Graydon Hoare * @author Andi Kleen + * @author Robert Richter <robert.richter@amd.com> */ #include <linux/oprofile.h> @@ -18,7 +19,6 @@ #include <asm/msr.h> #include <asm/apic.h> #include <asm/nmi.h> -#include <asm/perf_counter.h> #include "op_x86_model.h" #include "op_counter.h" @@ -26,20 +26,7 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT(val, e) (val |= e) +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) } -static void ppro_setup_ctrs(struct op_msrs const * const msrs) +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) { @@ -94,35 +82,29 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) /* clear all counters */ for (i = 0 ; i < num_counters; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < num_counters; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -147,10 +129,10 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); - if (CTR_OVERFLOWED(val)) { - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } out: @@ -171,16 +153,16 @@ out: static void ppro_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } } @@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs) static void ppro_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) @@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } @@ -207,11 +189,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } for (i = 0 ; i < num_counters ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { @@ -221,9 +203,10 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, /* can be overriden */ - .num_controls = 2, /* dito */ +struct op_x86_model_spec const op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = { * the specific CPU. */ -void arch_perfmon_setup_counters(void) +static void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; @@ -259,11 +242,17 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; - op_ppro_spec.num_counters = num_counters; - op_ppro_spec.num_controls = num_counters; +} + +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; } struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, /* user space does the cpuid check for available events */ diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d64..505489873b9d 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -6,19 +6,18 @@ * @remark Read the file COPYING * * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -struct op_saved_msr { - unsigned int high; - unsigned int low; -}; +#include <asm/types.h> +#include <asm/perf_counter.h> struct op_msr { - unsigned long addr; - struct op_saved_msr saved; + unsigned long addr; + u64 saved; }; struct op_msrs { @@ -28,29 +27,37 @@ struct op_msrs { struct pt_regs; +struct oprofile_operations; + /* The model vtable abstracts the differences between * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { - int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - unsigned int num_counters; - unsigned int num_controls; - void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); + unsigned int num_counters; + unsigned int num_controls; + u64 reserved; + u16 event_mask; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec op_ppro_spec; +struct op_counter_config; + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); + +extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; -extern void arch_perfmon_setup_counters(void); - #endif /* OP_X86_MODEL_H */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index d277ef1eea51..b3d20b9cac63 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -244,7 +244,7 @@ static void __restore_processor_state(struct saved_context *ctxt) do_fpu_end(); mtrr_ap_init(); -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); #endif } |