diff options
Diffstat (limited to 'arch/x86')
103 files changed, 3204 insertions, 3978 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8b..cd2788c78a8a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -777,41 +777,17 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. - -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - -config X86_NEW_MCE - depends on X86_MCE - bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + ranging from warning messages to halting the machine. config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -819,14 +795,14 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip @@ -839,36 +815,16 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED @@ -1247,6 +1203,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b4..1f3851a626d3 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ - echo $(call cc-option,-fno-unit-at-a-time); fi ;) + KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ + $(call cc-option,-fno-unit-at-a-time)) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/x86/Makefile_32.cpu diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 275dd177f198..11e8c6eb80a1 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { -#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; @@ -49,8 +48,7 @@ static int vesa_probe(void) vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ -#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ -#ifdef CONFIG_VIDEO_VESA + set_fs(vginfo.video_mode_ptr.seg); mode_ptr = vginfo.video_mode_ptr.off; @@ -102,9 +100,6 @@ static int vesa_probe(void) } return nmodes; -#else - return 0; -#endif /* CONFIG_VIDEO_VESA */ } static int vesa_set_mode(struct mode_info *mode) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 8f8d827e254d..819caa1f2008 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void) initregs(&ireg); -#ifdef CONFIG_VIDEO_400_HACK - if (adapter >= ADAPTER_VGA) { - ireg.ax = 0x1202; - ireg.bx = 0x0030; - intcall(0x10, &ireg, NULL); - } -#endif - ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; @@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void) set_fs(0); rows = rdfs8(0x484); /* rows minus one */ -#ifndef CONFIG_VIDEO_400_HACK if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; -#endif if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index bad728b76fc2..d42da3802499 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -221,7 +221,6 @@ static unsigned int mode_menu(void) } } -#ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ static struct saved_screen { int x, y; @@ -299,10 +298,6 @@ static void restore_screen(void) ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); } -#else -#define save_screen() ((void)0) -#define restore_screen() ((void)0) -#endif void set_video(void) { diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 5bb174a997fc..ff339c5db311 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -17,19 +17,8 @@ #include <linux/types.h> -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* This code uses an extended set of video mode numbers. These include: +/* + * This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) * EXTENDED_VGA (-2) @@ -67,13 +56,8 @@ /* The "recalculate timings" flag */ #define VIDEO_RECALC 0x8000 -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN void store_screen(void); #define DO_STORE() store_screen() -#else -#define DO_STORE() ((void)0) -#endif /* CONFIG_VIDEO_RETAIN */ /* * Mode table structures diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e590261ba059..ba331bfd1112 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -537,7 +537,7 @@ ia32_sys_call_table: .quad sys_mkdir .quad sys_rmdir /* 40 */ .quad sys_dup - .quad sys32_pipe + .quad sys_pipe .quad compat_sys_times .quad quiet_ni_syscall /* old prof syscall holder */ .quad sys_brk /* 45 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 085a8c35f149..9f5527198825 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, return sys_mprotect(start, len, prot); } -asmlinkage long sys32_pipe(int __user *fd) -{ - int retval; - int fds[2]; - - retval = do_pipe_flags(fds, 0); - if (retval) - goto out; - if (copy_to_user(fd, fds, sizeof(fds))) - retval = -EFAULT; -out: - return retval; -} - asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, struct sigaction32 __user *oact, unsigned int sigsetsize) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d47925847..586b7adb8e53 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -183,6 +183,10 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) +static inline void x2apic_force_phys(void) +{ + x2apic_phys = 1; +} #else static inline void check_x2apic(void) { @@ -194,6 +198,9 @@ static inline int x2apic_enabled(void) { return 0; } +static inline void x2apic_force_phys(void) +{ +} #define x2apic_preenabled 0 #define x2apic_supported() 0 diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c68c361697e1..4d447b732d82 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read(current_task); + return percpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c993e9e0fed4..e8de2f6f5ca5 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc) return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); } +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + static inline unsigned long get_desc_limit(const struct desc_struct *desc) { return desc->limit0 | (desc->limit << 16); } +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a6adefa28b94..9d6684849fd9 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -34,6 +34,12 @@ struct desc_struct { }; } __attribute__((packed)); +#define GDT_ENTRY_INIT(flags, base, limit) { { { \ + .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ + .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ + ((limit) & 0xf0000) | ((base) & 0xff000000), \ + } } } + enum { GATE_INTERRUPT = 0xE, GATE_TRAP = 0xF, diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c3f9435f1c9..0ee770d23d0e 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa07851..5e3f2044f0d3 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4f..2e7529295f55 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -26,6 +26,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void __math_state_restore(void); extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06e..ec34c760665e 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,94 +1 @@ -#ifndef _ASM_X86_IOCTLS_H -#define _ASM_X86_IOCTLS_H - -#include <asm/ioctl.h> - -/* 0x54 is just a magic number to make these relatively unique ('T') */ - -#define TCGETS 0x5401 -#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ -#define TCSETSW 0x5403 -#define TCSETSF 0x5404 -#define TCGETA 0x5405 -#define TCSETA 0x5406 -#define TCSETAW 0x5407 -#define TCSETAF 0x5408 -#define TCSBRK 0x5409 -#define TCXONC 0x540A -#define TCFLSH 0x540B -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E -#define TIOCGPGRP 0x540F -#define TIOCSPGRP 0x5410 -#define TIOCOUTQ 0x5411 -#define TIOCSTI 0x5412 -#define TIOCGWINSZ 0x5413 -#define TIOCSWINSZ 0x5414 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define FIONREAD 0x541B -#define TIOCINQ FIONREAD -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -#define FIONBIO 0x5421 -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TCGETS2 _IOR('T', 0x2A, struct termios2) -#define TCSETS2 _IOW('T', 0x2B, struct termios2) -#define TCSETSW2 _IOW('T', 0x2C, struct termios2) -#define TCSETSF2 _IOW('T', 0x2D, struct termios2) -#define TIOCGRS485 0x542E -#define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) - /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ -#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ -#define TCSETX 0x5433 -#define TCSETXF 0x5434 -#define TCSETXW 0x5435 - -#define FIONCLEX 0x5450 -#define FIOCLEX 0x5451 -#define FIOASYNC 0x5452 -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ -#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -#define FIOQSIZE 0x5460 - -/* Used for packet mode */ -#define TIOCPKT_DATA 0 -#define TIOCPKT_FLUSHREAD 1 -#define TIOCPKT_FLUSHWRITE 2 -#define TIOCPKT_STOP 4 -#define TIOCPKT_START 8 -#define TIOCPKT_NOSTOP 16 -#define TIOCPKT_DOSTOP 32 - -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -#endif /* _ASM_X86_IOCTLS_H */ +#include <asm-generic/ioctls.h> diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd51594..84c7e51cb6d0 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,28 +1 @@ -#ifndef _ASM_X86_IPCBUF_H -#define _ASM_X86_IPCBUF_H - -/* - * The ipc64_perm structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 32-bit mode_t and seq - * - 2 miscellaneous 32-bit values - */ - -struct ipc64_perm { - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - __kernel_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* _ASM_X86_IPCBUF_H */ +#include <asm-generic/ipcbuf.h> diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 5136dad57cbb..0d97deba1e35 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -90,8 +90,9 @@ static inline void lguest_set_ts(void) } /* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) -#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) +#define FULL_EXEC_SEGMENT \ + ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) +#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec9..ad7535372918 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -137,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; @@ -208,11 +202,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed9..593e51d4643f 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include <asm-generic/mman-common.h> - #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d5..555bc12bdcd6 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,18 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; - -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr -#else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr -#endif +#include <asm-generic/module.h> #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51c..809134c644a6 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,39 +1 @@ -#ifndef _ASM_X86_MSGBUF_H -#define _ASM_X86_MSGBUF_H - -/* - * The msqid64_ds structure for i386 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on i386 is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on x8664 is left for: - * - 2 miscellaneous 64-bit values - */ -struct msqid64_ds { - struct ipc64_perm msg_perm; - __kernel_time_t msg_stime; /* last msgsnd time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t msg_rtime; /* last msgrcv time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t msg_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused4; - unsigned long __unused5; -}; - -#endif /* _ASM_X86_MSGBUF_H */ +#include <asm-generic/msgbuf.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index bd5549034a95..4ffe09b2ad75 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL @@ -215,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c86e5ed4af51..e63cf7d441e1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace static inline void localise_nmi_watchdog(void) { diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4ca..965d45427975 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,22 +1 @@ -#ifndef _ASM_X86_PARAM_H -#define _ASM_X86_PARAM_H - -#ifdef __KERNEL__ -# define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ -# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif /* _ASM_X86_PARAM_H */ +#include <asm-generic/param.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a0832..6a07af432c81 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,689 +7,11 @@ #include <asm/pgtable_types.h> #include <asm/asm.h> -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - -#include <asm/desc_defs.h> -#endif /* X86_64 */ - -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) +#include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/cpumask.h> -#include <asm/kmap_types.h> -#include <asm/desc_defs.h> - -struct page; -struct thread_struct; -struct desc_ptr; -struct tss_struct; -struct mm_struct; -struct desc_struct; -struct task_struct; - -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - -/* general info */ -struct pv_info { - unsigned int kernel_rpl; - int shared_kernel_pmd; - int paravirt_enabled; - const char *name; -}; - -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, - unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); -}; - - -struct pv_lazy_ops { - /* Set deferred update mode, used for batching operations. */ - void (*enter)(void); - void (*leave)(void); -}; - -struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - - unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); -}; - -struct pv_cpu_ops { - /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); - - void (*clts)(void); - - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - - /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); - void (*store_gdt)(struct desc_ptr *); - void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 - void (*load_gs_index)(unsigned int idx); -#endif - void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, - const void *desc); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, const void *desc, int size); - void (*write_idt_entry)(gate_desc *, - int entrynum, const gate_desc *gate); - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); - - void (*set_iopl_mask)(unsigned mask); - - void (*wbinvd)(void); - void (*io_delay)(void); - - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); - - /* - * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) - */ - void (*irq_enable_sysexit)(void); - - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - - void (*swapgs)(void); - - void (*start_context_switch)(struct task_struct *prev); - void (*end_context_switch)(struct task_struct *next); -}; - -struct pv_irq_ops { - void (*init_IRQ)(void); - - /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. - */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - - void (*safe_halt)(void); - void (*halt)(void); - -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; - -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - -struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. - */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long va); - - /* Hooks for allocating and freeing a pagetable top-level */ - int (*pgd_alloc)(struct mm_struct *mm); - void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); - - /* - * Hooks for allocating/releasing pagetable pages when they're - * attached to a pagetable - */ - void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); - void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); - void (*release_pte)(unsigned long pfn); - void (*release_pmd)(unsigned long pfn); - void (*release_pud)(unsigned long pfn); - - /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); - - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; - - struct paravirt_callee_save pgd_val; - struct paravirt_callee_save make_pgd; - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - - void (*set_pud)(pud_t *pudp, pud_t pudval); - - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; - -#if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; - - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ - -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - - struct pv_lazy_ops lazy_mode; - - /* dom0 ops */ - - /* Sometimes the physical address is a pfn, and sometimes its - an mfn. We can tell which is which from the index. */ - void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - phys_addr_t phys, pgprot_t flags); -}; - -struct raw_spinlock; -struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); -}; - -/* This contains all the paravirt structures: we get a convenient - * number for each function using the offset which we use to indicate - * what to patch. */ -struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; -}; - -extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; - -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - -unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); - -int paravirt_disable_iospace(void); - -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" - -/* - * These macros are intended to wrap calls through one of the paravirt - * ops structs, so that they can be later identified and patched at - * runtime. - * - * Normally, a call to a pv_op function is a simple indirect call: - * (pv_op_struct.operations)(args...). - * - * Unfortunately, this is a relatively slow operation for modern CPUs, - * because it cannot necessarily determine what the destination - * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or - * ideally, patch an inline implementation into the callsite. (Direct - * calls are essentially free, because the call and return addresses - * are completely predictable.) - * - * For i386, these macros rely on the standard gcc "regparm(3)" calling - * convention, in which the first three arguments are placed in %eax, - * %edx, %ecx (in that order), and the remaining arguments are placed - * on the stack. All caller-save registers (eax,edx,ecx) are expected - * to be modified (either clobbered or used for return values). - * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, - * %rdx, and %rcx. Note that for this reason, x86_64 does not need any - * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which - * unfortunately, are quite a bit (r8 - r11) - * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * - * Unfortunately there's no way to get gcc to generate the args setup - * for the call, and then allow the call itself to be generated by an - * inline asm. Because of this, we must do the complete arg setup and - * return value handling from within these macros. This is fairly - * cumbersome. - * - * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. - * It could be extended to more arguments, but there would be little - * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. - * - * When there is a return value, the invoker of the macro must specify - * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return - * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of - * the return value size. - * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments - * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments - * in low,high order - * - * Small structures are passed and returned in registers. The macro - * calling convention can't directly deal with this, so the wrapper - * functions must do this. - * - * These PVOP_* macros are only defined within this header. This - * means that all uses must be wrapped in inline functions. This also - * makes sure the incoming and outgoing types are always correct. - */ -#ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS -#define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ - "=S" (__esi), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" -#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) -#else -#define PVOP_TEST_NULL(op) ((void)op) -#endif - -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ - ({ \ - rettype __ret; \ - PVOP_CALL_ARGS; \ - PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ - } \ - __ret; \ - }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ - ({ \ - PVOP_VCALL_ARGS; \ - PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - }) - -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - -#define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") -#define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") - -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - -#define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - - -#define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - - -#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -#define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) - -/* This is the only difference in x86_64. We can make it much simpler */ -#ifdef CONFIG_X86_32 -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif static inline int paravirt_enabled(void) { @@ -1393,20 +715,6 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); - #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { @@ -1437,12 +745,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, pv_mmu_ops.set_fixmap(idx, phys, flags); } -void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - -#define paravirt_nop ((void *)_paravirt_nop) - #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static inline int __raw_spin_is_locked(struct raw_spinlock *lock) @@ -1479,17 +781,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) #endif -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ - u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ -}; - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" #define PV_RESTORE_REGS "popl %edx; popl %ecx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 000000000000..2b3371bae295 --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h @@ -0,0 +1,720 @@ +#ifndef _ASM_X86_PARAVIRT_TYPES_H +#define _ASM_X86_PARAVIRT_TYPES_H + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) + +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) +#define CLBR_SCRATCH (0) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) + +#define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + +#endif /* X86_64 */ + +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + +#ifndef __ASSEMBLY__ + +#include <asm/desc_defs.h> +#include <asm/kmap_types.h> + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; +struct task_struct; +struct cpumask; + +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. + */ + unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned long addr, unsigned len); + + /* Basic arch-specific setup */ + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*post_allocator_init)(void); + + /* Print a banner to identify the environment */ + void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. */ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); + + /* Set and set time of day */ + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + + unsigned long long (*sched_clock)(void); + unsigned long (*get_tsc_khz)(void); +}; + +struct pv_cpu_ops { + /* hooks for various privileged instructions */ + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); + + void (*clts)(void); + + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + +#ifdef CONFIG_X86_64 + unsigned long (*read_cr8)(void); + void (*write_cr8)(unsigned long); +#endif + + /* Segment descriptor handling */ + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif + void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, + const void *desc); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, const void *desc, int size); + void (*write_idt_entry)(gate_desc *, + int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); + void (*io_delay)(void); + + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr_amd)(unsigned int msr, int *err); + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(int counter); + unsigned long long (*read_tscp)(unsigned int *aux); + + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ + void (*irq_enable_sysexit)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. + */ + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; + + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; + +#if PAGETABLE_LEVELS == 4 + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. */ +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len); + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len); + +int paravirt_disable_iospace(void); + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + +/* + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (pv_op_struct.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to e a simple direct call, or + * ideally, patch an inline implementation into the callsite. (Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * For i386, these macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * X86_64, on the other hand, already specifies a register-based calling + * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * %rdx, and %rcx. Note that for this reason, x86_64 does not need any + * special handling for dealing with 4 arguments, unlike i386. + * However, x86_64 also have to clobber all caller saved registers, which + * unfortunately, are quite a bit (r8 - r11) + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend pv_init_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. + * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + }) + +#define __PVOP_VCALL(op, pre, post, ...) \ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + +/* This is the only difference in x86_64. We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#endif + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); + +void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + +#define paravirt_nop ((void *)_paravirt_nop) + +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d85..04eacefcfd26 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -49,7 +49,7 @@ #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) #else -#define __percpu_arg(x) "%" #x +#define __percpu_arg(x) "%P" #x #endif /* @@ -104,36 +104,48 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ default: __bad_percpu_size(); \ } \ ret__; \ }) -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) +/* + * percpu_read() makes gcc load the percpu variable every time it is + * accessed while percpu_read_stable() allows the value to be cached. + * percpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across cpus. The current users include + * get_current() and get_thread_info() both of which are actually + * per-thread variables implemented as per-cpu variables and thus + * stable for the duration of the respective task. + */ +#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ + "m" (per_cpu__##var)) +#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ + "p" (&per_cpu__##var)) #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index fa64e401589d..e7b7c938ae27 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,16 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed counter range, since lower + * values are used by actual fixed counters and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb8..9de8729c1c8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -134,6 +134,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -351,7 +356,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -static inline unsigned pmd_index(unsigned long address) +static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } @@ -371,7 +376,7 @@ static inline unsigned pmd_index(unsigned long address) * this function returns the index of the entry in the pte page which would * control the given virtual address */ -static inline unsigned pte_index(unsigned long address) +static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } @@ -422,11 +427,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; -} - static inline int pud_large(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == @@ -462,7 +462,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned pud_index(unsigned long address) +static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2eef..75af592677ec 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,33 +1,8 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#include <asm/types.h> - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -#define ARCH_HAS_SG_CHAIN #define ISA_DMA_THRESHOLD (0x00ffffff) -/* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) -#else -# define sg_dma_len(sg) ((sg)->dma_length) -#endif +#include <asm-generic/scatterlist.h> #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b74971..83c05fc2de38 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,51 +1 @@ -#ifndef _ASM_X86_SHMBUF_H -#define _ASM_X86_SHMBUF_H - -/* - * The shmid64_ds structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on 32 bit is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on 64 bit is left for: - * - 2 miscellaneous 64-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t shm_dtime; /* last detach time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t shm_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. of current attaches */ - unsigned long __unused4; - unsigned long __unused5; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* _ASM_X86_SHMBUF_H */ +#include <asm-generic/shmbuf.h> diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index b2a8c74f2d06..6b71384b9d8b 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,63 +1 @@ -#ifndef _ASM_X86_SOCKET_H -#define _ASM_X86_SOCKET_H - -#include <asm/sockios.h> - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ -#define SO_PASSCRED 16 -#define SO_PEERCRED 17 -#define SO_RCVLOWAT 18 -#define SO_SNDLOWAT 19 -#define SO_RCVTIMEO 20 -#define SO_SNDTIMEO 21 - -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#define SO_PROTOCOL 38 -#define SO_DOMAIN 39 - -#endif /* _ASM_X86_SOCKET_H */ +#include <asm-generic/socket.h> diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c9..def6d4746ee7 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,13 +1 @@ -#ifndef _ASM_X86_SOCKIOS_H -#define _ASM_X86_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_X86_SOCKIOS_H */ +#include <asm-generic/sockios.h> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2d742c6e15f..44efdff3975d 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -48,7 +48,7 @@ * head_32 for boot CPU and setup_per_cpu_areas() for others. */ #define GDT_STACK_CANARY_INIT \ - [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, + [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), /* * Initialize the stackprotector canary value. @@ -90,9 +90,7 @@ static inline void setup_stack_canary_segment(int cpu) struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; - desc.base0 = canary & 0xffff; - desc.base1 = (canary >> 16) & 0xff; - desc.base2 = (canary >> 24) & 0xff; + set_desc_base(&desc, canary); write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); #endif } diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 643c59b4bc6e..75c49c782e20 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -150,33 +150,6 @@ do { \ #endif #ifdef __KERNEL__ -#define _set_base(addr, base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while (0) - -#define _set_limit(addr, limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while (0) - -#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) -#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) extern void native_load_gs_index(unsigned); diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440f..3935b106de79 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,198 +1 @@ -#ifndef _ASM_X86_TERMBITS_H -#define _ASM_X86_TERMBITS_H - -#include <linux/posix_types.h> - -typedef unsigned char cc_t; -typedef unsigned int speed_t; -typedef unsigned int tcflag_t; - -#define NCCS 19 -struct termios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ -}; - -struct termios2 { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -struct ktermios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -/* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - -/* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 - -/* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 - -/* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 -#define EXTA B19200 -#define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 /* non standard rate */ -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - -/* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - -/* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 - -#endif /* _ASM_X86_TERMBITS_H */ +#include <asm-generic/termbits.h> diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index c4ee8056baca..280d78a9d966 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,114 +1 @@ -#ifndef _ASM_X86_TERMIOS_H -#define _ASM_X86_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - -#ifdef __KERNEL__ - -#include <asm/uaccess.h> - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -/* - * Translate a "termio" structure into a "termios". Ugh. - */ -#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ - unsigned short __tmp; \ - get_user(__tmp,&(termio)->x); \ - *(unsigned short *) &(termios)->x = __tmp; \ -} - -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) -{ - SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); - get_user(termios->c_line, &termio->c_line); - return copy_from_user(termios->c_cc, termio->c_cc, NCC); -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - put_user((termios)->c_iflag, &(termio)->c_iflag); - put_user((termios)->c_oflag, &(termio)->c_oflag); - put_user((termios)->c_cflag, &(termio)->c_cflag); - put_user((termios)->c_lflag, &(termio)->c_lflag); - put_user((termios)->c_line, &(termio)->c_line); - return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); -} - -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios2 __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios2)); -} - -static inline int kernel_termios_to_user_termios(struct termios2 __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios2)); -} - -static inline int user_termios_to_kernel_termios_1(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios_1(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_TERMIOS_H */ +#include <asm-generic/termios.h> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f26432ac1c57..ae0f88de09ab 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -218,7 +218,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(percpu_read(kernel_stack) + + ti = (void *)(percpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bfd74c032fca..4da91ad69e0d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi; void math_error(void __user *); void math_emulate(struct math_emu_info *); -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long, unsigned long); -#else +#ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772f..df1da20f4534 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,19 +1,11 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#include <asm-generic/int-ll64.h> +#define dma_addr_t dma_addr_t -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; +#include <asm-generic/types.h> -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d9..b7c29c8017f2 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,12 +7,6 @@ * sigcontext struct (uc_mcontext). */ -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; +#include <asm-generic/ucontext.h> #endif /* _ASM_X86_UCONTEXT_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a7f9e89741c2..d29eba3c04d2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1157,9 +1157,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec66..de039fcdd053 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -49,6 +49,7 @@ #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/kvm_para.h> unsigned int num_processors; @@ -1361,52 +1362,76 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; ret = dmar_table_init(); if (ret) { pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; + return 0; } if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + ret = enable_IR(); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } - pr_info("Enabled Interrupt-remapping\n"); + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1414,41 +1439,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7d52343356e8..78ca97fffa94 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include <asm/apic.h> #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -116,15 +118,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -139,6 +132,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; @@ -411,13 +409,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -425,9 +420,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -495,72 +487,68 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } + struct irq_pin_list **last, *entry; - while (entry->next) { - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; + return 0; + last = &entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } entry->apic = apic; entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; + struct irq_pin_list *entry; - while (entry) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } - entry = entry->next; } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -577,7 +565,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -593,11 +580,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { @@ -610,7 +592,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -1699,12 +1680,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -2208,7 +2185,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2221,14 +2197,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2266,13 +2234,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2285,9 +2249,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } @@ -2512,11 +2473,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2529,31 +2487,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2595,7 +2550,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2603,26 +2558,15 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void @@ -3238,8 +3182,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); @@ -3909,7 +3852,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a9..e6b4f517fcfe 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -150,7 +150,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!boot_cpu_has(X86_FEATURE_APIC)) + if (!cpu_has_apic) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63a..cb66a22d98ad 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b82..f3b1037076e4 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 442b5508893f..bfe0815c4f96 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -2337,8 +2337,8 @@ static int __init apm_init(void) * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); + set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2358,12 +2358,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 82c1b37d38f4..0e05b9f7b8ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -2,7 +2,7 @@ #include <linux/bitops.h> #include <linux/mm.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> @@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) #define CBAR_ENB (0x80000000) #define CBAR_KEY (0X000000CB) if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); } } @@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); else - printk("probably OK (after B9730xxxx).\n"); + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } @@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -398,7 +400,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u32 level; level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } if (c->x86 == 0x10 || c->x86 == 0x11) @@ -487,27 +489,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < + ((tseg>>PMD_SHIFT) < (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); } } #endif } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, + unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) size = 64; + /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + (c->x86_mask == 0 || c->x86_mask == 1)) size = 256; } return size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f1aa83..01a265212395 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -81,7 +81,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); + printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); } static void __init check_hlt(void) @@ -98,7 +98,7 @@ static void __init check_hlt(void) halt(); halt(); halt(); - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); } /* @@ -122,9 +122,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk("Buggy.\n"); + printk(KERN_CONT "Buggy.\n"); else - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); #endif } @@ -156,7 +156,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4e..04f0fe5af83e 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9a..54a3ead5cfb2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,8 +18,8 @@ #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> -#include <asm/topology.h> -#include <asm/cpumask.h> +#include <linux/topology.h> +#include <linux/cpumask.h> #include <asm/pgtable.h> #include <asm/atomic.h> #include <asm/proto.h> @@ -28,13 +28,13 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/mtrr.h> -#include <asm/numa.h> +#include <linux/numa.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> -#include <asm/smp.h> +#include <linux/smp.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -71,45 +71,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; @@ -982,18 +982,26 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1008,8 +1016,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) @@ -1042,6 +1049,9 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, stack_canary); #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e967ef..19807b89f058 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -3,10 +3,10 @@ #include <linux/delay.h> #include <linux/pci.h> #include <asm/dma.h> -#include <asm/io.h> +#include <linux/io.h> #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include <asm/timer.h> +#include <linux/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * The 5510/5520 companion chips have a funky PIT. */ if (vendor == PCI_VENDOR_ID_CYRIX && - (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif @@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * ? : 0x7x * GX1 : 0x8x GX1 datasheet 56 */ - if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); return; } else { /* MediaGX */ @@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); } } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86af0b01..93ba8eeb100a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,11 +28,10 @@ static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) { - if (vmware_platform()) { + if (vmware_platform()) c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - } else { + else c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; - } } unsigned long get_hypervisor_tsc_freq(void) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3260ab044996..80a722a071b5 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,17 +7,17 @@ #include <linux/sched.h> #include <linux/thread_info.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/ds.h> #include <asm/bugs.h> #include <asm/cpu.h> #ifdef CONFIG_X86_64 -#include <asm/topology.h> +#include <linux/topology.h> #include <asm/numa_64.h> #endif @@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; @@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); } } @@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) /* Intel has a non-standard dependency on %ecx for this CPUID level. */ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return ((eax >> 26) + 1); + return (eax >> 26) + 1; else return 1; } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe217e1a..306bf0dca061 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -3,7 +3,7 @@ * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -16,7 +16,7 @@ #include <linux/pci.h> #include <asm/processor.h> -#include <asm/smp.h> +#include <linux/smp.h> #include <asm/k8.h> #define LVL_1_INST 1 @@ -25,14 +25,15 @@ #define LVL_3 4 #define LVL_TRACE 5 -struct _cache_table -{ +struct _cache_table { unsigned char descriptor; char cache_type; short size; }; -/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ @@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = }; -enum _cache_type -{ +enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, @@ -170,31 +170,31 @@ unsigned short num_cache_leaves; Maybe later */ union l1_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 8; - unsigned assoc : 8; - unsigned size_in_kb : 8; + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; }; unsigned val; }; union l2_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned size_in_kb : 16; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; }; unsigned val; }; union l3_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned res : 2; - unsigned size_encoded : 14; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; }; unsigned val; }; @@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) retval = cpuid4_cache_lookup_regs(i, &this_leaf); if (retval >= 0) { - switch(this_leaf.eax.split.level) { - case 1: + switch (this_leaf.eax.split.level) { + case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; @@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; - case 2: + case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; - case 3: + case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); + index_msb = get_count_order( + num_threads_sharing); l3_id = c->apicid >> index_msb; break; - default: + default: break; } } @@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; - for ( i = 0 ; i < n ; i++ ) { + for (i = 0 ; i < n ; i++) { cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if (regs[j] & (1 << 31)) regs[j] = 0; - } + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { + for (j = 1 ; j < 16 ; j++) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) - { + while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; @@ -488,14 +489,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) } if (trace) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if (l1i) + printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); if (l1d) - printk(", L1 D cache: %dK\n", l1d); + printk(KERN_CONT ", L1 D cache: %dK\n", l1d); else - printk("\n"); + printk(KERN_CONT "\n"); if (l2) printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); @@ -558,8 +559,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} #endif static void __cpuinit free_cache_attributes(unsigned int cpu) @@ -645,7 +651,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); static ssize_t show_##file_name \ (struct _cpuid4_info *this_leaf, char *buf) \ { \ - return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } show_one_plus(level, eax.split.level, 0); @@ -656,7 +662,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) { - return sprintf (buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -669,7 +675,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, const struct cpumask *mask; mask = to_cpumask(this_leaf->shared_cpu_map); - n = type? + n = type ? cpulist_scnprintf(buf, len-2, mask) : cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; @@ -800,7 +806,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -static struct attribute * default_attrs[] = { +static struct attribute *default_attrs[] = { &type.attr, &level.attr, &coherency_line_size.attr, @@ -815,7 +821,7 @@ static struct attribute * default_attrs[] = { NULL }; -static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -828,8 +834,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) return ret; } -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -883,7 +889,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) goto err_out; per_cpu(index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); if (unlikely(per_cpu(index_kobject, cpu) == NULL)) goto err_out; @@ -917,7 +923,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), @@ -925,9 +931,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) per_cpu(cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { - for (j = 0; j < i; j++) { - kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); - } + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; @@ -952,7 +957,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } @@ -977,8 +982,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = -{ +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2b..4ac6d48fe11b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,8 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc609..000000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones <davej@redhat.com> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e5..6bd51e7ba87b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include <linux/sysdev.h> #include <asm/mce.h> enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f97056..51f7c725dab5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1cfb623ce11c..97cd5b0baab4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,21 +45,8 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -77,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -87,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -104,11 +90,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -242,7 +223,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -286,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -495,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -504,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -519,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -537,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -558,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -705,7 +686,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. @@ -842,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -916,14 +897,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -964,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1159,10 +1140,25 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1182,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1218,15 +1213,16 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { @@ -1236,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1250,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1263,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1273,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; @@ -1324,6 +1324,17 @@ static void mce_init_timer(void) add_timer_on(t, smp_processor_id()); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: @@ -1538,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) @@ -1580,8 +1593,9 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1656,14 +1670,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1674,7 +1689,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1831,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1840,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1861,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,8 +1878,9 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1879,8 +1895,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } @@ -1928,35 +1945,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1972,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) @@ -1991,51 +1992,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. Only for compatibility. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a32..889f665fe93d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb6..000000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. <davej@redhat.com> - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include <linux/interrupt.h> -#include <linux/workqueue.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2e..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f8178183..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/mce.h> -#include <asm/msr.h> - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..15f2bc07bb60 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b0e58f..33af14110dfd 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -7,15 +7,15 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned long low, high; rdmsr(MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ + /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; - /* The base masks off on the right alignment */ + /* The base masks off on the right alignment */ *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) @@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, return; } /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. * - * eg 111 1111 1111 1100 is 512K + * eg 111 1111 1111 1100 is 512K * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... */ low = (~low) & 0x1FFFC; *size = (low + 4) << (15 - PAGE_SHIFT); - return; } -static void amd_set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { u32 regs[2]; /* - * Low is MTRR0 , High MTRR 1 + * Low is MTRR0, High MTRR 1 */ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); /* - * Blank to disable + * Blank to disable */ - if (size == 0) + if (size == 0) { regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | (base << PAGE_SHIFT) | (type + 1); + } /* - * The writeback rule is quite specific. See the manual. Its - * disable local interrupts, write back the cache, set the mtrr + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr */ wbinvd(); wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); } -static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; @@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) set_mtrr_ops(&amd_mtrr_ops); return 0; } - -//arch_initcall(amd_mtrr_init); diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a7a7ab..de89f14eff3a 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,7 +1,9 @@ #include <linux/init.h> #include <linux/mm.h> + #include <asm/mtrr.h> #include <asm/msr.h> + #include "mtrr.h" static struct { @@ -12,25 +14,25 @@ static struct { static u8 centaur_mcr_reserved; static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ -/* - * Report boot time MCR setups +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. */ - static int centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) if (lsize == 0) return i; } + return -ENOSPC; } -void -mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { centaur_mcr[mcr].low = lo; centaur_mcr[mcr].high = hi; @@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) *type = MTRR_TYPE_WRBACK; if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) *type = MTRR_TYPE_WRBACK; - } -static void centaur_set_mcr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) { unsigned long low, high; if (size == 0) { - /* Disable */ + /* Disable */ high = low = 0; } else { high = base << PAGE_SHIFT; - if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ - else { + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << PAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ } } centaur_mcr[reg].high = high; @@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, wrmsr(MSR_IDT_MCR0 + reg, low, high); } -#if 0 -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init -centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ - lo &= ~0x1C0; /* clear key */ - lo |= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { - if (!(lo & (1 << (9 + i)))) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1 << i); - } - } - /* - * Throw the main write-combining switch... - * However if OOSTORE is enabled then people have already done far - * cleverer things and we should behave. - */ - - lo |= 15; /* Write combine enables */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -/* - * Initialise the original winchip with read only MCR registers - * no used bitmask for the BIOS to pass on and write only control - */ - -static void __init -centaur_mcr0_init(void) -{ - unsigned i; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - /* Clear any unconfigured MCR's. - * This way we are sure that the centaur_mcr array contains the actual - * values. The disadvantage is that any BIOS tweaks are thus undone. - * - */ - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - } - - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ -} - -/* - * Initialise Winchip series MCR registers - */ - -static void __init -centaur_mcr_init(void) -{ - struct set_mtrr_context ctxt; - - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - if (boot_cpu_data.x86_model == 4) - centaur_mcr0_init(); - else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) - centaur_mcr1_init(); - - set_mtrr_done(&ctxt); -} -#endif - -static int centaur_validate_add_page(unsigned long base, - unsigned long size, unsigned int type) +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { /* - * FIXME: Winchip2 supports uncached + * FIXME: Winchip2 supports uncached */ - if (type != MTRR_TYPE_WRCOMB && + if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - printk(KERN_WARNING - "mtrr: only write-combining%s supported\n", - centaur_mcr_type ? " and uncacheable are" - : " is"); + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); return -EINVAL; } return 0; @@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, static struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, -// .init = centaur_mcr_init, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, @@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) set_mtrr_ops(¢aur_mtrr_ops); return 0; } - -//arch_initcall(centaur_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a18a50d..315738c74aad 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,51 +1,75 @@ -/* MTRR (Memory Type Range Register) cleanup - - Copyright (C) 2009 Yinghai Lu - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ #include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/smp.h> #include <linux/cpu.h> -#include <linux/mutex.h> #include <linux/sort.h> +#include <linux/mutex.h> +#include <linux/uaccess.h> +#include <linux/kvm_para.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> -#include "mtrr.h" -/* should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 +#include "mtrr.h" struct res_range { - unsigned long start; - unsigned long end; + unsigned long start; + unsigned long end; +}; + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; }; +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { - /* out of slots */ + /* Out of slots: */ if (nr_range >= RANGE_NUM) return nr_range; @@ -58,12 +82,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, } static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range_with_merge(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { int i; - /* try to merge it with old one */ + /* Try to merge it with old one: */ for (i = 0; i < nr_range; i++) { unsigned long final_start, final_end; unsigned long common_start, common_end; @@ -84,7 +108,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, return nr_range; } - /* need to add that */ + /* Need to add it: */ return add_range(range, nr_range, start, end); } @@ -117,7 +141,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) } if (start > range[j].start && end < range[j].end) { - /* find the new spare */ + /* Find the new spare: */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) break; @@ -146,14 +170,8 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; -static int __initdata debug_print; +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -180,7 +198,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, range[i].start, range[i].end + 1); } - /* take out UC ranges */ + /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_UNCACHABLE && @@ -193,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && (mtrr_state.enabled & 1)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " - "contains strange UC entry under 1M, check " - "with your system vendor!\n", i); + printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -237,17 +253,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, return nr_range; } -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - #ifdef CONFIG_MTRR_SANITIZER static unsigned long __init sum_ranges(struct res_range *range, int nr_range) { - unsigned long sum; + unsigned long sum = 0; int i; - sum = 0; for (i = 0; i < nr_range; i++) sum += range[i].end + 1 - range[i].start; @@ -278,17 +290,9 @@ static int __init mtrr_cleanup_debug_setup(char *str) } early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -301,7 +305,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, mask = (1ULL << address_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); - base = ((u64)basek) << 10; + base = ((u64)basek) << 10; base |= type; mask |= 0x800; @@ -317,15 +321,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, static void __init save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) + unsigned char type) { range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); range_state[reg].type = type; } -static void __init -set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(unsigned int address_bits) { unsigned long basek, sizek; unsigned char type; @@ -342,11 +345,11 @@ set_var_mtrr_all(unsigned int address_bits) static unsigned long to_size_factor(unsigned long sizek, char *factorp) { - char factor; unsigned long base = sizek; + char factor; if (base & ((1<<10) - 1)) { - /* not MB alignment */ + /* Not MB-aligned: */ factor = 'K'; } else if (base & ((1<<20) - 1)) { factor = 'M'; @@ -372,11 +375,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long max_align, align; unsigned long sizek; - /* Compute the maximum size I can make a range */ + /* Compute the maximum size with which we can make a range: */ if (range_startk) max_align = ffs(range_startk) - 1; else max_align = 32; + align = fls(range_sizek) - 1; if (align > max_align) align = max_align; @@ -386,11 +390,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; - start_base = to_size_factor(range_startk, - &start_factor), - size_base = to_size_factor(sizek, &size_factor), + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); - printk(KERN_DEBUG "Setting variable MTRR %d, " + Dprintk("Setting variable MTRR %d, " "base: %ld%cB, range: %ld%cB, type %s\n", reg, start_base, start_factor, size_base, size_factor, @@ -425,10 +428,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; - /* align with gran size, prevent small block used up MTRRs */ + /* Align with gran size, prevent small block used up MTRRs: */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) return second_sizek; + state->range_sizek -= (range_basek - state->range_startk); range_sizek = ALIGN(state->range_sizek, gran_sizek); @@ -439,22 +443,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } state->range_sizek = range_sizek; - /* try to append some small hole */ + /* Try to append some small hole: */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - /* no increase */ + /* No increase: */ if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); return 0; } - /* only cut back, when it is not the last */ + /* Only cut back when it is not the last: */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { if (range0_sizek >= chunk_sizek) @@ -470,16 +473,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_try: range_basek = range0_basek + range0_sizek; - /* one hole in the middle */ + /* One hole in the middle: */ if (range_basek > basek && range_basek <= (basek + sizek)) second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - /* one hole in middle or at end */ + /* One hole in middle or at the end: */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - /* hole size should be less than half of range0 size */ + /* Hole size should be less than half of range0 size: */ if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; @@ -491,32 +494,30 @@ second_try: } if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); } if (range0_sizek < state->range_sizek) { - /* need to handle left over */ + /* Need to handle left over range: */ range_sizek = state->range_sizek - range0_sizek; - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); } if (hole_sizek) { hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); } @@ -537,23 +538,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, basek = base_pfn << (PAGE_SHIFT - 10); sizek = size_pfn << (PAGE_SHIFT - 10); - /* See if I can merge with the last range */ + /* See if I can merge with the last range: */ if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } - /* Write the range mtrrs */ + /* Write the range mtrrs: */ if (state->range_sizek != 0) second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - /* Allocate an msr */ + /* Allocate an msr: */ state->range_startk = basek + second_sizek; state->range_sizek = sizek - second_sizek; } -/* mininum size of mtrr block that can take hole */ +/* Mininum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) @@ -565,7 +566,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) } early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); -/* granity of mtrr of block */ +/* Granularity of mtrr of block: */ static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) @@ -577,7 +578,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static int nr_mtrr_spare_reg __initdata = +static unsigned long nr_mtrr_spare_reg __initdata = CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; static int __init parse_mtrr_spare_reg(char *arg) @@ -586,7 +587,6 @@ static int __init parse_mtrr_spare_reg(char *arg) nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); return 0; } - early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init @@ -594,8 +594,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; - int i; int num_reg; + int i; var_state.range_startk = 0; var_state.range_sizek = 0; @@ -605,17 +605,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, memset(range_state, 0, sizeof(range_state)); - /* Write the range etc */ - for (i = 0; i < nr_range; i++) + /* Write the range: */ + for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + } - /* Write the last range */ + /* Write the last range: */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); num_reg = var_state.reg; - /* Clear out the extra MTRR's */ + /* Clear out the extra MTRR's: */ while (var_state.reg < num_var_ranges) { save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; @@ -625,11 +626,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, } struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; }; /* @@ -645,10 +646,10 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static void __init print_out_mtrr_range_state(void) { - int i; char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { @@ -676,10 +677,10 @@ static int __init mtrr_need_cleanup(void) int i; mtrr_type type; unsigned long size; - /* extra one for all 0 */ + /* Extra one for all 0: */ int num[MTRR_NUM_TYPES + 1]; - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -693,88 +694,86 @@ static int __init mtrr_need_cleanup(void) num[type]++; } - /* check if we got UC entries */ + /* Check if we got UC entries: */ if (!num[MTRR_TYPE_UNCACHABLE]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) + num_var_ranges - num[MTRR_NUM_TYPES]) return 0; return 1; } static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) { - int num_reg; static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; unsigned long range_sums_new; + static int nr_range_new; + int num_reg; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - /* we got new setting in range_state, check it */ + /* We got new setting in range_state, check it: */ memset(range_new, 0, sizeof(range_new)); nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); + x_remove_base, x_remove_size); range_sums_new = sum_ranges(range_new, nr_range_new); result[i].chunk_sizek = chunk_size >> 10; result[i].gran_sizek = gran_size >> 10; result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } - /* double check it */ + /* Double check it: */ if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; } static void __init mtrr_print_out_one_result(int i) { - char gran_factor, chunk_factor, lose_factor; unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); } static int __init mtrr_search_optimal_index(void) { - int i; int num_reg_good; int index_good; + int i; if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) @@ -796,24 +795,24 @@ static int __init mtrr_search_optimal_index(void) return index_good; } - int __init mtrr_cleanup(unsigned address_bits) { - unsigned long extra_remove_base, extra_remove_size; + unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; - mtrr_type type; u64 chunk_size, gran_size; + mtrr_type type; int index_good; int i; if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -822,29 +821,28 @@ int __init mtrr_cleanup(unsigned address_bits) range_state[i].type = type; } - /* check if we need handle it and can handle it */ + /* Check if we need handle it and can handle it: */ if (!mtrr_need_cleanup()) return 0; - /* print original var MTRRs at first, for debugging: */ + /* Print original var MTRRs at first, for debugging: */ printk(KERN_DEBUG "original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ + /* Sort the ranges: */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); @@ -854,7 +852,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { i = 0; mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); mtrr_print_out_one_result(i); @@ -880,7 +878,7 @@ int __init mtrr_cleanup(unsigned address_bits) continue; mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); printk(KERN_INFO "\n"); @@ -890,7 +888,7 @@ int __init mtrr_cleanup(unsigned address_bits) } } - /* try to find the optimal index */ + /* Try to find the optimal index: */ index_good = mtrr_search_optimal_index(); if (index_good != -1) { @@ -898,7 +896,7 @@ int __init mtrr_cleanup(unsigned address_bits) i = index_good; mtrr_print_out_one_result(i); - /* convert ranges to var ranges state */ + /* Convert ranges to var ranges state: */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; @@ -941,8 +939,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); * Note this won't check if the MTRRs < 4GB where the magic bit doesn't * apply to are wrong, but so far we don't know of any such case in the wild. */ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) int __init amd_special_default_mtrr(void) { @@ -952,7 +950,7 @@ int __init amd_special_default_mtrr(void) return 0; if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ + /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) return 0; /* @@ -965,19 +963,21 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } + /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -985,7 +985,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any + * all of the memory the kernel is intending to use. If not, it'll trim any * memory off the end by adjusting end_pfn, removing it from the kernel's * allocation pools, warning the user with an obnoxious message. */ @@ -994,21 +994,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 total_trim_size; - /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1017,7 +1018,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) range_state[i].type = type; } - /* Find highest cached pfn */ + /* Find highest cached pfn: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) @@ -1028,13 +1029,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) highest_pfn = base + size; } - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); return 0; } - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -1046,11 +1047,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) num[type]++; } - /* no entry for WB? */ + /* No entry for WB? */ if (!num[MTRR_TYPE_WRBACK]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC: */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != num_var_ranges - num[MTRR_NUM_TYPES]) return 0; @@ -1066,31 +1067,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + /* Check the head: */ total_trim_size = 0; - /* check the head */ if (range[0].start) total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ + + /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } - /* check the top */ + + /* Check the top: */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) total_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); + pr_info("update e820 for mtrr\n"); update_e820(); return 1; @@ -1098,4 +1099,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c320040c..228d982ce09c 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,38 +1,40 @@ #include <linux/init.h> +#include <linux/io.h> #include <linux/mm.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/io.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + #include "mtrr.h" static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) { - unsigned long flags; unsigned char arr, ccr3, rcr, shift; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - /* Save flags and disable interrupts */ local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86(arr); - ((unsigned char *) base)[2] = getCx86(arr + 1); - ((unsigned char *) base)[1] = getCx86(arr + 2); + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); rcr = getCx86(CX86_RCR_BASE + reg); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) @@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } } +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ static int cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free ARR. - <base> The starting (base) address of the region. - <size> The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i; switch (replace_reg) { case 7: @@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) cyrix_get_arr(7, &lbase, &lsize, <ype); if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ + /* Else try ARR0-ARR6 first */ } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ cyrix_get_arr(i, &lbase, &lsize, <ype); if ((lsize == 0) && (size >= 0x40)) return i; @@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -static u32 cr4 = 0; -static u32 ccr3; +static u32 cr4, ccr3; static void prepare_set(void) { u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ + /* + * Disable and flush caches. + * Note that wbinvd flushes the TLBs as a side-effect + */ cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); @@ -147,22 +156,21 @@ static void prepare_set(void) /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); - } static void post_set(void) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ccr3); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); } @@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, size >>= 6; size &= 0x7fff; /* make sure arr_size <= 14 */ - for (arr_size = 0; size; arr_size++, size >>= 1) ; + for (arr_size = 0; size; arr_size++, size >>= 1) + ; if (reg < 7) { switch (type) { @@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, prepare_set(); base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr + 1, ((unsigned char *) &base)[2]); - setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); } typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; + unsigned long base; + unsigned long size; + mtrr_type type; } arr_state_t; static arr_state_t arr_state[8] = { @@ -247,16 +256,17 @@ static void cyrix_set_all(void) setCx86(CX86_CCR0 + i, ccr_state[i]); for (; i < 7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for (i = 0; i < 8; i++) - cyrix_set_arr(i, arr_state[i].base, + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, arr_state[i].size, arr_state[i].type); + } post_set(); } static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, -// .init = cyrix_arr_init, .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, @@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) set_mtrr_ops(&cyrix_mtrr_ops); return 0; } - -//arch_initcall(cyrix_init_mtrr); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69f0b27..55da0c5f68dd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,28 +1,34 @@ -/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - because MTRRs can span upto 40 bits (36bits on most modern x86) */ +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span upto 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/io.h> #include <linux/mm.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> -#include <asm/system.h> -#include <asm/cpufeature.h> + #include <asm/processor-flags.h> +#include <asm/cpufeature.h> #include <asm/tlbflush.h> +#include <asm/system.h> +#include <asm/mtrr.h> +#include <asm/msr.h> #include <asm/pat.h> + #include "mtrr.h" struct fixed_range_block { - int base_msr; /* start address of an MTRR block */ - int ranges; /* number of MTRRs in this block */ + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { - { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; -struct mtrr_state_type mtrr_state = {}; +struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); -/** +/* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section @@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) { + if (!(mtrr_state.enabled & 2)) return mtrr_state.def_type; - } prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { @@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) if (start_state != end_state) return 0xFE; - if ((start & mask) != (base & mask)) { + if ((start & mask) != (base & mask)) continue; - } curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { @@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) curr_match = MTRR_TYPE_WRTHROUGH; } - if (prev_match != curr_match) { + if (prev_match != curr_match) return MTRR_TYPE_UNCACHABLE; - } } if (mtrr_tom2) { @@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } -/* Get the MSR pair relating to a var range */ +/* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { @@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -/* fill the MSR pair relating to a var range */ +/* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { @@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, vr[index].mask_hi = mask_hi; } -static void -get_fixed_ranges(mtrr_type * frs) +static void get_fixed_ranges(mtrr_type *frs) { - unsigned int *p = (unsigned int *) frs; + unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); @@ -217,22 +219,22 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, - last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, - mtrr_type type) + mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } -static void __init print_fixed(unsigned base, unsigned step, - const mtrr_type *types) +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; @@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_DEBUG "MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } - printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); if (size_or_mask & 0xffffffffUL) high_width = ffs(size_or_mask & 0xffffffffUL) - 1; else high_width = ffs(size_or_mask>>32) + 32 - 1; high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_DEBUG " %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); + pr_debug(" %u disabled\n", i); } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* Grab all of the MTRR state for this CPU into *state */ +/* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { - unsigned int i; struct mtrr_var_range *vrs; - unsigned lo, dummy; unsigned long flags; + unsigned lo, dummy; + unsigned int i; vrs = mtrr_state.var_ranges; @@ -324,6 +327,7 @@ void __init get_mtrr_state(void) if (amd_special_default_mtrr()) { unsigned low, high; + /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; @@ -344,10 +348,9 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); - } -/* Some BIOS's are fucked and don't set all MTRRs the same! */ +/* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; @@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } -/* Doesn't attempt to pass an error out to MTRR users - because it's quite complicated in some cases and probably not - worth it because the best error handling is to ignore it. */ +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { - if (wrmsr_safe(msr, a, b) < 0) + if (wrmsr_safe(msr, a, b) < 0) { printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); + } } /** - * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have @@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) * * Returns: The index of the region on success, else negative on error. */ -int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } + return -ENOSPC; } @@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { - /* Invalid (i.e. free) range */ + /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; @@ -471,27 +482,31 @@ out_put_cpu: } /** - * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ -static int set_fixed_ranges(mtrr_type * frs) +static int set_fixed_ranges(mtrr_type *frs) { - unsigned long long *saved = (unsigned long long *) frs; + unsigned long long *saved = (unsigned long long *)frs; bool changed = false; - int block=-1, range; + int block = -1, range; k8_check_syscfg_dram_mod_en(); - while (fixed_range_blocks[++block].ranges) - for (range=0; range < fixed_range_blocks[block].ranges; range++) - set_fixed_range(fixed_range_blocks[block].base_msr + range, - &changed, (unsigned int *) saved++); + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } return changed; } -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; @@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } @@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; */ static unsigned long set_mtrr_state(void) { - unsigned int i; unsigned long change_mask = 0; + unsigned int i; - for (i = 0; i < num_var_ranges; i++) + for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) } -static unsigned long cr4 = 0; +static unsigned long cr4; static DEFINE_SPINLOCK(set_atomicity_lock); /* - * Since we are disabling the cache don't allow any interrupts - they - * would run extremely slow and would only increase the pain. The caller must - * ensure that local interrupts are disabled and are reenabled after post_set() - * has been called. + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. */ - static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ spin_lock(&set_atomicity_lock); - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } @@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ __flush_tlb(); - /* Save MTRR state */ + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - /* Disable MTRRs, and set the default type to uncached */ + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) { - /* Flush TLBs (no need to flush caches - they are disabled) */ + /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } @@ -623,24 +647,27 @@ static void generic_set_all(void) post_set(); local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ + /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - + } +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - <reg> The register to set. - <base> The base address of the region. - <size> The size of the region. If this is 0 the region is disabled. - <type> The type of the region. - [RETURNS] Nothing. -*/ { unsigned long flags; struct mtrr_var_range *vr; @@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, prepare_set(); if (size == 0) { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. */ + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { @@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, local_irq_restore(flags); } -int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) { unsigned long lbase, last; - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1) ; + lbase = lbase >> 1, last = last >> 1) + ; if (lbase != last) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } - static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - return (config & (1 << 10)); + return config & (1 << 10); } int positive_have_wrcomb(void) @@ -716,14 +749,15 @@ int positive_have_wrcomb(void) return 1; } -/* generic structure... +/* + * Generic structure... */ struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, }; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52913a4..08b6ea4c62b4 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,27 +1,28 @@ -#include <linux/init.h> -#include <linux/proc_fs.h> #include <linux/capability.h> -#include <linux/ctype.h> -#include <linux/module.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> #define LINE_SIZE 80 #include <asm/mtrr.h> + #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { - "uncachable", /* 0 */ - "write-combining", /* 1 */ - "?", /* 2 */ - "?", /* 3 */ - "write-through", /* 4 */ - "write-protect", /* 5 */ - "write-back", /* 6 */ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) @@ -35,8 +36,8 @@ static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { + unsigned int *fcount = FILE_FCOUNT(file); int reg, max; - unsigned int *fcount = FILE_FCOUNT(file); max = num_var_ranges; if (fcount == NULL) { @@ -61,8 +62,8 @@ static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { - int reg; unsigned int *fcount = FILE_FCOUNT(file); + int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) @@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size, return reg; } -/* RED-PEN: seq_file can seek now. this is ignored. */ +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ { int i, err; unsigned long reg; @@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EPERM; if (!len) return -EINVAL; + memset(line, 0, LINE_SIZE); if (len > LINE_SIZE) len = LINE_SIZE; if (copy_from_user(line, buf, len - 1)) return -EFAULT; + linelen = strlen(line); ptr = line + linelen - 1; if (linelen && *ptr == '\n') *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); @@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return err; return len; } + if (strncmp(line, "base=", 5)) return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "size=", 5)) return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page((unsigned long) base, (unsigned long) size, i, - true); + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; @@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { - struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); @@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); @@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (err) return err; - switch(cmd) { + switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof gentry)) @@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); @@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) return err; } -static int -mtrr_close(struct inode *ino, struct file *file) +static int mtrr_close(struct inode *ino, struct file *file) { - int i, max; unsigned int *fcount = FILE_FCOUNT(file); + int i, max; if (fcount != NULL) { max = num_var_ranges; @@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); static int mtrr_open(struct inode *inode, struct file *file) { - if (!mtrr_if) + if (!mtrr_if) return -EIO; - if (!mtrr_if->get) - return -ENXIO; + if (!mtrr_if->get) + return -ENXIO; return single_open(file, mtrr_seq_show, NULL); } static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, }; static int mtrr_seq_show(struct seq_file *seq, void *offset) @@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); - if (size == 0) + if (size == 0) { mtrr_usage_table[i] = 0; - else { - if (size < (0x100000 >> PAGE_SHIFT)) { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } else { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - /* RED-PEN: base can be > 32bit */ - len += seq_printf(seq, - "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_usage_table[i], mtrr_attrib_to_str(type)); + continue; } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); } return 0; } @@ -422,6 +440,5 @@ static int __init mtrr_if_init(void) proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } - arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b5aeaf..7af0f88a4163 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -25,43 +25,48 @@ Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 - This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: + This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). */ +#define DEBUG + +#include <linux/types.h> /* FIXME: kvm_para.h needs this */ + +#include <linux/kvm_para.h> +#include <linux/uaccess.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/init.h> +#include <linux/sort.h> +#include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> -#include <linux/cpu.h> -#include <linux/mutex.h> -#include <linux/sort.h> +#include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> #include <asm/msr.h> -#include <asm/kvm_para.h> + #include "mtrr.h" -u32 num_var_ranges = 0; +u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops * mtrr_if = NULL; +struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops * ops) +void set_mtrr_ops(struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -72,30 +77,36 @@ static int have_wrcomb(void) { struct pci_dev *dev; u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ + /* + * Intel 450NX errata # 23. Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } /* This function returns the number of variable MTRRs */ @@ -103,12 +114,13 @@ static void __init set_num_var_ranges(void) { unsigned long config = 0, dummy; - if (use_intel()) { + if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - } else if (is_cpu(AMD)) + else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; + num_var_ranges = config & 0xff; } @@ -130,10 +142,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/** + * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * + * Returns nothing. + */ static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. -*/ { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; @@ -142,18 +156,19 @@ static void ipi_handler(void *info) local_irq_save(flags); atomic_dec(&data->count); - while(!atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else + } else { mtrr_if->set_all(); + } atomic_dec(&data->count); - while(atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); @@ -161,7 +176,8 @@ static void ipi_handler(void *info) #endif } -static inline int types_compatible(mtrr_type type1, mtrr_type type2) { +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || @@ -176,10 +192,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * + * * 1. Send IPI to do the following: * 2. Disable Interrupts - * 3. Wait for all procs to do so + * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit @@ -189,26 +205,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching - * 13. Set PGE + * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. - * + * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. + * The CPU vendors may each do it differently, + * so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate + * to be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { struct set_mtrr_data data; unsigned long flags; @@ -218,121 +235,122 @@ static void set_mtrr(unsigned int reg, unsigned long base, data.smp_size = size; data.smp_type = type; atomic_set(&data.count, num_booting_cpus() - 1); - /* make sure data.count is visible before unleashing other CPUs */ + + /* Make sure data.count is visible before unleashing other CPUs */ smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); - /* Start the ball rolling on other CPUs */ + /* Start the ball rolling on other CPUs */ if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); - /* ok, reset count and toggle gate */ + /* Ok, reset count and toggle gate */ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,1); + atomic_set(&data.gate, 1); - /* do our MTRR business */ + /* Do our MTRR business */ - /* HACK! + /* + * HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. + * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); + if (reg != ~0U) + mtrr_if->set(reg, base, size, type); - /* wait for the others */ - while(atomic_read(&data.count)) + /* Wait for the others */ + while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { + unsigned long lbase, lsize; int i, replace, error; mtrr_type ltype; - unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + + error = mtrr_if->validate_add_page(base, size, type); + if (error) return error; if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + pr_warning("mtrr: type: %u invalid\n", type); return -EINVAL; } - /* If the type is WC, check that this processor supports it */ + /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); + pr_warning("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - printk(KERN_WARNING "mtrr: zero sized request\n"); + pr_warning("mtrr: zero sized request\n"); return -EINVAL; } if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -341,36 +359,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ get_online_cpus(); - /* Search for existing MTRR */ + + /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) continue; - /* At this point we know there is some kind of overlap/enclosure */ + /* + * At this point we know there is some kind of + * overlap/enclosure + */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? i : -2; continue; - } - else if (types_compatible(type, ltype)) + } else if (types_compatible(type, ltype)) continue; } - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); goto out; } - /* New region is enclosed by an existing region */ + /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); goto out; } if (increment) @@ -378,7 +400,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; goto out; } - /* Search for an empty MTRR */ + /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); @@ -393,8 +415,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, mtrr_usage_table[replace] = 0; } } - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); + } else { + pr_info("mtrr: no more MTRRs available\n"); + } error = i; out: mutex_unlock(&mtrr_mutex); @@ -405,10 +428,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -416,66 +437,64 @@ static int mtrr_check(unsigned long base, unsigned long size) } /** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); /** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -500,22 +519,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + pr_warning("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + pr_warning("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -526,33 +545,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) put_online_cpus(); return error; } + /** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! +/* + * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ @@ -576,29 +593,28 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device * sysdev, pm_message_t state) +static int mtrr_save(struct sys_device *sysdev, pm_message_t state) { int i; for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } -static int mtrr_restore(struct sys_device * sysdev) +static int mtrr_restore(struct sys_device *sysdev) { int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) - set_mtrr(i, - mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } } return 0; } @@ -615,26 +631,29 @@ int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * - * This needs to be called early; before any of the other CPUs are + * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). - * + * */ void __init mtrr_bp_init(void) { u32 phys_addr; + init_ifs(); phys_addr = 32; if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; phys_addr = 36; - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it to when they extend the address + * bus of the Xeon. + */ if (cpuid_eax(0x80000000) >= 0x80000008) { phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ @@ -649,9 +668,11 @@ void __init mtrr_bp_init(void) size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; phys_addr = 32; } @@ -694,7 +715,6 @@ void __init mtrr_bp_init(void) changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); } - } } } @@ -706,12 +726,17 @@ void mtrr_ap_init(void) if (!mtrr_if || !use_intel()) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes */ local_irq_save(flags); @@ -732,19 +757,23 @@ static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; + if (use_intel()) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); - } else { - /* The CPUs haven't MTRR and seem to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * TBD: is there any system with such CPU which supports - * suspend/resume? if no, we should remove the code. - */ - sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + return 0; } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + return 0; } subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b767f206..a501dee9a87a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,5 +1,5 @@ /* - * local mtrr defines. + * local MTRR defines. */ #include <linux/types.h> @@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; u32 use_intel_if; -// void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); + unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, @@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { - unsigned long flags; - unsigned long cr4val; - u32 deftype_lo; - u32 deftype_hi; - u32 ccr3; + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops * ops); +extern void set_mtrr_ops(struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops * mtrr_if; +extern struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 1f5fb1588d1f..dfc80b4e6b0d 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -1,24 +1,25 @@ -#include <linux/mm.h> #include <linux/init.h> -#include <asm/io.h> -#include <asm/mtrr.h> -#include <asm/msr.h> +#include <linux/io.h> +#include <linux/mm.h> + #include <asm/processor-cyrix.h> #include <asm/processor-flags.h> -#include "mtrr.h" +#include <asm/mtrr.h> +#include <asm/msr.h> +#include "mtrr.h" -/* Put the processor into a state where MTRRs can be safely set */ +/* Put the processor into a state where MTRRs can be safely set */ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) { unsigned int cr0; - /* Disable interrupts locally */ + /* Disable interrupts locally */ local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { - /* Save value of CR4 and clear Page Global Enable (bit 7) */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~X86_CR4_PGE); @@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) write_cr0(cr0); wbinvd(); - if (use_intel()) - /* Save MTRR state */ + if (use_intel()) { + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else were excluded at the top */ + } else { + /* + * Cyrix ARRs - + * everything else were excluded at the top + */ ctxt->ccr3 = getCx86(CX86_CCR3); + } } } void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { - if (use_intel()) - /* Disable MTRRs, and set the default type to uncached */ + if (use_intel()) { + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - else if (is_cpu(CYRIX)) - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } else { + if (is_cpu(CYRIX)) { + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } + } } -/* Restore the processor after a set_mtrr_prepare */ +/* Restore the processor after a set_mtrr_prepare */ void set_mtrr_done(struct set_mtrr_context *ctxt) { if (use_intel() || is_cpu(CYRIX)) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); - /* Restore MTRRdefType */ - if (use_intel()) + /* Restore MTRRdefType */ + if (use_intel()) { /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else was excluded at the top */ + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, + ctxt->deftype_hi); + } else { + /* + * Cyrix ARRs - + * everything else was excluded at the top + */ setCx86(CX86_CCR3, ctxt->ccr3); + } - /* Enable caches */ + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ + /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(ctxt->cr4val); } - /* Re-enable interrupts locally (if enabled previously) */ + /* Re-enable interrupts locally (if enabled previously) */ local_irq_restore(ctxt->flags); } - diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f900954..b237c181aa41 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * * For licencing details see kernel-base/COPYING */ @@ -20,6 +21,7 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <linux/cpu.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -27,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -57,6 +99,8 @@ struct x86_pmu { u64 counter_mask; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -576,6 +620,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -659,10 +706,109 @@ static void release_pmc_hardware(void) enable_lapic_nmi_watchdog(); } +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return -EOPNOTSUPP; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -705,6 +851,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ @@ -721,9 +903,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -801,7 +987,18 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -859,7 +1056,25 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -946,6 +1161,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -974,6 +1194,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -1056,6 +1279,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -1077,11 +1308,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + if (!x86_pmu.num_counters_fixed) return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1102,7 +1338,25 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* + * Try to use BTS for branch tracing. If that is not + * available, try to get a generic counter. + */ + if (unlikely(!cpuc->ds)) + goto try_generic; + + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1213,6 +1467,45 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + u64 at; + + if (!counter) + return; + + if (!ds) + return; + + for (at = ds->bts_buffer_base; + at < ds->bts_index; + at += sizeof(struct bts_record)) { + struct bts_record *rec = (struct bts_record *)(long)at; + + data->regs->ip = rec->from; + data->addr = rec->to; + + perf_counter_output(counter, 1, data); + } + + ds->bts_index = ds->bts_buffer_base; + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1237,6 +1530,15 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = ®s; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1264,6 +1566,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1281,6 +1584,8 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1346,6 +1651,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1547,6 +1853,8 @@ static struct x86_pmu intel_pmu = { * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1936,3 +2244,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index e60ed740d2b3..392bea43b890 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); + return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); + return msr - MSR_ARCH_PERFMON_PERFCTR0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_PERFCTR0); + return msr - MSR_P6_PERFCTR0; case 15: - return (msr - MSR_P4_BPU_PERFCTR0); + return msr - MSR_P4_BPU_PERFCTR0; } } return 0; @@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); + return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + return msr - MSR_ARCH_PERFMON_EVENTSEL0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_EVNTSEL0); + return msr - MSR_P6_EVNTSEL0; case 15: - return (msr - MSR_P4_BSU_ESCR0); + return msr - MSR_P4_BSU_ESCR0; } } return 0; @@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) { BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } /* checks the an msr for availability */ @@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) counter = nmi_perfctr_msr_to_bit(msr); BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); @@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) */ counter_val = (u64)cpu_khz * 1000; do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { + if (counter_val > 0x7fffffffULL) { u64 count = (u64)cpu_khz * 1000; do_div(count, 0x7fffffffUL); retval = count + 1; @@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) apic_write(APIC_LVTPC, APIC_DM_NMI); /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); + write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); } static const struct wd_ops p6_wd_ops = { @@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) if (smp_num_siblings == 2) { unsigned int ebx, apicid; - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; } else #endif ht_num = 0; @@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) } evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS + | P4_ESCR_OS | P4_ESCR_USR; cccr_val |= P4_CCCR_THRESHOLD(15) @@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* - * P4 quirks: + * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be @@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) * NOTE: Corresponding bit = 0 in ebx indicates event present. */ cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + if ((eax.split.mask_length < + (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) return 0; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246b..62ac8cb6ba27 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { @@ -128,7 +126,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", + x86_power_flags[i][0] ? " " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399e3234..bc24f514ec93 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,17 +49,17 @@ static inline int __vmware_platform(void) static unsigned long __vmware_get_tsc_khz(void) { - uint64_t tsc_hz; - uint32_t eax, ebx, ecx, edx; + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - return tsc_hz; + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; } /* diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d9..37250fe490b1 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe1386038..ef42a038f1a6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c8405718a4c3..2d8a371d4339 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,7 +15,6 @@ #include <linux/bug.h> #include <linux/nmi.h> #include <linux/sysfs.h> -#include <linux/ftrace.h> #include <asm/stacktrace.h> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9fe..8e9663413b7f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f5..74656d1d4e30 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d58..ccf8ab54f31a 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ae13e34f7248..2cf12fd32c83 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -147,7 +147,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e7..98a827ee9ed7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f30..c0a4222bf62b 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e66b18f45cb..aaa6b7839f1e 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,31 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984af..4cf79567cdab 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9d..ad535b683170 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ @@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94e..cc26ad4c3070 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c4..3149032ff107 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/ptrace.h> +#include <asm/desc.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475d..83264922a878 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -76,7 +76,7 @@ char ignore_fpu_irq; * F0 0F bug workaround.. We have a special link segment * for this. */ -gate_desc idt_table[256] +gate_desc idt_table[NR_VECTORS] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif @@ -786,33 +786,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; } -#endif -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) { + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } /* @@ -846,17 +847,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bfae139182ff..775a020990a5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -285,26 +285,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address, tsk->thread.screen_bitmap |= 1 << bit; } -static void dump_pagetable(unsigned long address) +static bool low_pfn(unsigned long pfn) { - __typeof__(pte_val(__pte(0))) page; + return pfn < max_low_pfn; +} - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); + printk("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; #endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* * We must not directly access the pte in the highpte @@ -312,16 +311,12 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + pte = pte_offset_kernel(pmd, address); + printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: printk("\n"); } @@ -450,16 +445,12 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *pgd; + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - - pgd += pgd_index(address); if (bad_address(pgd)) goto bad; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed55e7ea..1617958a3805 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -24,7 +24,7 @@ void kunmap(struct page *page) * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * - * However when holding an atomic kmap is is not legal to sleep, so atomic + * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 89b9a5cd63da..cb88b1a0bd5f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,11 +1,14 @@ /** * @file nmi_int.c * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Robert Richter <robert.richter@amd.com> + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> */ #include <linux/init.h> @@ -24,13 +27,35 @@ #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const *model; +static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + event &= model->event_mask ? model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (event & 0x0F00) << 24; + + return val; +} + + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self, static void nmi_cpu_save_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); + } + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); + } +} + +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->start(msrs); +} + +static int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 1); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->stop(msrs); +} + +static void nmi_stop(void) +{ + on_each_cpu(nmi_cpu_stop, NULL, 1); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + +inline int op_x86_phys_to_virt(int phys) +{ + return __get_cpu_var(switch_index) + phys; +} + +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + +static void nmi_shutdown_mux(void) +{ + int i; + + if (!has_mux()) + return; + + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + + if (!has_mux()) + return 1; + + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + + return 1; +} + +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + if (!has_mux()) + return; - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; } } + + per_cpu(switch_index, cpu) = 0; +} + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + } } -static void nmi_save_registers(void *dummy) +static void nmi_cpu_switch(void *dummy) { int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - nmi_cpu_save_registers(msrs); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!has_mux()) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + return 0; +} + +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); } +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } + +#endif + static void free_msrs(void) { int i; @@ -95,38 +298,32 @@ static void free_msrs(void) static int allocate_msrs(void) { - int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + return 0; per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + return 0; } - if (!success) - free_msrs(); - - return success; + return 1; } static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); - model->setup_ctrs(msrs); + model->setup_ctrs(model, msrs); + nmi_cpu_setup_mux(cpu, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -144,11 +341,15 @@ static int nmi_setup(void) int cpu; if (!allocate_msrs()) - return -ENOMEM; + err = -ENOMEM; + else if (!nmi_setup_mux()) + err = -ENOMEM; + else + err = register_die_notifier(&profile_exceptions_nb); - err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + nmi_shutdown_mux(); return err; } @@ -159,45 +360,38 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); for_each_possible_cpu(cpu) { - if (cpu != 0) { - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - } + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + mux_clone(cpu); } - on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); } - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); } } @@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. That's by design: on @@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); } static void nmi_shutdown(void) @@ -226,42 +420,18 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + nmi_shutdown_mux(); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); } -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); -} - -static int nmi_start(void) -{ - on_each_cpu(nmi_cpu_start, NULL, 1); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); -} - -static void nmi_stop(void) -{ - on_each_cpu(nmi_cpu_stop, NULL, 1); -} - -struct op_counter_config counter_config[OP_MAX_COUNTER]; - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; - for (i = 0; i < model->num_counters; ++i) { + for (i = 0; i < model->num_virt_counters; ++i) { struct dentry *dir; char buf[4]; @@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). */ - if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) continue; snprintf(buf, sizeof(buf), "%d", i); @@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; @@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - arch_perfmon_setup_counters(); + spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type) return 0; } - model = &op_ppro_spec; - return 1; -} - -static int __init arch_perfmon_init(char **cpu_type) -{ - if (!cpu_has_arch_perfmon) - return 0; - *cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - arch_perfmon_setup_counters(); + model = spec; return 1; } @@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops) /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { - default: - return -ENODEV; case 6: - model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_amd_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; + default: + return -ENODEV; } + model = &op_amd_spec; break; case X86_VENDOR_INTEL: @@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) break; } - if (!cpu_type && !arch_perfmon_init(&cpu_type)) + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; break; default: @@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; if (model->init) ret = model->init(ops); if (ret) return ret; + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + + mux_init(ops); + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 91b6a116165e..e28398df0df2 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,7 +10,7 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf9..39686c29f03a 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -9,12 +9,15 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> */ #include <linux/oprofile.h> #include <linux/device.h> #include <linux/pci.h> +#include <linux/percpu.h> #include <asm/ptrace.h> #include <asm/msr.h> @@ -25,43 +28,36 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_CONTROLS 32 +#else +#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_CONTROLS NUM_CONTROLS +#endif + +#define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) + +static unsigned long reset_value[NUM_VIRT_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL /*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 static int has_ibs; /* AMD Family10h and later */ @@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config; #endif +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = op_x86_virt_to_phys(i); + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +} + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#else + +static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } + +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) else msrs->controls[i].addr = 0; } -} + op_mux_fill_in_addresses(msrs); +} -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; + /* setup reset_value */ + for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + if (counter_config[i].enabled) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < NUM_CONTROLS; ++i) { + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; - CTR_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + if (!msrs->counters[i].addr) + continue; - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } #ifdef CONFIG_OPROFILE_IBS -static inline int +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - u32 low, high; - u64 msr; + u64 val, ctl; struct op_entry entry; if (!has_ibs) - return 1; + return; if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - oprofile_add_data(&entry, low); - oprofile_add_data(&entry, high); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; + ctl |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } +} - return 1; +static inline void op_amd_start_ibs(void) +{ + u64 val; + if (has_ibs && ibs_config.fetch_enabled) { + val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); + } + + if (has_ibs && ibs_config.op_enabled) { + val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, val); + } +} + +static void op_amd_stop_ibs(void) +{ + if (has_ibs && ibs_config.fetch_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + + if (has_ibs && ibs_config.op_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSOPCTL, 0); } +#else + +static inline void op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { } +static inline void op_amd_start_ibs(void) { } +static inline void op_amd_stop_ibs(void) { } + #endif static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); } -#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); -#endif /* See op_model_ppro.c */ return 1; @@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, static void op_amd_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } - if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_start_ibs(); } - static void op_amd_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } - if (has_ibs && ibs_config.op_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_stop_ibs(); } static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < NUM_COUNTERS; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = 0; i < NUM_CONTROLS; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } @@ -490,15 +520,21 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ -struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, +struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_virt_counters = NUM_VIRT_COUNTERS, + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + .switch_ctrl = &op_mux_switch_ctrl, +#endif }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd752..ac6b354becdf 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -32,6 +32,8 @@ #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) +#define OP_CTR_OVERFLOW (1ULL<<31) + static unsigned int num_counters = NUM_COUNTERS_NON_HT; static unsigned int num_controls = NUM_CONTROLS_NON_HT; @@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) -#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) -#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) -#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) - /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" @@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ - ESCR_READ(escr, high, ev, i); + rdmsr(ev->bindings[i].escr_address, escr, high); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); @@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - ESCR_WRITE(escr, high, ev, i); + wrmsr(ev->bindings[i].escr_address, escr, high); /* modify CCCR */ - CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); @@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) CCCR_SET_PMI_OVF_0(cccr); else CCCR_SET_PMI_OVF_1(cccr); - CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); return; } } @@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } -static void p4_setup_ctrs(struct op_msrs const * const msrs) +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; @@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) } /* clear the cccrs we will use */ - for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < num_counters; i++) { + if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ - for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, real = VIRT_CTR(stag, i); - CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); - if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); CCCR_CLEAR_OVF(low); - CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); } } @@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_ENABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_DISABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } /* @@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs) * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ - for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } } #ifdef CONFIG_SMP -struct op_x86_model_spec const op_p4_ht2_spec = { +struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, @@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = { }; #endif -struct op_x86_model_spec const op_p4_spec = { +struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4da7230b3d17..4899215999de 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -10,6 +10,7 @@ * @author Philippe Elie * @author Graydon Hoare * @author Andi Kleen + * @author Robert Richter <robert.richter@amd.com> */ #include <linux/oprofile.h> @@ -18,7 +19,6 @@ #include <asm/msr.h> #include <asm/apic.h> #include <asm/nmi.h> -#include <asm/perf_counter.h> #include "op_x86_model.h" #include "op_counter.h" @@ -26,20 +26,7 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT(val, e) (val |= e) +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) } -static void ppro_setup_ctrs(struct op_msrs const * const msrs) +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) { @@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) } /* clear all counters */ - for (i = 0 ; i < num_counters; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < num_counters; ++i) { + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < num_counters; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (unlikely(!reset_value)) goto out; - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); - if (CTR_OVERFLOWED(val)) { - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } out: @@ -171,16 +153,16 @@ out: static void ppro_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } } @@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs) static void ppro_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) @@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } @@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < num_counters ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { @@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, /* can be overriden */ - .num_controls = 2, /* dito */ + .num_counters = 2, + .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = { * the specific CPU. */ -void arch_perfmon_setup_counters(void) +static void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; @@ -259,11 +242,17 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; - op_ppro_spec.num_counters = num_counters; - op_ppro_spec.num_controls = num_counters; +} + +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; } struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, /* user space does the cpuid check for available events */ diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d64..b83776180c7f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -6,51 +6,66 @@ * @remark Read the file COPYING * * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -struct op_saved_msr { - unsigned int high; - unsigned int low; -}; +#include <asm/types.h> +#include <asm/perf_counter.h> struct op_msr { - unsigned long addr; - struct op_saved_msr saved; + unsigned long addr; + u64 saved; }; struct op_msrs { struct op_msr *counters; struct op_msr *controls; + struct op_msr *multiplex; }; struct pt_regs; +struct oprofile_operations; + /* The model vtable abstracts the differences between * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { - int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - unsigned int num_counters; - unsigned int num_controls; - void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); + unsigned int num_counters; + unsigned int num_controls; + unsigned int num_virt_counters; + u64 reserved; + u16 event_mask; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif }; +struct op_counter_config; + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); +extern int op_x86_phys_to_virt(int phys); +extern int op_x86_virt_to_phys(int virt); + extern struct op_x86_model_spec op_ppro_spec; -extern struct op_x86_model_spec const op_p4_spec; -extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_p4_spec; +extern struct op_x86_model_spec op_p4_ht2_spec; +extern struct op_x86_model_spec op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; -extern void arch_perfmon_setup_counters(void); - #endif /* OP_X86_MODEL_H */ |