diff options
Diffstat (limited to 'arch/x86/kernel')
81 files changed, 2796 insertions, 1349 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d8e5d0cdd678..4f2e66e29ecc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 67e929b89875..87eee07da21f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0. We really do not diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c9..d85d1b2432ba 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ca93638ba430..82e508677b91 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,12 +78,9 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); - header->pmode_efer_low = nx_enabled; - if (header->pmode_efer_low & 1) { - /* This is strange, why not save efer, always? */ - rdmsr(MSR_EFER, header->pmode_efer_low, - header->pmode_efer_high); - } + if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, + &header->pmode_efer_high)) + header->pmode_efer_low = header->pmode_efer_high = 0; #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); @@ -119,29 +116,32 @@ void acpi_restore_state_mem(void) /** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * * We allocate a page from the first 1MB of memory for the wakeup * routine for when we come back from a sleep state. The * runtime allocator allows specification of <16MB pages, but not * <1MB pages. */ -void __init acpi_reserve_bootmem(void) +void __init acpi_reserve_wakeup_memory(void) { + unsigned long mem; + if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); return; } - acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); + mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - if (!acpi_realmode) { + if (mem == -1L) { printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); return; } - - acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); + acpi_realmode = (unsigned long) phys_to_virt(mem); + acpi_wakeup_address = mem; + reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index da7b7b9f8bd8..565c1bfc507d 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,7 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..efb2b9cd132c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -241,28 +241,13 @@ static int modern_apic(void) } /* - * bare function to substitute write operation - * and it's _that_ fast :) - */ -static void native_apic_write_dummy(u32 reg, u32 v) -{ - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); -} - -static u32 native_apic_read_dummy(u32 reg) -{ - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); - return 0; -} - -/* - * right after this call apic->write/read doesn't do anything - * note that there is no restore operation it works one way + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything */ void apic_disable(void) { - apic->read = native_apic_read_dummy; - apic->write = native_apic_write_dummy; + pr_info("APIC: switched to apic NOOP\n"); + apic = &apic_noop; } void native_apic_wait_icr_idle(void) @@ -459,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0xffffffff); + apic_write(APIC_TMICT, 0); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -662,7 +647,7 @@ static int __init calibrate_APIC_clock(void) calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", calibration_result); @@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; - int dmar_table_init_ret = 0; + int dmar_table_init_ret; -#ifdef CONFIG_INTR_REMAP dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret) - pr_debug("dmar_table_init() failed with %d:\n", - dmar_table_init_ret); -#endif + if (dmar_table_init_ret && !x2apic_supported()) + return; ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000000..e31b9ffe25f5 --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,200 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... + */ + +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/setup.h> + +#include <linux/smp.h> +#include <asm/ipi.h> + +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/e820.h> + +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + return -1; +} + +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static int noop_cpu_to_logical_apicid(int cpu) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} + +static int noop_probe(void) +{ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +int noop_apicid_to_node(int logical_apicid) +{ + /* we're always on node 0 */ + return 0; +} + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE(cpu_has_apic && !disable_apic); +} + +struct apic apic_noop = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + .vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .apicid_to_node = noop_apicid_to_node, + + .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = NULL, + + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, +}; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77a06413b6b2..38dcecfa5818 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void) #endif } -static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - /* Mapping from cpu number to logical apicid */ static inline int bigsmp_cpu_to_logical_apicid(int cpu) { @@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu) return cpu_physical_id(cpu); } -static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xFFL); + physids_promote(0xFFL, retmap); } static int bigsmp_check_phys_apicid_present(int phys_apicid) @@ -230,7 +225,7 @@ struct apic apic_bigsmp = { .apicid_to_node = bigsmp_apicid_to_node, .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 89174f847b49..dd2b5f264643 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -27,6 +27,9 @@ * * http://www.unisys.com */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/cpumask.h> @@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + pr_debug("host_reg = 0x%lx\n", (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + pr_debug("mip_reg = 0x%lx\n", (unsigned long)mip_reg); success++; break; @@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void) if (!es7000_plat) return; - printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); + pr_info("Enabling APIC mode.\n"); memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); es7000_mip_reg.off_0x00 = MIP_SW_APIC; es7000_mip_reg.off_0x38 = MIP_VALID; @@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void) return cpumask_of(smp_processor_id()); } -static unsigned long -es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } + static unsigned long es7000_check_apicid_present(int bit) { return physid_isset(bit, phys_cpu_present_map); @@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk(KERN_INFO - "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); @@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu) static int cpu_id; -static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) { - physid_mask_t mask; - - mask = physid_mask_of_physid(cpu_id); + physid_set_mask_of_physid(cpu_id, retmap); ++cpu_id; - - return mask; } /* Mapping from cpu number to logical apicid */ @@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu) #endif } -static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xff); + physids_promote(0xFFL, retmap); } static int es7000_check_phys_apicid_present(int cpu_physical_apicid) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index dc69f28489f5..d5d498fbee4b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -60,8 +60,6 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/hw_irq.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_irq.h> #include <asm/apic.h> @@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -209,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -1237,8 +1227,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base) { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) { + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ @@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void) * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2272,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that, or returns BAD_APICID and * leaves desc->affinity untouched. */ -static unsigned int +unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; @@ -2433,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2452,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } @@ -2460,21 +2456,33 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif @@ -2490,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2525,6 +2586,19 @@ static void ack_apic_level(unsigned int irq) * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2536,6 +2610,19 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(desc); + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2569,41 +2656,9 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); @@ -3157,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3211,7 +3267,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3245,7 +3302,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3410,7 +3470,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3543,7 +3603,7 @@ int arch_setup_dmar_msi(unsigned int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3583,6 +3643,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3594,20 +3667,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } @@ -3708,75 +3797,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3944,7 +3964,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -3960,10 +3980,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -3976,7 +3996,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -4106,7 +4126,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } @@ -4140,18 +4160,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..6389432a9dbf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,8 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_t backtrace_mask __read_mostly; +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, &backtrace_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); @@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, &backtrace_mask); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; } @@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(&backtrace_mask, cpu_online_mask); + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); apic->send_IPI_all(NMI_VECTOR); /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(&backtrace_mask)) + if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index efa00e2b8505..98c4665f251c 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static __init void early_check_numaq(void) { /* - * Find possible boot-time SMP configuration: - */ - early_find_smp_config(); - - /* * get boot-time SMP configuration: */ if (smp_found_config) @@ -334,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void) return cpu_all_mask; } -static inline unsigned long -numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long numaq_check_apicid_present(int bit) @@ -371,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq) return apic != 0 && irq == 0; } -static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL); + return physids_promote(0xFUL, retmap); } static inline int numaq_cpu_to_logical_apicid(int cpu) @@ -402,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } -static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) +static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); int cpu = __ffs(logical_apicid & 0xf); - return physid_mask_of_physid(cpu + 4*node); + physid_set_mask_of_physid(cpu + 4*node, retmap); } /* Where the IO area was mapped on multiquad, always 0 otherwise */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0c0182cc947d..1a6559f6768c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,7 @@ struct apic apic_default = { .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = default_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 645ecc4ff0be..9b419263d90d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void) return cpumask_of(0); } -static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0x0F); + physids_promote(0x0FL, retmap); } -static physid_mask_t summit_apicid_to_cpu_present(int apicid) +static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) { - return physid_mask_of_physid(0); + physid_set_mask_of_physid(0, retmap); } static int summit_check_phys_apicid_present(int physical_apicid) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 326c25477d3d..b684bb303cbf 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,10 +30,22 @@ #include <asm/apic.h> #include <asm/ipi.h> #include <asm/smp.h> +#include <asm/x86_init.h> DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; + +static inline bool is_GRU_range(u64 start, u64 end) +{ + return start >= gru_start_paddr && end <= gru_end_paddr; +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} static int early_get_nodeid(void) { @@ -49,6 +61,7 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode) int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (gru.s.enable) + if (gru.s.enable) { map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); + + } } static __init void map_mmr_high(int max_pnode) @@ -409,6 +426,12 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + static __init void uv_rtc_init(void) { long status; @@ -550,6 +573,8 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 151ace69a5aa..b5b6b23bce53 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,7 +204,6 @@ #include <linux/module.h> #include <linux/poll.h> -#include <linux/smp_lock.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/timer.h> @@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); /* * Set up a segment that references the real mode segment 0x40 @@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); - unlock_kernel(); + mutex_unlock(&apm_mutex); break; case APM_IOC_SUSPEND: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { ret = suspend(1); + mutex_unlock(&apm_mutex); } else { as->suspend_wait = 1; + mutex_unlock(&apm_mutex); wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); ret = as->suspend_result; } - unlock_kernel(); return ret; default: return -ENOTTY; @@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; - lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); - unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; - unlock_kernel(); return 0; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 68537e957a9b..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c910a716a71c..7128b3799cec 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c95e831bb095..e58d978e0758 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); } enum { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..c1afa990a6c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void) static void __cpuinit default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 - display_cacheinfo(c); + cpu_detect_cache_sizes(c); #else /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); #ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ @@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #endif c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -659,24 +654,31 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; +#ifdef PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; - unsigned int j; if (count >= X86_VENDOR_NUM) break; cpu_devs[count] = cpudev; count++; - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); +#ifdef PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } } +#endif } - early_identify_cpu(&boot_cpu_data); } @@ -837,10 +839,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); -#endif + mcheck_cpu_init(c); select_idle_routine(c); @@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void) wrmsrl(MSR_KERNEL_GS_BASE, 0); barrier(); - check_efer(); + x86_configure_nx(); if (cpu != 0) enable_x2apic(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 6de9a908e400..3624e8a0f71b 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,6 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; -extern void display_cacheinfo(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..8b581d3905cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) { - /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf * AL30: A Machine Check Exception (MCE) Occurring during an * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up when HT is enabled*/ + * Both Processor Cores to Lock Up. */ if (c->x86_vendor == X86_VENDOR_INTEL) { if ((c->x86 == 15) && (c->x86_model == 6) && - (c->x86_mask == 8) && smt_capable()) + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); return -ENODEV; + } } return 0; } @@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP - result = acpi_cpufreq_blacklist(c); - if (result) - return result; + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; #endif data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad9..cabd2fa3fc93 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6394aa5c7985..3f12dabeab52 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f6..3ae5a7a3a500 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 19807b89f058..4fbd384fb645 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c) /* Handle the GX (Formally known as the GX2) */ if (c->x86 == 5 && c->x86_model == 5) - display_cacheinfo(c); + cpu_detect_cache_sizes(c); else init_cyrix(c); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 40e1835b35e8..c900b73f9224 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. */ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) + if (node == NUMA_NO_NODE) node = first_node(node_online_map); + else if (!node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } numa_set_node(cpu, node); printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..6c40f6b5b340 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ @@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; @@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #endif } - if (trace) - printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if (l1i) - printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); - - if (l1d) - printk(KERN_CONT ", L1 D cache: %dK\n", l1d); - else - printk(KERN_CONT "\n"); - - if (l2) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - - if (l3) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 721a77ca8115..a8aacd4b513c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,9 @@ #include "mce-internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> + int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -static void default_decode_mce(struct mce *m) +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { pr_emerg("No human readable MCE decoding support on this CPU type.\n"); pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; } -/* - * CPU/chipset specific EDAC code can register a callback here to print - * MCE errors in a human-readable form: - */ -void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; -EXPORT_SYMBOL(x86_mce_decode_callback); +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -141,6 +152,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { @@ -204,9 +218,9 @@ static void print_mce(struct mce *m) /* * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that): + * (if the CPU has an implementation for that) */ - x86_mce_decode_callback(m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) @@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1187,7 +1201,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int mce_banks_init(void) +static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; @@ -1206,7 +1220,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void) WARN_ON(banks != 0 && b != banks); banks = b; if (!mce_banks) { - int err = mce_banks_init(); + int err = __mcheck_cpu_mce_banks_init(); if (err) return err; @@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1273,7 +1287,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1369,18 +1383,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); + setup_timer(t, mce_start_timer, smp_processor_id()); + if (mce_ignore_ce) return; *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1400,27 +1415,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + } /* @@ -1640,6 +1656,15 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + mcheck_intel_therm_init(); + + return 0; +} + /* * Sysfs support */ @@ -1648,7 +1673,7 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; @@ -1663,12 +1688,12 @@ static int mce_disable(void) static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1678,8 +1703,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1689,8 +1714,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1716,7 +1741,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -1904,7 +1929,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1929,13 +1954,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { @@ -1946,7 +1972,7 @@ static void mce_disable_cpu(void *h) } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -1991,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + + if (!mce_ignore_ce && check_interval) { + t->expires = round_jiffies(jiffies + __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); + add_timer_on(t, cpu); + } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; case CPU_POST_DEAD: @@ -2025,7 +2053,7 @@ static __init void mce_init_banks(void) } } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -2053,7 +2081,7 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); +device_initcall(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. @@ -2101,7 +2129,7 @@ static int fake_panic_set(void *data, u64 val) DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mce_debugfs_init(void) +static int __init mcheck_debugfs_init(void) { struct dentry *dmce, *ffake_panic; @@ -2115,5 +2143,5 @@ static int __init mce_debugfs_init(void) return 0; } -late_initcall(mce_debugfs_init); +late_initcall(mcheck_debugfs_init); #endif diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3a1dba75330..4fef985fc221 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; + #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) @@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 73c86db5acbe..09b1698e0466 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } +static int __init clean_sort_range(struct res_range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + + return nr_range; +} + #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, subtract_range(range, extra_remove_base, extra_remove_base + extra_remove_size - 1); - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", range[i].start, range[i].end + 1); + } } /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) @@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void) continue; if (!size) type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; num[type]++; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..c1bbed1021d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -77,6 +77,18 @@ struct cpu_hw_events { struct debug_store *ds; }; +struct event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -102,6 +114,8 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraints; + /* * Not sure about some of these */ @@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later. @@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. */ @@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { unsigned int hw_event; @@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) if (!x86_pmu.num_events_fixed) return -1; + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) } /* - * Find a PMC slot for the freshly enabled / scheduled in event: + * generic counter allocator: get next free counter */ -static int x86_pmu_enable(struct perf_event *event) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + int idx; + + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? -1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) +{ + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraints) + goto skip; + + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; + + for_each_event_constraint(event_constraint, event_constraints) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(cpuc, hwc); +} + +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(event, hwc); + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->event_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* @@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event) } else { idx = hwc->idx; /* Try to get the previous generic event again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_events); - if (idx == x86_pmu.num_events) + idx = x86_pmu.get_event_idx(cpuc, hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); @@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, @@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, @@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, @@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -1932,10 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -1954,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -2007,12 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: @@ -2025,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) @@ -2105,11 +2224,47 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu && event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event) >= 0; +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } if (err) { if (event->destroy) event->destroy(event); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fab786f60ed6..898df9719afb 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) + boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) return; wd_ops = &k7_wd_ops; break; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index bb62b3e5caad..28000743bbb0 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) early_init_transmeta(c); - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Print CMS and CPU revision */ max = cpuid_eax(0x80860000); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6a52d4b36a30..7ef24a796992 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file) { unsigned int cpu; struct cpuinfo_x86 *c; - int ret = 0; - - lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (c->cpuid_level < 0) - ret = -EIO; /* CPUID not supported */ -out: - unlock_kernel(); - return ret; + return -EIO; /* CPUID not supported */ + + return 0; } /* diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371d4339..b8ce165dde5d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f7dd2a7c3bf4..e0ed4c7abb62 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -10,9 +10,9 @@ #include <linux/module.h> #include <linux/ptrace.h> #include <linux/kexec.h> +#include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> -#include <linux/sysfs.h> #include <asm/stacktrace.h> @@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; + stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.sp; @@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, - data, NULL, &graph); + bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177e..8e740934bd1f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -10,26 +10,28 @@ #include <linux/module.h> #include <linux/ptrace.h> #include <linux/kexec.h> +#include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> -#include <linux/sysfs.h> #include <asm/stacktrace.h> #include "dumpstack.h" +#define N_EXCEPTION_STACKS_END \ + (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", + [ DEBUG_STACK-1 ] = "#DB", + [ NMI_STACK-1 ] = "NMI", + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ STACKFAULT_STACK-1 ] = "#SS", + [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" + [ N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif - }; +}; int x86_is_stack_id(int id, char *name) { @@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name) } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) + unsigned *usedp, char **idp) { unsigned k; @@ -202,21 +204,24 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { + unsigned long *irq_stack_end; + unsigned long *irq_stack; unsigned long *stack; + int cpu; int i; - const int cpu = smp_processor_id(); - unsigned long *irq_stack_end = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - unsigned long *irq_stack = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + preempt_disable(); + cpu = smp_processor_id(); + + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. + * Debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu: */ - if (sp == NULL) { if (task) sp = (unsigned long *)task->thread.sp; @@ -240,6 +245,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } + preempt_enable(); + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -303,4 +310,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..50b9c220e121 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -334,6 +334,10 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to * go as quickly as possible which is why some of this is @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder @@ -1185,17 +1209,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..63bca794c8f9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 16(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif @@ -803,6 +803,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -941,6 +945,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. @@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ @@ -1491,12 +1499,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs END(error_entry) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e1652..309689245431 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/hardirq.h> #include <linux/uaccess.h> @@ -187,9 +189,26 @@ static void wait_for_nmi(void) nmi_wait_count++; } +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + static int do_ftrace_mod_code(unsigned long ip, void *new_code) { + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + mod_code_ip = (void *)ip; mod_code_newcode = new_code; @@ -336,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . + 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } @@ -468,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; -} - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) +unsigned long __init arch_syscall_addr(int nr) { - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; + return (unsigned long)(&sys_call_table)[nr]; } -arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b1..7fd318bac59c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -18,6 +18,8 @@ #include <asm/asm-offsets.h> #include <asm/setup.h> #include <asm/processor-flags.h> +#include <asm/msr-index.h> +#include <asm/cpufeature.h> #include <asm/percpu.h> /* Physical address */ @@ -297,25 +299,27 @@ ENTRY(startup_32_smp) orl %edx,%eax movl %eax,%cr4 - btl $5, %eax # check if PAE is enabled - jnc 6f + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz 6f /* Check if extended functions are implemented */ movl $0x80000000, %eax cpuid - cmpl $0x80000000, %eax - jbe 6f + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja 6f mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ - btl $20, %edx + btl $(X86_FEATURE_NX & 31), %edx jnc 6f /* Setup EFER (Extended Feature Enable Register) */ - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx rdmsr - btsl $11, %eax + btsl $_EFER_NX, %eax /* Make changes effective */ wrmsr diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 780cd928fcd5..2d8b5035371c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -212,8 +212,8 @@ ENTRY(secondary_startup_64) */ lgdt early_gdt_descr(%rip) - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax + /* set up data segments */ + xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es @@ -262,11 +262,11 @@ ENTRY(secondary_startup_64) .quad x86_64_start_kernel ENTRY(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - __FINITDATA ENTRY(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 + __FINITDATA bad_address: jmp bad_address @@ -340,6 +340,7 @@ ENTRY(name) i = i + 1 ; \ .endr + .data /* * This default setting generates an ident mapping at address 0x100000 * and a mapping for the kernel that precisely maps virtual address diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dedc2bddf7a5..ba6e65884603 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,6 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -47,12 +48,12 @@ struct hpet_dev { char name[10]; }; -unsigned long hpet_readl(unsigned long a) +inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } -static inline void hpet_writel(unsigned long d, unsigned long a) +static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } @@ -167,7 +168,7 @@ do { \ static void hpet_reserve_msi_timers(struct hpet_data *hd); -static void hpet_reserve_platform_timers(unsigned long id) +static void hpet_reserve_platform_timers(unsigned int id) { struct hpet __iomem *hpet = hpet_virt_address; struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; @@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id) } #else -static void hpet_reserve_platform_timers(unsigned long id) { } +static void hpet_reserve_platform_timers(unsigned int id) { } #endif /* @@ -246,7 +247,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -271,7 +272,7 @@ static void hpet_resume_counter(void) static void hpet_enable_legacy_int(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); @@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned int cfg, cmp, now; uint64_t delta; switch (mode) { @@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode, delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; + cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; @@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode, * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); break; @@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta, hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register to make sure that - * what we wrote hit the chip before we compare it to the - * counter. + * We need to read back the CMP register on certain HPET + * implementations (ATI chipsets) which seem to delay the + * transfer of the compare register into the internal compare + * logic. With small deltas this might actually be too late as + * the counter could already be higher than the compare value + * at that point and we would wait for the next hpet interrupt + * forever. We found out that reading the CMP register back + * forces the transfer so we can rely on the comparison with + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + KERN_WARNING "hpet: compare register read back failed.\n"); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs; void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); - unsigned long cfg; + unsigned int cfg; /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq) void hpet_msi_mask(unsigned int irq) { - unsigned long cfg; + unsigned int cfg; struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ @@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } @@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if (!(cfg & HPET_TN_FSB_CAP)) @@ -813,7 +827,7 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { - unsigned long id; + unsigned int id; int i; if (!is_hpet_capable()) @@ -872,10 +886,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +920,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } @@ -925,7 +945,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; @@ -965,8 +985,8 @@ static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; -static unsigned long hpet_default_delta; -static unsigned long hpet_pie_delta; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; @@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); */ int hpet_rtc_timer_init(void) { - unsigned long cfg, cnt, delta, flags; + unsigned int cfg, cnt, delta; + unsigned long flags; if (!is_hpet_enabled()) return 0; @@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; - hpet_default_delta = (unsigned long) clc; + hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) @@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; - hpet_pie_delta = (unsigned long) clc; + hpet_pie_delta = clc; } return 1; } @@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned int cfg, delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..d42f65ac4927 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,555 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/irqflags.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/debugreg.h> + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case X86_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case X86_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case X86_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) + return 0; + + return -EINVAL; +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; + break; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { + case HW_BREAKPOINT_LEN_1: + info->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + info->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + info->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->callback) + ret = arch_store_info(bp); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (info->address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(info->address, info->len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(info->address, info->len)) + return -EFAULT; + } + + return 0; +} + +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} +EXPORT_SYMBOL_GPL(aout_dump_debugregs); + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(cpu_dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap<n> set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +static int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + /* + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. + */ + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + /* + * bp can be NULL due to lazy debug register switching + * or due to concurrent perf counter removing. + */ + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); + + rcu_read_unlock(); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu(); + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 04bbd5278568..664bcb7384ac 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,7 +18,7 @@ atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); #endif - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP @@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); @@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_pending_irqs; #endif - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); @@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_generic_interrupt(struct pt_regs *regs) +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs) irq_enter(); - inc_irq_stat(generic_irqs); + inc_irq_stat(x86_platform_ipis); - if (generic_interrupt_extension) - generic_interrupt_extension(); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); irq_exit(); @@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs) } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq, vector; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7d35d0fe2329..10709f29d166 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - -#ifdef CONFIG_HOTPLUG_CPU - -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - affinity = desc->affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b43a0dd..acf8fbf8fbda 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 40f30773fb29..d5932226614f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -200,8 +200,8 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8d82a77a3f3b..20a5b3689463 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/system.h> @@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; - gdb_regs[GDB_SP] = (int)®s->sp; #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; #endif + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); } /** @@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..1f3186ce213c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,31 +48,22 @@ #include <linux/preempt.h> #include <linux/module.h> #include <linux/kdebug.h> +#include <linux/kallsyms.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/alternative.h> +#include <asm/insn.h> +#include <asm/debugreg.h> void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -244,6 +191,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } @@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); @@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. - */ - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); @@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..a3fa43ba5d3b 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include <asm/desc.h> #include <asm/system.h> #include <asm/cacheflush.h> +#include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) { @@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image) { int error; - if (nx_enabled) - set_pages_x(image->control_code_page, 1); + set_pages_x(image->control_code_page, 1); error = machine_kexec_alloc_page_tables(image); if (error) return error; @@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { - if (nx_enabled) - set_pages_nx(image->control_code_page, 1); + set_pages_nx(image->control_code_page, 1); machine_kexec_free_page_tables(image); } @@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/debugreg.h> static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index f4c538b681ca..37542b67c57e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,6 +13,9 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> @@ -33,6 +36,9 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +const struct firmware *firmware; +static int supported_cpu; + struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -71,17 +77,14 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + if (!supported_cpu) return -1; - } + + memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev) i++; } - if (!equiv_cpu_id) { - printk(KERN_WARNING "microcode: CPU%d: cpu revision " - "not listed in equivalent cpu table\n", cpu); + if (!equiv_cpu_id) return 0; - } - if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d: patch mismatch " - "(processor_rev_id: %x, equiv_cpu_id: %x)\n", - cpu, mc_header->processor_rev_id, equiv_cpu_id); + if (mc_header->processor_rev_id != equiv_cpu_id) return 0; - } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_ERR "microcode: CPU%d: loading of chipset " - "specific code not yet supported\n", cpu); + pr_err("CPU%d: loading of chipset specific code not yet supported\n", + cpu); return 0; } @@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d: update failed " - "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + cpu, mc_amd->hdr.patch_id); return -1; } - printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", - cpu, rev); - + pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_DEBUG "microcode: size %u, total_size %u\n", - size, total_size); - if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error: size mismatch\n"); + pr_err("error: size mismatch\n"); return NULL; } @@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: failed to allocate " - "equivalent CPU table\n"); + pr_err("failed to allocate equivalent CPU table\n"); return 0; } @@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: failed to create " - "equivalent cpu table\n"); + pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); } else { vfree(new_mc); @@ -308,33 +294,26 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; enum ucode_state ret; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + if (firmware == NULL) return UCODE_NFOUND; - } if (*(u32 *)firmware->data != UCODE_MAGIC) { - printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); - release_firmware(firmware); - return ret; } static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_INFO "microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -346,7 +325,31 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +void init_microcode_amd(struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &boot_cpu_data; + + WARN_ON(c->x86_vendor != X86_VENDOR_AMD); + + if (c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return; + } + supported_cpu = 1; + + if (request_firmware(&firmware, fw_name, device)) + pr_err("failed to load file %s\n", fw_name); +} + +void fini_microcode_amd(void) +{ + release_firmware(firmware); +} + static struct microcode_ops microcode_amd_ops = { + .init = init_microcode_amd, + .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 378e9a8f1bf8..844c02c65fcb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,10 +70,12 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/platform_device.h> #include <linux/miscdevice.h> #include <linux/capability.h> -#include <linux/smp_lock.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> @@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } @@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -246,7 +247,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) if (!uci->mc) return UCODE_NFOUND; - pr_debug("microcode: CPU%d updated upon resume\n", cpu); + pr_debug("CPU%d updated upon resume\n", cpu); apply_microcode_on_target(cpu); return UCODE_OK; @@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu) ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); if (ustate == UCODE_OK) { - pr_debug("microcode: CPU%d updated upon init\n", cpu); + pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } @@ -393,7 +394,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid) + if (uci->valid && uci->mc) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); @@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) @@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; @@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - pr_err("microcode: Failed to create group for CPU%d\n", cpu); + pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); break; case CPU_DEAD: case CPU_UP_CANCELED_FROZEN: @@ -509,7 +510,7 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -520,6 +521,9 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } + if (microcode_ops->init) + microcode_ops->init(µcode_pdev->dev); + get_online_cpus(); mutex_lock(µcode_mutex); @@ -540,8 +544,7 @@ static int __init microcode_init(void) register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>," - " Peter Oruba\n"); + " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); return 0; } @@ -563,6 +566,9 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); + if (microcode_ops->fini) + microcode_ops->fini(); + microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0d334ddd0a96..ebd193e476ca 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/kernel.h> @@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); + pr_err("CPU%d not a capable Intel processor\n", cpu_num); return -1; } @@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc) data_size = get_datasize(mc_header); if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + pr_err("error! Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + pr_err("error! Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); + pr_err("error! Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); + pr_err("error! Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; @@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc) while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); + pr_warning("aborting, bad extended signature table checksum\n"); return -EINVAL; } } @@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc) while (i--) orig_sum += ((int *)mc)[i]; if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } if (!ext_table_size) @@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc) - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } } @@ -327,13 +324,11 @@ static int apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update " - "to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); return -1; } - printk(KERN_INFO "microcode: CPU%d updated to revision " - "0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, @@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc_size = get_totalsize(&mc_header); if (!mc_size || mc_size > leftover) { - printk(KERN_ERR "microcode: error!" - "Bad data in microcode data file\n"); + pr_err("error! Bad data in microcode data file\n"); break; } @@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); out: return state; } @@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) c->x86, c->x86_model, c->x86_mask); if (request_firmware(&firmware, name, device)) { - pr_debug("microcode: data file %s load failed\n", name); + pr_debug("data file %s load failed\n", name); return UCODE_NFOUND; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5be95ef4ffec..35a57c963df9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early) */ } -static void __init smp_reserve_bootmem(struct mpf_intel *mpf) +static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); -#ifdef CONFIG_X86_32 - /* - * We cannot access to MPC table to compute table size yet, - * as only few megabytes from the bottom is mapped now. - * PC-9800's MPC table places on the very last of physical - * memory; so that simply reserving PAGE_SIZE from mpf->physptr - * yields BUG() in reserve_bootmem. - * also need to make sure physptr is below than max_low_pfn - * we don't need reserve the area above max_low_pfn - */ - unsigned long end = max_low_pfn * PAGE_SIZE; - if (mpf->physptr < end) { - if (mpf->physptr + size > end) - size = end - mpf->physptr; - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); - } -#else - reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); -#endif + reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } -static int __init smp_scan_config(unsigned long base, unsigned long length, - unsigned reserve) +static int __init smp_scan_config(unsigned long base, unsigned long length) { unsigned int *bp = phys_to_virt(base); struct mpf_intel *mpf; + unsigned long mem; apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", bp, length); @@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", mpf, (u64)virt_to_phys(mpf)); - if (!reserve) - return 1; - reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), - BOOTMEM_DEFAULT); + mem = virt_to_phys(mpf); + reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) - smp_reserve_bootmem(mpf); + smp_reserve_memory(mpf); return 1; } @@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -void __init default_find_smp_config(unsigned int reserve) +void __init default_find_smp_config(void) { unsigned int address; @@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve) * 2) Scan the top 1K of base RAM * 3) Scan the 64K of bios */ - if (smp_scan_config(0x0, 0x400, reserve) || - smp_scan_config(639 * 0x400, 0x400, reserve) || - smp_scan_config(0xF0000, 0x10000, reserve)) + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) return; /* * If it is an SMP machine we should know now, unless the @@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve) address = get_bios_ebda(); if (address) - smp_scan_config(address, 0x400, reserve); + smp_scan_config(address, 0x400); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 6a3cefc7dda1..553449951b84 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -174,21 +174,17 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); - int ret = 0; - lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (!cpu_has(c, X86_FEATURE_MSR)) - ret = -EIO; /* MSR not supported */ -out: - unlock_kernel(); - return ret; + return -EIO; /* MSR not supported */ + + return 0; } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5284cd2b5776..7a7bd4e3ec49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,11 @@ #include <linux/pm.h> #include <linux/clockchips.h> #include <linux/random.h> +#include <linux/user-return-notifier.h> +#include <linux/dmi.h> +#include <linux/utsname.h> #include <trace/events/power.h> +#include <linux/hw_breakpoint.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -17,6 +21,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/ds.h> +#include <asm/debugreg.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -87,6 +92,25 @@ void exit_thread(void) } } +void show_regs_common(void) +{ + const char *board, *product; + + board = dmi_get_system_info(DMI_BOARD_NAME); + if (!board) + board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + printk("\n"); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board, product); +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -103,14 +127,7 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -192,16 +209,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -224,6 +231,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + propagate_user_return_notify(prev_p, next_p); } int sys_fork(struct pt_regs *regs) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..120b88797a75 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include <linux/vmalloc.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/init.h> @@ -35,7 +34,6 @@ #include <linux/tick.h> #include <linux/percpu.h> #include <linux/prctl.h> -#include <linux/dmi.h> #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -58,6 +56,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -127,28 +126,18 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (®s->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } - printk("\n"); - - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + show_regs_common(); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, @@ -187,7 +176,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, ®s->sp, regs->bp); } @@ -259,7 +248,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eb62cbcaa490..e5ab0cd0ef36 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> @@ -38,7 +37,6 @@ #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> -#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -52,6 +50,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage extern void ret_from_fork(void); @@ -162,18 +161,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + + show_regs_common(); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, @@ -226,8 +215,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } @@ -297,12 +285,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -341,29 +333,46 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); - regs->cs = __USER_CS; - regs->ss = __USER_DS; - regs->flags = 0x200; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); +} +#endif /* * switch_to(x,y) should switch tasks from x to y. @@ -495,6 +504,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); + return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7b058a2dc66a..04d182a7cfdb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/workqueue.h> +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -34,6 +36,7 @@ #include <asm/prctl.h> #include <asm/proto.h> #include <asm/ds.h> +#include <asm/hw_breakpoint.h> #include "tls.h" @@ -49,6 +52,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. + */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *(unsigned long *)((char *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -454,99 +555,239 @@ static int genregs_set(struct task_struct *target, return ret; } +static void ptrace_triggered(struct perf_event *bp, void *data) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->debugreg6 |= (DR_TRAP0 << i); +} + /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { - switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } } - return 0; + + return dr7; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk, int disabled) { - int i; + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); - switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = disabled; - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct perf_event *bp; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->ptrace_bps[i]; + + if (!enabled) { + if (bp) { + /* + * Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + + thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + continue; + } + + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + + /* Incorrect bp, or we have a bug in bp API */ + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; - case 7: + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->debugreg6; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); + + if (!t->ptrace_bps[nr]) { /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) + * Put stub len and type to register (reserve) an inactive but + * correct bp */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; return 0; } /* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + thread->debugreg6 = val; + goto ret_path; + } + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); + +ret_path: + return rc; +} + +/* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6c3b2c6fd772..18093d7498f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2b97fc5b124e..1545bc0c9845 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -259,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 61a837743fe5..201eab63b05f 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -80,6 +80,7 @@ void mach_reboot_fixups(void) continue; cur->reboot_fixup(dev); + pci_dev_put(dev); } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2a34f9c5be21..946a311a25c9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -106,9 +106,11 @@ #include <asm/percpu.h> #include <asm/topology.h> #include <asm/apicdef.h> +#include <asm/k8.h> #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif +#include <asm/mce.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -247,7 +249,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -256,7 +258,7 @@ static inline void copy_edd(void) edd.edd_info_nr = boot_params.eddbuf_entries; } #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif @@ -486,42 +488,11 @@ static void __init reserve_early_setup_data(void) #ifdef CONFIG_KEXEC -/** - * Reserve @size bytes of crashkernel memory at any suitable offset. - * - * @size: Size of the crashkernel memory to reserve. - * Returns the base address on success, and -1ULL on failure. - */ -static -unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) -{ - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long start = 0LL; - - while (1) { - int ret; - - start = find_e820_area(start, ULONG_MAX, size, alignment); - if (start == -1ULL) - return start; - - /* try to reserve it */ - ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE); - if (ret >= 0) - return start; - - start += alignment; - } -} - static inline unsigned long long get_total_mem(void) { unsigned long long total; - total = max_low_pfn - min_low_pfn; -#ifdef CONFIG_HIGHMEM - total += highend_pfn - highstart_pfn; -#endif + total = max_pfn - min_low_pfn; return total << PAGE_SHIFT; } @@ -541,21 +512,25 @@ static void __init reserve_crashkernel(void) /* 0 means: find the address automatically */ if (crash_base <= 0) { - crash_base = find_and_reserve_crashkernel(crash_size); + const unsigned long long alignment = 16<<20; /* 16M */ + + crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, + alignment); if (crash_base == -1ULL) { - pr_info("crashkernel reservation failed. " - "No suitable area found.\n"); + pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } } else { - ret = reserve_bootmem_generic(crash_base, crash_size, - BOOTMEM_EXCLUSIVE); - if (ret < 0) { - pr_info("crashkernel reservation failed - " - "memory is in use\n"); + unsigned long long start; + + start = find_e820_area(crash_base, ULONG_MAX, crash_size, + 1<<20); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } + reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -698,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { void __init setup_arch(char **cmdline_p) { + int acpi = 0; + int k8 = 0; + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -790,21 +768,18 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -#ifdef CONFIG_X86_64 /* - * Must call this twice: Once just to detect whether hardware doesn't - * support NX (so that the early EHCI debug console setup can safely - * call set_fixmap(), and then again after parsing early parameters to - * honor the respective command line option. + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. */ - check_efer(); -#endif + x86_configure_nx(); parse_early_param(); -#ifdef CONFIG_X86_64 - check_efer(); -#endif + x86_report_nx(); /* Must be before kernel pagetables are setup */ vmi_activate(); @@ -900,6 +875,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + * even before init_memory_mapping + */ + acpi_reserve_wakeup_memory(); +#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -926,6 +908,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); + reserve_crashkernel(); + vsmp_init(); io_delay_init(); @@ -937,27 +921,24 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. */ - acpi_numa_init(); + acpi = acpi_numa_init(); #endif - initmem_init(0, max_pfn); - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); +#ifdef CONFIG_K8_NUMA + if (!acpi) + k8 = !k8_numa_init(0, max_pfn); #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - reserve_crashkernel(); + initmem_init(0, max_pfn, acpi, k8); #ifdef CONFIG_X86_64 /* @@ -1031,6 +1012,8 @@ void __init setup_arch(char **cmdline_p) #endif #endif x86_init.oem.banner(); + + mcheck_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d559af913e1f..35abcb8b00e9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -20,9 +22,9 @@ #include <asm/stackprotector.h> #ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) +# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) #else -# define DBG(x...) +# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) #endif DEFINE_PER_CPU(int, cpu_number); @@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } else { ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); } return ptr; #else @@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", + pr_warning("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6a44a76055ad..74fe6d86dc5d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -19,6 +19,7 @@ #include <linux/stddef.h> #include <linux/personality.h> #include <linux/uaccess.h> +#include <linux/user-return-notifier.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -799,15 +800,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* @@ -872,6 +864,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (current->replacement_session_keyring) key_replace_session_keyring(); } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..324f2a44c221 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1250,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu) void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 0157cd26d7cc..70c2125d55b9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -336,3 +336,4 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .long sys_recvmmsg diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 1740c85e24bb..364d015efebc 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade) */ apicid = blade_to_first_apicid(blade); pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | UV_BAU_MESSAGE)); - } return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e37dcee0cc3..33399176512a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); /* Catch kmemcheck conditions first of all! */ - if (condition & DR_STEP && kmemcheck_trap(regs)) + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f37930954d15..eed156851f5d 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); + if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) + pr_info( + "Skipped synchronization checks as TSC is reliable.\n"); return; } - pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); - /* * Reset it - in case this is a second bootup: */ @@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu) cpu_relax(); if (nr_warps) { - printk("\n"); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { - printk(" passed.\n"); + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); } /* diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index aeef529917e4..61d805df4c91 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -9,10 +9,25 @@ */ #include <linux/module.h> +#include <linux/rbtree.h> #include <linux/irq.h> #include <asm/apic.h> #include <asm/uv/uv_irq.h> +#include <asm/uv/uv_hub.h> + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; /* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. + */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int restrict) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + +/* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + restrict); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). */ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 583f11d5c480..3c84aa001c11 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -74,7 +74,7 @@ struct uv_rtc_timer_head { */ static struct uv_rtc_timer_head **blade_info __read_mostly; -static int uv_rtc_enable; +static int uv_rtc_evt_enable; /* * Hardware interface routines @@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu) pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UVH_EVENT_OCCURRED0_RTC1_MASK); - val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ @@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires) /* Initialize comparator value */ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); } /* @@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) next_cpu = head->next_cpu; *t = expires; + /* Will this one be next to go off? */ if (next_cpu < 0 || bcpu == next_cpu || expires < head->cpu[next_cpu].expires) { @@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); spin_unlock_irqrestore(&head->lock, flags); - return 1; + return -ETIME; } } @@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) * * Returns 1 if this timer was pending. */ -static int uv_rtc_unset_timer(int cpu) +static int uv_rtc_unset_timer(int cpu, int force) { int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); @@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu) spin_lock_irqsave(&head->lock, flags); - if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; - *t = ULLONG_MAX; - - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } spin_unlock_irqrestore(&head->lock, flags); @@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu); + uv_rtc_unset_timer(ced_cpu, 1); break; } } static void uv_rtc_interrupt(void) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); if (!ced || !ced->event_handler) return; - if (uv_rtc_unset_timer(cpu) != 1) + if (uv_rtc_unset_timer(cpu, 0) != 1) return; ced->event_handler(ced); } -static int __init uv_enable_rtc(char *str) +static int __init uv_enable_evt_rtc(char *str) { - uv_rtc_enable = 1; + uv_rtc_evt_enable = 1; return 1; } -__setup("uvrtc", uv_enable_rtc); +__setup("uvrtcevt", uv_enable_evt_rtc); static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { @@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + if (!is_uv_system()) return -ENODEV; - generic_interrupt_extension = uv_rtc_interrupt; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); + /* If single blade, prefer tsc */ + if (uv_num_possible_blades() == 1) + clocksource_uv.rating = 250; + rc = clocksource_register(&clocksource_uv); - if (rc) { - generic_interrupt_extension = NULL; + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) return rc; - } /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); - if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; - return rc; - } + if (rc) + goto error; + + x86_platform_ipi_callback = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; + x86_platform_ipi_callback = NULL; uv_rtc_deallocate_timers(); + goto error; } + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + return rc; } arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..34a279a7471d 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apic_cpus = apic->apicid_to_cpu_present(m->apicid); + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); /* * Validate version @@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static void __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(void) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq) } static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", + .name = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, .enable = enable_cobalt_irq, @@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq) } static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", + .name = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, .end = end_piix4_master_irq, @@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", + .name = "PIIX4-virtual", .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 611b9e2360d3..74c92bb194df 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) evt->min_delta_ns = clockevent_delta2ns(1, evt); evt->cpumask = cpumask_of(cpu); - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", evt->name, evt->mult, evt->shift); clockevents_register_device(evt); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3c68fe2d46cf..f3f2104408d9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,6 +41,32 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +/* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA + * we retain large page mappings for boundaries spanning kernel text, rodata + * and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, + * as well as retain 2MB large page mappings for kernel text. + */ +#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X64_ALIGN_DEBUG_RODATA_END \ + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + +#else + +#define X64_ALIGN_DEBUG_RODATA_BEGIN +#define X64_ALIGN_DEBUG_RODATA_END + +#endif + PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -90,7 +116,9 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 + X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -107,13 +135,13 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) - CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA CONSTRUCTORS /* rarely changed data like cpu maps */ - READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) + READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) /* End of data section */ _edata = .; @@ -137,12 +165,12 @@ SECTIONS *(.vsyscall_0) } :user - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } @@ -166,7 +194,7 @@ SECTIONS } vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(L1_CACHE_BYTES); .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..9055e5872ff0 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,7 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; @@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = { }; static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + { .procname = "kernel", .mode = 0555, .child = kernel_table2 }, {} }; diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 3909e3ba5ce3..a1029769b6f2 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); +EXPORT_SYMBOL(_copy_from_user); +EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index d11c5ff7c65e..ccd179dec36e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include <asm/e820.h> #include <asm/time.h> #include <asm/irq.h> +#include <asm/pat.h> #include <asm/tsc.h> #include <asm/iommu.h> @@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = { .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, + .is_untracked_pat_range = is_ISA_range, }; |