diff options
Diffstat (limited to 'arch/x86/kernel')
122 files changed, 3325 insertions, 2316 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0503f5bfb18d..4dd5d500eb60 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o @@ -125,6 +126,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +ifdef CONFIG_FRAME_POINTER +obj-y += unwind_frame.o +else +obj-y += unwind_guess.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 3242e591fa82..26b78d86f25a 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9414f84584e4..8a5abaa7d453 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -28,7 +28,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/dmi.h> #include <linux/irq.h> #include <linux/slab.h> @@ -161,28 +161,29 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) /** * acpi_register_lapic - register a local apic and generates a logic cpu number * @id: local apic id to register + * @acpiid: ACPI id to register * @enabled: this cpu is enabled or not * * Returns the logic cpu number which maps to the local apic */ -static int acpi_register_lapic(int id, u8 enabled) +static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { unsigned int ver = 0; + int cpu; if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); return -EINVAL; } - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; - return generic_processor_info(id, ver); + cpu = __generic_processor_info(id, ver, enabled); + if (cpu >= 0) + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; + + return cpu; } static int __init @@ -212,7 +213,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else - acpi_register_lapic(apic_id, enabled); + acpi_register_lapic(apic_id, processor->uid, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif @@ -232,6 +233,10 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) acpi_table_print_madt_entry(header); + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -240,6 +245,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) * when we use CPU hotplug. */ acpi_register_lapic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -258,6 +264,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -274,6 +281,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; + acpi_table_print_madt_entry(header); + acpi_lapic_addr = lapic_addr_ovr->address; return 0; @@ -697,7 +706,7 @@ static void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include <acpi/processor.h> -static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { #ifdef CONFIG_ACPI_NUMA int nid; @@ -708,13 +717,14 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) numa_set_node(cpu, nid); } #endif + return 0; } int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; @@ -990,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value). - */ - - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, - acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); - return count; - } - - register_lapic_address(acpi_lapic_addr); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, acpi_parse_sapic, MAX_LOCAL_APIC); @@ -1023,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void) return ret; } - x2count = madt_proc[0].count; - count = madt_proc[1].count; + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); @@ -1505,7 +1500,7 @@ void __init acpi_boot_table_init(void) * If acpi_disabled, bail out */ if (acpi_disabled) - return; + return; /* * Initialize the ACPI boot-time table parser. diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c new file mode 100644 index 000000000000..6fb478bf82fd --- /dev/null +++ b/arch/x86/kernel/acpi/cppc_msr.c @@ -0,0 +1,58 @@ +/* + * cppc_msr.c: MSR Interface for CPPC + * Copyright (c) 2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 4b28159e0421..af15f4444330 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -5,7 +5,7 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/acpi.h> #include <linux/cpu.h> @@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index adb3eaf8fe2a..48587335ede8 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 8e3842fc8bea..63ff468a7986 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -20,7 +20,6 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> -#include <linux/module.h> #include <linux/topology.h> #include <linux/interrupt.h> #include <linux/bitmap.h> @@ -242,7 +241,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, static dma_addr_t gart_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long bus; phys_addr_t paddr = page_to_phys(page) + offset; @@ -264,7 +263,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page, */ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long iommu_page; int npages; @@ -286,7 +285,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, * Wrapper for pci_unmap_single working with scatterlists. */ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s; int i; @@ -294,7 +293,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, for_each_sg(sg, s, nents, i) { if (!s->dma_length || !s->length) break; - gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0); } } @@ -316,7 +315,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_addr) { if (i > 0) - gart_unmap_sg(dev, sg, i, dir, NULL); + gart_unmap_sg(dev, sg, i, dir, 0); nents = 0; sg[0].dma_length = 0; break; @@ -387,7 +386,7 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, * Merge chunks that have page aligned sizes into a continuous mapping. */ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; @@ -457,7 +456,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); - gart_unmap_sg(dev, sg, out, dir, NULL); + gart_unmap_sg(dev, sg, out, dir, 0); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -477,7 +476,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag, struct dma_attrs *attrs) + gfp_t flag, unsigned long attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -509,9 +508,9 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { - gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index a147e676fc7b..4fdf6230d93c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -9,7 +9,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <asm/amd_nb.h> @@ -71,8 +71,8 @@ int amd_cache_northbridges(void) while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) i++; - if (i == 0) - return 0; + if (!i) + return -ENODEV; nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) @@ -219,24 +219,22 @@ int amd_set_subcaches(int cpu, unsigned long mask) return 0; } -static int amd_cache_gart(void) +static void amd_cache_gart(void) { u16 i; - if (!amd_nb_has_feature(AMD_NB_GART)) - return 0; - - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - amd_northbridges.flags &= ~AMD_NB_GART; - return -ENOMEM; - } + if (!amd_nb_has_feature(AMD_NB_GART)) + return; - for (i = 0; i != amd_nb_num(); i++) - pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, - &flush_words[i]); + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + return; + } - return 0; + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } void amd_flush_garts(void) @@ -278,17 +276,10 @@ EXPORT_SYMBOL_GPL(amd_flush_garts); static __init int init_amd_nbs(void) { - int err = 0; - - err = amd_cache_northbridges(); - - if (err < 0) - pr_notice("Cannot enumerate AMD northbridges\n"); + amd_cache_northbridges(); + amd_cache_gart(); - if (amd_cache_gart() < 0) - pr_notice("Cannot initialize GART flush words, GART support disabled\n"); - - return err; + return 0; } /* This has to go after the PCI subsystem */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index cefacbad1531..456316f6c868 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -215,26 +215,18 @@ void apbt_setup_secondary_clock(void) * cpu timers during the offline process due to the ordering of notification. * the extra interrupt is harmless. */ -static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int apbt_cpu_dead(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %lu offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - break; - default: - pr_debug("APBT notified %lu, no action\n", action); + dw_apb_clockevent_pause(adev->timer); + if (system_state == SYSTEM_RUNNING) { + pr_debug("skipping APBT CPU %u offline\n", cpu); + } else { + pr_debug("APBT clockevent for cpu %u offline\n", cpu); + dw_apb_clockevent_stop(adev->timer); } - return NOTIFY_OK; + return 0; } static __init int apbt_late_init(void) @@ -242,9 +234,8 @@ static __init int apbt_late_init(void) if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL, + apbt_cpu_dead); } fs_initcall(apbt_late_init); #else diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 60078a67d7e3..88c657b057e2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -23,7 +23,7 @@ #include <linux/bootmem.h> #include <linux/ftrace.h> #include <linux/ioport.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/timex.h> @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -92,8 +94,10 @@ static int apic_extnmi = APIC_EXTNMI_BSP; */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 @@ -145,7 +149,7 @@ static int force_enable_local_apic __initdata; */ static int __init parse_lapic(char *arg) { - if (config_enabled(CONFIG_X86_32) && !arg) + if (IS_ENABLED(CONFIG_X86_32) && !arg) force_enable_local_apic = 1; else if (arg && !strncmp(arg, "notscdeadline", 13)) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); @@ -311,7 +315,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -563,13 +567,37 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); } /* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + +/* * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer @@ -1348,7 +1376,6 @@ void setup_local_APIC(void) * Actually disabling the focus CPU check just makes the hang less * frequent as it makes the interrupt distributon model be more * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro */ /* @@ -1597,6 +1624,9 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; + if (skip_ioapic_setup) + return; + ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) return; @@ -1787,8 +1817,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1799,17 +1828,14 @@ void __init register_lapic_address(unsigned long address) if (!x2apic_mode) { set_fixmap_nocache(FIX_APIC_BASE, address); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); + APIC_BASE, address); } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -1998,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +/* + * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated + * contiguously, it equals to current allocated max logical CPU ID plus 1. + * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of + * nr_logical_cpuids is nr_cpu_ids. + * + * NOTE: Reserve 0 for BSP. + */ +static int nr_logical_cpuids = 1; + +/* + * Used to store mapping between logical CPU IDs and APIC IDs. + */ +static int cpuid_to_apicid[] = { + [0 ... NR_CPUS - 1] = -1, +}; + +/* + * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids + * and cpuid_to_apicid[] synchronized. + */ +static int allocate_logical_cpuid(int apicid) +{ + int i; + + /* + * cpuid <-> apicid mapping is persistent, so when a cpu is up, + * check if the kernel has allocated a cpuid for it. + */ + for (i = 0; i < nr_logical_cpuids; i++) { + if (cpuid_to_apicid[i] == apicid) + return i; + } + + /* Allocate a new cpuid. */ + if (nr_logical_cpuids >= nr_cpu_ids) { + WARN_ONCE(1, "Only %d processors supported." + "Processor %d/0x%x and the rest are ignored.\n", + nr_cpu_ids - 1, nr_logical_cpuids, apicid); + return -1; + } + + cpuid_to_apicid[nr_logical_cpuids] = apicid; + return nr_logical_cpuids++; +} + +int __generic_processor_info(int apicid, int version, bool enabled) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, @@ -2045,7 +2117,7 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus - 1; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i almost" + "APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2056,15 +2128,16 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i reached." - " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + if (enabled) { + pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " + "reached. Processor %d/0x%x ignored.\n", + max, thiscpu, apicid); + } disabled_cpus++; return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2074,8 +2147,16 @@ int generic_processor_info(int apicid, int version) * for BSP. */ cpu = 0; - } else - cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* Logical cpuid 0 is reserved for BSP. */ + cpuid_to_apicid[0] = apicid; + } else { + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; + } + } /* * This can happen on physical hotplug. The sanity check at boot time @@ -2085,8 +2166,9 @@ int generic_processor_info(int apicid, int version) if (topology_update_package_map(apicid, cpu) < 0) { int thiscpu = max + disabled_cpus; - pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } @@ -2099,14 +2181,12 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } - physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -2119,11 +2199,23 @@ int generic_processor_info(int apicid, int version) apic->x86_32_early_logical_apicid(cpu); #endif set_cpu_possible(cpu, true); - set_cpu_present(cpu, true); + + if (enabled) { + num_processors++; + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + } else { + disabled_cpus++; + } return cpu; } +int generic_processor_info(int apicid, int version) +{ + return __generic_processor_info(apicid, version, true); +} + int hard_smp_processor_id(void) { return read_apic_id(); @@ -2246,7 +2338,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. */ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 76f89e2b245a..a4d7ff20ed22 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -15,7 +15,7 @@ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/hardirq.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> @@ -25,7 +25,7 @@ static struct apic apic_physflat; static struct apic apic_flat; -struct apic __read_mostly *apic = &apic_flat; +struct apic *apic __ro_after_init = &apic_flat; EXPORT_SYMBOL_GPL(apic); static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector) static unsigned int flat_get_apic_id(unsigned long x) { - unsigned int id; - - id = (((x)>>24) & 0xFFu); - - return id; + return (x >> 24) & 0xFF; } static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xFFu)<<24); - return x; + return (id & 0xFF) << 24; } static unsigned int read_xapic_id(void) { - unsigned int id; - - id = flat_get_apic_id(apic_read(APIC_ID)); - return id; + return flat_get_apic_id(apic_read(APIC_ID)); } static int flat_apic_id_registered(void) @@ -154,7 +144,7 @@ static int flat_probe(void) return 1; } -static struct apic apic_flat = { +static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -181,7 +171,6 @@ static struct apic apic_flat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -249,7 +238,7 @@ static int physflat_probe(void) return 0; } -static struct apic apic_physflat = { +static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, @@ -278,7 +267,6 @@ static struct apic apic_physflat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 13d19ed58514..b109e4389c92 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -11,7 +11,6 @@ #include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> @@ -109,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } -struct apic apic_noop = { +struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, .acpi_madt_oem_check = NULL, @@ -141,7 +140,6 @@ struct apic apic_noop = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index ab5c2c685a3c..e08fe2c8dd8c 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) static unsigned long numachip1_set_apic_id(unsigned int id) { - unsigned long x; - - x = ((id & 0xffU) << 24); - return x; + return (id & 0xff) << 24; } static unsigned int numachip2_get_apic_id(unsigned long x) @@ -269,7 +266,6 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -321,7 +317,6 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cf9bd896c12d..56012010332c 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,7 +142,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -static struct apic apic_bigsmp = { +static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, @@ -171,7 +171,6 @@ static struct apic apic_bigsmp = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643bf4..c73c9fb281e1 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,7 +16,7 @@ #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/nmi.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) } #endif -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { apic->send_IPI_mask(mask, NMI_VECTOR); } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { - nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); + nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_raise_cpu_backtrace); } -static int -arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; return NMI_DONE; } -NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler); +NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler); -static int __init register_trigger_all_cpu_backtrace(void) +static int __init register_nmi_cpu_backtrace_handler(void) { - register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler, 0, "arch_bt"); return 0; } -early_initcall(register_trigger_all_cpu_backtrace); +early_initcall(register_nmi_cpu_backtrace_handler); #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 84e33ff5a6d5..48e6d84f173e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -39,7 +39,7 @@ #include <linux/mc146818rtc.h> #include <linux/compiler.h> #include <linux/acpi.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscore_ops.h> #include <linux/freezer.h> #include <linux/kthread.h> @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -1592,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2422,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; @@ -2567,29 +2568,25 @@ static struct resource * __init ioapic_setup_resources(void) unsigned long n; struct resource *res; char *mem; - int i, num = 0; + int i; - for_each_ioapic(i) - num++; - if (num == 0) + if (nr_ioapics == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= num; + n *= nr_ioapics; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * num; + mem += sizeof(struct resource) * nr_ioapics; - num = 0; for_each_ioapic(i) { - res[num].name = mem; - res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; - num++; - ioapics[i].iomem_res = res; + ioapics[i].iomem_res = &res[i]; } ioapic_resources = res; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 2a0f225afebd..3a205d4a12d0 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -8,7 +8,6 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/cpu.h> -#include <linux/module.h> #include <asm/smp.h> #include <asm/mtrr.h> diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ade25320df96..015bbf30e3e3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } -static struct irq_chip hpet_msi_controller = { +static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f316e34abb42..c48264e202fd 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -8,7 +8,7 @@ */ #include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> @@ -72,7 +72,7 @@ static int probe_default(void) return 1; } -static struct apic apic_default = { +static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, @@ -101,7 +101,6 @@ static struct apic apic_default = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -127,7 +126,7 @@ static struct apic apic_default = { apic_driver(apic_default); -struct apic *apic = &apic_default; +struct apic *apic __ro_after_init = &apic_default; EXPORT_SYMBOL_GPL(apic); static int cmdline_apic __initdata; @@ -153,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1793dba7a741..c303054b90b5 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -11,10 +11,9 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/hardirq.h> #include <linux/dmar.h> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a5e400afc563..5d30c5e42bb1 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data, struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; - if (!config_enabled(CONFIG_SMP)) + if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; if (!cpumask_intersects(dest, cpu_online_mask)) @@ -661,11 +661,28 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqdata = irq_desc_get_irq_data(desc); - struct apic_chip_data *data = apic_chip_data(irqdata); - struct irq_cfg *cfg = data ? &data->cfg : NULL; + struct irq_data *irqdata; + struct apic_chip_data *data; + struct irq_cfg *cfg; unsigned int cpu; + /* + * The function is called for all descriptors regardless of which + * irqdomain they belong to. For example if an IRQ is provided by + * an irq_chip as part of a GPIO driver, the chip data for that + * descriptor is specific to the irq_chip in question. + * + * Check first that the chip_data is what we expect + * (apic_chip_data) before touching it any further. + */ + irqdata = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqdata) + return; + + data = apic_chip_data(irqdata); + cfg = data ? &data->cfg : NULL; + if (!cfg) return; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index aca8b75c1552..200af5ae9662 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -152,68 +152,53 @@ static void init_x2apic_ldr(void) } } - /* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int -update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) +/* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int x2apic_prepare_cpu(unsigned int cpu) { - unsigned int this_cpu = (unsigned long)hcpu; - unsigned int cpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), - GFP_KERNEL)) { - err = -ENOMEM; - } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), - GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - err = -ENOMEM; - } - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); - break; + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); + return -ENOMEM; } - return notifier_from_errno(err); + return 0; } -static struct notifier_block x2apic_cpu_notifier = { - .notifier_call = update_clusterinfo, -}; - -static int x2apic_init_cpu_notifier(void) +static int x2apic_dead_cpu(unsigned int this_cpu) { - int cpu = smp_processor_id(); - - zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); - - BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + int cpu; - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - register_hotcpu_notifier(&x2apic_cpu_notifier); - return 1; + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + return 0; } static int x2apic_cluster_probe(void) { - if (x2apic_mode) - return x2apic_init_cpu_notifier(); - else + int cpu = smp_processor_id(); + int ret; + + if (!x2apic_mode) + return 0; + + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); return 0; + } + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + return 1; } static const struct cpumask *x2apic_cluster_target_cpus(void) @@ -242,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } -static struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, @@ -270,7 +255,6 @@ static struct apic apic_x2apic_cluster = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a1242e2c12e6..ff111f05a314 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -98,7 +98,7 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -static struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, @@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 29003154fafd..aeef53ce93e1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -12,7 +12,7 @@ #include <linux/proc_fs.h> #include <linux/threads.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; @@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; @@ -527,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x) static unsigned long set_apic_id(unsigned int id) { - unsigned long x; - - /* maskout x2apic_extra_bits ? */ - x = id; - return x; + /* CHECKME: Do we need to mask out the xapic extra bits? */ + return id; } static unsigned int uv_read_apic_id(void) @@ -554,7 +557,7 @@ static int uv_probe(void) return apic == &apic_x2apic_uv_x; } -static struct apic __refdata apic_x2apic_uv_x = { +static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, @@ -582,7 +585,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, @@ -919,16 +921,16 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } -static void uv_heartbeat_enable(int cpu) +static int uv_heartbeat_enable(unsigned int cpu) { while (!uv_cpu_scir_info(cpu)->enabled) { struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_timer(timer, uv_heartbeat, cpu); + setup_pinned_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; @@ -936,43 +938,24 @@ static void uv_heartbeat_enable(int cpu) /* also ensure that boot cpu is enabled */ cpu = 0; } + return 0; } #ifdef CONFIG_HOTPLUG_CPU -static void uv_heartbeat_disable(int cpu) +static int uv_heartbeat_disable(unsigned int cpu) { if (uv_cpu_scir_info(cpu)->enabled) { uv_cpu_scir_info(cpu)->enabled = 0; del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); -} - -/* - * cpu hotplug notifier - */ -static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - uv_heartbeat_enable(cpu); - break; - case CPU_DOWN_PREPARE: - uv_heartbeat_disable(cpu); - break; - default: - break; - } - return NOTIFY_OK; + return 0; } static __init void uv_scir_register_cpu_notifier(void) { - hotcpu_notifier(uv_scir_cpu_notify, 0); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", + uv_heartbeat_enable, uv_heartbeat_disable); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -1156,19 +1139,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1287,7 +1269,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1295,20 +1277,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1319,14 +1299,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 674134e9f5e5..c62e015b126c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -29,9 +29,14 @@ void common(void) { BLANK(); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); - OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_CC_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif + + BLANK(); + OFFSET(TASK_TI_flags, task_struct, thread_info.flags); + OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ecdc1d217dc0..880aa093268d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -57,6 +57,11 @@ void foo(void) /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d875f97d4e0b..210927ee2e74 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -56,6 +56,11 @@ int main(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + BLANK(); +#endif + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c343a54bed39..b81fe2d63e15 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -669,19 +669,30 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } +#define MSR_AMD64_DE_CFG 0xC0011029 + +static void init_amd_ln(struct cpuinfo_x86 *c) +{ + /* + * Apply erratum 665 fix unconditionally so machines without a BIOS + * fix work. + */ + msr_set_bit(MSR_AMD64_DE_CFG, 31); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) && !cpu_has(c, X86_FEATURE_TOPOEXT)) { if (msr_set_bit(0xc0011005, 54) > 0) { rdmsrl(0xc0011005, value); if (value & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } } } @@ -726,6 +737,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 6: init_amd_k7(c); break; case 0xf: init_amd_k8(c); break; case 0x10: init_amd_gh(c); break; + case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0fe6953f421c..9bd910a7dd0a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2,7 +2,7 @@ #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/string.h> #include <linux/ctype.h> @@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) - return; - - cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - c->cpu_index = 0; - filter_cpuid_features(c, false); + c->cpu_index = 0; + filter_cpuid_features(c, false); - if (this_cpu->c_bsp_init) - this_cpu->c_bsp_init(c); + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); @@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) debug_idt_table }; +struct desc_ptr idt_descr __ro_after_init = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) idt_table, +}; +const struct desc_ptr debug_idt_descr = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; @@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); @@ -1452,7 +1451,7 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu = stack_smp_processor_id(); + int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 73d391ae452f..35691a6b0d32 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -21,7 +21,8 @@ * */ -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <asm/processor.h> #include <asm/hypervisor.h> @@ -85,3 +86,14 @@ bool __init hypervisor_x2apic_available(void) x86_hyper->x2apic_available && x86_hyper->x2apic_available(); } + +void hypervisor_pin_vcpu(int cpu) +{ + if (!x86_hyper) + return; + + if (x86_hyper->pin_vcpu) + x86_hyper->pin_vcpu(cpu); + else + WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6e2ffbebbcdb..fcd484d2bb03 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -5,7 +5,7 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/thread_info.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/uaccess.h> #include <asm/cpufeature.h> @@ -13,6 +13,7 @@ #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> +#include <asm/intel-family.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -300,15 +301,14 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } /* - * P4 Xeon errata 037 workaround. + * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) - > 0) { + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); - pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); } } @@ -509,6 +509,10 @@ static void init_intel(struct cpuinfo_x86 *c) (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && + ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + set_cpu_bug(c, X86_BUG_MONITOR); + #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index fbb5e90557a5..e42117d5f4d7 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,7 +1,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <linux/cpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> /** diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 34c89a3e8260..83f1a98d37db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) return; mce_setup(&m); - m.bank = 1; + m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37d97bf..a7fdf453d895 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> +#include <linux/jump_label.h> #include <asm/processor.h> #include <asm/traps.h> @@ -292,6 +293,13 @@ static void print_mce(struct mce *m) if (m->misc) pr_cont("MISC %llx ", m->misc); + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + pr_cont("\n"); /* * Note this output is parsed by external tools and old fields @@ -425,7 +433,7 @@ static u64 mce_rdmsrl(u32 msr) } if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); /* * Return zero in case the access faulted. This should * not happen normally but can happen if the CPU does @@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); + if (m->status & MCI_STATUS_ADDRV) { m->addr = mce_rdmsrl(msr_ops.addr(i)); @@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i) m->addr >>= shift; m->addr <<= shift; } + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); } } @@ -1309,7 +1335,7 @@ static void __restart_timer(struct timer_list *t, unsigned long interval) if (timer_pending(t)) { if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + mod_timer(t, when); } else { t->expires = round_jiffies(when); add_timer_on(t, smp_processor_id()); @@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; - /* - * MCG_CAP.MCG_SER_P is necessary but not sufficient to know - * whether this processor will actually generate recoverable - * machine checks. Check to see if this is an E7 model Xeon. - * We can't do a model number check because E5 and E7 use the - * same model number. E5 doesn't support recovery, E7 does. - */ - if (mca_cfg.recovery || (mca_cfg.ser && - !strncmp(c->x86_model_id, - "Intel(R) Xeon(R) CPU E7-", 24))) - set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -1735,7 +1750,7 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + setup_pinned_timer(t, mce_timer_fn, cpu); mce_start_timer(cpu, t); } @@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank) * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() */ static int __init mcheck_enable(char *str) { @@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void) static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + static int __init mcheck_late_init(void) { + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + mcheck_debugfs_init(); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 10b0661651e0..9b5403462936 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/string.h> #include <asm/amd_nb.h> #include <asm/apic.h> @@ -63,37 +64,74 @@ static const char * const th_names[] = { "execution_unit", }; -/* Define HWID to IP type mappings for Scalable MCA */ -struct amd_hwid amd_hwids[] = { - [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, - [SMCA_DF] = { "data_fabric", 0x2E }, - [SMCA_UMC] = { "umc", 0x96 }, - [SMCA_PB] = { "param_block", 0x5 }, - [SMCA_PSP] = { "psp", 0xFF }, - [SMCA_SMU] = { "smu", 0x1 }, +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" }; -EXPORT_SYMBOL_GPL(amd_hwids); - -const char * const amd_core_mcablock_names[] = { - [SMCA_LS] = "load_store", - [SMCA_IF] = "insn_fetch", - [SMCA_L2_CACHE] = "l2_cache", - [SMCA_DE] = "decode_unit", - [RES] = "", - [SMCA_EX] = "execution_unit", - [SMCA_FP] = "floating_point", - [SMCA_L3_CACHE] = "l3_cache", + +struct smca_bank_name smca_bank_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(amd_core_mcablock_names); +EXPORT_SYMBOL_GPL(smca_bank_names); + +static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, -const char * const amd_df_mcablock_names[] = { - [SMCA_CS] = "coherent_slave", - [SMCA_PIE] = "pie", + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + +struct smca_bank_info smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; * CPU Initialization */ +static void get_smca_bank_info(unsigned int bank) +{ + unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + struct smca_hwid_mcatype *type; + u32 high, instanceId; + u16 hwid, mcatype; + + /* Collect bank_info using CPU 0 for now. */ + if (cpu) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid = high & MCI_IPID_HWID; + mcatype = (high & MCI_IPID_MCATYPE) >> 16; + hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + type = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == type->hwid_mcatype) { + smca_banks[bank].type = type; + smca_banks[bank].type_instance = instanceId; + break; + } + } +} + struct thresh_restart { struct threshold_block *b; int reset; @@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 get_block_address(u32 current_addr, u32 low, u32 high, +static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, */ u32 low, high; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) return addr; if (!(low & MCI_CONFIG_MCAX)) return addr; - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } @@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high &= ~BIT(2); + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) + smca_high |= BIT(5); + wrmsr(smca_addr, smca_low, smca_high); } @@ -421,12 +503,15 @@ out: void mce_amd_feature_init(struct cpuinfo_x86 *c) { u32 low = 0, high = 0, address = 0; - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + get_smca_bank_info(bank); + for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (threshold_err) m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) + if (m.status & MCI_STATUS_ADDRV) { rdmsrl(msr_addr, m.addr); + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + mce_log(&m); wrmsrl(msr_status, 0); @@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; - int cpu = smp_processor_id(); - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + unsigned int bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + if (!smca_banks[bank].type) + return NULL; + + bank_type = smca_banks[bank].type->bank_type; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_bank_names[bank_type].name, + smca_banks[bank].type_instance); + return buf_mcatype; +} + static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, unsigned int block, u32 address) { @@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - (bank == 4 ? bank4_names(b) : th_names[bank])); + get_name(bank, b)); if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(cpu, address, low, high, bank, ++block); if (!address) return 0; @@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = th_names[bank]; + const char *name = get_name(bank, NULL); int err = 0; if (is_shared_bank(bank)) { @@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8581963894c7..620ab06bcf45 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,26 +54,27 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; static struct cpio_data ucode_cpio; -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - static struct cpio_data __init find_ucode_in_initrd(void) { - long offset = 0; +#ifdef CONFIG_BLK_DEV_INITRD char *path; void *start; size_t size; + /* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ + static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + #ifdef CONFIG_X86_32 struct boot_params *p; @@ -89,9 +90,12 @@ static struct cpio_data __init find_ucode_in_initrd(void) path = ucode_path; start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); size = boot_params.hdr.ramdisk_size; -#endif +#endif /* !CONFIG_X86_32 */ - return find_cpio_data(path, start, size, &offset); + return find_cpio_data(path, start, size, NULL); +#else + return (struct cpio_data){ NULL, 0, "" }; +#endif } static size_t compute_container_size(u8 *data, u32 total_size) @@ -278,22 +282,26 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - cp = find_ucode_in_initrd(); - if (!cp.data) { - if (!load_builtin_amd_microcode(&cp, family)) - return; - } + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) + cp = find_ucode_in_initrd(); + + if (!(cp.data && cp.size)) + return; *data = cp.data; *size = cp.size; @@ -352,6 +360,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -368,8 +377,12 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -431,6 +444,10 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ac360bfbbdb6..5ce5155f0695 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -60,7 +60,6 @@ static bool dis_ucode_ldr; static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -EXPORT_SYMBOL_GPL(ucode_cpu_info); /* * Operations that are run on a target cpu: @@ -175,24 +174,24 @@ void load_ucode_ap(void) } } -int __init save_microcode_in_initrd(void) +static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - save_microcode_in_initrd_intel(); + return save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(); break; default: break; } - return 0; + return -EINVAL; } void reload_early_microcode(void) @@ -559,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +static int mc_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct device *dev; dev = get_cpu_device(cpu); + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - microcode_update_cpu(cpu); - pr_debug("CPU%d added\n", cpu); - /* - * "break" is missing on purpose here because we want to fall - * through in order to create the sysfs group. - */ - - case CPU_DOWN_FAILED: - if (sysfs_create_group(&dev->kobj, &mc_attr_group)) - pr_err("Failed to create group for CPU%d\n", cpu); - break; + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + return 0; +} - case CPU_DOWN_PREPARE: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&dev->kobj, &mc_attr_group); - pr_debug("CPU%d removed\n", cpu); - break; +static int mc_cpu_down_prep(unsigned int cpu) +{ + struct device *dev; + dev = get_cpu_device(cpu); + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); /* - * case CPU_DEAD: - * * When a CPU goes offline, don't free up or invalidate the copy of * the microcode in kernel memory, so that we can reuse it when the * CPU comes back online without unnecessarily requesting the userspace * for it again. */ - } - - /* The CPU refused to come up during a system resume */ - if (action == CPU_UP_CANCELED_FROZEN) - microcode_fini_cpu(cpu); - - return NOTIFY_OK; + return 0; } -static struct notifier_block mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL @@ -666,7 +646,8 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); - register_hotcpu_notifier(&mc_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", + mc_cpu_online, mc_cpu_down_prep); pr_info("Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); @@ -691,4 +672,5 @@ int __init microcode_init(void) return error; } +fs_initcall(save_microcode_in_initrd); late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 65cbbcd48fe4..cdc0deab00c9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -40,9 +40,13 @@ #include <asm/msr.h> /* - * Temporary microcode blobs pointers storage. We note here the pointers to - * microcode blobs we've got from whatever storage (detached initrd, builtin). - * Later on, we put those into final storage mc_saved_data.mc_saved. + * Temporary microcode blobs pointers storage. We note here during early load + * the pointers to microcode blobs we've got from whatever storage (detached + * initrd, builtin). Later on, we put those into final storage + * mc_saved_data.mc_saved. + * + * Important: those are offsets from the beginning of initrd or absolute + * addresses within the kernel image when built-in. */ static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; @@ -51,8 +55,15 @@ static struct mc_saved_data { struct microcode_intel **mc_saved; } mc_saved_data; +/* Microcode blobs within the initrd. 0 if builtin. */ +static struct ucode_blobs { + unsigned long start; + bool valid; +} blobs; + +/* Go through saved patches and find the one suitable for the current CPU. */ static enum ucode_state -load_microcode_early(struct microcode_intel **saved, +find_microcode_patch(struct microcode_intel **saved, unsigned int num_saved, struct ucode_cpu_info *uci) { struct microcode_intel *ucode_ptr, *new_mc = NULL; @@ -121,13 +132,13 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, if (!mcs->mc_saved) { copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 microcode_phys(mc_saved_tmp, mcs); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); #else - return load_microcode_early(mcs->mc_saved, count, uci); + return find_microcode_patch(mcs->mc_saved, count, uci); #endif } } @@ -450,8 +461,6 @@ static void show_saved_mc(void) #endif } -#ifdef CONFIG_HOTPLUG_CPU -static DEFINE_MUTEX(x86_cpu_microcode_mutex); /* * Save this mc into mc_saved_data. So it will be loaded early when a CPU is * hot added or resumes. @@ -459,19 +468,18 @@ static DEFINE_MUTEX(x86_cpu_microcode_mutex); * Please make sure this mc should be a valid microcode patch before calling * this function. */ -int save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc) { +#ifdef CONFIG_HOTPLUG_CPU + /* Synchronization during CPU hotplug. */ + static DEFINE_MUTEX(x86_cpu_microcode_mutex); + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; unsigned int num_saved; struct microcode_intel **mc_saved; - int ret = 0; - int i; + int ret, i; - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ mutex_lock(&x86_cpu_microcode_mutex); mc_saved_count_init = mc_saved_data.num_saved; @@ -509,11 +517,8 @@ int save_mc_for_early(u8 *mc) out: mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); #endif +} static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { @@ -532,37 +537,6 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) #endif } -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - /* try built-in microcode if no initrd */ - if (!size) { - if (!load_builtin_intel_microcode(&cd)) - return UCODE_ERROR; - } else { - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; - } - - return get_matching_model_microcode(start, cd.data, cd.size, - mcs, mc_ptrs, uci); -} - /* * Print ucode update info. */ @@ -680,38 +654,117 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; + unsigned int count = mc_saved_data.num_saved; + unsigned long offset = 0; + int ret; if (!count) - return ret; + return 0; + + /* + * We have found a valid initrd but it might've been relocated in the + * meantime so get its updated address. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) + offset = initrd_start; - copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); + else + show_saved_mc(); return ret; } +static __init enum ucode_state +__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) +{ +#ifdef CONFIG_BLK_DEV_INITRD + static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; + char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) + : ucode_name; +# ifdef CONFIG_X86_32 + unsigned long start = 0, size; + struct boot_params *params; + + params = (struct boot_params *)__pa_nodebug(&boot_params); + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? params->hdr.ramdisk_image : 0); + +# else /* CONFIG_X86_64 */ + unsigned long start = 0, size; + + size = (u64)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (u64)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + *cd = find_cpio_data(p, (void *)start, size, NULL); + if (cd->data) { + blbp->start = start; + blbp->valid = true; + + return UCODE_OK; + } else +#endif /* CONFIG_BLK_DEV_INITRD */ + return UCODE_ERROR; +} + +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + struct ucode_cpu_info *uci, struct ucode_blobs *blbp) +{ + struct cpio_data cd = { NULL, 0, "" }; + enum ucode_state ret; + + /* try built-in microcode first */ + if (load_builtin_intel_microcode(&cd)) + /* + * Invalidate blobs as we might've gotten an initrd too, + * supplied by the boot loader, by mistake or simply forgotten + * there. That's fine, we ignore it since we've found builtin + * microcode already. + */ + blbp->valid = false; + else { + ret = __scan_microcode_initrd(&cd, blbp); + if (ret != UCODE_OK) + return ret; + } + + return get_matching_model_microcode(blbp->start, cd.data, cd.size, + mcs, mc_ptrs, uci); +} + static void __init _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size) + struct ucode_blobs *blbp) { struct ucode_cpu_info uci; enum ucode_state ret; collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, mc_ptrs, start, &uci); + ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); if (ret != UCODE_OK) return; @@ -720,54 +773,60 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, void __init load_ucode_intel_bsp(void) { - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - size = p->hdr.ramdisk_size; + struct ucode_blobs *blobs_p; + struct mc_saved_data *mcs; + unsigned long *ptrs; - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? p->hdr.ramdisk_image : 0); - - _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), - start, size); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - size = boot_params.hdr.ramdisk_size; - start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - - _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif + + _load_ucode_intel_bsp(mcs, ptrs, blobs_p); } void load_ucode_intel_ap(void) { - unsigned long *mcs_tmp_p; - struct mc_saved_data *mcs_p; + struct ucode_blobs *blobs_p; + unsigned long *ptrs, start = 0; + struct mc_saved_data *mcs; struct ucode_cpu_info uci; enum ucode_state ret; -#ifdef CONFIG_X86_32 - mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - mcs_tmp_p = mc_tmp_ptrs; - mcs_p = &mc_saved_data; + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (!mcs_p->num_saved) + if (!mcs->num_saved) return; + if (blobs_p->valid) { + start = blobs_p->start; + + /* + * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles + * physmem mapping too and there we have the initrd. + */ + start += PAGE_OFFSET - __PAGE_OFFSET_BASE; + } + collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); + ret = load_microcode(mcs, ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -784,7 +843,7 @@ void reload_ucode_intel(void) collect_cpu_info_early(&uci); - ret = load_microcode_early(mc_saved_data.mc_saved, + ret = find_microcode_patch(mc_saved_data.mc_saved, mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -794,6 +853,7 @@ void reload_ucode_intel(void) static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { + static struct cpu_signature prev; struct cpuinfo_x86 *c = &cpu_data(cpu_num); unsigned int val[2]; @@ -808,8 +868,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) } csig->rev = c->microcode; - pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + + /* No extra locking on prev, races are harmless. */ + if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { + pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", + csig->sig, csig->pf, csig->rev); + prev = *csig; + } return 0; } @@ -838,6 +903,7 @@ static int apply_microcode_intel(int cpu) struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; unsigned int val[2]; + static int prev_rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -872,11 +938,14 @@ static int apply_microcode_intel(int cpu) return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, val[1], - mc->hdr.date & 0xffff, - mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); + if (val[1] != prev_rev) { + pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", + val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + prev_rev = val[1]; + } c = &cpu_data(cpu); diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2ce1a7dc45b7..406cb6c0d9dd 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -141,7 +141,6 @@ int microcode_sanity_check(void *mc, int print_err) } return 0; } -EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. @@ -183,4 +182,3 @@ int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) return find_matching_signature(mc, csig, cpf); } -EXPORT_SYMBOL_GPL(has_newer_microcode); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 10c11b4da31d..8f44c5a50ab8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -13,7 +13,8 @@ #include <linux/types.h> #include <linux/time.h> #include <linux/clocksource.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/hardirq.h> #include <linux/efi.h> #include <linux/interrupt.h> diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 31e951ce6dff..3b442b64c72d 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -17,7 +17,6 @@ * License along with this library; if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 16e37a2581ac..fdc55215d44d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -4,7 +4,7 @@ */ #define DEBUG -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index d76f13d6d8d6..6d9b45549109 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -2,7 +2,6 @@ #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> -#include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/slab.h> diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7d393ecdeee6..24e87e74990d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,7 +38,7 @@ #include <linux/stop_machine.h> #include <linux/kvm_para.h> #include <linux/uaccess.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/sort.h> @@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(const struct mtrr_ops *ops) +void __init set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6c7ced07d16d..ad8bd763efa5 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index, bool get_mtrr_state(void); void mtrr_bp_pat_init(void); -extern void set_mtrr_ops(const struct mtrr_ops *ops); +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 2e8caf03f593..181eabecae25 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -12,7 +12,7 @@ */ #include <linux/percpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index f6f50c4ceaec..cfa97ff67bda 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 +#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { -#ifdef CONFIG_ARCH_RANDOM unsigned long tmp; int i; @@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } -#endif } +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 8cac429b6a1d..81160578b91a 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,10 +22,12 @@ */ #include <linux/dmi.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> +#include <asm/apic.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -81,10 +83,17 @@ static void __init vmware_platform_setup(void) VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx != UINT_MAX) + if (ebx != UINT_MAX) { x86_platform.calibrate_tsc = vmware_get_tsc_khz; - else +#ifdef CONFIG_X86_LOCAL_APIC + /* Skip lapic calibration since we know the bus frequency. */ + lapic_timer_frequency = ecx / HZ; + pr_info("Host bus clock speed read from hypervisor : %u Hz\n", + ecx); +#endif + } else { pr_warn("Failed to get TSC freq from the hypervisor\n"); + } } /* diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9ef978d69c22..9616cf76940c 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -20,7 +20,7 @@ #include <linux/delay.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2bb25c3fe2e8..9b7cf5c28f5f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,7 +17,7 @@ #include <linux/sysfs.h> #include <asm/stacktrace.h> - +#include <asm/unwind.h> int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -25,11 +25,29 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, - void *data) + char *log_lvl) { + touch_nmi_watchdog(); printk("%s [<%p>] %s%pB\n", - (char *)data, (void *)address, reliable ? "" : "? ", + log_lvl, (void *)address, reliable ? "" : "? ", (void *)address); } @@ -38,165 +56,120 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) { - struct task_struct *task; - unsigned long ret_addr; - int index; + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; - if (addr != (unsigned long)return_to_handler) - return; - - task = tinfo->task; - index = task->curr_ret_stack; - - if (!task->ret_stack || index < *graph) - return; - - index -= *graph; - ret_addr = task->ret_stack[index].ret; + printk("%sCall Trace:\n", log_lvl); - ops->address(data, ret_addr, 1); + unwind_start(&state, task, regs, stack); - (*graph)++; -} -#else -static inline void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) -{ } -#endif + /* + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * + * x86-32 can have up to three stacks: + * - task stack + * - softirq stack + * - hardirq stack + */ + for (; stack; stack = stack_info.next_sp) { + const char *str_begin, *str_end; -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ + /* + * If we overflowed the task stack into a guard page, jump back + * to the bottom of the usable stack. + */ + if (task_stack_page(task) - (void *)stack < PAGE_SIZE) + stack = task_stack_page(task); -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; -unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, 0); - } - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + stack_type_str(stack_info.type, &str_begin, &str_end); + if (str_begin) + printk("%s <%s> ", log_lvl, str_begin); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = *stack; + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); } - stack++; - } - return bp; -} -EXPORT_SYMBOL_GPL(print_context_stack); - -unsigned long -print_context_stack_bp(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *ret_addr = &frame->return_address; - - while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { - unsigned long addr = *ret_addr; - if (!__kernel_text_address(addr)) - break; - - if (ops->address(data, addr, 1)) - break; - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + if (str_end) + printk("%s <%s> ", log_lvl, str_end); } - - return (unsigned long)frame; -} -EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static int print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_stack_address(addr, reliable, data); - return 0; -} - -static const struct stacktrace_ops print_trace_ops = { - .stack = print_trace_stack, - .address = print_trace_address, - .walk_stack = print_context_stack, -}; - -void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - unsigned long bp = 0; - unsigned long stack; + task = task ? : current; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. */ - if (!sp && (!task || task == current)) { - sp = &stack; - bp = stack_frame(current, NULL); - } + if (!sp && task == current) + sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, bp, ""); + show_stack_log_lvl(task, NULL, sp, ""); +} + +void show_stack_regs(struct pt_regs *regs) +{ + show_stack_log_lvl(current, regs, NULL, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -228,6 +201,8 @@ unsigned long oops_begin(void) EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); +void __noreturn rewind_stack_do_exit(int signr); + void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) @@ -249,7 +224,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + */ + rewind_stack_do_exit(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 464ffd69b92e..06eb322b5f9f 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -7,7 +7,7 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> @@ -16,93 +16,121 @@ #include <asm/stacktrace.h> -static void *is_irq_stack(void *p, void *irq) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - if (p < irq || p >= (irq + THREAD_SIZE)) - return NULL; - return irq + THREAD_SIZE; + switch (type) { + case STACK_TYPE_IRQ: + case STACK_TYPE_SOFTIRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + default: + *begin = NULL; + *end = NULL; + } } - -static void *is_hardirq_stack(unsigned long *stack, int cpu) +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - void *irq = per_cpu(hardirq_stack, cpu); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - return is_irq_stack(stack, irq); -} + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; -static void *is_softirq_stack(unsigned long *stack, int cpu) -{ - void *irq = per_cpu(softirq_stack, cpu); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - return is_irq_stack(stack, irq); + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; } -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - int graph = 0; - u32 *prev_esp; + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (!task) - task = current; + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) + return false; - if (!stack) { - unsigned long dummy; + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; - if (!bp) - bp = stack_frame(task, regs); + return true; +} - for (;;) { - struct thread_info *context; - void *end_stack; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - end_stack = is_hardirq_stack(stack, cpu); - if (!end_stack) - end_stack = is_softirq_stack(stack, cpu); + task = task ? : current; - context = task_thread_info(task); - bp = ops->walk_stack(context, stack, bp, ops, data, - end_stack, &graph); + if (in_task_stack(stack, task, info)) + goto recursion_check; - /* Stop if not on irq stack */ - if (!end_stack) - break; + if (task != current) + goto unknown; - /* The previous esp is saved on the bottom of the stack */ - prev_esp = (u32 *)(end_stack - THREAD_SIZE); - stack = (unsigned long *)*prev_esp; - if (!stack) - break; + if (in_hardirq_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - touch_nmi_watchdog(); + if (in_softirq_stack(stack, info)) + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; } - put_cpu(); + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + if (!try_get_task_stack(task)) + return; + + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { @@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } @@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5f1c6266eb30..36cf1a498227 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -7,7 +7,7 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> @@ -16,264 +16,150 @@ #include <asm/stacktrace.h> +static char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ NMI_STACK-1 ] = "NMI", + [ DEBUG_STACK-1 ] = "#DB", + [ MCE_STACK-1 ] = "#MC", +}; -#define N_EXCEPTION_STACKS_END \ - (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) - -static char x86_stack_ids[][8] = { - [ DEBUG_STACK-1 ] = "#DB", - [ NMI_STACK-1 ] = "NMI", - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ MCE_STACK-1 ] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [ N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS_END ] = "#DB[?]" -#endif +static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = x86_stack_ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - x86_stack_ids[j][4] = '1' + - (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = x86_stack_ids[j]; - return (unsigned long *)end; - } -#endif + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + + switch (type) { + case STACK_TYPE_IRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: + *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; + *end = "EOE"; + break; + default: + *begin = NULL; + *end = NULL; } - return NULL; } -static inline int -in_irq_stack(unsigned long *stack, unsigned long *irq_stack, - unsigned long *irq_stack_end) +static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - return (stack >= irq_stack && stack < irq_stack_end); -} - -static const unsigned long irq_stack_size = - (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); - -enum stack_type { - STACK_IS_UNKNOWN, - STACK_IS_NORMAL, - STACK_IS_EXCEPTION, - STACK_IS_IRQ, -}; - -static enum stack_type -analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, - unsigned long **stack_end, unsigned long *irq_stack, - unsigned *used, char **id) -{ - unsigned long addr; + unsigned long *begin, *end; + struct pt_regs *regs; + unsigned k; - addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); - if ((unsigned long)task_stack_page(task) == addr) - return STACK_IS_NORMAL; + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - *stack_end = in_exception_stack(cpu, (unsigned long)stack, - used, id); - if (*stack_end) - return STACK_IS_EXCEPTION; + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - (exception_stack_sizes[k] / sizeof(long)); + regs = (struct pt_regs *)end - 1; - if (!irq_stack) - return STACK_IS_NORMAL; + if (stack < begin || stack >= end) + continue; - *stack_end = irq_stack; - irq_stack = irq_stack - irq_stack_size; + info->type = STACK_TYPE_EXCEPTION + k; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)regs->sp; - if (in_irq_stack(stack, irq_stack, *stack_end)) - return STACK_IS_IRQ; + return true; + } - return STACK_IS_UNKNOWN; + return false; } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) +static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - const unsigned cpu = get_cpu(); - struct thread_info *tinfo; - unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned long dummy; - unsigned used = 0; - int graph = 0; - int done = 0; - - if (!task) - task = current; - - if (!stack) { - if (regs) - stack = (unsigned long *)regs->sp; - else if (task != current) - stack = (unsigned long *)task->thread.sp; - else - stack = &dummy; - } + unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); - if (!bp) - bp = stack_frame(task, regs); /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. */ - tinfo = task_thread_info(task); - while (!done) { - unsigned long *stack_end; - enum stack_type stype; - char *id; + if (stack < begin || stack > end) + return false; - stype = analyze_stack(cpu, task, stack, &stack_end, - irq_stack, &used, &id); + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; - /* Default finish unless specified to continue */ - done = 1; + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); - switch (stype) { + return true; +} - /* Break out early if we are on the thread stack */ - case STACK_IS_NORMAL: - break; +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!stack) + goto unknown; - case STACK_IS_EXCEPTION: + task = task ? : current; - if (ops->stack(data, id) < 0) - break; + if (in_task_stack(stack, task, info)) + goto recursion_check; - bp = ops->walk_stack(tinfo, stack, bp, ops, - data, stack_end, &graph); - ops->stack(data, "<EOE>"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) stack_end[-2]; - done = 0; - break; + if (task != current) + goto unknown; - case STACK_IS_IRQ: + if (in_exception_stack(stack, info)) + goto recursion_check; - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(tinfo, stack, bp, - ops, data, stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (stack_end[-1]); - irq_stack = NULL; - ops->stack(data, "EOI"); - done = 0; - break; + if (in_irq_stack(stack, info)) + goto recursion_check; - case STACK_IS_UNKNOWN: - ops->stack(data, "UNK"); - break; - } - } + goto unknown; +recursion_check: /* - * This handles the process stack: + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. */ - bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); - put_cpu(); + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } + + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } -EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; - int cpu; int i; - preempt_disable(); - cpu = smp_processor_id(); + if (!try_get_task_stack(task)) + return; - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); + irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - /* - * Debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu: - */ - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { + unsigned long word; + if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); @@ -283,26 +169,31 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; } + + if (probe_kernel_address(stack, word)) + break; + if ((i % STACKSLOTS_PER_LINE) == 0) { if (i != 0) pr_cont("\n"); - printk("%s %016lx", log_lvl, *stack++); + printk("%s %016lx", log_lvl, word); } else - pr_cont(" %016lx", *stack++); + pr_cont(" %016lx", word); + + stack++; touch_nmi_watchdog(); } - preempt_enable(); pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); + + put_task_stack(task); } void show_regs(struct pt_regs *regs) { int i; - unsigned long sp; - sp = regs->sp; show_regs_print_info(KERN_DEFAULT); __show_regs(regs, 1); @@ -317,8 +208,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501f8935..b85fe5f91c3f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. */ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -348,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type || current_type == E820_PRAM) { + if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) while (nr_map) { u64 start = biosmap->addr; u64 size = biosmap->size; - u64 end = start + size; + u64 end = start + size - 1; u32 type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) + if (start > end && likely(size)) return -1; e820_add_region(start, size, type); @@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type); } static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820_saved, start, size, old_type, + return __e820_update_range(e820_saved, start, size, old_type, new_type); } @@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, e820_print_type(old_type); printk(KERN_CONT "\n"); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; u64 final_start, final_end; u64 ei_end; @@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, void __init update_e820(void) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map)) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map)) return; printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) { - sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), - &e820_saved.nr_map); + sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map), + &e820_saved->nr_map); } #define MAX_GAP_END 0x100000000ull /* @@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, unsigned long start_addr, unsigned long long end_addr) { unsigned long long last; - int i = e820.nr_map; + int i = e820->nr_map; int found = 0; last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; + unsigned long long start = e820->map[i].addr; + unsigned long long end = start + e820->map[i].size; if (end < start_addr) continue; @@ -649,6 +651,33 @@ __init void e820_setup_gap(void) gapstart, gapstart + gapsize - 1); } +/* + * Called late during init, in free_initmem(). + * + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them: + * all functions which modify them are __init functions, + * they won't exist after this point. + */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_saved, size); + e820_saved = n; +} + /** * Because of the size limitation of struct boot_params, only first * 128 E820 memory entries are passed to kernel via @@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(struct e820entry); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); @@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) int i; unsigned long pfn = 0; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (pfn < PFN_UP(ei->addr)) register_nosave_region(pfn, PFN_UP(ei->addr)); @@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (ei->type == E820_NVS) acpi_nvs_register(ei->addr, ei->size); @@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; unsigned long start_pfn; unsigned long end_pfn; - /* - * Persistent memory is accounted as ram for purposes of - * establishing max_pfn and mem_map. - */ - if (ei->type != E820_RAM && ei->type != E820_PRAM) + if (ei->type != type) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN); + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32-PAGE_SHIFT)); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p) */ saved_max_pfn = e820_end_of_ram_pfn(); #endif - e820.nr_map = 0; + e820->nr_map = 0; userdef = 1; return 0; } @@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { - if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), - &e820.nr_map) < 0) + if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), + &e820->nr_map) < 0) early_panic("Invalid user supplied memory map"); printk(KERN_INFO "e820: user-defined physical RAM map:\n"); @@ -912,7 +937,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820->nr_map); e820_res = res; - for (i = 0; i < e820.nr_map; i++) { - end = e820.map[i].addr + e820.map[i].size - 1; + for (i = 0; i < e820->nr_map; i++) { + end = e820->map[i].addr + e820->map[i].size - 1; if (end != (resource_size_t)end) { res++; continue; } - res->name = e820_type_to_string(e820.map[i].type); - res->start = e820.map[i].addr; + res->name = e820_type_to_string(e820->map[i].type); + res->start = e820->map[i].addr; res->end = end; - res->flags = e820_type_to_iomem_type(e820.map[i].type); - res->desc = e820_type_to_iores_desc(e820.map[i].type); + res->flags = e820_type_to_iomem_type(e820->map[i].type); + res->desc = e820_type_to_iores_desc(e820->map[i].type); /* * don't register the region that could be conflicted with * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (do_mark_busy(e820.map[i].type, res)) { + if (do_mark_busy(e820->map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - for (i = 0; i < e820_saved.nr_map; i++) { - struct e820entry *entry = &e820_saved.map[i]; + for (i = 0; i < e820_saved->nr_map; i++) { + struct e820entry *entry = &e820_saved->map[i]; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry->type)); @@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? */ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void) struct resource *res; res = e820_res; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; @@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void) * Try to bump up RAM regions to reasonable boundaries to * avoid stolen RAM: */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *entry = &e820->map[i]; u64 start, end; if (entry->type != E820_RAM) @@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void) who = "BIOS-e801"; } - e820.nr_map = 0; + e820->nr_map = 0; e820_add_region(0, LOWMEMSIZE(), E820_RAM); e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } @@ -1124,7 +1149,7 @@ void __init setup_memory_map(void) char *who; who = x86_init.resources.memory_setup(); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } @@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void) */ memblock_allow_resize(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; end = ei->addr + ei->size; if (end != (resource_size_t)end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index bca14c899137..18bb3a639197 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -11,7 +11,11 @@ #include <linux/pci.h> #include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/dmi.h> #include <linux/pci_ids.h> +#include <linux/bcma/bcma.h> +#include <linux/bcma/bcma_regs.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -21,6 +25,9 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/irq_remapping.h> +#include <asm/early_ioremap.h> + +#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg) static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func) #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC /* + * Only applies to Nvidia root ports (bus 0) and not to + * Nvidia graphics cards with PCI ports on secondary buses. + */ + if (num) + return; + + /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. * Unfortunately that's not true on many Asus boards. @@ -223,36 +237,19 @@ static void __init intel_remapping_check(int num, int slot, int func) * despite the efforts of the "RAM buffer" approach, which simply rounds * memory boundaries up to 64M to try to catch space that may decode * as RAM and so is not suitable for MMIO. - * - * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) -{ - u32 base; - - /* - * For the PCI IDs in this quirk, the stolen base is always - * in 0x5c, aka the BDSM register (yes that's really what - * it's called). - */ - base = read_pci_config(num, slot, func, 0x5c); - base &= ~((1<<20) - 1); - - return base; -} #define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) -#define GB(x) (MB (KB (x))) static size_t __init i830_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - if (tmp & I830_TSEG_SIZE_1M) + if (esmramc & I830_TSEG_SIZE_1M) return MB(1); else return KB(512); @@ -260,27 +257,26 @@ static size_t __init i830_tseg_size(void) static size_t __init i845_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - switch (tmp & I845_TSEG_SIZE_MASK) { - case I845_TSEG_SIZE_512K: - return KB(512); - case I845_TSEG_SIZE_1M: - return MB(1); + switch (tseg_size) { + case I845_TSEG_SIZE_512K: return KB(512); + case I845_TSEG_SIZE_1M: return MB(1); default: - WARN_ON(1); - return 0; + WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc); } + return 0; } static size_t __init i85x_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; return MB(1); @@ -300,285 +296,287 @@ static size_t __init i85x_mem_size(void) * On 830/845/85x the stolen memory base isn't available in any * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. */ -static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i830_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i830_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size; } -static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i845_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i845_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size; } -static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i85x_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i85x_mem_size() - i85x_tseg_size() - stolen_size; + return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size; } -static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i865_stolen_base(int num, int slot, int func, + size_t stolen_size) { + u16 toud; + /* * FIXME is the graphics stolen memory region * always at TOUD? Ie. is it always the last * one to be allocated by the BIOS? */ - return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; + toud = read_pci_config_16(0, 0, 0, I865_TOUD); + + return (phys_addr_t)toud << 16; +} + +static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, + size_t stolen_size) +{ + u32 bsm; + + /* Almost universally we can find the Graphics Base of Stolen Memory + * at register BSM (0x5c) in the igfx configuration space. On a few + * (desktop) machines this is also mirrored in the bridge device at + * different locations, or in the MCHBAR. + */ + bsm = read_pci_config(num, slot, func, INTEL_BSM); + + return (phys_addr_t)bsm & INTEL_BSM_MASK; } static size_t __init i830_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I830_GMCH_GMS_MASK) { - case I830_GMCH_GMS_STOLEN_512: - stolen_size = KB(512); - break; - case I830_GMCH_GMS_STOLEN_1024: - stolen_size = MB(1); - break; - case I830_GMCH_GMS_STOLEN_8192: - stolen_size = MB(8); - break; - case I830_GMCH_GMS_LOCAL: - /* local memory isn't part of the normal address space */ - stolen_size = 0; - break; + gms = gmch_ctrl & I830_GMCH_GMS_MASK; + + switch (gms) { + case I830_GMCH_GMS_STOLEN_512: return KB(512); + case I830_GMCH_GMS_STOLEN_1024: return MB(1); + case I830_GMCH_GMS_STOLEN_8192: return MB(8); + /* local memory isn't part of the normal address space */ + case I830_GMCH_GMS_LOCAL: return 0; default: - return 0; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen3_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I855_GMCH_GMS_MASK) { - case I855_GMCH_GMS_STOLEN_1M: - stolen_size = MB(1); - break; - case I855_GMCH_GMS_STOLEN_4M: - stolen_size = MB(4); - break; - case I855_GMCH_GMS_STOLEN_8M: - stolen_size = MB(8); - break; - case I855_GMCH_GMS_STOLEN_16M: - stolen_size = MB(16); - break; - case I855_GMCH_GMS_STOLEN_32M: - stolen_size = MB(32); - break; - case I915_GMCH_GMS_STOLEN_48M: - stolen_size = MB(48); - break; - case I915_GMCH_GMS_STOLEN_64M: - stolen_size = MB(64); - break; - case G33_GMCH_GMS_STOLEN_128M: - stolen_size = MB(128); - break; - case G33_GMCH_GMS_STOLEN_256M: - stolen_size = MB(256); - break; - case INTEL_GMCH_GMS_STOLEN_96M: - stolen_size = MB(96); - break; - case INTEL_GMCH_GMS_STOLEN_160M: - stolen_size = MB(160); - break; - case INTEL_GMCH_GMS_STOLEN_224M: - stolen_size = MB(224); - break; - case INTEL_GMCH_GMS_STOLEN_352M: - stolen_size = MB(352); - break; + gms = gmch_ctrl & I855_GMCH_GMS_MASK; + + switch (gms) { + case I855_GMCH_GMS_STOLEN_1M: return MB(1); + case I855_GMCH_GMS_STOLEN_4M: return MB(4); + case I855_GMCH_GMS_STOLEN_8M: return MB(8); + case I855_GMCH_GMS_STOLEN_16M: return MB(16); + case I855_GMCH_GMS_STOLEN_32M: return MB(32); + case I915_GMCH_GMS_STOLEN_48M: return MB(48); + case I915_GMCH_GMS_STOLEN_64M: return MB(64); + case G33_GMCH_GMS_STOLEN_128M: return MB(128); + case G33_GMCH_GMS_STOLEN_256M: return MB(256); + case INTEL_GMCH_GMS_STOLEN_96M: return MB(96); + case INTEL_GMCH_GMS_STOLEN_160M:return MB(160); + case INTEL_GMCH_GMS_STOLEN_224M:return MB(224); + case INTEL_GMCH_GMS_STOLEN_352M:return MB(352); default: - stolen_size = 0; - break; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen6_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + return (size_t)gms * MB(32); } static size_t __init gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= BDW_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + return (size_t)gms * MB(32); } static size_t __init chv_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; /* * 0x0 to 0x10: 32MB increments starting at 0MB * 0x11 to 0x16: 4MB increments starting at 8MB * 0x17 to 0x1d: 4MB increments start at 36MB */ - if (gmch_ctrl < 0x11) - return gmch_ctrl << 25; - else if (gmch_ctrl < 0x17) - return (gmch_ctrl - 0x11 + 2) << 22; + if (gms < 0x11) + return (size_t)gms * MB(32); + else if (gms < 0x17) + return (size_t)(gms - 0x11 + 2) * MB(4); else - return (gmch_ctrl - 0x17 + 9) << 22; + return (size_t)(gms - 0x17 + 9) * MB(4); } -struct intel_stolen_funcs { - size_t (*size)(int num, int slot, int func); - u32 (*base)(int num, int slot, int func, size_t size); -}; - static size_t __init gen9_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= BDW_GMCH_GMS_MASK; + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; - if (gmch_ctrl < 0xf0) - return gmch_ctrl << 25; /* 32 MB units */ + /* 0x0 to 0xef: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if (gms < 0xf0) + return (size_t)gms * MB(32); else - /* 4MB increments starting at 0xf0 for 4MB */ - return (gmch_ctrl - 0xf0 + 1) << 22; + return (size_t)(gms - 0xf0 + 1) * MB(4); } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); +struct intel_early_ops { + size_t (*stolen_size)(int num, int slot, int func); + phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size); +}; -static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { - .base = i830_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i830_early_ops __initconst = { + .stolen_base = i830_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { - .base = i845_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i845_early_ops __initconst = { + .stolen_base = i845_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { - .base = i85x_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i85x_early_ops __initconst = { + .stolen_base = i85x_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { - .base = i865_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i865_early_ops __initconst = { + .stolen_base = i865_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops gen3_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen6_stolen_size, +static const struct intel_early_ops gen6_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen6_stolen_size, }; -static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen8_stolen_size, +static const struct intel_early_ops gen8_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen8_stolen_size, }; -static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen9_stolen_size, +static const struct intel_early_ops gen9_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen9_stolen_size, }; -static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = chv_stolen_size, +static const struct intel_early_ops chv_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = chv_stolen_size, }; -static const struct pci_device_id intel_stolen_ids[] __initconst = { - INTEL_I830_IDS(&i830_stolen_funcs), - INTEL_I845G_IDS(&i845_stolen_funcs), - INTEL_I85X_IDS(&i85x_stolen_funcs), - INTEL_I865G_IDS(&i865_stolen_funcs), - INTEL_I915G_IDS(&gen3_stolen_funcs), - INTEL_I915GM_IDS(&gen3_stolen_funcs), - INTEL_I945G_IDS(&gen3_stolen_funcs), - INTEL_I945GM_IDS(&gen3_stolen_funcs), - INTEL_VLV_M_IDS(&gen6_stolen_funcs), - INTEL_VLV_D_IDS(&gen6_stolen_funcs), - INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), - INTEL_I965G_IDS(&gen3_stolen_funcs), - INTEL_G33_IDS(&gen3_stolen_funcs), - INTEL_I965GM_IDS(&gen3_stolen_funcs), - INTEL_GM45_IDS(&gen3_stolen_funcs), - INTEL_G45_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), - INTEL_SNB_D_IDS(&gen6_stolen_funcs), - INTEL_SNB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_M_IDS(&gen6_stolen_funcs), - INTEL_BDW_M_IDS(&gen8_stolen_funcs), - INTEL_BDW_D_IDS(&gen8_stolen_funcs), - INTEL_CHV_IDS(&chv_stolen_funcs), - INTEL_SKL_IDS(&gen9_stolen_funcs), - INTEL_BXT_IDS(&gen9_stolen_funcs), - INTEL_KBL_IDS(&gen9_stolen_funcs), +static const struct pci_device_id intel_early_ids[] __initconst = { + INTEL_I830_IDS(&i830_early_ops), + INTEL_I845G_IDS(&i845_early_ops), + INTEL_I85X_IDS(&i85x_early_ops), + INTEL_I865G_IDS(&i865_early_ops), + INTEL_I915G_IDS(&gen3_early_ops), + INTEL_I915GM_IDS(&gen3_early_ops), + INTEL_I945G_IDS(&gen3_early_ops), + INTEL_I945GM_IDS(&gen3_early_ops), + INTEL_VLV_M_IDS(&gen6_early_ops), + INTEL_VLV_D_IDS(&gen6_early_ops), + INTEL_PINEVIEW_IDS(&gen3_early_ops), + INTEL_I965G_IDS(&gen3_early_ops), + INTEL_G33_IDS(&gen3_early_ops), + INTEL_I965GM_IDS(&gen3_early_ops), + INTEL_GM45_IDS(&gen3_early_ops), + INTEL_G45_IDS(&gen3_early_ops), + INTEL_IRONLAKE_D_IDS(&gen3_early_ops), + INTEL_IRONLAKE_M_IDS(&gen3_early_ops), + INTEL_SNB_D_IDS(&gen6_early_ops), + INTEL_SNB_M_IDS(&gen6_early_ops), + INTEL_IVB_M_IDS(&gen6_early_ops), + INTEL_IVB_D_IDS(&gen6_early_ops), + INTEL_HSW_D_IDS(&gen6_early_ops), + INTEL_HSW_M_IDS(&gen6_early_ops), + INTEL_BDW_M_IDS(&gen8_early_ops), + INTEL_BDW_D_IDS(&gen8_early_ops), + INTEL_CHV_IDS(&chv_early_ops), + INTEL_SKL_IDS(&gen9_early_ops), + INTEL_BXT_IDS(&gen9_early_ops), + INTEL_KBL_IDS(&gen9_early_ops), }; -static void __init intel_graphics_stolen(int num, int slot, int func) +static void __init +intel_graphics_stolen(int num, int slot, int func, + const struct intel_early_ops *early_ops) { + phys_addr_t base, end; size_t size; + + size = early_ops->stolen_size(num, slot, func); + base = early_ops->stolen_base(num, slot, func, size); + + if (!size || !base) + return; + + end = base + size - 1; + printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n", + &base, &end); + + /* Mark this space as reserved */ + e820_add_region(base, size, E820_RESERVED); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); +} + +static void __init intel_graphics_quirks(int num, int slot, int func) +{ + const struct intel_early_ops *early_ops; + u16 device; int i; - u32 start; - u16 device, subvendor, subdevice; device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); - subvendor = read_pci_config_16(num, slot, func, - PCI_SUBSYSTEM_VENDOR_ID); - subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); - - for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { - if (intel_stolen_ids[i].device == device) { - const struct intel_stolen_funcs *stolen_funcs = - (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; - size = stolen_funcs->size(num, slot, func); - start = stolen_funcs->base(num, slot, func, size); - if (size && start) { - printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", - start, start + (u32)size - 1); - /* Mark this space as reserved */ - e820_add_region(start, size, E820_RESERVED); - sanitize_e820_map(e820.map, - ARRAY_SIZE(e820.map), - &e820.nr_map); - } - return; - } + + for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { + kernel_ulong_t driver_data = intel_early_ids[i].driver_data; + + if (intel_early_ids[i].device != device) + continue; + + early_ops = (typeof(early_ops))driver_data; + + intel_graphics_stolen(num, slot, func, early_ops); + + return; } } @@ -590,6 +588,61 @@ static void __init force_disable_hpet(int num, int slot, int func) #endif } +#define BCM4331_MMIO_SIZE 16384 +#define BCM4331_PM_CAP 0x40 +#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) +#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) + +static void __init apple_airport_reset(int bus, int slot, int func) +{ + void __iomem *mmio; + u16 pmcsr; + u64 addr; + int i; + + if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + return; + + /* Card may have been put into PCI_D3hot by grub quirk */ + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr); + mdelay(10); + + pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL); + if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) { + dev_err("Cannot power up Apple AirPort card\n"); + return; + } + } + + addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32; + addr &= PCI_BASE_ADDRESS_MEM_MASK; + + mmio = early_ioremap(addr, BCM4331_MMIO_SIZE); + if (!mmio) { + dev_err("Cannot iomap Apple AirPort card\n"); + return; + } + + pr_info("Resetting Apple AirPort card (left enabled by EFI)\n"); + + for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++) + udelay(10); + + bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET); + bcma_aread32(BCMA_RESET_CTL); + udelay(1); + + bcma_awrite32(BCMA_RESET_CTL, 0); + bcma_aread32(BCMA_RESET_CTL); + udelay(10); + + early_iounmap(mmio, BCM4331_MMIO_SIZE); +} #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 @@ -603,12 +656,6 @@ struct chipset { void (*f)(int num, int slot, int func); }; -/* - * Only works for devices on the root bus. If you add any devices - * not on bus 0 readd another loop level in early_quirks(). But - * be careful because at least the Nvidia quirk here relies on - * only matching on bus 0. - */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -627,7 +674,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_stolen }, + QFLAG_APPLY_ONCE, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. @@ -638,9 +685,13 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_BROADCOM, 0x4331, + PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} }; +static void __init early_pci_scan_bus(int bus); + /** * check_dev_quirk - apply early quirks to a given PCI device * @num: bus number @@ -649,7 +700,7 @@ static struct chipset early_qrk[] __initdata = { * * Check the vendor & device ID against the early quirks table. * - * If the device is single function, let early_quirks() know so we don't + * If the device is single function, let early_pci_scan_bus() know so we don't * poke at this device again. */ static int __init check_dev_quirk(int num, int slot, int func) @@ -658,6 +709,7 @@ static int __init check_dev_quirk(int num, int slot, int func) u16 vendor; u16 device; u8 type; + u8 sec; int i; class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); @@ -685,25 +737,36 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); + + if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); + if (sec > num) + early_pci_scan_bus(sec); + } + if (!(type & 0x80)) return -1; return 0; } -void __init early_quirks(void) +static void __init early_pci_scan_bus(int bus) { int slot, func; - if (!early_pci_allowed()) - return; - /* Poor man's PCI discovery */ - /* Only scan the root bus */ for (slot = 0; slot < 32; slot++) for (func = 0; func < 8; func++) { /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(0, slot, func)) + if (check_dev_quirk(bus, slot, func)) break; } } + +void __init early_quirks(void) +{ + if (!early_pci_allowed()) + return; + + early_pci_scan_bus(0); +} diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index afe65dffee80..4312f8ae71b7 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -6,66 +6,92 @@ #include <asm/bios_ebda.h> /* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). (Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. */ -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ +#define BIOS_RAM_SIZE_KB_PTR 0x413 -void __init reserve_ebda_region(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) { - unsigned int lowmem, ebda_addr; + unsigned int bios_start, ebda_start; /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. */ - if (!x86_platform.legacy.ebda_search) + if (!x86_platform.legacy.reserve_bios_regions) return; - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it. + */ + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + /* + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. + */ + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4d38416e2a7f..04f89caef9c4 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more than one PGD for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97027545a72d..47004010ad5d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,9 +8,14 @@ #include <asm/fpu/internal.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> +#include <asm/fpu/types.h> #include <asm/traps.h> #include <linux/hardirq.h> +#include <linux/pkeys.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/fpu.h> /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, @@ -192,6 +197,7 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { if (use_eager_fpu()) @@ -200,6 +206,7 @@ void fpu__save(struct fpu *fpu) fpregs_deactivate(fpu); } } + trace_x86_fpu_after_save(fpu); preempt_enable(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -222,7 +229,14 @@ void fpstate_init(union fpregs_state *state) return; } - memset(state, 0, xstate_size); + memset(state, 0, fpu_kernel_xstate_size); + + /* + * XRSTORS requires that this bit is set in xcomp_bv, or + * it will #GP. Make sure it is replaced after the memset(). + */ + if (static_cpu_has(X86_FEATURE_XSAVES)) + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); @@ -247,7 +261,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * leak into the child task: */ if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -266,7 +280,8 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) */ preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + memcpy(&src_fpu->state, &dst_fpu->state, + fpu_kernel_xstate_size); if (use_eager_fpu()) copy_kernel_to_fpregs(&src_fpu->state); @@ -275,6 +290,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } preempt_enable(); + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + return 0; } @@ -288,7 +306,9 @@ void fpu__activate_curr(struct fpu *fpu) if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; } @@ -314,7 +334,9 @@ void fpu__activate_fpstate_read(struct fpu *fpu) } else { if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -347,7 +369,9 @@ void fpu__activate_fpstate_write(struct fpu *fpu) fpu->last_cpu = -1; } else { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -432,9 +456,11 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); + trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); fpu->counter++; + trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -463,6 +489,8 @@ void fpu__drop(struct fpu *fpu) fpu->fpstate_active = 0; + trace_x86_fpu_dropped(fpu); + preempt_enable(); } @@ -478,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void) copy_kernel_to_fxregs(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); + + if (boot_cpu_has(X86_FEATURE_OSPKE)) + copy_init_pkru_to_fpregs(); } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index aacfd7a82cec..2f2b8c7ccb85 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -145,8 +145,8 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); +unsigned int fpu_kernel_xstate_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -178,7 +178,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += xstate_size; + task_size += fpu_kernel_xstate_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void) } /* - * Set up the xstate_size based on the legacy FPU context size. + * Set up the user and kernel xstate sizes based on the legacy FPU context size. * * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. @@ -208,7 +208,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) on_boot_cpu = 0; /* - * Note that xstate_size might be overwriten later during + * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). */ @@ -219,27 +219,17 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct swregs_state); + fpu_kernel_xstate_size = sizeof(struct swregs_state); } else { if (boot_cpu_has(X86_FEATURE_FXSR)) - xstate_size = sizeof(struct fxregs_state); + fpu_kernel_xstate_size = + sizeof(struct fxregs_state); else - xstate_size = sizeof(struct fregs_state); + fpu_kernel_xstate_size = + sizeof(struct fregs_state); } - /* - * Quirk: we don't yet handle the XSAVES* instructions - * correctly, as we don't correctly convert between - * standard and compacted format when interfacing - * with user-space - so disable it for now. - * - * The difference is small: with recent CPUs the - * compacted format is only marginally smaller than - * the standard FPU state format. - * - * ( This is easy to backport while we are fixing - * XSAVES* support. ) - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + fpu_user_xstate_size = fpu_kernel_xstate_size; } /* @@ -327,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 81422dfb152b..c114b132d121 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -4,6 +4,7 @@ #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -85,21 +86,26 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_read(fpu); - xsave = &fpu->state.xsave; - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_read(fpu); + + if (using_compacted_format()) { + ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + } else { + fpstate_sanitize_xstate(fpu); + /* + * Copy the 48 bytes defined by the software into the xsave + * area in the thread struct, so that we can copy the whole + * area to user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } return ret; } @@ -114,11 +120,27 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + /* + * A whole standard-format XSAVE buffer is needed: + */ + if ((pos != 0) || (count < fpu_user_xstate_size)) + return -EFAULT; xsave = &fpu->state.xsave; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_write(fpu); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + ret = copyin_to_xsaves(kbuf, ubuf, xsave); + else + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + /* * mxcsr reserved bits must be masked to zero for security reasons. */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31c6a60505e6..a184c210efba 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -8,8 +8,10 @@ #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trace/fpu.h> static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; @@ -31,7 +33,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) return -1; @@ -88,7 +90,8 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) if (!use_xsave()) return err; - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -125,7 +128,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else err = copy_fregs_to_user((struct fregs_state __user *) buf); - if (unlikely(err) && __clear_user(buf, xstate_size)) + if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; return err; } @@ -156,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!access_ok(VERIFY_WRITE, buf, size)) return -EACCES; @@ -167,7 +170,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active()) { + if (fpregs_active() || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -175,8 +178,19 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) copy_fxregs_to_kernel(&tsk->thread.fpu); } else { + /* + * It is a *bug* if kernel uses compacted-format for xsave + * area and we copy it out directly to a signal frame. It + * should have been handled above by saving the registers + * directly. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); + return -1; + } + fpstate_sanitize_xstate(&tsk->thread.fpu); - if (__copy_to_user(buf_fx, xsave, xstate_size)) + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -250,12 +264,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; + int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { fpu__clear(fpu); @@ -282,6 +296,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ state_size = sizeof(struct fxregs_state); fx_only = 1; + trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; xfeatures = fx_sw_user.xfeatures; @@ -308,9 +323,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { + if (using_compacted_format()) { + err = copyin_to_xsaves(NULL, buf_fx, + &fpu->state.xsave); + } else { + err = __copy_from_user(&fpu->state.xsave, + buf_fx, state_size); + } + + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); @@ -341,7 +364,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) static inline int xstate_sigframe_size(void) { - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; + return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : + fpu_user_xstate_size; } /* @@ -385,15 +409,15 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; + fx_sw_reserved.xstate_size = fpu_user_xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION) || - config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) { int fsave_header_size = sizeof(struct fregs_state); fx_sw_reserved_ia32 = fx_sw_reserved; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ea2a59483c7..124aa5c593f8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -5,12 +5,14 @@ */ #include <linux/compat.h> #include <linux/cpu.h> +#include <linux/mman.h> #include <linux/pkeys.h> #include <asm/fpu/api.h> #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/tlbflush.h> @@ -44,6 +46,13 @@ static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; /* + * The XSAVE area of kernel can be in standard or compacted format; + * it is always in standard format for user mode. This is the user + * mode standard format size used for signal and ptrace frames. + */ +unsigned int fpu_user_xstate_size; + +/* * Clear all of the X86_FEATURE_* bits that are unavailable * when the CPU has no XSAVE support. */ @@ -105,6 +114,27 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component 'i' is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + */ + u32 eax, ebx, ecx, edx; + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return !!(ecx & 1); +} + +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} + /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -171,7 +201,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ while (xfeatures) { if (xfeatures & 0x1) { - int offset = xstate_offsets[feature_bit]; + int offset = xstate_comp_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, @@ -192,6 +222,15 @@ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; + /* + * Make it clear that XSAVES supervisor states are not yet + * implemented should anyone expect it to work by changing + * bits in XFEATURE_MASK_* macros and XCR0. + */ + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), + "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -217,13 +256,29 @@ static void __init setup_xstate_features(void) /* start at the beginnning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[0] = 0; + xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); + xstate_offsets[1] = xstate_sizes[0]; + xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) continue; cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_offsets[i] = ebx; + + /* + * If an xfeature is supervisor state, the offset + * in EBX is invalid. We leave it to -1. + */ + if (xfeature_is_user(i)) + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; /* * In our xstate size checks, we assume that the @@ -233,8 +288,6 @@ static void __init setup_xstate_features(void) WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; - - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -263,6 +316,33 @@ static void __init print_xstate_features(void) } /* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + return !!(ecx & 2); +} + +/* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format * of the xsave aread. @@ -299,10 +379,29 @@ static void __init setup_xstate_comp(void) else xstate_comp_sizes[i] = 0; - if (i > FIRST_EXTENDED_XFEATURE) + if (i > FIRST_EXTENDED_XFEATURE) { xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + xstate_comp_offsets[i] = + ALIGN(xstate_comp_offsets[i], 64); + } + } +} + +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } @@ -322,13 +421,11 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_fpstate.xsave.header.xfeatures = xfeatures_mask; - } /* - * Init all the features state with header_bv being 0x0 + * Init all the features state with header.xfeatures being 0x0 */ copy_kernel_to_xregs_booting(&init_fpstate.xsave); @@ -339,58 +436,19 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } -static int xfeature_is_supervisor(int xfeature_nr) -{ - /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component i is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. - u32 eax, ebx, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; - return !!(ecx & 1); - */ - return 0; -} -/* -static int xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} -*/ - -/* - * This check is important because it is easy to get XSTATE_* - * confused with XSTATE_BIT_*. - */ -#define CHECK_XFEATURE(nr) do { \ - WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ - WARN_ON(nr >= XFEATURE_MAX); \ -} while (0) - -/* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) +static int xfeature_uncompacted_offset(int xfeature_nr) { u32 eax, ebx, ecx, edx; - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* - * The value returned by ECX[1] indicates the alignment - * of state component i when the compacted format - * of the extended region of an XSAVE area is used + * Only XSAVES supports supervisor states and it uses compacted + * format. Checking a supervisor state's uncompacted offset is + * an error. */ - return !!(ecx & 2); -} - -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; + if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) { + WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); + return -1; + } CHECK_XFEATURE(xfeature_nr); cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); @@ -415,7 +473,7 @@ static int xfeature_size(int xfeature_nr) * that it is obvious which aspect of 'XSAVES' is being handled * by the calling code. */ -static int using_compacted_format(void) +int using_compacted_format(void) { return boot_cpu_has(X86_FEATURE_XSAVES); } @@ -530,11 +588,12 @@ static void do_extra_xstate_size_checks(void) */ paranoid_xstate_size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); + XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } + /* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0/xfeatures_mask. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -544,34 +603,33 @@ static void do_extra_xstate_size_checks(void) * Note that we do not currently set any bits on IA32_XSS so * 'XCR0 | IA32_XSS == XCR0' for now. */ -static unsigned int __init calculate_xstate_size(void) +static unsigned int __init get_xsaves_size(void) { unsigned int eax, ebx, ecx, edx; - unsigned int calculated_xstate_size; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - /* - * - CPUID function 0DH, sub-function 0: - * EBX enumerates the size (in bytes) required by - * the XSAVE instruction for an XSAVE area - * containing all the *user* state components - * corresponding to bits currently set in XCR0. - */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } else { - /* - * - CPUID function 0DH, sub-function 1: - * EBX enumerates the size (in bytes) required by - * the XSAVES instruction for an XSAVE area - * containing all the state components - * corresponding to bits currently set in - * XCR0 | IA32_XSS. - */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } - return calculated_xstate_size; +static unsigned int __init get_xsave_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; } /* @@ -591,7 +649,15 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size) static int init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size = calculate_xstate_size(); + unsigned int possible_xstate_size; + unsigned int xsave_size; + + xsave_size = get_xsave_size(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + possible_xstate_size = get_xsaves_size(); + else + possible_xstate_size = xsave_size; /* Ensure we have the space to store all enabled: */ if (!is_supported_xstate_size(possible_xstate_size)) @@ -601,8 +667,13 @@ static int init_xstate_size(void) * The size is OK, we are definitely going to use xsave, * make it known to the world that we need more space. */ - xstate_size = possible_xstate_size; + fpu_kernel_xstate_size = possible_xstate_size; do_extra_xstate_size_checks(); + + /* + * User space is always in standard format. + */ + fpu_user_xstate_size = xsave_size; return 0; } @@ -644,8 +715,13 @@ void __init fpu__init_system_xstate(void) xfeatures_mask = eax + ((u64)edx << 32); if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. Disable XSAVE and try to continue + * booting without it. This is too early to BUG(). + */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); - BUG(); + goto out_disable; } xfeatures_mask &= fpu__get_supported_xfeatures_mask(); @@ -653,21 +729,29 @@ void __init fpu__init_system_xstate(void) /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); err = init_xstate_size(); - if (err) { - /* something went wrong, boot without any XSAVE support */ - fpu__init_disable_system_xstate(); - return; - } + if (err) + goto out_disable; + + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); - update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, - xstate_size, + fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + return; + +out_disable: + /* something went wrong, try to boot without any XSAVE support */ + fpu__init_disable_system_xstate(); } /* @@ -693,6 +777,11 @@ void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) { int feature_nr = fls64(xstate_feature_mask) - 1; + if (!xfeature_enabled(feature_nr)) { + WARN_ON_FPU(1); + return NULL; + } + return (void *)xsave + xstate_comp_offsets[feature_nr]; } /* @@ -778,155 +867,214 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } +#ifdef CONFIG_ARCH_HAS_PKEYS +#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) +#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. */ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) { - xsave->header.xfeatures |= xstate_feature_mask; + u32 old_pkru; + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); + u32 new_pkru_bits = 0; + + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); + + /* Set the bits we need in PKRU: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey: */ + new_pkru_bits <<= pkey_shift; + + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); + + return 0; } +#endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. + * This is similar to user_regset_copyout(), but will not add offset to + * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) +static inline int xstate_copyout(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; + if ((count == 0) || (pos < start_pos)) + return 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a ptrace buffer. It supports partial copy but pos always starts from + * zero. This is called from xstateregs_get() and there we check the CPU + * has XSAVES. + */ +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. + * Currently copy_regset_to_user() starts from pos 0: */ - fpu__current_fpstate_write_begin(); + if (unlikely(pos != 0)) + return -EFAULT; /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. + * The destination is a ptrace buffer; we put in only user xstates: */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. + * Copy xregs_state->header: */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); + offset = offsetof(struct xregs_state, header); + size = sizeof(header); - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); + ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. + * Fill xsave->i387.sw_reserved value for ptrace frame: */ - fpu__current_fpstate_write_end(); -} + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) + ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * Convert from a ptrace standard-format buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. */ -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } /* - * This check implies XSAVE support. OSPKE only gets - * set if we enable XSAVE and we enable PKU in XCR0. + * Reject if the user sets any disabled or supervisor features: */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) return -EINVAL; - /* Set the bits we need in PKRU */ - if (init_val & PKEY_DISABLE_ACCESS) - new_pkru_bits |= PKRU_AD_BIT; - if (init_val & PKEY_DISABLE_WRITE) - new_pkru_bits |= PKRU_WD_BIT; + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); - /* Shift the bits in to the correct place in PKRU for pkey. */ - new_pkru_bits <<= pkey_shift; + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); - /* Locate old copy of the state in the xsave buffer */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* mask off any old bits in place */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - /* Set the newly-requested bits */ - new_pkru_state.pkru |= new_pkru_bits; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. + * Add back in the features that came in from userspace: */ - new_pkru_state.pad = 0; - - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, - sizeof(new_pkru_state)); + xsave->header.xfeatures |= xfeatures; return 0; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb4495d..8639bb2ae058 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { + frame_pointer, parent) == -EBUSY) { *parent = old; return; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d784bb547a9d..f16c55bfc090 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_ebda_region(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b72fb0b71dd1..54a2372f5dbb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6f8902b0d151..5f401262f12d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * start_secondary(). */ ENTRY(start_cpu0) - movl stack_start, %ecx + movl initial_stack, %ecx movl %ecx, %esp jmp *(initial_code) ENDPROC(start_cpu0) @@ -307,7 +307,7 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp @@ -703,7 +703,7 @@ ENTRY(initial_page_table) .data .balign 4 -ENTRY(stack_start) +ENTRY(initial_stack) .long init_thread_union+THREAD_SIZE __INITRODATA diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5df831ef1442..c98a559c346e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) @@ -66,7 +66,7 @@ startup_64: */ /* - * Setup stack for verify_cpu(). "-8" because stack_start is defined + * Setup stack for verify_cpu(). "-8" because initial_stack is defined * this way, see below. Our best guess is a NULL ptr for stack * termination heuristics and we don't want to break anything which * might depend on it (kgdb, ...). @@ -226,7 +226,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip), %rsp + movq initial_stack(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -299,6 +299,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +ENDPROC(secondary_startup_64) #include "verify_cpu.S" @@ -309,7 +310,7 @@ ENTRY(secondary_startup_64) * start_secondary(). */ ENTRY(start_cpu0) - movq stack_start(%rip),%rsp + movq initial_stack(%rip),%rsp movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder pushq $__KERNEL_CS # set correct cs @@ -318,17 +319,15 @@ ENTRY(start_cpu0) ENDPROC(start_cpu0) #endif - /* SMP bootup changes these two */ + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - - GLOBAL(stack_start) + GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 - .word 0 __FINITDATA bad_address: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f112af7aa62e..274fab99169d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -710,31 +710,29 @@ static void hpet_work(struct work_struct *w) complete(&hpet_work->complete); } -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int hpet_cpuhp_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct hpet_work_struct work; + + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + destroy_delayed_work_on_stack(&work.work); + return 0; +} + +static int hpet_cpuhp_dead(unsigned int cpu) +{ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); - init_completion(&work.complete); - /* FIXME: add schedule_work_on() */ - schedule_delayed_work_on(cpu, &work.work, 0); - wait_for_completion(&work.complete); - destroy_delayed_work_on_stack(&work.work); - break; - case CPU_DEAD: - if (hdev) { - free_irq(hdev->irq, hdev); - hdev->flags &= ~HPET_DEV_USED; - per_cpu(cpu_hpet_dev, cpu) = NULL; - } - break; - } - return NOTIFY_OK; + if (!hdev) + return 0; + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + return 0; } #else @@ -750,21 +748,112 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) } #endif -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - return NOTIFY_OK; -} +#define hpet_cpuhp_online NULL +#define hpet_cpuhp_dead NULL #endif /* * Clock source related code */ +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +/* + * Reading the HPET counter is a very slow operation. If a large number of + * CPUs are trying to access the HPET counter simultaneously, it can cause + * massive delay and slow down system performance dramatically. This may + * happen when HPET is the default clock source instead of TSC. For a + * really large system with hundreds of CPUs, the slowdown may be so + * severe that it may actually crash the system because of a NMI watchdog + * soft lockup, for example. + * + * If multiple CPUs are trying to access the HPET counter at the same time, + * we don't actually need to read the counter multiple times. Instead, the + * other CPUs can use the counter value read by the first CPU in the group. + * + * This special feature is only enabled on x86-64 systems. It is unlikely + * that 32-bit x86 systems will have enough CPUs to require this feature + * with its associated locking overhead. And we also need 64-bit atomic + * read. + * + * The lock and the hpet value are stored together and can be read in a + * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t + * is 32 bits in size. + */ +union hpet_lock { + struct { + arch_spinlock_t lock; + u32 value; + }; + u64 lockval; +}; + +static union hpet_lock hpet __cacheline_aligned = { + { .lock = __ARCH_SPIN_LOCK_UNLOCKED, }, +}; + +static cycle_t read_hpet(struct clocksource *cs) +{ + unsigned long flags; + union hpet_lock old, new; + + BUILD_BUG_ON(sizeof(union hpet_lock) != 8); + + /* + * Read HPET directly if in NMI. + */ + if (in_nmi()) + return (cycle_t)hpet_readl(HPET_COUNTER); + + /* + * Read the current state of the lock and HPET value atomically. + */ + old.lockval = READ_ONCE(hpet.lockval); + + if (arch_spin_is_locked(&old.lock)) + goto contended; + + local_irq_save(flags); + if (arch_spin_trylock(&hpet.lock)) { + new.value = hpet_readl(HPET_COUNTER); + /* + * Use WRITE_ONCE() to prevent store tearing. + */ + WRITE_ONCE(hpet.value, new.value); + arch_spin_unlock(&hpet.lock); + local_irq_restore(flags); + return (cycle_t)new.value; + } + local_irq_restore(flags); + +contended: + /* + * Contended case + * -------------- + * Wait until the HPET value change or the lock is free to indicate + * its value is up-to-date. + * + * It is possible that old.value has already contained the latest + * HPET value while the lock holder was in the process of releasing + * the lock. Checking for lock state change will enable us to return + * the value immediately instead of waiting for the next HPET reader + * to come along. + */ + do { + cpu_relax(); + new.lockval = READ_ONCE(hpet.lockval); + } while ((new.value == old.value) && arch_spin_is_locked(&new.lock)); + + return (cycle_t)new.value; +} +#else +/* + * For UP or 32-bit. + */ static cycle_t read_hpet(struct clocksource *cs) { return (cycle_t)hpet_readl(HPET_COUNTER); } +#endif static struct clocksource clocksource_hpet = { .name = "hpet", @@ -931,7 +1020,7 @@ out_nohpet: */ static __init int hpet_late_init(void) { - int cpu; + int ret; if (boot_hpet_disable) return -ENODEV; @@ -961,16 +1050,20 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) { - hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); - } - /* This notifier should be called after workqueue is ready */ - __hotcpu_notifier(hpet_cpuhp_notify, -20); - cpu_notifier_register_done(); - + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE", + hpet_cpuhp_online, NULL); + if (ret) + return ret; + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL, + hpet_cpuhp_dead); + if (ret) + goto err_cpuhp; return 0; + +err_cpuhp: + cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); + return ret; } fs_initcall(hpet_late_init); @@ -1020,7 +1113,6 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> -#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -1244,7 +1336,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - get_rtc_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 2bcfb5f2bc44..8771766d46b6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -36,13 +36,14 @@ #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> #include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/user.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa485ae..1f9b878ef5ef 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,4 +1,5 @@ -#include <linux/module.h> +#include <linux/export.h> +#include <linux/spinlock_types.h> #include <asm/checksum.h> #include <asm/pgtable.h> @@ -42,3 +43,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index efb82f07b29c..6ebe00cb4a3b 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -3,7 +3,7 @@ * */ #include <linux/clockchips.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/timex.h> #include <linux/i8253.h> diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index a979b5bd2fc0..50c89e8a95f2 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -6,7 +6,7 @@ * outb_p/inb_p API uses. */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/dmi.h> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..1f38d9a4d9de 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -8,7 +8,6 @@ * io_apic.c.) */ -#include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> @@ -130,11 +129,9 @@ void irq_ctx_init(int cpu) void do_softirq_own_stack(void) { - struct thread_info *curstk; struct irq_stack *irqstk; u32 *isp, *prev_esp; - curstk = current_stack(); irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 206d0b90a3ab..9ebd0b0e73d9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,7 +11,6 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/seq_file.h> -#include <linux/module.h> #include <linux/delay.h> #include <linux/ftrace.h> #include <linux/uaccess.h> @@ -41,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index dc1404bf8e4b..bdb83e431d89 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -8,7 +8,7 @@ */ #include <linux/debugfs.h> #include <linux/uaccess.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/stat.h> diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f2356bda2b05..3407b148c240 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_saved.nr_map; + nr_e820_entries = e820_saved->nr_map; /* TODO: Pass entries more than E820MAX in bootparams setup data */ if (nr_e820_entries > E820MAX) nr_e820_entries = E820MAX; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_map, &e820_saved.map, + memcpy(¶ms->e820_map, &e820_saved->map, nr_e820_entries * sizeof(struct e820entry)); return 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 04cde527d728..8e36f249646e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -50,6 +50,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/switch_to.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_DX] = 0; gdb_regs[GDB_SI] = 0; gdb_regs[GDB_DI] = 0; - gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; #ifdef CONFIG_X86_32 gdb_regs[GDB_DS] = __KERNEL_DS; gdb_regs[GDB_ES] = __KERNEL_DS; gdb_regs[GDB_PS] = 0; gdb_regs[GDB_CS] = __KERNEL_CS; - gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_PS] = 0; gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_R14] = 0; gdb_regs[GDB_R15] = 0; #endif + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_SP] = p->thread.sp; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 38cf7a741250..28cee019209c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,7 +45,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> @@ -961,7 +961,19 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * normal page fault. */ regs->ip = (unsigned long)cur->addr; + /* + * Trap flag (TF) has been set here because this fault + * happened where the single stepping will be done. + * So clear it by resetting the current kprobe: + */ + regs->flags &= ~X86_EFLAGS_TF; + + /* + * If the TF flag was set before the kprobe hit, + * don't touch it: + */ regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4425f593f0ec..3bb4c5f021f6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/preempt.h> -#include <linux/module.h> +#include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index c2bedaea11f7..4afc67f5facc 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -184,7 +184,7 @@ out: static struct kobj_attribute type_attr = __ATTR_RO(type); -static struct bin_attribute data_attr = { +static struct bin_attribute data_attr __ro_after_init = { .attr = { .name = "data", .mode = S_IRUGO, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72b31..edbbfc854e39 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -21,7 +21,7 @@ */ #include <linux/context_tracking.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); @@ -425,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_online(void *dummy) -{ - kvm_guest_cpu_init(); -} - -static void kvm_guest_cpu_offline(void *dummy) +static void kvm_guest_cpu_offline(void) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -439,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy) apf_task_wake_all(); } -static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int kvm_cpu_online(unsigned int cpu) { - int cpu = (unsigned long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); - break; - default: - break; - } - return NOTIFY_OK; + local_irq_disable(); + kvm_guest_cpu_init(); + local_irq_enable(); + return 0; } -static struct notifier_block kvm_cpu_notifier = { - .notifier_call = kvm_cpu_notify, -}; +static int kvm_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + kvm_guest_cpu_offline(); + local_irq_enable(); + return 0; +} #endif static void __init kvm_apf_trap_init(void) @@ -496,7 +481,9 @@ void __init kvm_guest_init(void) #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; - register_cpu_notifier(&kvm_cpu_notifier); + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", + kvm_cpu_online, kvm_cpu_down_prepare) < 0) + pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else kvm_guest_cpu_init(); #endif @@ -577,9 +564,6 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } - -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void kvm_wait(u8 *ptr, u8 val) @@ -608,243 +592,6 @@ out: local_irq_restore(flags); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -enum kvm_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - -#ifdef CONFIG_KVM_DEBUG_FS -#define HISTO_BUCKETS 30 - -static struct kvm_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old; - - old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ - check_zero(); - spinlock_stats.contention_stats[var] += val; -} - - -static inline u64 spin_time_start(void) -{ - return sched_clock(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index; - - index = ilog2(delta); - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta; - - delta = sched_clock() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} - -static struct dentry *d_spin_debug; -static struct dentry *d_kvm_debug; - -static struct dentry *kvm_init_debugfs(void) -{ - d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); - if (!d_kvm_debug) - printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); - - return d_kvm_debug; -} - -static int __init kvm_spinlock_debugfs(void) -{ - struct dentry *d_kvm; - - d_kvm = kvm_init_debugfs(); - if (d_kvm == NULL) - return -ENOMEM; - - d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(kvm_spinlock_debugfs); -#else /* !CONFIG_KVM_DEBUG_FS */ -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_KVM_DEBUG_FS */ - -struct kvm_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -/* cpus 'waiting' on a spinlock to become available */ -static cpumask_t waiting_cpus; - -/* Track spinlock on which a cpu is waiting */ -static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); - -__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - struct kvm_lock_waiting *w; - int cpu; - u64 start; - unsigned long flags; - __ticket_t head; - - if (in_nmi()) - return; - - w = this_cpu_ptr(&klock_waiting); - cpu = smp_processor_id(); - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - - /* - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". - */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - add_stats(TAKEN_SLOW, 1); - - /* - * This uses set_bit, which is atomic but we should not rely on its - * reordering gurantees. So barrier is needed after this call. - */ - cpumask_set_cpu(cpu, &waiting_cpus); - - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. - */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking. - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* - * halt until it's our turn and kicked. Note that we do safe halt - * for irq enabled case to avoid hang when lock info is overwritten - * in irq spinlock slowpath and no spurious interrupt occur to save us. - */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - local_irq_restore(flags); - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); - -/* Kick vcpu waiting on @lock->head to reach value @ticket */ -static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - for_each_cpu(cpu, &waiting_cpus) { - const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == ticket) { - add_stats(RELEASED_SLOW_KICKED, 1); - kvm_kick_cpu(cpu); - break; - } - } -} - -#endif /* !CONFIG_QUEUED_SPINLOCKS */ - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -856,16 +603,11 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; -#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -#else /* !CONFIG_QUEUED_SPINLOCKS */ - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); - pv_lock_ops.unlock_kick = kvm_unlock_kick; -#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..60b9949f1e65 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,7 +29,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -static int kvmclock = 1; +static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static cycle_t kvm_sched_clock_offset; @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..e9d252d873aa --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,65 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/livepatch.h> +#include <asm/text-patching.h> + +/* Apply per-object alternatives. Based on x86 module_finalize() */ +void arch_klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + int cnt; + struct klp_modinfo *info; + Elf_Shdr *s, *alt = NULL, *para = NULL; + void *aseg, *pseg; + const char *objname; + char sec_objname[MODULE_NAME_LEN]; + char secname[KSYM_NAME_LEN]; + + info = patch->mod->klp_info; + objname = obj->name ? obj->name : "vmlinux"; + + /* See livepatch core code for BUILD_BUG_ON() explanation */ + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + + for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { + /* Apply per-object .klp.arch sections */ + cnt = sscanf(info->secstrings + s->sh_name, + ".klp.arch.%55[^.].%127s", + sec_objname, secname); + if (cnt != 2) + continue; + if (strcmp(sec_objname, objname)) + continue; + if (!strcmp(".altinstructions", secname)) + alt = s; + if (!strcmp(".parainstructions", secname)) + para = s; + } + + if (alt) { + aseg = (void *) alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + + if (para) { + pseg = (void *) para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } +} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 97340f2c437c..0f8d20497383 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,7 +16,6 @@ #include <linux/mc146818rtc.h> #include <linux/bitops.h> #include <linux/acpi.h> -#include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> @@ -500,6 +499,9 @@ void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; + if (!smp_found_config) + return; + if (!mpf) return; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 04b132a767f1..bfe4d6c96fbd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/hardirq.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 33ee3e0efd65..2c55a003b793 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -3,12 +3,11 @@ * compiled in a FTRACE-compatible way. */ #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/jump_label.h> #include <asm/paravirt.h> -#ifdef CONFIG_QUEUED_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void) return pv_lock_ops.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } -#endif struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP -#ifdef CONFIG_QUEUED_SPINLOCKS .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, -#else /* !CONFIG_QUEUED_SPINLOCKS */ - .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), - .unlock_kick = paravirt_nop, -#endif /* !CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7b3b3f24c3ea..bbf3d5933eaa 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -19,7 +19,8 @@ */ #include <linux/errno.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> @@ -55,12 +56,12 @@ asm (".pushsection .entry.text, \"ax\"\n" ".popsection"); /* identity function, which can be inlined */ -u32 _paravirt_ident_32(u32 x) +u32 notrace _paravirt_ident_32(u32 x) { return x; } -u64 _paravirt_ident_64(u64 x) +u64 notrace _paravirt_ident_64(u64 x) { return x; } @@ -331,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, @@ -388,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops = { +struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 158dc0650d5d..920c6ae08592 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); #endif @@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index e70087a04cc8..bb3840cedb4f 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); #endif @@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 833b1d329c47..5d400ba1349d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -340,7 +340,7 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -364,7 +364,7 @@ static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -396,7 +396,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, return nelems; error: - calgary_unmap_sg(dev, sg, nelems, dir, NULL); + calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; @@ -407,7 +407,7 @@ error: static dma_addr_t calgary_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr = page_address(page) + offset; unsigned long uaddr; @@ -422,7 +422,7 @@ static dma_addr_t calgary_map_page(struct device *dev, struct page *page, static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; @@ -432,7 +432,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) + dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { void *ret = NULL; dma_addr_t mapping; @@ -466,7 +466,7 @@ error: static void calgary_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 6ba014c61d62..d30c37750765 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -77,7 +77,7 @@ void __init pci_iommu_alloc(void) } void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long dma_mask; struct page *page; @@ -120,7 +120,7 @@ again: } void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page *page = virt_to_page(vaddr); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index da15918d1c81..00e71ce396a8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -28,7 +28,7 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) static dma_addr_t nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); @@ -55,7 +55,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, */ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct scatterlist *s; int i; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7c577a178859..b47edb8f5256 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <linux/cache.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/swiotlb.h> #include <linux/bootmem.h> #include <linux/dma-mapping.h> @@ -16,7 +16,7 @@ int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr; @@ -37,7 +37,7 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) swiotlb_free_coherent(dev, size, vaddr, dma_addr); diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b2f8a33b36ff..24a50301f150 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,12 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; - x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: - x86_platform.legacy.ebda_search = 1; + x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 92f70147a9a6..0c5315d322c8 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,7 +3,7 @@ * Copyright (c) 2015, Intel Corporation. */ #include <linux/platform_device.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ioport.h> static int found(u64 start, u64 end, void *data) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 96becbbb52e0..28cea7802ecb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,7 +7,8 @@ #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/pm.h> #include <linux/tick.h> #include <linux/random.h> @@ -31,6 +32,7 @@ #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -300,7 +302,7 @@ void arch_cpu_idle(void) /* * We use this if we don't have any better idle routine.. */ -void default_idle(void) +void __cpuidle default_idle(void) { trace_cpu_idle_rcuidle(1, smp_processor_id()); safe_halt(); @@ -404,7 +406,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; @@ -415,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ -static void mwait_idle(void) +static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); @@ -512,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } /* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + struct inactive_task_frame *frame = + (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); + return READ_ONCE_NOCHECK(frame->ret_addr); +} + +/* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack @@ -519,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip; + unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + if (!try_get_task_stack(p)) + return 0; + start = (unsigned long)task_stack_page(p); if (!start) - return 0; + goto out; /* * Layout of the stack page: @@ -536,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -549,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) - return 0; + goto out; - fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) - return 0; + goto out; ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; + +out: + put_task_stack(p); + return ret; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9f950917528b..bd7be8efdc4c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -25,7 +25,7 @@ #include <linux/delay.h> #include <linux/reboot.h> #include <linux/mc146818rtc.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/personality.h> @@ -55,17 +55,6 @@ #include <asm/switch_to.h> #include <asm/vm86.h> -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.sp)[3]; -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = __read_cr4_safe(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); @@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); + struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); + struct inactive_task_frame *frame = &fork_frame->frame; struct task_struct *tsk; int err; - p->thread.sp = (unsigned long) childregs; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.ip = (unsigned long) ret_from_kernel_thread; - task_user_gs(p) = __KERNEL_STACK_CANARY; - childregs->ds = __USER_DS; - childregs->es = __USER_DS; - childregs->fs = __KERNEL_PERCPU; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->di = arg; p->thread.io_bitmap_ptr = NULL; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; - p->thread.ip = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6e789ca1f841..b3760b3c1ca0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,7 @@ #include <linux/user.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -49,8 +49,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> - -asmlinkage extern void ret_from_fork(void); +#include <asm/vdso.h> __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -110,12 +109,13 @@ void __show_regs(struct pt_regs *regs, int all) get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ - if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) - return; - - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400))) { + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", + d0, d1, d2); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", + d3, d6, d7); + } if (boot_cpu_has(X86_FEATURE_OSPKE)) printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); @@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, { int err; struct pt_regs *childregs; + struct fork_frame *fork_frame; + struct inactive_task_frame *frame; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); - p->thread.sp = (unsigned long) childregs; - set_tsk_thread_flag(p, TIF_FORK); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); @@ -160,15 +165,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->sp = (unsigned long)childregs; - childregs->ss = __KERNEL_DS; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->r12 = arg; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; @@ -511,7 +512,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current_thread_info()->status &= ~TS_COMPAT; + current->thread.status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -519,11 +520,24 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); +#ifdef CONFIG_CHECKPOINT_RESTORE +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) +{ + int ret; + + ret = map_vdso_once(image, addr); + if (ret) + return ret; + + return (long)image->size; +} +#endif + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; @@ -577,6 +591,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) break; } +#ifdef CONFIG_CHECKPOINT_RESTORE +# ifdef CONFIG_X86_X32_ABI + case ARCH_MAP_VDSO_X32: + return prctl_map_vdso(&vdso_image_x32, addr); +# endif +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case ARCH_MAP_VDSO_32: + return prctl_map_vdso(&vdso_image_32, addr); +# endif + case ARCH_MAP_VDSO_64: + return prctl_map_vdso(&vdso_image_64, addr); +#endif + default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 600edd225e81..0e63c0267f99 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } @@ -923,15 +923,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * A 32-bit debugger setting orig_eax means to restore - * the state of the task restarting a 32-bit syscall. - * Make sure we interpret the -ERESTART* codes correctly - * in case the task is not actually still sitting at the - * exit from a 32-bit syscall with TS_COMPAT still set. + * Warning: bizarre corner case fixup here. A 32-bit + * debugger setting orig_eax to -1 wants to disable + * syscall restart. Make sure that the syscall + * restart code sign-extends orig_ax. Also make sure + * we interpret the -ERESTART* codes correctly if + * loaded into regs->ax in case the task is not + * actually still sitting at the exit from a 32-bit + * syscall with TS_COMPAT still set. */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_COMPAT; + child->thread.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): @@ -1247,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static struct user_regset x86_64_regsets[] __read_mostly = { +static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1288,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static struct user_regset x86_32_regsets[] __read_mostly = { +static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1341,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = { */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); @@ -1355,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask) const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 99bfc025111d..5b2cc889ce34 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -61,12 +61,12 @@ void pvclock_resume(void) u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) { unsigned version; - cycle_t ret; u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); - } while ((src->version & 1) || version != src->version); + version = pvclock_read_begin(src); + flags = src->flags; + } while (pvclock_read_retry(src, version)); return flags & valid_flags; } @@ -79,8 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); - } while ((src->version & 1) || version != src->version); + version = pvclock_read_begin(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); + flags = src->flags; + } while (pvclock_read_retry(src, version)); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { src->flags &= ~PVCLOCK_GUEST_STOPPED; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index cc457ff818ad..51402a7e4ca6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); #endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#include <linux/jump_label.h> +#include <asm/string_64.h> + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + static_branch_inc(&mcsafe_key); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if ((capid0 & 0xc0) == 0xc0) + static_branch_inc(&mcsafe_key); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a9b31eb815f2..e244c19a2451 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,6 +1,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> +#include <linux/export.h> #include <linux/reboot.h> #include <linux/init.h> #include <linux/pm.h> @@ -55,6 +55,19 @@ bool port_cf9_safe = false; */ /* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + +/* * Some machines require the "reboot=b" or "reboot=k" commandline options, * this quirk makes that automatic. */ @@ -395,6 +408,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, /* Hewlett-Packard */ { /* Handle problems with rebooting on HP laptops */ @@ -684,7 +705,7 @@ static void native_machine_power_off(void) tboot_shutdown(TB_SHUTDOWN_HALT); } -struct machine_ops machine_ops = { +struct machine_ops machine_ops __ro_after_init = { .power_off = native_machine_power_off, .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 80eab01c1a68..2408c1603438 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail) int i; struct e820entry *entry; - for (i = 0; i < e820.nr_map; i++) { - entry = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + entry = &e820->map[i]; resource_clip(avail, entry->addr, entry->addr + entry->size - 1); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index eceaa082ec3f..79c6311cd912 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -13,7 +13,6 @@ #include <asm/x86_init.h> #include <asm/time.h> #include <asm/intel-mid.h> -#include <asm/rtc.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 @@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now) rtc_time_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { - retval = set_rtc_time(&tm); + retval = mc146818_set_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", __func__, retval); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4e7b3991b60..bbfbca5fea0c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -36,7 +36,7 @@ #include <linux/console.h> #include <linux/root_dev.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/efi.h> #include <linux/init.h> #include <linux/edd.h> @@ -113,6 +113,7 @@ #include <asm/prom.h> #include <asm/microcode.h> #include <asm/mmu_context.h> +#include <asm/kaslr.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -209,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -__visible unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ @@ -399,10 +400,6 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } -static void __init early_initrd_acpi_init(void) -{ - early_acpi_table_init((void *)initrd_start, initrd_end - initrd_start); -} #else static void __init early_reserve_initrd(void) { @@ -410,9 +407,6 @@ static void __init early_reserve_initrd(void) static void __init reserve_initrd(void) { } -static void __init early_initrd_acpi_init(void) -{ -} #endif /* CONFIG_BLK_DEV_INITRD */ static void __init parse_setup_data(void) @@ -464,8 +458,8 @@ static void __init e820_reserve_setup_data(void) early_memunmap(data, sizeof(*data)); } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + memcpy(e820_saved, e820, sizeof(struct e820map)); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("reserve setup_data"); } @@ -769,7 +763,7 @@ static void __init trim_bios_range(void) */ e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); } /* called before trim_bios_range() to spare extra sanitize */ @@ -1038,7 +1032,7 @@ void __init setup_arch(char **cmdline_p) if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); printk(KERN_INFO "fixed physical RAM map:\n"); e820_print_map("bad_ppro"); } @@ -1059,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); @@ -1096,17 +1096,19 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) { + reserve_bios_regions(); + + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); - } + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. - */ - if (efi_enabled(EFI_MEMMAP)) + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ efi_reserve_boot_services(); + } /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); @@ -1129,7 +1131,13 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); - setup_real_mode(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + */ + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); @@ -1146,7 +1154,7 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); - early_initrd_acpi_init(); + acpi_table_upgrade(); vsmp_init(); @@ -1178,13 +1186,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, @@ -1218,8 +1219,7 @@ void __init setup_arch(char **cmdline_p) /* * get boot-time SMP configuration: */ - if (smp_found_config) - get_smp_config(); + get_smp_config(); prefill_possible_map(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e4fcb87ba7a6..2bbd27f89802 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); @@ -236,6 +236,8 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); + per_cpu(x86_cpu_to_acpiid, cpu) = + early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif #ifdef CONFIG_X86_32 per_cpu(x86_cpu_to_logical_apicid, cpu) = @@ -244,7 +246,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE - 64; + IRQ_STACK_SIZE; #endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = @@ -271,6 +273,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; + early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif #ifdef CONFIG_X86_32 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8aec..763af1d0de64 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -42,6 +42,7 @@ #include <asm/syscalls.h> #include <asm/sigframe.h> +#include <asm/signal.h> #define COPY(x) do { \ get_user_ex(regs->x, &sc->x); \ @@ -146,7 +147,7 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); - err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -245,14 +246,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, struct fpu *fpu = ¤t->thread.fpu; /* redzone */ - if (config_enabled(CONFIG_X86_64)) + if (IS_ENABLED(CONFIG_X86_64)) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && + } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && (regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -262,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (fpu->fpstate_active) { - sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) return -EFAULT; } @@ -660,20 +661,21 @@ badframe: return 0; } -static inline int is_ia32_compat_frame(void) +static inline int is_ia32_compat_frame(struct ksignal *ksig) { - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); + return IS_ENABLED(CONFIG_IA32_EMULATION) && + ksig->ka.sa.sa_flags & SA_IA32_ABI; } -static inline int is_ia32_frame(void) +static inline int is_ia32_frame(struct ksignal *ksig) { - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig); } -static inline int is_x32_frame(void) +static inline int is_x32_frame(struct ksignal *ksig) { - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && + ksig->ka.sa.sa_flags & SA_X32_ABI; } static int @@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) compat_sigset_t *cset = (compat_sigset_t *) set; /* Set up the stack frame */ - if (is_ia32_frame()) { + if (is_ia32_frame(ksig)) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) return ia32_setup_rt_frame(usig, ksig, cset, regs); else return ia32_setup_frame(usig, ksig, cset, regs); - } else if (is_x32_frame()) { + } else if (is_x32_frame(ksig)) { return x32_setup_rt_frame(ksig, cset, regs); } else { return __setup_rt_frame(ksig->sig, ksig, set, regs); @@ -760,8 +762,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#ifdef CONFIG_X86_64 - if (in_ia32_syscall()) + /* + * This function is fundamentally broken as currently + * implemented. + * + * The idea is that we want to trigger a call to the + * restart_block() syscall and that we want in_ia32_syscall(), + * in_x32_syscall(), etc. to match whatever they were in the + * syscall being restarted. We assume that the syscall + * instruction at (regs->ip - 2) matches whatever syscall + * instruction we used to enter in the first place. + * + * The problem is that we can get here when ptrace pokes + * syscall-like values into regs even if we're not in a syscall + * at all. + * + * For now, we maintain historical behavior and guess based on + * stored state. We could do better by saving the actual + * syscall arch in restart_block or (with caveats on x32) by + * checking if regs->ip points to 'int $0x80'. The current + * behavior is incorrect if a tracer has a different bitness + * than the tracee. + */ +#ifdef CONFIG_IA32_EMULATION + if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index dc3c0b1c816f..40df33753bae 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,10 +1,125 @@ #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/ptrace.h> -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +/* + * The compat_siginfo_t structure and handing code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. Make sure any + * new fields are handled in copy_siginfo_to_user32()! + */ + BUILD_BUG_ON(NSIGILL != 8); + BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). + */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + /* any new si_fields should be added here */ +} + +void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) +{ + /* Don't leak in-kernel non-uapi flags to user-space */ + if (oact) + oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (!act) + return; + + /* Don't let flags to be set from userspace */ + act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); + + if (user_64bit_mode(current_pt_regs())) + return; + + if (in_ia32_syscall()) + act->sa.sa_flags |= SA_IA32_ABI; + if (in_x32_syscall()) + act->sa.sa_flags |= SA_X32_ABI; +} + +int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, + bool x32_ABI) { int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); + + signal_compat_build_tests(); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; @@ -32,13 +147,28 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || + from->si_code == BUS_MCEERR_AO)) + put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); + + if (from->si_signo == SIGSEGV) { + if (from->si_code == SEGV_BNDERR) { + compat_uptr_t lower = (unsigned long)&to->si_lower; + compat_uptr_t upper = (unsigned long)&to->si_upper; + put_user_ex(lower, &to->si_lower); + put_user_ex(upper, &to->si_upper); + } + if (from->si_code == SEGV_PKUERR) + put_user_ex(from->si_pkey, &to->si_pkey); + } break; case __SI_SYS >> 16: put_user_ex(from->si_syscall, &to->si_syscall); put_user_ex(from->si_arch, &to->si_arch); break; case __SI_CHLD >> 16: - if (ia32) { + if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); } else { @@ -72,6 +202,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err; } +/* from syscall's path, where we know the ABI */ +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { int err = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923cac..951f093a96fe 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -43,7 +43,7 @@ #include <linux/init.h> #include <linux/smp.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/percpu.h> #include <linux/bootmem.h> @@ -100,10 +100,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; + +/* Maximum number of SMT threads on any online core */ +int __max_smt_threads __read_mostly; static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { @@ -274,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -338,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -349,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -366,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) @@ -461,31 +471,32 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static struct sched_domain_topology_level numa_inside_package_topology[] = { +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; + +static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; + /* - * set_sched_topology() sets the topology internal to a CPU. The - * NUMA topologies are layered on top of it to build the full - * system topology. - * - * If NUMA nodes are observed to occur within a CPU package, this - * function should be called. It forces the sched domain code to - * only use the SMT level for the CPU portion of the topology. - * This essentially falls back to relying on NUMA information - * from the SRAT table to describe the entire system topology - * (except for hyperthreads). + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours and Intel Cluster-on-Die have this. */ -static void primarily_use_numa_for_topology(void) -{ - set_sched_topology(numa_inside_package_topology); -} +static bool x86_has_numa_in_package; void set_cpu_sibling_map(int cpu) { @@ -493,7 +504,7 @@ void set_cpu_sibling_map(int cpu) bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; - int i; + int i, threads; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); @@ -548,8 +559,12 @@ void set_cpu_sibling_map(int cpu) c->booted_cores = cpu_data(i).booted_cores; } if (match_die(c, o) && !topology_same_node(c, o)) - primarily_use_numa_for_topology(); + x86_has_numa_in_package = true; } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -676,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -703,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -742,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -928,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) per_cpu(cpu_current_top_of_stack, cpu) = (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else - clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif } @@ -955,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = idle->thread.sp; + initial_stack = idle->thread.sp; /* * Enable the espfix hack for this CPU @@ -980,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1101,17 +1115,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - /* - * We have to walk the irq descriptors to setup the vector - * space for the cpu which comes online. Prevent irq - * alloc/free across the bringup. - */ - irq_lock_sparse(); - err = do_boot_cpu(apicid, cpu, tidle); - if (err) { - irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -1129,8 +1134,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - irq_unlock_sparse(); - return 0; } @@ -1235,7 +1238,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", @@ -1285,12 +1288,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); - current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpus_sibling_map() is ran. + */ + set_sched_topology(x86_topology); + set_cpu_sibling_map(0); switch (smp_sanity_check(max_cpus)) { @@ -1310,14 +1322,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) break; } - default_setup_apic_routing(); - if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } + default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); pr_info("CPU%d: ", 0); @@ -1357,6 +1368,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + nmi_selftest(); impress_friends(); setup_ioapic_dest(); @@ -1393,9 +1407,21 @@ __init void prefill_possible_map(void) { int i, possible; - /* no processor from mptable or madt */ - if (!num_processors) - num_processors = 1; + /* No boot processor was found in mptable or ACPI MADT */ + if (!num_processors) { + int apicid = boot_cpu_physical_apicid; + int cpu = hard_smp_processor_id(); + + pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); + + /* Make sure boot cpu is enumerated */ + if (apic->cpu_present_to_apicid(0) == BAD_APICID && + apic->apic_id_valid(apicid)) + generic_processor_info(apicid, boot_cpu_apic_version); + + if (!num_processors) + num_processors = 1; + } i = setup_max_cpus ?: 1; if (setup_possible_cpus == -1) { @@ -1441,6 +1467,21 @@ __init void prefill_possible_map(void) #ifdef CONFIG_HOTPLUG_CPU +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + static void remove_siblinginfo(int cpu) { int sibling; @@ -1465,6 +1506,7 @@ static void remove_siblinginfo(int cpu) c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); } static void remove_cpu_from_maps(int cpu) @@ -1622,7 +1664,7 @@ static inline void mwait_play_dead(void) } } -static inline void hlt_play_dead(void) +void hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 9ee98eefc44d..0653788026e2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -5,83 +5,72 @@ */ #include <linux/sched.h> #include <linux/stacktrace.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> +#include <asm/unwind.h> -static int save_stack_stack(void *data, char *name) +static int save_stack_address(struct stack_trace *trace, unsigned long addr, + bool nosched) { - return 0; -} - -static int -__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) -{ - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return 0; -#endif if (nosched && in_sched_functions(addr)) return 0; + if (trace->skip > 0) { trace->skip--; return 0; } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = addr; - return 0; - } else { - return -1; /* no more room, stop walking the stack */ - } -} -static int save_stack_address(void *data, unsigned long addr, int reliable) -{ - return __save_stack_address(data, addr, reliable, false); + if (trace->nr_entries >= trace->max_entries) + return -1; + + trace->entries[trace->nr_entries++] = addr; + return 0; } -static int -save_stack_address_nosched(void *data, unsigned long addr, int reliable) +static void __save_stack_trace(struct stack_trace *trace, + struct task_struct *task, struct pt_regs *regs, + bool nosched) { - return __save_stack_address(data, addr, reliable, true); -} + struct unwind_state state; + unsigned long addr; -static const struct stacktrace_ops save_stack_ops = { - .stack = save_stack_stack, - .address = save_stack_address, - .walk_stack = print_context_stack, -}; + if (regs) + save_stack_address(trace, regs->ip, nosched); -static const struct stacktrace_ops save_stack_ops_nosched = { - .stack = save_stack_stack, - .address = save_stack_address_nosched, - .walk_stack = print_context_stack, -}; + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || save_stack_address(trace, addr, nosched)) + break; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, regs, false); } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (!try_get_task_stack(tsk)) + return; + + __save_stack_trace(trace, tsk, NULL, true); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 9b0185fbe3eb..8402907825b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -188,12 +188,12 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; - for (i = 0; i < e820.nr_map; i++) { - if ((e820.map[i].type != E820_RAM) - && (e820.map[i].type != E820_RESERVED_KERN)) + for (i = 0; i < e820->nr_map; i++) { + if ((e820->map[i].type != E820_RAM) + && (e820->map[i].type != E820_RESERVED_KERN)) continue; - add_mac_region(e820.map[i].addr, e820.map[i].size); + add_mac_region(e820->map[i].addr, e820->map[i].size); } tboot->acpi_sinfo.kernel_s3_resume_vector = @@ -323,25 +323,16 @@ static int tboot_wait_for_aps(int num_aps) return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } -static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int tboot_dying_cpu(unsigned int cpu) { - switch (action) { - case CPU_DYING: - atomic_inc(&ap_wfs_count); - if (num_online_cpus() == 1) - if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) - return NOTIFY_BAD; - break; + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) { + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return -EBUSY; } - return NOTIFY_OK; + return 0; } -static struct notifier_block tboot_cpu_notifier = -{ - .notifier_call = tboot_cpu_callback, -}; - #ifdef CONFIG_DEBUG_FS #define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \ @@ -417,8 +408,8 @@ static __init int tboot_late_init(void) tboot_create_trampoline(); atomic_set(&ap_wfs_count, 0); - register_hotcpu_notifier(&tboot_cpu_notifier); - + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL, + tboot_dying_cpu); #ifdef CONFIG_DEBUG_FS debugfs_create_file("tboot_log", S_IRUSR, arch_debugfs_dir, NULL, &tboot_log_fops); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index cb4a01b41e27..222e84e2432e 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -9,7 +9,6 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include <linux/module.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/asm.h> @@ -74,7 +73,3 @@ int rodata_test(void) return 0; } - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); -MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d1590486204a..bd4e3d4d3625 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -21,7 +21,7 @@ #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> @@ -96,6 +96,12 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } +/* + * In IST context, we explicitly disable preemption. This serves two + * purposes: it makes it much less likely that we would accidentally + * schedule in IST context and it will force a warning if we somehow + * manage to schedule by accident. + */ void ist_enter(struct pt_regs *regs) { if (user_mode(regs)) { @@ -110,13 +116,7 @@ void ist_enter(struct pt_regs *regs) rcu_nmi_enter(); } - /* - * We are atomic because we're on the IST stack; or we're on - * x86_32, in which case we still shouldn't schedule; or we're - * on x86_64 and entered from user mode, in which case we're - * still atomic unless ist_begin_non_atomic is called. - */ - preempt_count_add(HARDIRQ_OFFSET); + preempt_disable(); /* This code is a bit fragile. Test it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); @@ -124,7 +124,7 @@ void ist_enter(struct pt_regs *regs) void ist_exit(struct pt_regs *regs) { - preempt_count_sub(HARDIRQ_OFFSET); + preempt_enable_no_resched(); if (!user_mode(regs)) rcu_nmi_exit(); @@ -155,7 +155,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) BUG_ON((unsigned long)(current_top_of_stack() - current_stack_pointer()) >= THREAD_SIZE); - preempt_count_sub(HARDIRQ_OFFSET); + preempt_enable_no_resched(); } /** @@ -165,7 +165,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) */ void ist_end_non_atomic(void) { - preempt_count_add(HARDIRQ_OFFSET); + preempt_disable(); } static nokprobe_inline int @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +#ifdef CONFIG_VMAP_STACK +__visible void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) +{ + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", + (void *)fault_address, current->stack, + (char *)current->stack + THREAD_SIZE - 1); + die(message, regs, 0); + + /* Be absolutely certain we don't return. */ + panic(message); +} +#endif + #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long cr2; +#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF): + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being + * deliv- ered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + cr2 = read_cr2(); + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); +#endif + #ifdef CONFIG_DOUBLEFAULT df_debug(regs, error_code); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 38ba6de56ede..46b2f41f8b05 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -3,7 +3,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> @@ -22,6 +22,8 @@ #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> +#include <asm/apic.h> +#include <asm/intel-family.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -239,7 +241,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu) { unsigned long long tsc_now, ns_now; struct cyc2ns_data *data; @@ -248,7 +250,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_save(flags); sched_clock_idle_sleep_event(); - if (!cpu_khz) + if (!khz) goto done; data = cyc2ns_write_begin(cpu); @@ -261,7 +263,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -335,12 +337,6 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); -int check_tsc_disabled(void) -{ - return tsc_disabled; -} -EXPORT_SYMBOL_GPL(check_tsc_disabled); - #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { @@ -665,19 +661,82 @@ success: } /** - * native_calibrate_tsc - calibrate the tsc on boot + * native_calibrate_tsc + * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + unsigned int crystal_khz; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + crystal_khz = ecx_hz / 1000; + + if (crystal_khz == 0) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + crystal_khz = 24000; /* 24.0 MHz */ + break; + case INTEL_FAM6_SKYLAKE_X: + crystal_khz = 25000; /* 25.0 MHz */ + break; + case INTEL_FAM6_ATOM_GOLDMONT: + crystal_khz = 19200; /* 19.2 MHz */ + break; + } + } + + return crystal_khz * ebx_numerator / eax_denominator; +} + +static unsigned long cpu_khz_from_cpuid(void) +{ + unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x16) + return 0; + + eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; + + cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + + return eax_base_mhz * 1000; +} + +/** + * native_calibrate_cpu - calibrate the cpu on boot + */ +unsigned long native_calibrate_cpu(void) +{ u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - /* Calibrate TSC using MSR for Intel Atom SoCs */ - local_irq_save(flags); - fast_calibrate = try_msr_calibrate_tsc(); - local_irq_restore(flags); + fast_calibrate = cpu_khz_from_cpuid(); + if (fast_calibrate) + return fast_calibrate; + + fast_calibrate = cpu_khz_from_msr(); if (fast_calibrate) return fast_calibrate; @@ -837,8 +896,12 @@ int recalibrate_cpu_khz(void) if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -1193,6 +1256,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1244,8 +1310,18 @@ void __init tsc_init(void) return; } + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + + /* + * Trust non-zero tsc_khz as authorative, + * and use it to sanity check cpu_khz, + * which will be off if system timer is off. + */ + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); @@ -1265,7 +1341,7 @@ void __init tsc_init(void) */ for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(cpu_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu); } if (tsc_disabled > 0) diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 9911a0620f9a..0fe720d64fef 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,14 +1,5 @@ /* - * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. - * - * TSC in Intel Atom SoC runs at a constant rate which can be figured - * by this formula: - * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency> - * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 - * for details. - * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR - * based calibration is the only option. - * + * tsc_msr.c - TSC frequency enumeration via MSR * * Copyright (C) 2013 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> @@ -22,18 +13,10 @@ #include <asm/apic.h> #include <asm/param.h> -/* CPU reference clock frequency: in KHz */ -#define FREQ_80 80000 -#define FREQ_83 83200 -#define FREQ_100 99840 -#define FREQ_133 133200 -#define FREQ_166 166400 - -#define MAX_NUM_FREQS 8 +#define MAX_NUM_FREQS 9 /* - * According to Intel 64 and IA-32 System Programming Guide, - * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the @@ -48,17 +31,18 @@ struct freq_desc { static struct freq_desc freq_desc_tables[] = { /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, /* CLV+ */ - { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, - /* TNG */ - { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, - /* VLV2 */ - { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, - /* ANN */ - { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, - /* AIRMONT */ - { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } }, + { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, + /* TNG - Intel Atom processor Z3400 series */ + { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, + /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ + { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, + /* ANN - Intel Atom processor Z3500 series */ + { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, + /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ + { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, + 80000, 93300, 90000, 88900, 87500 } }, }; static int match_cpu(u8 family, u8 model) @@ -79,16 +63,20 @@ static int match_cpu(u8 family, u8 model) (freq_desc_tables[cpu_index].freqs[freq_id]) /* - * Do MSR calibration only for known/supported CPUs. + * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Returns the calibration value or 0 if MSR calibration failed. + * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Return processor base frequency in KHz, or 0 on failure. */ -unsigned long try_msr_calibrate_tsc(void) +unsigned long cpu_khz_from_msr(void) { u32 lo, hi, ratio, freq_id, freq; unsigned long res; int cpu_index; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) return 0; @@ -100,31 +88,17 @@ unsigned long try_msr_calibrate_tsc(void) rdmsr(MSR_IA32_PERF_STATUS, lo, hi); ratio = (hi >> 8) & 0x1f; } - pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); - - if (!ratio) - goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); freq_id = lo & 0x7; freq = id_to_freq(cpu_index, freq_id); - pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", - freq_id, freq); - if (!freq) - goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; - pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; - pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif return res; - -fail: - pr_warn("Fast TSC calibration using MSR failed\n"); - return 0; } diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 000000000000..a2456d4d286a --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,93 @@ +#include <linux/sched.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + unsigned long *addr_p = unwind_get_return_address_ptr(state); + + if (unwind_done(state)) + return 0; + + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr_p); + + return __kernel_text_address(addr) ? addr : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool update_stack_state(struct unwind_state *state, void *addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If addr isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that 'info->next_sp' could point to an empty stack and 'addr' could + * be on a subsequent stack. + */ + while (!on_stack(info, addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + next_bp = (unsigned long *)*state->bp; + + /* make sure the next frame's data is accessible */ + if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + return false; + + /* move to the next frame */ + state->bp = next_bp; + return true; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* don't even attempt to start from user mode regs */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + /* set up the starting stack frame */ + state->bp = get_frame_pointer(task, regs); + + /* initialize stack info and make sure the frame data is accessible */ + get_stack_info(state->bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->bp < first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 000000000000..9298993dc8b7 --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,53 @@ +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return ftrace_graph_ret_addr(state->task, &state->graph_idx, + *state->sp, state->sp); +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) + if (__kernel_text_address(*state->sp)) + return true; + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = first_frame; + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + if (!__kernel_text_address(*first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31d99ff..495c776de4b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 3dce1ca0a653..01f30e56f99e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) static inline int is_revectored(int nr, struct revectored_struct *bitmap) { - __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" - :"=r" (nr) - :"m" (*bitmap), "r" (nr)); - return nr; + return test_bit(nr, bitmap->__map); } #define val_byte(val, n) (((__u8 *)&val)[n]) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9297a002d8e5..dbf67f64d5ec 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,6 +97,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd05942bc918..b2cee3d19477 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,8 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ -#include <linux/module.h> +#include <linux/export.h> +#include <linux/spinlock_types.h> #include <linux/smp.h> #include <net/checksum.h> @@ -37,13 +38,16 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index dad5fe9633a3..0bd9f1287f39 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include <linux/init.h> #include <linux/ioport.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/pci.h> #include <asm/bios_ebda.h> @@ -91,7 +91,8 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; -struct x86_platform_ops x86_platform = { +struct x86_platform_ops x86_platform __ro_after_init = { + .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, @@ -107,7 +108,7 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi = { +struct x86_msi_ops x86_msi __ro_after_init = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, @@ -136,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops = { +struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { .read = native_io_apic_read, .disable = native_disable_io_apic, }; |