summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c55
-rw-r--r--arch/x86/kernel/acpi/cppc_msr.c58
-rw-r--r--arch/x86/kernel/acpi/cstate.c4
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/amd_gart_64.c21
-rw-r--r--arch/x86/kernel/amd_nb.c43
-rw-r--r--arch/x86/kernel/apb_timer.c29
-rw-r--r--arch/x86/kernel/apic/apic.c146
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c26
-rw-r--r--arch/x86/kernel/apic/apic_noop.c4
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c7
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c3
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c20
-rw-r--r--arch/x86/kernel/apic/io_apic.c29
-rw-r--r--arch/x86/kernel/apic/ipi.c1
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c9
-rw-r--r--arch/x86/kernel/apic/probe_64.c3
-rw-r--r--arch/x86/kernel/apic/vector.c25
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c88
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c89
-rw-r--r--arch/x86/kernel/asm-offsets.c11
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c5
-rw-r--r--arch/x86/kernel/cpu/amd.c16
-rw-r--r--arch/x86/kernel/cpu/common.c45
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c14
-rw-r--r--arch/x86/kernel/cpu/intel.c14
-rw-r--r--arch/x86/kernel/cpu/match.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c50
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c206
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c49
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c62
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c261
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/rdrand.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c15
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/dumpstack.c267
-rw-r--r--arch/x86/kernel/dumpstack_32.c156
-rw-r--r--arch/x86/kernel/dumpstack_64.c332
-rw-r--r--arch/x86/kernel/e820.c155
-rw-r--r--arch/x86/kernel/early-quirks.c509
-rw-r--r--arch/x86/kernel/ebda.c114
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/fpu/core.c37
-rw-r--r--arch/x86/kernel/fpu/init.c35
-rw-r--r--arch/x86/kernel/fpu/regset.c52
-rw-r--r--arch/x86/kernel/fpu/signal.c58
-rw-r--r--arch/x86/kernel/fpu/xstate.c550
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/head_64.S15
-rw-r--r--arch/x86/kernel/hpet.c166
-rw-r--r--arch/x86/kernel/hw_breakpoint.c3
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c5
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/io_delay.c2
-rw-r--r--arch/x86/kernel/irq.c3
-rw-r--r--arch/x86/kernel/irq_32.c3
-rw-r--r--arch/x86/kernel/irq_64.c4
-rw-r--r--arch/x86/kernel/kdebugfs.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c4
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/kprobes/core.c14
-rw-r--r--arch/x86/kernel/kprobes/opt.c2
-rw-r--r--arch/x86/kernel/ksysfs.c2
-rw-r--r--arch/x86/kernel/kvm.c292
-rw-r--r--arch/x86/kernel/kvmclock.c3
-rw-r--r--arch/x86/kernel/livepatch.c65
-rw-r--r--arch/x86/kernel/mpparse.c4
-rw-r--r--arch/x86/kernel/nmi.c1
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c9
-rw-r--r--arch/x86/kernel/paravirt.c10
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c14
-rw-r--r--arch/x86/kernel/pci-dma.c4
-rw-r--r--arch/x86/kernel/pci-nommu.c4
-rw-r--r--arch/x86/kernel/pci-swiotlb.c6
-rw-r--r--arch/x86/kernel/platform-quirks.c4
-rw-r--r--arch/x86/kernel/pmem.c2
-rw-r--r--arch/x86/kernel/process.c51
-rw-r--r--arch/x86/kernel/process_32.c35
-rw-r--r--arch/x86/kernel/process_64.c67
-rw-r--r--arch/x86/kernel/ptrace.c27
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kernel/quirks.c31
-rw-r--r--arch/x86/kernel/reboot.c25
-rw-r--r--arch/x86/kernel/resource.c4
-rw-r--r--arch/x86/kernel/rtc.c3
-rw-r--r--arch/x86/kernel/setup.c64
-rw-r--r--arch/x86/kernel/setup_percpu.c7
-rw-r--r--arch/x86/kernel/signal.c56
-rw-r--r--arch/x86/kernel/signal_compat.c142
-rw-r--r--arch/x86/kernel/smpboot.c146
-rw-r--r--arch/x86/kernel/stacktrace.c81
-rw-r--r--arch/x86/kernel/tboot.c33
-rw-r--r--arch/x86/kernel/test_rodata.c5
-rw-r--r--arch/x86/kernel/traps.c83
-rw-r--r--arch/x86/kernel/tsc.c112
-rw-r--r--arch/x86/kernel/tsc_msr.c68
-rw-r--r--arch/x86/kernel/unwind_frame.c93
-rw-r--r--arch/x86/kernel/unwind_guess.c53
-rw-r--r--arch/x86/kernel/uprobes.c22
-rw-r--r--arch/x86/kernel/vm86_32.c5
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c8
-rw-r--r--arch/x86/kernel/x86_init.c9
122 files changed, 3325 insertions, 2316 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0503f5bfb18d..4dd5d500eb60 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
@@ -125,6 +126,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+ifdef CONFIG_FRAME_POINTER
+obj-y += unwind_frame.o
+else
+obj-y += unwind_guess.o
+endif
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 3242e591fa82..26b78d86f25a 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9414f84584e4..8a5abaa7d453 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -28,7 +28,7 @@
#include <linux/acpi_pmtmr.h>
#include <linux/efi.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/dmi.h>
#include <linux/irq.h>
#include <linux/slab.h>
@@ -161,28 +161,29 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
/**
* acpi_register_lapic - register a local apic and generates a logic cpu number
* @id: local apic id to register
+ * @acpiid: ACPI id to register
* @enabled: this cpu is enabled or not
*
* Returns the logic cpu number which maps to the local apic
*/
-static int acpi_register_lapic(int id, u8 enabled)
+static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
{
unsigned int ver = 0;
+ int cpu;
if (id >= MAX_LOCAL_APIC) {
printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
return -EINVAL;
}
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ ver = boot_cpu_apic_version;
- return generic_processor_info(id, ver);
+ cpu = __generic_processor_info(id, ver, enabled);
+ if (cpu >= 0)
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
+
+ return cpu;
}
static int __init
@@ -212,7 +213,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
if (!apic->apic_id_valid(apic_id) && enabled)
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
else
- acpi_register_lapic(apic_id, enabled);
+ acpi_register_lapic(apic_id, processor->uid, enabled);
#else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
#endif
@@ -232,6 +233,10 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
acpi_table_print_madt_entry(header);
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -240,6 +245,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
* when we use CPU hotplug.
*/
acpi_register_lapic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
@@ -258,6 +264,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
acpi_table_print_madt_entry(header);
acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
@@ -274,6 +281,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(header);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
@@ -697,7 +706,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -708,13 +717,14 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
numa_set_node(cpu, nid);
}
#endif
+ return 0;
}
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
{
int cpu;
- cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED);
+ cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED);
if (cpu < 0) {
pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
return cpu;
@@ -990,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
acpi_parse_sapic, MAX_LOCAL_APIC);
@@ -1023,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
return ret;
}
- x2count = madt_proc[0].count;
- count = madt_proc[1].count;
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -1505,7 +1500,7 @@ void __init acpi_boot_table_init(void)
* If acpi_disabled, bail out
*/
if (acpi_disabled)
- return;
+ return;
/*
* Initialize the ACPI boot-time table parser.
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 000000000000..6fb478bf82fd
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 4b28159e0421..af15f4444330 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -5,7 +5,7 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/cpu.h>
@@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index adb3eaf8fe2a..48587335ede8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+ initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 8e3842fc8bea..63ff468a7986 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -20,7 +20,6 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
-#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
#include <linux/bitmap.h>
@@ -242,7 +241,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
static dma_addr_t gart_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned long bus;
phys_addr_t paddr = page_to_phys(page) + offset;
@@ -264,7 +263,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
*/
static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned long iommu_page;
int npages;
@@ -286,7 +285,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
* Wrapper for pci_unmap_single working with scatterlists.
*/
static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s;
int i;
@@ -294,7 +293,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
- gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
+ gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0);
}
}
@@ -316,7 +315,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_addr) {
if (i > 0)
- gart_unmap_sg(dev, sg, i, dir, NULL);
+ gart_unmap_sg(dev, sg, i, dir, 0);
nents = 0;
sg[0].dma_length = 0;
break;
@@ -387,7 +386,7 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
* Merge chunks that have page aligned sizes into a continuous mapping.
*/
static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
int need = 0, nextneed, i, out, start;
@@ -457,7 +456,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
error:
flush_gart();
- gart_unmap_sg(dev, sg, out, dir, NULL);
+ gart_unmap_sg(dev, sg, out, dir, 0);
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
@@ -477,7 +476,7 @@ error:
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
- gfp_t flag, struct dma_attrs *attrs)
+ gfp_t flag, unsigned long attrs)
{
dma_addr_t paddr;
unsigned long align_mask;
@@ -509,9 +508,9 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr, struct dma_attrs *attrs)
+ dma_addr_t dma_addr, unsigned long attrs)
{
- gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
+ gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index a147e676fc7b..4fdf6230d93c 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -9,7 +9,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <asm/amd_nb.h>
@@ -71,8 +71,8 @@ int amd_cache_northbridges(void)
while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
i++;
- if (i == 0)
- return 0;
+ if (!i)
+ return -ENODEV;
nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
@@ -219,24 +219,22 @@ int amd_set_subcaches(int cpu, unsigned long mask)
return 0;
}
-static int amd_cache_gart(void)
+static void amd_cache_gart(void)
{
u16 i;
- if (!amd_nb_has_feature(AMD_NB_GART))
- return 0;
-
- flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- amd_northbridges.flags &= ~AMD_NB_GART;
- return -ENOMEM;
- }
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
- for (i = 0; i != amd_nb_num(); i++)
- pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
- &flush_words[i]);
+ flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ pr_notice("Cannot initialize GART flush words, GART support disabled\n");
+ return;
+ }
- return 0;
+ for (i = 0; i != amd_nb_num(); i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
}
void amd_flush_garts(void)
@@ -278,17 +276,10 @@ EXPORT_SYMBOL_GPL(amd_flush_garts);
static __init int init_amd_nbs(void)
{
- int err = 0;
-
- err = amd_cache_northbridges();
-
- if (err < 0)
- pr_notice("Cannot enumerate AMD northbridges\n");
+ amd_cache_northbridges();
+ amd_cache_gart();
- if (amd_cache_gart() < 0)
- pr_notice("Cannot initialize GART flush words, GART support disabled\n");
-
- return err;
+ return 0;
}
/* This has to go after the PCI subsystem */
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cefacbad1531..456316f6c868 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -215,26 +215,18 @@ void apbt_setup_secondary_clock(void)
* cpu timers during the offline process due to the ordering of notification.
* the extra interrupt is harmless.
*/
-static int apbt_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
+static int apbt_cpu_dead(unsigned int cpu)
{
- unsigned long cpu = (unsigned long)hcpu;
struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DEAD:
- dw_apb_clockevent_pause(adev->timer);
- if (system_state == SYSTEM_RUNNING) {
- pr_debug("skipping APBT CPU %lu offline\n", cpu);
- } else {
- pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
- dw_apb_clockevent_stop(adev->timer);
- }
- break;
- default:
- pr_debug("APBT notified %lu, no action\n", action);
+ dw_apb_clockevent_pause(adev->timer);
+ if (system_state == SYSTEM_RUNNING) {
+ pr_debug("skipping APBT CPU %u offline\n", cpu);
+ } else {
+ pr_debug("APBT clockevent for cpu %u offline\n", cpu);
+ dw_apb_clockevent_stop(adev->timer);
}
- return NOTIFY_OK;
+ return 0;
}
static __init int apbt_late_init(void)
@@ -242,9 +234,8 @@ static __init int apbt_late_init(void)
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
- /* This notifier should be called after workqueue is ready */
- hotcpu_notifier(apbt_cpuhp_notify, -20);
- return 0;
+ return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL,
+ apbt_cpu_dead);
}
fs_initcall(apbt_late_init);
#else
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 60078a67d7e3..88c657b057e2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -23,7 +23,7 @@
#include <linux/bootmem.h>
#include <linux/ftrace.h>
#include <linux/ioport.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
@@ -64,6 +64,8 @@ unsigned disabled_cpus;
unsigned int boot_cpu_physical_apicid = -1U;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+u8 boot_cpu_apic_version;
+
/*
* The highest APIC ID seen during enumeration.
*/
@@ -92,8 +94,10 @@ static int apic_extnmi = APIC_EXTNMI_BSP;
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
#ifdef CONFIG_X86_32
@@ -145,7 +149,7 @@ static int force_enable_local_apic __initdata;
*/
static int __init parse_lapic(char *arg)
{
- if (config_enabled(CONFIG_X86_32) && !arg)
+ if (IS_ENABLED(CONFIG_X86_32) && !arg)
force_enable_local_apic = 1;
else if (arg && !strncmp(arg, "notscdeadline", 13))
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
@@ -311,7 +315,7 @@ int lapic_get_maxlvt(void)
/* Clock divisor */
#define APIC_DIVISOR 16
-#define TSC_DIVISOR 32
+#define TSC_DIVISOR 8
/*
* This function sets up the local APIC timer, with a timeout of
@@ -563,13 +567,37 @@ static void setup_APIC_timer(void)
CLOCK_EVT_FEAT_DUMMY);
levt->set_next_event = lapic_next_deadline;
clockevents_config_and_register(levt,
- (tsc_khz / TSC_DIVISOR) * 1000,
+ tsc_khz * (1000 / TSC_DIVISOR),
0xF, ~0UL);
} else
clockevents_register_device(levt);
}
/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
+}
+
+/*
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
@@ -1348,7 +1376,6 @@ void setup_local_APIC(void)
* Actually disabling the focus CPU check just makes the hang less
* frequent as it makes the interrupt distributon model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1597,6 +1624,9 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
+ if (skip_ioapic_setup)
+ return;
+
ir_stat = irq_remapping_prepare();
if (ir_stat < 0 && !x2apic_supported())
return;
@@ -1787,8 +1817,7 @@ void __init init_apic_mappings(void)
* since smp_sanity_check is prepared for such a case
* and disable smp mode
*/
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
@@ -1799,17 +1828,14 @@ void __init register_lapic_address(unsigned long address)
if (!x2apic_mode) {
set_fixmap_nocache(FIX_APIC_BASE, address);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
+ APIC_BASE, address);
}
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
-int apic_version[MAX_LOCAL_APIC];
-
/*
* Local APIC interrupts
*/
@@ -1998,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-int generic_processor_info(int apicid, int version)
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals to current allocated max logical CPU ID plus 1.
+ * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of
+ * nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Used to store mapping between logical CPU IDs and APIC IDs.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
+ * and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+ int i;
+
+ /*
+ * cpuid <-> apicid mapping is persistent, so when a cpu is up,
+ * check if the kernel has allocated a cpuid for it.
+ */
+ for (i = 0; i < nr_logical_cpuids; i++) {
+ if (cpuid_to_apicid[i] == apicid)
+ return i;
+ }
+
+ /* Allocate a new cpuid. */
+ if (nr_logical_cpuids >= nr_cpu_ids) {
+ WARN_ONCE(1, "Only %d processors supported."
+ "Processor %d/0x%x and the rest are ignored.\n",
+ nr_cpu_ids - 1, nr_logical_cpuids, apicid);
+ return -1;
+ }
+
+ cpuid_to_apicid[nr_logical_cpuids] = apicid;
+ return nr_logical_cpuids++;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2045,7 +2117,7 @@ int generic_processor_info(int apicid, int version)
int thiscpu = max + disabled_cpus - 1;
pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i almost"
+ "APIC: NR_CPUS/possible_cpus limit of %i almost"
" reached. Keeping one slot for boot cpu."
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
@@ -2056,15 +2128,16 @@ int generic_processor_info(int apicid, int version)
if (num_processors >= nr_cpu_ids) {
int thiscpu = max + disabled_cpus;
- pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i reached."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ if (enabled) {
+ pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+ "reached. Processor %d/0x%x ignored.\n",
+ max, thiscpu, apicid);
+ }
disabled_cpus++;
return -EINVAL;
}
- num_processors++;
if (apicid == boot_cpu_physical_apicid) {
/*
* x86_bios_cpu_apicid is required to have processors listed
@@ -2074,8 +2147,16 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
/*
* This can happen on physical hotplug. The sanity check at boot time
@@ -2085,8 +2166,9 @@ int generic_processor_info(int apicid, int version)
if (topology_update_package_map(apicid, cpu) < 0) {
int thiscpu = max + disabled_cpus;
- pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n",
+ pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
thiscpu, apicid);
+
disabled_cpus++;
return -ENOSPC;
}
@@ -2099,14 +2181,12 @@ int generic_processor_info(int apicid, int version)
cpu, apicid);
version = 0x10;
}
- apic_version[apicid] = version;
- if (version != apic_version[boot_cpu_physical_apicid]) {
+ if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
+ boot_cpu_apic_version, cpu, version);
}
- physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
@@ -2119,11 +2199,23 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+
+ if (enabled) {
+ num_processors++;
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ } else {
+ disabled_cpus++;
+ }
return cpu;
}
+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
@@ -2246,7 +2338,7 @@ int __init APIC_init_uniprocessor(void)
* Complain if the BIOS pretends there is one.
*/
if (!boot_cpu_has(X86_FEATURE_APIC) &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 76f89e2b245a..a4d7ff20ed22 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -15,7 +15,7 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/hardirq.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
@@ -25,7 +25,7 @@
static struct apic apic_physflat;
static struct apic apic_flat;
-struct apic __read_mostly *apic = &apic_flat;
+struct apic *apic __ro_after_init = &apic_flat;
EXPORT_SYMBOL_GPL(apic);
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector)
static unsigned int flat_get_apic_id(unsigned long x)
{
- unsigned int id;
-
- id = (((x)>>24) & 0xFFu);
-
- return id;
+ return (x >> 24) & 0xFF;
}
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xFFu)<<24);
- return x;
+ return (id & 0xFF) << 24;
}
static unsigned int read_xapic_id(void)
{
- unsigned int id;
-
- id = flat_get_apic_id(apic_read(APIC_ID));
- return id;
+ return flat_get_apic_id(apic_read(APIC_ID));
}
static int flat_apic_id_registered(void)
@@ -154,7 +144,7 @@ static int flat_probe(void)
return 1;
}
-static struct apic apic_flat = {
+static struct apic apic_flat __ro_after_init = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -181,7 +171,6 @@ static struct apic apic_flat = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
@@ -249,7 +238,7 @@ static int physflat_probe(void)
return 0;
}
-static struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
@@ -278,7 +267,6 @@ static struct apic apic_physflat = {
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 13d19ed58514..b109e4389c92 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -11,7 +11,6 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
@@ -109,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
}
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
.name = "noop",
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
@@ -141,7 +140,6 @@ struct apic apic_noop = {
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index ab5c2c685a3c..e08fe2c8dd8c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
static unsigned long numachip1_set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xffU) << 24);
- return x;
+ return (id & 0xff) << 24;
}
static unsigned int numachip2_get_apic_id(unsigned long x)
@@ -269,7 +266,6 @@ static const struct apic apic_numachip1 __refconst = {
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
- .apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
@@ -321,7 +317,6 @@ static const struct apic apic_numachip2 __refconst = {
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
- .apic_id_mask = 0xffU << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cf9bd896c12d..56012010332c 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -142,7 +142,7 @@ static int probe_bigsmp(void)
return dmi_bigsmp;
}
-static struct apic apic_bigsmp = {
+static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
@@ -171,7 +171,6 @@ static struct apic apic_bigsmp = {
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 7788ce643bf4..c73c9fb281e1 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,7 +16,7 @@
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/delay.h>
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
}
#endif
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
apic->send_IPI_mask(mask, NMI_VECTOR);
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
- nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
}
-static int
-arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
if (nmi_cpu_backtrace(regs))
return NMI_HANDLED;
return NMI_DONE;
}
-NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
-static int __init register_trigger_all_cpu_backtrace(void)
+static int __init register_nmi_cpu_backtrace_handler(void)
{
- register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
0, "arch_bt");
return 0;
}
-early_initcall(register_trigger_all_cpu_backtrace);
+early_initcall(register_nmi_cpu_backtrace_handler);
#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 84e33ff5a6d5..48e6d84f173e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -39,7 +39,7 @@
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscore_ops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
@@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
return __irq_domain_alloc_irqs(domain, irq, 1,
ioapic_alloc_attr_node(info),
- info, legacy);
+ info, legacy, NULL);
}
/*
@@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
info->ioapic_pin))
return -ENOMEM;
} else {
- irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
+ NULL);
if (irq >= 0) {
irq_data = irq_domain_get_irq_data(domain, irq);
data = irq_data->chip_data;
@@ -1592,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* no meaning without the serial APIC bus.
*/
if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ || APIC_XAPIC(boot_cpu_apic_version))
return;
setup_ioapic_ids_from_mpc_nocheck();
}
@@ -2422,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
static u8 io_apic_unique_id(int idx, u8 id)
{
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
else
return id;
@@ -2567,29 +2568,25 @@ static struct resource * __init ioapic_setup_resources(void)
unsigned long n;
struct resource *res;
char *mem;
- int i, num = 0;
+ int i;
- for_each_ioapic(i)
- num++;
- if (num == 0)
+ if (nr_ioapics == 0)
return NULL;
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
- n *= num;
+ n *= nr_ioapics;
mem = alloc_bootmem(n);
res = (void *)mem;
- mem += sizeof(struct resource) * num;
+ mem += sizeof(struct resource) * nr_ioapics;
- num = 0;
for_each_ioapic(i) {
- res[num].name = mem;
- res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
- num++;
- ioapics[i].iomem_res = res;
+ ioapics[i].iomem_res = &res[i];
}
ioapic_resources = res;
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 2a0f225afebd..3a205d4a12d0 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -8,7 +8,6 @@
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/cpu.h>
-#include <linux/module.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ade25320df96..015bbf30e3e3 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
-static struct irq_chip hpet_msi_controller = {
+static struct irq_chip hpet_msi_controller __ro_after_init = {
.name = "HPET-MSI",
.irq_unmask = hpet_msi_unmask,
.irq_mask = hpet_msi_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index f316e34abb42..c48264e202fd 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -8,7 +8,7 @@
*/
#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
@@ -72,7 +72,7 @@ static int probe_default(void)
return 1;
}
-static struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
@@ -101,7 +101,6 @@ static struct apic apic_default = {
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
.cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
@@ -127,7 +126,7 @@ static struct apic apic_default = {
apic_driver(apic_default);
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
EXPORT_SYMBOL_GPL(apic);
static int cmdline_apic __initdata;
@@ -153,7 +152,7 @@ early_param("apic", parse_apic);
void __init default_setup_apic_routing(void)
{
- int version = apic_version[boot_cpu_physical_apicid];
+ int version = boot_cpu_apic_version;
if (num_possible_cpus() > 8) {
switch (boot_cpu_data.x86_vendor) {
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1793dba7a741..c303054b90b5 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -11,10 +11,9 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/dmar.h>
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index a5e400afc563..5d30c5e42bb1 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
struct apic_chip_data *data = irq_data->chip_data;
int err, irq = irq_data->irq;
- if (!config_enabled(CONFIG_SMP))
+ if (!IS_ENABLED(CONFIG_SMP))
return -EPERM;
if (!cpumask_intersects(dest, cpu_online_mask))
@@ -661,11 +661,28 @@ void irq_complete_move(struct irq_cfg *cfg)
*/
void irq_force_complete_move(struct irq_desc *desc)
{
- struct irq_data *irqdata = irq_desc_get_irq_data(desc);
- struct apic_chip_data *data = apic_chip_data(irqdata);
- struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ struct irq_data *irqdata;
+ struct apic_chip_data *data;
+ struct irq_cfg *cfg;
unsigned int cpu;
+ /*
+ * The function is called for all descriptors regardless of which
+ * irqdomain they belong to. For example if an IRQ is provided by
+ * an irq_chip as part of a GPIO driver, the chip data for that
+ * descriptor is specific to the irq_chip in question.
+ *
+ * Check first that the chip_data is what we expect
+ * (apic_chip_data) before touching it any further.
+ */
+ irqdata = irq_domain_get_irq_data(x86_vector_domain,
+ irq_desc_get_irq(desc));
+ if (!irqdata)
+ return;
+
+ data = apic_chip_data(irqdata);
+ cfg = data ? &data->cfg : NULL;
+
if (!cfg)
return;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index aca8b75c1552..200af5ae9662 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -152,68 +152,53 @@ static void init_x2apic_ldr(void)
}
}
- /*
- * At CPU state changes, update the x2apic cluster sibling info.
- */
-static int
-update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
+/*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int x2apic_prepare_cpu(unsigned int cpu)
{
- unsigned int this_cpu = (unsigned long)hcpu;
- unsigned int cpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
- GFP_KERNEL)) {
- err = -ENOMEM;
- } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
- GFP_KERNEL)) {
- free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
- err = -ENOMEM;
- }
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- for_each_online_cpu(cpu) {
- if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
- continue;
- cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
- cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
- }
- free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
- free_cpumask_var(per_cpu(ipi_mask, this_cpu));
- break;
+ if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) {
+ free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+ return -ENOMEM;
}
- return notifier_from_errno(err);
+ return 0;
}
-static struct notifier_block x2apic_cpu_notifier = {
- .notifier_call = update_clusterinfo,
-};
-
-static int x2apic_init_cpu_notifier(void)
+static int x2apic_dead_cpu(unsigned int this_cpu)
{
- int cpu = smp_processor_id();
-
- zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
-
- BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+ int cpu;
- cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
- register_hotcpu_notifier(&x2apic_cpu_notifier);
- return 1;
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+ return 0;
}
static int x2apic_cluster_probe(void)
{
- if (x2apic_mode)
- return x2apic_init_cpu_notifier();
- else
+ int cpu = smp_processor_id();
+ int ret;
+
+ if (!x2apic_mode)
+ return 0;
+
+ ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+ x2apic_prepare_cpu, x2apic_dead_cpu);
+ if (ret < 0) {
+ pr_err("Failed to register X2APIC_PREPARE\n");
return 0;
+ }
+ cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
+ return 1;
}
static const struct cpumask *x2apic_cluster_target_cpus(void)
@@ -242,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}
-static struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
@@ -270,7 +255,6 @@ static struct apic apic_x2apic_cluster = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a1242e2c12e6..ff111f05a314 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -98,7 +98,7 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
-static struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
@@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 29003154fafd..aeef53ce93e1 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -12,7 +12,7 @@
#include <linux/proc_fs.h>
#include <linux/threads.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/sched.h>
@@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
if (strncmp(oem_id, "SGI", 3) != 0)
return 0;
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
/* Setup early hub type field in uv_hub_info for Node 0 */
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
@@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void)
struct uv_gam_range_entry *gre = uv_gre_table;
struct uv_gam_range_s *grt;
unsigned long last_limit = 0, ram_limit = 0;
- int bytes, i, sid, lsid = -1;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
if (!gre)
return;
@@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void)
}
sid = gre->sockid - _min_socket;
if (lsid < sid) { /* new range */
- grt = &_gr_table[sid];
- grt->base = lsid;
+ grt = &_gr_table[indx];
+ grt->base = lindx;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
lsid = sid;
+ lindx = indx++;
continue;
}
if (lsid == sid && !ram_limit) { /* update range */
@@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void)
}
if (!ram_limit) { /* non-contiguous ram range */
grt++;
- grt->base = sid - 1;
+ grt->base = lindx;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
continue;
@@ -527,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- /* maskout x2apic_extra_bits ? */
- x = id;
- return x;
+ /* CHECKME: Do we need to mask out the xapic extra bits? */
+ return id;
}
static unsigned int uv_read_apic_id(void)
@@ -554,7 +557,7 @@ static int uv_probe(void)
return apic == &apic_x2apic_uv_x;
}
-static struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
.probe = uv_probe,
@@ -582,7 +585,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
@@ -919,16 +921,16 @@ static void uv_heartbeat(unsigned long ignored)
uv_set_scir_bits(bits);
/* enable next timer period */
- mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
-static void uv_heartbeat_enable(int cpu)
+static int uv_heartbeat_enable(unsigned int cpu)
{
while (!uv_cpu_scir_info(cpu)->enabled) {
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- setup_timer(timer, uv_heartbeat, cpu);
+ setup_pinned_timer(timer, uv_heartbeat, cpu);
timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
add_timer_on(timer, cpu);
uv_cpu_scir_info(cpu)->enabled = 1;
@@ -936,43 +938,24 @@ static void uv_heartbeat_enable(int cpu)
/* also ensure that boot cpu is enabled */
cpu = 0;
}
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
-static void uv_heartbeat_disable(int cpu)
+static int uv_heartbeat_disable(unsigned int cpu)
{
if (uv_cpu_scir_info(cpu)->enabled) {
uv_cpu_scir_info(cpu)->enabled = 0;
del_timer(&uv_cpu_scir_info(cpu)->timer);
}
uv_set_cpu_scir_bits(cpu, 0xff);
-}
-
-/*
- * cpu hotplug notifier
- */
-static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- uv_heartbeat_enable(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uv_heartbeat_disable(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ return 0;
}
static __init void uv_scir_register_cpu_notifier(void)
{
- hotcpu_notifier(uv_scir_cpu_notify, 0);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online",
+ uv_heartbeat_enable, uv_heartbeat_disable);
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -1156,19 +1139,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (!index) {
pr_info("UV: GAM Range Table...\n");
- pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n",
+ pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n",
"Range", "", "Size", "Type", "NASID",
- "SID", "PN", "PXM");
+ "SID", "PN");
}
pr_info(
- "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n",
+ "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
index++,
(unsigned long)lgre << UV_GAM_RANGE_SHFT,
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
((unsigned long)(gre->limit - lgre)) >>
(30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
- gre->type, gre->nasid, gre->sockid,
- gre->pnode, gre->pxm);
+ gre->type, gre->nasid, gre->sockid, gre->pnode);
lgre = gre->limit;
if (sock_min > gre->sockid)
@@ -1287,7 +1269,7 @@ static void __init build_socket_tables(void)
_pnode_to_socket[i] = SOCK_EMPTY;
/* fill in pnode/node/addr conversion list values */
- pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n");
+ pr_info("UV: GAM Building socket/pnode conversion tables\n");
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
continue;
@@ -1295,20 +1277,18 @@ static void __init build_socket_tables(void)
if (_socket_to_pnode[i] != SOCK_EMPTY)
continue; /* duplicate */
_socket_to_pnode[i] = gre->pnode;
- _socket_to_node[i] = gre->pxm;
i = gre->pnode - minpnode;
_pnode_to_socket[i] = gre->sockid;
pr_info(
- "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n",
+ "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
gre->sockid, gre->type, gre->nasid,
_socket_to_pnode[gre->sockid - minsock],
- _socket_to_node[gre->sockid - minsock],
_pnode_to_socket[gre->pnode - minpnode]);
}
- /* check socket -> node values */
+ /* Set socket -> node values */
lnid = -1;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
@@ -1319,14 +1299,9 @@ static void __init build_socket_tables(void)
lnid = nid;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
- i = sockid - minsock;
-
- if (nid != _socket_to_node[i]) {
- pr_warn(
- "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n",
- i, sockid, gre->type, _socket_to_node[i], nid);
- _socket_to_node[i] = nid;
- }
+ _socket_to_node[sockid - minsock] = nid;
+ pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+ sockid, apicid, nid);
}
/* Setup physical blade to pnode translation from GAM Range Table */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 674134e9f5e5..c62e015b126c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -29,9 +29,14 @@
void common(void) {
BLANK();
- OFFSET(TI_flags, thread_info, flags);
- OFFSET(TI_status, thread_info, status);
- OFFSET(TI_addr_limit, thread_info, addr_limit);
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
+ OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
+ OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ecdc1d217dc0..880aa093268d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -57,6 +57,11 @@ void foo(void)
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+#ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d875f97d4e0b..210927ee2e74 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,11 @@ int main(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
+#ifdef CONFIG_CC_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c343a54bed39..b81fe2d63e15 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -669,19 +669,30 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
+#define MSR_AMD64_DE_CFG 0xC0011029
+
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
+
static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
@@ -726,6 +737,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 6: init_amd_k7(c); break;
case 0xf: init_amd_k8(c); break;
case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0fe6953f421c..9bd910a7dd0a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2,7 +2,7 @@
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/ctype.h>
@@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
-
- cpu_detect(c);
- get_cpu_vendor(c);
- get_cpu_cap(c);
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
- c->cpu_index = 0;
- filter_cpuid_features(c, false);
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
- if (this_cpu->c_bsp_init)
- this_cpu->c_bsp_init(c);
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ }
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
@@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) debug_idt_table };
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) idt_table,
+};
+const struct desc_ptr debug_idt_descr = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
@@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
- */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
@@ -1452,7 +1451,7 @@ void cpu_init(void)
struct task_struct *me;
struct tss_struct *t;
unsigned long v;
- int cpu = stack_smp_processor_id();
+ int cpu = raw_smp_processor_id();
int i;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 73d391ae452f..35691a6b0d32 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,7 +21,8 @@
*
*/
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
@@ -85,3 +86,14 @@ bool __init hypervisor_x2apic_available(void)
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}
+
+void hypervisor_pin_vcpu(int cpu)
+{
+ if (!x86_hyper)
+ return;
+
+ if (x86_hyper->pin_vcpu)
+ x86_hyper->pin_vcpu(cpu);
+ else
+ WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 6e2ffbebbcdb..fcd484d2bb03 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -5,7 +5,7 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/thread_info.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
@@ -13,6 +13,7 @@
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
+#include <asm/intel-family.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -300,15 +301,14 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
}
/*
- * P4 Xeon errata 037 workaround.
+ * P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
- > 0) {
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
- pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
}
}
@@ -509,6 +509,10 @@ static void init_intel(struct cpuinfo_x86 *c)
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+ if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
+ ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ set_cpu_bug(c, X86_BUG_MONITOR);
+
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index fbb5e90557a5..e42117d5f4d7 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -1,7 +1,7 @@
#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
/**
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 34c89a3e8260..83f1a98d37db 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
return;
mce_setup(&m);
- m.bank = 1;
+ m.bank = -1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 92e5e37d97bf..a7fdf453d895 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
pr_cont("\n");
/*
* Note this output is parsed by external tools and old fields
@@ -425,7 +433,7 @@ static u64 mce_rdmsrl(u32 msr)
}
if (rdmsrl_safe(msr, &v)) {
- WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
+ WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
/*
* Return zero in case the access faulted. This should
* not happen normally but can happen if the CPU does
@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
+
if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i));
@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift;
m->addr <<= shift;
}
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV)
+ m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
}
@@ -1309,7 +1335,7 @@ static void __restart_timer(struct timer_list *t, unsigned long interval)
if (timer_pending(t)) {
if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
+ mod_timer(t, when);
} else {
t->expires = round_jiffies(when);
add_timer_on(t, smp_processor_id());
@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
- /*
- * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
- * whether this processor will actually generate recoverable
- * machine checks. Check to see if this is an E7 model Xeon.
- * We can't do a model number check because E5 and E7 use the
- * same model number. E5 doesn't support recovery, E7 does.
- */
- if (mca_cfg.recovery || (mca_cfg.ser &&
- !strncmp(c->x86_model_id,
- "Intel(R) Xeon(R) CPU E7-", 24)))
- set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
@@ -1735,7 +1750,7 @@ static void __mcheck_cpu_init_timer(void)
struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned int cpu = smp_processor_id();
- setup_timer(t, mce_timer_fn, cpu);
+ setup_pinned_timer(t, mce_timer_fn, cpu);
mce_start_timer(cpu, t);
}
@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
*/
static int __init mcheck_enable(char *str)
{
@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
static int __init mcheck_late_init(void)
{
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
mcheck_debugfs_init();
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 10b0661651e0..9b5403462936 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/string.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
@@ -63,37 +64,74 @@ static const char * const th_names[] = {
"execution_unit",
};
-/* Define HWID to IP type mappings for Scalable MCA */
-struct amd_hwid amd_hwids[] = {
- [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
- [SMCA_DF] = { "data_fabric", 0x2E },
- [SMCA_UMC] = { "umc", 0x96 },
- [SMCA_PB] = { "param_block", 0x5 },
- [SMCA_PSP] = { "psp", 0xFF },
- [SMCA_SMU] = { "smu", 0x1 },
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
};
-EXPORT_SYMBOL_GPL(amd_hwids);
-
-const char * const amd_core_mcablock_names[] = {
- [SMCA_LS] = "load_store",
- [SMCA_IF] = "insn_fetch",
- [SMCA_L2_CACHE] = "l2_cache",
- [SMCA_DE] = "decode_unit",
- [RES] = "",
- [SMCA_EX] = "execution_unit",
- [SMCA_FP] = "floating_point",
- [SMCA_L3_CACHE] = "l3_cache",
+
+struct smca_bank_name smca_bank_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+EXPORT_SYMBOL_GPL(smca_bank_names);
+
+static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
-const char * const amd_df_mcablock_names[] = {
- [SMCA_CS] = "coherent_slave",
- [SMCA_PIE] = "pie",
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
+struct smca_bank_info smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
+static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
* CPU Initialization
*/
+static void get_smca_bank_info(unsigned int bank)
+{
+ unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ struct smca_hwid_mcatype *type;
+ u32 high, instanceId;
+ u16 hwid, mcatype;
+
+ /* Collect bank_info using CPU 0 for now. */
+ if (cpu)
+ return;
+
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mcatype = (high & MCI_IPID_MCATYPE) >> 16;
+ hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ type = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == type->hwid_mcatype) {
+ smca_banks[bank].type = type;
+ smca_banks[bank].type_instance = instanceId;
+ break;
+ }
+ }
+}
+
struct thresh_restart {
struct threshold_block *b;
int reset;
@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
*/
u32 low, high;
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
if (!(low & MCI_CONFIG_MCAX))
return addr;
- if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
*/
smca_high &= ~BIT(2);
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
+ smca_high |= BIT(5);
+
wrmsr(smca_addr, smca_low, smca_high);
}
@@ -421,12 +503,15 @@ out:
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ get_smca_bank_info(bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
if (threshold_err)
m.misc = misc;
- if (m.status & MCI_STATUS_ADDRV)
+ if (m.status & MCI_STATUS_ADDRV) {
rdmsrl(msr_addr, m.addr);
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
mce_log(&m);
wrmsrl(msr_status, 0);
@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void)
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
- int cpu = smp_processor_id();
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ unsigned int bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ if (!smca_banks[bank].type)
+ return NULL;
+
+ bank_type = smca_banks[bank].type->bank_type;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_bank_names[bank_type].name,
+ smca_banks[bank].type_instance);
+ return buf_mcatype;
+}
+
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
- (bank == 4 ? bank4_names(b) : th_names[bank]));
+ get_name(bank, b));
if (err)
goto out_free;
recurse:
- address = get_block_address(address, low, high, bank, ++block);
+ address = get_block_address(cpu, address, low, high, bank, ++block);
if (!address)
return 0;
@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = th_names[bank];
+ const char *name = get_name(bank, NULL);
int err = 0;
if (is_shared_bank(bank)) {
@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank));
+ err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
if (!err)
goto out;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 8581963894c7..620ab06bcf45 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -54,26 +54,27 @@ static LIST_HEAD(pcache);
*/
static u8 *container;
static size_t container_size;
+static bool ucode_builtin;
static u32 ucode_new_rev;
-u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
static struct cpio_data ucode_cpio;
-/*
- * Microcode patch container file is prepended to the initrd in cpio format.
- * See Documentation/x86/early-microcode.txt
- */
-static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
static struct cpio_data __init find_ucode_in_initrd(void)
{
- long offset = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
char *path;
void *start;
size_t size;
+ /*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+ static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+
#ifdef CONFIG_X86_32
struct boot_params *p;
@@ -89,9 +90,12 @@ static struct cpio_data __init find_ucode_in_initrd(void)
path = ucode_path;
start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
size = boot_params.hdr.ramdisk_size;
-#endif
+#endif /* !CONFIG_X86_32 */
- return find_cpio_data(path, start, size, &offset);
+ return find_cpio_data(path, start, size, NULL);
+#else
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
}
static size_t compute_container_size(u8 *data, u32 total_size)
@@ -278,22 +282,26 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
void __init load_ucode_amd_bsp(unsigned int family)
{
struct cpio_data cp;
+ bool *builtin;
void **data;
size_t *size;
#ifdef CONFIG_X86_32
data = (void **)__pa_nodebug(&ucode_cpio.data);
size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+ builtin = (bool *)__pa_nodebug(&ucode_builtin);
#else
data = &ucode_cpio.data;
size = &ucode_cpio.size;
+ builtin = &ucode_builtin;
#endif
- cp = find_ucode_in_initrd();
- if (!cp.data) {
- if (!load_builtin_amd_microcode(&cp, family))
- return;
- }
+ *builtin = load_builtin_amd_microcode(&cp, family);
+ if (!*builtin)
+ cp = find_ucode_in_initrd();
+
+ if (!(cp.data && cp.size))
+ return;
*data = cp.data;
*size = cp.size;
@@ -352,6 +360,7 @@ void load_ucode_amd_ap(void)
unsigned int cpu = smp_processor_id();
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
+ u8 *cont = container;
u32 rev, eax;
u16 eq_id;
@@ -368,8 +377,12 @@ void load_ucode_amd_ap(void)
if (check_current_patch_level(&rev, false))
return;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ if (!ucode_builtin)
+ cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+ eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
@@ -431,6 +444,10 @@ int __init save_microcode_in_initrd_amd(void)
else
container = cont_va;
+ /* Add CONFIG_RANDOMIZE_MEMORY offset. */
+ if (!ucode_builtin)
+ container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ac360bfbbdb6..5ce5155f0695 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -60,7 +60,6 @@ static bool dis_ucode_ldr;
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-EXPORT_SYMBOL_GPL(ucode_cpu_info);
/*
* Operations that are run on a target cpu:
@@ -175,24 +174,24 @@ void load_ucode_ap(void)
}
}
-int __init save_microcode_in_initrd(void)
+static int __init save_microcode_in_initrd(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
if (c->x86 >= 6)
- save_microcode_in_initrd_intel();
+ return save_microcode_in_initrd_intel();
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
- save_microcode_in_initrd_amd();
+ return save_microcode_in_initrd_amd();
break;
default:
break;
}
- return 0;
+ return -EINVAL;
}
void reload_early_microcode(void)
@@ -559,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
-static int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+static int mc_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
+ microcode_update_cpu(cpu);
+ pr_debug("CPU%d added\n", cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- /*
- * "break" is missing on purpose here because we want to fall
- * through in order to create the sysfs group.
- */
-
- case CPU_DOWN_FAILED:
- if (sysfs_create_group(&dev->kobj, &mc_attr_group))
- pr_err("Failed to create group for CPU%d\n", cpu);
- break;
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
- case CPU_DOWN_PREPARE:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
- break;
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev;
+ dev = get_cpu_device(cpu);
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ pr_debug("CPU%d removed\n", cpu);
/*
- * case CPU_DEAD:
- *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- }
-
- /* The CPU refused to come up during a system resume */
- if (action == CPU_UP_CANCELED_FROZEN)
- microcode_fini_cpu(cpu);
-
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
@@ -666,7 +646,8 @@ int __init microcode_init(void)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
- register_hotcpu_notifier(&mc_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
@@ -691,4 +672,5 @@ int __init microcode_init(void)
return error;
}
+fs_initcall(save_microcode_in_initrd);
late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 65cbbcd48fe4..cdc0deab00c9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -40,9 +40,13 @@
#include <asm/msr.h>
/*
- * Temporary microcode blobs pointers storage. We note here the pointers to
- * microcode blobs we've got from whatever storage (detached initrd, builtin).
- * Later on, we put those into final storage mc_saved_data.mc_saved.
+ * Temporary microcode blobs pointers storage. We note here during early load
+ * the pointers to microcode blobs we've got from whatever storage (detached
+ * initrd, builtin). Later on, we put those into final storage
+ * mc_saved_data.mc_saved.
+ *
+ * Important: those are offsets from the beginning of initrd or absolute
+ * addresses within the kernel image when built-in.
*/
static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
@@ -51,8 +55,15 @@ static struct mc_saved_data {
struct microcode_intel **mc_saved;
} mc_saved_data;
+/* Microcode blobs within the initrd. 0 if builtin. */
+static struct ucode_blobs {
+ unsigned long start;
+ bool valid;
+} blobs;
+
+/* Go through saved patches and find the one suitable for the current CPU. */
static enum ucode_state
-load_microcode_early(struct microcode_intel **saved,
+find_microcode_patch(struct microcode_intel **saved,
unsigned int num_saved, struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
@@ -121,13 +132,13 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
if (!mcs->mc_saved) {
copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
- return load_microcode_early(mc_saved_tmp, count, uci);
+ return find_microcode_patch(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mcs);
- return load_microcode_early(mc_saved_tmp, count, uci);
+ return find_microcode_patch(mc_saved_tmp, count, uci);
#else
- return load_microcode_early(mcs->mc_saved, count, uci);
+ return find_microcode_patch(mcs->mc_saved, count, uci);
#endif
}
}
@@ -450,8 +461,6 @@ static void show_saved_mc(void)
#endif
}
-#ifdef CONFIG_HOTPLUG_CPU
-static DEFINE_MUTEX(x86_cpu_microcode_mutex);
/*
* Save this mc into mc_saved_data. So it will be loaded early when a CPU is
* hot added or resumes.
@@ -459,19 +468,18 @@ static DEFINE_MUTEX(x86_cpu_microcode_mutex);
* Please make sure this mc should be a valid microcode patch before calling
* this function.
*/
-int save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc)
{
+#ifdef CONFIG_HOTPLUG_CPU
+ /* Synchronization during CPU hotplug. */
+ static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int mc_saved_count_init;
unsigned int num_saved;
struct microcode_intel **mc_saved;
- int ret = 0;
- int i;
+ int ret, i;
- /*
- * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
- * hotplug.
- */
mutex_lock(&x86_cpu_microcode_mutex);
mc_saved_count_init = mc_saved_data.num_saved;
@@ -509,11 +517,8 @@ int save_mc_for_early(u8 *mc)
out:
mutex_unlock(&x86_cpu_microcode_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(save_mc_for_early);
#endif
+}
static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
{
@@ -532,37 +537,6 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
#endif
}
-static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- unsigned long start, unsigned long size,
- struct ucode_cpu_info *uci)
-{
- struct cpio_data cd;
- long offset = 0;
-#ifdef CONFIG_X86_32
- char *p = (char *)__pa_nodebug(ucode_name);
-#else
- char *p = ucode_name;
-#endif
-
- cd.data = NULL;
- cd.size = 0;
-
- /* try built-in microcode if no initrd */
- if (!size) {
- if (!load_builtin_intel_microcode(&cd))
- return UCODE_ERROR;
- } else {
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data)
- return UCODE_ERROR;
- }
-
- return get_matching_model_microcode(start, cd.data, cd.size,
- mcs, mc_ptrs, uci);
-}
-
/*
* Print ucode update info.
*/
@@ -680,38 +654,117 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
*/
int __init save_microcode_in_initrd_intel(void)
{
- unsigned int count = mc_saved_data.num_saved;
struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- int ret = 0;
+ unsigned int count = mc_saved_data.num_saved;
+ unsigned long offset = 0;
+ int ret;
if (!count)
- return ret;
+ return 0;
+
+ /*
+ * We have found a valid initrd but it might've been relocated in the
+ * meantime so get its updated address.
+ */
+ if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
+ offset = initrd_start;
- copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count);
+ copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
-
- show_saved_mc();
+ else
+ show_saved_mc();
return ret;
}
+static __init enum ucode_state
+__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+ char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
+ : ucode_name;
+# ifdef CONFIG_X86_32
+ unsigned long start = 0, size;
+ struct boot_params *params;
+
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ size = params->hdr.ramdisk_size;
+
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ start = (size ? params->hdr.ramdisk_image : 0);
+
+# else /* CONFIG_X86_64 */
+ unsigned long start = 0, size;
+
+ size = (u64)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (u64)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+
+ start += PAGE_OFFSET;
+ }
+# endif
+
+ *cd = find_cpio_data(p, (void *)start, size, NULL);
+ if (cd->data) {
+ blbp->start = start;
+ blbp->valid = true;
+
+ return UCODE_OK;
+ } else
+#endif /* CONFIG_BLK_DEV_INITRD */
+ return UCODE_ERROR;
+}
+
+static __init enum ucode_state
+scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
+ struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
+{
+ struct cpio_data cd = { NULL, 0, "" };
+ enum ucode_state ret;
+
+ /* try built-in microcode first */
+ if (load_builtin_intel_microcode(&cd))
+ /*
+ * Invalidate blobs as we might've gotten an initrd too,
+ * supplied by the boot loader, by mistake or simply forgotten
+ * there. That's fine, we ignore it since we've found builtin
+ * microcode already.
+ */
+ blbp->valid = false;
+ else {
+ ret = __scan_microcode_initrd(&cd, blbp);
+ if (ret != UCODE_OK)
+ return ret;
+ }
+
+ return get_matching_model_microcode(blbp->start, cd.data, cd.size,
+ mcs, mc_ptrs, uci);
+}
+
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- unsigned long start, unsigned long size)
+ struct ucode_blobs *blbp)
{
struct ucode_cpu_info uci;
enum ucode_state ret;
collect_cpu_info_early(&uci);
- ret = scan_microcode(mcs, mc_ptrs, start, size, &uci);
+ ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
if (ret != UCODE_OK)
return;
- ret = load_microcode(mcs, mc_ptrs, start, &uci);
+ ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
if (ret != UCODE_OK)
return;
@@ -720,54 +773,60 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
void __init load_ucode_intel_bsp(void)
{
- u64 start, size;
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- size = p->hdr.ramdisk_size;
+ struct ucode_blobs *blobs_p;
+ struct mc_saved_data *mcs;
+ unsigned long *ptrs;
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
- start = (size ? p->hdr.ramdisk_image : 0);
-
- _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_tmp_ptrs),
- start, size);
+#ifdef CONFIG_X86_32
+ mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+ ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
+ blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
#else
- size = boot_params.hdr.ramdisk_size;
- start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0);
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size);
+ mcs = &mc_saved_data;
+ ptrs = mc_tmp_ptrs;
+ blobs_p = &blobs;
#endif
+
+ _load_ucode_intel_bsp(mcs, ptrs, blobs_p);
}
void load_ucode_intel_ap(void)
{
- unsigned long *mcs_tmp_p;
- struct mc_saved_data *mcs_p;
+ struct ucode_blobs *blobs_p;
+ unsigned long *ptrs, start = 0;
+ struct mc_saved_data *mcs;
struct ucode_cpu_info uci;
enum ucode_state ret;
-#ifdef CONFIG_X86_32
- mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
- mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+#ifdef CONFIG_X86_32
+ mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+ ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
+ blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
#else
- mcs_tmp_p = mc_tmp_ptrs;
- mcs_p = &mc_saved_data;
+ mcs = &mc_saved_data;
+ ptrs = mc_tmp_ptrs;
+ blobs_p = &blobs;
#endif
/*
* If there is no valid ucode previously saved in memory, no need to
* update ucode on this AP.
*/
- if (!mcs_p->num_saved)
+ if (!mcs->num_saved)
return;
+ if (blobs_p->valid) {
+ start = blobs_p->start;
+
+ /*
+ * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
+ * physmem mapping too and there we have the initrd.
+ */
+ start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+ }
+
collect_cpu_info_early(&uci);
- ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci);
+ ret = load_microcode(mcs, ptrs, start, &uci);
if (ret != UCODE_OK)
return;
@@ -784,7 +843,7 @@ void reload_ucode_intel(void)
collect_cpu_info_early(&uci);
- ret = load_microcode_early(mc_saved_data.mc_saved,
+ ret = find_microcode_patch(mc_saved_data.mc_saved,
mc_saved_data.num_saved, &uci);
if (ret != UCODE_OK)
return;
@@ -794,6 +853,7 @@ void reload_ucode_intel(void)
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
+ static struct cpu_signature prev;
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
unsigned int val[2];
@@ -808,8 +868,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
}
csig->rev = c->microcode;
- pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
- cpu_num, csig->sig, csig->pf, csig->rev);
+
+ /* No extra locking on prev, races are harmless. */
+ if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
+ pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
+ csig->sig, csig->pf, csig->rev);
+ prev = *csig;
+ }
return 0;
}
@@ -838,6 +903,7 @@ static int apply_microcode_intel(int cpu)
struct ucode_cpu_info *uci;
struct cpuinfo_x86 *c;
unsigned int val[2];
+ static int prev_rev;
/* We should bind the task to the CPU */
if (WARN_ON(raw_smp_processor_id() != cpu))
@@ -872,11 +938,14 @@ static int apply_microcode_intel(int cpu)
return -1;
}
- pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu, val[1],
- mc->hdr.date & 0xffff,
- mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
+ if (val[1] != prev_rev) {
+ pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
+ val[1],
+ mc->hdr.date & 0xffff,
+ mc->hdr.date >> 24,
+ (mc->hdr.date >> 16) & 0xff);
+ prev_rev = val[1];
+ }
c = &cpu_data(cpu);
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index 2ce1a7dc45b7..406cb6c0d9dd 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -141,7 +141,6 @@ int microcode_sanity_check(void *mc, int print_err)
}
return 0;
}
-EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
* Returns 1 if update has been found, 0 otherwise.
@@ -183,4 +182,3 @@ int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
return find_matching_signature(mc, csig, cpf);
}
-EXPORT_SYMBOL_GPL(has_newer_microcode);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 10c11b4da31d..8f44c5a50ab8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -13,7 +13,8 @@
#include <linux/types.h>
#include <linux/time.h>
#include <linux/clocksource.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/efi.h>
#include <linux/interrupt.h>
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 31e951ce6dff..3b442b64c72d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -17,7 +17,6 @@
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 16e37a2581ac..fdc55215d44d 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -4,7 +4,7 @@
*/
#define DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index d76f13d6d8d6..6d9b45549109 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -2,7 +2,6 @@
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
-#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7d393ecdeee6..24e87e74990d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -38,7 +38,7 @@
#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/sort.h>
@@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
-void set_mtrr_ops(const struct mtrr_ops *ops)
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 6c7ced07d16d..ad8bd763efa5 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index,
bool get_mtrr_state(void);
void mtrr_bp_pat_init(void);
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 2e8caf03f593..181eabecae25 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -12,7 +12,7 @@
*/
#include <linux/percpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index f6f50c4ceaec..cfa97ff67bda 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup);
*/
#define SANITY_CHECK_LOOPS 8
+#ifdef CONFIG_ARCH_RANDOM
void x86_init_rdrand(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
int i;
@@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
return;
}
}
-#endif
}
+#endif
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 8cac429b6a1d..81160578b91a 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,10 +22,12 @@
*/
#include <linux/dmi.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
+#include <asm/apic.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -81,10 +83,17 @@ static void __init vmware_platform_setup(void)
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_frequency = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
}
/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9ef978d69c22..9616cf76940c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -20,7 +20,7 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2bb25c3fe2e8..9b7cf5c28f5f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,7 +17,7 @@
#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-
+#include <asm/unwind.h>
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
@@ -25,11 +25,29 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
+{
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
+
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
- void *data)
+ char *log_lvl)
{
+ touch_nmi_watchdog();
printk("%s [<%p>] %s%pB\n",
- (char *)data, (void *)address, reliable ? "" : "? ",
+ log_lvl, (void *)address, reliable ? "" : "? ",
(void *)address);
}
@@ -38,165 +56,120 @@ void printk_address(unsigned long address)
pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl)
{
- struct task_struct *task;
- unsigned long ret_addr;
- int index;
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
- if (addr != (unsigned long)return_to_handler)
- return;
-
- task = tinfo->task;
- index = task->curr_ret_stack;
-
- if (!task->ret_stack || index < *graph)
- return;
-
- index -= *graph;
- ret_addr = task->ret_stack[index].ret;
+ printk("%sCall Trace:\n", log_lvl);
- ops->address(data, ret_addr, 1);
+ unwind_start(&state, task, regs, stack);
- (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
-{ }
-#endif
+ /*
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ *
+ * x86-32 can have up to three stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
+ */
+ for (; stack; stack = stack_info.next_sp) {
+ const char *str_begin, *str_end;
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
+ /*
+ * If we overflowed the task stack into a guard page, jump back
+ * to the bottom of the usable stack.
+ */
+ if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
+ stack = task_stack_page(task);
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
-}
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
-unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, 0);
- }
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ stack_type_str(stack_info.type, &str_begin, &str_end);
+ if (str_begin)
+ printk("%s <%s> ", log_lvl, str_begin);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = *stack;
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
}
- stack++;
- }
- return bp;
-}
-EXPORT_SYMBOL_GPL(print_context_stack);
-
-unsigned long
-print_context_stack_bp(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long *ret_addr = &frame->return_address;
-
- while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
- unsigned long addr = *ret_addr;
- if (!__kernel_text_address(addr))
- break;
-
- if (ops->address(data, addr, 1))
- break;
- frame = frame->next_frame;
- ret_addr = &frame->return_address;
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+ if (str_end)
+ printk("%s <%s> ", log_lvl, str_end);
}
-
- return (unsigned long)frame;
-}
-EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static int print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk_stack_address(addr, reliable, data);
- return 0;
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .stack = print_trace_stack,
- .address = print_trace_address,
- .walk_stack = print_context_stack,
-};
-
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- unsigned long bp = 0;
- unsigned long stack;
+ task = task ? : current;
/*
* Stack frames below this one aren't interesting. Don't show them
* if we're printing for %current.
*/
- if (!sp && (!task || task == current)) {
- sp = &stack;
- bp = stack_frame(current, NULL);
- }
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, bp, "");
+ show_stack_log_lvl(task, NULL, sp, "");
+}
+
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_stack_log_lvl(current, regs, NULL, "");
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -228,6 +201,8 @@ unsigned long oops_begin(void)
EXPORT_SYMBOL_GPL(oops_begin);
NOKPROBE_SYMBOL(oops_begin);
+void __noreturn rewind_stack_do_exit(int signr);
+
void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
@@ -249,7 +224,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
- do_exit(signr);
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ */
+ rewind_stack_do_exit(signr);
}
NOKPROBE_SYMBOL(oops_end);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 464ffd69b92e..06eb322b5f9f 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -7,7 +7,7 @@
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
@@ -16,93 +16,121 @@
#include <asm/stacktrace.h>
-static void *is_irq_stack(void *p, void *irq)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- if (p < irq || p >= (irq + THREAD_SIZE))
- return NULL;
- return irq + THREAD_SIZE;
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ case STACK_TYPE_SOFTIRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
+ }
}
-
-static void *is_hardirq_stack(unsigned long *stack, int cpu)
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- void *irq = per_cpu(hardirq_stack, cpu);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- return is_irq_stack(stack, irq);
-}
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
-static void *is_softirq_stack(unsigned long *stack, int cpu)
-{
- void *irq = per_cpu(softirq_stack, cpu);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- return is_irq_stack(stack, irq);
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- int graph = 0;
- u32 *prev_esp;
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- if (!task)
- task = current;
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- if (!stack) {
- unsigned long dummy;
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
- stack = &dummy;
- if (task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
- if (!bp)
- bp = stack_frame(task, regs);
+ return true;
+}
- for (;;) {
- struct thread_info *context;
- void *end_stack;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- end_stack = is_hardirq_stack(stack, cpu);
- if (!end_stack)
- end_stack = is_softirq_stack(stack, cpu);
+ task = task ? : current;
- context = task_thread_info(task);
- bp = ops->walk_stack(context, stack, bp, ops, data,
- end_stack, &graph);
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- /* Stop if not on irq stack */
- if (!end_stack)
- break;
+ if (task != current)
+ goto unknown;
- /* The previous esp is saved on the bottom of the stack */
- prev_esp = (u32 *)(end_stack - THREAD_SIZE);
- stack = (unsigned long *)*prev_esp;
- if (!stack)
- break;
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- touch_nmi_watchdog();
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
+ /*
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
+ */
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
}
- put_cpu();
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ if (!try_get_task_stack(task))
+ return;
+
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
@@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
touch_nmi_watchdog();
}
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
@@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
pr_emerg("Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+ show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 5f1c6266eb30..36cf1a498227 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -7,7 +7,7 @@
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
@@ -16,264 +16,150 @@
#include <asm/stacktrace.h>
+static char *exception_stack_names[N_EXCEPTION_STACKS] = {
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ MCE_STACK-1 ] = "#MC",
+};
-#define N_EXCEPTION_STACKS_END \
- (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
-
-static char x86_stack_ids[][8] = {
- [ DEBUG_STACK-1 ] = "#DB",
- [ NMI_STACK-1 ] = "NMI",
- [ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ MCE_STACK-1 ] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [ N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS_END ] = "#DB[?]"
-#endif
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- unsigned k;
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = x86_stack_ids[k];
- return (unsigned long *)end;
- }
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- x86_stack_ids[j][4] = '1' +
- (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = x86_stack_ids[j];
- return (unsigned long *)end;
- }
-#endif
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
+ *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
+ *end = "EOE";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
}
- return NULL;
}
-static inline int
-in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
- unsigned long *irq_stack_end)
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
- return (stack >= irq_stack && stack < irq_stack_end);
-}
-
-static const unsigned long irq_stack_size =
- (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
-
-enum stack_type {
- STACK_IS_UNKNOWN,
- STACK_IS_NORMAL,
- STACK_IS_EXCEPTION,
- STACK_IS_IRQ,
-};
-
-static enum stack_type
-analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
- unsigned long **stack_end, unsigned long *irq_stack,
- unsigned *used, char **id)
-{
- unsigned long addr;
+ unsigned long *begin, *end;
+ struct pt_regs *regs;
+ unsigned k;
- addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- if ((unsigned long)task_stack_page(task) == addr)
- return STACK_IS_NORMAL;
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- *stack_end = in_exception_stack(cpu, (unsigned long)stack,
- used, id);
- if (*stack_end)
- return STACK_IS_EXCEPTION;
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
+ begin = end - (exception_stack_sizes[k] / sizeof(long));
+ regs = (struct pt_regs *)end - 1;
- if (!irq_stack)
- return STACK_IS_NORMAL;
+ if (stack < begin || stack >= end)
+ continue;
- *stack_end = irq_stack;
- irq_stack = irq_stack - irq_stack_size;
+ info->type = STACK_TYPE_EXCEPTION + k;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)regs->sp;
- if (in_irq_stack(stack, irq_stack, *stack_end))
- return STACK_IS_IRQ;
+ return true;
+ }
- return STACK_IS_UNKNOWN;
+ return false;
}
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- struct thread_info *tinfo;
- unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned long dummy;
- unsigned used = 0;
- int graph = 0;
- int done = 0;
-
- if (!task)
- task = current;
-
- if (!stack) {
- if (regs)
- stack = (unsigned long *)regs->sp;
- else if (task != current)
- stack = (unsigned long *)task->thread.sp;
- else
- stack = &dummy;
- }
+ unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
- if (!bp)
- bp = stack_frame(task, regs);
/*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
*/
- tinfo = task_thread_info(task);
- while (!done) {
- unsigned long *stack_end;
- enum stack_type stype;
- char *id;
+ if (stack < begin || stack > end)
+ return false;
- stype = analyze_stack(cpu, task, stack, &stack_end,
- irq_stack, &used, &id);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- /* Default finish unless specified to continue */
- done = 1;
+ /*
+ * The next stack pointer is the first thing pushed by the entry code
+ * after switching to the irq stack.
+ */
+ info->next_sp = (unsigned long *)*(end - 1);
- switch (stype) {
+ return true;
+}
- /* Break out early if we are on the thread stack */
- case STACK_IS_NORMAL:
- break;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- case STACK_IS_EXCEPTION:
+ task = task ? : current;
- if (ops->stack(data, id) < 0)
- break;
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- bp = ops->walk_stack(tinfo, stack, bp, ops,
- data, stack_end, &graph);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) stack_end[-2];
- done = 0;
- break;
+ if (task != current)
+ goto unknown;
- case STACK_IS_IRQ:
+ if (in_exception_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(tinfo, stack, bp,
- ops, data, stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (stack_end[-1]);
- irq_stack = NULL;
- ops->stack(data, "EOI");
- done = 0;
- break;
+ if (in_irq_stack(stack, info))
+ goto recursion_check;
- case STACK_IS_UNKNOWN:
- ops->stack(data, "UNK");
- break;
- }
- }
+ goto unknown;
+recursion_check:
/*
- * This handles the process stack:
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
- put_cpu();
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
+ }
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
unsigned long *stack;
- int cpu;
int i;
- preempt_disable();
- cpu = smp_processor_id();
+ if (!try_get_task_stack(task))
+ return;
- irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+ irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
- /*
- * Debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu:
- */
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
+ unsigned long word;
+
if (stack >= irq_stack && stack <= irq_stack_end) {
if (stack == irq_stack_end) {
stack = (unsigned long *) (irq_stack_end[-1]);
@@ -283,26 +169,31 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (kstack_end(stack))
break;
}
+
+ if (probe_kernel_address(stack, word))
+ break;
+
if ((i % STACKSLOTS_PER_LINE) == 0) {
if (i != 0)
pr_cont("\n");
- printk("%s %016lx", log_lvl, *stack++);
+ printk("%s %016lx", log_lvl, word);
} else
- pr_cont(" %016lx", *stack++);
+ pr_cont(" %016lx", word);
+
+ stack++;
touch_nmi_watchdog();
}
- preempt_enable();
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
void show_regs(struct pt_regs *regs)
{
int i;
- unsigned long sp;
- sp = regs->sp;
show_regs_print_info(KERN_DEFAULT);
__show_regs(regs, 1);
@@ -317,8 +208,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- 0, KERN_DEFAULT);
+ show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501f8935..b85fe5f91c3f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,10 @@
* user can e.g. boot the original kernel with mem=1G while still booting the
* next kernel with full memory.
*/
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820 __initdata;
+static struct e820map initial_e820_saved __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
void __init e820_add_region(u64 start, u64 size, int type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820_add_region(e820, start, size, type);
}
static void __init e820_print_type(u32 type)
@@ -164,12 +166,12 @@ void __init e820_print_map(char *who)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
- (unsigned long long) e820.map[i].addr,
+ (unsigned long long) e820->map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size - 1));
- e820_print_type(e820.map[i].type);
+ (e820->map[i].addr + e820->map[i].size - 1));
+ e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
}
@@ -348,7 +350,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
* continue building up new bios map based on this
* information
*/
- if (current_type != last_type || current_type == E820_PRAM) {
+ if (current_type != last_type) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
@@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
while (nr_map) {
u64 start = biosmap->addr;
u64 size = biosmap->size;
- u64 end = start + size;
+ u64 end = start + size - 1;
u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
+ if (start > end && likely(size))
return -1;
e820_add_region(start, size, type);
@@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return __e820_update_range(&e820, start, size, old_type, new_type);
+ return __e820_update_range(e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return __e820_update_range(&e820_saved, start, size, old_type,
+ return __e820_update_range(e820_saved, start, size, old_type,
new_type);
}
@@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
e820_print_type(old_type);
printk(KERN_CONT "\n");
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
u64 final_start, final_end;
u64 ei_end;
@@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map))
return;
printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
{
- sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
- &e820_saved.nr_map);
+ sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map),
+ &e820_saved->nr_map);
}
#define MAX_GAP_END 0x100000000ull
/*
@@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
unsigned long start_addr, unsigned long long end_addr)
{
unsigned long long last;
- int i = e820.nr_map;
+ int i = e820->nr_map;
int found = 0;
last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
+ unsigned long long start = e820->map[i].addr;
+ unsigned long long end = start + e820->map[i].size;
if (end < start_addr)
continue;
@@ -649,6 +651,33 @@ __init void e820_setup_gap(void)
gapstart, gapstart + gapsize - 1);
}
+/*
+ * Called late during init, in free_initmem().
+ *
+ * Initial e820 and e820_saved are largish __initdata arrays.
+ * Copy them to (usually much smaller) dynamically allocated area.
+ * This is done after all tweaks we ever do to them:
+ * all functions which modify them are __init functions,
+ * they won't exist after this point.
+ */
+__init void e820_reallocate_tables(void)
+{
+ struct e820map *n;
+ int size;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820, size);
+ e820 = n;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_saved, size);
+ e820_saved = n;
+}
+
/**
* Because of the size limitation of struct boot_params, only first
* 128 E820 memory entries are passed to kernel via
@@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
early_memunmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
@@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
int i;
unsigned long pfn = 0;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (pfn < PFN_UP(ei->addr))
register_nosave_region(pfn, PFN_UP(ei->addr));
@@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (ei->type == E820_NVS)
acpi_nvs_register(ei->addr, ei->size);
@@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
int i;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
unsigned long start_pfn;
unsigned long end_pfn;
- /*
- * Persistent memory is accounted as ram for purposes of
- * establishing max_pfn and mem_map.
- */
- if (ei->type != E820_RAM && ei->type != E820_PRAM)
+ if (ei->type != type)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN);
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL << (32-PAGE_SHIFT));
+ return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM);
}
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
{
early_printk(msg);
panic(msg);
@@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p)
*/
saved_max_pfn = e820_end_of_ram_pfn();
#endif
- e820.nr_map = 0;
+ e820->nr_map = 0;
userdef = 1;
return 0;
}
@@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
- &e820.nr_map) < 0)
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map),
+ &e820->nr_map) < 0)
early_panic("Invalid user supplied memory map");
printk(KERN_INFO "e820: user-defined physical RAM map:\n");
@@ -912,7 +937,7 @@ void __init finish_e820_parsing(void)
}
}
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type)
}
}
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
}
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
{
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
}
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
{
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void)
struct resource *res;
u64 end;
- res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
+ res = alloc_bootmem(sizeof(struct resource) * e820->nr_map);
e820_res = res;
- for (i = 0; i < e820.nr_map; i++) {
- end = e820.map[i].addr + e820.map[i].size - 1;
+ for (i = 0; i < e820->nr_map; i++) {
+ end = e820->map[i].addr + e820->map[i].size - 1;
if (end != (resource_size_t)end) {
res++;
continue;
}
- res->name = e820_type_to_string(e820.map[i].type);
- res->start = e820.map[i].addr;
+ res->name = e820_type_to_string(e820->map[i].type);
+ res->start = e820->map[i].addr;
res->end = end;
- res->flags = e820_type_to_iomem_type(e820.map[i].type);
- res->desc = e820_type_to_iores_desc(e820.map[i].type);
+ res->flags = e820_type_to_iomem_type(e820->map[i].type);
+ res->desc = e820_type_to_iores_desc(e820->map[i].type);
/*
* don't register the region that could be conflicted with
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (do_mark_busy(e820.map[i].type, res)) {
+ if (do_mark_busy(e820->map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- for (i = 0; i < e820_saved.nr_map; i++) {
- struct e820entry *entry = &e820_saved.map[i];
+ for (i = 0; i < e820_saved->nr_map; i++) {
+ struct e820entry *entry = &e820_saved->map[i];
firmware_map_add_early(entry->addr,
entry->addr + entry->size,
e820_type_to_string(entry->type));
@@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void)
}
/* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void)
struct resource *res;
res = e820_res;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
@@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void)
* Try to bump up RAM regions to reasonable boundaries to
* avoid stolen RAM:
*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *entry = &e820->map[i];
u64 start, end;
if (entry->type != E820_RAM)
@@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void)
who = "BIOS-e801";
}
- e820.nr_map = 0;
+ e820->nr_map = 0;
e820_add_region(0, LOWMEMSIZE(), E820_RAM);
e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
}
@@ -1124,7 +1149,7 @@ void __init setup_memory_map(void)
char *who;
who = x86_init.resources.memory_setup();
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
@@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void)
*/
memblock_allow_resize();
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
end = ei->addr + ei->size;
if (end != (resource_size_t)end)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index bca14c899137..18bb3a639197 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -11,7 +11,11 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
@@ -21,6 +25,9 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+#define dev_err(msg) pr_err("pci 0000:%02x:%02x.%d: %s", bus, slot, func, msg)
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -76,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -223,36 +237,19 @@ static void __init intel_remapping_check(int num, int slot, int func)
* despite the efforts of the "RAM buffer" approach, which simply rounds
* memory boundaries up to 64M to try to catch space that may decode
* as RAM and so is not suitable for MMIO.
- *
- * And yes, so far on current devices the base addr is always under 4G.
*/
-static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size)
-{
- u32 base;
-
- /*
- * For the PCI IDs in this quirk, the stolen base is always
- * in 0x5c, aka the BDSM register (yes that's really what
- * it's called).
- */
- base = read_pci_config(num, slot, func, 0x5c);
- base &= ~((1<<20) - 1);
-
- return base;
-}
#define KB(x) ((x) * 1024UL)
#define MB(x) (KB (KB (x)))
-#define GB(x) (MB (KB (x)))
static size_t __init i830_tseg_size(void)
{
- u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
- if (!(tmp & TSEG_ENABLE))
+ if (!(esmramc & TSEG_ENABLE))
return 0;
- if (tmp & I830_TSEG_SIZE_1M)
+ if (esmramc & I830_TSEG_SIZE_1M)
return MB(1);
else
return KB(512);
@@ -260,27 +257,26 @@ static size_t __init i830_tseg_size(void)
static size_t __init i845_tseg_size(void)
{
- u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+ u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK;
- if (!(tmp & TSEG_ENABLE))
+ if (!(esmramc & TSEG_ENABLE))
return 0;
- switch (tmp & I845_TSEG_SIZE_MASK) {
- case I845_TSEG_SIZE_512K:
- return KB(512);
- case I845_TSEG_SIZE_1M:
- return MB(1);
+ switch (tseg_size) {
+ case I845_TSEG_SIZE_512K: return KB(512);
+ case I845_TSEG_SIZE_1M: return MB(1);
default:
- WARN_ON(1);
- return 0;
+ WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc);
}
+ return 0;
}
static size_t __init i85x_tseg_size(void)
{
- u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
- if (!(tmp & TSEG_ENABLE))
+ if (!(esmramc & TSEG_ENABLE))
return 0;
return MB(1);
@@ -300,285 +296,287 @@ static size_t __init i85x_mem_size(void)
* On 830/845/85x the stolen memory base isn't available in any
* register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
*/
-static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size)
+static phys_addr_t __init i830_stolen_base(int num, int slot, int func,
+ size_t stolen_size)
{
- return i830_mem_size() - i830_tseg_size() - stolen_size;
+ return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size;
}
-static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size)
+static phys_addr_t __init i845_stolen_base(int num, int slot, int func,
+ size_t stolen_size)
{
- return i830_mem_size() - i845_tseg_size() - stolen_size;
+ return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size;
}
-static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size)
+static phys_addr_t __init i85x_stolen_base(int num, int slot, int func,
+ size_t stolen_size)
{
- return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+ return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size;
}
-static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size)
+static phys_addr_t __init i865_stolen_base(int num, int slot, int func,
+ size_t stolen_size)
{
+ u16 toud;
+
/*
* FIXME is the graphics stolen memory region
* always at TOUD? Ie. is it always the last
* one to be allocated by the BIOS?
*/
- return read_pci_config_16(0, 0, 0, I865_TOUD) << 16;
+ toud = read_pci_config_16(0, 0, 0, I865_TOUD);
+
+ return (phys_addr_t)toud << 16;
+}
+
+static phys_addr_t __init gen3_stolen_base(int num, int slot, int func,
+ size_t stolen_size)
+{
+ u32 bsm;
+
+ /* Almost universally we can find the Graphics Base of Stolen Memory
+ * at register BSM (0x5c) in the igfx configuration space. On a few
+ * (desktop) machines this is also mirrored in the bridge device at
+ * different locations, or in the MCHBAR.
+ */
+ bsm = read_pci_config(num, slot, func, INTEL_BSM);
+
+ return (phys_addr_t)bsm & INTEL_BSM_MASK;
}
static size_t __init i830_stolen_size(int num, int slot, int func)
{
- size_t stolen_size;
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
-
- switch (gmch_ctrl & I830_GMCH_GMS_MASK) {
- case I830_GMCH_GMS_STOLEN_512:
- stolen_size = KB(512);
- break;
- case I830_GMCH_GMS_STOLEN_1024:
- stolen_size = MB(1);
- break;
- case I830_GMCH_GMS_STOLEN_8192:
- stolen_size = MB(8);
- break;
- case I830_GMCH_GMS_LOCAL:
- /* local memory isn't part of the normal address space */
- stolen_size = 0;
- break;
+ gms = gmch_ctrl & I830_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I830_GMCH_GMS_STOLEN_512: return KB(512);
+ case I830_GMCH_GMS_STOLEN_1024: return MB(1);
+ case I830_GMCH_GMS_STOLEN_8192: return MB(8);
+ /* local memory isn't part of the normal address space */
+ case I830_GMCH_GMS_LOCAL: return 0;
default:
- return 0;
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
}
- return stolen_size;
+ return 0;
}
static size_t __init gen3_stolen_size(int num, int slot, int func)
{
- size_t stolen_size;
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
-
- switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
- case I855_GMCH_GMS_STOLEN_1M:
- stolen_size = MB(1);
- break;
- case I855_GMCH_GMS_STOLEN_4M:
- stolen_size = MB(4);
- break;
- case I855_GMCH_GMS_STOLEN_8M:
- stolen_size = MB(8);
- break;
- case I855_GMCH_GMS_STOLEN_16M:
- stolen_size = MB(16);
- break;
- case I855_GMCH_GMS_STOLEN_32M:
- stolen_size = MB(32);
- break;
- case I915_GMCH_GMS_STOLEN_48M:
- stolen_size = MB(48);
- break;
- case I915_GMCH_GMS_STOLEN_64M:
- stolen_size = MB(64);
- break;
- case G33_GMCH_GMS_STOLEN_128M:
- stolen_size = MB(128);
- break;
- case G33_GMCH_GMS_STOLEN_256M:
- stolen_size = MB(256);
- break;
- case INTEL_GMCH_GMS_STOLEN_96M:
- stolen_size = MB(96);
- break;
- case INTEL_GMCH_GMS_STOLEN_160M:
- stolen_size = MB(160);
- break;
- case INTEL_GMCH_GMS_STOLEN_224M:
- stolen_size = MB(224);
- break;
- case INTEL_GMCH_GMS_STOLEN_352M:
- stolen_size = MB(352);
- break;
+ gms = gmch_ctrl & I855_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I855_GMCH_GMS_STOLEN_1M: return MB(1);
+ case I855_GMCH_GMS_STOLEN_4M: return MB(4);
+ case I855_GMCH_GMS_STOLEN_8M: return MB(8);
+ case I855_GMCH_GMS_STOLEN_16M: return MB(16);
+ case I855_GMCH_GMS_STOLEN_32M: return MB(32);
+ case I915_GMCH_GMS_STOLEN_48M: return MB(48);
+ case I915_GMCH_GMS_STOLEN_64M: return MB(64);
+ case G33_GMCH_GMS_STOLEN_128M: return MB(128);
+ case G33_GMCH_GMS_STOLEN_256M: return MB(256);
+ case INTEL_GMCH_GMS_STOLEN_96M: return MB(96);
+ case INTEL_GMCH_GMS_STOLEN_160M:return MB(160);
+ case INTEL_GMCH_GMS_STOLEN_224M:return MB(224);
+ case INTEL_GMCH_GMS_STOLEN_352M:return MB(352);
default:
- stolen_size = 0;
- break;
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
}
- return stolen_size;
+ return 0;
}
static size_t __init gen6_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
- gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
- gmch_ctrl &= SNB_GMCH_GMS_MASK;
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
- return gmch_ctrl << 25; /* 32 MB units */
+ return (size_t)gms * MB(32);
}
static size_t __init gen8_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
- gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
- gmch_ctrl &= BDW_GMCH_GMS_MASK;
- return gmch_ctrl << 25; /* 32 MB units */
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ return (size_t)gms * MB(32);
}
static size_t __init chv_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
- gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
- gmch_ctrl &= SNB_GMCH_GMS_MASK;
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
/*
* 0x0 to 0x10: 32MB increments starting at 0MB
* 0x11 to 0x16: 4MB increments starting at 8MB
* 0x17 to 0x1d: 4MB increments start at 36MB
*/
- if (gmch_ctrl < 0x11)
- return gmch_ctrl << 25;
- else if (gmch_ctrl < 0x17)
- return (gmch_ctrl - 0x11 + 2) << 22;
+ if (gms < 0x11)
+ return (size_t)gms * MB(32);
+ else if (gms < 0x17)
+ return (size_t)(gms - 0x11 + 2) * MB(4);
else
- return (gmch_ctrl - 0x17 + 9) << 22;
+ return (size_t)(gms - 0x17 + 9) * MB(4);
}
-struct intel_stolen_funcs {
- size_t (*size)(int num, int slot, int func);
- u32 (*base)(int num, int slot, int func, size_t size);
-};
-
static size_t __init gen9_stolen_size(int num, int slot, int func)
{
u16 gmch_ctrl;
+ u16 gms;
gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
- gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
- gmch_ctrl &= BDW_GMCH_GMS_MASK;
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
- if (gmch_ctrl < 0xf0)
- return gmch_ctrl << 25; /* 32 MB units */
+ /* 0x0 to 0xef: 32MB increments starting at 0MB */
+ /* 0xf0 to 0xfe: 4MB increments starting at 4MB */
+ if (gms < 0xf0)
+ return (size_t)gms * MB(32);
else
- /* 4MB increments starting at 0xf0 for 4MB */
- return (gmch_ctrl - 0xf0 + 1) << 22;
+ return (size_t)(gms - 0xf0 + 1) * MB(4);
}
-typedef size_t (*stolen_size_fn)(int num, int slot, int func);
+struct intel_early_ops {
+ size_t (*stolen_size)(int num, int slot, int func);
+ phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size);
+};
-static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
- .base = i830_stolen_base,
- .size = i830_stolen_size,
+static const struct intel_early_ops i830_early_ops __initconst = {
+ .stolen_base = i830_stolen_base,
+ .stolen_size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i845_stolen_funcs __initconst = {
- .base = i845_stolen_base,
- .size = i830_stolen_size,
+static const struct intel_early_ops i845_early_ops __initconst = {
+ .stolen_base = i845_stolen_base,
+ .stolen_size = i830_stolen_size,
};
-static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = {
- .base = i85x_stolen_base,
- .size = gen3_stolen_size,
+static const struct intel_early_ops i85x_early_ops __initconst = {
+ .stolen_base = i85x_stolen_base,
+ .stolen_size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs i865_stolen_funcs __initconst = {
- .base = i865_stolen_base,
- .size = gen3_stolen_size,
+static const struct intel_early_ops i865_early_ops __initconst = {
+ .stolen_base = i865_stolen_base,
+ .stolen_size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = {
- .base = intel_stolen_base,
- .size = gen3_stolen_size,
+static const struct intel_early_ops gen3_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen3_stolen_size,
};
-static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = {
- .base = intel_stolen_base,
- .size = gen6_stolen_size,
+static const struct intel_early_ops gen6_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen6_stolen_size,
};
-static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
- .base = intel_stolen_base,
- .size = gen8_stolen_size,
+static const struct intel_early_ops gen8_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen8_stolen_size,
};
-static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = {
- .base = intel_stolen_base,
- .size = gen9_stolen_size,
+static const struct intel_early_ops gen9_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen9_stolen_size,
};
-static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
- .base = intel_stolen_base,
- .size = chv_stolen_size,
+static const struct intel_early_ops chv_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = chv_stolen_size,
};
-static const struct pci_device_id intel_stolen_ids[] __initconst = {
- INTEL_I830_IDS(&i830_stolen_funcs),
- INTEL_I845G_IDS(&i845_stolen_funcs),
- INTEL_I85X_IDS(&i85x_stolen_funcs),
- INTEL_I865G_IDS(&i865_stolen_funcs),
- INTEL_I915G_IDS(&gen3_stolen_funcs),
- INTEL_I915GM_IDS(&gen3_stolen_funcs),
- INTEL_I945G_IDS(&gen3_stolen_funcs),
- INTEL_I945GM_IDS(&gen3_stolen_funcs),
- INTEL_VLV_M_IDS(&gen6_stolen_funcs),
- INTEL_VLV_D_IDS(&gen6_stolen_funcs),
- INTEL_PINEVIEW_IDS(&gen3_stolen_funcs),
- INTEL_I965G_IDS(&gen3_stolen_funcs),
- INTEL_G33_IDS(&gen3_stolen_funcs),
- INTEL_I965GM_IDS(&gen3_stolen_funcs),
- INTEL_GM45_IDS(&gen3_stolen_funcs),
- INTEL_G45_IDS(&gen3_stolen_funcs),
- INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs),
- INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs),
- INTEL_SNB_D_IDS(&gen6_stolen_funcs),
- INTEL_SNB_M_IDS(&gen6_stolen_funcs),
- INTEL_IVB_M_IDS(&gen6_stolen_funcs),
- INTEL_IVB_D_IDS(&gen6_stolen_funcs),
- INTEL_HSW_D_IDS(&gen6_stolen_funcs),
- INTEL_HSW_M_IDS(&gen6_stolen_funcs),
- INTEL_BDW_M_IDS(&gen8_stolen_funcs),
- INTEL_BDW_D_IDS(&gen8_stolen_funcs),
- INTEL_CHV_IDS(&chv_stolen_funcs),
- INTEL_SKL_IDS(&gen9_stolen_funcs),
- INTEL_BXT_IDS(&gen9_stolen_funcs),
- INTEL_KBL_IDS(&gen9_stolen_funcs),
+static const struct pci_device_id intel_early_ids[] __initconst = {
+ INTEL_I830_IDS(&i830_early_ops),
+ INTEL_I845G_IDS(&i845_early_ops),
+ INTEL_I85X_IDS(&i85x_early_ops),
+ INTEL_I865G_IDS(&i865_early_ops),
+ INTEL_I915G_IDS(&gen3_early_ops),
+ INTEL_I915GM_IDS(&gen3_early_ops),
+ INTEL_I945G_IDS(&gen3_early_ops),
+ INTEL_I945GM_IDS(&gen3_early_ops),
+ INTEL_VLV_M_IDS(&gen6_early_ops),
+ INTEL_VLV_D_IDS(&gen6_early_ops),
+ INTEL_PINEVIEW_IDS(&gen3_early_ops),
+ INTEL_I965G_IDS(&gen3_early_ops),
+ INTEL_G33_IDS(&gen3_early_ops),
+ INTEL_I965GM_IDS(&gen3_early_ops),
+ INTEL_GM45_IDS(&gen3_early_ops),
+ INTEL_G45_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_D_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_M_IDS(&gen3_early_ops),
+ INTEL_SNB_D_IDS(&gen6_early_ops),
+ INTEL_SNB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_D_IDS(&gen6_early_ops),
+ INTEL_HSW_D_IDS(&gen6_early_ops),
+ INTEL_HSW_M_IDS(&gen6_early_ops),
+ INTEL_BDW_M_IDS(&gen8_early_ops),
+ INTEL_BDW_D_IDS(&gen8_early_ops),
+ INTEL_CHV_IDS(&chv_early_ops),
+ INTEL_SKL_IDS(&gen9_early_ops),
+ INTEL_BXT_IDS(&gen9_early_ops),
+ INTEL_KBL_IDS(&gen9_early_ops),
};
-static void __init intel_graphics_stolen(int num, int slot, int func)
+static void __init
+intel_graphics_stolen(int num, int slot, int func,
+ const struct intel_early_ops *early_ops)
{
+ phys_addr_t base, end;
size_t size;
+
+ size = early_ops->stolen_size(num, slot, func);
+ base = early_ops->stolen_base(num, slot, func, size);
+
+ if (!size || !base)
+ return;
+
+ end = base + size - 1;
+ printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n",
+ &base, &end);
+
+ /* Mark this space as reserved */
+ e820_add_region(base, size, E820_RESERVED);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+}
+
+static void __init intel_graphics_quirks(int num, int slot, int func)
+{
+ const struct intel_early_ops *early_ops;
+ u16 device;
int i;
- u32 start;
- u16 device, subvendor, subdevice;
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
- subvendor = read_pci_config_16(num, slot, func,
- PCI_SUBSYSTEM_VENDOR_ID);
- subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID);
-
- for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
- if (intel_stolen_ids[i].device == device) {
- const struct intel_stolen_funcs *stolen_funcs =
- (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data;
- size = stolen_funcs->size(num, slot, func);
- start = stolen_funcs->base(num, slot, func, size);
- if (size && start) {
- printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n",
- start, start + (u32)size - 1);
- /* Mark this space as reserved */
- e820_add_region(start, size, E820_RESERVED);
- sanitize_e820_map(e820.map,
- ARRAY_SIZE(e820.map),
- &e820.nr_map);
- }
- return;
- }
+
+ for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
+ kernel_ulong_t driver_data = intel_early_ids[i].driver_data;
+
+ if (intel_early_ids[i].device != device)
+ continue;
+
+ early_ops = (typeof(early_ops))driver_data;
+
+ intel_graphics_stolen(num, slot, func, early_ops);
+
+ return;
}
}
@@ -590,6 +588,61 @@ static void __init force_disable_hpet(int num, int slot, int func)
#endif
}
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc."))
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ dev_err("Cannot power up Apple AirPort card\n");
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ dev_err("Cannot iomap Apple AirPort card\n");
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
@@ -603,12 +656,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -627,7 +674,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_stolen },
+ QFLAG_APPLY_ONCE, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
@@ -638,9 +685,13 @@ static struct chipset early_qrk[] __initdata = {
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -649,7 +700,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -658,6 +709,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -685,25 +737,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
+
+ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
if (!(type & 0x80))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
index afe65dffee80..4312f8ae71b7 100644
--- a/arch/x86/kernel/ebda.c
+++ b/arch/x86/kernel/ebda.c
@@ -6,66 +6,92 @@
#include <asm/bios_ebda.h>
/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
* The BIOS places the EBDA/XBDA at the top of conventional
* memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
*
- * This functions is deliberately very conservative. Losing
- * memory in the bottom megabyte is rarely a problem, as long
- * as we have enough memory to install the trampoline. Using
- * memory that is in use by the BIOS or by some DMA device
- * the BIOS didn't shut down *is* a big problem.
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
*/
-#define BIOS_LOWMEM_KILOBYTES 0x413
-#define LOWMEM_CAP 0x9f000U /* Absolute maximum */
-#define INSANE_CUTOFF 0x20000U /* Less than this = insane */
+#define BIOS_RAM_SIZE_KB_PTR 0x413
-void __init reserve_ebda_region(void)
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
{
- unsigned int lowmem, ebda_addr;
+ unsigned int bios_start, ebda_start;
/*
- * To determine the position of the EBDA and the
- * end of conventional memory, we need to look at
- * the BIOS data area. In a paravirtual environment
- * that area is absent. We'll just have to assume
- * that the paravirt case can handle memory setup
- * correctly, without our help.
+ * NOTE: In a paravirtual environment the BIOS reserved
+ * area is absent. We'll just have to assume that the
+ * paravirt case can handle memory setup correctly,
+ * without our help.
*/
- if (!x86_platform.legacy.ebda_search)
+ if (!x86_platform.legacy.reserve_bios_regions)
return;
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
/*
- * Note: some old Dells seem to need 4k EBDA without
- * reporting so, so just consider the memory above 0x9f000
- * to be off limits (bugzilla 2990).
+ * BIOS RAM size is encoded in kilobytes, convert it
+ * to bytes to get a first guess at where the BIOS
+ * firmware area starts:
*/
+ bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+ bios_start <<= 10;
- /* If the EBDA address is below 128K, assume it is bogus */
- if (ebda_addr < INSANE_CUTOFF)
- ebda_addr = LOWMEM_CAP;
+ /*
+ * If bios_start is less than 128K, assume it is bogus
+ * and bump it up to 640K. Similarly, if bios_start is above 640K,
+ * don't trust it.
+ */
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
- /* If lowmem is less than 128K, assume it is bogus */
- if (lowmem < INSANE_CUTOFF)
- lowmem = LOWMEM_CAP;
+ /* Get the start address of the EBDA page: */
+ ebda_start = get_bios_ebda();
- /* Use the lower of the lowmem and EBDA markers as the cutoff */
- lowmem = min(lowmem, ebda_addr);
- lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */
+ /*
+ * If the EBDA start address is sane and is below the BIOS region,
+ * then also reserve everything from the EBDA start address up to
+ * the BIOS region.
+ */
+ if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
- /* reserve all memory between lowmem and the 1MB mark */
- memblock_reserve(lowmem, 0x100000 - lowmem);
+ /* Reserve all memory between bios_start and the 1MB mark: */
+ memblock_reserve(bios_start, 0x100000 - bios_start);
}
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416e2a7f..04f89caef9c4 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
# error "Need more than one PGD for the ESPFIX hack"
#endif
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 97027545a72d..47004010ad5d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -8,9 +8,14 @@
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
+#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <linux/hardirq.h>
+#include <linux/pkeys.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/fpu.h>
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
@@ -192,6 +197,7 @@ void fpu__save(struct fpu *fpu)
WARN_ON_FPU(fpu != &current->thread.fpu);
preempt_disable();
+ trace_x86_fpu_before_save(fpu);
if (fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(fpu)) {
if (use_eager_fpu())
@@ -200,6 +206,7 @@ void fpu__save(struct fpu *fpu)
fpregs_deactivate(fpu);
}
}
+ trace_x86_fpu_after_save(fpu);
preempt_enable();
}
EXPORT_SYMBOL_GPL(fpu__save);
@@ -222,7 +229,14 @@ void fpstate_init(union fpregs_state *state)
return;
}
- memset(state, 0, xstate_size);
+ memset(state, 0, fpu_kernel_xstate_size);
+
+ /*
+ * XRSTORS requires that this bit is set in xcomp_bv, or
+ * it will #GP. Make sure it is replaced after the memset().
+ */
+ if (static_cpu_has(X86_FEATURE_XSAVES))
+ state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT;
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
@@ -247,7 +261,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* leak into the child task:
*/
if (use_eager_fpu())
- memset(&dst_fpu->state.xsave, 0, xstate_size);
+ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* Save current FPU registers directly into the child
@@ -266,7 +280,8 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
*/
preempt_disable();
if (!copy_fpregs_to_fpstate(dst_fpu)) {
- memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
+ memcpy(&src_fpu->state, &dst_fpu->state,
+ fpu_kernel_xstate_size);
if (use_eager_fpu())
copy_kernel_to_fpregs(&src_fpu->state);
@@ -275,6 +290,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
}
preempt_enable();
+ trace_x86_fpu_copy_src(src_fpu);
+ trace_x86_fpu_copy_dst(dst_fpu);
+
return 0;
}
@@ -288,7 +306,9 @@ void fpu__activate_curr(struct fpu *fpu)
if (!fpu->fpstate_active) {
fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+ trace_x86_fpu_activate_state(fpu);
/* Safe to do for the current task: */
fpu->fpstate_active = 1;
}
@@ -314,7 +334,9 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
} else {
if (!fpu->fpstate_active) {
fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+ trace_x86_fpu_activate_state(fpu);
/* Safe to do for current and for stopped child tasks: */
fpu->fpstate_active = 1;
}
@@ -347,7 +369,9 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
fpu->last_cpu = -1;
} else {
fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+ trace_x86_fpu_activate_state(fpu);
/* Safe to do for stopped child tasks: */
fpu->fpstate_active = 1;
}
@@ -432,9 +456,11 @@ void fpu__restore(struct fpu *fpu)
/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
kernel_fpu_disable();
+ trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
copy_kernel_to_fpregs(&fpu->state);
fpu->counter++;
+ trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
EXPORT_SYMBOL_GPL(fpu__restore);
@@ -463,6 +489,8 @@ void fpu__drop(struct fpu *fpu)
fpu->fpstate_active = 0;
+ trace_x86_fpu_dropped(fpu);
+
preempt_enable();
}
@@ -478,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
copy_kernel_to_fregs(&init_fpstate.fsave);
+
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ copy_init_pkru_to_fpregs();
}
/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index aacfd7a82cec..2f2b8c7ccb85 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -145,8 +145,8 @@ static void __init fpu__init_system_generic(void)
* This is inherent to the XSAVE architecture which puts all state
* components into a single, continuous memory block:
*/
-unsigned int xstate_size;
-EXPORT_SYMBOL_GPL(xstate_size);
+unsigned int fpu_kernel_xstate_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
/* Get alignment of the TYPE. */
#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
@@ -178,7 +178,7 @@ static void __init fpu__init_task_struct_size(void)
* Add back the dynamically-calculated register state
* size.
*/
- task_size += xstate_size;
+ task_size += fpu_kernel_xstate_size;
/*
* We dynamically size 'struct fpu', so we require that
@@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void)
}
/*
- * Set up the xstate_size based on the legacy FPU context size.
+ * Set up the user and kernel xstate sizes based on the legacy FPU context size.
*
* We set this up first, and later it will be overwritten by
* fpu__init_system_xstate() if the CPU knows about xstates.
@@ -208,7 +208,7 @@ static void __init fpu__init_system_xstate_size_legacy(void)
on_boot_cpu = 0;
/*
- * Note that xstate_size might be overwriten later during
+ * Note that xstate sizes might be overwritten later during
* fpu__init_system_xstate().
*/
@@ -219,27 +219,17 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- xstate_size = sizeof(struct swregs_state);
+ fpu_kernel_xstate_size = sizeof(struct swregs_state);
} else {
if (boot_cpu_has(X86_FEATURE_FXSR))
- xstate_size = sizeof(struct fxregs_state);
+ fpu_kernel_xstate_size =
+ sizeof(struct fxregs_state);
else
- xstate_size = sizeof(struct fregs_state);
+ fpu_kernel_xstate_size =
+ sizeof(struct fregs_state);
}
- /*
- * Quirk: we don't yet handle the XSAVES* instructions
- * correctly, as we don't correctly convert between
- * standard and compacted format when interfacing
- * with user-space - so disable it for now.
- *
- * The difference is small: with recent CPUs the
- * compacted format is only marginally smaller than
- * the standard FPU state format.
- *
- * ( This is easy to backport while we are fixing
- * XSAVES* support. )
- */
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ fpu_user_xstate_size = fpu_kernel_xstate_size;
}
/*
@@ -327,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
- current_thread_info()->status = 0;
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 81422dfb152b..c114b132d121 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -4,6 +4,7 @@
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
@@ -85,21 +86,26 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -ENODEV;
- fpu__activate_fpstate_read(fpu);
-
xsave = &fpu->state.xsave;
- /*
- * Copy the 48bytes defined by the software first into the xstate
- * memory layout in the thread struct, so that we can copy the entire
- * xstateregs to the user using one user_regset_copyout().
- */
- memcpy(&xsave->i387.sw_reserved,
- xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
- /*
- * Copy the xstate memory layout.
- */
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ fpu__activate_fpstate_read(fpu);
+
+ if (using_compacted_format()) {
+ ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave);
+ } else {
+ fpstate_sanitize_xstate(fpu);
+ /*
+ * Copy the 48 bytes defined by the software into the xsave
+ * area in the thread struct, so that we can copy the whole
+ * area to user using one user_regset_copyout().
+ */
+ memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+
+ /*
+ * Copy the xstate memory layout.
+ */
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ }
return ret;
}
@@ -114,11 +120,27 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -ENODEV;
- fpu__activate_fpstate_write(fpu);
+ /*
+ * A whole standard-format XSAVE buffer is needed:
+ */
+ if ((pos != 0) || (count < fpu_user_xstate_size))
+ return -EFAULT;
xsave = &fpu->state.xsave;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ fpu__activate_fpstate_write(fpu);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ ret = copyin_to_xsaves(kbuf, ubuf, xsave);
+ else
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+
+ /*
+ * In case of failure, mark all states as init:
+ */
+ if (ret)
+ fpstate_init(&fpu->state);
+
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 31c6a60505e6..a184c210efba 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -8,8 +8,10 @@
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/sigframe.h>
+#include <asm/trace/fpu.h>
static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
@@ -31,7 +33,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf,
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
- fx_sw->xstate_size > xstate_size ||
+ fx_sw->xstate_size > fpu_user_xstate_size ||
fx_sw->xstate_size > fx_sw->extended_size)
return -1;
@@ -88,7 +90,8 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
if (!use_xsave())
return err;
- err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 *)(buf + fpu_user_xstate_size));
/*
* Read the xfeatures which we copied (directly from the cpu or
@@ -125,7 +128,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
else
err = copy_fregs_to_user((struct fregs_state __user *) buf);
- if (unlikely(err) && __clear_user(buf, xstate_size))
+ if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
err = -EFAULT;
return err;
}
@@ -156,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
struct task_struct *tsk = current;
int ia32_fxstate = (buf != buf_fx);
- ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
- config_enabled(CONFIG_IA32_EMULATION));
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
if (!access_ok(VERIFY_WRITE, buf, size))
return -EACCES;
@@ -167,7 +170,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
sizeof(struct user_i387_ia32_struct), NULL,
(struct _fpstate_32 __user *) buf) ? -1 : 1;
- if (fpregs_active()) {
+ if (fpregs_active() || using_compacted_format()) {
/* Save the live register state to the user directly. */
if (copy_fpregs_to_sigframe(buf_fx))
return -1;
@@ -175,8 +178,19 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
if (ia32_fxstate)
copy_fxregs_to_kernel(&tsk->thread.fpu);
} else {
+ /*
+ * It is a *bug* if kernel uses compacted-format for xsave
+ * area and we copy it out directly to a signal frame. It
+ * should have been handled above by saving the registers
+ * directly.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n");
+ return -1;
+ }
+
fpstate_sanitize_xstate(&tsk->thread.fpu);
- if (__copy_to_user(buf_fx, xsave, xstate_size))
+ if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
return -1;
}
@@ -250,12 +264,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
int ia32_fxstate = (buf != buf_fx);
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
- int state_size = xstate_size;
+ int state_size = fpu_kernel_xstate_size;
u64 xfeatures = 0;
int fx_only = 0;
- ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
- config_enabled(CONFIG_IA32_EMULATION));
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
if (!buf) {
fpu__clear(fpu);
@@ -282,6 +296,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
*/
state_size = sizeof(struct fxregs_state);
fx_only = 1;
+ trace_x86_fpu_xstate_check_failed(fpu);
} else {
state_size = fx_sw_user.xstate_size;
xfeatures = fx_sw_user.xfeatures;
@@ -308,9 +323,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
*/
fpu__drop(fpu);
- if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) ||
- __copy_from_user(&env, buf, sizeof(env))) {
+ if (using_compacted_format()) {
+ err = copyin_to_xsaves(NULL, buf_fx,
+ &fpu->state.xsave);
+ } else {
+ err = __copy_from_user(&fpu->state.xsave,
+ buf_fx, state_size);
+ }
+
+ if (err || __copy_from_user(&env, buf, sizeof(env))) {
fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
err = -1;
} else {
sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
@@ -341,7 +364,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
static inline int xstate_sigframe_size(void)
{
- return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
+ return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
+ fpu_user_xstate_size;
}
/*
@@ -385,15 +409,15 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
*/
void fpu__init_prepare_fx_sw_frame(void)
{
- int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
+ int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
fx_sw_reserved.xfeatures = xfeatures_mask;
- fx_sw_reserved.xstate_size = xstate_size;
+ fx_sw_reserved.xstate_size = fpu_user_xstate_size;
- if (config_enabled(CONFIG_IA32_EMULATION) ||
- config_enabled(CONFIG_X86_32)) {
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) {
int fsave_header_size = sizeof(struct fregs_state);
fx_sw_reserved_ia32 = fx_sw_reserved;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4ea2a59483c7..124aa5c593f8 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,12 +5,14 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
+#include <linux/mman.h>
#include <linux/pkeys.h>
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
@@ -44,6 +46,13 @@ static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] =
static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
/*
+ * The XSAVE area of kernel can be in standard or compacted format;
+ * it is always in standard format for user mode. This is the user
+ * mode standard format size used for signal and ptrace frames.
+ */
+unsigned int fpu_user_xstate_size;
+
+/*
* Clear all of the X86_FEATURE_* bits that are unavailable
* when the CPU has no XSAVE support.
*/
@@ -105,6 +114,27 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+static int xfeature_is_supervisor(int xfeature_nr)
+{
+ /*
+ * We currently do not support supervisor states, but if
+ * we did, we could find out like this.
+ *
+ * SDM says: If state component 'i' is a user state component,
+ * ECX[0] return 0; if state component i is a supervisor
+ * state component, ECX[0] returns 1.
+ */
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return !!(ecx & 1);
+}
+
+static int xfeature_is_user(int xfeature_nr)
+{
+ return !xfeature_is_supervisor(xfeature_nr);
+}
+
/*
* When executing XSAVEOPT (or other optimized XSAVE instructions), if
* a processor implementation detects that an FPU state component is still
@@ -171,7 +201,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
while (xfeatures) {
if (xfeatures & 0x1) {
- int offset = xstate_offsets[feature_bit];
+ int offset = xstate_comp_offsets[feature_bit];
int size = xstate_sizes[feature_bit];
memcpy((void *)fx + offset,
@@ -192,6 +222,15 @@ void fpu__init_cpu_xstate(void)
{
if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
return;
+ /*
+ * Make it clear that XSAVES supervisor states are not yet
+ * implemented should anyone expect it to work by changing
+ * bits in XFEATURE_MASK_* macros and XCR0.
+ */
+ WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
+ "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
+
+ xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
@@ -217,13 +256,29 @@ static void __init setup_xstate_features(void)
/* start at the beginnning of the "extended state" */
unsigned int last_good_offset = offsetof(struct xregs_state,
extended_state_area);
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_offsets[0] = 0;
+ xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
+ xstate_offsets[1] = xstate_sizes[0];
+ xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
if (!xfeature_enabled(i))
continue;
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
- xstate_offsets[i] = ebx;
+
+ /*
+ * If an xfeature is supervisor state, the offset
+ * in EBX is invalid. We leave it to -1.
+ */
+ if (xfeature_is_user(i))
+ xstate_offsets[i] = ebx;
+
xstate_sizes[i] = eax;
/*
* In our xstate size checks, we assume that the
@@ -233,8 +288,6 @@ static void __init setup_xstate_features(void)
WARN_ONCE(last_good_offset > xstate_offsets[i],
"x86/fpu: misordered xstate at %d\n", last_good_offset);
last_good_offset = xstate_offsets[i];
-
- printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax);
}
}
@@ -263,6 +316,33 @@ static void __init print_xstate_features(void)
}
/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do { \
+ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
+ WARN_ON(nr >= XFEATURE_MAX); \
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ /*
+ * The value returned by ECX[1] indicates the alignment
+ * of state component 'i' when the compacted format
+ * of the extended region of an XSAVE area is used:
+ */
+ return !!(ecx & 2);
+}
+
+/*
* This function sets up offsets and sizes of all extended states in
* xsave area. This supports both standard format and compacted format
* of the xsave aread.
@@ -299,10 +379,29 @@ static void __init setup_xstate_comp(void)
else
xstate_comp_sizes[i] = 0;
- if (i > FIRST_EXTENDED_XFEATURE)
+ if (i > FIRST_EXTENDED_XFEATURE) {
xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ xstate_comp_sizes[i-1];
+ if (xfeature_is_aligned(i))
+ xstate_comp_offsets[i] =
+ ALIGN(xstate_comp_offsets[i], 64);
+ }
+ }
+}
+
+/*
+ * Print out xstate component offsets and sizes
+ */
+static void __init print_xstate_offset_size(void)
+{
+ int i;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+ pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+ i, xstate_comp_offsets[i], i, xstate_sizes[i]);
}
}
@@ -322,13 +421,11 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
print_xstate_features();
- if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
- init_fpstate.xsave.header.xfeatures = xfeatures_mask;
- }
/*
- * Init all the features state with header_bv being 0x0
+ * Init all the features state with header.xfeatures being 0x0
*/
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
@@ -339,58 +436,19 @@ static void __init setup_init_fpu_buf(void)
copy_xregs_to_kernel_booting(&init_fpstate.xsave);
}
-static int xfeature_is_supervisor(int xfeature_nr)
-{
- /*
- * We currently do not support supervisor states, but if
- * we did, we could find out like this.
- *
- * SDM says: If state component i is a user state component,
- * ECX[0] return 0; if state component i is a supervisor
- * state component, ECX[0] returns 1.
- u32 eax, ebx, ecx, edx;
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx;
- return !!(ecx & 1);
- */
- return 0;
-}
-/*
-static int xfeature_is_user(int xfeature_nr)
-{
- return !xfeature_is_supervisor(xfeature_nr);
-}
-*/
-
-/*
- * This check is important because it is easy to get XSTATE_*
- * confused with XSTATE_BIT_*.
- */
-#define CHECK_XFEATURE(nr) do { \
- WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
- WARN_ON(nr >= XFEATURE_MAX); \
-} while (0)
-
-/*
- * We could cache this like xstate_size[], but we only use
- * it here, so it would be a waste of space.
- */
-static int xfeature_is_aligned(int xfeature_nr)
+static int xfeature_uncompacted_offset(int xfeature_nr)
{
u32 eax, ebx, ecx, edx;
- CHECK_XFEATURE(xfeature_nr);
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
/*
- * The value returned by ECX[1] indicates the alignment
- * of state component i when the compacted format
- * of the extended region of an XSAVE area is used
+ * Only XSAVES supports supervisor states and it uses compacted
+ * format. Checking a supervisor state's uncompacted offset is
+ * an error.
*/
- return !!(ecx & 2);
-}
-
-static int xfeature_uncompacted_offset(int xfeature_nr)
-{
- u32 eax, ebx, ecx, edx;
+ if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
+ WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
+ return -1;
+ }
CHECK_XFEATURE(xfeature_nr);
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
@@ -415,7 +473,7 @@ static int xfeature_size(int xfeature_nr)
* that it is obvious which aspect of 'XSAVES' is being handled
* by the calling code.
*/
-static int using_compacted_format(void)
+int using_compacted_format(void)
{
return boot_cpu_has(X86_FEATURE_XSAVES);
}
@@ -530,11 +588,12 @@ static void do_extra_xstate_size_checks(void)
*/
paranoid_xstate_size += xfeature_size(i);
}
- XSTATE_WARN_ON(paranoid_xstate_size != xstate_size);
+ XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
}
+
/*
- * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ * Get total size of enabled xstates in XCR0/xfeatures_mask.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
@@ -544,34 +603,33 @@ static void do_extra_xstate_size_checks(void)
* Note that we do not currently set any bits on IA32_XSS so
* 'XCR0 | IA32_XSS == XCR0' for now.
*/
-static unsigned int __init calculate_xstate_size(void)
+static unsigned int __init get_xsaves_size(void)
{
unsigned int eax, ebx, ecx, edx;
- unsigned int calculated_xstate_size;
+ /*
+ * - CPUID function 0DH, sub-function 1:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVES instruction for an XSAVE area
+ * containing all the state components
+ * corresponding to bits currently set in
+ * XCR0 | IA32_XSS.
+ */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
- if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
- /*
- * - CPUID function 0DH, sub-function 0:
- * EBX enumerates the size (in bytes) required by
- * the XSAVE instruction for an XSAVE area
- * containing all the *user* state components
- * corresponding to bits currently set in XCR0.
- */
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- calculated_xstate_size = ebx;
- } else {
- /*
- * - CPUID function 0DH, sub-function 1:
- * EBX enumerates the size (in bytes) required by
- * the XSAVES instruction for an XSAVE area
- * containing all the state components
- * corresponding to bits currently set in
- * XCR0 | IA32_XSS.
- */
- cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
- calculated_xstate_size = ebx;
- }
- return calculated_xstate_size;
+static unsigned int __init get_xsave_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 0:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVE instruction for an XSAVE area
+ * containing all the *user* state components
+ * corresponding to bits currently set in XCR0.
+ */
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ return ebx;
}
/*
@@ -591,7 +649,15 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size)
static int init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
- unsigned int possible_xstate_size = calculate_xstate_size();
+ unsigned int possible_xstate_size;
+ unsigned int xsave_size;
+
+ xsave_size = get_xsave_size();
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ possible_xstate_size = get_xsaves_size();
+ else
+ possible_xstate_size = xsave_size;
/* Ensure we have the space to store all enabled: */
if (!is_supported_xstate_size(possible_xstate_size))
@@ -601,8 +667,13 @@ static int init_xstate_size(void)
* The size is OK, we are definitely going to use xsave,
* make it known to the world that we need more space.
*/
- xstate_size = possible_xstate_size;
+ fpu_kernel_xstate_size = possible_xstate_size;
do_extra_xstate_size_checks();
+
+ /*
+ * User space is always in standard format.
+ */
+ fpu_user_xstate_size = xsave_size;
return 0;
}
@@ -644,8 +715,13 @@ void __init fpu__init_system_xstate(void)
xfeatures_mask = eax + ((u64)edx << 32);
if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ /*
+ * This indicates that something really unexpected happened
+ * with the enumeration. Disable XSAVE and try to continue
+ * booting without it. This is too early to BUG().
+ */
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
- BUG();
+ goto out_disable;
}
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
@@ -653,21 +729,29 @@ void __init fpu__init_system_xstate(void)
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
err = init_xstate_size();
- if (err) {
- /* something went wrong, boot without any XSAVE support */
- fpu__init_disable_system_xstate();
- return;
- }
+ if (err)
+ goto out_disable;
+
+ /*
+ * Update info used for ptrace frames; use standard-format size and no
+ * supervisor xstates:
+ */
+ update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
- update_regset_xstate_info(xstate_size, xfeatures_mask);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp();
+ print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
xfeatures_mask,
- xstate_size,
+ fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+ return;
+
+out_disable:
+ /* something went wrong, try to boot without any XSAVE support */
+ fpu__init_disable_system_xstate();
}
/*
@@ -693,6 +777,11 @@ void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
{
int feature_nr = fls64(xstate_feature_mask) - 1;
+ if (!xfeature_enabled(feature_nr)) {
+ WARN_ON_FPU(1);
+ return NULL;
+ }
+
return (void *)xsave + xstate_comp_offsets[feature_nr];
}
/*
@@ -778,155 +867,214 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
+#ifdef CONFIG_ARCH_HAS_PKEYS
+#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
+#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
- * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
- * to take out of its "init state". This will ensure that an
- * XRSTOR actually restores the state.
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
*/
-static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
- int xstate_feature_mask)
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
{
- xsave->header.xfeatures |= xstate_feature_mask;
+ u32 old_pkru;
+ int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+ u32 new_pkru_bits = 0;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return -EINVAL;
+ /*
+ * For most XSAVE components, this would be an arduous task:
+ * brining fpstate up to date with fpregs, updating fpstate,
+ * then re-populating fpregs. But, for components that are
+ * never lazily managed, we can just access the fpregs
+ * directly. PKRU is never managed lazily, so we can just
+ * manipulate it directly. Make sure it stays that way.
+ */
+ WARN_ON_ONCE(!use_eager_fpu());
+
+ /* Set the bits we need in PKRU: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey: */
+ new_pkru_bits <<= pkey_shift;
+
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
+
+ return 0;
}
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
/*
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
- * XFEATURE_MASK_SSE, etc...)
- * @xsave_state_ptr: a pointer to a copy of the state that you would
- * like written in to the current task's FPU xsave state. This pointer
- * must not be located in the current tasks's xsave area.
- * Output:
- * address of the state in the xsave area or NULL if the state
- * is not present or is in its 'init state'.
+ * This is similar to user_regset_copyout(), but will not add offset to
+ * the source data pointer or increment pos, count, kbuf, and ubuf.
*/
-static void fpu__xfeature_set_state(int xstate_feature_mask,
- void *xstate_feature_src, size_t len)
+static inline int xstate_copyout(unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf,
+ const void *data, const int start_pos,
+ const int end_pos)
{
- struct xregs_state *xsave = &current->thread.fpu.state.xsave;
- struct fpu *fpu = &current->thread.fpu;
- void *dst;
+ if ((count == 0) || (pos < start_pos))
+ return 0;
- if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
- WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
- return;
+ if (end_pos < 0 || pos < end_pos) {
+ unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
+
+ if (kbuf) {
+ memcpy(kbuf + pos, data, copy);
+ } else {
+ if (__copy_to_user(ubuf + pos, data, copy))
+ return -EFAULT;
+ }
}
+ return 0;
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a ptrace buffer. It supports partial copy but pos always starts from
+ * zero. This is called from xstateregs_get() and there we check the CPU
+ * has XSAVES.
+ */
+int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
+ void __user *ubuf, struct xregs_state *xsave)
+{
+ unsigned int offset, size;
+ int ret, i;
+ struct xstate_header header;
/*
- * Tell the FPU code that we need the FPU state to be in
- * 'fpu' (not in the registers), and that we need it to
- * be stable while we write to it.
+ * Currently copy_regset_to_user() starts from pos 0:
*/
- fpu__current_fpstate_write_begin();
+ if (unlikely(pos != 0))
+ return -EFAULT;
/*
- * This method *WILL* *NOT* work for compact-format
- * buffers. If the 'xstate_feature_mask' is unset in
- * xcomp_bv then we may need to move other feature state
- * "up" in the buffer.
+ * The destination is a ptrace buffer; we put in only user xstates:
*/
- if (xsave->header.xcomp_bv & xstate_feature_mask) {
- WARN_ON_ONCE(1);
- goto out;
- }
-
- /* find the location in the xsave buffer of the desired state */
- dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+ header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
/*
- * Make sure that the pointer being passed in did not
- * come from the xsave buffer itself.
+ * Copy xregs_state->header:
*/
- WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(header);
- /* put the caller-provided data in the location */
- memcpy(dst, xstate_feature_src, len);
+ ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count);
+
+ if (ret)
+ return ret;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ /*
+ * Copy only in-use xstates:
+ */
+ if ((header.xfeatures >> i) & 1) {
+ void *src = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count);
+
+ if (ret)
+ return ret;
+
+ if (offset + size >= count)
+ break;
+ }
+
+ }
/*
- * Mark the xfeature so that the CPU knows there is state
- * in the buffer now.
- */
- fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
-out:
- /*
- * We are done writing to the 'fpu'. Reenable preeption
- * and (possibly) move the fpstate back in to the fpregs.
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
*/
- fpu__current_fpstate_write_end();
-}
+ offset = offsetof(struct fxregs_state, sw_reserved);
+ size = sizeof(xstate_fx_sw_bytes);
-#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
-#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+ ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
/*
- * This will go out and modify the XSAVE buffer so that PKRU is
- * set to a particular state for access to 'pkey'.
- *
- * PKRU state does affect kernel access to user memory. We do
- * not modfiy PKRU *itself* here, only the XSAVE state that will
- * be restored in to PKRU when we return back to userspace.
+ * Convert from a ptrace standard-format buffer to kernel XSAVES format
+ * and copy to the target thread. This is called from xstateregs_set() and
+ * there we check the CPU has XSAVES and a whole standard-sized buffer
+ * exists.
*/
-int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val)
+int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
+ struct xregs_state *xsave)
{
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
- struct pkru_state *old_pkru_state;
- struct pkru_state new_pkru_state;
- int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
- u32 new_pkru_bits = 0;
+ unsigned int offset, size;
+ int i;
+ u64 xfeatures;
+ u64 allowed_features;
+
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(xfeatures);
+
+ if (kbuf) {
+ memcpy(&xfeatures, kbuf + offset, size);
+ } else {
+ if (__copy_from_user(&xfeatures, ubuf + offset, size))
+ return -EFAULT;
+ }
/*
- * This check implies XSAVE support. OSPKE only gets
- * set if we enable XSAVE and we enable PKU in XCR0.
+ * Reject if the user sets any disabled or supervisor features:
*/
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
+
+ if (xfeatures & ~allowed_features)
return -EINVAL;
- /* Set the bits we need in PKRU */
- if (init_val & PKEY_DISABLE_ACCESS)
- new_pkru_bits |= PKRU_AD_BIT;
- if (init_val & PKEY_DISABLE_WRITE)
- new_pkru_bits |= PKRU_WD_BIT;
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = ((u64)1 << i);
- /* Shift the bits in to the correct place in PKRU for pkey. */
- new_pkru_bits <<= pkey_shift;
+ if (xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, 1 << i);
- /* Locate old copy of the state in the xsave buffer */
- old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ if (kbuf) {
+ memcpy(dst, kbuf + offset, size);
+ } else {
+ if (__copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
+ }
+ }
/*
- * When state is not in the buffer, it is in the init
- * state, set it manually. Otherwise, copy out the old
- * state.
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
*/
- if (!old_pkru_state)
- new_pkru_state.pkru = 0;
- else
- new_pkru_state.pkru = old_pkru_state->pkru;
-
- /* mask off any old bits in place */
- new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
- /* Set the newly-requested bits */
- new_pkru_state.pkru |= new_pkru_bits;
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
/*
- * We could theoretically live without zeroing pkru.pad.
- * The current XSAVE feature state definition says that
- * only bytes 0->3 are used. But we do not want to
- * chance leaking kernel stack out to userspace in case a
- * memcpy() of the whole xsave buffer was done.
- *
- * They're in the same cacheline anyway.
+ * Add back in the features that came in from userspace:
*/
- new_pkru_state.pad = 0;
-
- fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
- sizeof(new_pkru_state));
+ xsave->header.xfeatures |= xfeatures;
return 0;
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d036cfb4495d..8639bb2ae058 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
}
if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer) == -EBUSY) {
+ frame_pointer, parent) == -EBUSY) {
*parent = old;
return;
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d784bb547a9d..f16c55bfc090 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void)
/* Initialize 32bit specific setup functions */
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
-
- reserve_ebda_region();
}
asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b72fb0b71dd1..54a2372f5dbb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
copy_bootdata(__va(real_mode_data));
x86_early_init_platform_quirks();
- reserve_ebda_region();
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6f8902b0d151..5f401262f12d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
*/
__HEAD
ENTRY(startup_32)
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
@@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4
* start_secondary().
*/
ENTRY(start_cpu0)
- movl stack_start, %ecx
+ movl initial_stack, %ecx
movl %ecx, %esp
jmp *(initial_code)
ENDPROC(start_cpu0)
@@ -307,7 +307,7 @@ ENTRY(startup_32_smp)
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
@@ -703,7 +703,7 @@ ENTRY(initial_page_table)
.data
.balign 4
-ENTRY(stack_start)
+ENTRY(initial_stack)
.long init_thread_union+THREAD_SIZE
__INITRODATA
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 5df831ef1442..c98a559c346e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,7 +38,7 @@
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
+L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
L4_START_KERNEL = pgd_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
@@ -66,7 +66,7 @@ startup_64:
*/
/*
- * Setup stack for verify_cpu(). "-8" because stack_start is defined
+ * Setup stack for verify_cpu(). "-8" because initial_stack is defined
* this way, see below. Our best guess is a NULL ptr for stack
* termination heuristics and we don't want to break anything which
* might depend on it (kgdb, ...).
@@ -226,7 +226,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip), %rsp
+ movq initial_stack(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -299,6 +299,7 @@ ENTRY(secondary_startup_64)
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
+ENDPROC(secondary_startup_64)
#include "verify_cpu.S"
@@ -309,7 +310,7 @@ ENTRY(secondary_startup_64)
* start_secondary().
*/
ENTRY(start_cpu0)
- movq stack_start(%rip),%rsp
+ movq initial_stack(%rip),%rsp
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
pushq $__KERNEL_CS # set correct cs
@@ -318,17 +319,15 @@ ENTRY(start_cpu0)
ENDPROC(start_cpu0)
#endif
- /* SMP bootup changes these two */
+ /* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
GLOBAL(initial_code)
.quad x86_64_start_kernel
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
-
- GLOBAL(stack_start)
+ GLOBAL(initial_stack)
.quad init_thread_union+THREAD_SIZE-8
- .word 0
__FINITDATA
bad_address:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index f112af7aa62e..274fab99169d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -710,31 +710,29 @@ static void hpet_work(struct work_struct *w)
complete(&hpet_work->complete);
}
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
+static int hpet_cpuhp_online(unsigned int cpu)
{
- unsigned long cpu = (unsigned long)hcpu;
struct hpet_work_struct work;
+
+ INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
+ init_completion(&work.complete);
+ /* FIXME: add schedule_work_on() */
+ schedule_delayed_work_on(cpu, &work.work, 0);
+ wait_for_completion(&work.complete);
+ destroy_delayed_work_on_stack(&work.work);
+ return 0;
+}
+
+static int hpet_cpuhp_dead(unsigned int cpu)
+{
struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
- init_completion(&work.complete);
- /* FIXME: add schedule_work_on() */
- schedule_delayed_work_on(cpu, &work.work, 0);
- wait_for_completion(&work.complete);
- destroy_delayed_work_on_stack(&work.work);
- break;
- case CPU_DEAD:
- if (hdev) {
- free_irq(hdev->irq, hdev);
- hdev->flags &= ~HPET_DEV_USED;
- per_cpu(cpu_hpet_dev, cpu) = NULL;
- }
- break;
- }
- return NOTIFY_OK;
+ if (!hdev)
+ return 0;
+ free_irq(hdev->irq, hdev);
+ hdev->flags &= ~HPET_DEV_USED;
+ per_cpu(cpu_hpet_dev, cpu) = NULL;
+ return 0;
}
#else
@@ -750,21 +748,112 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
}
#endif
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- return NOTIFY_OK;
-}
+#define hpet_cpuhp_online NULL
+#define hpet_cpuhp_dead NULL
#endif
/*
* Clock source related code
*/
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+ unsigned long flags;
+ union hpet_lock old, new;
+
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (cycle_t)hpet_readl(HPET_COUNTER);
+
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
+
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
+
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (cycle_t)new.value;
+ }
+ local_irq_restore(flags);
+
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value change or the lock is free to indicate
+ * its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+ return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
static cycle_t read_hpet(struct clocksource *cs)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
+#endif
static struct clocksource clocksource_hpet = {
.name = "hpet",
@@ -931,7 +1020,7 @@ out_nohpet:
*/
static __init int hpet_late_init(void)
{
- int cpu;
+ int ret;
if (boot_hpet_disable)
return -ENODEV;
@@ -961,16 +1050,20 @@ static __init int hpet_late_init(void)
if (boot_cpu_has(X86_FEATURE_ARAT))
return 0;
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu) {
- hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
- }
-
/* This notifier should be called after workqueue is ready */
- __hotcpu_notifier(hpet_cpuhp_notify, -20);
- cpu_notifier_register_done();
-
+ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE",
+ hpet_cpuhp_online, NULL);
+ if (ret)
+ return ret;
+ ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL,
+ hpet_cpuhp_dead);
+ if (ret)
+ goto err_cpuhp;
return 0;
+
+err_cpuhp:
+ cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE);
+ return ret;
}
fs_initcall(hpet_late_init);
@@ -1020,7 +1113,6 @@ void hpet_disable(void)
*/
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
-#include <asm/rtc.h>
#define DEFAULT_RTC_INT_FREQ 64
#define DEFAULT_RTC_SHIFT 6
@@ -1244,7 +1336,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
memset(&curr_time, 0, sizeof(struct rtc_time));
if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- get_rtc_time(&curr_time);
+ mc146818_get_time(&curr_time);
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 2bcfb5f2bc44..8771766d46b6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -36,13 +36,14 @@
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/user.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 64341aa485ae..1f9b878ef5ef 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,4 +1,5 @@
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/spinlock_types.h>
#include <asm/checksum.h>
#include <asm/pgtable.h>
@@ -42,3 +43,5 @@ EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(___preempt_schedule);
EXPORT_SYMBOL(___preempt_schedule_notrace);
#endif
+
+EXPORT_SYMBOL(__sw_hweight32);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index efb82f07b29c..6ebe00cb4a3b 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,7 +3,7 @@
*
*/
#include <linux/clockchips.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/timex.h>
#include <linux/i8253.h>
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index a979b5bd2fc0..50c89e8a95f2 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -6,7 +6,7 @@
* outb_p/inb_p API uses.
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/dmi.h>
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 61521dc19c10..9f669fdd2010 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
- irq_stats(j)->irq_tlb_count);
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
seq_puts(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 38da8f29a9c8..1f38d9a4d9de 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -8,7 +8,6 @@
* io_apic.c.)
*/
-#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
@@ -130,11 +129,9 @@ void irq_ctx_init(int cpu)
void do_softirq_own_stack(void)
{
- struct thread_info *curstk;
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- curstk = current_stack();
irqstk = __this_cpu_read(softirq_stack);
/* build the stack frame on the softirq stack */
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 206d0b90a3ab..9ebd0b0e73d9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -11,7 +11,6 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
-#include <linux/module.h>
#include <linux/delay.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
@@ -41,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (user_mode(regs))
return;
- if (regs->sp >= curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
regs->sp <= curbase + THREAD_SIZE)
return;
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index dc1404bf8e4b..bdb83e431d89 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -8,7 +8,7 @@
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/stat.h>
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index f2356bda2b05..3407b148c240 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params)
{
unsigned int nr_e820_entries;
- nr_e820_entries = e820_saved.nr_map;
+ nr_e820_entries = e820_saved->nr_map;
/* TODO: Pass entries more than E820MAX in bootparams setup data */
if (nr_e820_entries > E820MAX)
nr_e820_entries = E820MAX;
params->e820_entries = nr_e820_entries;
- memcpy(&params->e820_map, &e820_saved.map,
+ memcpy(&params->e820_map, &e820_saved->map,
nr_e820_entries * sizeof(struct e820entry));
return 0;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 04cde527d728..8e36f249646e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -50,6 +50,7 @@
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/switch_to.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_DX] = 0;
gdb_regs[GDB_SI] = 0;
gdb_regs[GDB_DI] = 0;
- gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
#ifdef CONFIG_X86_32
gdb_regs[GDB_DS] = __KERNEL_DS;
gdb_regs[GDB_ES] = __KERNEL_DS;
gdb_regs[GDB_PS] = 0;
gdb_regs[GDB_CS] = __KERNEL_CS;
- gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_PS] = 0;
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_R14] = 0;
gdb_regs[GDB_R15] = 0;
#endif
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_SP] = p->thread.sp;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 38cf7a741250..28cee019209c 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,7 +45,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
@@ -961,7 +961,19 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* normal page fault.
*/
regs->ip = (unsigned long)cur->addr;
+ /*
+ * Trap flag (TF) has been set here because this fault
+ * happened where the single stepping will be done.
+ * So clear it by resetting the current kprobe:
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ /*
+ * If the TF flag was set before the kprobe hit,
+ * don't touch it:
+ */
regs->flags |= kcb->kprobe_old_flags;
+
if (kcb->kprobe_status == KPROBE_REENTER)
restore_previous_kprobe(kcb);
else
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 4425f593f0ec..3bb4c5f021f6 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -24,7 +24,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index c2bedaea11f7..4afc67f5facc 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -184,7 +184,7 @@ out:
static struct kobj_attribute type_attr = __ATTR_RO(type);
-static struct bin_attribute data_attr = {
+static struct bin_attribute data_attr __ro_after_init = {
.attr = {
.name = "data",
.mode = S_IRUGO,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eea2a6f72b31..edbbfc854e39 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -21,7 +21,7 @@
*/
#include <linux/context_tracking.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
@@ -301,8 +301,6 @@ static void kvm_register_steal_time(void)
if (!has_steal_clock)
return;
- memset(st, 0, sizeof(*st));
-
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
pr_info("kvm-stealtime: cpu %d, msr %llx\n",
cpu, (unsigned long long) slow_virt_to_phys(st));
@@ -425,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init();
}
-static void kvm_guest_cpu_online(void *dummy)
-{
- kvm_guest_cpu_init();
-}
-
-static void kvm_guest_cpu_offline(void *dummy)
+static void kvm_guest_cpu_offline(void)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -439,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy)
apf_task_wake_all();
}
-static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int kvm_cpu_online(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- case CPU_ONLINE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ local_irq_disable();
+ kvm_guest_cpu_init();
+ local_irq_enable();
+ return 0;
}
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_notify,
-};
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_offline();
+ local_irq_enable();
+ return 0;
+}
#endif
static void __init kvm_apf_trap_init(void)
@@ -496,7 +481,9 @@ void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
- register_cpu_notifier(&kvm_cpu_notifier);
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
kvm_guest_cpu_init();
#endif
@@ -577,9 +564,6 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
-
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void kvm_wait(u8 *ptr, u8 val)
@@ -608,243 +592,6 @@ out:
local_irq_restore(flags);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-enum kvm_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-#ifdef CONFIG_KVM_DEBUG_FS
-#define HISTO_BUCKETS 30
-
-static struct kvm_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old;
-
- old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-
-static inline u64 spin_time_start(void)
-{
- return sched_clock();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index;
-
- index = ilog2(delta);
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta;
-
- delta = sched_clock() - start;
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-
-static struct dentry *d_spin_debug;
-static struct dentry *d_kvm_debug;
-
-static struct dentry *kvm_init_debugfs(void)
-{
- d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
- if (!d_kvm_debug)
- printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
-
- return d_kvm_debug;
-}
-
-static int __init kvm_spinlock_debugfs(void)
-{
- struct dentry *d_kvm;
-
- d_kvm = kvm_init_debugfs();
- if (d_kvm == NULL)
- return -ENOMEM;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(kvm_spinlock_debugfs);
-#else /* !CONFIG_KVM_DEBUG_FS */
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_KVM_DEBUG_FS */
-
-struct kvm_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-/* cpus 'waiting' on a spinlock to become available */
-static cpumask_t waiting_cpus;
-
-/* Track spinlock on which a cpu is waiting */
-static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
-
-__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- struct kvm_lock_waiting *w;
- int cpu;
- u64 start;
- unsigned long flags;
- __ticket_t head;
-
- if (in_nmi())
- return;
-
- w = this_cpu_ptr(&klock_waiting);
- cpu = smp_processor_id();
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
-
- /*
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- add_stats(TAKEN_SLOW, 1);
-
- /*
- * This uses set_bit, which is atomic but we should not rely on its
- * reordering gurantees. So barrier is needed after this call.
- */
- cpumask_set_cpu(cpu, &waiting_cpus);
-
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking.
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /*
- * halt until it's our turn and kicked. Note that we do safe halt
- * for irq enabled case to avoid hang when lock info is overwritten
- * in irq spinlock slowpath and no spurious interrupt occur to save us.
- */
- if (arch_irqs_disabled_flags(flags))
- halt();
- else
- safe_halt();
-
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
- local_irq_restore(flags);
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
-
-/* Kick vcpu waiting on @lock->head to reach value @ticket */
-static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
- for_each_cpu(cpu, &waiting_cpus) {
- const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == ticket) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- kvm_kick_cpu(cpu);
- break;
- }
- }
-}
-
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -856,16 +603,11 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
-#ifdef CONFIG_QUEUED_SPINLOCKS
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
- pv_lock_ops.unlock_kick = kvm_unlock_kick;
-#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1d39bfbd26bb..60b9949f1e65 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,7 +29,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-static int kvmclock = 1;
+static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static cycle_t kvm_sched_clock_offset;
@@ -289,6 +289,7 @@ void __init kvmclock_init(void)
put_cpu();
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..e9d252d873aa
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,65 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+#include <asm/text-patching.h>
+
+/* Apply per-object alternatives. Based on x86 module_finalize() */
+void arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ int cnt;
+ struct klp_modinfo *info;
+ Elf_Shdr *s, *alt = NULL, *para = NULL;
+ void *aseg, *pseg;
+ const char *objname;
+ char sec_objname[MODULE_NAME_LEN];
+ char secname[KSYM_NAME_LEN];
+
+ info = patch->mod->klp_info;
+ objname = obj->name ? obj->name : "vmlinux";
+
+ /* See livepatch core code for BUILD_BUG_ON() explanation */
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+ for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+ /* Apply per-object .klp.arch sections */
+ cnt = sscanf(info->secstrings + s->sh_name,
+ ".klp.arch.%55[^.].%127s",
+ sec_objname, secname);
+ if (cnt != 2)
+ continue;
+ if (strcmp(sec_objname, objname))
+ continue;
+ if (!strcmp(".altinstructions", secname))
+ alt = s;
+ if (!strcmp(".parainstructions", secname))
+ para = s;
+ }
+
+ if (alt) {
+ aseg = (void *) alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+
+ if (para) {
+ pseg = (void *) para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 97340f2c437c..0f8d20497383 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -16,7 +16,6 @@
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
-#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
@@ -500,6 +499,9 @@ void __init default_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf = mpf_found;
+ if (!smp_found_config)
+ return;
+
if (!mpf)
return;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 04b132a767f1..bfe4d6c96fbd 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -17,6 +17,7 @@
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/export.h>
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 33ee3e0efd65..2c55a003b793 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -3,12 +3,11 @@
* compiled in a FTRACE-compatible way.
*/
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/jump_label.h>
#include <asm/paravirt.h>
-#ifdef CONFIG_QUEUED_SPINLOCKS
__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
@@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void)
return pv_lock_ops.queued_spin_unlock.func ==
__raw_callee_save___native_queued_spin_unlock;
}
-#endif
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
-#ifdef CONFIG_QUEUED_SPINLOCKS
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .unlock_kick = paravirt_nop,
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 7b3b3f24c3ea..bbf3d5933eaa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -19,7 +19,8 @@
*/
#include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
@@ -55,12 +56,12 @@ asm (".pushsection .entry.text, \"ax\"\n"
".popsection");
/* identity function, which can be inlined */
-u32 _paravirt_ident_32(u32 x)
+u32 notrace _paravirt_ident_32(u32 x)
{
return x;
}
-u64 _paravirt_ident_64(u64 x)
+u64 notrace _paravirt_ident_64(u64 x)
{
return x;
}
@@ -331,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
@@ -388,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt);
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
#endif
-struct pv_mmu_ops pv_mmu_ops = {
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 158dc0650d5d..920c6ae08592 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
#endif
@@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index e70087a04cc8..bb3840cedb4f 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
#endif
@@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 833b1d329c47..5d400ba1349d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -340,7 +340,7 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems,enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct iommu_table *tbl = find_iommu_table(dev);
struct scatterlist *s;
@@ -364,7 +364,7 @@ static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct iommu_table *tbl = find_iommu_table(dev);
struct scatterlist *s;
@@ -396,7 +396,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
return nelems;
error:
- calgary_unmap_sg(dev, sg, nelems, dir, NULL);
+ calgary_unmap_sg(dev, sg, nelems, dir, 0);
for_each_sg(sg, s, nelems, i) {
sg->dma_address = DMA_ERROR_CODE;
sg->dma_length = 0;
@@ -407,7 +407,7 @@ error:
static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
void *vaddr = page_address(page) + offset;
unsigned long uaddr;
@@ -422,7 +422,7 @@ static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
@@ -432,7 +432,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
}
static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs)
+ dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
{
void *ret = NULL;
dma_addr_t mapping;
@@ -466,7 +466,7 @@ error:
static void calgary_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_handle,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned int npages;
struct iommu_table *tbl = find_iommu_table(dev);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6ba014c61d62..d30c37750765 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -77,7 +77,7 @@ void __init pci_iommu_alloc(void)
}
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned long dma_mask;
struct page *page;
@@ -120,7 +120,7 @@ again:
}
void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr, struct dma_attrs *attrs)
+ dma_addr_t dma_addr, unsigned long attrs)
{
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct page *page = virt_to_page(vaddr);
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index da15918d1c81..00e71ce396a8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -28,7 +28,7 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
dma_addr_t bus = page_to_phys(page) + offset;
WARN_ON(size == 0);
@@ -55,7 +55,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
*/
static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
int nents, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct scatterlist *s;
int i;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7c577a178859..b47edb8f5256 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -2,7 +2,7 @@
#include <linux/pci.h>
#include <linux/cache.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/swiotlb.h>
#include <linux/bootmem.h>
#include <linux/dma-mapping.h>
@@ -16,7 +16,7 @@ int swiotlb __read_mostly;
void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
void *vaddr;
@@ -37,7 +37,7 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
void x86_swiotlb_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_addr,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
swiotlb_free_coherent(dev, size, vaddr, dma_addr);
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index b2f8a33b36ff..24a50301f150 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -7,12 +7,12 @@
void __init x86_early_init_platform_quirks(void)
{
x86_platform.legacy.rtc = 1;
- x86_platform.legacy.ebda_search = 0;
+ x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 1;
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_PC:
- x86_platform.legacy.ebda_search = 1;
+ x86_platform.legacy.reserve_bios_regions = 1;
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 92f70147a9a6..0c5315d322c8 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -3,7 +3,7 @@
* Copyright (c) 2015, Intel Corporation.
*/
#include <linux/platform_device.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/ioport.h>
static int found(u64 start, u64 end, void *data)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 96becbbb52e0..28cea7802ecb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,7 +7,8 @@
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
@@ -31,6 +32,7 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -300,7 +302,7 @@ void arch_cpu_idle(void)
/*
* We use this if we don't have any better idle routine..
*/
-void default_idle(void)
+void __cpuidle default_idle(void)
{
trace_cpu_idle_rcuidle(1, smp_processor_id());
safe_halt();
@@ -404,7 +406,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
if (c->x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (!cpu_has(c, X86_FEATURE_MWAIT))
+ if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
return 0;
return 1;
@@ -415,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
* with interrupts enabled and no flags, which is backwards compatible with the
* original MWAIT implementation.
*/
-static void mwait_idle(void)
+static __cpuidle void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
trace_cpu_idle_rcuidle(1, smp_processor_id());
@@ -512,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+ struct inactive_task_frame *frame =
+ (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
+ return READ_ONCE_NOCHECK(frame->ret_addr);
+}
+
+/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
@@ -519,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
*/
unsigned long get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip;
+ unsigned long start, bottom, top, sp, fp, ip, ret = 0;
int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
+ if (!try_get_task_stack(p))
+ return 0;
+
start = (unsigned long)task_stack_page(p);
if (!start)
- return 0;
+ goto out;
/*
* Layout of the stack page:
@@ -536,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p)
* PADDING
* ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
* stack
- * ----------- bottom = start + sizeof(thread_info)
- * thread_info
- * ----------- start
+ * ----------- bottom = start
*
* The tasks stack pointer points at the location where the
* framepointer is stored. The data on the stack is:
@@ -549,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p)
*/
top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
top -= 2 * sizeof(unsigned long);
- bottom = start + sizeof(struct thread_info);
+ bottom = start;
sp = READ_ONCE(p->thread.sp);
if (sp < bottom || sp > top)
- return 0;
+ goto out;
- fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+ fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
do {
if (fp < bottom || fp > top)
- return 0;
+ goto out;
ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip))
- return ip;
+ if (!in_sched_functions(ip)) {
+ ret = ip;
+ goto out;
+ }
fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
} while (count++ < 16 && p->state != TASK_RUNNING);
- return 0;
+
+out:
+ put_task_stack(p);
+ return ret;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9f950917528b..bd7be8efdc4c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -25,7 +25,7 @@
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/mc146818rtc.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/personality.h>
@@ -55,17 +55,6 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *)tsk->thread.sp)[3];
-}
-
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = __read_cr4_safe();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
@@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
struct task_struct *tsk;
int err;
- p->thread.sp = (unsigned long) childregs;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- p->thread.ip = (unsigned long) ret_from_kernel_thread;
- task_user_gs(p) = __KERNEL_STACK_CANARY;
- childregs->ds = __USER_DS;
- childregs->es = __USER_DS;
- childregs->fs = __KERNEL_PERCPU;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->di = arg;
p->thread.io_bitmap_ptr = NULL;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
- p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
p->thread.io_bitmap_ptr = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6e789ca1f841..b3760b3c1ca0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,7 @@
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
@@ -49,8 +49,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
-
-asmlinkage extern void ret_from_fork(void);
+#include <asm/vdso.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -110,12 +109,13 @@ void __show_regs(struct pt_regs *regs, int all)
get_debugreg(d7, 7);
/* Only print out debug registers if they are in their non-default state. */
- if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))
- return;
-
- printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
- printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
+ d0, d1, d2);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
+ d3, d6, d7);
+ }
if (boot_cpu_has(X86_FEATURE_OSPKE))
printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
@@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
{
int err;
struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
struct task_struct *me = current;
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
- p->thread.sp = (unsigned long) childregs;
- set_tsk_thread_flag(p, TIF_FORK);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -160,15 +165,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- childregs->sp = (unsigned long)childregs;
- childregs->ss = __KERNEL_DS;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->r12 = arg;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
@@ -511,7 +512,7 @@ void set_personality_ia32(bool x32)
current->personality &= ~READ_IMPLIES_EXEC;
/* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
- current_thread_info()->status &= ~TS_COMPAT;
+ current->thread.status &= ~TS_COMPAT;
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
@@ -519,11 +520,24 @@ void set_personality_ia32(bool x32)
current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
- current_thread_info()->status |= TS_COMPAT;
+ current->thread.status |= TS_COMPAT;
}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
int ret = 0;
@@ -577,6 +591,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
break;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, addr);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, addr);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, addr);
+#endif
+
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 600edd225e81..0e63c0267f99 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
return sp;
prev_esp = (u32 *)(context);
- if (prev_esp)
- return (unsigned long)prev_esp;
+ if (*prev_esp)
+ return (unsigned long)*prev_esp;
return (unsigned long)regs;
}
@@ -923,15 +923,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
case offsetof(struct user32, regs.orig_eax):
/*
- * A 32-bit debugger setting orig_eax means to restore
- * the state of the task restarting a 32-bit syscall.
- * Make sure we interpret the -ERESTART* codes correctly
- * in case the task is not actually still sitting at the
- * exit from a 32-bit syscall with TS_COMPAT still set.
+ * Warning: bizarre corner case fixup here. A 32-bit
+ * debugger setting orig_eax to -1 wants to disable
+ * syscall restart. Make sure that the syscall
+ * restart code sign-extends orig_ax. Also make sure
+ * we interpret the -ERESTART* codes correctly if
+ * loaded into regs->ax in case the task is not
+ * actually still sitting at the exit from a 32-bit
+ * syscall with TS_COMPAT still set.
*/
regs->orig_ax = value;
if (syscall_get_nr(child, regs) >= 0)
- task_thread_info(child)->status |= TS_COMPAT;
+ child->thread.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
@@ -1247,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
-static struct user_regset x86_64_regsets[] __read_mostly = {
+static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1288,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = {
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
+static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1341,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = {
*/
u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
@@ -1355,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
return &user_x86_32_view;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 99bfc025111d..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -61,12 +61,12 @@ void pvclock_resume(void)
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
{
unsigned version;
- cycle_t ret;
u8 flags;
do {
- version = __pvclock_read_cycles(src, &ret, &flags);
- } while ((src->version & 1) || version != src->version);
+ version = pvclock_read_begin(src);
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
return flags & valid_flags;
}
@@ -79,8 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
u8 flags;
do {
- version = __pvclock_read_cycles(src, &ret, &flags);
- } while ((src->version & 1) || version != src->version);
+ version = pvclock_read_begin(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cc457ff818ad..51402a7e4ca6 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub);
#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if ((capid0 & 0xc0) == 0xc0)
+ static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a9b31eb815f2..e244c19a2451 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,6 +1,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/pm.h>
@@ -55,6 +55,19 @@ bool port_cf9_safe = false;
*/
/*
+ * Some machines require the "reboot=a" commandline options
+ */
+static int __init set_acpi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_ACPI) {
+ reboot_type = BOOT_ACPI;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "ACPI");
+ }
+ return 0;
+}
+
+/*
* Some machines require the "reboot=b" or "reboot=k" commandline options,
* this quirk makes that automatic.
*/
@@ -395,6 +408,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */
+ .callback = set_acpi_reboot,
+ .ident = "Dell OptiPlex 7450 AIO",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"),
+ },
+ },
/* Hewlett-Packard */
{ /* Handle problems with rebooting on HP laptops */
@@ -684,7 +705,7 @@ static void native_machine_power_off(void)
tboot_shutdown(TB_SHUTDOWN_HALT);
}
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 80eab01c1a68..2408c1603438 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail)
int i;
struct e820entry *entry;
- for (i = 0; i < e820.nr_map; i++) {
- entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ entry = &e820->map[i];
resource_clip(avail, entry->addr,
entry->addr + entry->size - 1);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index eceaa082ec3f..79c6311cd912 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -13,7 +13,6 @@
#include <asm/x86_init.h>
#include <asm/time.h>
#include <asm/intel-mid.h>
-#include <asm/rtc.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
@@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now)
rtc_time_to_tm(nowtime, &tm);
if (!rtc_valid_tm(&tm)) {
- retval = set_rtc_time(&tm);
+ retval = mc146818_set_time(&tm);
if (retval)
printk(KERN_ERR "%s: RTC write failed with error %d\n",
__func__, retval);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4e7b3991b60..bbfbca5fea0c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -36,7 +36,7 @@
#include <linux/console.h>
#include <linux/root_dev.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/edd.h>
@@ -113,6 +113,7 @@
#include <asm/prom.h>
#include <asm/microcode.h>
#include <asm/mmu_context.h>
+#include <asm/kaslr.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -209,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-__visible unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
#else
-__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -399,10 +400,6 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
-static void __init early_initrd_acpi_init(void)
-{
- early_acpi_table_init((void *)initrd_start, initrd_end - initrd_start);
-}
#else
static void __init early_reserve_initrd(void)
{
@@ -410,9 +407,6 @@ static void __init early_reserve_initrd(void)
static void __init reserve_initrd(void)
{
}
-static void __init early_initrd_acpi_init(void)
-{
-}
#endif /* CONFIG_BLK_DEV_INITRD */
static void __init parse_setup_data(void)
@@ -464,8 +458,8 @@ static void __init e820_reserve_setup_data(void)
early_memunmap(data, sizeof(*data));
}
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "extended physical RAM map:\n");
e820_print_map("reserve setup_data");
}
@@ -769,7 +763,7 @@ static void __init trim_bios_range(void)
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
/* called before trim_bios_range() to spare extra sanitize */
@@ -1038,7 +1032,7 @@ void __init setup_arch(char **cmdline_p)
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
printk(KERN_INFO "fixed physical RAM map:\n");
e820_print_map("bad_ppro");
}
@@ -1059,6 +1053,12 @@ void __init setup_arch(char **cmdline_p)
max_possible_pfn = max_pfn;
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
+
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
@@ -1096,17 +1096,19 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
- if (efi_enabled(EFI_BOOT)) {
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
efi_fake_memmap();
efi_find_mirror();
- }
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be called
- * after ExitBootServices(). This is, in fact, a lie.
- */
- if (efi_enabled(EFI_MEMMAP))
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
efi_reserve_boot_services();
+ }
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -1129,7 +1131,13 @@ void __init setup_arch(char **cmdline_p)
early_trap_pf_init();
- setup_real_mode();
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ */
+ mmu_cr4_features = __read_cr4();
memblock_set_current_limit(get_max_mapped());
@@ -1146,7 +1154,7 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
- early_initrd_acpi_init();
+ acpi_table_upgrade();
vsmp_init();
@@ -1178,13 +1186,6 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
- if (boot_cpu_data.cpuid_level >= 0) {
- /* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = __read_cr4();
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- }
-
#ifdef CONFIG_X86_32
/* sync back kernel address range */
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
@@ -1218,8 +1219,7 @@ void __init setup_arch(char **cmdline_p)
/*
* get boot-time SMP configuration:
*/
- if (smp_found_config)
- get_smp_config();
+ get_smp_config();
prefill_possible_map();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e4fcb87ba7a6..2bbd27f89802 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
@@ -236,6 +236,8 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_map(x86_cpu_to_apicid, cpu);
per_cpu(x86_bios_cpu_apicid, cpu) =
early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+ per_cpu(x86_cpu_to_acpiid, cpu) =
+ early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
#ifdef CONFIG_X86_32
per_cpu(x86_cpu_to_logical_apicid, cpu) =
@@ -244,7 +246,7 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_64
per_cpu(irq_stack_ptr, cpu) =
per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE - 64;
+ IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
@@ -271,6 +273,7 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_LOCAL_APIC
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+ early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
#ifdef CONFIG_X86_32
early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 22cc2f9f8aec..763af1d0de64 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -42,6 +42,7 @@
#include <asm/syscalls.h>
#include <asm/sigframe.h>
+#include <asm/signal.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -146,7 +147,7 @@ static int restore_sigcontext(struct pt_regs *regs,
buf = (void __user *)buf_val;
} get_user_catch(err);
- err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
+ err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
force_iret();
@@ -245,14 +246,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
struct fpu *fpu = &current->thread.fpu;
/* redzone */
- if (config_enabled(CONFIG_X86_64))
+ if (IS_ENABLED(CONFIG_X86_64))
sp -= 128;
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
if (sas_ss_flags(sp) == 0)
sp = current->sas_ss_sp + current->sas_ss_size;
- } else if (config_enabled(CONFIG_X86_32) &&
+ } else if (IS_ENABLED(CONFIG_X86_32) &&
!onsigstack &&
(regs->ss & 0xffff) != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
@@ -262,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
}
if (fpu->fpstate_active) {
- sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+ sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
&buf_fx, &math_size);
*fpstate = (void __user *)sp;
}
@@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
return -EFAULT;
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user32(&frame->info, &ksig->info))
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
return -EFAULT;
}
@@ -660,20 +661,21 @@ badframe:
return 0;
}
-static inline int is_ia32_compat_frame(void)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
- return config_enabled(CONFIG_IA32_EMULATION) &&
- test_thread_flag(TIF_IA32);
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
}
-static inline int is_ia32_frame(void)
+static inline int is_ia32_frame(struct ksignal *ksig)
{
- return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
}
-static inline int is_x32_frame(void)
+static inline int is_x32_frame(struct ksignal *ksig)
{
- return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
static int
@@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
- if (is_ia32_frame()) {
+ if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
return ia32_setup_rt_frame(usig, ksig, cset, regs);
else
return ia32_setup_frame(usig, ksig, cset, regs);
- } else if (is_x32_frame()) {
+ } else if (is_x32_frame(ksig)) {
return x32_setup_rt_frame(ksig, cset, regs);
} else {
return __setup_rt_frame(ksig->sig, ksig, set, regs);
@@ -760,8 +762,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
-#ifdef CONFIG_X86_64
- if (in_ia32_syscall())
+ /*
+ * This function is fundamentally broken as currently
+ * implemented.
+ *
+ * The idea is that we want to trigger a call to the
+ * restart_block() syscall and that we want in_ia32_syscall(),
+ * in_x32_syscall(), etc. to match whatever they were in the
+ * syscall being restarted. We assume that the syscall
+ * instruction at (regs->ip - 2) matches whatever syscall
+ * instruction we used to enter in the first place.
+ *
+ * The problem is that we can get here when ptrace pokes
+ * syscall-like values into regs even if we're not in a syscall
+ * at all.
+ *
+ * For now, we maintain historical behavior and guess based on
+ * stored state. We could do better by saving the actual
+ * syscall arch in restart_block or (with caveats on x32) by
+ * checking if regs->ip points to 'int $0x80'. The current
+ * behavior is incorrect if a tracer has a different bitness
+ * than the tracee.
+ */
+#ifdef CONFIG_IA32_EMULATION
+ if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index dc3c0b1c816f..40df33753bae 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -1,10 +1,125 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+/*
+ * The compat_siginfo_t structure and handing code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the compat_siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+static inline void signal_compat_build_tests(void)
+{
+ int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
+
+ /*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+ BUILD_BUG_ON(NSIGILL != 8);
+ BUILD_BUG_ON(NSIGFPE != 8);
+ BUILD_BUG_ON(NSIGSEGV != 4);
+ BUILD_BUG_ON(NSIGBUS != 5);
+ BUILD_BUG_ON(NSIGTRAP != 4);
+ BUILD_BUG_ON(NSIGCHLD != 6);
+ BUILD_BUG_ON(NSIGSYS != 1);
+
+ /* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+ /*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * move and are always at the beginning:
+ */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
+#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
+
+ /*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
+#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
+
+ CHECK_CSI_OFFSET(_kill);
+ CHECK_CSI_SIZE (_kill, 2*sizeof(int));
+ CHECK_SI_SIZE (_kill, 2*sizeof(int));
+
+ CHECK_CSI_OFFSET(_timer);
+ CHECK_CSI_SIZE (_timer, 5*sizeof(int));
+ CHECK_SI_SIZE (_timer, 6*sizeof(int));
+
+ CHECK_CSI_OFFSET(_rt);
+ CHECK_CSI_SIZE (_rt, 3*sizeof(int));
+ CHECK_SI_SIZE (_rt, 4*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigchld);
+ CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
+ CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigchld_x32);
+ CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
+ /* no _sigchld_x32 in the generic siginfo_t */
+
+ CHECK_CSI_OFFSET(_sigfault);
+ CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
+ CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigpoll);
+ CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
+ CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+
+ CHECK_CSI_OFFSET(_sigsys);
+ CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
+ CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+
+ /* any new si_fields should be added here */
+}
+
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ /* Don't leak in-kernel non-uapi flags to user-space */
+ if (oact)
+ oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (!act)
+ return;
+
+ /* Don't let flags to be set from userspace */
+ act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (user_64bit_mode(current_pt_regs()))
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
+ bool x32_ABI)
{
int err = 0;
- bool ia32 = test_thread_flag(TIF_IA32);
+
+ signal_compat_build_tests();
if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
return -EFAULT;
@@ -32,13 +147,28 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
&to->_sifields._pad[0]);
switch (from->si_code >> 16) {
case __SI_FAULT >> 16:
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR ||
+ from->si_code == BUS_MCEERR_AO))
+ put_user_ex(from->si_addr_lsb, &to->si_addr_lsb);
+
+ if (from->si_signo == SIGSEGV) {
+ if (from->si_code == SEGV_BNDERR) {
+ compat_uptr_t lower = (unsigned long)&to->si_lower;
+ compat_uptr_t upper = (unsigned long)&to->si_upper;
+ put_user_ex(lower, &to->si_lower);
+ put_user_ex(upper, &to->si_upper);
+ }
+ if (from->si_code == SEGV_PKUERR)
+ put_user_ex(from->si_pkey, &to->si_pkey);
+ }
break;
case __SI_SYS >> 16:
put_user_ex(from->si_syscall, &to->si_syscall);
put_user_ex(from->si_arch, &to->si_arch);
break;
case __SI_CHLD >> 16:
- if (ia32) {
+ if (!x32_ABI) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
} else {
@@ -72,6 +202,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
return err;
}
+/* from syscall's path, where we know the ABI */
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+ return __copy_siginfo_to_user32(to, from, in_x32_syscall());
+}
+
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
int err = 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fafe8b923cac..951f093a96fe 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -43,7 +43,7 @@
#include <linux/init.h>
#include <linux/smp.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/bootmem.h>
@@ -100,10 +100,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
/* Logical package management. We might want to allocate that dynamically */
static int *physical_to_logical_pkg __read_mostly;
static unsigned long *physical_package_map __read_mostly;;
-static unsigned long *logical_package_map __read_mostly;
static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
+static unsigned int logical_packages __read_mostly;
+static bool logical_packages_frozen __read_mostly;
+
+/* Maximum number of SMT threads on any online core */
+int __max_smt_threads __read_mostly;
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
@@ -274,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- new = find_first_zero_bit(logical_package_map, __max_logical_packages);
- if (new >= __max_logical_packages) {
+ if (logical_packages_frozen) {
physical_to_logical_pkg[pkg] = -1;
- pr_warn("APIC(%x) Package %u exceeds logical package map\n",
+ pr_warn("APIC(%x) Package %u exceeds logical package max\n",
apicid, pkg);
return -ENOSPC;
}
- set_bit(new, logical_package_map);
+
+ new = logical_packages++;
pr_info("APIC(%x) Converting physical %u to logical package %u\n",
apicid, pkg, new);
physical_to_logical_pkg[pkg] = new;
@@ -338,6 +342,7 @@ static void __init smp_init_package_map(void)
}
__max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
+ logical_packages = 0;
/*
* Possibly larger than what we need as the number of apic ids per
@@ -349,10 +354,6 @@ static void __init smp_init_package_map(void)
memset(physical_to_logical_pkg, 0xff, size);
size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
physical_package_map = kzalloc(size, GFP_KERNEL);
- size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long);
- logical_package_map = kzalloc(size, GFP_KERNEL);
-
- pr_info("Max logical packages: %u\n", __max_logical_packages);
for_each_present_cpu(cpu) {
unsigned int apicid = apic->cpu_present_to_apicid(cpu);
@@ -366,6 +367,15 @@ static void __init smp_init_package_map(void)
set_cpu_possible(cpu, false);
set_cpu_present(cpu, false);
}
+
+ if (logical_packages > __max_logical_packages) {
+ pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
+ logical_packages, __max_logical_packages);
+ logical_packages_frozen = true;
+ __max_logical_packages = logical_packages;
+ }
+
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
}
void __init smp_store_boot_cpu_info(void)
@@ -461,31 +471,32 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, },
+};
+
+static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
+
/*
- * set_sched_topology() sets the topology internal to a CPU. The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called. It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
-static void primarily_use_numa_for_topology(void)
-{
- set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;
void set_cpu_sibling_map(int cpu)
{
@@ -493,7 +504,7 @@ void set_cpu_sibling_map(int cpu)
bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct cpuinfo_x86 *o;
- int i;
+ int i, threads;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
@@ -548,8 +559,12 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
- primarily_use_numa_for_topology();
+ x86_has_numa_in_package = true;
}
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
}
/* maps the cpu to the sched domain representing multi-core */
@@ -676,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -703,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -742,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
@@ -928,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
#else
- clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
}
@@ -955,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = idle->thread.sp;
+ initial_stack = idle->thread.sp;
/*
* Enable the espfix hack for this CPU
@@ -980,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
@@ -1101,17 +1115,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- /*
- * We have to walk the irq descriptors to setup the vector
- * space for the cpu which comes online. Prevent irq
- * alloc/free across the bringup.
- */
- irq_lock_sparse();
-
err = do_boot_cpu(apicid, cpu, tidle);
-
if (err) {
- irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1129,8 +1134,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- irq_unlock_sparse();
-
return 0;
}
@@ -1235,7 +1238,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
/*
* If we couldn't find a local APIC, then get out of here now!
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
+ if (APIC_INTEGRATED(boot_cpu_apic_version) &&
!boot_cpu_has(X86_FEATURE_APIC)) {
if (!disable_apic) {
pr_err("BIOS bug, local APIC #%d not detected!...\n",
@@ -1285,12 +1288,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
- current_thread_info()->cpu = 0; /* needed? */
for_each_possible_cpu(i) {
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
+
+ /*
+ * Set 'default' x86 topology, this matches default_topology() in that
+ * it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpus_sibling_map() is ran.
+ */
+ set_sched_topology(x86_topology);
+
set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) {
@@ -1310,14 +1322,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
break;
}
- default_setup_apic_routing();
-
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
+ default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
@@ -1357,6 +1368,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
+ if (x86_has_numa_in_package)
+ set_sched_topology(x86_numa_in_package_topology);
+
nmi_selftest();
impress_friends();
setup_ioapic_dest();
@@ -1393,9 +1407,21 @@ __init void prefill_possible_map(void)
{
int i, possible;
- /* no processor from mptable or madt */
- if (!num_processors)
- num_processors = 1;
+ /* No boot processor was found in mptable or ACPI MADT */
+ if (!num_processors) {
+ int apicid = boot_cpu_physical_apicid;
+ int cpu = hard_smp_processor_id();
+
+ pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+
+ /* Make sure boot cpu is enumerated */
+ if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+ apic->apic_id_valid(apicid))
+ generic_processor_info(apicid, boot_cpu_apic_version);
+
+ if (!num_processors)
+ num_processors = 1;
+ }
i = setup_max_cpus ?: 1;
if (setup_possible_cpus == -1) {
@@ -1441,6 +1467,21 @@ __init void prefill_possible_map(void)
#ifdef CONFIG_HOTPLUG_CPU
+/* Recompute SMT state for all CPUs on offline */
+static void recompute_smt_state(void)
+{
+ int max_threads, cpu;
+
+ max_threads = 0;
+ for_each_online_cpu (cpu) {
+ int threads = cpumask_weight(topology_sibling_cpumask(cpu));
+
+ if (threads > max_threads)
+ max_threads = threads;
+ }
+ __max_smt_threads = max_threads;
+}
+
static void remove_siblinginfo(int cpu)
{
int sibling;
@@ -1465,6 +1506,7 @@ static void remove_siblinginfo(int cpu)
c->phys_proc_id = 0;
c->cpu_core_id = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
}
static void remove_cpu_from_maps(int cpu)
@@ -1622,7 +1664,7 @@ static inline void mwait_play_dead(void)
}
}
-static inline void hlt_play_dead(void)
+void hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
wbinvd();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 9ee98eefc44d..0653788026e2 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -5,83 +5,72 @@
*/
#include <linux/sched.h>
#include <linux/stacktrace.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-static int save_stack_stack(void *data, char *name)
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+ bool nosched)
{
- return 0;
-}
-
-static int
-__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
-{
- struct stack_trace *trace = data;
-#ifdef CONFIG_FRAME_POINTER
- if (!reliable)
- return 0;
-#endif
if (nosched && in_sched_functions(addr))
return 0;
+
if (trace->skip > 0) {
trace->skip--;
return 0;
}
- if (trace->nr_entries < trace->max_entries) {
- trace->entries[trace->nr_entries++] = addr;
- return 0;
- } else {
- return -1; /* no more room, stop walking the stack */
- }
-}
-static int save_stack_address(void *data, unsigned long addr, int reliable)
-{
- return __save_stack_address(data, addr, reliable, false);
+ if (trace->nr_entries >= trace->max_entries)
+ return -1;
+
+ trace->entries[trace->nr_entries++] = addr;
+ return 0;
}
-static int
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+static void __save_stack_trace(struct stack_trace *trace,
+ struct task_struct *task, struct pt_regs *regs,
+ bool nosched)
{
- return __save_stack_address(data, addr, reliable, true);
-}
+ struct unwind_state state;
+ unsigned long addr;
-static const struct stacktrace_ops save_stack_ops = {
- .stack = save_stack_stack,
- .address = save_stack_address,
- .walk_stack = print_context_stack,
-};
+ if (regs)
+ save_stack_address(trace, regs->ip, nosched);
-static const struct stacktrace_ops save_stack_ops_nosched = {
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
- .walk_stack = print_context_stack,
-};
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || save_stack_address(trace, addr, nosched))
+ break;
+ }
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, NULL, false);
}
EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
- dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, regs, false);
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (!try_get_task_stack(tsk))
+ return;
+
+ __save_stack_trace(trace, tsk, NULL, true);
+
+ put_task_stack(tsk);
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 9b0185fbe3eb..8402907825b0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -188,12 +188,12 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
- for (i = 0; i < e820.nr_map; i++) {
- if ((e820.map[i].type != E820_RAM)
- && (e820.map[i].type != E820_RESERVED_KERN))
+ for (i = 0; i < e820->nr_map; i++) {
+ if ((e820->map[i].type != E820_RAM)
+ && (e820->map[i].type != E820_RESERVED_KERN))
continue;
- add_mac_region(e820.map[i].addr, e820.map[i].size);
+ add_mac_region(e820->map[i].addr, e820->map[i].size);
}
tboot->acpi_sinfo.kernel_s3_resume_vector =
@@ -323,25 +323,16 @@ static int tboot_wait_for_aps(int num_aps)
return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
}
-static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int tboot_dying_cpu(unsigned int cpu)
{
- switch (action) {
- case CPU_DYING:
- atomic_inc(&ap_wfs_count);
- if (num_online_cpus() == 1)
- if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
- return NOTIFY_BAD;
- break;
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1) {
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return -EBUSY;
}
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block tboot_cpu_notifier =
-{
- .notifier_call = tboot_cpu_callback,
-};
-
#ifdef CONFIG_DEBUG_FS
#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
@@ -417,8 +408,8 @@ static __init int tboot_late_init(void)
tboot_create_trampoline();
atomic_set(&ap_wfs_count, 0);
- register_hotcpu_notifier(&tboot_cpu_notifier);
-
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL,
+ tboot_dying_cpu);
#ifdef CONFIG_DEBUG_FS
debugfs_create_file("tboot_log", S_IRUSR,
arch_debugfs_dir, NULL, &tboot_log_fops);
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index cb4a01b41e27..222e84e2432e 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -9,7 +9,6 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
-#include <linux/module.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/asm.h>
@@ -74,7 +73,3 @@ int rodata_test(void)
return 0;
}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d1590486204a..bd4e3d4d3625 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -21,7 +21,7 @@
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
@@ -96,6 +96,12 @@ static inline void cond_local_irq_disable(struct pt_regs *regs)
local_irq_disable();
}
+/*
+ * In IST context, we explicitly disable preemption. This serves two
+ * purposes: it makes it much less likely that we would accidentally
+ * schedule in IST context and it will force a warning if we somehow
+ * manage to schedule by accident.
+ */
void ist_enter(struct pt_regs *regs)
{
if (user_mode(regs)) {
@@ -110,13 +116,7 @@ void ist_enter(struct pt_regs *regs)
rcu_nmi_enter();
}
- /*
- * We are atomic because we're on the IST stack; or we're on
- * x86_32, in which case we still shouldn't schedule; or we're
- * on x86_64 and entered from user mode, in which case we're
- * still atomic unless ist_begin_non_atomic is called.
- */
- preempt_count_add(HARDIRQ_OFFSET);
+ preempt_disable();
/* This code is a bit fragile. Test it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
@@ -124,7 +124,7 @@ void ist_enter(struct pt_regs *regs)
void ist_exit(struct pt_regs *regs)
{
- preempt_count_sub(HARDIRQ_OFFSET);
+ preempt_enable_no_resched();
if (!user_mode(regs))
rcu_nmi_exit();
@@ -155,7 +155,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
BUG_ON((unsigned long)(current_top_of_stack() -
current_stack_pointer()) >= THREAD_SIZE);
- preempt_count_sub(HARDIRQ_OFFSET);
+ preempt_enable_no_resched();
}
/**
@@ -165,7 +165,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
*/
void ist_end_non_atomic(void)
{
- preempt_count_add(HARDIRQ_OFFSET);
+ preempt_disable();
}
static nokprobe_inline int
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address)
+{
+ printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+ (void *)fault_address, current->stack,
+ (char *)current->stack + THREAD_SIZE - 1);
+ die(message, regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic(message);
+}
+#endif
+
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF;
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * deliv- ered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
df_debug(regs, error_code);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 38ba6de56ede..46b2f41f8b05 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -3,7 +3,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/intel-family.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -239,7 +241,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu)
{
unsigned long long tsc_now, ns_now;
struct cyc2ns_data *data;
@@ -248,7 +250,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
local_irq_save(flags);
sched_clock_idle_sleep_event();
- if (!cpu_khz)
+ if (!khz)
goto done;
data = cyc2ns_write_begin(cpu);
@@ -261,7 +263,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz,
+ clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -335,12 +337,6 @@ int check_tsc_unstable(void)
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);
-int check_tsc_disabled(void)
-{
- return tsc_disabled;
-}
-EXPORT_SYMBOL_GPL(check_tsc_disabled);
-
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
@@ -665,19 +661,82 @@ success:
}
/**
- * native_calibrate_tsc - calibrate the tsc on boot
+ * native_calibrate_tsc
+ * Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
{
+ unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
+ unsigned int crystal_khz;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < 0x15)
+ return 0;
+
+ eax_denominator = ebx_numerator = ecx_hz = edx = 0;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+
+ if (ebx_numerator == 0 || eax_denominator == 0)
+ return 0;
+
+ crystal_khz = ecx_hz / 1000;
+
+ if (crystal_khz == 0) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ crystal_khz = 24000; /* 24.0 MHz */
+ break;
+ case INTEL_FAM6_SKYLAKE_X:
+ crystal_khz = 25000; /* 25.0 MHz */
+ break;
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ crystal_khz = 19200; /* 19.2 MHz */
+ break;
+ }
+ }
+
+ return crystal_khz * ebx_numerator / eax_denominator;
+}
+
+static unsigned long cpu_khz_from_cpuid(void)
+{
+ unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < 0x16)
+ return 0;
+
+ eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
+
+ cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+
+ return eax_base_mhz * 1000;
+}
+
+/**
+ * native_calibrate_cpu - calibrate the cpu on boot
+ */
+unsigned long native_calibrate_cpu(void)
+{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
- /* Calibrate TSC using MSR for Intel Atom SoCs */
- local_irq_save(flags);
- fast_calibrate = try_msr_calibrate_tsc();
- local_irq_restore(flags);
+ fast_calibrate = cpu_khz_from_cpuid();
+ if (fast_calibrate)
+ return fast_calibrate;
+
+ fast_calibrate = cpu_khz_from_msr();
if (fast_calibrate)
return fast_calibrate;
@@ -837,8 +896,12 @@ int recalibrate_cpu_khz(void)
if (!boot_cpu_has(X86_FEATURE_TSC))
return -ENODEV;
+ cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
- cpu_khz = tsc_khz;
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
cpu_khz_old, cpu_khz);
@@ -1193,6 +1256,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
(unsigned long)tsc_khz / 1000,
(unsigned long)tsc_khz % 1000);
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1244,8 +1310,18 @@ void __init tsc_init(void)
return;
}
+ cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
- cpu_khz = tsc_khz;
+
+ /*
+ * Trust non-zero tsc_khz as authorative,
+ * and use it to sanity check cpu_khz,
+ * which will be off if system timer is off.
+ */
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
if (!tsc_khz) {
mark_tsc_unstable("could not calculate TSC khz");
@@ -1265,7 +1341,7 @@ void __init tsc_init(void)
*/
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(cpu_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu);
}
if (tsc_disabled > 0)
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 9911a0620f9a..0fe720d64fef 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -1,14 +1,5 @@
/*
- * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
- *
- * TSC in Intel Atom SoC runs at a constant rate which can be figured
- * by this formula:
- * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
- * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5
- * for details.
- * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR
- * based calibration is the only option.
- *
+ * tsc_msr.c - TSC frequency enumeration via MSR
*
* Copyright (C) 2013 Intel Corporation
* Author: Bin Gao <bin.gao@intel.com>
@@ -22,18 +13,10 @@
#include <asm/apic.h>
#include <asm/param.h>
-/* CPU reference clock frequency: in KHz */
-#define FREQ_80 80000
-#define FREQ_83 83200
-#define FREQ_100 99840
-#define FREQ_133 133200
-#define FREQ_166 166400
-
-#define MAX_NUM_FREQS 8
+#define MAX_NUM_FREQS 9
/*
- * According to Intel 64 and IA-32 System Programming Guide,
- * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
* read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
* Unfortunately some Intel Atom SoCs aren't quite compliant to this,
* so we need manually differentiate SoC families. This is what the
@@ -48,17 +31,18 @@ struct freq_desc {
static struct freq_desc freq_desc_tables[] = {
/* PNW */
- { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } },
/* CLV+ */
- { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
- /* TNG */
- { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
- /* VLV2 */
- { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
- /* ANN */
- { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
- /* AIRMONT */
- { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } },
+ { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } },
+ /* TNG - Intel Atom processor Z3400 series */
+ { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } },
+ /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */
+ { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } },
+ /* ANN - Intel Atom processor Z3500 series */
+ { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } },
+ /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */
+ { 6, 0x4c, 1, { 83300, 100000, 133300, 116700,
+ 80000, 93300, 90000, 88900, 87500 } },
};
static int match_cpu(u8 family, u8 model)
@@ -79,16 +63,20 @@ static int match_cpu(u8 family, u8 model)
(freq_desc_tables[cpu_index].freqs[freq_id])
/*
- * Do MSR calibration only for known/supported CPUs.
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
*
- * Returns the calibration value or 0 if MSR calibration failed.
+ * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+ * Return processor base frequency in KHz, or 0 on failure.
*/
-unsigned long try_msr_calibrate_tsc(void)
+unsigned long cpu_khz_from_msr(void)
{
u32 lo, hi, ratio, freq_id, freq;
unsigned long res;
int cpu_index;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
if (cpu_index < 0)
return 0;
@@ -100,31 +88,17 @@ unsigned long try_msr_calibrate_tsc(void)
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
ratio = (hi >> 8) & 0x1f;
}
- pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
-
- if (!ratio)
- goto fail;
/* Get FSB FREQ ID */
rdmsr(MSR_FSB_FREQ, lo, hi);
freq_id = lo & 0x7;
freq = id_to_freq(cpu_index, freq_id);
- pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
- freq_id, freq);
- if (!freq)
- goto fail;
/* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
res = freq * ratio;
- pr_info("TSC runs at %lu KHz\n", res);
#ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
- pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
#endif
return res;
-
-fail:
- pr_warn("Fast TSC calibration using MSR failed\n");
- return 0;
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000000..a2456d4d286a
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,93 @@
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+ unsigned long *addr_p = unwind_get_return_address_ptr(state);
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+ addr_p);
+
+ return __kernel_text_address(addr) ? addr : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+static bool update_stack_state(struct unwind_state *state, void *addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If addr isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that 'info->next_sp' could point to an empty stack and 'addr' could
+ * be on a subsequent stack.
+ */
+ while (!on_stack(info, addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ next_bp = (unsigned long *)*state->bp;
+
+ /* make sure the next frame's data is accessible */
+ if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+ return false;
+
+ /* move to the next frame */
+ state->bp = next_bp;
+ return true;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /* don't even attempt to start from user mode regs */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ /* set up the starting stack frame */
+ state->bp = get_frame_pointer(task, regs);
+
+ /* initialize stack info and make sure the frame data is accessible */
+ get_stack_info(state->bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->bp < first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000000..9298993dc8b7
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,53 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ *state->sp, state->sp);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++)
+ if (__kernel_text_address(*state->sp))
+ return true;
+
+ state->sp = info->next_sp;
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = first_frame;
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ if (!__kernel_text_address(*first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c1ff31d99ff..495c776de4b4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
*cursor &= 0xfe;
}
/*
- * Similar treatment for VEX3 prefix.
- * TODO: add XOP/EVEX treatment when insn decoder supports them
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
*/
- if (insn->vex_prefix.nbytes == 3) {
+ if (insn->vex_prefix.nbytes >= 3) {
/*
* vex2: c5 rvvvvLpp (has no b bit)
* vex3/xop: c4/8f rxbmmmmm wvvvvLpp
* evex: 62 rxbR00mm wvvvv1pp zllBVaaa
- * (evex will need setting of both b and x since
- * in non-sib encoding evex.x is 4th bit of MODRM.rm)
- * Setting VEX3.b (setting because it has inverted meaning):
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
*/
cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
- *cursor |= 0x20;
+ *cursor |= 0x60;
}
/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
reg = MODRM_REG(insn); /* Fetch modrm.reg */
reg2 = 0xff; /* Fetch vex.vvvv */
- if (insn->vex_prefix.nbytes == 2)
- reg2 = insn->vex_prefix.bytes[1];
- else if (insn->vex_prefix.nbytes == 3)
+ if (insn->vex_prefix.nbytes)
reg2 = insn->vex_prefix.bytes[2];
/*
- * TODO: add XOP, EXEV vvvv reading.
+ * TODO: add XOP vvvv reading.
*
* vex.vvvv field is in bits 6-3, bits are inverted.
* But in 32-bit mode, high-order bit may be ignored.
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 3dce1ca0a653..01f30e56f99e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
{
- __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
- :"=r" (nr)
- :"m" (*bitmap), "r" (nr));
- return nr;
+ return test_bit(nr, bitmap->__map);
}
#define val_byte(val, n) (((__u8 *)&val)[n])
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9297a002d8e5..dbf67f64d5ec 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,6 +97,7 @@ SECTIONS
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
ENTRY_TEXT
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index cd05942bc918..b2cee3d19477 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -1,7 +1,8 @@
/* Exports for assembly files.
All C exports should go in the respective C files. */
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/spinlock_types.h>
#include <linux/smp.h>
#include <net/checksum.h>
@@ -37,13 +38,16 @@ EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user);
-EXPORT_SYMBOL_GPL(memcpy_mcsafe);
+EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
/*
* Export string functions. We normally rely on gcc builtin for most of these,
* but gcc sometimes decides not to inline them.
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index dad5fe9633a3..0bd9f1287f39 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -5,7 +5,7 @@
*/
#include <linux/init.h>
#include <linux/ioport.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pci.h>
#include <asm/bios_ebda.h>
@@ -91,7 +91,8 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
static int default_i8042_detect(void) { return 1; };
-struct x86_platform_ops x86_platform = {
+struct x86_platform_ops x86_platform __ro_after_init = {
+ .calibrate_cpu = native_calibrate_cpu,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
@@ -107,7 +108,7 @@ struct x86_platform_ops x86_platform = {
EXPORT_SYMBOL_GPL(x86_platform);
#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi = {
+struct x86_msi_ops x86_msi __ro_after_init = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
@@ -136,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
}
#endif
-struct x86_io_apic_ops x86_io_apic_ops = {
+struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = {
.read = native_io_apic_read,
.disable = native_disable_io_apic,
};