Diffstat (limited to 'arch/x86/kernel')
70 files changed, 1861 insertions, 1436 deletions
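
Before the individual hunks, a note on the central mechanism of this series: the new arch/x86/kernel/apic/init.c (further down) replaces indirect apic->callback invocations with static calls that are retargeted whenever an APIC driver is installed. The following self-contained C sketch models that pattern with plain function pointers, since DEFINE_STATIC_CALL_NULL()/static_call_update() are kernel-internal patching primitives; the names mirror the diff (apic_call_send_IPI, __apic_send_IPI, apic_install_driver), but the bodies are illustrative stand-ins, not the kernel implementation.

	/*
	 * Model of the dispatch pattern: every apic::callback gets a named
	 * call site which is repointed at the current driver's function
	 * when a driver is installed. The kernel patches these into direct
	 * calls; plain function pointers stand in here.
	 */
	#include <stdio.h>

	struct apic {
		const char *name;
		void (*send_IPI)(int cpu, int vector);
		void (*eoi)(void);
	};

	/* Stand-ins for DEFINE_STATIC_CALL_NULL(apic_call_send_IPI, ...) etc. */
	static void (*apic_call_send_IPI)(int cpu, int vector);
	static void (*apic_call_eoi)(void);

	/* Wrappers comparable to __apic_send_IPI() / apic_eoi() in the diff */
	static void __apic_send_IPI(int cpu, int vector)
	{
		apic_call_send_IPI(cpu, vector);
	}

	static void apic_eoi(void)
	{
		apic_call_eoi();
	}

	/* Comparable to apic_install_driver() + update_static_calls() */
	static void apic_install_driver(struct apic *driver)
	{
		apic_call_send_IPI = driver->send_IPI;
		apic_call_eoi = driver->eoi;
		printf("Switched APIC routing to: %s\n", driver->name);
	}

	static void flat_send_IPI(int cpu, int vector)
	{
		printf("IPI vector 0x%02x -> CPU%d\n", vector, cpu);
	}

	static void flat_eoi(void) { }

	static struct apic apic_flat = {
		.name		= "flat",
		.send_IPI	= flat_send_IPI,
		.eoi		= flat_eoi,
	};

	int main(void)
	{
		apic_install_driver(&apic_flat);
		__apic_send_IPI(1, 0xfd);
		apic_eoi();
		return 0;
	}

The payoff visible throughout the hunks below is that call sites such as lapic_timer_broadcast() or the IPI helpers switch from apic->send_IPI_mask(...) to __apic_send_IPI_mask(...), and the single point of truth for the active driver becomes apic_install_driver().
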
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 00df34c263cc..3269a0e23d3a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -48,6 +48,7 @@ obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o +obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o @@ -144,6 +145,10 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o +obj-$(CONFIG_X86_CET) += cet.o + +obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 53369c57751e..2a0ea38955df 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -170,7 +170,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) */ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { - unsigned int ver = 0; int cpu; if (id >= MAX_LOCAL_APIC) { @@ -183,10 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) return -EINVAL; } - if (boot_cpu_physical_apicid != -1U) - ver = boot_cpu_apic_version; - - cpu = generic_processor_info(id, ver); + cpu = generic_processor_info(id); if (cpu >= 0) early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; @@ -240,7 +236,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - if (!apic->apic_id_valid(apic_id)) { + if (!apic_id_valid(apic_id)) { if (enabled) pr_warn("x2apic entry ignored\n"); return 0; @@ -1182,7 +1178,7 @@ static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, acpi_mp_wake_mailbox_paddr = mp_wake->base_address; - acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); return 0; } @@ -1279,7 +1275,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* * if "noapic" boot option, don't look for IO-APICs */ - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index a6fcaf16cdbf..2ee867d796d9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -7,7 +7,7 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index af49e24b46a4..760adac3d1a8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -63,6 +63,8 @@ #include <asm/irq_regs.h> #include <asm/cpu.h> +#include "local.h" + unsigned int num_processors; unsigned disabled_cpus; @@ -74,11 +76,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * The highest APIC ID seen during enumeration. 
- */ -static unsigned int max_physical_apicid; - -/* * Bitmask of physically existing CPUs: */ physid_mask_t phys_cpu_present_map; @@ -104,26 +101,20 @@ static bool virt_ext_dest_id __ro_after_init; /* For parallel bootup. */ unsigned long apic_mmio_base __ro_after_init; +static inline bool apic_accessible(void) +{ + return x2apic_mode || apic_mmio_base; +} + /* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 - -/* - * On x86_32, the mapping between cpu and logical apicid may vary - * depending on apic in use. The following early percpu variable is - * used for the mapping. This is where the behaviors of x86_64 and 32 - * actually diverge. Let's keep it ugly for now. - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); - /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -179,8 +170,8 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -unsigned long mp_lapic_addr __ro_after_init; -int disable_apic __ro_after_init; +static unsigned long mp_lapic_addr __ro_after_init; +bool apic_is_disabled __ro_after_init; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ @@ -206,8 +197,6 @@ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); -static unsigned long apic_phys __ro_after_init; - /* * Get the LAPIC version */ @@ -247,31 +236,7 @@ static int modern_apic(void) */ static void __init apic_disable(void) { - pr_info("APIC: switched to apic NOOP\n"); - apic = &apic_noop; -} - -void native_apic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 native_safe_apic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - inc_irq_stat(icr_read_retry_count); - udelay(100); - } while (timeout++ < 1000); - - return send_status; + apic_install_driver(&apic_noop); } void native_apic_icr_write(u32 low, u32 id) @@ -537,7 +502,7 @@ static int lapic_timer_set_oneshot(struct clock_event_device *evt) static void lapic_timer_broadcast(const struct cpumask *mask) { #ifdef CONFIG_SMP - apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); + __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR); #endif } @@ -810,7 +775,7 @@ bool __init apic_needs_pit(void) return true; /* Is there an APIC at all or is it disabled? 
*/ - if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic) + if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled) return true; /* @@ -1110,7 +1075,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1134,8 +1099,7 @@ void clear_local_APIC(void) int maxlvt; u32 v; - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; maxlvt = lapic_get_maxlvt(); @@ -1225,8 +1189,7 @@ void apic_soft_disable(void) */ void disable_local_APIC(void) { - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) + if (!apic_accessible()) return; apic_soft_disable(); @@ -1299,7 +1262,7 @@ enum apic_intr_mode_id apic_intr_mode __ro_after_init; static int __init __apic_intr_mode_select(void) { /* Check kernel option */ - if (disable_apic) { + if (apic_is_disabled) { pr_info("APIC disabled via kernel command line\n"); return APIC_PIC; } @@ -1308,7 +1271,7 @@ static int __init __apic_intr_mode_select(void) #ifdef CONFIG_X86_64 /* On 64-bit, the APIC must be integrated, Check local APIC only */ if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; + apic_is_disabled = true; pr_info("APIC disabled by BIOS\n"); return APIC_PIC; } @@ -1317,16 +1280,15 @@ static int __init __apic_intr_mode_select(void) /* Neither 82489DX nor integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { - disable_apic = 1; + apic_is_disabled = true; return APIC_PIC; } /* If the BIOS pretends there is an integrated APIC ? */ if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(boot_cpu_apic_version)) { - disable_apic = 1; - pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", - boot_cpu_physical_apicid); + apic_is_disabled = true; + pr_err(FW_BUG "Local APIC not detected, force emulation\n"); return APIC_PIC; } #endif @@ -1347,12 +1309,6 @@ static int __init __apic_intr_mode_select(void) pr_info("APIC: SMP mode deactivated\n"); return APIC_SYMMETRIC_IO_NO_ROUTING; } - - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? */ - } #endif return APIC_SYMMETRIC_IO; @@ -1439,7 +1395,9 @@ void __init apic_intr_mode_init(void) break; } - default_setup_apic_routing(); + x86_64_probe_apic(); + + x86_32_install_bigsmp(); if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1521,7 +1479,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * per set bit. */ for_each_set_bit(bit, isr->map, APIC_IR_BITS) - ack_APIC_irq(); + apic_eoi(); return true; } @@ -1533,7 +1491,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * interrupt from previous kernel might still have ISR bit set. * * Most probably by now the CPU has serviced that pending interrupt and it - * might not have done the ack_APIC_irq() because it thought, interrupt + * might not have done the apic_eoi() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. 
It was noticed for timer irq (vector @@ -1567,7 +1525,7 @@ static void setup_local_APIC(void) int cpu = smp_processor_id(); unsigned int value; - if (disable_apic) { + if (apic_is_disabled) { disable_ioapic_support(); return; } @@ -1589,36 +1547,18 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - BUG_ON(!apic->apic_id_registered()); + /* Validate that the APIC is registered if required */ + BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * document number 292116). + * + * Except for APICs which operate in physical destination mode. */ - apic->init_apic_ldr(); - -#ifdef CONFIG_X86_32 - if (apic->dest_mode_logical) { - int logical_apicid, ldr_apicid; - - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches - * the actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - if (logical_apicid != BAD_APICID) - WARN_ON(logical_apicid != ldr_apicid); - /* Always use the value from LDR. */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; - } -#endif + if (apic->init_apic_ldr) + apic->init_apic_ldr(); /* * Set Task Priority to 'accept all except vectors 0-31'. An APIC @@ -1691,7 +1631,7 @@ static void setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { + if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { @@ -1748,6 +1688,25 @@ void apic_ap_setup(void) end_local_APIC_setup(); } +static __init void cpu_set_boot_apic(void); + +static __init void apic_read_boot_cpu_id(bool x2apic) +{ + /* + * This can be invoked from check_x2apic() before the APIC has been + * selected. But that code knows for sure that the BIOS enabled + * X2APIC. 
+ */ + if (x2apic) { + boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID); + boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR)); + } else { + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); + } + cpu_set_boot_apic(); +} + #ifdef CONFIG_X86_X2APIC int x2apic_mode; EXPORT_SYMBOL_GPL(x2apic_mode); @@ -1847,6 +1806,8 @@ void x2apic_setup(void) __x2apic_enable(); } +static __init void apic_set_fixmap(void); + static __init void x2apic_disable(void) { u32 x2apic_id, state = x2apic_state; @@ -1867,7 +1828,7 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - register_lapic_address(mp_lapic_addr); + apic_set_fixmap(); } static __init void x2apic_enable(void) @@ -1928,6 +1889,7 @@ void __init check_x2apic(void) x2apic_state = X2APIC_ON_LOCKED; else x2apic_state = X2APIC_ON; + apic_read_boot_cpu_id(true); } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } @@ -1943,7 +1905,7 @@ void __init check_x2apic(void) pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n"); pr_err("Disabling APIC, expect reduced performance and functionality.\n"); - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1956,7 +1918,7 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n"); return; } @@ -1994,19 +1956,19 @@ void __init enable_IR_x2apic(void) * On AMD64 we trust the BIOS - if it says no APIC it is likely * not correctly set up (usually the APIC timer won't work etc.) */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); - return -1; + return false; } - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return 0; + register_lapic_address(APIC_DEFAULT_PHYS_BASE); + return true; } #else -static int __init apic_verify(void) +static bool __init apic_verify(unsigned long addr) { u32 features, h, l; @@ -2017,28 +1979,28 @@ static int __init apic_verify(void) features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { pr_warn("Could not enable APIC!\n"); - return -1; + return false; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ if (boot_cpu_data.x86 >= 6) { rdmsr(MSR_IA32_APICBASE, l, h); if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + addr = l & MSR_IA32_APICBASE_BASE; } + register_lapic_address(addr); pr_info("Found and enabled local APIC!\n"); - return 0; + return true; } -int __init apic_force_enable(unsigned long addr) +bool __init apic_force_enable(unsigned long addr) { u32 h, l; - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; /* * Some BIOSes disable the local APIC in the APIC_BASE @@ -2055,17 +2017,17 @@ int __init apic_force_enable(unsigned long addr) enabled_via_apicbase = 1; } } - return apic_verify(); + return apic_verify(addr); } /* * Detect and initialize APIC */ -static int __init detect_init_APIC(void) +static bool __init detect_init_APIC(void) { /* Disabled by kernel option? 
*/ - if (disable_apic) - return -1; + if (apic_is_disabled) + return false; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -2092,22 +2054,22 @@ static int __init detect_init_APIC(void) if (!force_enable_local_apic) { pr_info("Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); - return -1; + return false; } - if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) - return -1; + if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return false; } else { - if (apic_verify()) - return -1; + if (!apic_verify(APIC_DEFAULT_PHYS_BASE)) + return false; } apic_pm_activate(); - return 0; + return true; no_apic: pr_info("No local APIC present or hardware disabled\n"); - return -1; + return false; } #endif @@ -2116,64 +2078,38 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned int new_apicid; - if (apic_validate_deadline_timer()) pr_info("TSC deadline timer available\n"); - if (x2apic_mode) { - boot_cpu_physical_apicid = read_apic_id(); + if (x2apic_mode) return; - } - /* If no local APIC can be found return early */ - if (!smp_found_config && detect_init_APIC()) { - /* lets NOP'ify apic operations */ - pr_info("APIC: disable apic facility\n"); - apic_disable(); - } else { - apic_phys = mp_lapic_addr; - - /* - * If the system has ACPI MADT tables or MP info, the LAPIC - * address is already registered. - */ - if (!acpi_lapic && !smp_found_config) - register_lapic_address(apic_phys); + if (!smp_found_config) { + if (!detect_init_APIC()) { + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } + num_processors = 1; } +} - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - new_apicid = read_apic_id(); - if (boot_cpu_physical_apicid != new_apicid) { - boot_cpu_physical_apicid = new_apicid; - /* - * yeah -- we lie about apic_version - * in case if apic was disabled via boot option - * but it's not a problem for SMP compiled kernel - * since apic_intr_mode_select is prepared for such - * a case and disable smp mode - */ - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } +static __init void apic_set_fixmap(void) +{ + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + apic_mmio_base = APIC_BASE; + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_mmio_base, mp_lapic_addr); + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) { + /* This should only happen once */ + WARN_ON_ONCE(mp_lapic_addr); mp_lapic_addr = address; - if (!x2apic_mode) { - set_fixmap_nocache(FIX_APIC_BASE, address); - apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, address); - } - if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); - } + if (!x2apic_mode) + apic_set_fixmap(); } /* @@ -2210,7 +2146,7 @@ static noinline void handle_spurious_interrupt(u8 vector) if (v & (1 << (vector & 0x1f))) { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", vector, smp_processor_id()); - ack_APIC_irq(); + apic_eoi(); } else { pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", vector, smp_processor_id()); @@ -2261,7 +2197,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. 
*/ apic_write(APIC_ESR, 0); v = apic_read(APIC_ESR); - ack_APIC_irq(); + apic_eoi(); atomic_inc(&irq_err_count); apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", @@ -2446,54 +2382,43 @@ static int allocate_logical_cpuid(int apicid) return nr_logical_cpuids++; } -int generic_processor_info(int apicid, int version) +static void cpu_update_apic(int cpu, int apicid) { - int cpu, max = nr_cpu_ids; - bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, - phys_cpu_present_map); +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; +#endif + set_cpu_possible(cpu, true); + physid_set(apicid, phys_cpu_present_map); + set_cpu_present(cpu, true); + num_processors++; - /* - * boot_cpu_physical_apicid is designed to have the apicid - * returned by read_apic_id(), i.e, the apicid of the - * currently booting-up processor. However, on some platforms, - * it is temporarily modified by the apicid reported as BSP - * through MP table. Concretely: - * - * - arch/x86/kernel/mpparse.c: MP_processor_info() - * - arch/x86/mm/amdtopology.c: amd_numa_init() - * - * This function is executed with the modified - * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel - * parameter doesn't work to disable APs on kdump 2nd kernel. - * - * Since fixing handling of boot_cpu_physical_apicid requires - * another discussion and tests on each platform, we leave it - * for now and here we use read_apic_id() directly in this - * function, generic_processor_info(). - */ - if (disabled_cpu_apicid != BAD_APICID && - disabled_cpu_apicid != read_apic_id() && - disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; + if (system_state != SYSTEM_BOOTING) + cpu_mark_primary_thread(cpu, apicid); +} - pr_warn("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", thiscpu, apicid); +static __init void cpu_set_boot_apic(void) +{ + cpuid_to_apicid[0] = boot_cpu_physical_apicid; + cpu_update_apic(0, boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); +} - disabled_cpus++; - return -ENODEV; - } +int generic_processor_info(int apicid) +{ + int cpu, max = nr_cpu_ids; - /* - * If boot cpu has not been detected yet, then only allow upto - * nr_cpu_ids - 1 processors and keep one slot free for boot cpu - */ - if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && - apicid != boot_cpu_physical_apicid) { - int thiscpu = max + disabled_cpus - 1; + /* The boot CPU must be set before MADT/MPTABLE parsing happens */ + if (cpuid_to_apicid[0] == BAD_APICID) + panic("Boot CPU APIC not registered yet\n"); - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" - " reached. Keeping one slot for boot cpu." - " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + if (apicid == boot_cpu_physical_apicid) + return 0; + + if (disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", + thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2509,66 +2434,16 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - * boot_cpu_init() already hold bit 0 in cpu_present_mask - * for BSP. - */ - cpu = 0; - - /* Logical cpuid 0 is reserved for BSP. 
*/ - cpuid_to_apicid[0] = apicid; - } else { - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - } - - /* - * Validate version - */ - if (version == 0x0) { - pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); - version = 0x10; - } - - if (version != boot_cpu_apic_version) { - pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - boot_cpu_apic_version, cpu, version); + cpu = allocate_logical_cpuid(apicid); + if (cpu < 0) { + disabled_cpus++; + return -EINVAL; } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; - early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; -#endif -#ifdef CONFIG_X86_32 - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - apic->x86_32_early_logical_apicid(cpu); -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); - + cpu_update_apic(cpu, apicid); return cpu; } -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) @@ -2610,47 +2485,10 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); -#ifdef CONFIG_X86_64 -void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) - (*drv)->wakeup_secondary_cpu_64 = handler; -} -#endif - -/* - * Override the generic EOI implementation with an optimized version. - * Only called during early boot when only one CPU is active and with - * interrupts disabled, so we know this does not race with actual APIC driver - * use. - */ -void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - /* Should happen once for each apic */ - WARN_ON((*drv)->eoi_write == eoi_write); - (*drv)->native_eoi_write = (*drv)->eoi_write; - (*drv)->eoi_write = eoi_write; - } -} - static void __init apic_bsp_up_setup(void) { #ifdef CONFIG_X86_64 apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); } @@ -2919,7 +2757,7 @@ int apic_is_clustered_box(void) */ static int __init setup_disableapic(char *arg) { - disable_apic = 1; + apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } @@ -2956,11 +2794,11 @@ early_param("nolapic_timer", parse_nolapic_timer); static int __init apic_set_verbosity(char *arg) { if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; + if (IS_ENABLED(CONFIG_X86_32)) + return -EINVAL; + + ioapic_is_disabled = false; return 0; -#endif - return -EINVAL; } if (strcmp("debug", arg) == 0) @@ -2981,11 +2819,11 @@ early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { - if (!apic_phys) + if (!apic_mmio_base) return -1; /* Put local APIC into the resource map. 
*/ - lapic_resource.start = apic_phys; + lapic_resource.start = apic_mmio_base; lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; insert_resource(&iomem_resource, &lapic_resource); diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 02b4839478b1..7bc5d9bf59cd 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -6,6 +6,8 @@ #include <linux/irq.h> #include <asm/apic.h> +#include "local.h" + u32 apic_default_calc_apicid(unsigned int cpu) { return per_cpu(x86_cpu_to_apicid, cpu); @@ -29,18 +31,27 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) int default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + return (int)per_cpu(x86_cpu_to_apicid, mps_cpu); else return BAD_APICID; } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -int default_check_phys_apicid_present(int phys_apicid) +bool default_apic_id_registered(void) { - return physid_isset(phys_apicid, phys_cpu_present_map); + return physid_isset(read_apic_id(), phys_cpu_present_map); } -int default_apic_id_valid(u32 apicid) +/* + * Set up the logical destination ID when the APIC operates in logical + * destination mode. + */ +void default_init_apic_ldr(void) { - return (apicid < 255); + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 8f72b4351c9f..032a84e2c3cc 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -28,26 +28,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... 
- */ -void flat_init_apic_ldr(void) -{ - unsigned long val; - unsigned long num, id; - - num = smp_processor_id(); - id = 1UL << num; - apic_write(APIC_DFR, APIC_DFR_FLAT); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; @@ -86,16 +66,6 @@ static u32 set_apic_id(unsigned int id) return (id & 0xFF) << 24; } -static unsigned int read_xapic_id(void) -{ - return flat_get_apic_id(apic_read(APIC_ID)); -} - -static int flat_apic_id_registered(void) -{ - return physid_isset(read_xapic_id(), phys_cpu_present_map); -} - static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -110,23 +80,18 @@ static struct apic apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, + .init_apic_ldr = default_init_apic_ldr, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -139,15 +104,13 @@ static struct apic apic_flat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* @@ -178,22 +141,9 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_init_apic_ldr(void) -{ - /* - * LDR and DFR are not involved in physflat mode, rather: - * "In physical destination mode, the destination processor is - * specified by its local APIC ID [...]." 
(Intel SDM, 10.6.2.1) - */ -} - static int physflat_probe(void) { - if (apic == &apic_physflat || num_possible_cpus() > 8 || - jailhouse_paravirt()) - return 1; - - return 0; + return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); } static struct apic apic_physflat __ro_after_init = { @@ -201,8 +151,7 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = flat_apic_id_registered, + .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -210,14 +159,11 @@ static struct apic apic_physflat __ro_after_init = { .disable_esr = 0, .check_apicid_used = NULL, - .init_apic_ldr = physflat_init_apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = flat_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, @@ -230,15 +176,13 @@ static struct apic apic_physflat __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; /* diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index fe78319e0f7a..966d7cf10b95 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -8,92 +8,42 @@ * Though in case if apic is disabled (for some reason) we try * to not uglify the caller's code and allow to call (some) apic routines * like self-ipi, etc... + * + * FIXME: Remove this gunk. The above argument which was intentionally left + * in place is silly to begin with because none of the callbacks except for + * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh... 
*/ #include <linux/cpumask.h> #include <linux/thread_info.h> #include <asm/apic.h> -static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } -static void noop_apic_wait_icr_idle(void) { } static void noop_apic_icr_write(u32 low, u32 id) { } - -static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - return -1; -} - -static u32 noop_safe_apic_wait_icr_idle(void) -{ - return 0; -} - -static u64 noop_apic_icr_read(void) -{ - return 0; -} - -static int noop_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return 0; -} - -static unsigned int noop_get_apic_id(unsigned long x) -{ - return 0; -} - -static int noop_probe(void) -{ - /* - * NOOP apic should not ever be - * enabled via probe routine - */ - return 0; -} - -static int noop_apic_id_registered(void) -{ - /* - * if we would be really "pedantic" - * we should pass read_apic_id() here - * but since NOOP suppose APIC ID = 0 - * lets save a few cycles - */ - return physid_isset(0, phys_cpu_present_map); -} +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; } +static u64 noop_apic_icr_read(void) { return 0; } +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; } +static unsigned int noop_get_apic_id(unsigned long x) { return 0; } +static void noop_apic_eoi(void) { } static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); return 0; } -static void noop_apic_write(u32 reg, u32 v) -{ - WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); -} - -#ifdef CONFIG_X86_32 -static int noop_x86_32_early_logical_apicid(int cpu) +static void noop_apic_write(u32 reg, u32 val) { - return BAD_APICID; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled); } -#endif struct apic apic_noop __ro_after_init = { .name = "noop", - .probe = noop_probe, - .acpi_madt_oem_check = NULL, - - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = noop_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -101,18 +51,13 @@ struct apic apic_noop __ro_after_init = { .disable_esr = 0, .check_apicid_used = default_check_apicid_used, - .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = noop_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -125,17 +70,9 @@ struct apic apic_noop __ro_after_init = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - .inquire_remote_apic = NULL, - .read = noop_apic_read, .write = noop_apic_write, - .eoi_write = noop_apic_write, + .eoi = noop_apic_eoi, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, - .wait_icr_idle = noop_apic_wait_icr_idle, - .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, - -#ifdef CONFIG_X86_32 - .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, -#endif 
}; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a54d817eb4b6..63f3d7be9dc7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -56,17 +56,6 @@ static u32 numachip2_set_apic_id(unsigned int id) return id << 24; } -static int numachip_apic_id_valid(u32 apicid) -{ - /* Trust what bootloader passes in MADT */ - return 1; -} - -static int numachip_apic_id_registered(void) -{ - return 1; -} - static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) { return initial_apic_id >> index_msb; @@ -228,38 +217,20 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -/* APIC IPIs are queued */ -static void numachip_apic_wait_icr_idle(void) -{ -} - -/* APIC NMI IPIs are queued */ -static u32 numachip_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", .probe = numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, @@ -273,15 +244,12 @@ static const struct apic apic_numachip1 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; apic_driver(apic_numachip1); @@ -290,23 +258,16 @@ static const struct apic apic_numachip2 __refconst = { .name = "NumaConnect2 system", .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .apic_id_valid = numachip_apic_id_valid, - .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, @@ -320,15 +281,12 @@ static const struct apic apic_numachip2 __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = numachip_apic_wait_icr_idle, - .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; 
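
Between the driver tables it is worth spelling out the convention that lets all the stub callbacks above disappear: setup_local_APIC() (see the apic.c hunk earlier) now treats apic_id_registered() and init_apic_ldr() as optional and guards them with a NULL check, so drivers such as numachip or physflat simply omit callbacks that do not apply to them. A minimal plain-C sketch of that guard pattern, using a hypothetical mini ops struct rather than the kernel's struct apic:

	#include <assert.h>
	#include <stdbool.h>

	struct apic_ops {
		bool (*apic_id_registered)(void);
		void (*init_apic_ldr)(void);	/* NULL in physical destination mode */
	};

	static void setup_local_apic(const struct apic_ops *ops)
	{
		/* Validate registration only if the driver provides the check */
		assert(!ops->apic_id_registered || ops->apic_id_registered());

		/* LDR/DFR setup is skipped for physical-mode drivers */
		if (ops->init_apic_ldr)
			ops->init_apic_ldr();
	}

	int main(void)
	{
		static const struct apic_ops physflat_like = { 0 };	/* all optional */

		setup_local_apic(&physflat_like);
		return 0;
	}
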
apic_driver(apic_numachip2); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77555f66c14d..0e5535add4b5 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,56 +18,17 @@ static unsigned bigsmp_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static int bigsmp_apic_id_registered(void) -{ - return 1; -} - static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return false; } -static int bigsmp_early_logical_apicid(int cpu) -{ - /* on bigsmp, logical apicid is the same as physical */ - return early_per_cpu(x86_cpu_to_apicid, cpu); -} - -/* - * bigsmp enables physical destination mode - * and doesn't use LDR and DFR - */ -static void bigsmp_init_apic_ldr(void) -{ -} - -static void bigsmp_setup_apic_routing(void) -{ - printk(KERN_INFO - "Enabling APIC mode: Physflat. Using %d I/O APICs\n", - nr_ioapics); -} - -static int bigsmp_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids) - return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); - - return BAD_APICID; -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ physids_promote(0xFFL, retmap); } -static int bigsmp_check_phys_apicid_present(int phys_apicid) -{ - return 1; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -111,21 +72,13 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { static int probe_bigsmp(void) { - if (def_to_bigsmp) - dmi_bigsmp = 1; - else - dmi_check_system(bigsmp_dmi_table); - - return dmi_bigsmp; + return dmi_check_system(bigsmp_dmi_table); } static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, - .apic_id_registered = bigsmp_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, @@ -133,14 +86,11 @@ static struct apic apic_bigsmp __ro_after_init = { .disable_esr = 1, .check_apicid_used = bigsmp_check_apicid_used, - .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, - .setup_apic_routing = bigsmp_setup_apic_routing, - .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = bigsmp_check_phys_apicid_present, + .cpu_present_to_apicid = default_cpu_present_to_apicid, .phys_pkg_id = bigsmp_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, @@ -153,37 +103,24 @@ static struct apic apic_bigsmp __ro_after_init = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; -void __init generic_bigsmp_probe(void) +bool __init apic_bigsmp_possible(bool cmdline_override) { - unsigned int cpu; - - if (!probe_bigsmp()) - return; - - apic = &apic_bigsmp; - - for_each_possible_cpu(cpu) { - if 
(early_per_cpu(x86_cpu_to_logical_apicid, - cpu) == BAD_APICID) - continue; - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = - bigsmp_early_logical_apicid(cpu); - } + return apic == &apic_bigsmp || !cmdline_override; +} - pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); +void __init apic_bigsmp_force(void) +{ + if (apic != &apic_bigsmp) + apic_install_driver(&apic_bigsmp); } apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index d6e01f924299..45af535c44a0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -21,6 +21,8 @@ #include <linux/init.h> #include <linux/delay.h> +#include "local.h" + #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { @@ -31,7 +33,7 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu) diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c new file mode 100644 index 000000000000..821e2e536f19 --- /dev/null +++ b/arch/x86/kernel/apic/init.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) "APIC: " fmt + +#include <asm/apic.h> + +#include "local.h" + +/* + * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions + * for each callback. The callbacks are setup during boot and all except + * wait_icr_idle() must be initialized before usage. The IPI wrappers + * use static_call() and not static_call_cond() to catch any fails. + */ +#define DEFINE_APIC_CALL(__cb) \ + DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb) + +DEFINE_APIC_CALL(eoi); +DEFINE_APIC_CALL(native_eoi); +DEFINE_APIC_CALL(icr_read); +DEFINE_APIC_CALL(icr_write); +DEFINE_APIC_CALL(read); +DEFINE_APIC_CALL(send_IPI); +DEFINE_APIC_CALL(send_IPI_mask); +DEFINE_APIC_CALL(send_IPI_mask_allbutself); +DEFINE_APIC_CALL(send_IPI_allbutself); +DEFINE_APIC_CALL(send_IPI_all); +DEFINE_APIC_CALL(send_IPI_self); +DEFINE_APIC_CALL(wait_icr_idle); +DEFINE_APIC_CALL(wakeup_secondary_cpu); +DEFINE_APIC_CALL(wakeup_secondary_cpu_64); +DEFINE_APIC_CALL(write); + +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask); +EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self); + +/* The container for function call overrides */ +struct apic_override __x86_apic_override __initdata; + +#define apply_override(__cb) \ + if (__x86_apic_override.__cb) \ + apic->__cb = __x86_apic_override.__cb + +static __init void restore_override_callbacks(void) +{ + apply_override(eoi); + apply_override(native_eoi); + apply_override(write); + apply_override(read); + apply_override(send_IPI); + apply_override(send_IPI_mask); + apply_override(send_IPI_mask_allbutself); + apply_override(send_IPI_allbutself); + apply_override(send_IPI_all); + apply_override(send_IPI_self); + apply_override(icr_read); + apply_override(icr_write); + apply_override(wakeup_secondary_cpu); + apply_override(wakeup_secondary_cpu_64); +} + +#define update_call(__cb) \ + static_call_update(apic_call_##__cb, *apic->__cb) + +static __init void update_static_calls(void) +{ + update_call(eoi); + update_call(native_eoi); + update_call(write); + update_call(read); + update_call(send_IPI); + update_call(send_IPI_mask); + update_call(send_IPI_mask_allbutself); + update_call(send_IPI_allbutself); + update_call(send_IPI_all); + 
update_call(send_IPI_self); + update_call(icr_read); + update_call(icr_write); + update_call(wait_icr_idle); + update_call(wakeup_secondary_cpu); + update_call(wakeup_secondary_cpu_64); +} + +void __init apic_setup_apic_calls(void) +{ + /* Ensure that the default APIC has native_eoi populated */ + apic->native_eoi = apic->eoi; + update_static_calls(); + pr_info("Static calls initialized\n"); +} + +void __init apic_install_driver(struct apic *driver) +{ + if (apic == driver) + return; + + apic = driver; + + if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid) + apic->max_apic_id = x2apic_max_apicid; + + /* Copy the original eoi() callback as KVM/HyperV might overwrite it */ + if (!apic->native_eoi) + apic->native_eoi = apic->eoi; + + /* Apply any already installed callback overrides */ + restore_override_callbacks(); + update_static_calls(); + + pr_info("Switched APIC routing to: %s\n", driver->name); +} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4241dc243aa8..00da6cf6b07d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -178,7 +178,7 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -int skip_ioapic_setup; +bool ioapic_is_disabled __ro_after_init; /** * disable_ioapic_support() - disables ioapic support at runtime @@ -189,7 +189,7 @@ void disable_ioapic_support(void) noioapicquirk = 1; noioapicreroute = -1; #endif - skip_ioapic_setup = 1; + ioapic_is_disabled = true; } static int __init parse_noapic(char *str) @@ -831,7 +831,7 @@ static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) { int ioapic, pin, idx; - if (skip_ioapic_setup) + if (ioapic_is_disabled) return -1; ioapic = mp_find_ioapic(gsi); @@ -1366,7 +1366,7 @@ void __init enable_IO_APIC(void) int i8259_apic, i8259_pin; int apic, pin; - if (skip_ioapic_setup) + if (ioapic_is_disabled) nr_ioapics = 0; if (!nr_legacy_irqs() || !nr_ioapics) @@ -1511,13 +1511,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) physid_set(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - physid_mask_t tmp; - apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), - &tmp); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); - physids_or(phys_id_present_map, phys_id_present_map, tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* @@ -1827,7 +1823,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. */ - ack_APIC_irq(); + apic_eoi(); /* * Tail end of clearing remote IRR bit (either by delivering the EOI @@ -2050,7 +2046,7 @@ static void unmask_lapic_irq(struct irq_data *data) static void ack_lapic_irq(struct irq_data *data) { - ack_APIC_irq(); + apic_eoi(); } static struct irq_chip lapic_chip __read_mostly = { @@ -2095,7 +2091,7 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); - apic_id = hard_smp_processor_id(); + apic_id = read_apic_id(); memset(&entry1, 0, sizeof(entry1)); entry1.dest_mode_logical = true; @@ -2399,7 +2395,7 @@ void __init setup_IO_APIC(void) { int ioapic; - if (skip_ioapic_setup || !nr_ioapics) + if (ioapic_is_disabled || !nr_ioapics) return; io_apic_irqs = nr_legacy_irqs() ? 
~PIC_IRQS : ~0UL; @@ -2546,7 +2542,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - apic->apicid_to_cpu_present(apic_id, &tmp); + physid_set_mask_of_physid(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { @@ -2715,7 +2711,7 @@ void __init io_apic_init_mappings(void) "address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; - skip_ioapic_setup = 1; + ioapic_is_disabled = true; goto fake_ioapic_page; } #endif diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 9bfd6e397384..a44ba7209ef3 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> +#include <linux/delay.h> #include <linux/smp.h> + #include <asm/io_apic.h> #include "local.h" @@ -52,9 +54,9 @@ void apic_send_IPI_allbutself(unsigned int vector) return; if (static_branch_likely(&apic_use_ipi_shorthand)) - apic->send_IPI_allbutself(vector); + __apic_send_IPI_allbutself(vector); else - apic->send_IPI_mask_allbutself(cpu_online_mask, vector); + __apic_send_IPI_mask_allbutself(cpu_online_mask, vector); } /* @@ -68,12 +70,12 @@ void native_smp_send_reschedule(int cpu) WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } - apic->send_IPI(cpu, RESCHEDULE_VECTOR); + __apic_send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); + __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) @@ -85,14 +87,14 @@ void native_send_call_func_ipi(const struct cpumask *mask) goto sendmask; if (cpumask_test_cpu(cpu, mask)) - apic->send_IPI_all(CALL_FUNCTION_VECTOR); + __apic_send_IPI_all(CALL_FUNCTION_VECTOR); else if (num_online_cpus() > 1) - apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR); return; } sendmask: - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } #endif /* CONFIG_SMP */ @@ -102,74 +104,77 @@ static inline int __prepare_ICR2(unsigned int mask) return SET_XAPIC_DEST_FIELD(mask); } -static inline void __xapic_wait_icr_idle(void) +u32 apic_mem_wait_icr_idle_timeout(void) +{ + int cnt; + + for (cnt = 0; cnt < 1000; cnt++) { + if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY)) + return 0; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } + return APIC_ICR_BUSY; +} + +void apic_mem_wait_icr_idle(void) { while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector) +/* + * This is safe against interruption because it only writes the lower 32 + * bits of the APIC_ICR register. The destination field is ignored for + * short hand IPIs. + * + * wait_icr_idle() + * write(ICR2, dest) + * NMI + * wait_icr_idle() + * write(ICR) + * wait_icr_idle() + * write(ICR) + * + * This function does not need to disable interrupts as there is no ICR2 + * interaction. The memory write is direct except when the machine is + * affected by the 11AP Pentium erratum, which turns the plain write into + * an XCHG operation. + */ +static void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. 
Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. + * Wait for the previous ICR command to complete. Use + * safe_apic_wait_icr_idle() for the NMI vector as there have been + * issues where otherwise the system hangs when the panic CPU tries + * to stop the others before launching the kdump kernel. */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * No need to touch the target chip field. Also the destination - * mode is ignored when a shorthand is used. - */ - cfg = __prepare_ICR(shortcut, vector, 0); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Destination field (ICR2) and the destination mode are ignored */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0)); } /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +void __default_send_IPI_dest_field(unsigned int dest_mask, int vector, + unsigned int dest_mode) { - unsigned long cfg; - - /* - * Wait for idle. - */ + /* See comment in __default_send_IPI_shortcut() */ if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); + apic_mem_wait_icr_idle_timeout(); else - __xapic_wait_icr_idle(); + apic_mem_wait_icr_idle(); - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); + /* Set the IPI destination field in the ICR */ + native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask)); + /* Send it with the proper destination mode */ + native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode)); } void default_send_IPI_single_phys(int cpu, int vector) @@ -184,18 +189,13 @@ void default_send_IPI_single_phys(int cpu, int vector) void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { - unsigned long query_cpu; unsigned long flags; + unsigned long cpu; - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicast to each CPU instead. 
- * - mbligh - */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { + for_each_cpu(cpu, mask) { __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -203,18 +203,15 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - unsigned int query_cpu; + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - /* See Hack comment above */ - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, - query_cpu), vector, APIC_DEST_PHYSICAL); + cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } @@ -224,7 +221,7 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, */ void default_send_IPI_single(int cpu, int vector) { - apic->send_IPI_mask(cpumask_of(cpu), vector); + __apic_send_IPI_mask(cpumask_of(cpu), vector); } void default_send_IPI_allbutself(int vector) @@ -243,50 +240,32 @@ void default_send_IPI_self(int vector) } #ifdef CONFIG_X86_32 - -void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector) +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { unsigned long flags; - unsigned int query_cpu; - - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicasts to each CPU instead. This - * should be modified to do 1 message per cluster ID - mbligh - */ + unsigned int cpu; local_irq_save(flags); - for_each_cpu(query_cpu, mask) - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); + for_each_cpu(cpu, mask) + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector) { + unsigned int cpu, this_cpu = smp_processor_id(); unsigned long flags; - unsigned int query_cpu; - unsigned int this_cpu = smp_processor_id(); - - /* See Hack comment above */ local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) + for_each_cpu(cpu, mask) { + if (cpu == this_cpu) continue; - __default_send_IPI_dest_field( - early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL); + } local_irq_restore(flags); } -/* - * This is only used on smaller machines. 
- */ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; @@ -302,7 +281,6 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) } #ifdef CONFIG_SMP -/* must come after the send_IPI functions above for inlining */ static int convert_apicid_to_cpu(int apic_id) { int i; @@ -321,7 +299,7 @@ int safe_smp_processor_id(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; - apicid = hard_smp_processor_id(); + apicid = read_apic_id(); if (apicid == BAD_APICID) return 0; diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index a997d849509a..ec219c659c7d 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -13,18 +13,16 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -/* APIC flat 64 */ -void flat_init_apic_ldr(void); - /* X2APIC */ -int x2apic_apic_id_valid(u32 apicid); -int x2apic_apic_id_registered(void); void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); unsigned int x2apic_get_apic_id(unsigned long id); u32 x2apic_set_apic_id(unsigned int id); int x2apic_phys_pkg_id(int initial_apicid, int index_msb); + +void x2apic_send_IPI_all(int vector); +void x2apic_send_IPI_allbutself(int vector); void x2apic_send_IPI_self(int vector); -void __x2apic_send_IPI_shorthand(int vector, u32 which); +extern u32 x2apic_max_apicid; /* IPI */ @@ -46,7 +44,10 @@ static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, return icr; } -void __default_send_IPI_shortcut(unsigned int shortcut, int vector); +void default_init_apic_ldr(void); + +void apic_mem_wait_icr_idle(void); +u32 apic_mem_wait_icr_idle_timeout(void); /* * This is used to send an IPI with no shorthand notation (the destination is @@ -62,8 +63,23 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); +bool default_apic_id_registered(void); + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +void x86_32_probe_bigsmp_early(void); +void x86_32_install_bigsmp(void); +#else +static inline void x86_32_probe_bigsmp_early(void) { } +static inline void x86_32_install_bigsmp(void) { } +#endif + +#ifdef CONFIG_X86_BIGSMP +bool apic_bigsmp_possible(bool cmdline_selected); +void apic_bigsmp_force(void); +#else +static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; +static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 35d5b8fb18ef..6b6b711678fe 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static const struct msi_parent_ops x86_vector_msi_parent_ops = { struct irq_domain * __init native_create_pci_msi_domain(void) { - if (disable_apic) + if (apic_is_disabled) return NULL; x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index a61f642b1b90..9a06df6cdd68 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,46 +10,14 @@ #include <linux/errno.h> #include <linux/smp.h> +#include <xen/xen.h> + #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/acpi.h> #include "local.h" -static int 
default_x86_32_early_logical_apicid(int cpu) -{ - return 1 << cpu; -} - -static void setup_apic_flat_routing(void) -{ -#ifdef CONFIG_X86_IO_APIC - printk(KERN_INFO - "Enabling APIC mode: Flat. Using %d I/O APICs\n", - nr_ioapics); -#endif -} - -static int default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - -/* - * Set up the logical destination ID. Intel recommends to set DFR, LDR and - * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" - * (Intel document number 292116). - */ -static void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - static int default_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -65,8 +33,6 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .acpi_madt_oem_check = NULL, - .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, @@ -77,14 +43,11 @@ static struct apic apic_default __ro_after_init = { .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = default_phys_pkg_id, + .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_flat_calc_apicid, @@ -95,17 +58,13 @@ static struct apic apic_default __ro_after_init = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .inquire_remote_apic = default_inquire_remote_apic, - .read = native_apic_mem_read, .write = native_apic_mem_write, - .eoi_write = native_apic_mem_write, + .eoi = native_apic_mem_eoi, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, - - .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, + .wait_icr_idle = apic_mem_wait_icr_idle, + .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; apic_driver(apic_default); @@ -123,7 +82,7 @@ static int __init parse_apic(char *arg) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if (!strcmp((*drv)->name, arg)) { - apic = *drv; + apic_install_driver(*drv); cmdline_apic = 1; return 0; } @@ -134,49 +93,43 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init default_setup_apic_routing(void) +void __init x86_32_probe_bigsmp_early(void) { - int version = boot_cpu_apic_version; + if (nr_cpu_ids <= 8 || xen_pv_domain()) + return; - if (num_possible_cpus() > 8) { + if (IS_ENABLED(CONFIG_X86_BIGSMP)) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; + if (!APIC_XAPIC(boot_cpu_apic_version)) break; - } /* P4 and above */ fallthrough; case X86_VENDOR_HYGON: case X86_VENDOR_AMD: - def_to_bigsmp = 1; + if (apic_bigsmp_possible(cmdline_apic)) + return; + break; } } + pr_info("Limiting to 8 possible CPUs\n"); + set_nr_cpu_ids(8); +} -#ifdef CONFIG_X86_BIGSMP - /* - * This is used to switch to bigsmp mode when - * - There is no apic= 
option specified by the user - * - generic_apic_probe() has chosen apic_default as the sub_arch - * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support - */ - - if (!cmdline_apic && apic == &apic_default) - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); +void __init x86_32_install_bigsmp(void) +{ + if (nr_cpu_ids > 8 && !xen_pv_domain()) + apic_bigsmp_force(); } -void __init generic_apic_probe(void) +void __init x86_32_probe_apic(void) { if (!cmdline_apic) { struct apic **drv; for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe()) { - apic = *drv; + apic_install_driver(*drv); break; } } @@ -184,26 +137,4 @@ void __init generic_apic_probe(void) if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } - printk(KERN_INFO "Using APIC driver %s\n", apic->name); -} - -/* This function can switch the APIC even after the initial ->probe() */ -int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - struct apic **drv; - - for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { - if (!(*drv)->acpi_madt_oem_check) - continue; - if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) - continue; - - if (!cmdline_apic) { - apic = *drv; - printk(KERN_INFO "Switched to APIC driver `%s'.\n", - apic->name); - } - return 1; - } - return 0; } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c46720f185c0..ecdf0c4121e1 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -13,10 +13,8 @@ #include "local.h" -/* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. - */ -void __init default_setup_apic_routing(void) +/* Select the appropriate APIC driver */ +void __init x86_64_probe_apic(void) { struct apic **drv; @@ -24,11 +22,7 @@ void __init default_setup_apic_routing(void) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->probe && (*drv)->probe()) { - if (apic != *drv) { - apic = *drv; - pr_info("Switched APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); break; } } @@ -40,11 +34,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { - if (apic != *drv) { - apic = *drv; - pr_info("Setting APIC routing to %s.\n", - apic->name); - } + apic_install_driver(*drv); return 1; } } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c1efebd27e6c..319448d87b99 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; static struct irq_matrix *vector_matrix; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct hlist_head, cleanup_list); + +static void vector_cleanup_callback(struct timer_list *tmr); + +struct vector_cleanup { + struct hlist_head head; + struct timer_list timer; +}; + +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = { + .head = HLIST_HEAD_INIT, + .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED), +}; #endif void lock_vector_lock(void) @@ -536,7 +547,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_data *irqd; int i, err, node; - if (disable_apic) + if (apic_is_disabled) return -ENXIO; /* @@ -680,7 +691,7 @@ static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, * if IRQ 
remapping is enabled. APIC IDs above 15 bits are * only permitted if IRQ remapping is enabled, so check that. */ - if (apic->apic_id_valid(32768)) + if (apic_id_valid(32768)) return 0; return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); @@ -841,10 +852,21 @@ void lapic_online(void) this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); } +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr); + void lapic_offline(void) { + struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup); + lock_vector_lock(); + + /* In case the vector cleanup timer has not expired */ + __vector_cleanup(cl, false); + irq_matrix_offline(vector_matrix); + WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(!hlist_empty(&cl->head)); + unlock_vector_lock(); } @@ -876,7 +898,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI(apicd->cpu, apicd->vector); + __apic_send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -885,7 +907,7 @@ static int apic_retrigger_irq(struct irq_data *irqd) void apic_ack_irq(struct irq_data *irqd) { irq_move_irq(irqd); - ack_APIC_irq(); + apic_eoi(); } void apic_ack_edge(struct irq_data *irqd) @@ -934,62 +956,98 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) +static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { - struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; + bool rearm = false; - ack_APIC_irq(); - /* Prevent vectors vanishing under us */ - raw_spin_lock(&vector_lock); + lockdep_assert_held(&vector_lock); - hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { unsigned int irr, vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned - * up is registered at the APICs IRR. If so, then this is - * not the best time to clean it up. Clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest - * priority external vector, so on return from this - * interrupt the device interrupt will happen first. + * up is registered at the APICs IRR. That's clearly a + * hardware issue if the vector arrived on the old target + * _after_ interrupts were disabled above. Keep @apicd + * on the list and schedule the timer again to give the CPU + * a chance to handle the pending interrupt. + * + * Do not check IRR when called from lapic_offline(), because + * fixup_irqs() was just called to scan IRR for set bits and + * forward them to new destination CPUs via IPIs. */ - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; if (irr & (1U << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); + rearm = true; continue; } free_moved_vector(apicd); } - raw_spin_unlock(&vector_lock); + /* + * Must happen under vector_lock to make the timer_pending() check + * in __vector_schedule_cleanup() race free against the rearm here. 
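
The IRR probe in the loop above indexes the local APIC's 256-bit Interrupt Request Register, which the architecture exposes as eight 32-bit registers at 16-byte strides. Pulled out into a standalone helper for readability (a sketch, not part of the patch):

/* Test whether @vector is still pending in the local APIC's IRR. */
static bool vector_pending_in_irr(unsigned int vector)
{
	u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));

	return irr & (1U << (vector % 32));
}
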
+ */ + if (rearm) + mod_timer(&cl->timer, jiffies + 1); +} + +static void vector_cleanup_callback(struct timer_list *tmr) +{ + struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer); + + /* Prevent vectors vanishing under us */ + raw_spin_lock_irq(&vector_lock); + __vector_cleanup(cl, true); + raw_spin_unlock_irq(&vector_lock); } -static void __send_cleanup_vector(struct apic_chip_data *apicd) +static void __vector_schedule_cleanup(struct apic_chip_data *apicd) { - unsigned int cpu; + unsigned int cpu = apicd->prev_cpu; raw_spin_lock(&vector_lock); apicd->move_in_progress = 0; - cpu = apicd->prev_cpu; if (cpu_online(cpu)) { - hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); - apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu); + + hlist_add_head(&apicd->clist, &cl->head); + + /* + * The lockless timer_pending() check is safe here. If it + * returns true, then the callback will observe this new + * apic data in the hlist as everything is serialized by + * vector lock. + * + * If it returns false then the timer is either not armed + * or the other CPU executes the callback, which again + * would be blocked on vector lock. Rearming it in the + * latter case makes it fire for nothing. + * + * This is also safe against the callback rearming the timer + * because that's serialized via vector lock too. + */ + if (!timer_pending(&cl->timer)) { + cl->timer.expires = jiffies + 1; + add_timer_on(&cl->timer, cpu); + } } else { apicd->prev_vector = 0; } raw_spin_unlock(&vector_lock); } -void send_cleanup_vector(struct irq_cfg *cfg) +void vector_schedule_cleanup(struct irq_cfg *cfg) { struct apic_chip_data *apicd; apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); if (apicd->move_in_progress) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -1007,7 +1065,7 @@ void irq_complete_move(struct irq_cfg *cfg) * on the same CPU. */ if (apicd->cpu == smp_processor_id()) - __send_cleanup_vector(apicd); + __vector_schedule_cleanup(apicd); } /* @@ -1150,7 +1208,7 @@ static void __init print_local_APIC(void *dummy) u64 icr; pr_debug("printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); + smp_processor_id(), read_apic_id()); v = apic_read(APIC_ID); pr_info("... 
APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b2b2b7f3e03f..affbff65e497 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -83,16 +83,6 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} - -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); -} - static u32 x2apic_calc_apicid(unsigned int cpu) { return x86_cpu_to_logical_apicid[cpu]; @@ -236,8 +226,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, @@ -247,12 +235,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -265,15 +252,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 896bc41cb2ba..788cdb4ee394 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,11 +8,13 @@ int x2apic_phys; static struct apic apic_x2apic_phys; -static u32 x2apic_max_apicid __ro_after_init; +u32 x2apic_max_apicid __ro_after_init = UINT_MAX; void __init x2apic_set_max_apicid(u32 apicid) { x2apic_max_apicid = apicid; + if (apic->x2apic_set_max_apicid) + apic->max_apic_id = apicid; } static int __init set_x2apic_phys_mode(char *arg) @@ -81,43 +83,28 @@ static void __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } -static void x2apic_send_IPI_allbutself(int vector) +static void __x2apic_send_IPI_shorthand(int vector, u32 which) { - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); -} + unsigned long cfg = __prepare_ICR(which, vector, 0); -static void x2apic_send_IPI_all(int vector) -{ - __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); } -static void init_x2apic_ldr(void) +void x2apic_send_IPI_allbutself(int vector) { + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } -static int x2apic_phys_probe(void) -{ - if (!x2apic_mode) - return 0; - - if (x2apic_phys || x2apic_fadt_phys()) - return 1; - - 
return apic == &apic_x2apic_phys; -} - -/* Common x2apic functions, also used by x2apic_cluster */ -int x2apic_apic_id_valid(u32 apicid) +void x2apic_send_IPI_all(int vector) { - if (x2apic_max_apicid && apicid > x2apic_max_apicid) - return 0; - - return 1; + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } -int x2apic_apic_id_registered(void) +void x2apic_send_IPI_self(int vector) { - return 1; + apic_write(APIC_SELF_IPI, vector); } void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) @@ -126,13 +113,15 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) native_x2apic_icr_write(cfg, apicid); } -void __x2apic_send_IPI_shorthand(int vector, u32 which) +static int x2apic_phys_probe(void) { - unsigned long cfg = __prepare_ICR(which, vector, 0); + if (!x2apic_mode) + return 0; - /* x2apic MSRs are special and need a special fence: */ - weak_wrmsr_fence(); - native_x2apic_icr_write(cfg, 0); + if (x2apic_phys || x2apic_fadt_phys()) + return 1; + + return apic == &apic_x2apic_phys; } unsigned int x2apic_get_apic_id(unsigned long id) @@ -150,33 +139,22 @@ int x2apic_phys_pkg_id(int initial_apicid, int index_msb) return initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .apic_id_valid = x2apic_apic_id_valid, - .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = x2apic_phys_pkg_id, + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, @@ -189,15 +167,11 @@ static struct apic apic_x2apic_phys __ro_after_init = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .inquire_remote_apic = NULL, - .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b524dee1cbbb..d9f5d7492f83 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -25,6 +25,8 @@ #include <asm/uv/uv.h> #include <asm/apic.h> +#include "local.h" + static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; @@ -777,30 +779,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static int uv_apic_id_valid(u32 apicid) -{ - return 1; -} - -static int uv_apic_id_registered(void) -{ - return 1; -} - -static void uv_init_apic_ldr(void) -{ -} - -static u32 apic_uv_calc_apicid(unsigned int cpu) -{ - return apic_default_calc_apicid(cpu); -} - -static unsigned int x2apic_get_apic_id(unsigned long id) -{ - return id; -} - static u32 set_apic_id(unsigned int id) { return id; @@ -816,11 +794,6 @@ 
static int uv_phys_pkg_id(int initial_apicid, int index_msb) return uv_read_apic_id() >> index_msb; } -static void uv_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -831,45 +804,35 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .apic_id_valid = uv_apic_id_valid, - .apic_id_registered = uv_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, - .check_apicid_used = NULL, - .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, - .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = NULL, - .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = uv_phys_pkg_id, + .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .calc_dest_apicid = apic_uv_calc_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_all = uv_send_IPI_all, - .send_IPI_self = uv_send_IPI_self, + .send_IPI_self = x2apic_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .inquire_remote_apic = NULL, .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_eoi_write, + .eoi = native_apic_msr_eoi, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, - .wait_icr_idle = native_x2apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 @@ -1844,7 +1807,7 @@ static void __init uv_system_init_hub(void) /* Initialize per CPU info: */ for_each_possible_cpu(cpu) { - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + int apicid = per_cpu(x86_cpu_to_apicid, cpu); unsigned short bid; unsigned short pnode; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c6c15ce1952f..5934ee5bc087 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -239,12 +239,6 @@ extern int (*console_blank_hook)(int); #endif /* - * The apm_bios device is one of the misc char devices. - * This is its minor number. 
- */ -#define APM_MINOR_DEV 134 - -/* * Various options can be changed at boot time as follows: * (We allow underscores for compatibility with the modules code) * apm=on/off enable/disable APM diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 44c3601cfdc4..190c120f4285 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -63,11 +63,6 @@ int audit_classify_syscall(int abi, unsigned syscall) static int __init audit_classes_init(void) { #ifdef CONFIG_IA32_EMULATION - extern __u32 ia32_dir_class[]; - extern __u32 ia32_write_class[]; - extern __u32 ia32_read_class[]; - extern __u32 ia32_chattr_class[]; - extern __u32 ia32_signal_class[]; audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c new file mode 100644 index 000000000000..d2c732a34e5d --- /dev/null +++ b/arch/x86/kernel/cet.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ptrace.h> +#include <asm/bugs.h> +#include <asm/traps.h> + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +static const char cp_err[][10] = { + [0] = "unknown", + [1] = "near ret", + [2] = "far/iret", + [3] = "endbranch", + [4] = "rstorssp", + [5] = "setssbsy", +}; + +static const char *cp_err_string(unsigned long error_code) +{ + unsigned int cpec = error_code & CP_EC; + + if (cpec >= ARRAY_SIZE(cp_err)) + cpec = 0; + return cp_err[cpec]; +} + +static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code) +{ + WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n", + user_mode(regs) ? "user mode" : "kernel mode", + cp_err_string(error_code)); +} + +static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + unsigned long ssp; + + /* + * An exception was just taken from userspace. Since interrupts are disabled + * here, no scheduling should have messed with the registers yet and they + * will be whatever is live in userspace. So read the SSP before enabling + * interrupts so locking the fpregs to do it later is not required. + */ + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + cond_local_irq_enable(regs); + + tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_CP; + + /* Ratelimit to prevent log spamming. */ + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + __ratelimit(&cpf_rate)) { + pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, ssp, error_code, + cp_err_string(error_code), + error_code & CP_ENCL ? 
" in enclave" : ""); + print_vma_addr(KERN_CONT " in ", regs->ip); + pr_cont("\n"); + } + + force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0); + cond_local_irq_disable(regs); +} + +static __ro_after_init bool ibt_fatal = true; + +static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) +{ + if ((error_code & CP_EC) != CP_ENDBR) { + do_unexpected_cp(regs, error_code); + return; + } + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (user_mode(regs)) { + if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + do_user_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } else { + if (cpu_feature_enabled(X86_FEATURE_IBT)) + do_kernel_cp_fault(regs, error_code); + else + do_unexpected_cp(regs, error_code); + } +} diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 485441b7f030..bfeb18fad63f 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -51,7 +51,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. */ - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7eca6a8abbb1..dd8379d84445 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1047,7 +1047,7 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_FSRS); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 41b573f34a10..382d4e6b848d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -587,27 +587,43 @@ __noendbr void ibt_restore(u64 save) static __always_inline void setup_cet(struct cpuinfo_x86 *c) { - u64 msr = CET_ENDBR_EN; + bool user_shstk, kernel_ibt; - if (!HAS_KERNEL_IBT || - !cpu_feature_enabled(X86_FEATURE_IBT)) + if (!IS_ENABLED(CONFIG_X86_CET)) return; - wrmsrl(MSR_IA32_S_CET, msr); + kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT); + user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) && + IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK); + + if (!kernel_ibt && !user_shstk) + return; + + if (user_shstk) + set_cpu_cap(c, X86_FEATURE_USER_SHSTK); + + if (kernel_ibt) + wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + else + wrmsrl(MSR_IA32_S_CET, 0); + cr4_set_bits(X86_CR4_CET); - if (!ibt_selftest()) { + if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); - return; } } __noendbr void cet_disable(void) { - if (cpu_feature_enabled(X86_FEATURE_IBT)) - wrmsrl(MSR_IA32_S_CET, 0); + if (!(cpu_feature_enabled(X86_FEATURE_IBT) || + cpu_feature_enabled(X86_FEATURE_SHSTK))) + return; + + wrmsrl(MSR_IA32_S_CET, 0); + wrmsrl(MSR_IA32_U_CET, 0); } /* 
@@ -1264,11 +1280,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), @@ -1491,6 +1507,9 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; @@ -1958,7 +1977,7 @@ void enable_sep_cpu(void) } #endif -void __init identify_boot_cpu(void) +static __init void identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index f6748c8bd647..e462c1d3800a 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 5a2962c492d3..defdc594be14 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -8,6 +8,7 @@ */ #include <linux/io.h> +#include <asm/apic.h> #include <asm/cpu.h> #include <asm/smp.h> #include <asm/numa.h> @@ -300,7 +301,7 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ - c->apicid = hard_smp_processor_id(); + c->apicid = read_apic_id(); /* * XXX someone from Hygon needs to confirm this DTRT diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c4ec4ca47e11..c267f43de39e 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -759,7 +759,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - ack_APIC_irq(); + apic_eoi(); } /* diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 12cf2e7ca33c..4d8d4bcf915d 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -270,8 +270,7 @@ static void __maybe_unused 
raise_mce(struct mce *m) mce_irq_ipi, NULL, 0); preempt_enable(); } else if (m->inject_flags & MCJ_NMI_BROADCAST) - apic->send_IPI_mask(mce_inject_cpumask, - NMI_VECTOR); + __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 6a059a035021..ef4e7bb5fd88 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -27,5 +27,5 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index c7969e806c64..e6bba12c759c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -32,6 +32,7 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> +#include <asm/svm.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -39,6 +40,10 @@ bool hv_root_partition; bool hv_nested; struct ms_hyperv_info ms_hyperv; +/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ +bool hyperv_paravisor_present __ro_after_init; +EXPORT_SYMBOL_GPL(hyperv_paravisor_present); + #if IS_ENABLED(CONFIG_HYPERV) static inline unsigned int hv_get_nested_reg(unsigned int reg) { @@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) - hv_ghcb_msr_read(reg, &value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; @@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register); void hv_set_non_nested_register(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { - hv_ghcb_msr_write(reg, value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_reg(reg)) @@ -119,7 +124,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) vmbus_handler(); if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } @@ -147,7 +152,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR); - ack_APIC_irq(); + apic_eoi(); set_irq_regs(old_regs); } @@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); + /* + * Override wakeup_secondary_cpu_64 callback for SEV-SNP + * enlightened guest. + */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; + return; + } + #ifdef CONFIG_X86_64 for_each_present_cpu(i) { if (i == 0) @@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) } #endif +/* + * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the + * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 + * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns + * NULL for ttyS0. 
This happens because mp_config_acpi_legacy_irqs() sees a + * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and + * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs + * from the array and hence doesn't create the necessary irq description info. + * + * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, + * except don't change 'legacy_pic', which keeps its default value + * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero + * nr_legacy_irqs() and eventually serial console interrupts works properly. + */ +static void __init reduced_hw_init(void) +{ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; @@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.shared_gpa_boundary = BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + hyperv_paravisor_present = !!ms_hyperv.paravisor_present; + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { static_branch_enable(&isolation_type_snp); + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { + static_branch_enable(&isolation_type_tdx); + + /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; + + if (!ms_hyperv.paravisor_present) { + /* To be supported: more work is required. */ + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + + /* HV_REGISTER_CRASH_CTL is unsupported. */ + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + + /* Don't trust Hyper-V's TLB-flushing hypercalls. */ + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + + x86_init.acpi.reduced_hw_early_init = reduced_hw_init; + } + } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void) #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || - (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) + ms_hyperv.paravisor_present) hv_vtom_init(); /* * Setup the hook to get control post apic initialization. 
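
hyperv_paravisor_present is exported above precisely so module code reached via hv_do_hypercall() can branch on it, and the in-kernel register accessors apply the same test. A hypothetical consumer mirroring that pattern (hv_is_synic_reg() is assumed to be visible via <asm/mshyperv.h>):

#include <asm/mshyperv.h>

/* MSR-based SynIC access must go through the paravisor proxy when one
 * is present, as in hv_get/set_non_nested_register() above. */
static bool synic_reg_needs_proxy(unsigned int reg)
{
	return hv_is_synic_reg(reg) && ms_hyperv.paravisor_present;
}
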
@@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition) + if (hv_root_partition || + (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif @@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void) return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_r8(ghcb, regs->r8); +} + +static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, @@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, +#endif }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 099b6f0d96bd..31c0e68f6227 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -4,6 +4,8 @@ #include <linux/string.h> #include <linux/seq_file.h> #include <linux/cpufreq.h> +#include <asm/prctl.h> +#include <linux/proc_fs.h> #include "cpu.h" @@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +static void dump_x86_features(struct seq_file *m, unsigned long features) +{ + if (features & ARCH_SHSTK_SHSTK) + seq_puts(m, "shstk "); + if (features & ARCH_SHSTK_WRSS) + seq_puts(m, "wrss "); +} + +void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "x86_Thread_features:\t"); + dump_x86_features(m, task->thread.features); + seq_putc(m, '\n'); + + seq_puts(m, "x86_Thread_features_locked:\t"); + dump_x86_features(m, task->thread.features_locked); + seq_putc(m, '\n'); +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 458cb7419502..8f559eeae08e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -45,7 +45,21 @@ static u64 prefetch_disable_bits; */ static unsigned int pseudo_lock_major; static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); -static struct class *pseudo_lock_class; + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms @@ -1353,7 +1367,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) &pseudo_measure_fops); } - dev = device_create(pseudo_lock_class, NULL, + dev = device_create(&pseudo_lock_class, NULL, MKDEV(pseudo_lock_major, new_minor), rdtgrp, "%s", 
rdtgrp->kn->name); @@ -1383,7 +1397,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) goto out; out_device: - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); out_debugfs: debugfs_remove_recursive(plr->debugfs_dir); pseudo_lock_minor_release(new_minor); @@ -1424,7 +1438,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) pseudo_lock_cstates_relax(plr); debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); pseudo_lock_minor_release(plr->minor); free: @@ -1560,16 +1574,6 @@ static const struct file_operations pseudo_lock_dev_fops = { .mmap = pseudo_lock_dev_mmap, }; -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); -} - int rdt_pseudo_lock_init(void) { int ret; @@ -1580,21 +1584,18 @@ int rdt_pseudo_lock_init(void) pseudo_lock_major = ret; - pseudo_lock_class = class_create("pseudo_lock"); - if (IS_ERR(pseudo_lock_class)) { - ret = PTR_ERR(pseudo_lock_class); + ret = class_register(&pseudo_lock_class); + if (ret) { unregister_chrdev(pseudo_lock_major, "pseudo_lock"); return ret; } - pseudo_lock_class->devnode = pseudo_lock_devnode; return 0; } void rdt_pseudo_lock_release(void) { - class_destroy(pseudo_lock_class); - pseudo_lock_class = NULL; + class_unregister(&pseudo_lock_class); unregister_chrdev(pseudo_lock_major, "pseudo_lock"); pseudo_lock_major = 0; } diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index c3e37eaec8ec..7aaa3652e31d 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) continue; xa_erase(&vepc->page_array, index); + cond_resched(); } /* @@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) list_add_tail(&epc_page->list, &secs_pages); xa_erase(&vepc->page_array, index); + cond_resched(); } /* @@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) if (sgx_vepc_free_page(epc_page)) list_add_tail(&epc_page->list, &secs_pages); + cond_resched(); } if (!list_empty(&secs_pages)) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index bdc0d5539b57..dae436253de4 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,7 +40,6 @@ #include <asm/processor.h> #include <asm/msr.h> -static struct class *cpuid_class; static enum cpuhp_state cpuhp_cpuid_state; struct cpuid_regs_done { @@ -124,26 +123,31 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; +static char *cpuid_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static const struct class cpuid_class = { + .name = "cpuid", + .devnode = cpuid_devnode, +}; + static int cpuid_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int cpuid_device_destroy(unsigned int cpu) { - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + 
device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu)); return 0; } -static char *cpuid_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); -} - static int __init cpuid_init(void) { int err; @@ -154,12 +158,9 @@ static int __init cpuid_init(void) CPUID_MAJOR); return -EBUSY; } - cpuid_class = class_create("cpuid"); - if (IS_ERR(cpuid_class)) { - err = PTR_ERR(cpuid_class); + err = class_register(&cpuid_class); + if (err) goto out_chrdev; - } - cpuid_class->devnode = cpuid_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", cpuid_device_create, cpuid_device_destroy); @@ -170,7 +171,7 @@ static int __init cpuid_init(void) return 0; out_class: - class_destroy(cpuid_class); + class_unregister(&cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); return err; @@ -180,7 +181,7 @@ module_init(cpuid_init); static void __exit cpuid_exit(void) { cpuhp_remove_state(cpuhp_cpuid_state); - class_destroy(cpuid_class); + class_unregister(&cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); } module_exit(cpuid_exit); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 587c7743fd21..c92d88680dbf 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -48,27 +48,6 @@ struct crash_memmap_data { unsigned int type; }; -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. - */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); - -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - crash_vmclear_fn *do_vmclear_operation = NULL; - - rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) crash_save_cpu(regs, cpu); /* - * VMCLEAR VMCSs loaded on all cpus if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - - /* * Disable Intel PT to stop its logging */ cpu_emergency_stop_pt(); @@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); - /* - * VMCLEAR VMCSs loaded on this cpu if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - cpu_emergency_disable_virtualization(); /* diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 28da5dd83fc0..87d38f17ff5c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -128,16 +128,15 @@ static void __init dtb_setup_hpet(void) static void __init dtb_cpu_setup(void) { struct device_node *dn; - u32 apic_id, version; + u32 apic_id; - version = GET_APIC_VERSION(apic_read(APIC_LVR)); for_each_of_cpu_node(dn) { apic_id = of_get_cpu_hwid(dn, 0); if (apic_id == ~0U) { pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id, version); + generic_processor_info(apic_id); } } @@ -158,19 +157,15 @@ static void __init dtb_lapic_setup(void) /* Did the boot loader setup the local APIC ? 
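
pseudo_lock.c and cpuid.c above are two instances of the same driver-core migration: a runtime-allocated class from class_create() becomes a statically defined const struct class registered with class_register(). The minimal shape of the pattern, with hypothetical names:

#include <linux/device/class.h>
#include <linux/slab.h>

static char *example_devnode(const struct device *dev, umode_t *mode)
{
	if (mode)
		*mode = 0600;
	return kasprintf(GFP_KERNEL, "example/%u", MINOR(dev->devt));
}

static const struct class example_class = {
	.name		= "example",
	.devnode	= example_devnode,
};

static int __init example_init(void)
{
	return class_register(&example_class);	/* was: class_create("example") */
}

static void __exit example_exit(void)
{
	class_unregister(&example_class);	/* was: class_destroy(cls) */
}
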
*/ if (!boot_cpu_has(X86_FEATURE_APIC)) { - if (apic_force_enable(lapic_addr)) + /* Try force enabling, which registers the APIC address */ + if (!apic_force_enable(lapic_addr)) return; - } - smp_found_config = 1; - if (of_property_read_bool(dn, "intel,virtual-wire-mode")) { - pr_info("Virtual Wire compatibility mode.\n"); - pic_mode = 0; } else { - pr_info("IMCR and PIC compatibility mode.\n"); - pic_mode = 1; + register_lapic_address(lapic_addr); } - - register_lapic_address(lapic_addr); + smp_found_config = 1; + pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode"); + pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire"); } #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 98e507cc7d34..a86d37052a64 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } } +/* A passed ssp of zero will not cause any update */ +static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) +{ +#ifdef CONFIG_X86_USER_SHADOW_STACK + struct cet_user_state *xstate; + + /* If ssp update is not needed. */ + if (!ssp) + return 0; + + xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + XFEATURE_CET_USER); + + /* + * If there is a non-zero ssp, then 'dst' must be configured with a shadow + * stack and the fpu state should be up to date since it was just copied + * from the parent in fpu_clone(). So there must be a valid non-init CET + * state location in the buffer. + */ + if (WARN_ON_ONCE(!xstate)) + return 1; + + xstate->user_ssp = (u64)ssp; +#endif + return 0; +} + /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + /* + * Update shadow stack pointer, in case it changed during clone. + */ + if (update_fpu_shstk(dst, ssp)) + return 1; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -753,6 +787,24 @@ void switch_fpu_return(void) } EXPORT_SYMBOL_GPL(switch_fpu_return); +void fpregs_lock_and_load(void) +{ + /* + * fpregs_lock() only disables preemption (mostly). So modifying state + * in an interrupt could screw up some in progress fpregs operation. + * Warn about it. 
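
For context, the new ssp argument to fpu_clone() is produced by the shadow-stack allocation in copy_thread(). A sketch of the anticipated call site (shstk_alloc_thread_stack() belongs to the broader series; its exact name and signature are assumptions here):

	/* In copy_thread(): allocate the child's shadow stack, if any,
	 * and let fpu_clone() plant the new SSP into the child's XSAVE
	 * buffer via update_fpu_shstk() above. */
	unsigned long ssp;

	ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
	if (IS_ERR_VALUE(ssp))
		return PTR_ERR((void *)ssp);

	if (fpu_clone(p, clone_flags, !!args->fn, ssp))
		return -ENOMEM;
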
+ */ + WARN_ON_ONCE(!irq_fpu_usable()); + WARN_ON_ONCE(current->flags & PF_KTHREAD); + + fpregs_lock(); + + fpregs_assert_state_consistent(); + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); +} + #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6d056b68f4ed..6bc1eb2a21bd 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -8,6 +8,7 @@ #include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/prctl.h> #include "context.h" #include "internal.h" @@ -174,6 +175,86 @@ out: return ret; } +#ifdef CONFIG_X86_USER_SHADOW_STACK +int ssp_active(struct task_struct *target, const struct user_regset *regset) +{ + if (target->thread.features & ARCH_SHSTK_SHSTK) + return regset->n; + + return 0; +} + +int ssp_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + struct fpu *fpu = &target->thread.fpu; + struct cet_user_state *cetregs; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -ENODEV; + + sync_fpstate(fpu); + cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. + */ + return -ENODEV; + } + + return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, + sizeof(cetregs->user_ssp)); +} + +int ssp_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave = &fpu->fpstate->regs.xsave; + struct cet_user_state *cetregs; + unsigned long user_ssp; + int r; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) + return -ENODEV; + + if (pos != 0 || count != sizeof(user_ssp)) + return -EINVAL; + + r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); + if (r) + return r; + + /* + * Some kernel instructions (IRET, etc) can cause exceptions in the case + * of disallowed CET register values. Just prevent invalid values. + */ + if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) + return -EINVAL; + + fpu_force_restore(fpu); + + cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); + if (WARN_ON(!cetregs)) { + /* + * This shouldn't ever be NULL because shadow stack was + * verified to be enabled above. This means + * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so + * XFEATURE_CET_USER should not be in the init state. 
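
The regset pair above backs the ptrace view of the shadow-stack pointer. A hypothetical userspace probe (NT_X86_SHSTK is the note type this series wires up; its name and availability are assumptions here):

#include <elf.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Read the tracee's shadow stack pointer via the new regset. Fails
 * with ENODEV when the tracee has no shadow stack enabled, matching
 * ssp_active()/ssp_get() above. */
static long read_ssp(pid_t pid, unsigned long long *ssp)
{
	struct iovec iov = { .iov_base = ssp, .iov_len = sizeof(*ssp) };

	return ptrace(PTRACE_GETREGSET, pid, NT_X86_SHSTK, &iov);
}
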
+ */ + return -ENODEV; + } + + cetregs->user_ssp = user_ssp; + return 0; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1afbc4866b10..cadf68737e6b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -39,26 +39,26 @@ */ static const char *xfeature_names[] = { - "x87 floating point registers" , - "SSE registers" , - "AVX registers" , - "MPX bounds registers" , - "MPX CSR" , - "AVX-512 opmask" , - "AVX-512 Hi256" , - "AVX-512 ZMM_Hi256" , - "Processor Trace (unused)" , + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", "Protection Keys User registers", "PASID state", - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "unknown xstate feature" , - "AMX Tile config" , - "AMX Tile data" , - "unknown xstate feature" , + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { @@ -71,8 +71,9 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, - [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; @@ -276,6 +277,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } @@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* @@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void) } \ } while (0) -#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ - if ((nr == nr_macro) && \ - WARN_ONCE(sz != sizeof(__struct), \ - "%s: struct is %zu bytes, cpu state %d bytes\n", \ - __stringify(nr_macro), sizeof(__struct), sz)) { \ +#define XCHECK_SZ(sz, nr, __struct) ({ \ + if (WARN_ONCE(sz != sizeof(__struct), \ + "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ + xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ -} while (0) + true; \ +}) + /** * check_xtile_data_against_struct - Check tile data state size. @@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr) * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); + /* * Match each CPU state with the corresponding software * structure. 
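 *
 * For example, for XFEATURE_CET_USER the CPUID-reported size has to
 * equal sizeof(struct cet_user_state), i.e. 16 bytes for its two u64
 * fields (user_cet, user_ssp), which the switch below checks via:
 *
 *	XCHECK_SZ(sz, XFEATURE_CET_USER, struct cet_user_state);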
*/ - XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); - XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); - XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); - XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); - XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); - XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); - XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); - XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); - XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); - - /* The tile data size varies between implementations. */ - if (nr == XFEATURE_XTILE_DATA) - check_xtile_data_against_struct(sz); - - /* - * Make *SURE* to add any feature numbers in below if - * there are "holes" in the xsave state component - * numbers. - */ - if ((nr < XFEATURE_YMM) || - (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { + switch (nr) { + case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); + case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); + case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); + case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); + case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); + case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); + case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); + case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); + case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); + case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; + default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } + return true; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4d8aff05a509..30a55207c000 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -231,9 +231,7 @@ struct irq_chip i8259A_chip = { }; static char irq_trigger[2]; -/** - * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ - */ +/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], PIC_ELCR1); diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S new file mode 100644 index 000000000000..c43c4ed28a9c --- /dev/null +++ b/arch/x86/kernel/ibt_selftest.S @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <linux/objtool.h> +#include <asm/nospec-branch.h> + +SYM_CODE_START(ibt_selftest_noendbr) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + /* #CP handler sets %ax to 0 */ + RET +SYM_CODE_END(ibt_selftest_noendbr) + +SYM_FUNC_START(ibt_selftest) + lea ibt_selftest_noendbr(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + jmp *%rax +SYM_FUNC_END(ibt_selftest) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index a58c6bc1cd68..b786d48f5a0f 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET INTG(X86_TRAP_CP, asm_exc_control_protection), #endif @@ -131,7 +131,6 @@ static const __initconst struct idt_data apic_idts[] = { INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), 
INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), - INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f668d2f3d11..11761c124545 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -49,7 +49,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - ack_APIC_irq(); + apic_eoi(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -256,7 +256,7 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - ack_APIC_irq(); + apic_eoi(); if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", @@ -280,7 +280,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + apic_eoi(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) @@ -310,7 +310,7 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_ipis); } @@ -319,7 +319,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) */ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); } @@ -329,7 +329,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(kvm_posted_intr_nested_ipis); } #endif @@ -401,6 +401,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); + apic_eoi(); } #endif diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 890d4778cd35..b0a24deab4a1 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -16,7 +16,7 @@ #ifdef CONFIG_X86_LOCAL_APIC DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ack_APIC_irq(); + apic_eoi(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); @@ -28,7 +28,7 @@ void arch_irq_work_raise(void) if (!arch_irq_work_has_interrupt()) return; - apic->send_IPI_self(IRQ_WORK_VECTOR); + __apic_send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); } #endif diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 4eb8f2d19a87..578d16fc040f 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -101,10 +101,8 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { - generic_processor_info(setup_data.v1.cpu_ids[cpu], - boot_cpu_apic_version); - } + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) + generic_processor_info(setup_data.v1.cpu_ids[cpu]); smp_found_config = 1; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7f6042eb7e6..e8babebad7b8 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,6 +45,7 @@ #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <linux/set_memory.h> +#include <linux/cfi.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -293,7 +294,40 @@ static 
int can_probe(unsigned long paddr) #endif addr += insn.length; } + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* + * The compiler generates the following instruction sequence + * for indirect call checks and cfi.c decodes this: + * + * movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * Also, these movl and addl are used for showing expected + * type. So those must not be touched. + */ + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return 0; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return 0; + + if (insn.opcode.value == 0xBA) + offset = 12; + else if (insn.opcode.value == 0x3) + offset = 6; + else + goto out; + + /* This movl/addl is used for decoding CFI. */ + if (is_cfi_trap(addr + offset)) + return 0; + } +out: return (addr == paddr); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 526d4da3dcd4..b8ab9ee5896c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -291,7 +291,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) struct pt_regs *old_regs = set_irq_regs(regs); u32 token; - ack_APIC_irq(); + apic_eoi(); inc_irq_stat(irq_hv_callback_count); @@ -332,7 +332,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace __maybe_unused void kvm_guest_apic_eoi_write(void) { /** * This relies on __test_and_clear_bit to modify the memory @@ -343,7 +343,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); + apic_native_eoi(); } static void kvm_guest_cpu_init(void) @@ -622,10 +622,10 @@ late_initcall(setup_efi_kvm_sev_migration); /* * Set the IPI entry points */ -static void kvm_setup_pv_ipi(void) +static __init void kvm_setup_pv_ipi(void) { - apic->send_IPI_mask = kvm_send_ipi_mask; - apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic_update_callback(send_IPI_mask, kvm_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself); pr_info("setup PV IPIs\n"); } @@ -825,7 +825,7 @@ static void __init kvm_guest_init(void) } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - apic_set_eoi_write(kvm_guest_apic_eoi_write); + apic_update_callback(eoi, kvm_guest_apic_eoi_write); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index fed721f90116..b223922248e9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -48,7 +48,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info(struct mpc_cpu *m) { - int apicid; char *bootup_cpu = ""; if (!(m->cpuflag & CPU_ENABLED)) { @@ -56,15 +55,11 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = m->apicid; - - if (m->cpuflag & CPU_BOOTPROCESSOR) { + if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; - boot_cpu_physical_apicid = m->apicid; - } pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(apicid, m->apicver); + generic_processor_info(m->apicid); } #ifdef CONFIG_X86_IO_APIC @@ -380,11 +375,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) int i; /* - * local APIC has default address
- */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* * 2 CPUs, numbered 0 & 1. */ processor.type = MP_PROCESSOR; @@ -525,10 +515,8 @@ void __init default_get_smp_config(unsigned int early) */ if (mpf->feature1) { if (early) { - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + /* Local APIC has default address */ + register_lapic_address(APIC_DEFAULT_PHYS_BASE); goto out; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bb17d37db01..e17c16c54a37 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -39,7 +39,6 @@ #include <asm/cpufeature.h> #include <asm/msr.h> -static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; enum allow_write_msrs { @@ -235,26 +234,31 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; +static char *msr_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static const struct class msr_class = { + .name = "msr", + .devnode = msr_devnode, +}; + static int msr_device_create(unsigned int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, "msr%d", cpu); return PTR_ERR_OR_ZERO(dev); } static int msr_device_destroy(unsigned int cpu) { - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu)); return 0; } -static char *msr_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); -} - static int __init msr_init(void) { int err; @@ -263,12 +267,9 @@ static int __init msr_init(void) pr_err("unable to get major %d for msr\n", MSR_MAJOR); return -EBUSY; } - msr_class = class_create("msr"); - if (IS_ERR(msr_class)) { - err = PTR_ERR(msr_class); + err = class_register(&msr_class); + if (err) goto out_chrdev; - } - msr_class->devnode = msr_devnode; err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", msr_device_create, msr_device_destroy); @@ -278,7 +279,7 @@ static int __init msr_init(void) return 0; out_class: - class_destroy(msr_class); + class_unregister(&msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); return err; @@ -288,7 +289,7 @@ module_init(msr_init); static void __exit msr_exit(void) { cpuhp_remove_state(cpuhp_msr_state); - class_destroy(msr_class); + class_unregister(&msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); } module_exit(msr_exit) diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index a1a96df3dff1..e93a8545c74d 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -75,7 +75,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* sync above data before sending NMI */ wmb(); - apic->send_IPI_mask(mask, NMI_VECTOR); + __apic_send_IPI_mask(mask, NMI_VECTOR); /* Don't wait longer than a second */ timeout = USEC_PER_SEC; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 72015dba72ab..9f0909142a0a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -51,6 +51,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/shstk.h> #include "process.h" @@ -122,6 +123,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); + shstk_free(tsk); fpu__drop(fpu); } @@ -162,6 +164,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct 
inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; + unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); @@ -199,7 +202,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags, args->fn); + /* + * Allocate a new shadow stack for the thread if needed. If shadow stack + * is disabled, new_ssp will remain 0, and fpu_clone() will know not to + * update it. + */ + new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); + if (IS_ERR_VALUE(new_ssp)) + return PTR_ERR((void *)new_ssp); + + fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { @@ -245,6 +257,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); + /* + * If copy_thread() failed, don't leak the shadow stack possibly + * allocated in shstk_alloc_thread_stack() above. + */ + if (ret) + shstk_free(p); + return ret; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d181c16a2f6..33b268747bb7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -515,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, load_gs_index(__USER_DS); } + reset_thread_features(); + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); @@ -894,6 +896,12 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif + case ARCH_SHSTK_ENABLE: + case ARCH_SHSTK_DISABLE: + case ARCH_SHSTK_LOCK: + case ARCH_SHSTK_UNLOCK: + case ARCH_SHSTK_STATUS: + return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dfaa270a7cc9..095f04bdabdc 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -58,6 +58,7 @@ enum x86_regset_64 { REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, + REGSET64_SSP, }; #define REGSET_GENERAL \ @@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .active = ioperm_active, .regset_get = ioperm_get }, +#ifdef CONFIG_X86_USER_SHADOW_STACK + [REGSET64_SSP] = { + .core_note_type = NT_X86_SHSTK, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .active = ssp_active, + .regset_get = ssp_get, + .set = ssp_set + }, +#endif }; static const struct user_regset_view user_x86_64_view = { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3adbe97015c1..830425e6d38e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -22,7 +22,6 @@ #include <asm/reboot_fixups.h> #include <asm/reboot.h> #include <asm/pci_x86.h> -#include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -530,9 +529,54 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +/* RCU-protected callback to disable virtualization prior to reboot.
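 *
 * A hypervisor module is expected to register a single callback for its
 * lifetime, roughly like this (a sketch with a hypothetical handler; the
 * KVM side is not part of this hunk):
 *
 *	cpu_emergency_register_virt_callback(my_emergency_vmxoff);
 *	...
 *	cpu_emergency_unregister_virt_callback(my_emergency_vmxoff);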
*/ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + static void emergency_reboot_disable_virtualization(void) { - /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* @@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void) * Do the NMI shootdown even if virtualization is off on _this_ CPU, as * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx() || cpu_has_svm(NULL)) { + if (rcu_access_pointer(cpu_emergency_virt_callback)) { /* Safely force _this_ CPU out of VMX/SVM operation. */ cpu_emergency_disable_virtualization(); @@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void) nmi_shootdown_cpus_on_restart(); } } - +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. 
- */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_vmxoff(); - cpu_emergency_svm_disable(); -} - #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fd975a4a5200..b9145a63da77 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* CPU data as detected by the assembly code in head_32.S */ struct cpuinfo_x86 new_cpu_data; -unsigned int def_to_bigsmp; struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -1018,9 +1017,11 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); + apic_setup_apic_calls(); + if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC - disable_apic = 1; + apic_is_disabled = true; #endif setup_clear_cpu_cap(X86_FEATURE_APIC); } @@ -1253,7 +1254,7 @@ void __init setup_arch(char **cmdline_p) map_vsyscall(); - generic_apic_probe(); + x86_32_probe_apic(); early_quirks(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c242dc47e9cb..2c97bf7b56ae 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -181,15 +181,9 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); per_cpu(x86_cpu_to_acpiid, cpu) = early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif -#ifdef CONFIG_X86_32 - per_cpu(x86_cpu_to_logical_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -214,12 +208,8 @@ void __init setup_per_cpu_areas(void) /* indicate the early static arrays will soon be gone */ #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif -#ifdef CONFIG_X86_32 - early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; -#endif #ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index d380c9399480..2787826d9f60 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1089,7 +1089,7 @@ static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) return ret; } -void snp_set_wakeup_secondary_cpu(void) +void __init snp_set_wakeup_secondary_cpu(void) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return; @@ -1099,7 +1099,7 @@ void snp_set_wakeup_secondary_cpu(void) * required method to start APs under SNP. If the hypervisor does * not support AP creation, then no APs will be started. */ - apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; + apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); } int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c new file mode 100644 index 000000000000..fd689921a1db --- /dev/null +++ b/arch/x86/kernel/shstk.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * shstk.c - Intel shadow stack support + * + * Copyright (c) 2021, Intel Corporation. 
+ * Yu-cheng Yu <yu-cheng.yu@intel.com> + */ + +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/sched/signal.h> +#include <linux/compat.h> +#include <linux/sizes.h> +#include <linux/user.h> +#include <linux/syscalls.h> +#include <asm/msr.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/types.h> +#include <asm/shstk.h> +#include <asm/special_insns.h> +#include <asm/fpu/api.h> +#include <asm/prctl.h> + +#define SS_FRAME_SIZE 8 + +static bool features_enabled(unsigned long features) +{ + return current->thread.features & features; +} + +static void features_set(unsigned long features) +{ + current->thread.features |= features; +} + +static void features_clr(unsigned long features) +{ + current->thread.features &= ~features; +} + +/* + * Create a restore token on the shadow stack. A token is always 8 bytes + * and aligned to 8. + */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, 8)) + return -EINVAL; + + addr = ssp - SS_FRAME_SIZE; + + /* + * SSP is aligned, so the reserved bits and the mode bit are zero; just + * mark the token 64-bit. + */ + ssp |= BIT(0); + + if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + +/* + * VM_SHADOW_STACK will have a guard page. This helps userspace protect + * itself from attacks. The reasoning is as follows: + * + * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The + * INCSSP instruction can increment the shadow stack pointer. It is the + * shadow stack analog of an instruction like: + * + * addq $0x80, %rsp + * + * However, there is one important difference between an ADD on %rsp + * and INCSSP. In addition to modifying SSP, INCSSP also reads from the + * memory of the first and last elements that were "popped". It can be + * thought of as acting like this: + * + * READ_ONCE(ssp); // read+discard top element on stack + * ssp += nr_to_pop * 8; // move the shadow stack + * READ_ONCE(ssp-8); // read+discard last popped stack element + * + * The maximum distance INCSSP can move the SSP is 2040 bytes, before + * it would read the memory. Therefore a single page gap will be enough + * to prevent any operation from shifting the SSP to an adjacent stack, + * since it would have to land in the gap at least once, causing a + * fault.
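+ *
+ * (The 2040 byte limit follows from INCSSP's count operand being
+ * masked to 8 bits: at most 255 entries * 8 bytes = 2040 bytes of
+ * movement, so a 4096 byte guard page is comfortably larger.)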
+ */ +static unsigned long alloc_shstk(unsigned long addr, unsigned long size, + unsigned long token_offset, bool set_res_tok) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) + goto out; + + if (create_rstor_token(mapped_addr + token_offset, NULL)) { + vm_munmap(mapped_addr, size); + return -EINVAL; + } + +out: + return mapped_addr; +} + +static unsigned long adjust_shstk_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); +} + +static void unmap_shadow_stack(u64 base, u64 size) +{ + int r; + + r = vm_munmap(base, size); + + /* + * mmap_write_lock_killable() failed with -EINTR. This means + * the process is about to die and have its MM cleaned up. + * This task shouldn't ever make it back to userspace. In this + * case it is ok to leak a shadow stack, so just exit out. + */ + if (r == -EINTR) + return; + + /* + * For all other types of vm_munmap() failure, either the + * system is out of memory or there is a bug. + */ + WARN_ON_ONCE(r); +} + +static int shstk_setup(void) +{ + struct thread_shstk *shstk = &current->thread.shstk; + unsigned long addr, size; + + /* Already enabled */ + if (features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* Also not supported for 32 bit and x32 */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + return -EOPNOTSUPP; + + size = adjust_shstk_size(0); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return PTR_ERR((void *)addr); + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, addr + size); + wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + fpregs_unlock(); + + shstk->base = addr; + shstk->size = size; + features_set(ARCH_SHSTK_SHSTK); + + return 0; +} + +void reset_thread_features(void) +{ + memset(&current->thread.shstk, 0, sizeof(struct thread_shstk)); + current->thread.features = 0; + current->thread.features_locked = 0; +} + +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, + unsigned long stack_size) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + unsigned long addr, size; + + /* + * If shadow stack is not enabled on the new thread, skip any + * switch to a new shadow stack. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + /* + * For CLONE_VM, except vfork, the child needs a separate shadow + * stack. + */ + if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) + return 0; + + size = adjust_shstk_size(stack_size); + addr = alloc_shstk(0, size, 0, false); + if (IS_ERR_VALUE(addr)) + return addr; + + shstk->base = addr; + shstk->size = size; + + return addr + size; +} + +static unsigned long get_user_shstk_addr(void) +{ + unsigned long long ssp; + + fpregs_lock_and_load(); + + rdmsrl(MSR_IA32_PL3_SSP, ssp); + + fpregs_unlock(); + + return ssp; +} + +#define SHSTK_DATA_BIT BIT(63) + +static int put_shstk_data(u64 __user *addr, u64 data) +{ + if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) + return -EINVAL; + + /* + * Mark the high bit so that the sigframe can't be processed as a + * return address.
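+ *
+ * User addresses are always below TASK_SIZE_MAX, so bit 63 can never
+ * be set in a genuine return address; e.g. a saved SSP of
+ * 0x00007f0000001000 is stored as 0x80007f0000001000, which the
+ * sigreturn path recognizes and strips again in get_shstk_data().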
+ */ + if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) + return -EFAULT; + return 0; +} + +static int get_shstk_data(unsigned long *data, unsigned long __user *addr) +{ + unsigned long ldata; + + if (unlikely(get_user(ldata, addr))) + return -EFAULT; + + if (!(ldata & SHSTK_DATA_BIT)) + return -EINVAL; + + *data = ldata & ~SHSTK_DATA_BIT; + + return 0; +} + +static int shstk_push_sigframe(unsigned long *ssp) +{ + unsigned long target_ssp = *ssp; + + /* Token must be aligned */ + if (!IS_ALIGNED(target_ssp, 8)) + return -EINVAL; + + *ssp -= SS_FRAME_SIZE; + if (put_shstk_data((void __user *)*ssp, target_ssp)) + return -EFAULT; + + return 0; +} + +static int shstk_pop_sigframe(unsigned long *ssp) +{ + struct vm_area_struct *vma; + unsigned long token_addr; + bool need_to_check_vma; + int err = 1; + + /* + * It is possible for the SSP to be off the end of a shadow stack by 4 + * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes + * before it, it might be this case, so check that the address being + * read is actually shadow stack. + */ + if (!IS_ALIGNED(*ssp, 8)) + return -EINVAL; + + need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; + + if (need_to_check_vma) + mmap_read_lock_killable(current->mm); + + err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); + if (unlikely(err)) + goto out_err; + + if (need_to_check_vma) { + vma = find_vma(current->mm, *ssp); + if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { + err = -EFAULT; + goto out_err; + } + + mmap_read_unlock(current->mm); + } + + /* Restore SSP aligned? */ + if (unlikely(!IS_ALIGNED(token_addr, 8))) + return -EINVAL; + + /* SSP in userspace? */ + if (unlikely(token_addr >= TASK_SIZE_MAX)) + return -EINVAL; + + *ssp = token_addr; + + return 0; +out_err: + if (need_to_check_vma) + mmap_read_unlock(current->mm); + return err; +} + +int setup_signal_shadow_stack(struct ksignal *ksig) +{ + void __user *restorer = ksig->ka.sa.sa_restorer; + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + if (!restorer) + return -EINVAL; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_push_sigframe(&ssp); + if (unlikely(err)) + return err; + + /* Push restorer address */ + ssp -= SS_FRAME_SIZE; + err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); + if (unlikely(err)) + return -EFAULT; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +int restore_signal_shadow_stack(void) +{ + unsigned long ssp; + int err; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + if (unlikely(!ssp)) + return -EINVAL; + + err = shstk_pop_sigframe(&ssp); + if (unlikely(err)) + return err; + + fpregs_lock_and_load(); + wrmsrl(MSR_IA32_PL3_SSP, ssp); + fpregs_unlock(); + + return 0; +} + +void shstk_free(struct task_struct *tsk) +{ + struct thread_shstk *shstk = &tsk->thread.shstk; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !features_enabled(ARCH_SHSTK_SHSTK)) + return; + + /* + * When fork() with CLONE_VM fails, the child (tsk) already has a + * shadow stack allocated, and exit_thread() calls this function to + * free it. In this case the parent (current) and the child share + * the same mm struct. 
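+ *
+ * In every other case (e.g. a failed fork() where the child has its
+ * own half-constructed mm, or a task whose mm is already gone) the
+ * shadow stack VMA is torn down along with the mm itself, so the
+ * explicit unmap below is only done when tsk shares current's mm.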
+ */ + if (!tsk->mm || tsk->mm != current->mm) + return; + + unmap_shadow_stack(shstk->base, shstk->size); +} + +static int wrss_control(bool enable) +{ + u64 msrval; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* + * Only enable WRSS if shadow stack is enabled. If shadow stack is not + * enabled, WRSS will already be disabled, so don't bother clearing it + * when disabling. + */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return -EPERM; + + /* Already enabled/disabled? */ + if (features_enabled(ARCH_SHSTK_WRSS) == enable) + return 0; + + fpregs_lock_and_load(); + rdmsrl(MSR_IA32_U_CET, msrval); + + if (enable) { + features_set(ARCH_SHSTK_WRSS); + msrval |= CET_WRSS_EN; + } else { + features_clr(ARCH_SHSTK_WRSS); + if (!(msrval & CET_WRSS_EN)) + goto unlock; + + msrval &= ~CET_WRSS_EN; + } + + wrmsrl(MSR_IA32_U_CET, msrval); + +unlock: + fpregs_unlock(); + + return 0; +} + +static int shstk_disable(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + /* Already disabled? */ + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + fpregs_lock_and_load(); + /* Disable WRSS too when disabling shadow stack */ + wrmsrl(MSR_IA32_U_CET, 0); + wrmsrl(MSR_IA32_PL3_SSP, 0); + fpregs_unlock(); + + shstk_free(current); + features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); + + return 0; +} + +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size; + + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + return -EOPNOTSUPP; + + if (flags & ~SHADOW_STACK_SET_TOKEN) + return -EINVAL; + + /* If there isn't space for a token */ + if (set_tok && size < 8) + return -ENOSPC; + + if (addr && addr < SZ_4G) + return -ERANGE; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + aligned_size = PAGE_ALIGN(size); + if (aligned_size < size) + return -EOVERFLOW; + + return alloc_shstk(addr, aligned_size, size, set_tok); +} + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) +{ + unsigned long features = arg2; + + if (option == ARCH_SHSTK_STATUS) { + return put_user(task->thread.features, (unsigned long __user *)arg2); + } + + if (option == ARCH_SHSTK_LOCK) { + task->thread.features_locked |= features; + return 0; + } + + /* Only allow via ptrace */ + if (task != current) { + if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { + task->thread.features_locked &= ~features; + return 0; + } + return -EINVAL; + } + + /* Do not allow changing locked features */ + if (features & task->thread.features_locked) + return -EPERM; + + /* Only support enabling/disabling one feature at a time.
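 *
 * i.e. a user-space enable sequence looks roughly like this (a sketch
 * using the arch_prctl() plumbing added in process_64.c above):
 *
 *	arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
 *	arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS);
 *	arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
 *
 * where only the LOCK step may pass a mask with more than one bit set.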
*/ + if (hweight_long(features) > 1) + return -EINVAL; + + if (option == ARCH_SHSTK_DISABLE) { + if (features & ARCH_SHSTK_WRSS) + return wrss_control(false); + if (features & ARCH_SHSTK_SHSTK) + return shstk_disable(); + return -EINVAL; + } + + /* Handle ARCH_SHSTK_ENABLE */ + if (features & ARCH_SHSTK_SHSTK) + return shstk_setup(); + if (features & ARCH_SHSTK_WRSS) + return wrss_control(true); + return -EINVAL; +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cfeec3ee877e..65fe2094da59 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -40,6 +40,7 @@ #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> +#include <asm/shstk.h> static inline int is_ia32_compat_frame(struct ksignal *ksig) { diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9027fc088f97..c12624bc82a3 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -402,7 +402,7 @@ Efault: */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 13a1e6083837..cacf2ede6217 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -403,7 +409,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); -static_assert(NSIGSEGV == 9); +static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7eb18ca7bd45..6eb06d001bcc 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -135,7 +135,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) */ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ack_APIC_irq(); + apic_eoi(); cpu_emergency_disable_virtualization(); stop_this_cpu(NULL); } @@ -237,7 +237,7 @@ static void native_stop_other_cpus(int wait) pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) - apic->send_IPI(cpu, NMI_VECTOR); + __apic_send_IPI(cpu, NMI_VECTOR); } /* * Don't wait longer than 10 ms if the caller didn't @@ -268,7 +268,7 @@ done: */ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { - ack_APIC_irq(); + apic_eoi(); trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); scheduler_ipi(); @@ -277,7 +277,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ack_APIC_irq(); + apic_eoi(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); @@ -286,7 +286,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ack_APIC_irq(); + apic_eoi(); 
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d40ed3a7dc23..4e45ff44aa07 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -414,7 +414,7 @@ found: return 0; } -void __init smp_store_boot_cpu_info(void) +static void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ struct cpuinfo_x86 *c = &cpu_data(id); @@ -761,44 +761,6 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1\n"); } -void __inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - const char * const names[] = { "ID", "VERSION", "SPIV" }; - int timeout; - u32 status; - - pr_info("Inquiring remote APIC 0x%x...\n", apicid); - - for (i = 0; i < ARRAY_SIZE(regs); i++) { - pr_info("... APIC 0x%x %s: ", apicid, names[i]); - - /* - * Wait for idle. - */ - status = safe_apic_wait_icr_idle(); - if (status) - pr_cont("a previous APIC delivery may have failed\n"); - - apic_icr_write(APIC_DM_REMRD | regs[i], apicid); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - pr_cont("%08x\n", status); - break; - default: - pr_cont("failed\n"); - } - } -} - /* * The Multiprocessor Specification 1.4 (1997) example code suggests * that there should be a 10ms delay between the BSP asserting INIT @@ -1089,9 +1051,8 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || - !physid_isset(apicid, phys_cpu_present_map) || - !apic->apic_id_valid(apicid)) { + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || + !apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -1174,58 +1135,6 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_die_cpumask(0)); } -/* - * Various sanity checks. - */ -static void __init smp_sanity_check(void) -{ - preempt_disable(); - -#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) - if (def_to_bigsmp && nr_cpu_ids > 8) { - unsigned int cpu; - unsigned nr; - - pr_warn("More than 8 CPUs detected - skipping them\n" - "Use CONFIG_X86_BIGSMP\n"); - - nr = 0; - for_each_present_cpu(cpu) { - if (nr >= 8) - set_cpu_present(cpu, false); - nr++; - } - - nr = 0; - for_each_possible_cpu(cpu) { - if (nr >= 8) - set_cpu_possible(cpu, false); - nr++; - } - - set_nr_cpu_ids(8); - } -#endif - - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", - hard_smp_processor_id()); - - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. 
- */ - if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", - boot_cpu_physical_apicid); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - preempt_enable(); -} - static void __init smp_cpu_index_default(void) { int i; @@ -1285,8 +1194,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - smp_sanity_check(); - switch (apic_intr_mode) { case APIC_PIC: case APIC_VIRTUAL_WIRE_NO_CONFIG: @@ -1343,7 +1250,7 @@ bool smp_park_other_cpus_in_init(void) if (this_cpu) return false; - for_each_present_cpu(cpu) { + for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) { if (cpu == this_cpu) continue; apicid = apic->cpu_present_to_apicid(cpu); @@ -1422,24 +1329,6 @@ __init void prefill_possible_map(void) { int i, possible; - /* No boot processor was found in mptable or ACPI MADT */ - if (!num_processors) { - if (boot_cpu_has(X86_FEATURE_APIC)) { - int apicid = boot_cpu_physical_apicid; - int cpu = hard_smp_processor_id(); - - pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); - - /* Make sure boot cpu is enumerated */ - if (apic->cpu_present_to_apicid(0) == BAD_APICID && - apic->apic_id_valid(apicid)) - generic_processor_info(apicid, boot_cpu_apic_version); - } - - if (!num_processors) - num_processors = 1; - } - i = setup_max_cpus ?: 1; if (setup_possible_cpus == -1) { possible = num_processors; @@ -1601,9 +1490,7 @@ void play_dead_common(void) idle_task_exit(); cpuhp_ap_report_dead(); - /* - * With physical CPU hotplug, we should halt the cpu - */ + local_irq_disable(); } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8cc653ffdccd..c783aeb37dce 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -193,7 +193,11 @@ get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + if (!in_32bit_syscall() && (flags & MAP_ABOVE4G)) + info.low_limit = SZ_4G; + else + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4a817d20ce3b..c876f1d36a81 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -77,18 +77,6 @@ DECLARE_BITMAP(system_vectors, NR_VECTORS); -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) @@ -213,81 +201,6 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } -#ifdef CONFIG_X86_KERNEL_IBT - -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - -enum cp_error_code { - CP_EC = (1 << 15) - 1, - - CP_RET = 1, - CP_IRET = 2, - CP_ENDBR = 3, - CP_RSTRORSSP = 4, - CP_SETSSBSY = 5, - - CP_ENCL = 1 << 15, -}; - -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) -{ - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); - } - - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) - return; - - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { - regs->ax = 0; - return; - } - - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); - if (!ibt_fatal) { - printk(KERN_DEFAULT 
CUT_HERE); - __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); - return; - } - BUG(); -} - -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - -static int __init ibt_setup(char *str) -{ - if (!strcmp(str, "off")) - setup_clear_cpu_cap(X86_FEATURE_IBT); - - if (!strcmp(str, "warn")) - ibt_fatal = false; - - return 1; -} - -__setup("ibt=", ibt_setup); - -#endif /* CONFIG_X86_KERNEL_IBT */ - #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 83d41c2601d7..f15fb71f280e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -156,7 +156,7 @@ SECTIONS ALIGN_ENTRY_TEXT_END *(.gnu.warning) - } :text =0xcccc + } :text = 0xcccccccc /* End of text section, which should occupy whole number of pages */ _etext = .; diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 796cfaa46bfa..65e96b76c423 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -129,7 +129,7 @@ static void __init vsmp_cap_cpus(void) static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return read_apic_id() >> index_msb; } static void vsmp_apic_post_init(void)