summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/kernel/compat_alignment.c32
-rw-r--r--arch/arm64/kvm/arm.c1
-rw-r--r--arch/arm64/kvm/mmu.c99
-rw-r--r--arch/arm64/kvm/pmu-emul.c3
-rw-r--r--arch/arm64/kvm/sys_regs.c21
-rw-r--r--arch/mips/bmips/dma.c5
-rw-r--r--arch/mips/bmips/setup.c8
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush.h9
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace-view.c6
-rw-r--r--arch/powerpc/kvm/powerpc.c6
-rw-r--r--arch/powerpc/platforms/pseries/vas.c8
-rw-r--r--arch/riscv/Kconfig12
-rw-r--r--arch/riscv/Kconfig.erratas6
-rw-r--r--arch/riscv/include/asm/hwcap.h50
-rw-r--r--arch/riscv/kvm/vcpu_timer.c6
-rw-r--r--arch/s390/Makefile2
-rw-r--r--arch/s390/kernel/ptrace.c8
-rw-r--r--arch/s390/kvm/intercept.c32
-rw-r--r--arch/s390/kvm/kvm-s390.c1
-rw-r--r--arch/s390/lib/uaccess.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c12
-rw-r--r--arch/x86/kvm/ioapic.c36
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h5
-rw-r--r--arch/x86/kvm/svm/svm.c37
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h15
-rw-r--r--arch/x86/kvm/vmx/nested.c7
-rw-r--r--arch/x86/kvm/x86.c14
-rw-r--r--arch/xtensa/kernel/traps.c16
28 files changed, 334 insertions, 125 deletions
diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c
index 5edec2f49ec9..deff21bfa680 100644
--- a/arch/arm64/kernel/compat_alignment.c
+++ b/arch/arm64/kernel/compat_alignment.c
@@ -314,36 +314,32 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs)
int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs);
unsigned int type;
u32 instr = 0;
- u16 tinstr = 0;
int isize = 4;
int thumb2_32b = 0;
- int fault;
instrptr = instruction_pointer(regs);
if (compat_thumb_mode(regs)) {
__le16 __user *ptr = (__le16 __user *)(instrptr & ~1);
+ u16 tinstr, tinst2;
- fault = alignment_get_thumb(regs, ptr, &tinstr);
- if (!fault) {
- if (IS_T32(tinstr)) {
- /* Thumb-2 32-bit */
- u16 tinst2;
- fault = alignment_get_thumb(regs, ptr + 1, &tinst2);
- instr = ((u32)tinstr << 16) | tinst2;
- thumb2_32b = 1;
- } else {
- isize = 2;
- instr = thumb2arm(tinstr);
- }
+ if (alignment_get_thumb(regs, ptr, &tinstr))
+ return 1;
+
+ if (IS_T32(tinstr)) { /* Thumb-2 32-bit */
+ if (alignment_get_thumb(regs, ptr + 1, &tinst2))
+ return 1;
+ instr = ((u32)tinstr << 16) | tinst2;
+ thumb2_32b = 1;
+ } else {
+ isize = 2;
+ instr = thumb2arm(tinstr);
}
} else {
- fault = alignment_get_arm(regs, (__le32 __user *)instrptr, &instr);
+ if (alignment_get_arm(regs, (__le32 __user *)instrptr, &instr))
+ return 1;
}
- if (fault)
- return 1;
-
switch (CODING_BITS(instr)) {
case 0x00000000: /* 3.13.4 load/store instruction extensions */
if (LDSTHD_I_BIT(instr))
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 3bd732eaf087..3f6a5efdbcf0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -220,6 +220,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_PTP_KVM:
case KVM_CAP_ARM_SYSTEM_SUSPEND:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7113587222ff..3b9d4d24c361 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
CONFIG_PGTABLE_LEVELS),
.mm_ops = &kvm_user_mm_ops,
};
+ unsigned long flags;
kvm_pte_t pte = 0; /* Keep GCC quiet... */
u32 level = ~0;
int ret;
+ /*
+ * Disable IRQs so that we hazard against a concurrent
+ * teardown of the userspace page tables (which relies on
+ * IPI-ing threads).
+ */
+ local_irq_save(flags);
ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
- VM_BUG_ON(ret);
- VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
- VM_BUG_ON(!(pte & PTE_VALID));
+ local_irq_restore(flags);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Not seeing an error, but not updating level? Something went
+ * deeply wrong...
+ */
+ if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EFAULT;
+
+ /* Oops, the userspace PTs are gone... Replay the fault */
+ if (!kvm_pte_valid(pte))
+ return -EAGAIN;
return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}
@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*
* Returns the size of the mapping.
*/
-static unsigned long
+static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot.
*/
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
- get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ int sz = get_user_mapping_size(kvm, hva);
+
+ if (sz < 0)
+ return sz;
+
+ if (sz < PMD_SIZE)
+ return PAGE_SIZE;
+
/*
* The address we faulted on is backed by a transparent huge
* page. However, because we map the compound huge page and
@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{
int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault;
+ bool exec_fault, mte_allowed;
bool device = false;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
- unsigned long vma_pagesize, fault_granule;
+ long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
@@ -1218,6 +1244,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
/*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (fault_status != ESR_ELx_FSC_PERM ||
+ (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(kvm));
+ if (ret)
+ return ret;
+ }
+
+ /*
* Let's check if we will get back a huge page backed by hugetlbfs, or
* get block mapping for device MMIO region.
*/
@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
fault_ipa &= ~(vma_pagesize - 1);
gfn = fault_ipa >> PAGE_SHIFT;
- mmap_read_unlock(current->mm);
+ mte_allowed = kvm_vma_mte_allowed(vma);
- /*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
- */
- if (fault_status != ESR_ELx_FSC_PERM ||
- (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
- }
+ /* Don't use the VMA after the unlock -- it may have vanished */
+ vma = NULL;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
/*
- * Ensure the read of mmu_invalidate_seq happens before we call
- * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
- * the page we just got a reference to gets unmapped before we have a
- * chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_gfn will take it away
- * from us again properly. This smp_rmb() interacts with the smp_wmb()
- * in kvm_mmu_notifier_invalidate_<page|range_end>.
+ * Read mmu_invalidate_seq so that KVM can detect if the results of
+ * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+ * acquiring kvm->mmu_lock.
*
- * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
- * used to avoid unnecessary overhead introduced to locate the memory
- * slot because it's always fixed even @gfn is adjusted for huge pages.
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+ * with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- smp_rmb();
+ mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ mmap_read_unlock(current->mm);
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
write_fault, &writable, NULL);
@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
&fault_ipa);
+
+ if (vma_pagesize < 0) {
+ ret = vma_pagesize;
+ goto out_unlock;
+ }
}
if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
- if (kvm_vma_mte_allowed(vma)) {
+ if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
} else {
ret = -EFAULT;
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 24908400e190..c243b10f3e15 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
if (!kvm_pmu_is_3p5(vcpu))
val &= ~ARMV8_PMU_PMCR_LP;
- __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+ /* The reset bits don't indicate any state, and shouldn't be saved. */
+ __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P);
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 53749d3a0996..1b2c161120be 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
return true;
}
+static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
+ u64 *val)
+{
+ u64 idx;
+
+ if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
+ /* PMCCNTR_EL0 */
+ idx = ARMV8_PMU_CYCLE_IDX;
+ else
+ /* PMEVCNTRn_EL0 */
+ idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+
+ *val = kvm_pmu_get_counter_value(vcpu, idx);
+ return 0;
+}
+
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
/* Macro to expand the PMEVCNTRn_EL0 register */
#define PMU_PMEVCNTR_EL0(n) \
{ PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \
- .reset = reset_pmevcntr, \
+ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */
@@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ PMU_SYS_REG(SYS_PMCEID1_EL0),
.access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCCNTR_EL0),
- .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 },
+ .access = access_pmu_evcntr, .reset = reset_unknown,
+ .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
{ PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c
index 33788668cbdb..3779e7855bd7 100644
--- a/arch/mips/bmips/dma.c
+++ b/arch/mips/bmips/dma.c
@@ -5,6 +5,8 @@
#include <asm/bmips.h>
#include <asm/io.h>
+bool bmips_rac_flush_disable;
+
void arch_sync_dma_for_cpu_all(void)
{
void __iomem *cbr = BMIPS_GET_CBR();
@@ -15,6 +17,9 @@ void arch_sync_dma_for_cpu_all(void)
boot_cpu_type() != CPU_BMIPS4380)
return;
+ if (unlikely(bmips_rac_flush_disable))
+ return;
+
/* Flush stale data out of the readahead cache */
cfg = __raw_readl(cbr + BMIPS_RAC_CONFIG);
__raw_writel(cfg | 0x100, cbr + BMIPS_RAC_CONFIG);
diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c
index e95b3f78e7cd..549a6392a3d2 100644
--- a/arch/mips/bmips/setup.c
+++ b/arch/mips/bmips/setup.c
@@ -35,6 +35,8 @@
#define REG_BCM6328_OTP ((void __iomem *)CKSEG1ADDR(0x1000062c))
#define BCM6328_TP1_DISABLED BIT(9)
+extern bool bmips_rac_flush_disable;
+
static const unsigned long kbase = VMLINUX_LOAD_ADDRESS & 0xfff00000;
struct bmips_quirk {
@@ -104,6 +106,12 @@ static void bcm6358_quirks(void)
* disable SMP for now
*/
bmips_smp_enabled = 0;
+
+ /*
+ * RAC flush causes kernel panics on BCM6358 when booting from TP1
+ * because the bootloader is not initializing it properly.
+ */
+ bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31));
}
static void bcm6368_quirks(void)
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 2bbc0fcce04a..5e26c7f2c25a 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -148,6 +148,11 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
*/
}
+static inline bool __pte_protnone(unsigned long pte)
+{
+ return (pte & (pgprot_val(PAGE_NONE) | _PAGE_RWX)) == pgprot_val(PAGE_NONE);
+}
+
static inline bool __pte_flags_need_flush(unsigned long oldval,
unsigned long newval)
{
@@ -164,8 +169,8 @@ static inline bool __pte_flags_need_flush(unsigned long oldval,
/*
* We do not expect kernel mappings or non-PTEs or not-present PTEs.
*/
- VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED);
- VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED);
+ VM_WARN_ON_ONCE(!__pte_protnone(oldval) && oldval & _PAGE_PRIVILEGED);
+ VM_WARN_ON_ONCE(!__pte_protnone(newval) && newval & _PAGE_PRIVILEGED);
VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE));
VM_WARN_ON_ONCE(!(newval & _PAGE_PTE));
VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT));
diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c
index 2087a785f05f..5fff0d04b23f 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-view.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-view.c
@@ -290,6 +290,9 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
static int ppr_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
+ if (!target->thread.regs)
+ return -EINVAL;
+
return membuf_write(&to, &target->thread.regs->ppr, sizeof(u64));
}
@@ -297,6 +300,9 @@ static int ppr_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count, const void *kbuf,
const void __user *ubuf)
{
+ if (!target->thread.regs)
+ return -EINVAL;
+
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.regs->ppr, 0, sizeof(u64));
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4c5405fc5538..d23e25e8432d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -576,6 +576,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
+#ifdef CONFIG_HAVE_KVM_IRQFD
+ case KVM_CAP_IRQFD_RESAMPLE:
+ r = !xive_enabled();
+ break;
+#endif
+
case KVM_CAP_PPC_ALLOC_HTAB:
r = hv_enabled;
break;
diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index 559112312810..513180467562 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -856,6 +856,13 @@ int pseries_vas_dlpar_cpu(void)
{
int new_nr_creds, rc;
+ /*
+ * NX-GZIP is not enabled. Nothing to do for DLPAR event
+ */
+ if (!copypaste_feat)
+ return 0;
+
+
rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat,
(u64)virt_to_phys(&hv_cop_caps));
@@ -1012,6 +1019,7 @@ static int __init pseries_vas_init(void)
* Linux supports user space COPY/PASTE only with Radix
*/
if (!radix_enabled()) {
+ copypaste_feat = false;
pr_err("API is supported only with radix page tables\n");
return -ENOTSUPP;
}
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 5b182d1c196c..eb7f29a412f8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -126,6 +126,7 @@ config RISCV
select OF_IRQ
select PCI_DOMAINS_GENERIC if PCI
select PCI_MSI if PCI
+ select RISCV_ALTERNATIVE if !XIP_KERNEL
select RISCV_INTC
select RISCV_TIMER if RISCV_SBI
select SIFIVE_PLIC
@@ -401,9 +402,8 @@ config RISCV_ISA_C
config RISCV_ISA_SVPBMT
bool "SVPBMT extension support"
depends on 64BIT && MMU
- depends on !XIP_KERNEL
+ depends on RISCV_ALTERNATIVE
default y
- select RISCV_ALTERNATIVE
help
Adds support to dynamically detect the presence of the SVPBMT
ISA-extension (Supervisor-mode: page-based memory types) and
@@ -428,8 +428,8 @@ config TOOLCHAIN_HAS_ZBB
config RISCV_ISA_ZBB
bool "Zbb extension support for bit manipulation instructions"
depends on TOOLCHAIN_HAS_ZBB
- depends on !XIP_KERNEL && MMU
- select RISCV_ALTERNATIVE
+ depends on MMU
+ depends on RISCV_ALTERNATIVE
default y
help
Adds support to dynamically detect the presence of the ZBB
@@ -443,9 +443,9 @@ config RISCV_ISA_ZBB
config RISCV_ISA_ZICBOM
bool "Zicbom extension support for non-coherent DMA operation"
- depends on !XIP_KERNEL && MMU
+ depends on MMU
+ depends on RISCV_ALTERNATIVE
default y
- select RISCV_ALTERNATIVE
select RISCV_DMA_NONCOHERENT
help
Adds support to dynamically detect the presence of the ZICBOM
diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas
index 69621ae6d647..0c8f4652cd82 100644
--- a/arch/riscv/Kconfig.erratas
+++ b/arch/riscv/Kconfig.erratas
@@ -2,8 +2,7 @@ menu "CPU errata selection"
config ERRATA_SIFIVE
bool "SiFive errata"
- depends on !XIP_KERNEL
- select RISCV_ALTERNATIVE
+ depends on RISCV_ALTERNATIVE
help
All SiFive errata Kconfig depend on this Kconfig. Disabling
this Kconfig will disable all SiFive errata. Please say "Y"
@@ -35,8 +34,7 @@ config ERRATA_SIFIVE_CIP_1200
config ERRATA_THEAD
bool "T-HEAD errata"
- depends on !XIP_KERNEL
- select RISCV_ALTERNATIVE
+ depends on RISCV_ALTERNATIVE
help
All T-HEAD errata Kconfig depend on this Kconfig. Disabling
this Kconfig will disable all T-HEAD errata. Please say "Y"
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index e3021b2590de..6263a0de1c6a 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -57,18 +57,31 @@ struct riscv_isa_ext_data {
unsigned int isa_ext_id;
};
+unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap);
+
+#define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext)
+
+bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit);
+#define riscv_isa_extension_available(isa_bitmap, ext) \
+ __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext)
+
static __always_inline bool
riscv_has_extension_likely(const unsigned long ext)
{
compiletime_assert(ext < RISCV_ISA_EXT_MAX,
"ext must be < RISCV_ISA_EXT_MAX");
- asm_volatile_goto(
- ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1)
- :
- : [ext] "i" (ext)
- :
- : l_no);
+ if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ asm_volatile_goto(
+ ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1)
+ :
+ : [ext] "i" (ext)
+ :
+ : l_no);
+ } else {
+ if (!__riscv_isa_extension_available(NULL, ext))
+ goto l_no;
+ }
return true;
l_no:
@@ -81,26 +94,23 @@ riscv_has_extension_unlikely(const unsigned long ext)
compiletime_assert(ext < RISCV_ISA_EXT_MAX,
"ext must be < RISCV_ISA_EXT_MAX");
- asm_volatile_goto(
- ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1)
- :
- : [ext] "i" (ext)
- :
- : l_yes);
+ if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
+ asm_volatile_goto(
+ ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1)
+ :
+ : [ext] "i" (ext)
+ :
+ : l_yes);
+ } else {
+ if (__riscv_isa_extension_available(NULL, ext))
+ goto l_yes;
+ }
return false;
l_yes:
return true;
}
-unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap);
-
-#define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext)
-
-bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit);
-#define riscv_isa_extension_available(isa_bitmap, ext) \
- __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext)
-
#endif
#endif /* _ASM_RISCV_HWCAP_H */
diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c
index ad34519c8a13..3ac2ff6a65da 100644
--- a/arch/riscv/kvm/vcpu_timer.c
+++ b/arch/riscv/kvm/vcpu_timer.c
@@ -147,10 +147,8 @@ static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu)
return;
delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t);
- if (delta_ns) {
- hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL);
- t->next_set = true;
- }
+ hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL);
+ t->next_set = true;
}
static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index b3235ab0ace8..ed646c583e4f 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -162,7 +162,7 @@ vdso_prepare: prepare0
ifdef CONFIG_EXPOLINE_EXTERN
modules_prepare: expoline_prepare
-expoline_prepare:
+expoline_prepare: scripts
$(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
endif
endif
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index cf9659e13f03..ea244a73efad 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -474,9 +474,7 @@ long arch_ptrace(struct task_struct *child, long request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned long __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned long __user *)data);
case PTRACE_ENABLE_TE:
if (!MACHINE_HAS_TE)
return -EIO;
@@ -824,9 +822,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned int __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned int __user *)data);
}
return compat_ptrace_request(child, request, addr, data);
}
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 0ee02dae14b2..2cda8d9d7c6e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -271,10 +271,18 @@ static int handle_prog(struct kvm_vcpu *vcpu)
* handle_external_interrupt - used for external interruption interceptions
* @vcpu: virtual cpu
*
- * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
- * the new PSW does not have external interrupts disabled. In the first case,
- * we've got to deliver the interrupt manually, and in the second case, we
- * drop to userspace to handle the situation there.
+ * This interception occurs if:
+ * - the CPUSTAT_EXT_INT bit was already set when the external interrupt
+ * occurred. In this case, the interrupt needs to be injected manually to
+ * preserve interrupt priority.
+ * - the external new PSW has external interrupts enabled, which will cause an
+ * interruption loop. We drop to userspace in this case.
+ *
+ * The latter case can be detected by inspecting the external mask bit in the
+ * external new psw.
+ *
+ * Under PV, only the latter case can occur, since interrupt priorities are
+ * handled in the ultravisor.
*/
static int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
@@ -285,10 +293,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
vcpu->stat.exit_external_interrupt++;
- rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
- if (rc)
- return rc;
- /* We can not handle clock comparator or timer interrupt with bad PSW */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ newpsw = vcpu->arch.sie_block->gpsw;
+ } else {
+ rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
+ if (rc)
+ return rc;
+ }
+
+ /*
+ * Clock comparator or timer interrupt with external interrupt enabled
+ * will cause interrupt loop. Drop to userspace.
+ */
if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
(newpsw.mask & PSW_MASK_EXT))
return -EOPNOTSUPP;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 39b36562c043..1eeb9ae57879 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -573,6 +573,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_VCPU_RESETS:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 720036fb1924..d44214072779 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -172,7 +172,7 @@ unsigned long __clear_user(void __user *to, unsigned long size)
"4: slgr %0,%0\n"
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(6b,2b) EX_TABLE(3b,5b) EX_TABLE(7b,5b)
- : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2)
+ : "+&a" (size), "+&a" (to), "+a" (tmp1), "=&a" (tmp2)
: "a" (empty_zero_page), [spec] "d" (spec.val)
: "cc", "memory", "0");
return size;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f36dc2f796c5..f1197366a97d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -358,12 +358,16 @@ static void __init ms_hyperv_init_platform(void)
* To mirror what Windows does we should extract CPU management
* features and use the ReservedIdentityBit to detect if Linux is the
* root partition. But that requires negotiating CPU management
- * interface (a process to be finalized).
+ * interface (a process to be finalized). For now, use the privilege
+ * flag as the indicator for running as root.
*
- * For now, use the privilege flag as the indicator for running as
- * root.
+ * Hyper-V should never specify running as root and as a Confidential
+ * VM. But to protect against a compromised/malicious Hyper-V trying
+ * to exploit root behavior to expose Confidential VM memory, ignore
+ * the root partition setting if also a Confidential VM.
*/
- if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
+ if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
+ !(ms_hyperv.priv_high & HV_ISOLATION)) {
hv_root_partition = true;
pr_info("Hyper-V: running as root partition\n");
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 042dee556125..995eb5054360 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -368,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
- if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
- && ioapic->irr & (1 << index))
- ioapic_service(ioapic, index, false);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+ /*
+ * Pending status in irr may be outdated: the IRQ line may have
+ * already been deasserted by a device while the IRQ was masked.
+ * This occurs, for instance, if the interrupt is handled in a
+ * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+ * case the guest acknowledges the interrupt to the device in
+ * its threaded irq handler, i.e. after the EOI but before
+ * unmasking, so at the time of unmasking the IRQ line is
+ * already down but our pending irr bit is still set. In such
+ * cases, injecting this pending interrupt to the guest is
+ * buggy: the guest will receive an extra unwanted interrupt.
+ *
+ * So we need to check here if the IRQ is actually still pending.
+ * As we are generally not able to probe the IRQ line status
+ * directly, we do it through irqfd resampler. Namely, we clear
+ * the pending status and notify the resampler that this interrupt
+ * is done, without actually injecting it into the guest. If the
+ * IRQ line is actually already deasserted, we are done. If it is
+ * still asserted, a new interrupt will be shortly triggered
+ * through irqfd and injected into the guest.
+ *
+ * If, however, it's not possible to resample (no irqfd resampler
+ * registered for this irq), then unconditionally inject this
+ * pending interrupt into the guest, so the guest will not miss
+ * an interrupt, although may get an extra unwanted interrupt.
+ */
+ if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+ ioapic->irr &= ~(1 << index);
+ else
+ ioapic_service(ioapic, index, false);
+ }
if (e->fields.delivery_mode == APIC_DM_FIXED) {
struct kvm_lapic_irq irq;
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 287e98ef9df3..6272dabec02d 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -12,6 +12,11 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
int hv_remote_flush_tlb(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
+static inline int hv_remote_flush_tlb(struct kvm *kvm)
+{
+ return -EOPNOTSUPP;
+}
+
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 252e7f37e4e2..f25bc3cbb250 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3729,7 +3729,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3753,6 +3753,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation--;
}
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_remote_flush_tlb() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_remote_flush_tlb(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4745,10 +4776,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_all = svm_flush_tlb_all,
.flush_tlb_current = svm_flush_tlb_current,
.flush_tlb_gva = svm_flush_tlb_gva,
- .flush_tlb_guest = svm_flush_tlb_current,
+ .flush_tlb_guest = svm_flush_tlb_asid,
.vcpu_pre_run = svm_vcpu_pre_run,
.vcpu_run = svm_vcpu_run,
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index cff838f15db5..786d46d73a8e 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -6,6 +6,8 @@
#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#include <asm/mshyperv.h>
+
#if IS_ENABLED(CONFIG_HYPERV)
#include "kvm_onhyperv.h"
@@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops;
int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
+
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
@@ -80,6 +90,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
}
#else
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1bc2b80273c9..768487611db7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3868,7 +3868,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
exit_qual = 0;
}
- if (ex->has_error_code) {
+ /*
+ * Unlike AMD's Paged Real Mode, which reports an error code on #PF
+ * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
+ * "has error code" flags on VM-Exit if the CPU is in Real Mode.
+ */
+ if (ex->has_error_code && is_protmode(vcpu)) {
/*
* Intel CPUs do not generate error codes with bits 31:16 set,
* and more importantly VMX disallows setting bits 31:16 in the
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7713420abab0..3d852ce84920 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4432,6 +4432,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VAPIC:
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -8903,6 +8904,8 @@ restart:
}
if (ctxt->have_exception) {
+ WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
+ vcpu->mmio_needed = false;
r = 1;
inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
@@ -9906,13 +9909,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
+ /*
+ * Suppress the error code if the vCPU is in Real Mode, as Real Mode
+ * exceptions don't report error codes. The presence of an error code
+ * is carried with the exception and only stripped when the exception
+ * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
+ * report an error code despite the CPU being in Real Mode.
+ */
+ vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
+
trace_kvm_inj_exception(vcpu->arch.exception.vector,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
- vcpu->arch.exception.error_code = false;
static_call(kvm_x86_inject_exception)(vcpu);
}
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index cd98366a9b23..f0a7d1c2641e 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -539,7 +539,7 @@ static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH;
void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
{
- size_t len;
+ size_t len, off = 0;
if (!sp)
sp = stack_pointer(task);
@@ -548,9 +548,17 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl)
kstack_depth_to_print * STACK_DUMP_ENTRY_SIZE);
printk("%sStack:\n", loglvl);
- print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE,
- STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE,
- sp, len, false);
+ while (off < len) {
+ u8 line[STACK_DUMP_LINE_SIZE];
+ size_t line_len = len - off > STACK_DUMP_LINE_SIZE ?
+ STACK_DUMP_LINE_SIZE : len - off;
+
+ __memcpy(line, (u8 *)sp + off, line_len);
+ print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE,
+ STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE,
+ line, line_len, false);
+ off += STACK_DUMP_LINE_SIZE;
+ }
show_trace(task, sp, loglvl);
}