summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2011-03-11 12:22:26 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2011-03-11 12:22:26 +1100
commitc29220569d4561dc7480ad437ad614e427b606a5 (patch)
tree278a5753441fb2fd74e721aa8cb9b473d1af8cfb
parente3e06eeb07fa571398b85d8799c74698cabaec69 (diff)
parent153bdc68cfa7d1fdbf64908c2da0a176ae355faa (diff)
Merge remote-tracking branch 'kvm/linux-next'
-rw-r--r--Documentation/kvm/locking.txt25
-rw-r--r--arch/alpha/include/asm/errno.h2
-rw-r--r--arch/ia64/kvm/kvm-ia64.c2
-rw-r--r--arch/mips/include/asm/errno.h2
-rw-r--r--arch/parisc/include/asm/errno.h2
-rw-r--r--arch/powerpc/kvm/book3s.c14
-rw-r--r--arch/powerpc/kvm/booke.c14
-rw-r--r--arch/sparc/include/asm/errno.h2
-rw-r--r--arch/x86/include/asm/kvm_emulate.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h5
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kvm/emulate.c52
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/lapic.c13
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c44
-rw-r--r--arch/x86/kvm/paging_tmpl.h5
-rw-r--r--arch/x86/kvm/svm.c27
-rw-r--r--arch/x86/kvm/vmx.c103
-rw-r--r--arch/x86/kvm/x86.c153
-rw-r--r--drivers/infiniband/hw/ipath/ipath_user_pages.c6
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c6
-rw-r--r--include/asm-generic/cputime.h3
-rw-r--r--include/asm-generic/errno.h2
-rw-r--r--include/linux/interrupt.h7
-rw-r--r--include/linux/jiffies.h1
-rw-r--r--include/linux/kvm_host.h31
-rw-r--r--include/linux/mm.h13
-rw-r--r--include/linux/sched.h13
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/sched.c274
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c271
-rw-r--r--kernel/sched_idletask.c26
-rw-r--r--kernel/sched_rt.c19
-rw-r--r--kernel/sched_stoptask.c7
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time.c23
-rw-r--r--mm/internal.h5
-rw-r--r--mm/memory-failure.c32
-rw-r--r--mm/memory.c63
-rw-r--r--virt/kvm/eventfd.c5
-rw-r--r--virt/kvm/kvm_main.c139
46 files changed, 969 insertions, 491 deletions
diff --git a/Documentation/kvm/locking.txt b/Documentation/kvm/locking.txt
new file mode 100644
index 000000000000..3b4cd3bf5631
--- /dev/null
+++ b/Documentation/kvm/locking.txt
@@ -0,0 +1,25 @@
+KVM Lock Overview
+=================
+
+1. Acquisition Orders
+---------------------
+
+(to be written)
+
+2. Reference
+------------
+
+Name: kvm_lock
+Type: raw_spinlock
+Arch: any
+Protects: - vm_list
+ - hardware virtualization enable/disable
+Comment: 'raw' because hardware enabling/disabling must be atomic /wrt
+ migration.
+
+Name: kvm_arch::tsc_write_lock
+Type: raw_spinlock
+Arch: x86
+Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
+ - tsc offset in vmcb
+Comment: 'raw' because updating the tsc offsets must not be preempted.
diff --git a/arch/alpha/include/asm/errno.h b/arch/alpha/include/asm/errno.h
index 98099bda9370..e5f29ca28180 100644
--- a/arch/alpha/include/asm/errno.h
+++ b/arch/alpha/include/asm/errno.h
@@ -122,4 +122,6 @@
#define ERFKILL 138 /* Operation not possible due to RF-kill */
+#define EHWPOISON 139 /* Memory page has hardware error */
+
#endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 70d224d4264c..8213efe1998c 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -662,6 +662,7 @@ again:
goto vcpu_run_fail;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ vcpu->mode = IN_GUEST_MODE;
kvm_guest_enter();
/*
@@ -683,6 +684,7 @@ again:
*/
barrier();
kvm_guest_exit();
+ vcpu->mode = OUTSIDE_GUEST_MODE;
preempt_enable();
idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/mips/include/asm/errno.h b/arch/mips/include/asm/errno.h
index a0efc73819e4..6dcd3583ed04 100644
--- a/arch/mips/include/asm/errno.h
+++ b/arch/mips/include/asm/errno.h
@@ -121,6 +121,8 @@
#define ERFKILL 167 /* Operation not possible due to RF-kill */
+#define EHWPOISON 168 /* Memory page has hardware error */
+
#define EDQUOT 1133 /* Quota exceeded */
#ifdef __KERNEL__
diff --git a/arch/parisc/include/asm/errno.h b/arch/parisc/include/asm/errno.h
index 9992abdd782d..135ad6047e51 100644
--- a/arch/parisc/include/asm/errno.h
+++ b/arch/parisc/include/asm/errno.h
@@ -122,4 +122,6 @@
#define ERFKILL 256 /* Operation not possible due to RF-kill */
+#define EHWPOISON 257 /* Memory page has hardware error */
+
#endif
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index badc983031b3..c961de40c676 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1141,9 +1141,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg1 = vcpu->arch.shared->sprg1;
regs->sprg2 = vcpu->arch.shared->sprg2;
regs->sprg3 = vcpu->arch.shared->sprg3;
- regs->sprg5 = vcpu->arch.sprg4;
- regs->sprg6 = vcpu->arch.sprg5;
- regs->sprg7 = vcpu->arch.sprg6;
+ regs->sprg4 = vcpu->arch.sprg4;
+ regs->sprg5 = vcpu->arch.sprg5;
+ regs->sprg6 = vcpu->arch.sprg6;
+ regs->sprg7 = vcpu->arch.sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -1167,9 +1168,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.shared->sprg1 = regs->sprg1;
vcpu->arch.shared->sprg2 = regs->sprg2;
vcpu->arch.shared->sprg3 = regs->sprg3;
- vcpu->arch.sprg5 = regs->sprg4;
- vcpu->arch.sprg6 = regs->sprg5;
- vcpu->arch.sprg7 = regs->sprg6;
+ vcpu->arch.sprg4 = regs->sprg4;
+ vcpu->arch.sprg5 = regs->sprg5;
+ vcpu->arch.sprg6 = regs->sprg6;
+ vcpu->arch.sprg7 = regs->sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 77575d08c818..ef76acb455c3 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -546,9 +546,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg1 = vcpu->arch.shared->sprg1;
regs->sprg2 = vcpu->arch.shared->sprg2;
regs->sprg3 = vcpu->arch.shared->sprg3;
- regs->sprg5 = vcpu->arch.sprg4;
- regs->sprg6 = vcpu->arch.sprg5;
- regs->sprg7 = vcpu->arch.sprg6;
+ regs->sprg4 = vcpu->arch.sprg4;
+ regs->sprg5 = vcpu->arch.sprg5;
+ regs->sprg6 = vcpu->arch.sprg6;
+ regs->sprg7 = vcpu->arch.sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@@ -572,9 +573,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.shared->sprg1 = regs->sprg1;
vcpu->arch.shared->sprg2 = regs->sprg2;
vcpu->arch.shared->sprg3 = regs->sprg3;
- vcpu->arch.sprg5 = regs->sprg4;
- vcpu->arch.sprg6 = regs->sprg5;
- vcpu->arch.sprg7 = regs->sprg6;
+ vcpu->arch.sprg4 = regs->sprg4;
+ vcpu->arch.sprg5 = regs->sprg5;
+ vcpu->arch.sprg6 = regs->sprg6;
+ vcpu->arch.sprg7 = regs->sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
diff --git a/arch/sparc/include/asm/errno.h b/arch/sparc/include/asm/errno.h
index 4e2bc490d714..c351aba997b7 100644
--- a/arch/sparc/include/asm/errno.h
+++ b/arch/sparc/include/asm/errno.h
@@ -112,4 +112,6 @@
#define ERFKILL 134 /* Operation not possible due to RF-kill */
+#define EHWPOISON 135 /* Memory page has hardware error */
+
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 8e37deb1eb38..0f5213564326 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,9 +142,9 @@ struct x86_emulate_ops {
int (*pio_out_emulated)(int size, unsigned short port, const void *val,
unsigned int count, struct kvm_vcpu *vcpu);
- bool (*get_cached_descriptor)(struct desc_struct *desc,
+ bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
int seg, struct kvm_vcpu *vcpu);
- void (*set_cached_descriptor)(struct desc_struct *desc,
+ void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
int seg, struct kvm_vcpu *vcpu);
u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
int interruptibility;
bool perm_ok; /* do not check permissions if true */
+ bool only_vendor_specific_insn;
bool have_exception;
struct x86_exception exception;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d29187..f08314f303e0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
#define ASYNC_PF_PER_VCPU 64
-extern spinlock_t kvm_lock;
+extern raw_spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
@@ -336,7 +336,6 @@ struct kvm_vcpu_arch {
gfn_t last_pte_gfn;
struct {
- gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
unsigned long mmu_seq;
} update_pte;
@@ -448,7 +447,7 @@ struct kvm_arch {
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
- spinlock_t tsc_write_lock;
+ raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
u64 last_tsc_offset;
u64 last_tsc_write;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 43a18c77676d..d22611121ac7 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -43,6 +43,7 @@
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
+#define MSR_IA32_BBL_CR_CTL3 0x0000011e
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8dc44662394b..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
native_smp_prepare_boot_cpu();
}
-static void kvm_guest_cpu_online(void *dummy)
+static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
kvm_guest_cpu_init();
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index caf966781d25..0ad47b819a8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -76,6 +76,7 @@
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
/* Misc flags */
+#define VendorSpecific (1<<22) /* Vendor specific instruction */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
#define Undefined (1<<25) /* No Such Instruction */
@@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
if (selector & 1 << 2) {
struct desc_struct desc;
memset (dt, 0, sizeof *dt);
- if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+ if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
+ ctxt->vcpu))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
}
+/* Does not support long mode */
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 selector, int seg)
@@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
}
load:
ops->set_segment_selector(selector, seg, ctxt->vcpu);
- ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+ ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
@@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *ss)
{
memset(cs, 0, sizeof(struct desc_struct));
- ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
@@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.d = 0;
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->regs[VCPU_REGS_RCX] = c->eip;
@@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.l = 1;
}
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel |= SELECTOR_RPL_MASK;
ss_sel |= SELECTOR_RPL_MASK;
- ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->eip = c->regs[VCPU_REGS_RDX];
@@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
u16 port, u16 len)
{
struct desc_struct tr_seg;
+ u32 base3;
int r;
- u16 io_bitmap_ptr;
- u8 perm, bit_idx = port & 0x7;
+ u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
+ unsigned long base;
- ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+ ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
if (!tr_seg.p)
return false;
if (desc_limit_scaled(&tr_seg) < 103)
return false;
- r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
- ctxt->vcpu, NULL);
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
- &perm, 1, ctxt->vcpu, NULL);
+ r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
+ NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
}
ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
- ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
if (has_error_code) {
@@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
D(SrcMem16 | ModRM | Mov | Priv),
D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
}, {
- D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
+ D(SrcNone | ModRM | Priv | VendorSpecific), N,
+ N, D(SrcNone | ModRM | Priv | VendorSpecific),
D(SrcNone | ModRM | DstMem | Mov), N,
D(SrcMem16 | ModRM | Mov | Priv), N,
} };
@@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
static struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
N, GD(0, &group7), N, N,
- N, D(ImplicitOps), D(ImplicitOps | Priv), N,
+ N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
N, D(ImplicitOps | ModRM), N, N,
/* 0x10 - 0x1F */
@@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
/* 0x30 - 0x3F */
D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
D(ImplicitOps | Priv), N,
- D(ImplicitOps), D(ImplicitOps | Priv), N, N,
+ D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+ N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
X16(D(DstReg | SrcMem | ModRM | Mov)),
@@ -2741,6 +2750,9 @@ done_prefixes:
if (c->d == 0 || (c->d & Undefined))
return -1;
+ if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+ return -1;
+
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05e4ac4..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s)
}
if (!found)
- found = s->kvm->bsp_vcpu;
-
- if (!found)
return;
kvm_make_request(KVM_REQ_EVENT, found);
@@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
- s->isr_ack |= (1 << irq);
if (s != &s->pics_state->pics[0])
irq += 8;
/*
@@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
pic_lock(s->pics_state);
}
-void kvm_pic_clear_isr_ack(struct kvm *kvm)
-{
- struct kvm_pic *s = pic_irqchip(kvm);
-
- pic_lock(s);
- s->pics[0].isr_ack = 0xff;
- s->pics[1].isr_ack = 0xff;
- pic_unlock(s);
-}
-
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
s->irr = 0;
s->imr = 0;
s->isr = 0;
- s->isr_ack = 0xff;
s->priority_add = 0;
s->irq_base = 0;
s->read_reg_select = 0;
@@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
*/
static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
- int irq = pic_get_irq(&s->pics[0]);
- s->output = level;
- if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
- s->pics[0].isr_ack &= ~(1 << irq);
+ if (!s->output)
s->wakeup_needed = true;
- }
+ s->output = level;
}
static const struct kvm_io_device_ops picdev_ops = {
@@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s->pics[1].elcr_mask = 0xde;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
- s->pics[0].isr_ack = 0xff;
- s->pics[1].isr_ack = 0xff;
/*
* Initialize PIO device
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0d3653..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_INIT:
if (level) {
result = 1;
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- printk(KERN_DEBUG
- "INIT on a runnable vcpu %d\n",
- vcpu->vcpu_id);
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
@@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
- if (vcpu->arch.apic->regs_page)
- __free_page(vcpu->arch.apic->regs_page);
+ if (vcpu->arch.apic->regs)
+ free_page((unsigned long)vcpu->arch.apic->regs);
kfree(vcpu->arch.apic);
}
@@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (apic->regs_page == NULL) {
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
- apic->regs = page_address(apic->regs_page);
apic->vcpu = vcpu;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
u32 divide_count;
struct kvm_vcpu *vcpu;
bool irr_pending;
- struct page *regs_page;
void *regs;
gpa_t vapic_addr;
struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f02b8edc3d44..88d36890f420 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
#define PT64_LEVEL_SHIFT(level) \
(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-#define PT64_LEVEL_MASK(level) \
- (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
#define PT64_INDEX(address, level)\
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
@@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
#define PT32_LEVEL_SHIFT(level) \
(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-#define PT32_LEVEL_MASK(level) \
- (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_LVL_OFFSET_MASK(level) \
(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
@@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
int min)
{
- struct page *page;
+ void *page;
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(GFP_KERNEL);
+ page = (void *)__get_free_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- cache->objects[cache->nobjs++] = page_address(page);
+ cache->objects[cache->nobjs++] = page;
}
return 0;
}
@@ -1032,9 +1027,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
- __free_page(virt_to_page(sp->spt));
+ free_page((unsigned long)sp->spt);
if (!sp->role.direct)
- __free_page(virt_to_page(sp->gfns));
+ free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
kvm_mod_used_mmu_pages(kvm, -1);
}
@@ -3228,7 +3223,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_release_pfn_clean(pfn);
return;
}
- vcpu->arch.update_pte.gfn = gfn;
vcpu->arch.update_pte.pfn = pfn;
}
@@ -3275,9 +3269,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
/*
* Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
+ * as the current vcpu paging mode since we update the sptes only
+ * when they have the same mode.
*/
if ((is_pae(vcpu) && bytes == 4) || !new) {
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -3307,11 +3300,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
gentry = 0;
- kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
if (guest_initiated) {
+ kvm_mmu_access_page(vcpu, gfn);
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
++vcpu->arch.last_pt_write_count;
@@ -3538,14 +3531,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_shadow_present_pte(pt[i]) ||
+ !is_last_spte(pt[i], sp->role.level))
+ continue;
+
+ if (is_large_pte(pt[i])) {
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
+ --kvm->stat.lpages;
+ continue;
+ }
+
/* avoid RMW */
if (is_writable_pte(pt[i]))
update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ }
}
kvm_flush_remote_tlbs(kvm);
}
@@ -3583,7 +3585,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
if (nr_to_scan == 0)
goto out;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx, freed_pages;
@@ -3606,7 +3608,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
out:
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bccc24c4181..86eb8160bcb9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -31,7 +31,6 @@
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
@@ -339,8 +337,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
- return;
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
return;
@@ -829,7 +825,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
-#undef PT_LEVEL_MASK
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 63fec1531e89..6bb15d583e47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -135,6 +135,8 @@ struct vcpu_svm {
u32 *msrpm;
+ ulong nmi_iret_rip;
+
struct nested_state nested;
bool nmi_singlestep;
@@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
load_gs_index(svm->host.gs);
#else
+#ifdef CONFIG_X86_32_LAZY_GS
loadsegment(gs, svm->host.gs);
#endif
+#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.nmi_window_exits;
clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
return 1;
}
@@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->int3_injected = 0;
- if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
+ /*
+ * If we've made progress since setting HF_IRET_MASK, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+ && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
}
@@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
#endif
reload_tss(vcpu);
local_irq_disable();
- stgi();
-
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_handle_nmi(&svm->vcpu);
+
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_handle_nmi(&svm->vcpu);
+
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2cfb82..3febb763cb7f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
- * According to test, this time is usually small than 41 cycles.
+ * According to test, this time is usually smaller than 128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute
* in a PAUSE loop. Tests indicate that most spinlocks are held for
* less than 2^12 cycles
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 41
+#define KVM_VMX_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);
@@ -176,7 +176,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
@@ -1333,19 +1332,25 @@ static __init int vmx_disabled_by_bios(void)
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
if (msr & FEATURE_CONTROL_LOCKED) {
+ /* launched w/ TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& tboot_enabled())
return 1;
+ /* launched w/o TXT and VMX only enabled w/ TXT */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- " activate TXT before enabling KVM\n");
+ "activate TXT before enabling KVM\n");
return 1;
}
+ /* launched w/o TXT and VMX disabled */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && !tboot_enabled())
+ return 1;
}
return 0;
- /* locked but not enabled */
}
static void kvm_cpu_vmxon(u64 addr)
@@ -1683,6 +1688,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 0;
+ vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1756,6 +1762,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
+ vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1794,7 +1801,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
continue_rmode:
kvm_mmu_reset_context(vcpu);
- init_rmode(vcpu->kvm);
}
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2030,23 +2036,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
- return vmcs_readl(sf->base);
-}
-
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_save_segment *save;
u32 ar;
+ if (vmx->rmode.vm86_active
+ && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
+ || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
+ || seg == VCPU_SREG_GS)
+ && !emulate_invalid_guest_state) {
+ switch (seg) {
+ case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
+ case VCPU_SREG_ES: save = &vmx->rmode.es; break;
+ case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
+ case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
+ case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
+ default: BUG();
+ }
+ var->selector = save->selector;
+ var->base = save->base;
+ var->limit = save->limit;
+ ar = save->ar;
+ if (seg == VCPU_SREG_TR
+ || var->selector == vmcs_read16(sf->selector))
+ goto use_saved_rmode_seg;
+ }
var->base = vmcs_readl(sf->base);
var->limit = vmcs_read32(sf->limit);
var->selector = vmcs_read16(sf->selector);
ar = vmcs_read32(sf->ar_bytes);
+use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
var->type = ar & 15;
@@ -2060,6 +2083,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->unusable = (ar >> 16) & 1;
}
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment s;
+
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ vmx_get_segment(vcpu, &s, seg);
+ return s.base;
+ }
+ return vmcs_readl(sf->base);
+}
+
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!is_protmode(vcpu))
@@ -2101,6 +2136,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
u32 ar;
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
+ vmcs_write16(sf->selector, var->selector);
vmx->rmode.tr.selector = var->selector;
vmx->rmode.tr.base = var->base;
vmx->rmode.tr.limit = var->limit;
@@ -2699,22 +2735,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
return 0;
}
-static int init_rmode(struct kvm *kvm)
-{
- int idx, ret = 0;
-
- idx = srcu_read_lock(&kvm->srcu);
- if (!init_rmode_tss(kvm))
- goto exit;
- if (!init_rmode_identity_map(kvm))
- goto exit;
-
- ret = 1;
-exit:
- srcu_read_unlock(&kvm->srcu, idx);
- return ret;
-}
-
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2722,10 +2742,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- if (!init_rmode(vmx->vcpu.kvm)) {
- ret = -ENOMEM;
- goto out;
- }
vmx->rmode.vm86_active = 0;
@@ -2805,7 +2821,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
if (vm_need_tpr_shadow(vmx->vcpu.kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->vcpu.arch.apic->regs_page));
+ __pa(vmx->vcpu.arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
}
@@ -2971,6 +2987,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
if (ret)
return ret;
kvm->arch.tss_addr = addr;
+ if (!init_rmode_tss(kvm))
+ return -ENOMEM;
+
return 0;
}
@@ -3962,7 +3981,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
#define Q "l"
#endif
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3991,6 +4010,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
+ "push %%"R"cx \n\t" /* placeholder for guest rcx */
"push %%"R"cx \n\t"
"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
"je 1f \n\t"
@@ -4032,10 +4052,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
- "xchg %0, (%%"R"sp) \n\t"
+ "mov %0, %c[wordsize](%%"R"sp) \n\t"
+ "pop %0 \n\t"
"mov %%"R"ax, %c[rax](%0) \n\t"
"mov %%"R"bx, %c[rbx](%0) \n\t"
- "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+ "pop"Q" %c[rcx](%0) \n\t"
"mov %%"R"dx, %c[rdx](%0) \n\t"
"mov %%"R"si, %c[rsi](%0) \n\t"
"mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4053,7 +4074,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%cr2, %%"R"ax \n\t"
"mov %%"R"ax, %c[cr2](%0) \n\t"
- "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
+ "pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4076,7 +4097,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
+ [wordsize]"i"(sizeof(ulong))
: "cc", "memory"
, R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
@@ -4183,8 +4205,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ err = -ENOMEM;
if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs;
+ if (!init_rmode_identity_map(kvm))
+ goto free_vmcs;
}
return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bcc0efce85bf..f1e4025f1ae2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -81,9 +81,10 @@
* - enable LME and LMA per default on 64 bit KVM
*/
#ifdef CONFIG_X86_64
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
+ kvm_make_request(KVM_REQ_NMI, vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.nmi_pending = 1;
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
- if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ }
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
@@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
unsigned long flags;
s64 sdiff;
- spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = data - native_read_tsc();
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm->arch.last_tsc_write = data;
kvm->arch.last_tsc_offset = offset;
kvm_x86_ops->write_tsc_offset(vcpu, offset);
- spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
/* Reset of TSC must disable overshoot protection below */
vcpu->arch.hv_clock.tsc_timestamp = 0;
@@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
+static void kvmclock_reset(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
+ kvmclock_reset(vcpu);
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
} else
return set_msr_hyperv(vcpu, msr, data);
break;
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
} else
return get_msr_hyperv(vcpu, msr, pdata);
break;
+ case MSR_IA32_BBL_CR_CTL3:
+ /* This legacy MSR exists but isn't fully documented in current
+ * silicon. It is however accessed by winxp in very narrow
+ * scenarios where it sets bit #19, itself documented as
+ * a "reserved" bit. Best effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ */
+ data = 0xbe702111;
+ break;
default:
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (check_tsc_unstable()) {
kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
vcpu->arch.tsc_catchup = 1;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
@@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
!kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
- printk(KERN_DEBUG "kvm: set_mce: "
- "injects mce exception while "
- "previous one is in progress!\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
@@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.pending = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
- if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
- kvm_pic_clear_isr_ack(vcpu->kvm);
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
kvm_x86_ops->set_interrupt_shadow(vcpu,
events->interrupt.shadow);
@@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
return get_segment_base(vcpu, seg);
}
-static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
+ int seg, struct kvm_vcpu *vcpu)
{
struct kvm_segment var;
@@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
var.limit >>= 12;
set_desc_limit(desc, var.limit);
set_desc_base(desc, (unsigned long)var.base);
+#ifdef CONFIG_X86_64
+ if (base3)
+ *base3 = var.base >> 32;
+#endif
desc->type = var.type;
desc->s = var.s;
desc->dpl = var.dpl;
@@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
return true;
}
-static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
- struct kvm_vcpu *vcpu)
+static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
+ int seg, struct kvm_vcpu *vcpu)
{
struct kvm_segment var;
@@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
kvm_get_segment(vcpu, &var, seg);
var.base = get_desc_base(desc);
+#ifdef CONFIG_X86_64
+ var.base |= ((u64)base3) << 32;
+#endif
var.limit = get_desc_limit(desc);
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
@@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false;
+ vcpu->arch.emulate_ctxt.only_vendor_specific_insn
+ = emulation_type & EMULTYPE_TRAP_UD;
+
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
- if (r == X86EMUL_PROPAGATE_FAULT)
- goto done;
trace_kvm_emulate_insn_start(vcpu);
-
- /* Only allow emulation of specific instructions on #UD
- * (namely VMMCALL, sysenter, sysexit, syscall)*/
- if (emulation_type & EMULTYPE_TRAP_UD) {
- if (!c->twobyte)
- return EMULATE_FAIL;
- switch (c->b) {
- case 0x01: /* VMMCALL */
- if (c->modrm_mod != 3 || c->modrm_rm != 1)
- return EMULATE_FAIL;
- break;
- case 0x34: /* sysenter */
- case 0x35: /* sysexit */
- if (c->modrm_mod != 0 || c->modrm_rm != 0)
- return EMULATE_FAIL;
- break;
- case 0x05: /* syscall */
- if (c->modrm_mod != 0 || c->modrm_rm != 0)
- return EMULATE_FAIL;
- break;
- default:
- return EMULATE_FAIL;
- }
-
- if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
- return EMULATE_FAIL;
- }
-
++vcpu->stat.insn_emulation;
if (r) {
+ if (emulation_type & EMULTYPE_TRAP_UD)
+ return EMULATE_FAIL;
if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
@@ -4452,7 +4456,6 @@ restart:
return handle_emulation_failure(vcpu);
}
-done:
if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
@@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu)
@@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
send_ipi = 1;
}
}
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) {
/*
@@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1;
goto out;
}
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ vcpu->arch.nmi_pending = true;
}
r = kvm_mmu_reload(vcpu);
@@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_fpu(vcpu);
kvm_load_guest_xcr0(vcpu);
- atomic_set(&vcpu->guest_mode, 1);
- smp_wmb();
+ vcpu->mode = IN_GUEST_MODE;
+
+ /* We should set ->mode before check ->requests,
+ * see the comment in make_all_cpus_request.
+ */
+ smp_mb();
local_irq_disable();
- if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+ if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) {
- atomic_set(&vcpu->guest_mode, 0);
+ vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
preempt_enable();
@@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
- atomic_set(&vcpu->guest_mode, 0);
+ vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
@@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
int mmu_reset_needed = 0;
- int pending_vec, max_bits;
+ int pending_vec, max_bits, idx;
struct desc_ptr dt;
dt.size = sregs->idt.limit;
@@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (sregs->cr4 & X86_CR4_OSXSAVE)
update_cpuid(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
@@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (pending_vec < max_bits) {
kvm_queue_interrupt(vcpu, pending_vec, false);
pr_debug("Set back pending irq %d\n", pending_vec);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.time_page) {
- kvm_release_page_dirty(vcpu->arch.time_page);
- vcpu->arch.time_page = NULL;
- }
+ kvmclock_reset(vcpu);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fx_free(vcpu);
@@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
+ kvmclock_reset(vcpu);
+
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
@@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- spin_lock_init(&kvm->arch.tsc_write_lock);
+ raw_spin_lock_init(&kvm->arch.tsc_write_lock);
return 0;
}
@@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
int user_alloc)
{
- int npages = mem->memory_size >> PAGE_SHIFT;
+ int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
int ret;
@@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
"failed to munmap memory\n");
}
+ if (!kvm->arch.n_requested_mmu_pages)
+ nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+
spin_lock(&kvm->mmu_lock);
- if (!kvm->arch.n_requested_mmu_pages) {
- unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+ if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- }
-
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
}
@@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (atomic_xchg(&vcpu->guest_mode, 0))
+ if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
smp_send_reschedule(cpu);
put_cpu();
}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
index bab9f74c0665..cfed5399f074 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -53,8 +53,8 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages,
}
/* call with current->mm->mmap_sem held */
-static int __get_user_pages(unsigned long start_page, size_t num_pages,
- struct page **p, struct vm_area_struct **vma)
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p, struct vm_area_struct **vma)
{
unsigned long lock_limit;
size_t got;
@@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
down_write(&current->mm->mmap_sem);
- ret = __get_user_pages(start_page, num_pages, p, NULL);
+ ret = __ipath_get_user_pages(start_page, num_pages, p, NULL);
up_write(&current->mm->mmap_sem);
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index d7a26c1d4f37..7689e49c13c9 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -51,8 +51,8 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages,
/*
* Call with current->mm->mmap_sem held.
*/
-static int __get_user_pages(unsigned long start_page, size_t num_pages,
- struct page **p, struct vm_area_struct **vma)
+static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p, struct vm_area_struct **vma)
{
unsigned long lock_limit;
size_t got;
@@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
down_write(&current->mm->mmap_sem);
- ret = __get_user_pages(start_page, num_pages, p, NULL);
+ ret = __qib_get_user_pages(start_page, num_pages, p, NULL);
up_write(&current->mm->mmap_sem);
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 2bcc5c7c22a6..61e03dd7939e 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -30,6 +30,9 @@ typedef u64 cputime64_t;
#define cputime64_to_jiffies64(__ct) (__ct)
#define jiffies64_to_cputime64(__jif) (__jif)
#define cputime_to_cputime64(__ct) ((u64) __ct)
+#define cputime64_gt(__a, __b) ((__a) > (__b))
+
+#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct)
/*
diff --git a/include/asm-generic/errno.h b/include/asm-generic/errno.h
index 28cc03bf19e6..a1331ce50445 100644
--- a/include/asm-generic/errno.h
+++ b/include/asm-generic/errno.h
@@ -108,4 +108,6 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */
+#define EHWPOISON 133 /* Memory page has hardware error */
+
#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d4253e49..a1382b9b5813 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -426,6 +426,13 @@ extern void raise_softirq(unsigned int nr);
*/
DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+
+static inline struct task_struct *this_cpu_ksoftirqd(void)
+{
+ return this_cpu_read(ksoftirqd);
+}
+
/* Try to send a softirq to a remote cpu. If this cannot be done, the
* work will be queued to the local cpu.
*/
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6811f4bfc6e7..922aa313c9f9 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x);
extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
+extern u64 nsecs_to_jiffies64(u64 n);
extern unsigned long nsecs_to_jiffies(u64 n);
#define TIMESTAMP_SIZE 30
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db21858..ab428552af8e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -43,6 +43,7 @@
#define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12
+#define KVM_REQ_NMI 13
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@@ -98,23 +99,31 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
+enum {
+ OUTSIDE_GUEST_MODE,
+ IN_GUEST_MODE,
+ EXITING_GUEST_MODE
+};
+
struct kvm_vcpu {
struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier preempt_notifier;
#endif
+ int cpu;
int vcpu_id;
- struct mutex mutex;
- int cpu;
- atomic_t guest_mode;
- struct kvm_run *run;
+ int srcu_idx;
+ int mode;
unsigned long requests;
unsigned long guest_debug;
- int srcu_idx;
+
+ struct mutex mutex;
+ struct kvm_run *run;
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
wait_queue_head_t wq;
+ struct pid *pid;
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
@@ -140,6 +149,11 @@ struct kvm_vcpu {
struct kvm_vcpu_arch arch;
};
+static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
+}
+
/*
* Some of the bitops functions do not support too long bitmaps.
* This number must be determined not to exceed such limits.
@@ -212,7 +226,6 @@ struct kvm_memslots {
struct kvm {
spinlock_t mmu_lock;
- raw_spinlock_t requests_lock;
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots *memslots;
@@ -223,6 +236,7 @@ struct kvm {
#endif
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
atomic_t online_vcpus;
+ int last_boosted_vcpu;
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus *buses[KVM_NR_BUSES];
@@ -719,11 +733,6 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
set_bit(req, &vcpu->requests);
}
-static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
-{
- return test_and_set_bit(req, &vcpu->requests);
-}
-
static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
if (test_bit(req, &vcpu->requests)) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f6385fc17ad4..9c3a7c8ec039 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -965,6 +965,10 @@ static inline int handle_mm_fault(struct mm_struct *mm,
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int len, unsigned int foll_flags,
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking);
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, int write, int force,
struct page **pages, struct vm_area_struct **vmas);
@@ -1530,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_MLOCK 0x40 /* mark page as mlocked */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
+#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
@@ -1622,14 +1627,6 @@ extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t mce_bad_pages;
extern int soft_offline_page(struct page *page, int flags);
-#ifdef CONFIG_MEMORY_FAILURE
-int is_hwpoison_address(unsigned long addr);
-#else
-static inline int is_hwpoison_address(unsigned long addr)
-{
- return 0;
-}
-#endif
extern void dump_page(struct page *page);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5ed06b..755c4764b10d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1058,6 +1058,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -1084,12 +1085,10 @@ struct sched_class {
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_fork) (struct task_struct *p);
- void (*switched_from) (struct rq *this_rq, struct task_struct *task,
- int running);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task,
- int running);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio, int running);
+ int oldprio);
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
@@ -1715,7 +1714,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
/*
* Per process flags
*/
-#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1945,8 +1943,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
-extern unsigned int sysctl_sched_compat_yield;
-
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
@@ -1977,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p)
# define rt_mutex_adjust_pi(p) do { } while (0)
#endif
+extern bool yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..05b92c457010 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);
/*
* macro override instead of weak attribute alias, to workaround
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..02f221274265 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
rcu_read_unlock();
return pid;
}
+EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
@@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
rcu_read_unlock();
return result;
}
+EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr)
{
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437d..9e81a0c54a14 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last;
+ struct sched_entity *curr, *next, *last, *skip;
unsigned int nr_spread_over;
@@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
#endif
static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1913,7 @@ void account_system_vtime(struct task_struct *curr)
*/
if (hardirq_count())
__this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
@@ -1920,8 +1953,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
sched_rt_avg_update(rq, irq_delta);
}
+static int irqtime_account_hi_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#define sched_clock_irqtime (0)
+
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
rq->clock_task += delta;
@@ -2025,14 +2090,14 @@ inline int task_curr(const struct task_struct *p)
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
- int oldprio, int running)
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
- prev_class->switched_from(rq, p, running);
- p->sched_class->switched_to(rq, p, running);
- } else
- p->sched_class->prio_changed(rq, p, oldprio, running);
+ prev_class->switched_from(rq, p);
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio)
+ p->sched_class->prio_changed(rq, p, oldprio);
}
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2566,6 +2631,7 @@ static void __sched_fork(struct task_struct *p)
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -3568,6 +3634,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
}
/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ cputime64_t tmp = cputime_to_cputime64(cputime);
+
+ /* Add system time to process. */
+ p->stime = cputime_add(p->stime, cputime);
+ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,33 +3670,90 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ cputime64_t *target_cputime64;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
}
- /* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ target_cputime64 = &cpustat->irq;
else if (in_serving_softirq())
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ target_cputime64 = &cpustat->softirq;
else
- cpustat->system = cputime64_add(cpustat->system, tmp);
+ target_cputime64 = &cpustat->system;
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+}
- /* Account for system time used */
- acct_update_integrals(p);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ if (irqtime_account_hi_update()) {
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ } else if (irqtime_account_si_update()) {
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->softirq);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->system);
+ }
}
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif
+
/*
* Account for involuntary wait time.
* @steal: the cpu time spent in involuntary wait
@@ -3645,6 +3794,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3824,12 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
account_idle_time(jiffies_to_cputime(ticks));
}
@@ -4571,11 +4731,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, &flags);
}
@@ -4903,11 +5062,10 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
activate_task(rq, p, 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
+ check_class_changed(rq, p, prev_class, oldprio);
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -5324,6 +5482,58 @@ void __sched yield(void)
}
EXPORT_SYMBOL(yield);
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ bool yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ while (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out;
+
+ if (curr->sched_class != p->sched_class)
+ goto out;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded)
+ schedstat_inc(rq, yld_count);
+
+out:
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
@@ -7797,6 +8007,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
+ /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+ cfs_rq->load_stamp = 1;
+#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
@@ -8110,6 +8324,8 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
+ const struct sched_class *prev_class = p->sched_class;
+ int old_prio = p->prio;
int on_rq;
on_rq = p->se.on_rq;
@@ -8120,6 +8336,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
activate_task(rq, p, 0);
resched_task(rq->curr);
}
+
+ check_class_changed(rq, p, prev_class, old_prio);
}
void normalize_rt_tasks(void)
@@ -8511,7 +8729,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se), 0);
+ update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
raw_spin_lock_irqsave(&rq->lock, flags);
if (cfs_rq->rb_leftmost)
- MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..027024694043 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
return rb_entry(left, struct sched_entity, run_node);
}
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
/*
* Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
now - cfs_rq->load_last > 4 * period) {
cfs_rq->load_period = 0;
cfs_rq->load_avg = 0;
+ delta = period - 1;
}
cfs_rq->load_stamp = now;
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
- long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
long load_weight, load, shares;
- load = cfs_rq->load.weight + weight_delta;
+ load = cfs_rq->load.weight;
load_weight = atomic_read(&tg->load_weight);
- load_weight -= cfs_rq->load_contribution;
load_weight += load;
+ load_weight -= cfs_rq->load_contribution;
shares = (tg->shares * load);
if (load_weight)
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
}
# else /* CONFIG_SMP */
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
- long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
struct sched_entity *se;
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
if (likely(se->load.weight == tg->shares))
return;
#endif
- shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+ shares = calc_cfs_shares(cfs_rq, tg);
reweight_entity(cfs_rq_of(se), se, shares);
}
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_curr(cfs_rq);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
list_add_leaf_cfs_rq(cfs_rq);
}
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+ else
+ break;
+ }
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
{
- if (!se || cfs_rq->last == se)
- cfs_rq->last = NULL;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ else
+ break;
+ }
+}
- if (!se || cfs_rq->next == se)
- cfs_rq->next = NULL;
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip == se)
+ cfs_rq->skip = NULL;
+ else
+ break;
+ }
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- for_each_sched_entity(se)
- __clear_buddies(cfs_rq_of(se), se);
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
}
static void
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (cfs_rq->nr_running > 1) {
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
if (delta < 0)
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second = __pick_next_entity(se);
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
/*
* Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
clear_buddies(cfs_rq, se);
return se;
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
hrtick_update(rq);
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
hrtick_update(rq);
}
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- struct sched_entity *rightmost, *se = &curr->se;
-
- /*
- * Are we the only task in the tree?
- */
- if (unlikely(cfs_rq->nr_running == 1))
- return;
-
- clear_buddies(cfs_rq, se);
-
- if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
-
- return;
- }
- /*
- * Find the rightmost entry in the rbtree:
- */
- rightmost = __pick_last_entity(cfs_rq);
- /*
- * Already in the rightmost position?
- */
- if (unlikely(!rightmost || entity_before(rightmost, se)))
- return;
-
- /*
- * Minimally necessary key value to be last in the tree:
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- se->vruntime = rightmost->vruntime + 1;
-}
-
#ifdef CONFIG_SMP
static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
}
}
+static void set_skip_buddy(struct sched_entity *se)
+{
+ if (likely(task_of(se)->policy != SCHED_IDLE)) {
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
+ }
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -1932,6 +1945,55 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
}
}
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ if (!se->on_rq)
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
+ if (preempt)
+ resched_task(rq->curr);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods:
@@ -2123,7 +2185,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
* We need to update shares after updating tg->load_weight in
* order to adjust the weight of groups with long running tasks.
*/
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -4079,33 +4141,62 @@ static void task_fork_fair(struct task_struct *p)
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (running) {
+ if (rq->curr == p) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p, 0);
}
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Ensure the task's vruntime is normalized, so that when its
+ * switched back to the fair class the enqueue_entity(.flags=0) will
+ * do the right thing.
+ *
+ * If it was on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it was !on_rq, then only when
+ * the task is sleeping will it still have non-normalized vruntime.
+ */
+ if (!se->on_rq && p->state != TASK_RUNNING) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+}
+
/*
* We switched to the sched_fair class.
*/
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (running)
+ if (rq->curr == p)
resched_task(rq->curr);
else
check_preempt_curr(rq, p, 0);
@@ -4171,6 +4262,7 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
@@ -4191,6 +4283,7 @@ static const struct sched_class fair_sched_class = {
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..c82f26c1b7c3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
- /* Can this actually happen?? */
- if (running)
- resched_task(rq->curr);
- else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
- /* This can happen for hot plug CPUS */
-
- /*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
- */
- if (running) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 01f75a5f17af..db308cb08b75 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1599,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
* When switch from the rt queue, we bring ourselves to a position
* that we might want to pull RT tasks from other runqueues.
*/
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
* If there are other RT tasks then we will reschedule
@@ -1609,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!rq->rt.rt_nr_running)
+ if (p->se.on_rq && !rq->rt.rt_nr_running)
pull_rt_task(rq);
}
@@ -1628,8 +1627,7 @@ static inline void init_sched_rt_class(void)
* with RT tasks. In this case we try to push them off to
* other runqueues.
*/
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
@@ -1640,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (!running) {
+ if (p->se.on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) &&
/* Don't resched if we changed runqueues */
@@ -1656,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* Priority of the task has changed. This may cause
* us to initiate a push or pull.
*/
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (running) {
+ if (!p->se.on_rq)
+ return;
+
+ if (rq->curr == p) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..84ec9bcf82d9 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG(); /* how!?, what priority? */
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..0cee50487629 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
- current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d144..5be6ebc1e2a7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -361,13 +361,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
- {
- .procname = "sched_compat_yield",
- .data = &sysctl_sched_compat_yield,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..55337a816b20 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
}
/**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
* @n: nsecs in u64
*
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
*/
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n)
#endif
}
+
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+ return (unsigned long)nsecs_to_jiffies64(n);
+}
+
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
diff --git a/mm/internal.h b/mm/internal.h
index 69488205723d..3438dd43a062 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas,
- int *nonblocking);
-
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0207c2f6f8bd..99ccb4472623 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1487,35 +1487,3 @@ done:
/* keep elevated page count for bad page */
return ret;
}
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
- pgd_t *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t pte, *ptep;
- swp_entry_t entry;
-
- pgdp = pgd_offset(current->mm, addr);
- if (!pgd_present(*pgdp))
- return 0;
- pudp = pud_offset(pgdp, addr);
- pud = *pudp;
- if (!pud_present(pud) || pud_large(pud))
- return 0;
- pmdp = pmd_offset(pudp, addr);
- pmd = *pmdp;
- if (!pmd_present(pmd) || pmd_large(pmd))
- return 0;
- ptep = pte_offset_map(pmdp, addr);
- pte = *ptep;
- pte_unmap(ptep);
- if (!is_swap_pte(pte))
- return 0;
- entry = pte_to_swp_entry(pte);
- return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 5823698c2b71..346ee7e041fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1410,6 +1410,55 @@ no_page_table:
return page;
}
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
struct page **pages, struct vm_area_struct **vmas,
@@ -1527,9 +1576,16 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
- if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
- VM_FAULT_SIGBUS))
+ if (ret & (VM_FAULT_HWPOISON |
+ VM_FAULT_HWPOISON_LARGE)) {
+ if (i)
+ return i;
+ else if (gup_flags & FOLL_HWPOISON)
+ return -EHWPOISON;
+ else
+ return -EFAULT;
+ }
+ if (ret & VM_FAULT_SIGBUS)
return i ? i : -EFAULT;
BUG();
}
@@ -1578,6 +1634,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (nr_pages);
return i;
}
+EXPORT_SYMBOL(__get_user_pages);
/**
* get_user_pages() - pin user pages in memory
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2ca4535f4fb7..3656849f78a0 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
/*
* This rcu_assign_pointer is needed for when
- * another thread calls kvm_irqfd_update before
- * we flush workqueue below.
+ * another thread calls kvm_irq_routing_update before
+ * we flush workqueue below (we synchronize with
+ * kvm_irq_routing_update using irqfds.lock).
* It is paired with synchronize_rcu done by caller
* of that function.
*/
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f29abeb6a912..1fa0d292119a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -69,7 +69,7 @@ MODULE_LICENSE("GPL");
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
@@ -137,6 +137,14 @@ void vcpu_load(struct kvm_vcpu *vcpu)
int cpu;
mutex_lock(&vcpu->mutex);
+ if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+ /* The thread running this VCPU changed. */
+ struct pid *oldpid = vcpu->pid;
+ struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+ rcu_assign_pointer(vcpu->pid, newpid);
+ synchronize_rcu();
+ put_pid(oldpid);
+ }
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
@@ -165,13 +173,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- raw_spin_lock(&kvm->requests_lock);
- me = smp_processor_id();
+ me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (kvm_make_check_request(req, vcpu))
- continue;
+ kvm_make_request(req, vcpu);
cpu = vcpu->cpu;
- if (cpus != NULL && cpu != -1 && cpu != me)
+
+ /* Set ->requests bit before we read ->mode */
+ smp_mb();
+
+ if (cpus != NULL && cpu != -1 && cpu != me &&
+ kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
cpumask_set_cpu(cpu, cpus);
}
if (unlikely(cpus == NULL))
@@ -180,7 +191,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
- raw_spin_unlock(&kvm->requests_lock);
+ put_cpu();
free_cpumask_var(cpus);
return called;
}
@@ -209,6 +220,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
+ vcpu->pid = NULL;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
@@ -233,6 +245,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
@@ -463,15 +476,14 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
- raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return kvm;
@@ -544,9 +556,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]);
@@ -588,6 +600,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifndef CONFIG_S390
/*
* Allocation size is twice as large as the actual dirty bitmap size.
* This makes it possible to do double buffering: see x86's
@@ -608,6 +621,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
memslot->dirty_bitmap_head = memslot->dirty_bitmap;
return 0;
}
+#endif /* !CONFIG_S390 */
/*
* Allocate some memory and give it an address in the guest physical address
@@ -621,7 +635,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- int r, flush_shadow = 0;
+ int r;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
@@ -741,8 +755,6 @@ skip_lpage:
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
/* destroy any largepage mappings for dirty tracking */
- if (old.npages)
- flush_shadow = 1;
}
#else /* not defined CONFIG_S390 */
new.user_alloc = user_alloc;
@@ -813,9 +825,6 @@ skip_lpage:
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
- if (flush_shadow)
- kvm_arch_flush_shadow(kvm);
-
return 0;
out_free:
@@ -1029,6 +1038,15 @@ static pfn_t get_fault_pfn(void)
return fault_pfn;
}
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+ int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+
+ rc = __get_user_pages(current, current->mm, addr, 1,
+ flags, NULL, NULL, NULL);
+ return rc == -EHWPOISON;
+}
+
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
{
@@ -1076,7 +1094,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
return get_fault_pfn();
down_read(&current->mm->mmap_sem);
- if (is_hwpoison_address(addr)) {
+ if (check_user_page_hwpoison(addr)) {
up_read(&current->mm->mmap_sem);
get_page(hwpoison_page);
return page_to_pfn(hwpoison_page);
@@ -1466,18 +1484,55 @@ void kvm_resched(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_resched);
-void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
- ktime_t expires;
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
-
- /* Sleep for 100 us, and hope lock-holder got scheduled */
- expires = ktime_add_ns(ktime_get(), 100000UL);
- schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ struct kvm *kvm = me->kvm;
+ struct kvm_vcpu *vcpu;
+ int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int yielded = 0;
+ int pass;
+ int i;
- finish_wait(&vcpu->wq, &wait);
+ /*
+ * We boost the priority of a VCPU that is runnable but not
+ * currently running, because it got preempted by something
+ * else and called schedule in __vcpu_run. Hopefully that
+ * VCPU is holding the lock that we need and will release it.
+ * We approximate round-robin by starting at the last boosted VCPU.
+ */
+ for (pass = 0; pass < 2 && !yielded; pass++) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct task_struct *task = NULL;
+ struct pid *pid;
+ if (!pass && i < last_boosted_vcpu) {
+ i = last_boosted_vcpu;
+ continue;
+ } else if (pass && i > last_boosted_vcpu)
+ break;
+ if (vcpu == me)
+ continue;
+ if (waitqueue_active(&vcpu->wq))
+ continue;
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ task = get_pid_task(vcpu->pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ continue;
+ if (task->flags & PF_VCPU) {
+ put_task_struct(task);
+ continue;
+ }
+ if (yield_to(task, 1)) {
+ put_task_struct(task);
+ kvm->last_boosted_vcpu = i;
+ yielded = 1;
+ break;
+ }
+ put_task_struct(task);
+ }
+ }
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@@ -2122,9 +2177,9 @@ static void hardware_enable_nolock(void *junk)
static void hardware_enable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_enable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_nolock(void *junk)
@@ -2139,9 +2194,9 @@ static void hardware_disable_nolock(void *junk)
static void hardware_disable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_all_nolock(void)
@@ -2155,16 +2210,16 @@ static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_all_nolock();
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static int hardware_enable_all(void)
{
int r = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
kvm_usage_count++;
if (kvm_usage_count == 1) {
@@ -2177,7 +2232,7 @@ static int hardware_enable_all(void)
}
}
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return r;
}
@@ -2339,10 +2394,10 @@ static int vm_stat_get(void *_offset, u64 *val)
struct kvm *kvm;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
*val += *(u32 *)((void *)kvm + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
@@ -2356,12 +2411,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
int i;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
*val += *(u32 *)((void *)vcpu + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
@@ -2402,7 +2457,7 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
static int kvm_resume(struct sys_device *dev)
{
if (kvm_usage_count) {
- WARN_ON(spin_is_locked(&kvm_lock));
+ WARN_ON(raw_spin_is_locked(&kvm_lock));
hardware_enable_nolock(NULL);
}
return 0;