Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore        |   1
-rw-r--r--  kernel/events/core.c     |   4
-rw-r--r--  kernel/hrtimer.c         |   3
-rw-r--r--  kernel/kexec.c           | 118
-rw-r--r--  kernel/kprobes.c         |  19
-rw-r--r--  kernel/kthread.c         |  52
-rw-r--r--  kernel/lockdep.c         |  29
-rw-r--r--  kernel/mutex.c           | 151
-rw-r--r--  kernel/rtmutex-tester.c  |   5
-rw-r--r--  kernel/sched/core.c      |  45
-rw-r--r--  kernel/sched/features.h  |   7
-rw-r--r--  kernel/signal.c          |   2
-rw-r--r--  kernel/smpboot.c         |  14
-rw-r--r--  kernel/trace/blktrace.c  |  26
-rw-r--r--  kernel/user_namespace.c  |  22
15 files changed, 345 insertions, 153 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7e0962ed7f8a..9fcb0944f071 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4596,6 +4596,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
+ rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4603,6 +4604,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
+ rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -5331,7 +5333,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
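
The perf hunks above are small but easy to miss: perf_event_comm() now walks the per-task context array under rcu_read_lock(), and perf_swevent_init() keeps the full 64-bit attr.config instead of truncating it to int. A minimal stand-alone sketch of the truncation problem follows; the value is made up, not a real perf config, and on common ABIs the cast keeps only the low 32 bits:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t config = 0x100000007ULL;   /* made-up 64-bit event config */
        int      as_int = (int)config;      /* old code: silently truncates */
        uint64_t as_u64 = config;           /* fixed code: keeps all 64 bits */

        /* The truncated id aliases a different, valid-looking event id. */
        printf("as int: %d, as u64: %llu\n",
               as_int, (unsigned long long)as_u64);
        return 0;
    }
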
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..14be27feda49 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
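
The hrtimer hunk replaces a runtime raw_spin_lock_init() call with a static __RAW_SPIN_LOCK_UNLOCKED initializer, so the per-CPU base lock is already valid before init_hrtimers_cpu() has run for that CPU. A rough user-space analogue of the same idea, with hypothetical names and PTHREAD_MUTEX_INITIALIZER playing the role of the static lock initializer:

    #include <pthread.h>
    #include <stdio.h>

    struct cpu_base {
        pthread_mutex_t lock;
        int nr_events;
    };

    /* Statically initialized: usable from program start, no init call needed. */
    static struct cpu_base base = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .nr_events = 0,
    };

    int main(void)
    {
        /* Safe even if no runtime init routine has touched 'base' yet. */
        pthread_mutex_lock(&base.lock);
        base.nr_events++;
        pthread_mutex_unlock(&base.lock);
        printf("events: %d\n", base.nr_events);
        return 0;
    }
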
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..ffd4e111fd67 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
- .name = "Crash kernel low",
+ .name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
return 0;
}
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * This function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
*/
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
- const char *name)
+ const char *name,
+ const char *suffix)
{
- char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
+ char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- ck_cmdline = p;
- p = strstr(p+1, name);
- }
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
ck_cmdline += strlen(name);
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1413,13 +1492,26 @@ static int __init __parse_crashkernel(char *cmdline,
return 0;
}
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=");
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}
int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel_low=");
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
static void update_vmcoreinfo_note(void)
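
The reworked kexec parser first selects the last crashkernel= option on the command line that matches the requested suffix (",high", ",low", or none for the classic form), then hands the value to the suffix or classic parser. The stand-alone sketch below mimics only the get_last_crashkernel() selection step; the names and the test command line are hypothetical and error handling is omitted:

    #include <stdio.h>
    #include <string.h>

    static const char *suffix_tbl[] = { ",high", ",low", NULL };

    static const char *last_crashkernel(const char *cmdline, const char *name,
                                        const char *suffix)
    {
        const char *p = strstr(cmdline, name), *hit = NULL;

        while (p) {
            const char *end = strchr(p, ' ');
            size_t len = end ? (size_t)(end - p) : strlen(p);
            int i, suffixed = 0;

            for (i = 0; suffix_tbl[i]; i++) {
                size_t sl = strlen(suffix_tbl[i]);

                if (len >= sl && !strncmp(p + len - sl, suffix_tbl[i], sl)) {
                    suffixed = 1;
                    if (suffix && !strcmp(suffix, suffix_tbl[i]))
                        hit = p;        /* the suffix we were asked for */
                    break;
                }
            }
            if (!suffix && !suffixed)
                hit = p;                /* plain option: keep the last one */
            p = strstr(p + 1, name);
        }
        return hit;
    }

    int main(void)
    {
        const char *cl = "root=/dev/sda1 crashkernel=256M crashkernel=512M,high";
        const char *p;

        p = last_crashkernel(cl, "crashkernel=", NULL);
        printf("plain option at offset %ld\n", p ? (long)(p - cl) : -1L);
        p = last_crashkernel(cl, "crashkernel=", ",high");
        printf(",high option at offset %ld\n", p ? (long)(p - cl) : -1L);
        return 0;
    }
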
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
}
#ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
- return;
+ goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
}
-/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization)
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
return;
+ }
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
unoptimize_kprobe(p, false);
}
}
+ mutex_unlock(&kprobe_mutex);
+
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
- mutex_lock(&kprobe_mutex);
+ mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
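
The kprobes hunks move the kprobe_mutex acquisition into optimize_all_kprobes()/unoptimize_all_kprobes() themselves, drop it before waiting for the optimizer, and serialize the sysctl handler with a separate kprobe_sysctl_mutex. Below is a compressed pthread sketch of that pattern; the names are hypothetical, and the point of the stand-in wait_for_optimizer() is that the real wait_for_kprobe_optimizer() re-takes kprobe_mutex internally, so it must be called with that lock already released:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t internal_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ kprobe_mutex */
    static pthread_mutex_t sysctl_lock   = PTHREAD_MUTEX_INITIALIZER; /* ~ kprobe_sysctl_mutex */
    static bool optimization_allowed = true;

    static void wait_for_optimizer(void)
    {
        /* Re-takes the internal lock, so callers must not hold it here. */
        pthread_mutex_lock(&internal_lock);
        pthread_mutex_unlock(&internal_lock);
    }

    static void unoptimize_all(void)
    {
        pthread_mutex_lock(&internal_lock);      /* taken inside, not by caller */
        if (!optimization_allowed) {
            pthread_mutex_unlock(&internal_lock);
            return;
        }
        optimization_allowed = false;
        /* ... walk the tables and undo per-entry state under the lock ... */
        pthread_mutex_unlock(&internal_lock);

        wait_for_optimizer();                    /* safe: lock already dropped */
    }

    static void sysctl_write(bool enable)
    {
        pthread_mutex_lock(&sysctl_lock);        /* serializes concurrent writers */
        if (!enable)
            unoptimize_all();
        /* the enable path takes internal_lock the same way */
        pthread_mutex_unlock(&sysctl_lock);
    }

    int main(void)
    {
        sysctl_write(false);
        puts(optimization_allowed ? "still optimized" : "unoptimized");
        return 0;
    }
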
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..9eb7fed0bbaa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
- }
- __kthread_bind(p, cpu);
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
return NULL;
}
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
+ }
+}
+
/**
* kthread_unpark - unpark a thread created by kthread_create().
* @k: thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = task_get_live_kthread(k);
- if (kthread) {
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- /*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
- */
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu);
- wake_up_process(k);
- }
- }
+ if (kthread)
+ __kthread_unpark(k, kthread);
put_task_struct(k);
}
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
trace_sched_kthread_stop(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ __kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
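
The kthread changes park threads in a dedicated TASK_PARKED state and wake them with wake_up_state(k, TASK_PARKED), so stray wakeups cannot pull a parked thread out of the park loop; the smpboot.c hunk further down relies on this by waiting for TASK_PARKED before running the create callback. A loose pthread analogue of the park/unpark handshake, with hypothetical names and a condition variable standing in for the scheduler state:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static bool should_park, is_parked;

    static void parkme(void)                 /* ~ __kthread_parkme() */
    {
        pthread_mutex_lock(&lock);
        while (should_park) {                /* KTHREAD_SHOULD_PARK */
            is_parked = true;                /* KTHREAD_IS_PARKED   */
            pthread_cond_wait(&cond, &lock); /* "TASK_PARKED" sleep */
        }
        is_parked = false;
        pthread_mutex_unlock(&lock);
    }

    static void unpark(void)                 /* ~ __kthread_unpark() */
    {
        pthread_mutex_lock(&lock);
        should_park = false;
        if (is_parked)                       /* wake only a parked worker,  */
            pthread_cond_broadcast(&cond);   /* like wake_up_state(PARKED) */
        pthread_mutex_unlock(&lock);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        parkme();                            /* returns once unparked */
        puts("worker unparked");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        should_park = true;                  /* ask the worker to park */
        pthread_create(&t, NULL, worker, NULL);
        unpark();                            /* later: let it run again */
        pthread_join(&t, NULL);
        return 0;
    }
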
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8a0efac4f99d..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
- printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
- printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr, *hlock_next;
+ struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2052,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2060,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
- hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock_next->irq_context)
+ if (hlock_curr->irq_context != hlock->irq_context)
break;
- hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
- printk("turning off the locking correctness validator.\n");
lockdep_print_held_locks(current);
debug_show_all_locks();
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
# include <asm/mutex.h>
#endif
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
+
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ lock->spin_mlock = NULL;
+#endif
debug_mutex_init(lock, name, key);
}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * In order to avoid a stampede of mutex spinners from acquiring the mutex
+ * more or less simultaneously, the spinners need to acquire a MCS lock
+ * first before spinning on the owner field.
+ *
+ * We don't inline mspin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+struct mspin_node {
+ struct mspin_node *next;
+ int locked; /* 1 if lock acquired */
+};
+#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
+
+static noinline
+void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+ smp_wmb();
+}
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ int retval = 1;
+
+ rcu_read_lock();
+ if (lock->owner)
+ retval = lock->owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+#endif
+
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
*/
+ if (!mutex_can_spin_on_owner(lock))
+ goto slowpath;
for (;;) {
struct task_struct *owner;
+ struct mspin_node node;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
+ mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
+ if (owner && !mutex_spin_on_owner(lock, owner)) {
+ mspin_unlock(MLOCK(lock), &node);
break;
+ }
- if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ if ((atomic_read(&lock->count) == 1) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
+ mspin_unlock(MLOCK(lock), &node);
preempt_enable();
return 0;
}
+ mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
+slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
goto done;
lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) &&
+ (atomic_xchg(&lock->count, -1) == 1))
break;
/*
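
The heart of the mutex change is the MCS-style queue (mspin_lock()/mspin_unlock()): each would-be spinner enqueues a stack-allocated node and spins only on its own node, so only the queue head competes for lock->count at any time. Below is a stand-alone C11 sketch of the same queue lock, with atomic_exchange()/atomic_compare_exchange_strong() standing in for xchg()/cmpxchg() and a plain busy loop for arch_mutex_cpu_relax(); it is an illustration, not the kernel implementation:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct mcs_node {
        _Atomic(struct mcs_node *) next;
        atomic_bool locked;                  /* set when the lock is handed to us */
    };

    static _Atomic(struct mcs_node *) mcs_tail;  /* NULL means the queue is empty */

    static void mcs_lock(struct mcs_node *node)
    {
        struct mcs_node *prev;

        atomic_store(&node->next, NULL);
        atomic_store(&node->locked, false);

        prev = atomic_exchange(&mcs_tail, node);     /* join the queue (xchg)   */
        if (!prev)
            return;                                  /* queue was empty: got it */

        atomic_store(&prev->next, node);             /* link behind predecessor */
        while (!atomic_load(&node->locked))          /* spin on our own node    */
            ;
    }

    static void mcs_unlock(struct mcs_node *node)
    {
        struct mcs_node *next = atomic_load(&node->next);

        if (!next) {
            struct mcs_node *expected = node;

            /* No visible successor: try to mark the queue empty (cmpxchg). */
            if (atomic_compare_exchange_strong(&mcs_tail, &expected, NULL))
                return;
            /* A successor is mid-enqueue; wait until it links itself. */
            while (!(next = atomic_load(&node->next)))
                ;
        }
        atomic_store(&next->locked, true);           /* pass the lock down */
    }

    static long counter;

    static void *contender(void *arg)
    {
        struct mcs_node node;                        /* stack node, as in the patch */
        long i;

        (void)arg;
        for (i = 0; i < 100000; i++) {
            mcs_lock(&node);
            counter++;                               /* protected by the MCS lock */
            mcs_unlock(&node);
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, contender, NULL);
        for (i = 0; i < 4; i++)
            pthread_join(&t[i], NULL);
        printf("counter = %ld (expect 400000)\n", counter);
        return 0;
    }

As in __mutex_lock_common() above, each spinner's node lives on its own stack, so contended CPUs spin on distinct cache lines instead of all hammering the mutex word, and the MUTEX_SHOW_NO_WAITER() checks skip the atomic xchg entirely when waiters are already queued and it cannot succeed.
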
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/stat.h>
#include "rtmutex.h"
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67d04651f44b..42053547e0f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2999,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
-}
-#endif
-
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
* Decrement CPU power based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/signal.c b/kernel/signal.c
index dd72567767d9..598dc06be421 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2948,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
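
The signal.c fix zero-initializes the siginfo that do_tkill() builds: only a handful of fields are assigned explicitly, and the rest of the structure (a large union plus padding) would otherwise carry stale kernel stack contents. A tiny illustration with a hypothetical struct; `= { 0 }` is the portable spelling, and the `= {}` used in the patch is the GCC extension with the same effect:

    #include <stdio.h>

    struct fake_siginfo {
        int si_signo;
        int si_errno;
        int si_code;
        char pad[16];        /* stands in for the large union in siginfo */
    };

    int main(void)
    {
        struct fake_siginfo a;              /* uninitialized: pad[] is garbage  */
        struct fake_siginfo b = { 0 };      /* zero-initialized, like the patch */

        a.si_signo = b.si_signo = 9;
        a.si_errno = b.si_errno = 0;

        printf("b fully zeroed beyond the set fields: %d\n",
               b.si_code == 0 && b.pad[15] == 0);
        /* a.pad may hold whatever was on the stack; copying it out
         * (for example to user space) would leak that data. */
        (void)a;
        return 0;
    }
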
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf0..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
}
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
- if (ht->create)
- ht->create(cpu);
+ if (ht->create) {
+ /*
+ * Make sure that the task has actually scheduled out
+ * into park position, before calling the create
+ * callback. At least the migration thread callback
+ * requires that the task is off the runqueue.
+ */
+ if (!wait_task_inactive(tsk, TASK_PARKED))
+ WARN_ON(1);
+ else
+ ht->create(cpu);
+ }
return 0;
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..5a0f781cd729 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
struct request *rq)
{
- struct blk_trace *bt = q->blk_trace;
-
- /* if control ever passes through here, it's a request based driver */
- if (unlikely(bt && !bt->rq_based))
- bt->rq_based = true;
-
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
}
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
-static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ int error)
{
- struct request_queue *q;
- struct blk_trace *bt;
-
- if (!bio->bi_bdev)
- return;
-
- q = bdev_get_queue(bio->bi_bdev);
- bt = q->blk_trace;
-
- /*
- * Request based drivers will generate both rq and bio completions.
- * Ignore bio ones.
- */
- if (likely(!bt) || bt->rq_based)
- return;
-
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a54f26f82eb2..e134d8f365dd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -25,7 +25,8 @@
static struct kmem_cache *user_ns_cachep __read_mostly;
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -612,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (map->nr_extents != 0)
goto out;
- /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
- * over the user namespace in order to set the id mapping.
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
/* Get a buffer */
@@ -700,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
/* Map the lower ids from the parent user namespace to the
@@ -787,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
&ns->projid_map, &ns->parent->projid_map);
}
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
/* Allow mapping to your own filesystem ids */
@@ -795,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, current_fsuid()))
+ if (uid_eq(uid, file->f_cred->fsuid))
return true;
}
else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, current_fsgid()))
+ if (gid_eq(gid, file->f_cred->fsgid))
return true;
}
}
@@ -811,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ * And the opener of the id file also had the appropriate capability.
*/
- if (ns_capable(ns->parent, cap_setid))
+ if (ns_capable(ns->parent, cap_setid) &&
+ file_ns_capable(file, ns->parent, cap_setid))
return true;
return false;
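
The user_namespace changes evaluate the id-map permission against the credentials of whoever opened the map file (file->f_cred, via file_ns_capable()) as well as the writing task, so an unprivileged opener cannot use a more privileged writer as a confused deputy. A condensed user-space sketch of that double check; all names here are hypothetical and only the decision logic is modeled, not the full map_write() path:

    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/types.h>

    struct opener_cred {
        uid_t fsuid;            /* captured at open() time, like file->f_cred */
        bool  has_setuid_cap;
    };

    static bool map_write_allowed(const struct opener_cred *opener,
                                  bool writer_has_setuid_cap,
                                  uid_t requested_lower_uid)
    {
        /* Mapping your own fsuid needs no extra privilege, but it must be
         * the opener's fsuid, not the possibly more privileged writer's. */
        if (requested_lower_uid == opener->fsuid)
            return true;

        /* Otherwise both the writer and the original opener must be capable. */
        return writer_has_setuid_cap && opener->has_setuid_cap;
    }

    int main(void)
    {
        struct opener_cred unprivileged_opener = { .fsuid = 1000,
                                                   .has_setuid_cap = false };

        /* A root writer tricked into mapping uid 0 for this opener is refused. */
        printf("%d\n", map_write_allowed(&unprivileged_opener,
                                         /*writer_has_setuid_cap=*/true, 0));
        return 0;
    }
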