summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/local_storage.c5
-rw-r--r--kernel/bpf/sockmap.c155
-rw-r--r--kernel/bpf/verifier.c12
-rw-r--r--kernel/bpf/xskmap.c10
-rw-r--r--kernel/cgroup/cgroup.c73
-rw-r--r--kernel/cpu.c11
-rw-r--r--kernel/dma/Kconfig19
-rw-r--r--kernel/dma/Makefile1
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/debug.c16
-rw-r--r--kernel/dma/direct.c222
-rw-r--r--kernel/dma/mapping.c71
-rw-r--r--kernel/dma/noncoherent.c106
-rw-r--r--kernel/events/core.c32
-rw-r--r--kernel/events/hw_breakpoint.c13
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/locking/lockdep.c1
-rw-r--r--kernel/locking/mutex.c3
-rw-r--r--kernel/locking/test-ww_mutex.c12
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/printk/printk.c12
-rw-r--r--kernel/printk/printk_safe.c4
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/fair.c154
-rw-r--r--kernel/sched/sched.h5
-rw-r--r--kernel/sched/topology.c5
-rw-r--r--kernel/signal.c14
-rw-r--r--kernel/sys.c3
-rw-r--r--kernel/time/clocksource.c40
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/preemptirq_delay_test.c10
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace_events_hist.c32
-rw-r--r--kernel/tracepoint.c24
39 files changed, 701 insertions, 401 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2590700237c1..138f0302692e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
hdr = &btf->hdr;
cur = btf->nohdr_data + hdr->type_off;
- end = btf->nohdr_data + hdr->type_len;
+ end = cur + hdr->type_len;
env->log_type_id = 1;
while (cur < end) {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 22ad967d1e5f..830d7f095748 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (flags & BPF_NOEXIST)
+ if (flags != BPF_ANY && flags != BPF_EXIST)
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
return ERR_PTR(-EINVAL);
+ if (attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
if (attr->value_size > PAGE_SIZE)
return ERR_PTR(-E2BIG);
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index cf5195c7c331..0a0f2ec75370 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -132,6 +132,7 @@ struct smap_psock {
struct work_struct gc_work;
struct proto *sk_proto;
+ void (*save_unhash)(struct sock *sk);
void (*save_close)(struct sock *sk, long timeout);
void (*save_data_ready)(struct sock *sk);
void (*save_write_space)(struct sock *sk);
@@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
+static void bpf_tcp_unhash(struct sock *sk);
static void bpf_tcp_close(struct sock *sk, long timeout);
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
struct proto *base)
{
prot[SOCKMAP_BASE] = *base;
+ prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash;
prot[SOCKMAP_BASE].close = bpf_tcp_close;
prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg;
prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read;
@@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk)
return -EBUSY;
}
+ psock->save_unhash = sk->sk_prot->unhash;
psock->save_close = sk->sk_prot->close;
psock->sk_proto = sk->sk_prot;
@@ -236,7 +240,7 @@ static int bpf_tcp_init(struct sock *sk)
}
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
static void bpf_tcp_release(struct sock *sk)
{
@@ -248,7 +252,7 @@ static void bpf_tcp_release(struct sock *sk)
goto out;
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
psock->cork = NULL;
}
@@ -305,39 +309,21 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
return e;
}
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
{
- void (*close_fun)(struct sock *sk, long timeout);
struct smap_psock_map_entry *e;
struct sk_msg_buff *md, *mtmp;
- struct smap_psock *psock;
struct sock *osk;
- lock_sock(sk);
- rcu_read_lock();
- psock = smap_psock_sk(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- return sk->sk_prot->close(sk, timeout);
- }
-
- /* The psock may be destroyed anytime after exiting the RCU critial
- * section so by the time we use close_fun the psock may no longer
- * be valid. However, bpf_tcp_close is called with the sock lock
- * held so the close hook and sk are still valid.
- */
- close_fun = psock->save_close;
-
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
psock->cork = NULL;
}
list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
list_del(&md->list);
- free_start_sg(psock->sock, md);
+ free_start_sg(psock->sock, md, true);
kfree(md);
}
@@ -369,7 +355,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
/* If another thread deleted this object skip deletion.
* The refcnt on psock may or may not be zero.
*/
- if (l) {
+ if (l && l == link) {
hlist_del_rcu(&link->hash_node);
smap_release_sock(psock, link->sk);
free_htab_elem(htab, link);
@@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
kfree(e);
e = psock_map_pop(sk, psock);
}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+ void (*unhash_fun)(struct sock *sk);
+ struct smap_psock *psock;
+
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+ unhash_fun = psock->save_unhash;
+ bpf_tcp_remove(sk, psock);
+ rcu_read_unlock();
+ unhash_fun(sk);
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+ void (*close_fun)(struct sock *sk, long timeout);
+ struct smap_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = smap_psock_sk(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+ close_fun = psock->save_close;
+ bpf_tcp_remove(sk, psock);
rcu_read_unlock();
release_sock(sk);
close_fun(sk, timeout);
@@ -570,14 +592,16 @@ static void free_bytes_sg(struct sock *sk, int bytes,
md->sg_start = i;
}
-static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+static int free_sg(struct sock *sk, int start,
+ struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = start, free = 0;
while (sg[i].length) {
free += sg[i].length;
- sk_mem_uncharge(sk, sg[i].length);
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
if (!md->skb)
put_page(sg_page(&sg[i]));
sg[i].length = 0;
@@ -594,9 +618,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
return free;
}
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
{
- int free = free_sg(sk, md->sg_start, md);
+ int free = free_sg(sk, md->sg_start, md, charge);
md->sg_start = md->sg_end;
return free;
@@ -604,7 +628,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
{
- return free_sg(sk, md->sg_curr, md);
+ return free_sg(sk, md->sg_curr, md, true);
}
static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
@@ -718,7 +742,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
list_add_tail(&r->list, &psock->ingress);
sk->sk_data_ready(sk);
} else {
- free_start_sg(sk, r);
+ free_start_sg(sk, r, true);
kfree(r);
}
@@ -752,14 +776,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
release_sock(sk);
}
smap_release_sock(psock, sk);
- if (unlikely(err))
- goto out;
- return 0;
+ return err;
out_rcu:
rcu_read_unlock();
-out:
- free_bytes_sg(NULL, send, md, false);
- return err;
+ return 0;
}
static inline void bpf_md_init(struct smap_psock *psock)
@@ -822,7 +842,7 @@ more_data:
case __SK_PASS:
err = bpf_tcp_push(sk, send, m, flags, true);
if (unlikely(err)) {
- *copied -= free_start_sg(sk, m);
+ *copied -= free_start_sg(sk, m, true);
break;
}
@@ -845,16 +865,17 @@ more_data:
lock_sock(sk);
if (unlikely(err < 0)) {
- free_start_sg(sk, m);
+ int free = free_start_sg(sk, m, false);
+
psock->sg_size = 0;
if (!cork)
- *copied -= send;
+ *copied -= free;
} else {
psock->sg_size -= send;
}
if (cork) {
- free_start_sg(sk, m);
+ free_start_sg(sk, m, true);
psock->sg_size = 0;
kfree(m);
m = NULL;
@@ -912,6 +933,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
rcu_read_lock();
psock = smap_psock_sk(sk);
@@ -922,9 +945,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
rcu_read_unlock();
- if (!skb_queue_empty(&sk->sk_receive_queue))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
lock_sock(sk);
bytes_ready:
while (copied != len) {
@@ -1122,7 +1142,7 @@ wait_for_memory:
err = sk_stream_wait_memory(sk, &timeo);
if (err) {
if (m && m != psock->cork)
- free_start_sg(sk, m);
+ free_start_sg(sk, m, true);
goto out_err;
}
}
@@ -1464,10 +1484,16 @@ static void smap_destroy_psock(struct rcu_head *rcu)
schedule_work(&psock->gc_work);
}
+static bool psock_is_smap_sk(struct sock *sk)
+{
+ return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
+}
+
static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
if (refcount_dec_and_test(&psock->refcnt)) {
- tcp_cleanup_ulp(sock);
+ if (psock_is_smap_sk(sock))
+ tcp_cleanup_ulp(sock);
write_lock_bh(&sock->sk_callback_lock);
smap_stop_sock(psock, sock);
write_unlock_bh(&sock->sk_callback_lock);
@@ -1581,13 +1607,13 @@ static void smap_gc_work(struct work_struct *w)
bpf_prog_put(psock->bpf_tx_msg);
if (psock->cork) {
- free_start_sg(psock->sock, psock->cork);
+ free_start_sg(psock->sock, psock->cork, true);
kfree(psock->cork);
}
list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
list_del(&md->list);
- free_start_sg(psock->sock, md);
+ free_start_sg(psock->sock, md, true);
kfree(md);
}
@@ -1894,6 +1920,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
* doesn't update user data.
*/
if (psock) {
+ if (!psock_is_smap_sk(sock)) {
+ err = -EBUSY;
+ goto out_progs;
+ }
if (READ_ONCE(psock->bpf_parse) && parse) {
err = -EBUSY;
goto out_progs;
@@ -2089,8 +2119,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+ * state.
+ */
if (skops.sk->sk_type != SOCK_STREAM ||
- skops.sk->sk_protocol != IPPROTO_TCP) {
+ skops.sk->sk_protocol != IPPROTO_TCP ||
+ skops.sk->sk_state != TCP_ESTABLISHED) {
fput(socket->file);
return -EOPNOTSUPP;
}
@@ -2445,6 +2479,16 @@ static int sock_hash_update_elem(struct bpf_map *map,
return -EINVAL;
}
+ /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+ * state.
+ */
+ if (skops.sk->sk_type != SOCK_STREAM ||
+ skops.sk->sk_protocol != IPPROTO_TCP ||
+ skops.sk->sk_state != TCP_ESTABLISHED) {
+ fput(socket->file);
+ return -EOPNOTSUPP;
+ }
+
lock_sock(skops.sk);
preempt_disable();
rcu_read_lock();
@@ -2535,10 +2579,22 @@ const struct bpf_map_ops sock_hash_ops = {
.map_check_btf = map_check_no_btf,
};
+static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
+{
+ return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
struct bpf_map *, map, void *, key, u64, flags)
{
WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* ULPs are currently supported only for TCP sockets in ESTABLISHED
+ * state. This checks that the sock ops triggering the update is
+ * one indicating we are (or will be soon) in an ESTABLISHED state.
+ */
+ if (!bpf_is_valid_sock_op(bpf_sock))
+ return -EOPNOTSUPP;
return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}
@@ -2557,6 +2613,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
struct bpf_map *, map, void *, key, u64, flags)
{
WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!bpf_is_valid_sock_op(bpf_sock))
+ return -EOPNOTSUPP;
return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92246117d2b0..465952a8e465 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
u64 umin_val, umax_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+ if (insn_bitness == 32) {
+ /* Relevant for 32-bit RSH: Information can propagate towards
+ * LSB, so it isn't sufficient to only truncate the output to
+ * 32 bits.
+ */
+ coerce_reg_to_size(dst_reg, 4);
+ coerce_reg_to_size(&src_reg, 4);
+ }
+
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
@@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops are (32,32)->32 */
coerce_reg_to_size(dst_reg, 4);
- coerce_reg_to_size(&src_reg, 4);
}
__reg_deduce_bounds(dst_reg);
@@ -3163,7 +3171,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar. Disallow all math except
* pointer subtraction
*/
- if (opcode == BPF_SUB){
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
}
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9f8463afda9c..47147c9e184d 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
sock_hold(sock->sk);
old_xs = xchg(&m->xsk_map[i], xs);
- if (old_xs) {
- /* Make sure we've flushed everything. */
- synchronize_net();
+ if (old_xs)
sock_put((struct sock *)old_xs);
- }
sockfd_put(sock);
return 0;
@@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
old_xs = xchg(&m->xsk_map[k], NULL);
- if (old_xs) {
- /* Make sure we've flushed everything. */
- synchronize_net();
+ if (old_xs)
sock_put((struct sock *)old_xs);
- }
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index aae10baf1902..4c1cf0969a80 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
}
/**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
@@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
@@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
}
/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css)
+ return css;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ return init_css_set.subsys[ss->id];
+}
+
+/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
@@ -604,10 +633,11 @@ EXPORT_SYMBOL_GPL(of_css);
*
* Should be called under cgroup_[tree_]mutex.
*/
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- ; \
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css_by_mask(cgrp, \
+ cgroup_subsys[(ssid)]))) \
+ ; \
else
/**
@@ -1006,7 +1036,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
- template[i] = cgroup_e_css(cgrp, ss);
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
@@ -2836,11 +2866,12 @@ restart:
}
/**
- * cgroup_save_control - save control masks of a subtree
+ * cgroup_save_control - save control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
- * Save ->subtree_control and ->subtree_ss_mask to the respective old_
- * prefixed fields for @cgrp's subtree including @cgrp itself.
+ * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
+ * respective old_ prefixed fields for @cgrp's subtree including @cgrp
+ * itself.
*/
static void cgroup_save_control(struct cgroup *cgrp)
{
@@ -2850,6 +2881,7 @@ static void cgroup_save_control(struct cgroup *cgrp)
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
dsct->old_subtree_control = dsct->subtree_control;
dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ dsct->old_dom_cgrp = dsct->dom_cgrp;
}
}
@@ -2875,11 +2907,12 @@ static void cgroup_propagate_control(struct cgroup *cgrp)
}
/**
- * cgroup_restore_control - restore control masks of a subtree
+ * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
- * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
- * prefixed fields for @cgrp's subtree including @cgrp itself.
+ * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
+ * respective old_ prefixed fields for @cgrp's subtree including @cgrp
+ * itself.
*/
static void cgroup_restore_control(struct cgroup *cgrp)
{
@@ -2889,6 +2922,7 @@ static void cgroup_restore_control(struct cgroup *cgrp)
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
dsct->subtree_control = dsct->old_subtree_control;
dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ dsct->dom_cgrp = dsct->old_dom_cgrp;
}
}
@@ -3019,7 +3053,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
return ret;
/*
- * At this point, cgroup_e_css() results reflect the new csses
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
@@ -3196,6 +3230,8 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup *dom_cgrp = parent->dom_cgrp;
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
int ret;
lockdep_assert_held(&cgroup_mutex);
@@ -3225,12 +3261,13 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)
*/
cgroup_save_control(cgrp);
- cgrp->dom_cgrp = dom_cgrp;
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
+ if (dsct == cgrp || cgroup_is_threaded(dsct))
+ dsct->dom_cgrp = dom_cgrp;
+
ret = cgroup_apply_control(cgrp);
if (!ret)
parent->nr_threaded_children++;
- else
- cgrp->dom_cgrp = cgrp;
cgroup_finalize_control(cgrp, ret);
return ret;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index aa7fe85ad62e..0097acec1c71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -607,15 +607,15 @@ static void cpuhp_thread_fun(unsigned int cpu)
bool bringup = st->bringup;
enum cpuhp_state state;
+ if (WARN_ON_ONCE(!st->should_run))
+ return;
+
/*
* ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
* that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (WARN_ON_ONCE(!st->should_run))
- return;
-
cpuhp_lock_acquire(bringup);
if (st->single) {
@@ -916,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
if (ret) {
st->target = prev_state;
- undo_cpu_down(cpu, st);
+ if (st->state < prev_state)
+ undo_cpu_down(cpu, st);
break;
}
}
@@ -969,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
- if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+ if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
cpuhp_reset_state(st, prev_state);
__cpuhp_kick_ap(st);
}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 9bd54304446f..645c7a2ecde8 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -13,6 +13,9 @@ config NEED_DMA_MAP_STATE
config ARCH_DMA_ADDR_T_64BIT
def_bool 64BIT || PHYS_ADDR_T_64BIT
+config ARCH_HAS_DMA_COHERENCE_H
+ bool
+
config HAVE_GENERIC_DMA_COHERENT
bool
@@ -23,22 +26,22 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
bool
select NEED_DMA_MAP_STATE
-config DMA_DIRECT_OPS
+config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
bool
- depends on HAS_DMA
-config DMA_NONCOHERENT_OPS
+config ARCH_HAS_DMA_COHERENT_TO_PFN
bool
- depends on HAS_DMA
- select DMA_DIRECT_OPS
-config DMA_NONCOHERENT_MMAP
+config ARCH_HAS_DMA_MMAP_PGPROT
bool
- depends on DMA_NONCOHERENT_OPS
+
+config DMA_DIRECT_OPS
+ bool
+ depends on HAS_DMA
config DMA_NONCOHERENT_CACHE_SYNC
bool
- depends on DMA_NONCOHERENT_OPS
+ depends on DMA_DIRECT_OPS
config DMA_VIRT_OPS
bool
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 6de44e4eb454..7d581e4eea4a 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -4,7 +4,6 @@ obj-$(CONFIG_HAS_DMA) += mapping.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o
-obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 286d82329eb0..b2a87905846d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -49,7 +49,11 @@ static phys_addr_t limit_cmdline;
static int __init early_cma(char *p)
{
- pr_debug("%s(%s)\n", __func__, p);
+ if (!p) {
+ pr_err("Config string not provided\n");
+ return -EINVAL;
+ }
+
size_cmdline = memparse(p, &p);
if (*p != '@')
return 0;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index c007d25bee09..231ca4628062 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1312,6 +1312,22 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg)
#endif
}
+void debug_dma_map_single(struct device *dev, const void *addr,
+ unsigned long len)
+{
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (!virt_addr_valid(addr))
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
+ addr, len);
+
+ if (is_vmalloc_addr(addr))
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
+ addr, len);
+}
+EXPORT_SYMBOL(debug_dma_map_single);
+
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
size_t size, int direction, dma_addr_t dma_addr,
bool map_single)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1c35b7b945d0..87a6bc2a96c0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -1,13 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * DMA operations that map physical memory directly without using an IOMMU or
- * flushing caches.
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without using an IOMMU.
*/
+#include <linux/bootmem.h> /* for max_pfn */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/dma-direct.h>
#include <linux/scatterlist.h>
#include <linux/dma-contiguous.h>
+#include <linux/dma-noncoherent.h>
#include <linux/pfn.h>
#include <linux/set_memory.h>
@@ -41,40 +44,83 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
return false;
}
- if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
+ if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
dev_err(dev,
- "%s: overflow %pad+%zu of device mask %llx\n",
- caller, &dma_addr, size, *dev->dma_mask);
+ "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n",
+ caller, &dma_addr, size,
+ *dev->dma_mask, dev->bus_dma_mask);
}
return false;
}
return true;
}
+static inline dma_addr_t phys_to_dma_direct(struct device *dev,
+ phys_addr_t phys)
+{
+ if (force_dma_unencrypted())
+ return __phys_to_dma(dev, phys);
+ return phys_to_dma(dev, phys);
+}
+
+u64 dma_direct_get_required_mask(struct device *dev)
+{
+ u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+
+ if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
+ max_dma = dev->bus_dma_mask;
+
+ return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
+}
+
+static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+ u64 *phys_mask)
+{
+ if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
+ dma_mask = dev->bus_dma_mask;
+
+ if (force_dma_unencrypted())
+ *phys_mask = __dma_to_phys(dev, dma_mask);
+ else
+ *phys_mask = dma_to_phys(dev, dma_mask);
+
+ /*
+ * Optimistically try the zone that the physical address mask falls
+ * into first. If that returns memory that isn't actually addressable
+ * we will fallback to the next lower zone and try again.
+ *
+ * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
+ * zones.
+ */
+ if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+ return GFP_DMA;
+ if (*phys_mask <= DMA_BIT_MASK(32))
+ return GFP_DMA32;
+ return 0;
+}
+
static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
- dma_addr_t addr = force_dma_unencrypted() ?
- __phys_to_dma(dev, phys) : phys_to_dma(dev, phys);
- return addr + size - 1 <= dev->coherent_dma_mask;
+ return phys_to_dma_direct(dev, phys) + size - 1 <=
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
}
-void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, unsigned long attrs)
+void *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
struct page *page = NULL;
+ u64 phys_mask;
void *ret;
+ if (attrs & DMA_ATTR_NO_WARN)
+ gfp |= __GFP_NOWARN;
+
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
-
- /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
- if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
- gfp |= GFP_DMA;
- if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
- gfp |= GFP_DMA32;
-
+ gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+ &phys_mask);
again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
@@ -93,15 +139,14 @@ again:
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
- dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+ phys_mask < DMA_BIT_MASK(64) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;
}
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
- dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
- !(gfp & GFP_DMA)) {
+ phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -124,7 +169,7 @@ again:
* NOTE: this function must never look at the dma_addr argument, because we want
* to be able to use it as a helper for iommu implementations as well.
*/
-void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
+void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_addr, unsigned long attrs)
{
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
@@ -136,14 +181,96 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
free_pages((unsigned long)cpu_addr, page_order);
}
+void *dma_direct_alloc(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+ if (!dev_is_dma_coherent(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+}
+
+void dma_direct_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
+{
+ if (!dev_is_dma_coherent(dev))
+ arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
+ else
+ dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+}
+
+static void dma_direct_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ if (dev_is_dma_coherent(dev))
+ return;
+ arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
+}
+
+static void dma_direct_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
+}
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+static void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ if (dev_is_dma_coherent(dev))
+ return;
+ arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
+ arch_sync_dma_for_cpu_all(dev);
+}
+
+static void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ if (dev_is_dma_coherent(dev))
+ return;
+
+ for_each_sg(sgl, sg, nents, i)
+ arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
+ arch_sync_dma_for_cpu_all(dev);
+}
+
+static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+}
+#endif
+
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
- dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset;
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dma_addr = phys_to_dma(dev, phys);
if (!check_addr(dev, dma_addr, size, __func__))
return DIRECT_MAPPING_ERROR;
+
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
return dma_addr;
}
@@ -162,31 +289,29 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_sg_for_device(dev, sgl, nents, dir);
return nents;
}
+/*
+ * Because 32-bit DMA masks are so common we expect every architecture to be
+ * able to satisfy them - either by not supporting more physical memory, or by
+ * providing a ZONE_DMA32. If neither is the case, the architecture needs to
+ * use an IOMMU instead of the direct mapping.
+ */
int dma_direct_supported(struct device *dev, u64 mask)
{
-#ifdef CONFIG_ZONE_DMA
- if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
- return 0;
-#else
- /*
- * Because 32-bit DMA masks are so common we expect every architecture
- * to be able to satisfy them - either by not supporting more physical
- * memory, or by providing a ZONE_DMA32. If neither is the case, the
- * architecture needs to use an IOMMU instead of the direct mapping.
- */
- if (mask < DMA_BIT_MASK(32))
- return 0;
-#endif
- /*
- * Upstream PCI/PCIe bridges or SoC interconnects may not carry
- * as many DMA address bits as the device itself supports.
- */
- if (dev->bus_dma_mask && mask > dev->bus_dma_mask)
- return 0;
- return 1;
+ u64 min_mask;
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
+ else
+ min_mask = DMA_BIT_MASK(32);
+
+ min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
+
+ return mask >= phys_to_dma(dev, min_mask);
}
int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -199,7 +324,20 @@ const struct dma_map_ops dma_direct_ops = {
.free = dma_direct_free,
.map_page = dma_direct_map_page,
.map_sg = dma_direct_map_sg,
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
+ .sync_single_for_device = dma_direct_sync_single_for_device,
+ .sync_sg_for_device = dma_direct_sync_sg_for_device,
+#endif
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+ .sync_single_for_cpu = dma_direct_sync_single_for_cpu,
+ .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu,
+ .unmap_page = dma_direct_unmap_page,
+ .unmap_sg = dma_direct_unmap_sg,
+#endif
+ .get_required_mask = dma_direct_get_required_mask,
.dma_supported = dma_direct_supported,
.mapping_error = dma_direct_mapping_error,
+ .cache_sync = arch_dma_cache_sync,
};
EXPORT_SYMBOL(dma_direct_ops);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index d2a92ddaac4d..58dec7a92b7b 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -7,7 +7,7 @@
*/
#include <linux/acpi.h>
-#include <linux/dma-mapping.h>
+#include <linux/dma-noncoherent.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/of_device.h>
@@ -202,17 +202,26 @@ EXPORT_SYMBOL(dmam_release_declared_memory);
* Create scatter-list for the already allocated DMA buffer.
*/
int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
- void *cpu_addr, dma_addr_t handle, size_t size)
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
+ struct page *page;
int ret;
- ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
- if (unlikely(ret))
- return ret;
+ if (!dev_is_dma_coherent(dev)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
+ return -ENXIO;
- sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
- return 0;
+ page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr,
+ dma_addr));
+ } else {
+ page = virt_to_page(cpu_addr);
+ }
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
}
EXPORT_SYMBOL(dma_common_get_sgtable);
@@ -220,27 +229,37 @@ EXPORT_SYMBOL(dma_common_get_sgtable);
* Create userspace mapping for the DMA-coherent memory.
*/
int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t dma_addr, size_t size)
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
{
- int ret = -ENXIO;
#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
+ unsigned long pfn;
+ int ret = -ENXIO;
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
- if (off < count && user_count <= (count - off))
- ret = remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + off,
- user_count << PAGE_SHIFT,
- vma->vm_page_prot);
-#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
+ if (off >= count || user_count > count - off)
+ return -ENXIO;
- return ret;
+ if (!dev_is_dma_coherent(dev)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
+ return -ENXIO;
+ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
+ } else {
+ pfn = page_to_pfn(virt_to_page(cpu_addr));
+ }
+
+ return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+#else
+ return -ENXIO;
+#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
}
EXPORT_SYMBOL(dma_common_mmap);
@@ -327,19 +346,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
vunmap(cpu_addr);
}
#endif
-
-/*
- * enables DMA API use for a device
- */
-int dma_configure(struct device *dev)
-{
- if (dev->bus->dma_configure)
- return dev->bus->dma_configure(dev);
- return 0;
-}
-
-void dma_deconfigure(struct device *dev)
-{
- of_dma_deconfigure(dev);
- acpi_dma_deconfigure(dev);
-}
diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c
deleted file mode 100644
index 031fe235d958..000000000000
--- a/kernel/dma/noncoherent.c
+++ /dev/null
@@ -1,106 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2018 Christoph Hellwig.
- *
- * DMA operations that map physical memory directly without providing cache
- * coherence.
- */
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/scatterlist.h>
-
-static void dma_noncoherent_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
-}
-
-static void dma_noncoherent_sync_sg_for_device(struct device *dev,
- struct scatterlist *sgl, int nents, enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
-}
-
-static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
-{
- dma_addr_t addr;
-
- addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
- if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- arch_sync_dma_for_device(dev, page_to_phys(page) + offset,
- size, dir);
- return addr;
-}
-
-static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir, unsigned long attrs)
-{
- nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs);
- if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir);
- return nents;
-}
-
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-static void dma_noncoherent_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
- arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir);
- arch_sync_dma_for_cpu_all(dev);
-}
-
-static void dma_noncoherent_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sgl, int nents, enum dma_data_direction dir)
-{
- struct scatterlist *sg;
- int i;
-
- for_each_sg(sgl, sg, nents, i)
- arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
- arch_sync_dma_for_cpu_all(dev);
-}
-
-static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir);
-}
-
-static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl,
- int nents, enum dma_data_direction dir, unsigned long attrs)
-{
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
- dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir);
-}
-#endif
-
-const struct dma_map_ops dma_noncoherent_ops = {
- .alloc = arch_dma_alloc,
- .free = arch_dma_free,
- .mmap = arch_dma_mmap,
- .sync_single_for_device = dma_noncoherent_sync_single_for_device,
- .sync_sg_for_device = dma_noncoherent_sync_sg_for_device,
- .map_page = dma_noncoherent_map_page,
- .map_sg = dma_noncoherent_map_sg,
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
- defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
- .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu,
- .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu,
- .unmap_page = dma_noncoherent_unmap_page,
- .unmap_sg = dma_noncoherent_unmap_sg,
-#endif
- .dma_supported = dma_direct_supported,
- .mapping_error = dma_direct_mapping_error,
- .cache_sync = arch_dma_cache_sync,
-};
-EXPORT_SYMBOL(dma_noncoherent_ops);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2a62b96600ad..5a97f34bc14c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
_perf_event_disable(bp);
err = modify_user_hw_breakpoint_check(bp, attr, true);
- if (err) {
- if (!bp->attr.disabled)
- _perf_event_enable(bp);
- return err;
- }
-
- if (!attr->disabled)
+ if (!bp->attr.disabled)
_perf_event_enable(bp);
- return 0;
+
+ return err;
}
static int perf_event_modify_attr(struct perf_event *event,
@@ -3940,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /* If this is a pinned event it must be running on this CPU */
+ if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -5948,6 +5949,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
+ mm_segment_t fs;
/*
* We dump:
@@ -5965,7 +5967,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
+ fs = get_fs();
+ set_fs(USER_DS);
rem = __output_copy_user(handle, (void *) sp, dump_size);
+ set_fs(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -8309,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
goto unlock;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->cpu != smp_processor_id())
+ continue;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
continue;
if (event->attr.config != entry->type)
@@ -9426,9 +9433,7 @@ static void free_pmu_context(struct pmu *pmu)
if (pmu->task_ctx_nr > perf_invalid_context)
return;
- mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
- mutex_unlock(&pmus_lock);
}
/*
@@ -9684,12 +9689,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
- int remove_device;
-
mutex_lock(&pmus_lock);
- remove_device = pmu_bus_running;
list_del_rcu(&pmu->entry);
- mutex_unlock(&pmus_lock);
/*
* We dereference the pmu list under both SRCU and regular RCU, so
@@ -9701,13 +9702,14 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
- if (remove_device) {
+ if (pmu_bus_running) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
put_device(pmu->dev);
}
free_pmu_context(pmu);
+ mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b3814fce5ecb..d6b56180827c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
+ int err;
+
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
* will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
else
perf_event_disable(bp);
- if (!attr->disabled) {
- int err = modify_user_hw_breakpoint_check(bp, attr, false);
+ err = modify_user_hw_breakpoint_check(bp, attr, false);
- if (err)
- return err;
+ if (!bp->attr.disabled)
perf_event_enable(bp);
- bp->attr.disabled = 0;
- }
- return 0;
+
+ return err;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
diff --git a/kernel/fork.c b/kernel/fork.c
index d896e9ca38b0..f0b58479534f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -550,8 +550,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01ebdf1f9f40..2e62503bea0d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
case MODULE_STATE_COMING:
ret = jump_label_add_module(mod);
if (ret) {
- WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
+ WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
}
break;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e406c5fdb41e..dd13f865ad40 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,7 +55,6 @@
#include "lockdep_internals.h"
-#include <trace/events/preemptirq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 1a81a1257b3f..3f8a35104285 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock,
/*
* wake_up_process() paired with set_current_state()
* inserts sufficient barriers to make sure @owner either sees
- * it's wounded in __ww_mutex_lock_check_stamp() or has a
+ * it's wounded in __ww_mutex_check_kill() or has a
* wakeup pending to re-read the wounded state.
*/
if (owner != current)
@@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, current);
lock_contended(&lock->dep_map, ip);
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 5b915b370d5a..65a3b7e55b9f 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work)
{
struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
struct ww_acquire_ctx ctx;
- int err;
+ int err, erra = 0;
ww_acquire_init(&ctx, &ww_class);
ww_mutex_lock(&cycle->a_mutex, &ctx);
@@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work)
err = ww_mutex_lock(cycle->b_mutex, &ctx);
if (err == -EDEADLK) {
+ err = 0;
ww_mutex_unlock(&cycle->a_mutex);
ww_mutex_lock_slow(cycle->b_mutex, &ctx);
- err = ww_mutex_lock(&cycle->a_mutex, &ctx);
+ erra = ww_mutex_lock(&cycle->a_mutex, &ctx);
}
if (!err)
ww_mutex_unlock(cycle->b_mutex);
- ww_mutex_unlock(&cycle->a_mutex);
+ if (!erra)
+ ww_mutex_unlock(&cycle->a_mutex);
ww_acquire_fini(&ctx);
- cycle->result = err;
+ cycle->result = err ?: erra;
}
static int __test_cycle(unsigned int nthreads)
@@ -324,7 +326,7 @@ static int __test_cycle(unsigned int nthreads)
if (!cycle->result)
continue;
- pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n",
+ pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n",
n, nthreads, cycle->result);
ret = -EINVAL;
break;
diff --git a/kernel/pid.c b/kernel/pid.c
index de1cfc4f75a2..cdf63e53a014 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
idr_preload_end();
if (nr < 0) {
- retval = nr;
+ retval = (nr == -ENOSPC) ? -EAGAIN : nr;
goto out_free;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5342f6fc022e..0bd595a0b610 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -63,6 +63,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
static DEFINE_RAW_SPINLOCK(s2idle_lock);
+bool pm_suspend_via_s2idle(void)
+{
+ return mem_sleep_current == PM_SUSPEND_TO_IDLE;
+}
+EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle);
+
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
lock_system_sleep();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd6f8ed28e01..9bf5404397e0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -351,7 +351,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
*/
enum log_flags {
- LOG_NOCONS = 1, /* suppress print, do not print to console */
LOG_NEWLINE = 2, /* text ended with a newline */
LOG_PREFIX = 4, /* text started with a prefix */
LOG_CONT = 8, /* text is a fragment of a continuation line */
@@ -1881,9 +1880,6 @@ int vprintk_store(int facility, int level,
if (dict)
lflags |= LOG_PREFIX|LOG_NEWLINE;
- if (suppress_message_printing(level))
- lflags |= LOG_NOCONS;
-
return log_output(facility, level, lflags,
dict, dictlen, text, text_len);
}
@@ -2032,6 +2028,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2368,10 +2365,11 @@ skip:
break;
msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
+ if (suppress_message_printing(msg->level)) {
/*
- * Skip record if !ignore_loglevel, and
- * record has level above the console loglevel.
+ * Skip record we have buffered and already printed
+ * directly to the console when we received it, and
+ * record that has level above the console loglevel.
*/
console_idx = log_next(console_idx);
console_seq++;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index a0a74c533e4b..0913b4d385de 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -306,12 +306,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
return printk_safe_log_store(s, fmt, args);
}
-void printk_nmi_enter(void)
+void notrace printk_nmi_enter(void)
{
this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
}
-void printk_nmi_exit(void)
+void notrace printk_nmi_exit(void)
{
this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 625bc9897f62..ad97f3ba5ec5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p);
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
perf_event_task_migrate(p);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 997ea7b839fa..91e4202b0634 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1607,7 +1607,7 @@ out:
return cpu;
}
-static void migrate_task_rq_dl(struct task_struct *p)
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
struct rq *rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 60caf1fb94e0..6383aa6a60ca 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- static_key_disable(&sched_feat_keys[i]);
+ static_key_disable_cpuslocked(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- static_key_enable(&sched_feat_keys[i]);
+ static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
+ cpus_read_lock();
inode_lock(inode);
ret = sched_feat_set(cmp);
inode_unlock(inode);
+ cpus_read_unlock();
if (ret < 0)
return ret;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b39fb596f6c1..908c9cdae2f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int last_cpupid, this_cpupid;
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+ /*
+ * Allow first faults or private faults to migrate immediately early in
+ * the lifetime of a task. The magic number 4 is based on waiting for
+ * two full passes of the "multi-stage node selection" test that is
+ * executed below.
+ */
+ if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+ return true;
/*
* Multi-stage node selection is used in conjunction with a periodic
@@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* This quadric squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
@@ -1514,6 +1524,21 @@ struct task_numa_env {
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
+ struct rq *rq = cpu_rq(env->dst_cpu);
+
+ /* Bail out if run-queue part of active NUMA balance. */
+ if (xchg(&rq->numa_migrate_on, 1))
+ return;
+
+ /*
+ * Clear previous best_cpu/rq numa-migrate flag, since task now
+ * found a better CPU to move/swap.
+ */
+ if (env->best_cpu != -1) {
+ rq = cpu_rq(env->best_cpu);
+ WRITE_ONCE(rq->numa_migrate_on, 0);
+ }
+
if (env->best_task)
put_task_struct(env->best_task);
if (p)
@@ -1553,6 +1578,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
}
/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP 30
+
+/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
* into account that it might be best if task running on the dst_cpu should
@@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
int dist = env->dist;
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return;
+
rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;
if (!cur) {
- if (maymove || imp > env->best_imp)
+ if (maymove && moveimp >= env->best_imp)
goto assign;
else
goto unlock;
@@ -1625,16 +1660,22 @@ static void task_numa_compare(struct task_numa_env *env,
task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp)
- goto unlock;
-
if (maymove && moveimp > imp && moveimp > env->best_imp) {
- imp = moveimp - 1;
+ imp = moveimp;
cur = NULL;
goto assign;
}
/*
+ * If the NUMA importance is less than SMALLIMP,
+ * task migration might only result in ping pong
+ * of tasks and also hurt performance due to cache
+ * misses.
+ */
+ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+ goto unlock;
+
+ /*
* In the overloaded case, try and keep the load balanced.
*/
load = task_h_load(env->p) - task_h_load(cur);
@@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_cpu = -1,
};
struct sched_domain *sd;
+ struct rq *best_rq;
unsigned long taskweight, groupweight;
int nid, ret, dist;
long taskimp, groupimp;
@@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
- /*
- * Reset the scan period if the task is being rescheduled on an
- * alternative node to recheck if the tasks is now properly placed.
- */
- p->numa_scan_period = task_scan_start(p);
-
+ best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
return ret;
}
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
@@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+ int src_nid = cpu_to_node(task_cpu(p));
+ int dst_nid = cpu_to_node(new_cpu);
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+ return;
+
+ if (src_nid == dst_nid)
+ return;
+
+ /*
+ * Allow resets if faults have been trapped before one scan
+ * has completed. This is most likely due to a new task that
+ * is pulled cross-node due to wakeups or load balancing.
+ */
+ if (p->numa_scan_seq) {
+ /*
+ * Avoid scan adjustments if moving to the preferred
+ * node or if the task was not previously running on
+ * the preferred node.
+ */
+ if (dst_nid == p->numa_preferred_nid ||
+ (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ return;
+ }
+
+ p->numa_scan_period = task_scan_start(p);
+}
+
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -3362,6 +3438,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
+ * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -3924,7 +4001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
@@ -4399,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
/*
* Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us
+ * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
+ * not running add to the tail so that later runqueues don't get starved.
*/
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (cfs_b->distribute_running)
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ else
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
* If we're the first throttled task, make sure the bandwidth
@@ -4545,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
*/
- while (throttled && cfs_b->runtime > 0) {
+ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
+ cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
+ cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4663,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->distribute_running) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock(&cfs_b->lock);
return;
@@ -4672,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+ if (runtime)
+ cfs_b->distribute_running = 1;
+
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
@@ -4682,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -4790,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ cfs_b->distribute_running = 0;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -6274,7 +6367,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* As blocked tasks retain absolute vruntime the migration needs to
@@ -6327,6 +6420,8 @@ static void migrate_task_rq_fair(struct task_struct *p)
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
+
+ update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
@@ -7263,6 +7358,7 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
+ const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7299,8 +7395,10 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7365,13 +7463,16 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
+ const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7482,10 +7583,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long scale_rt_capacity(int cpu)
+static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long max = arch_scale_cpu_capacity(sd, cpu);
unsigned long used, free;
unsigned long irq;
@@ -7507,7 +7608,7 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(cpu);
+ unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
@@ -8269,7 +8370,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
- return sds.busiest;
+ return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
@@ -9638,7 +9739,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4a2e8cae63c4..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
+
+ bool distribute_running;
#endif
};
@@ -783,6 +785,7 @@ struct rq {
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
+ unsigned int numa_migrate_on;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
@@ -1523,7 +1526,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p);
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 56a0fed30c0a..505a41c42b96 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (sched_domains_numa_levels <= 1) {
+ if (sched_domains_numa_levels <= 2) {
sched_numa_topology_type = NUMA_DIRECT;
return;
}
@@ -1380,9 +1380,6 @@ void sched_init_numa(void)
break;
}
- if (!level)
- return;
-
/*
* 'level' contains the number of unique distances
*
diff --git a/kernel/signal.c b/kernel/signal.c
index 5843c541fda9..e4aad0e90882 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3460,7 +3460,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
}
static int
-do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
+do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
+ size_t min_ss_size)
{
struct task_struct *t = current;
@@ -3490,7 +3491,7 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
ss_size = 0;
ss_sp = NULL;
} else {
- if (unlikely(ss_size < MINSIGSTKSZ))
+ if (unlikely(ss_size < min_ss_size))
return -ENOMEM;
}
@@ -3508,7 +3509,8 @@ SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
return -EFAULT;
err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
- current_user_stack_pointer());
+ current_user_stack_pointer(),
+ MINSIGSTKSZ);
if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
err = -EFAULT;
return err;
@@ -3519,7 +3521,8 @@ int restore_altstack(const stack_t __user *uss)
stack_t new;
if (copy_from_user(&new, uss, sizeof(stack_t)))
return -EFAULT;
- (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
+ (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(),
+ MINSIGSTKSZ);
/* squash all but EFAULT for now */
return 0;
}
@@ -3553,7 +3556,8 @@ static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
uss.ss_size = uss32.ss_size;
}
ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
- compat_user_stack_pointer());
+ compat_user_stack_pointer(),
+ COMPAT_MINSIGSTKSZ);
if (ret >= 0 && uoss_ptr) {
compat_stack_t old;
memset(&old, 0, sizeof(old));
diff --git a/kernel/sys.c b/kernel/sys.c
index cf5c67533ff1..123bd73046ec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -71,9 +71,6 @@
#include <asm/io.h>
#include <asm/unistd.h>
-/* Hardening for Spectre-v1 */
-#include <linux/nospec.h>
-
#include "uid16.h"
#ifndef SET_UNALIGN_CTL
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f74fb00d8064..0e6e97a01942 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags)
spin_unlock_irqrestore(&watchdog_lock, *flags);
}
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
/*
* Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+ /*
+ * We cannot directly run clocksource_watchdog_kthread() here, because
+ * clocksource_select() calls timekeeping_notify() which uses
+ * stop_machine(). One cannot use stop_machine() from a workqueue() due
+ * lock inversions wrt CPU hotplug.
+ *
+ * Also, we only ever run this work once or twice during the lifetime
+ * of the kernel, so there is no point in creating a more permanent
+ * kthread for this.
+ *
+ * If kthread_run fails the next watchdog scan over the
+ * watchdog_list will find the unstable clock again.
+ */
+ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
static void __clocksource_unstable(struct clocksource *cs)
{
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
/*
- * If the clocksource is registered clocksource_watchdog_work() will
+ * If the clocksource is registered clocksource_watchdog_kthread() will
* re-rate and re-select.
*/
if (list_empty(&cs->list)) {
@@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs)
if (cs->mark_unstable)
cs->mark_unstable(cs);
- /* kick clocksource_watchdog_work() */
+ /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(&watchdog_work);
}
@@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs)
* @cs: clocksource to be marked unstable
*
* This function is called by the x86 TSC code to mark clocksources as unstable;
- * it defers demotion and re-selection to a work.
+ * it defers demotion and re-selection to a kthread.
*/
void clocksource_mark_unstable(struct clocksource *cs)
{
@@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
}
}
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
-static int __clocksource_watchdog_work(void)
+static int __clocksource_watchdog_kthread(void)
{
struct clocksource *cs, *tmp;
unsigned long flags;
@@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void)
return select;
}
-static void clocksource_watchdog_work(struct work_struct *work)
+static int clocksource_watchdog_kthread(void *data)
{
mutex_lock(&clocksource_mutex);
- if (__clocksource_watchdog_work())
+ if (__clocksource_watchdog_kthread())
clocksource_select();
mutex_unlock(&clocksource_mutex);
+ return 0;
}
static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-static inline int __clocksource_watchdog_work(void) { return 0; }
+static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }
@@ -810,7 +830,7 @@ static int __init clocksource_done_booting(void)
/*
* Run the watchdog first to eliminate unstable clock sources
*/
- __clocksource_watchdog_work();
+ __clocksource_watchdog_kthread();
clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2868d85f1fb1..fac0ddf8a8e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return NULL;
- if (!bio->bi_css)
+ if (!bio->bi_blkg)
return NULL;
- return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+ return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
}
#else
static union kernfs_node_id *
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index f704390db9fc..d8765c952fab 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -5,12 +5,12 @@
* Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org>
*/
+#include <linux/trace_clock.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
-#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/string.h>
@@ -25,13 +25,13 @@ MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default ir
static void busy_wait(ulong time)
{
- ktime_t start, end;
- start = ktime_get();
+ u64 start, end;
+ start = trace_clock_local();
do {
- end = ktime_get();
+ end = trace_clock_local();
if (kthread_should_stop())
break;
- } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000));
+ } while ((end - start) < (time * 1000));
}
static int preemptirq_delay_run(void *data)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d92d4a982fd..65bd4616220d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
tmp_iter_page = first_page;
do {
+ cond_resched();
+
to_remove_page = tmp_iter_page;
rb_inc_page(cpu_buffer, &tmp_iter_page);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 85f6b01431c7..d239004aaf29 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -738,16 +738,30 @@ static void free_synth_field(struct synth_field *field)
kfree(field);
}
-static struct synth_field *parse_synth_field(char *field_type,
- char *field_name)
+static struct synth_field *parse_synth_field(int argc, char **argv,
+ int *consumed)
{
struct synth_field *field;
+ const char *prefix = NULL;
+ char *field_type = argv[0], *field_name;
int len, ret = 0;
char *array;
if (field_type[0] == ';')
field_type++;
+ if (!strcmp(field_type, "unsigned")) {
+ if (argc < 3)
+ return ERR_PTR(-EINVAL);
+ prefix = "unsigned ";
+ field_type = argv[1];
+ field_name = argv[2];
+ *consumed = 3;
+ } else {
+ field_name = argv[1];
+ *consumed = 2;
+ }
+
len = strlen(field_name);
if (field_name[len - 1] == ';')
field_name[len - 1] = '\0';
@@ -760,11 +774,15 @@ static struct synth_field *parse_synth_field(char *field_type,
array = strchr(field_name, '[');
if (array)
len += strlen(array);
+ if (prefix)
+ len += strlen(prefix);
field->type = kzalloc(len, GFP_KERNEL);
if (!field->type) {
ret = -ENOMEM;
goto free;
}
+ if (prefix)
+ strcat(field->type, prefix);
strcat(field->type, field_type);
if (array) {
strcat(field->type, array);
@@ -1009,7 +1027,7 @@ static int create_synth_event(int argc, char **argv)
struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
struct synth_event *event = NULL;
bool delete_event = false;
- int i, n_fields = 0, ret = 0;
+ int i, consumed = 0, n_fields = 0, ret = 0;
char *name;
mutex_lock(&synth_event_mutex);
@@ -1061,16 +1079,16 @@ static int create_synth_event(int argc, char **argv)
goto err;
}
- field = parse_synth_field(argv[i], argv[i + 1]);
+ field = parse_synth_field(argc - i, &argv[i], &consumed);
if (IS_ERR(field)) {
ret = PTR_ERR(field);
goto err;
}
- fields[n_fields] = field;
- i++; n_fields++;
+ fields[n_fields++] = field;
+ i += consumed - 1;
}
- if (i < argc) {
+ if (i < argc && strcmp(argv[i], ";") != 0) {
ret = -EINVAL;
goto err;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index bf2c06ef9afc..a3be42304485 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -28,8 +28,8 @@
#include <linux/sched/task.h>
#include <linux/static_key.h>
-extern struct tracepoint * const __start___tracepoints_ptrs[];
-extern struct tracepoint * const __stop___tracepoints_ptrs[];
+extern tracepoint_ptr_t __start___tracepoints_ptrs[];
+extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
DEFINE_SRCU(tracepoint_srcu);
EXPORT_SYMBOL_GPL(tracepoint_srcu);
@@ -371,25 +371,17 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
-static void for_each_tracepoint_range(struct tracepoint * const *begin,
- struct tracepoint * const *end,
+static void for_each_tracepoint_range(
+ tracepoint_ptr_t *begin, tracepoint_ptr_t *end,
void (*fct)(struct tracepoint *tp, void *priv),
void *priv)
{
+ tracepoint_ptr_t *iter;
+
if (!begin)
return;
-
- if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) {
- const int *iter;
-
- for (iter = (const int *)begin; iter < (const int *)end; iter++)
- fct(offset_to_ptr(iter), priv);
- } else {
- struct tracepoint * const *iter;
-
- for (iter = begin; iter < end; iter++)
- fct(*iter, priv);
- }
+ for (iter = begin; iter < end; iter++)
+ fct(tracepoint_ptr_deref(iter), priv);
}
#ifdef CONFIG_MODULES