diff options
Diffstat (limited to 'kernel')
39 files changed, 701 insertions, 401 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -236,7 +240,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +252,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -305,39 +309,21 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -369,7 +355,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); @@ -570,14 +592,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +618,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +628,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +742,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +776,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +842,7 @@ more_data: case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +865,17 @@ more_data: lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -912,6 +933,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +945,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { @@ -1122,7 +1142,7 @@ wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1464,10 +1484,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1581,13 +1607,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -1894,6 +1920,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; @@ -2089,8 +2119,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2445,6 +2479,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2535,10 +2579,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2557,6 +2613,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); @@ -3163,7 +3171,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..47147c9e184d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index aae10baf1902..4c1cf0969a80 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -492,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, } /** - * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * @@ -501,8 +501,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ -static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) +static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, + struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); @@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, } /** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * + * The returned css is not guaranteed to be online, and therefore it is the + * callers responsiblity to tryget a reference for it. + */ +struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + do { + css = cgroup_css(cgrp, ss); + + if (css) + return css; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + return init_css_set.subsys[ss->id]; +} + +/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest @@ -604,10 +633,11 @@ EXPORT_SYMBOL_GPL(of_css); * * Should be called under cgroup_[tree_]mutex. */ -#define for_each_e_css(css, ssid, cgrp) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ - ; \ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css_by_mask(cgrp, \ + cgroup_subsys[(ssid)]))) \ + ; \ else /** @@ -1006,7 +1036,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ - template[i] = cgroup_e_css(cgrp, ss); + template[i] = cgroup_e_css_by_mask(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want @@ -2836,11 +2866,12 @@ restart: } /** - * cgroup_save_control - save control masks of a subtree + * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Save ->subtree_control and ->subtree_ss_mask to the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { @@ -2850,6 +2881,7 @@ static void cgroup_save_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + dsct->old_dom_cgrp = dsct->dom_cgrp; } } @@ -2875,11 +2907,12 @@ static void cgroup_propagate_control(struct cgroup *cgrp) } /** - * cgroup_restore_control - restore control masks of a subtree + * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * - * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ - * prefixed fields for @cgrp's subtree including @cgrp itself. + * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the + * respective old_ prefixed fields for @cgrp's subtree including @cgrp + * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { @@ -2889,6 +2922,7 @@ static void cgroup_restore_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + dsct->dom_cgrp = dsct->old_dom_cgrp; } } @@ -3019,7 +3053,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) return ret; /* - * At this point, cgroup_e_css() results reflect the new csses + * At this point, cgroup_e_css_by_mask() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ @@ -3196,6 +3230,8 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); @@ -3225,12 +3261,13 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) */ cgroup_save_control(cgrp); - cgrp->dom_cgrp = dom_cgrp; + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) + if (dsct == cgrp || cgroup_is_threaded(dsct)) + dsct->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; - else - cgrp->dom_cgrp = cgrp; cgroup_finalize_control(cgrp, ret); return ret; diff --git a/kernel/cpu.c b/kernel/cpu.c index aa7fe85ad62e..0097acec1c71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -607,15 +607,15 @@ static void cpuhp_thread_fun(unsigned int cpu) bool bringup = st->bringup; enum cpuhp_state state; + if (WARN_ON_ONCE(!st->should_run)) + return; + /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (WARN_ON_ONCE(!st->should_run)) - return; - cpuhp_lock_acquire(bringup); if (st->single) { @@ -916,7 +916,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st); + if (st->state < prev_state) + undo_cpu_down(cpu, st); break; } } @@ -969,7 +970,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); - if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) { cpuhp_reset_state(st, prev_state); __cpuhp_kick_ap(st); } diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9bd54304446f..645c7a2ecde8 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -13,6 +13,9 @@ config NEED_DMA_MAP_STATE config ARCH_DMA_ADDR_T_64BIT def_bool 64BIT || PHYS_ADDR_T_64BIT +config ARCH_HAS_DMA_COHERENCE_H + bool + config HAVE_GENERIC_DMA_COHERENT bool @@ -23,22 +26,22 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU bool select NEED_DMA_MAP_STATE -config DMA_DIRECT_OPS +config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool - depends on HAS_DMA -config DMA_NONCOHERENT_OPS +config ARCH_HAS_DMA_COHERENT_TO_PFN bool - depends on HAS_DMA - select DMA_DIRECT_OPS -config DMA_NONCOHERENT_MMAP +config ARCH_HAS_DMA_MMAP_PGPROT bool - depends on DMA_NONCOHERENT_OPS + +config DMA_DIRECT_OPS + bool + depends on HAS_DMA config DMA_NONCOHERENT_CACHE_SYNC bool - depends on DMA_NONCOHERENT_OPS + depends on DMA_DIRECT_OPS config DMA_VIRT_OPS bool diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 6de44e4eb454..7d581e4eea4a 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -4,7 +4,6 @@ obj-$(CONFIG_HAS_DMA) += mapping.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o -obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 286d82329eb0..b2a87905846d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -49,7 +49,11 @@ static phys_addr_t limit_cmdline; static int __init early_cma(char *p) { - pr_debug("%s(%s)\n", __func__, p); + if (!p) { + pr_err("Config string not provided\n"); + return -EINVAL; + } + size_cmdline = memparse(p, &p); if (*p != '@') return 0; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index c007d25bee09..231ca4628062 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1312,6 +1312,22 @@ static void check_sg_segment(struct device *dev, struct scatterlist *sg) #endif } +void debug_dma_map_single(struct device *dev, const void *addr, + unsigned long len) +{ + if (unlikely(dma_debug_disabled())) + return; + + if (!virt_addr_valid(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from invalid area [addr=%p] [len=%lu]\n", + addr, len); + + if (is_vmalloc_addr(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n", + addr, len); +} +EXPORT_SYMBOL(debug_dma_map_single); + void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, bool map_single) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1c35b7b945d0..87a6bc2a96c0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,13 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 /* - * DMA operations that map physical memory directly without using an IOMMU or - * flushing caches. + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without using an IOMMU. */ +#include <linux/bootmem.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-direct.h> #include <linux/scatterlist.h> #include <linux/dma-contiguous.h> +#include <linux/dma-noncoherent.h> #include <linux/pfn.h> #include <linux/set_memory.h> @@ -41,40 +44,83 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return false; } - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); + "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", + caller, &dma_addr, size, + *dev->dma_mask, dev->bus_dma_mask); } return false; } return true; } +static inline dma_addr_t phys_to_dma_direct(struct device *dev, + phys_addr_t phys) +{ + if (force_dma_unencrypted()) + return __phys_to_dma(dev, phys); + return phys_to_dma(dev, phys); +} + +u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + + if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) + max_dma = dev->bus_dma_mask; + + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; +} + +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) + dma_mask = dev->bus_dma_mask; + + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); + + /* + * Optimistically try the zone that the physical address mask falls + * into first. If that returns memory that isn't actually addressable + * we will fallback to the next lower zone and try again. + * + * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding + * zones. + */ + if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return GFP_DMA; + if (*phys_mask <= DMA_BIT_MASK(32)) + return GFP_DMA32; + return 0; +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - dma_addr_t addr = force_dma_unencrypted() ? - __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; + return phys_to_dma_direct(dev, phys) + size - 1 <= + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); } -void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +void *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; int page_order = get_order(size); struct page *page = NULL; + u64 phys_mask; void *ret; + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; + /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - + gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { @@ -93,15 +139,14 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && + phys_mask < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { + phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -124,7 +169,7 @@ again: * NOTE: this function must never look at the dma_addr argument, because we want * to be able to use it as a helper for iommu implementations as well. */ -void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, +void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -136,14 +181,96 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, free_pages((unsigned long)cpu_addr, page_order); } +void *dma_direct_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +{ + if (!dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); +} + +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) +{ + if (!dev_is_dma_coherent(dev)) + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + else + dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); +} + +static void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + if (dev_is_dma_coherent(dev)) + return; + arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + if (dev_is_dma_coherent(dev)) + return; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); +} + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) +static void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + if (dev_is_dma_coherent(dev)) + return; + arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); + arch_sync_dma_for_cpu_all(dev); +} + +static void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + if (dev_is_dma_coherent(dev)) + return; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); + arch_sync_dma_for_cpu_all(dev); +} + +static void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); +} + +static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir); +} +#endif + dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); if (!check_addr(dev, dma_addr, size, __func__)) return DIRECT_MAPPING_ERROR; + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_device(dev, dma_addr, size, dir); return dma_addr; } @@ -162,31 +289,29 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, sg_dma_len(sg) = sg->length; } + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_sg_for_device(dev, sgl, nents, dir); return nents; } +/* + * Because 32-bit DMA masks are so common we expect every architecture to be + * able to satisfy them - either by not supporting more physical memory, or by + * providing a ZONE_DMA32. If neither is the case, the architecture needs to + * use an IOMMU instead of the direct mapping. + */ int dma_direct_supported(struct device *dev, u64 mask) { -#ifdef CONFIG_ZONE_DMA - if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - return 0; -#else - /* - * Because 32-bit DMA masks are so common we expect every architecture - * to be able to satisfy them - either by not supporting more physical - * memory, or by providing a ZONE_DMA32. If neither is the case, the - * architecture needs to use an IOMMU instead of the direct mapping. - */ - if (mask < DMA_BIT_MASK(32)) - return 0; -#endif - /* - * Upstream PCI/PCIe bridges or SoC interconnects may not carry - * as many DMA address bits as the device itself supports. - */ - if (dev->bus_dma_mask && mask > dev->bus_dma_mask) - return 0; - return 1; + u64 min_mask; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + else + min_mask = DMA_BIT_MASK(32); + + min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + + return mask >= phys_to_dma(dev, min_mask); } int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) @@ -199,7 +324,20 @@ const struct dma_map_ops dma_direct_ops = { .free = dma_direct_free, .map_page = dma_direct_map_page, .map_sg = dma_direct_map_sg, +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + .sync_single_for_device = dma_direct_sync_single_for_device, + .sync_sg_for_device = dma_direct_sync_sg_for_device, +#endif +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) + .sync_single_for_cpu = dma_direct_sync_single_for_cpu, + .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, + .unmap_page = dma_direct_unmap_page, + .unmap_sg = dma_direct_unmap_sg, +#endif + .get_required_mask = dma_direct_get_required_mask, .dma_supported = dma_direct_supported, .mapping_error = dma_direct_mapping_error, + .cache_sync = arch_dma_cache_sync, }; EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d2a92ddaac4d..58dec7a92b7b 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,7 +7,7 @@ */ #include <linux/acpi.h> -#include <linux/dma-mapping.h> +#include <linux/dma-noncoherent.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/of_device.h> @@ -202,17 +202,26 @@ EXPORT_SYMBOL(dmam_release_declared_memory); * Create scatter-list for the already allocated DMA buffer. */ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size) + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) { - struct page *page = virt_to_page(cpu_addr); + struct page *page; int ret; - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (unlikely(ret)) - return ret; + if (!dev_is_dma_coherent(dev)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) + return -ENXIO; - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return 0; + page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr, + dma_addr)); + } else { + page = virt_to_page(cpu_addr); + } + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; } EXPORT_SYMBOL(dma_common_get_sgtable); @@ -220,27 +229,37 @@ EXPORT_SYMBOL(dma_common_get_sgtable); * Create userspace mapping for the DMA-coherent memory. */ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) { - int ret = -ENXIO; #ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; + unsigned long pfn; + int ret = -ENXIO; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; - if (off < count && user_count <= (count - off)) - ret = remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + off, - user_count << PAGE_SHIFT, - vma->vm_page_prot); -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ + if (off >= count || user_count > count - off) + return -ENXIO; - return ret; + if (!dev_is_dma_coherent(dev)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) + return -ENXIO; + pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); + } else { + pfn = page_to_pfn(virt_to_page(cpu_addr)); + } + + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +#else + return -ENXIO; +#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ } EXPORT_SYMBOL(dma_common_mmap); @@ -327,19 +346,3 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) vunmap(cpu_addr); } #endif - -/* - * enables DMA API use for a device - */ -int dma_configure(struct device *dev) -{ - if (dev->bus->dma_configure) - return dev->bus->dma_configure(dev); - return 0; -} - -void dma_deconfigure(struct device *dev) -{ - of_dma_deconfigure(dev); - acpi_dma_deconfigure(dev); -} diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c deleted file mode 100644 index 031fe235d958..000000000000 --- a/kernel/dma/noncoherent.c +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 Christoph Hellwig. - * - * DMA operations that map physical memory directly without providing cache - * coherence. - */ -#include <linux/export.h> -#include <linux/mm.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> -#include <linux/scatterlist.h> - -static void dma_noncoherent_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); -} - -static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t addr; - - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, page_to_phys(page) + offset, - size, dir); - return addr; -} - -static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); - if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); - return nents; -} - -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) -static void dma_noncoherent_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); - arch_sync_dma_for_cpu_all(dev); -} - -static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); - arch_sync_dma_for_cpu_all(dev); -} - -static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); -} - -static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); -} -#endif - -const struct dma_map_ops dma_noncoherent_ops = { - .alloc = arch_dma_alloc, - .free = arch_dma_free, - .mmap = arch_dma_mmap, - .sync_single_for_device = dma_noncoherent_sync_single_for_device, - .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, - .map_page = dma_noncoherent_map_page, - .map_sg = dma_noncoherent_map_sg, -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ - defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) - .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, - .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, - .unmap_page = dma_noncoherent_unmap_page, - .unmap_sg = dma_noncoherent_unmap_sg, -#endif - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, - .cache_sync = arch_dma_cache_sync, -}; -EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2a62b96600ad..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); - if (err) { - if (!bp->attr.disabled) - _perf_event_enable(bp); - return err; - } - - if (!attr->disabled) + if (!bp->attr.disabled) _perf_event_enable(bp); - return 0; + + return err; } static int perf_event_modify_attr(struct perf_event *event, @@ -3940,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event->oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise @@ -5948,6 +5949,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; + mm_segment_t fs; /* * We dump: @@ -5965,7 +5967,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); + fs = get_fs(); + set_fs(USER_DS); rem = __output_copy_user(handle, (void *) sp, dump_size); + set_fs(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -8309,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) @@ -9426,9 +9433,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9684,12 +9689,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9701,13 +9702,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b3814fce5ecb..d6b56180827c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { + int err; + /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -520,15 +522,12 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - if (!attr->disabled) { - int err = modify_user_hw_breakpoint_check(bp, attr, false); + err = modify_user_hw_breakpoint_check(bp, attr, false); - if (err) - return err; + if (!bp->attr.disabled) perf_event_enable(bp); - bp->attr.disabled = 0; - } - return 0; + + return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/fork.c b/kernel/fork.c index d896e9ca38b0..f0b58479534f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -550,8 +550,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01ebdf1f9f40..2e62503bea0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { - WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); + WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e406c5fdb41e..dd13f865ad40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,7 +55,6 @@ #include "lockdep_internals.h" -#include <trace/events/preemptirq.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 1a81a1257b3f..3f8a35104285 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock, /* * wake_up_process() paired with set_current_state() * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_lock_check_stamp() or has a + * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ if (owner != current) @@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, current); lock_contended(&lock->dep_map, ip); diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5b915b370d5a..65a3b7e55b9f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; ww_acquire_init(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); @@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) @@ -324,7 +326,7 @@ static int __test_cycle(unsigned int nthreads) if (!cycle->result) continue; - pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", n, nthreads, cycle->result); ret = -EINVAL; break; diff --git a/kernel/pid.c b/kernel/pid.c index de1cfc4f75a2..cdf63e53a014 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) idr_preload_end(); if (nr < 0) { - retval = nr; + retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5342f6fc022e..0bd595a0b610 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -63,6 +63,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +bool pm_suspend_via_s2idle(void) +{ + return mem_sleep_current == PM_SUSPEND_TO_IDLE; +} +EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); + void s2idle_set_ops(const struct platform_s2idle_ops *ops) { lock_system_sleep(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fd6f8ed28e01..9bf5404397e0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,7 +351,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1881,9 +1880,6 @@ int vprintk_store(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (suppress_message_printing(level)) - lflags |= LOG_NOCONS; - return log_output(facility, level, lflags, dict, dictlen, text, text_len); } @@ -2032,6 +2028,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2368,10 +2365,11 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + if (suppress_message_printing(msg->level)) { /* - * Skip record if !ignore_loglevel, and - * record has level above the console loglevel. + * Skip record we have buffered and already printed + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index a0a74c533e4b..0913b4d385de 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -306,12 +306,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) return printk_safe_log_store(s, fmt, args); } -void printk_nmi_enter(void) +void notrace printk_nmi_enter(void) { this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); } -void printk_nmi_exit(void) +void notrace printk_nmi_exit(void) { this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..ad97f3ba5ec5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ out: return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 60caf1fb94e0..6383aa6a60ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - static_key_disable(&sched_feat_keys[i]); + static_key_disable_cpuslocked(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - static_key_enable(&sched_feat_keys[i]); + static_key_enable_cpuslocked(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); + cpus_read_lock(); inode_lock(inode); ret = sched_feat_set(cmp); inode_unlock(inode); + cpus_read_unlock(); if (ret < 0) return ret; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..908c9cdae2f0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; @@ -1514,6 +1524,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1553,6 +1578,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, } /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + +/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1625,16 +1660,22 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + + /* * In the overloaded case, try and keep the load balanced. */ load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -3362,6 +3438,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach + * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. @@ -3924,7 +4001,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- ie. we'll be penalized. */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); } @@ -4399,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) /* * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us + * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is + * not running add to the tail so that later runqueues don't get starved. */ - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_b->distribute_running) + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + else + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); /* * If we're the first throttled task, make sure the bandwidth @@ -4545,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * in us over-using our runtime if it is all used during this loop, but * only by limited amounts in that extreme case. */ - while (throttled && cfs_b->runtime > 0) { + while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; + cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); raw_spin_lock(&cfs_b->lock); + cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4663,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); + if (cfs_b->distribute_running) { + raw_spin_unlock(&cfs_b->lock); + return; + } + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock(&cfs_b->lock); return; @@ -4672,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) runtime = cfs_b->runtime; expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); if (!runtime) @@ -4682,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4790,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; + cfs_b->distribute_running = 0; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -6274,7 +6367,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6327,6 +6420,8 @@ static void migrate_task_rq_fair(struct task_struct *p) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -7263,6 +7358,7 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; + const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7299,8 +7395,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7365,13 +7463,16 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; + const struct sched_class *curr_class; struct rq_flags rf; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -7482,10 +7583,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -7507,7 +7608,7 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(cpu); + unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); @@ -8269,7 +8370,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); - return sds.busiest; + return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; @@ -9638,7 +9739,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..9683f458aec7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -346,6 +346,8 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; + + bool distribute_running; #endif }; @@ -783,6 +785,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; @@ -1523,7 +1526,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 56a0fed30c0a..505a41c42b96 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (sched_domains_numa_levels <= 1) { + if (sched_domains_numa_levels <= 2) { sched_numa_topology_type = NUMA_DIRECT; return; } @@ -1380,9 +1380,6 @@ void sched_init_numa(void) break; } - if (!level) - return; - /* * 'level' contains the number of unique distances * diff --git a/kernel/signal.c b/kernel/signal.c index 5843c541fda9..e4aad0e90882 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3460,7 +3460,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) } static int -do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) +do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, + size_t min_ss_size) { struct task_struct *t = current; @@ -3490,7 +3491,7 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) ss_size = 0; ss_sp = NULL; } else { - if (unlikely(ss_size < MINSIGSTKSZ)) + if (unlikely(ss_size < min_ss_size)) return -ENOMEM; } @@ -3508,7 +3509,8 @@ SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, - current_user_stack_pointer()); + current_user_stack_pointer(), + MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; @@ -3519,7 +3521,8 @@ int restore_altstack(const stack_t __user *uss) stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; - (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); + (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), + MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } @@ -3553,7 +3556,8 @@ static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, - compat_user_stack_pointer()); + compat_user_stack_pointer(), + COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); diff --git a/kernel/sys.c b/kernel/sys.c index cf5c67533ff1..123bd73046ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -71,9 +71,6 @@ #include <asm/io.h> #include <asm/unistd.h> -/* Hardening for Spectre-v1 */ -#include <linux/nospec.h> - #include "uid16.h" #ifndef SET_UNALIGN_CTL diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(&watchdog_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * We cannot directly run clocksource_watchdog_kthread() here, because + * clocksource_select() calls timekeeping_notify() which uses + * stop_machine(). One cannot use stop_machine() from a workqueue() due + * lock inversions wrt CPU hotplug. + * + * Also, we only ever run this work once or twice during the lifetime + * of the kernel, so there is no point in creating a more permanent + * kthread for this. + * + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* - * If the clocksource is registered clocksource_watchdog_work() will + * If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(&cs->list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs: clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(&clocksource_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(&clocksource_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int __clocksource_watchdog_work(void) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } @@ -810,7 +830,7 @@ static int __init clocksource_done_booting(void) /* * Run the watchdog first to eliminate unstable clock sources */ - __clocksource_watchdog_work(); + __clocksource_watchdog_kthread(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2868d85f1fb1..fac0ddf8a8e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return NULL; - if (!bio->bi_css) + if (!bio->bi_blkg) return NULL; - return cgroup_get_kernfs_id(bio->bi_css->cgroup); + return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else static union kernfs_node_id * diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index f704390db9fc..d8765c952fab 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -5,12 +5,12 @@ * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org> */ +#include <linux/trace_clock.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/ktime.h> #include <linux/module.h> #include <linux/printk.h> #include <linux/string.h> @@ -25,13 +25,13 @@ MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default ir static void busy_wait(ulong time) { - ktime_t start, end; - start = ktime_get(); + u64 start, end; + start = trace_clock_local(); do { - end = ktime_get(); + end = trace_clock_local(); if (kthread_should_stop()) break; - } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); + } while ((end - start) < (time * 1000)); } static int preemptirq_delay_run(void *data) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d92d4a982fd..65bd4616220d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + cond_resched(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, &tmp_iter_page); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 85f6b01431c7..d239004aaf29 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -738,16 +738,30 @@ static void free_synth_field(struct synth_field *field) kfree(field); } -static struct synth_field *parse_synth_field(char *field_type, - char *field_name) +static struct synth_field *parse_synth_field(int argc, char **argv, + int *consumed) { struct synth_field *field; + const char *prefix = NULL; + char *field_type = argv[0], *field_name; int len, ret = 0; char *array; if (field_type[0] == ';') field_type++; + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + len = strlen(field_name); if (field_name[len - 1] == ';') field_name[len - 1] = '\0'; @@ -760,11 +774,15 @@ static struct synth_field *parse_synth_field(char *field_type, array = strchr(field_name, '['); if (array) len += strlen(array); + if (prefix) + len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); if (!field->type) { ret = -ENOMEM; goto free; } + if (prefix) + strcat(field->type, prefix); strcat(field->type, field_type); if (array) { strcat(field->type, array); @@ -1009,7 +1027,7 @@ static int create_synth_event(int argc, char **argv) struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; bool delete_event = false; - int i, n_fields = 0, ret = 0; + int i, consumed = 0, n_fields = 0, ret = 0; char *name; mutex_lock(&synth_event_mutex); @@ -1061,16 +1079,16 @@ static int create_synth_event(int argc, char **argv) goto err; } - field = parse_synth_field(argv[i], argv[i + 1]); + field = parse_synth_field(argc - i, &argv[i], &consumed); if (IS_ERR(field)) { ret = PTR_ERR(field); goto err; } - fields[n_fields] = field; - i++; n_fields++; + fields[n_fields++] = field; + i += consumed - 1; } - if (i < argc) { + if (i < argc && strcmp(argv[i], ";") != 0) { ret = -EINVAL; goto err; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index bf2c06ef9afc..a3be42304485 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -28,8 +28,8 @@ #include <linux/sched/task.h> #include <linux/static_key.h> -extern struct tracepoint * const __start___tracepoints_ptrs[]; -extern struct tracepoint * const __stop___tracepoints_ptrs[]; +extern tracepoint_ptr_t __start___tracepoints_ptrs[]; +extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; DEFINE_SRCU(tracepoint_srcu); EXPORT_SYMBOL_GPL(tracepoint_srcu); @@ -371,25 +371,17 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, +static void for_each_tracepoint_range( + tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { + tracepoint_ptr_t *iter; + if (!begin) return; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { - const int *iter; - - for (iter = (const int *)begin; iter < (const int *)end; iter++) - fct(offset_to_ptr(iter), priv); - } else { - struct tracepoint * const *iter; - - for (iter = begin; iter < end; iter++) - fct(*iter, priv); - } + for (iter = begin; iter < end; iter++) + fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES |