diff options
Diffstat (limited to 'kernel')
47 files changed, 678 insertions, 455 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 367eaf2c78b7..0ebbbe37a60f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -347,12 +347,17 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct dentry *d = kern_path_locked(watch->path, parent); + struct dentry *d; + + d = kern_path_locked_negative(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; + + if (d_is_positive(d)) { + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + } inode_unlock(d_backing_inode(parent->dentry)); dput(d); @@ -418,11 +423,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret && ret != -ENOENT) { + if (ret) { audit_put_watch(watch); return ret; } - ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 24a26b4bb0b8..324c47ab377a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6391,8 +6391,8 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6672,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5a5adc66b8e2..92b606d60020 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2189,7 +2189,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 2fdf3c978db1..774e5a538811 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -89,5 +89,6 @@ static void __exit fini(void) } late_initcall(load); module_exit(fini); +MODULE_IMPORT_NS("BPF_INTERNAL"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index d869f51ea93a..9a5f94371e50 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -9,13 +9,14 @@ #include <linux/slab.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" +#include <asm/rqspinlock.h> #define QUEUE_STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; - raw_spinlock_t lock; + rqspinlock_t lock; u32 head, tail; u32 size; /* max_entries + 1 */ @@ -78,7 +79,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) qs->size = size; - raw_spin_lock_init(&qs->lock); + raw_res_spin_lock_init(&qs->lock); return &qs->map; } @@ -98,12 +99,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -120,7 +117,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) } out: - raw_spin_unlock_irqrestore(&qs->lock, flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -133,12 +130,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -157,7 +150,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) qs->head = index; out: - raw_spin_unlock_irqrestore(&qs->lock, flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -203,12 +196,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, irq_flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; if (queue_stack_map_is_full(qs)) { if (!replace) { @@ -227,7 +216,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, qs->head = 0; out: - raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags); return err; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 1499d8caa9a3..719d73299397 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -11,6 +11,7 @@ #include <linux/kmemleak.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -29,7 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - raw_spinlock_t spinlock ____cacheline_aligned_in_smp; + rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - raw_spin_lock_init(&rb->spinlock); + raw_res_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -416,12 +417,8 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) - return NULL; - } else { - raw_spin_lock_irqsave(&rb->spinlock, flags); - } + if (raw_res_spin_lock_irqsave(&rb->spinlock, flags)) + return NULL; pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; @@ -446,7 +443,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -458,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index b896c4a75a5c..338305c8852c 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -253,7 +253,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask, }) #else #define RES_CHECK_TIMEOUT(ts, ret, mask) \ - ({ (ret) = check_timeout(&(ts)); }) + ({ (ret) = check_timeout((lock), (mask), &(ts)); }) #endif /* diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9794446bc8c6..64c3393e8270 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1583,7 +1583,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -3364,7 +3364,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) bpf_link_inc(link); return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -6020,7 +6020,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3caf2cd86e65..63e5b90da1f3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,7 +90,7 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif @@ -2353,9 +2353,37 @@ static struct file_system_type cgroup2_fs_type = { }; #ifdef CONFIG_CPUSETS_V1 +enum cpuset_param { + Opt_cpuset_v2_mode, +}; + +static const struct fs_parameter_spec cpuset_fs_parameters[] = { + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + {} +}; + +static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, cpuset_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + return 0; + } + return -EINVAL; +} + static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, + .parse_param = cpuset_parse_param, }; /* @@ -2392,6 +2420,7 @@ static int cpuset_init_fs_context(struct fs_context *fc) static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, + .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 306b60430091..24b70ea3e6ce 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1116,9 +1116,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). */ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 3b2bdca9f1d4..77c8d9487a9a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - if (!rmem->priv) { - struct dma_coherent_mem *mem; + struct dma_coherent_mem *mem = rmem->priv; + if (!mem) { mem = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, true); if (IS_ERR(mem)) return PTR_ERR(mem); rmem->priv = mem; } - dma_assign_coherent_memory(dev, rmem->priv); + + /* Warn if the device potentially can't use the reserved memory */ + if (mem->device_base + rmem->size - 1 > + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit)) + dev_warn(dev, "reserved memory is beyond device's set DMA address range\n"); + + dma_assign_coherent_memory(dev, mem); return 0; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 055da410ac71..8df0dfaaca18 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index cda127027e48..051a32988040 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -910,6 +910,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +static bool __dma_addressing_limited(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops) || use_dma_iommu(dev)) + return false; + return !dma_direct_all_ram_mapped(dev); +} + /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check @@ -920,15 +933,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask); */ bool dma_addressing_limited(struct device *dev) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev)) - return true; - - if (unlikely(ops) || use_dma_iommu(dev)) + if (!__dma_addressing_limited(dev)) return false; - return !dma_direct_all_ram_mapped(dev); + + dev_dbg(dev, "device is DMA addressing limited\n"); + return true; } EXPORT_SYMBOL_GPL(dma_addressing_limited); diff --git a/kernel/events/core.c b/kernel/events/core.c index 128db74e9eab..95e703891b24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3943,7 +3943,7 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) - event->pending_kill = POLL_HUP; + event->pending_kill = POLL_ERR; perf_event_wakeup(event); } else { @@ -5518,30 +5518,6 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_free_addr_filters(struct perf_event *event); -static void perf_pending_task_sync(struct perf_event *event) -{ - struct callback_head *head = &event->pending_task; - - if (!event->pending_work) - return; - /* - * If the task is queued to the current task's queue, we - * obviously can't wait for it to complete. Simply cancel it. - */ - if (task_work_cancel(current, head)) { - event->pending_work = 0; - local_dec(&event->ctx->nr_no_switch_fast); - return; - } - - /* - * All accesses related to the event are within the same RCU section in - * perf_pending_task(). The RCU grace period before the event is freed - * will make sure all those accesses are complete by then. - */ - rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); -} - /* vs perf_event_alloc() error */ static void __free_event(struct perf_event *event) { @@ -5599,7 +5575,6 @@ static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); unaccount_event(event); @@ -5692,10 +5667,17 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { + struct perf_event *parent; + if (!atomic_long_dec_and_test(&event->refcount)) return; + parent = event->parent; _free_event(event); + + /* Matches the refcount bump in inherit_event() */ + if (parent) + put_event(parent); } /* @@ -5779,11 +5761,6 @@ again: if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); list_move(&child->child_list, &free_list); - /* - * This matches the refcount bump in inherit_event(); - * this can't be the last reference. - */ - put_event(event); } else { var = &ctx->refcount; } @@ -5809,7 +5786,8 @@ again: void *var = &child->ctx->refcount; list_del(&child->child_list); - free_event(child); + /* Last reference unless ->pending_task work is pending */ + put_event(child); /* * Wake any perf_event_free_task() waiting for this event to be @@ -5820,7 +5798,11 @@ again: } no_ctx: - put_event(event); /* Must be the 'last' reference */ + /* + * Last reference unless ->pending_task work is pending on this event + * or any of its children. + */ + put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -6093,7 +6075,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && event->attr.pinned)) - return events; + return EPOLLERR; /* * Pin the event->rb by taking event->mmap_mutex; otherwise @@ -7236,12 +7218,6 @@ static void perf_pending_task(struct callback_head *head) int rctx; /* - * All accesses to the event must belong to the same implicit RCU read-side - * critical section as the ->pending_work reset. See comment in - * perf_pending_task_sync(). - */ - rcu_read_lock(); - /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ @@ -7251,9 +7227,8 @@ static void perf_pending_task(struct callback_head *head) event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); - rcuwait_wake_up(&event->pending_work_wait); } - rcu_read_unlock(); + put_event(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); @@ -10248,6 +10223,7 @@ static int __perf_event_overflow(struct perf_event *event, !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -12610,7 +12586,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); - rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -13747,8 +13722,7 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Kick perf_poll() for is_event_hup(); */ perf_event_wakeup(parent_event); - free_event(event); - put_event(parent_event); + put_event(event); return; } @@ -13872,13 +13846,11 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - put_event(parent); - raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); - free_event(event); + put_event(event); } /* @@ -14016,6 +13988,9 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); @@ -14037,8 +14012,6 @@ inherit_event(struct perf_event *parent_event, return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, @@ -14059,7 +14032,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 615b4e6d22c7..8d783b5882b6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1956,6 +1956,9 @@ static void free_ret_instance(struct uprobe_task *utask, * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. + * Admittedly, this is a rather simple use of seqcount, but it nicely + * abstracts away all the necessary memory barriers, so we use + * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ @@ -2016,12 +2019,20 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); - write_seqcount_begin(&utask->ri_seqcount); + /* + * See free_ret_instance() for notes on seqcount use. + * We also employ raw API variants to avoid lockdep false-positive + * warning complaining about enabled preemption. The timer can only be + * invoked once for a uprobe_task. Therefore there can only be one + * writer. The reader does not require an even sequence count to make + * progress, so it is OK to remain preemptible on PREEMPT_RT. + */ + raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); - write_seqcount_end(&utask->ri_seqcount); + raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b..168681fc4b25 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -498,10 +498,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); - /* track_pfn_copy() will later take care of copying internal state. */ - if (unlikely(new->vm_flags & VM_PFNMAP)) - untrack_pfn_clear(new); - return new; } @@ -672,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; + + /* track_pfn_copy() will later take care of copying internal state. */ + if (unlikely(tmp->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(tmp); + retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5c8d43cdb0a3..c05ba7ca00fa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -761,7 +761,7 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, struct irq_data *irqd, int ind) { - struct msi_desc *desc = irq_data_get_msi_desc(irqd); + struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL; if (!desc) return; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index d7762ef5949a..39278737bb68 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -192,6 +192,11 @@ config GENDWARFKSYMS depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT # Requires ELF object files. depends on !LTO + # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on + # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop + # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder: + # Verify 0 address DWARF variables are in ELF section"). + depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129 help Calculate symbol versions from DWARF debugging information using gendwarfksyms. Requires DEBUG_INFO to be enabled. diff --git a/kernel/padata.c b/kernel/padata.c index b3d4eacc4f5d..7eee94166357 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } diff --git a/kernel/params.c b/kernel/params.c index 2509f216c9f3..b92d64161b75 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -760,38 +760,35 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static struct module_kobject * __init locate_module_kobject(const char *name) +struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); - if (kobj) { - mk = to_module_kobject(kobj); - } else { - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, - "%s", name); -#ifdef CONFIG_MODULES - if (!err) - err = sysfs_create_file(&mk->kobj, &module_uevent.attr); -#endif - if (err) { - kobject_put(&mk->kobj); - pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", - name, err); - return NULL; - } + if (kobj) + return to_module_kobject(kobj); - /* So that we hold reference in both cases. */ - kobject_get(&mk->kobj); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + if (!mk) + return NULL; + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); + if (IS_ENABLED(CONFIG_MODULES) && !err) + err = sysfs_create_file(&mk->kobj, &module_uevent.attr); + if (err) { + kobject_put(&mk->kobj); + pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", + name, err); + return NULL; } + /* So that we hold reference in both cases. */ + kobject_get(&mk->kobj); + return mk; } @@ -802,7 +799,7 @@ static void __init kernel_add_sysfs_param(const char *name, struct module_kobject *mk; int err; - mk = locate_module_kobject(name); + mk = lookup_or_create_module_kobject(name); if (!mk) return; @@ -873,7 +870,7 @@ static void __init version_sysfs_builtin(void) int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { - mk = locate_module_kobject(vattr->module_name); + mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); @@ -946,7 +943,9 @@ struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } const struct kobj_type module_ktype = { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1a19d69b91ed..816f07f9d30f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -871,7 +897,16 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } struct cpufreq_governor schedutil_gov = { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 66bcd40a28ca..f5133249fd4d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -163,7 +163,7 @@ enum scx_ops_flags { /* * CPU cgroup support flags */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | @@ -1118,8 +1118,38 @@ static void scx_kf_disallow(u32 mask) current->scx.kf_mask &= ~mask; } -#define SCX_CALL_OP(mask, op, args...) \ +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +static DEFINE_PER_CPU(struct rq *, locked_rq); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. + */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(locked_rq, rq); +} + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(locked_rq); +} + +#define SCX_CALL_OP(mask, op, rq, args...) \ do { \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ scx_ops.op(args); \ @@ -1127,11 +1157,14 @@ do { \ } else { \ scx_ops.op(args); \ } \ + update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(mask, op, args...) \ +#define SCX_CALL_OP_RET(mask, op, rq, args...) \ ({ \ __typeof__(scx_ops.op(args)) __ret; \ + \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ __ret = scx_ops.op(args); \ @@ -1139,6 +1172,7 @@ do { \ } else { \ __ret = scx_ops.op(args); \ } \ + update_locked_rq(NULL); \ __ret; \ }) @@ -1153,31 +1187,31 @@ do { \ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on * the specific task. */ -#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \ do { \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP(mask, op, task, ##args); \ + SCX_CALL_OP(mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \ ({ \ __typeof__(scx_ops.op(task, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \ ({ \ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -2172,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2269,7 +2303,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags add_nr_running(rq, 1); if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -2283,7 +2317,7 @@ out: __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } -static void ops_dequeue(struct task_struct *p, u64 deq_flags) +static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { unsigned long opss; @@ -2304,7 +2338,7 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) BUG(); case SCX_OPSS_QUEUED: if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -2337,7 +2371,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags return true; } - ops_dequeue(p, deq_flags); + ops_dequeue(rq, p, deq_flags); /* * A currently running task which is going off @rq first gets dequeued @@ -2353,11 +2387,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags */ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false); } if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -2377,7 +2411,7 @@ static void yield_task_scx(struct rq *rq) struct task_struct *p = rq->curr; if (SCX_HAS_OP(yield)) - SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL); else p->scx.slice = 0; } @@ -2387,7 +2421,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) struct task_struct *from = rq->curr; if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to); else return false; } @@ -2945,7 +2979,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * emitted in switch_class(). */ if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2990,7 +3024,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq), prev_on_scx ? prev : NULL); flush_dispatch_buf(rq); @@ -3104,7 +3138,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler. */ - ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } @@ -3112,7 +3146,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p); clr_task_runnable(p, true); @@ -3193,8 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) .task = next, }; - SCX_CALL_OP(SCX_KF_CPU_RELEASE, - cpu_release, cpu_of(rq), &args); + SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -3207,7 +3240,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -3348,7 +3381,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, * verifier. */ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL, (struct task_struct *)a, (struct task_struct *)b); else @@ -3385,7 +3418,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag *ddsp_taskp = p; cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, p, prev_cpu, wake_flags); + select_cpu, NULL, p, prev_cpu, wake_flags); p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) @@ -3430,8 +3463,8 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) @@ -3444,9 +3477,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); else scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, "cpu %d going %s, exiting scheduler", cpu, @@ -3550,7 +3583,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr); } if (!curr->scx.slice) @@ -3627,7 +3660,7 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool .fork = fork, }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args); if (unlikely(ret)) { ret = ops_sanitize_err("init_task", ret); return ret; @@ -3668,9 +3701,10 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool static void scx_ops_enable_task(struct task_struct *p) { + struct rq *rq = task_rq(p); u32 weight; - lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_rq_held(rq); /* * Set the weight before calling ops.enable() so that the scheduler @@ -3684,20 +3718,22 @@ static void scx_ops_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); if (SCX_HAS_OP(enable)) - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p); scx_set_task_state(p, SCX_TASK_ENABLED); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); } static void scx_ops_disable_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -3726,7 +3762,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3835,7 +3871,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) @@ -3851,8 +3887,8 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) * different scheduler class. Keep the BPF scheduler up-to-date. */ if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -3899,35 +3935,6 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; -static bool cgroup_warned_missing_weight; -static bool cgroup_warned_missing_idle; - -static void scx_cgroup_warn_missing_weight(struct task_group *tg) -{ - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_weight) - return; - - if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", - scx_ops.name); - cgroup_warned_missing_weight = true; -} - -static void scx_cgroup_warn_missing_idle(struct task_group *tg) -{ - if (!scx_cgroup_enabled || cgroup_warned_missing_idle) - return; - - if (!tg->idle) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", - scx_ops.name); - cgroup_warned_missing_idle = true; -} int scx_tg_online(struct task_group *tg) { @@ -3937,14 +3944,12 @@ int scx_tg_online(struct task_group *tg) percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_weight(tg); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(cgroup_init)) { struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); if (ret) ret = ops_sanitize_err("cgroup_init", ret); @@ -3966,7 +3971,7 @@ void scx_tg_offline(struct task_group *tg) percpu_down_read(&scx_cgroup_rwsem); if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); @@ -3999,7 +4004,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) continue; if (SCX_HAS_OP(cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -4013,8 +4018,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) err: cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -4032,8 +4037,8 @@ void scx_cgroup_move_task(struct task_struct *p) * cgrp_moving_from set. */ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, - p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; } @@ -4052,8 +4057,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } out_unlock: @@ -4066,7 +4071,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) if (scx_cgroup_enabled && tg->scx_weight != weight) { if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); tg->scx_weight = weight; } @@ -4076,9 +4081,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_idle(tg); - percpu_up_read(&scx_cgroup_rwsem); + /* TODO: Implement ops->cgroup_set_idle() */ } static void scx_cgroup_lock(void) @@ -4257,7 +4260,7 @@ static void scx_cgroup_exit(void) continue; rcu_read_unlock(); - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); rcu_read_lock(); css_put(css); @@ -4272,9 +4275,6 @@ static int scx_cgroup_init(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); - cgroup_warned_missing_weight = false; - cgroup_warned_missing_idle = false; - /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. @@ -4284,9 +4284,6 @@ static int scx_cgroup_init(void) struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - scx_cgroup_warn_missing_weight(tg); - scx_cgroup_warn_missing_idle(tg); - if ((tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; @@ -4300,7 +4297,7 @@ static int scx_cgroup_init(void) continue; rcu_read_unlock(); - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { css_put(css); @@ -4623,7 +4620,7 @@ unlock: static void free_exit_info(struct scx_exit_info *ei) { - kfree(ei->dump); + kvfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -4639,7 +4636,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); - ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); @@ -4797,7 +4794,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } if (scx_ops.exit) - SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei); cancel_delayed_work_sync(&scx_watchdog_work); @@ -5004,7 +5001,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, if (SCX_HAS_OP(dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -5051,7 +5048,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (SCX_HAS_OP(dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx); ops_dump_exit(); } @@ -5108,7 +5105,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) used = seq_buf_used(&ns); if (SCX_HAS_OP(dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle); ops_dump_exit(); } @@ -5252,6 +5249,9 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) + pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + return 0; } @@ -5364,7 +5364,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_idle_enable(ops); if (scx_ops.init) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL); if (ret) { ret = ops_sanitize_err("init", ret); cpus_read_unlock(); @@ -6827,6 +6827,12 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. + */ + kit->dsq = NULL; + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; @@ -7113,13 +7119,32 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } if (ops_cpu_valid(cpu, NULL)) { - struct rq *rq = cpu_rq(cpu); + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_ops_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } } @@ -7350,12 +7375,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index cb343ca889e0..e67a19a071c1 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -674,7 +674,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) * managed by put_prev_task_idle()/set_next_task_idle(). */ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); /* * Update the idle masks: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e43993a4e580..0fb9bf995a47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7081,9 +7081,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) { @@ -7093,6 +7090,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 517ee2590a29..30899a8cc52c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -366,7 +366,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..9a3859443c04 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -509,6 +509,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +529,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +566,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. + */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..a009c91f7b05 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. + */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /** @@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; + u64 offset, orig_offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); + /* + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; @@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 01c2ab1e8971..32ef27c71b57 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *tk) /* CLOCK_REALTIME_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 95c6e3473a76..ba7ff14f5339 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -454,7 +454,8 @@ static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head * struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist) { + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { if (!within_module(node->addr, mod)) continue; if (delete_fprobe_node(node)) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1a48aedb5255..6981830c3128 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1297,6 +1297,8 @@ void ftrace_free_filter(struct ftrace_ops *ops) return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3256,6 +3258,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, } /* + * Remove functions from @hash that are in @notrace_hash + */ +static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tmp; + int size; + int i; + + /* If the notrace hash is empty, there's nothing to do */ + if (ftrace_hash_empty(notrace_hash)) + return; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + if (!__ftrace_lookup_ip(notrace_hash, entry->ip)) + continue; + remove_hash_entry(hash, entry); + kfree(entry); + } + } +} + +/* * Add to @hash only those that are in both @new_hash1 and @new_hash2 * * The notrace_hash updates uses just the intersect_hash() function @@ -3295,67 +3322,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has return 0; } -/* Return a new hash that has a union of all @ops->filter_hash entries */ -static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - if (ops->func_hash->filter_hash) - size_bits = ops->func_hash->filter_hash->size_bits; - else - size_bits = FTRACE_HASH_DEFAULT_BITS; - - list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - /* Can't return NULL as that means this failed */ - return new_hash ? : EMPTY_HASH; -} - -/* Make @ops trace evenything except what all its subops do not trace */ -static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - list_for_each_entry(subops, &ops->subop_list, list) { - struct ftrace_hash *next_hash; - - if (!new_hash) { - size_bits = subops->func_hash->notrace_hash->size_bits; - new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); - if (!new_hash) - return NULL; - continue; - } - size_bits = new_hash->size_bits; - next_hash = new_hash; - new_hash = alloc_ftrace_hash(size_bits); - ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); - free_ftrace_hash(next_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) { struct ftrace_func_entry *entry; @@ -3427,6 +3393,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ return 0; } +static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *func_hash) +{ + /* If the filter hash is not empty, simply remove the nohash from it */ + if (!ftrace_hash_empty(func_hash->filter_hash)) { + *filter_hash = copy_hash(func_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + remove_hash(*filter_hash, func_hash->notrace_hash); + *notrace_hash = EMPTY_HASH; + + } else { + *notrace_hash = copy_hash(func_hash->notrace_hash); + if (!*notrace_hash) + return -ENOMEM; + *filter_hash = EMPTY_HASH; + } + return 0; +} + +static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash) +{ + int size_bits; + int ret; + + /* If the subops trace all functions so must the main ops */ + if (ftrace_hash_empty(ops_hash->filter_hash) || + ftrace_hash_empty(subops_hash->filter_hash)) { + *filter_hash = EMPTY_HASH; + } else { + /* + * The main ops filter hash is not empty, so its + * notrace_hash had better be, as the notrace hash + * is only used for empty main filter hashes. + */ + WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash)); + + size_bits = max(ops_hash->filter_hash->size_bits, + subops_hash->filter_hash->size_bits); + + /* Copy the subops hash */ + *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + /* Remove any notrace functions from the copy */ + remove_hash(*filter_hash, subops_hash->notrace_hash); + + ret = append_hash(filter_hash, ops_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; + return ret; + } + } + + /* + * Only process notrace hashes if the main filter hash is empty + * (tracing all functions), otherwise the filter hash will just + * remove the notrace hash functions, and the notrace hash is + * not needed. + */ + if (ftrace_hash_empty(*filter_hash)) { + /* + * Intersect the notrace functions. That is, if two + * subops are not tracing a set of functions, the + * main ops will only not trace the functions that are + * in both subops, but has to trace the functions that + * are only notrace in one of the subops, for the other + * subops to be able to trace them. + */ + size_bits = max(ops_hash->notrace_hash->size_bits, + subops_hash->notrace_hash->size_bits); + *notrace_hash = alloc_ftrace_hash(size_bits); + if (!*notrace_hash) + return -ENOMEM; + + ret = intersect_hash(notrace_hash, ops_hash->notrace_hash, + subops_hash->notrace_hash); + if (ret < 0) { + free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; + return ret; + } + } + return 0; +} + /** * ftrace_startup_subops - enable tracing for subops of an ops * @ops: Manager ops (used to pick all the functions of its subops) @@ -3439,11 +3494,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; - int size_bits; int ret; if (unlikely(ftrace_disabled)) @@ -3467,14 +3521,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* For the first subops to ops just enable it normally */ if (list_empty(&ops->subop_list)) { - /* Just use the subops hashes */ - filter_hash = copy_hash(subops->func_hash->filter_hash); - notrace_hash = copy_hash(subops->func_hash->notrace_hash); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return -ENOMEM; - } + + /* The ops was empty, should have empty hashes */ + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash)); + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash)); + + ret = add_first_hash(&filter_hash, ¬race_hash, subops->func_hash); + if (ret < 0) + return ret; save_filter_hash = ops->func_hash->filter_hash; save_notrace_hash = ops->func_hash->notrace_hash; @@ -3500,48 +3554,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* * Here there's already something attached. Here are the rules: - * o If either filter_hash is empty then the final stays empty - * o Otherwise, the final is a superset of both hashes - * o If either notrace_hash is empty then the final stays empty - * o Otherwise, the final is an intersection between the hashes + * If the new subops and main ops filter hashes are not empty: + * o Make a copy of the subops filter hash + * o Remove all functions in the nohash from it. + * o Add in the main hash filter functions + * o Remove any of these functions from the main notrace hash */ - if (ftrace_hash_empty(ops->func_hash->filter_hash) || - ftrace_hash_empty(subops->func_hash->filter_hash)) { - filter_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); - if (!filter_hash) - return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash, - size_bits); - if (ret < 0) { - free_ftrace_hash(filter_hash); - return ret; - } - } - if (ftrace_hash_empty(ops->func_hash->notrace_hash) || - ftrace_hash_empty(subops->func_hash->notrace_hash)) { - notrace_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - notrace_hash = alloc_ftrace_hash(size_bits); - if (!notrace_hash) { - free_ftrace_hash(filter_hash); - return -ENOMEM; - } - - ret = intersect_hash(¬race_hash, ops->func_hash->filter_hash, - subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return ret; - } - } + ret = add_next_hash(&filter_hash, ¬race_hash, ops->func_hash, subops->func_hash); + if (ret < 0) + return ret; list_add(&subops->list, &ops->subop_list); @@ -3557,6 +3579,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int return ret; } +static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops *ops) +{ + struct ftrace_ops_hash temp_hash; + struct ftrace_ops *subops; + bool first = true; + int ret; + + temp_hash.filter_hash = EMPTY_HASH; + temp_hash.notrace_hash = EMPTY_HASH; + + list_for_each_entry(subops, &ops->subop_list, list) { + *filter_hash = EMPTY_HASH; + *notrace_hash = EMPTY_HASH; + + if (first) { + ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash); + if (ret < 0) + return ret; + first = false; + } else { + ret = add_next_hash(filter_hash, notrace_hash, + &temp_hash, subops->func_hash); + if (ret < 0) { + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + return ret; + } + } + + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + + temp_hash.filter_hash = *filter_hash; + temp_hash.notrace_hash = *notrace_hash; + } + return 0; +} + /** * ftrace_shutdown_subops - Remove a subops from a manager ops * @ops: A manager ops to remove @subops from @@ -3571,8 +3632,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) @@ -3605,14 +3666,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in } /* Rebuild the hashes without subops */ - filter_hash = append_hashes(ops); - notrace_hash = intersect_hashes(ops); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - list_add(&subops->list, &ops->subop_list); - return -ENOMEM; - } + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (ret < 0) + return ret; ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret < 0) { @@ -3628,11 +3684,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, struct ftrace_hash **orig_subhash, - struct ftrace_hash *hash, - int enable) + struct ftrace_hash *hash) { struct ftrace_ops *ops = subops->managed; - struct ftrace_hash **orig_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; struct ftrace_hash *save_hash; struct ftrace_hash *new_hash; int ret; @@ -3649,24 +3705,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, return -ENOMEM; } - /* Create a new_hash to hold the ops new functions */ - if (enable) { - orig_hash = &ops->func_hash->filter_hash; - new_hash = append_hashes(ops); - } else { - orig_hash = &ops->func_hash->notrace_hash; - new_hash = intersect_hashes(ops); + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (!ret) { + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); } - /* Move the hash over to the new hash */ - ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); - - free_ftrace_hash(new_hash); - if (ret) { /* Put back the original hash */ - free_ftrace_hash_rcu(*orig_subhash); + new_hash = *orig_subhash; *orig_subhash = save_hash; + free_ftrace_hash_rcu(new_hash); } else { free_ftrace_hash_rcu(save_hash); } @@ -4890,7 +4940,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, int enable) { if (ops->flags & FTRACE_OPS_FL_SUBOP) - return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash); /* * If this ops is not enabled, it could be sharing its filters @@ -4909,7 +4959,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, list_for_each_entry(subops, &op->subop_list, list) { if ((subops->flags & FTRACE_OPS_FL_ENABLED) && subops->func_hash == ops->func_hash) { - return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash); } } } while_for_each_ftrace_op(op); @@ -5914,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c0f877d39a24..3f9bf562beea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 968c5c3b0246..e4077500a91d 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -225,7 +225,12 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) */ bool rv_is_container_monitor(struct rv_monitor_def *mdef) { - struct rv_monitor_def *next = list_next_entry(mdef, list); + struct rv_monitor_def *next; + + if (list_is_last(&mdef->list, &rv_monitors_list)) + return false; + + next = list_next_entry(mdef, list); return next->parent == mdef->monitor || !mdef->monitor->enable; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b581e388a9d9..5b8db27fb6ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6043,8 +6043,10 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) tscratch = tr->scratch; /* if there is no tscrach, module_delta must be NULL. */ module_delta = READ_ONCE(tr->module_delta); - if (!module_delta || tscratch->entries[0].mod_addr > addr) + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { return addr + tr->text_delta; + } /* Note that entries must be sorted. */ nr_entries = tscratch->nr_entries; @@ -6821,13 +6823,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -9806,6 +9809,7 @@ static int instance_mkdir(const char *name) return ret; } +#ifdef CONFIG_MMU static u64 map_pages(unsigned long start, unsigned long size) { unsigned long vmap_start, vmap_end; @@ -9828,6 +9832,12 @@ static u64 map_pages(unsigned long start, unsigned long size) return (u64)vmap_start; } +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; +} +#endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a5..5d64a18cacac 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee40d4e6ad1c..4ef4df6623a8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned long, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index c08355c3ef32..916555f0de81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -969,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c1..2048560264bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 969f48742d72..33cfbd4ed76d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b66b6d235d91..6e87ae2a1a66 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1560,7 +1560,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 98ccf3f00c51..4e37a0f6aaa3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -633,11 +633,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2f077d4158e5..0c357a89c58e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -880,8 +880,6 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); - else - trace_seq_putc(s, '\n'); } else { print_retaddr = false; trace_seq_printf(s, "} /* %ps", func); @@ -899,7 +897,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr } if (!entry || print_retval || print_retaddr) - trace_seq_puts(s, " */\n"); + trace_seq_puts(s, " */"); } #else @@ -975,7 +973,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, } else trace_seq_puts(s, "();"); } - trace_seq_printf(s, "\n"); + trace_seq_putc(s, '\n'); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -1313,10 +1311,11 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * that if the funcgraph-tail option is enabled. */ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); + trace_seq_puts(s, "}"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)func); + trace_seq_printf(s, "} /* %ps */", (void *)func); } + trace_seq_putc(s, '\n'); /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2703b96d8990..3e5c47b6d7b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index fee40ffbd490..b9ab06c99543 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1042,11 +1042,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -1056,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2eeecb6c95ee..424751cdf31f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3386439ec9f6..35cf76c75dd7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -741,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2ef2e1b80091..2f844c279a3e 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ |