commit 21a0765a50fff349155964432b9529541eee5e8a
tree   f605ce5afbeb7d2dce0ee8c546f47d98ea5f48fb
parent 07a2039b8eb0af4ff464efd3dfd95de5c02648c6
author Kent Overstreet <koverstreet@google.com> 2012-10-11 13:36:15 -0700
committer Kent Overstreet <koverstreet@google.com> 2012-10-11 13:36:15 -0700
 arch/x86/ia32/ia32entry.S        |   4 +
 arch/x86/include/asm/unistd_32.h |   4 +
 arch/x86/include/asm/unistd_64.h |   8 +
 arch/x86/kernel/process_32.c     |  17 +
 arch/x86/kernel/syscall_64.c     |  14 +
 include/linux/acall.h            |  54 +
 include/linux/mm_types.h         |   2 +
 include/linux/syscalls.h         |  18 +
 kernel/Makefile                  |   2 +-
 kernel/acall.c                   | 852 ++++++++++
 kernel/compat.c                  |  86 ++
 kernel/fork.c                    |   2 +
 kernel/sys_ni.c                  |   7 +
 13 files changed, 1069 insertions(+), 1 deletion(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..167a4b3ca051 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -830,4 +830,8 @@ ia32_sys_call_table:
 	.quad sys_inotify_init1
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
+	.quad sys_acall_submit		/* 335 */
+	.quad compat_sys_acall_comp_pwait
+	.quad compat_sys_acall_ring_pwait
+	.quad sys_acall_cancel
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..157d86af59b3 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,10 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_acall_submit	335
+#define __NR_acall_comp_pwait	336
+#define __NR_acall_ring_pwait	337
+#define __NR_acall_cancel	338
 
 #ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..cf98b8fd0e1f 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,6 +657,14 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 __SYSCALL(__NR_preadv, sys_preadv)
 #define __NR_pwritev				296
 __SYSCALL(__NR_pwritev, sys_pwritev)
+#define __NR_acall_submit			297
+__SYSCALL(__NR_acall_submit, sys_acall_submit)
+#define __NR_acall_comp_pwait			298
+__SYSCALL(__NR_acall_comp_pwait, sys_acall_comp_pwait)
+#define __NR_acall_ring_pwait			299
+__SYSCALL(__NR_acall_ring_pwait, sys_acall_ring_pwait)
+#define __NR_acall_cancel			300
+__SYSCALL(__NR_acall_cancel, sys_acall_cancel)
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2..6d18b975674a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -509,3 +509,20 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	unsigned long range_end = mm->brk + 0x02000000;
 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
+
+/*
+ * XXX This is just a dumb place-holder for testing until we find out how
+ * the x86 maintainers want this done.
+ */
+long arch_call_syscall(unsigned int nr, long arg0, long arg1, long arg2,
+		       long arg3, long arg4, long arg5)
+{
+	typedef asmlinkage long (*syscall_fn_t)(long, long, long, long, long,
+						long);
+	syscall_fn_t *calls = (syscall_fn_t *)sys_call_table;
+
+	if (nr > __NR_acall_cancel)
+		return -ENOSYS;
+
+	return calls[nr](arg0, arg1, arg2, arg3, arg4, arg5);
+}
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..f5d5f6aac664 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -3,6 +3,7 @@
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <linux/errno.h>
 #include <asm/asm-offsets.h>
 
 #define __NO_STUBS
@@ -27,3 +28,16 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
 #include <asm/unistd_64.h>
 };
+
+long arch_call_syscall(unsigned int nr, long arg0, long arg1, long arg2,
+		       long arg3, long arg4, long arg5)
+{
+	typedef asmlinkage long (*syscall_fn_t)(long, long, long, long, long,
+						long);
+	syscall_fn_t *calls = (syscall_fn_t *)sys_call_table;
+
+	if (nr > __NR_syscall_max)
+		return -ENOSYS;
+
+	return calls[nr](arg0, arg1, arg2, arg3, arg4, arg5);
+}
diff --git a/include/linux/acall.h b/include/linux/acall.h
new file mode 100644
index 000000000000..f6d3f6b854af
--- /dev/null
+++ b/include/linux/acall.h
@@ -0,0 +1,54 @@
+#ifndef __LINUX_ACALL_H
+#define __LINUX_ACALL_H
+
+/*
+ * The kernel makes a private copy of this during sys_acall_submit().  Once
+ * that call returns userspace does not need to keep it around.
+ *
+ * The flags field will be used to indicate the presence of fields which
+ * are added to the end of the struct over time.  To support this the
+ * submission call must refuse submission for structs which contain flags
+ * which it doesn't recognize.
+ */
+struct acall_submission {
+	u32 nr;
+	u32 flags;
+	u64 cookie;
+	u64 completion_ring_pointer;
+	u64 completion_pointer;
+	u64 id_pointer;
+	u64 args[6];
+};
+
+#define ACALL_SUBMIT_THREAD_POOL	1
+
+/*
+ * This is used by userspace to specify an operation for cancellation or
+ * waiting.  The data here only has significance to the kernel.
+ */
+struct acall_id {
+	unsigned char opaque[16];
+};
+
+struct acall_completion {
+	u64 return_code;
+	u64 cookie;
+};
+
+/*
+ * 'nr' is read by the kernel each time it tries to store an event in
+ * the ring.
+ *
+ * 'head' is written by the kernel as it adds events.  Once it changes,
+ * the kernel will be writing an acall_completion struct into the ring.
+ * A non-zero cookie field of the completion struct indicates that the
+ * completion has been written.  Once it is non-zero then the return_code
+ * can be loaded after issuing a read memory barrier.
+ */
+struct acall_completion_ring {
+	u32 head;
+	u32 nr;
+	struct acall_completion comps[0];
+};
+
+#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e80e26ecf21..4eb375557534 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -21,6 +21,7 @@
 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
 
 struct address_space;
+struct acall_mm;
 
 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 
@@ -278,6 +279,7 @@ struct mm_struct {
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
 #endif
+	struct acall_mm *acall_mm;
 };
 
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 30520844b8da..5a72e5b2ba58 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,9 @@ struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
+struct acall_submission;
+struct acall_id;
+struct acall_completion_ring;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -752,7 +755,22 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
 			  size_t);
 asmlinkage long sys_pipe2(int __user *, int);
 asmlinkage long sys_pipe(int __user *);
+asmlinkage long sys_acall_submit(struct acall_submission __user *submissions,
+				 unsigned long nr);
+asmlinkage long sys_acall_comp_pwait(struct acall_id __user *uids,
+				     unsigned long nr,
+				     struct timespec __user *utime,
+				     const sigset_t __user *sigmask,
+				     size_t sigsetsize);
+asmlinkage long sys_acall_ring_pwait(struct acall_completion_ring __user *uring,
+				     u32 tail, u32 min,
+				     struct timespec __user *utime,
+				     const sigset_t __user *sigmask,
+				     size_t sigsetsize);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+long arch_call_syscall(unsigned int nr, long arg0, long arg1, long arg2,
+		       long arg3, long arg4, long arg5);
+
 #endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..1e89a514f412 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o acall.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/acall.c b/kernel/acall.c
new file mode 100644
index 000000000000..61f901c9c861
--- /dev/null
+++ b/kernel/acall.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/pagemap.h>
+#include <linux/acall.h>
+#include <linux/compiler.h>
+#include <linux/syscalls.h>
+#include <asm/futex.h>
+
+/* This is the kernel's version of the id which is opaque to userspace */
+struct acall_kernel_id {
+	u64 cpu;
+	u64 counter;
+};
+
+static DEFINE_PER_CPU(u64, id_counter);
+
+/*
+ * We store some things per mm_struct.  This is allocated and stored in
+ * the mm on first use and is freed as the mm exits.
+ */
+struct acall_mm {
+	struct rb_root active_ops;
+	wait_queue_head_t ring_waiters;
+	wait_queue_head_t threads;
+};
+
+/*
+ * This tracks an operation which is being performed by an acall thread.  It
It + * is built up in the submitting task and then handed off to an acall thread + * to process. It is removed and freed by the acall thread once it's done. + */ +struct acall_operation { + struct rb_node node; + struct acall_kernel_id kid; + wait_queue_head_t waitq; + struct task_struct *task; + struct acall_submission sub; +}; + +static void insert_op(struct rb_root *root, struct acall_operation *ins) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct acall_operation *op; + int cmp; + + while (*p) { + parent = *p; + op = rb_entry(parent, struct acall_operation, node); + + cmp = memcmp(&ins->kid, &op->kid, sizeof(op->kid)); + BUG_ON(cmp == 0); + + if (cmp < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&ins->node, parent, p); + rb_insert_color(&ins->node, root); +} + +static struct acall_operation *find_op(struct rb_root *root, + struct acall_kernel_id *kid) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent; + struct acall_operation *op; + int cmp; + + while (*p) { + parent = *p; + op = rb_entry(parent, struct acall_operation, node); + + cmp = memcmp(kid, &op->kid, sizeof(op->kid)); + if (cmp == 0) + return op; + + if (cmp < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + return NULL; +} + +static struct acall_mm *get_amm(struct mm_struct *mm) +{ + struct acall_mm *amm = mm->acall_mm; + if (amm) + return amm; + + amm = kmalloc(sizeof(struct acall_mm), GFP_KERNEL); + if (amm == NULL) + return NULL; + + amm->active_ops = RB_ROOT; + init_waitqueue_head(&amm->ring_waiters); + init_waitqueue_head(&amm->threads); + + /* XXX I hope it's ok to abuse this sem. */ + down_write(&mm->mmap_sem); + if (mm->acall_mm == NULL) + mm->acall_mm = amm; + else { + kfree(amm); + amm = mm->acall_mm; + } + up_write(&mm->mmap_sem); + return amm; +} + +/* + * completions can be collected from user space as long as they load the + * cookie before the return code and separate the two loads with a read + * barrier: + * + * cookie = comp->cookie; + * smp_rmb(); + * ret = comp->return_code; + * if (cookie) + * return ret; + * else + * { do more userspace business ; sys_acall_*_pwait(); } + */ +static int store_comp(struct acall_completion __user *comp, u64 return_code, + u64 cookie) +{ + if (__put_user(return_code, &comp->return_code)) + return -EFAULT; + smp_wmb(); + if (__put_user(cookie, &comp->cookie)) + return -EFAULT; + + return 0; +} + +static int store_ring(struct acall_completion_ring __user *uring, + u64 return_code, u64 cookie) +{ + struct acall_completion __user *ucomp; + struct acall_mm *amm; + u32 nr; + u32 old; + u32 head; + int ret; + + if (__get_user(nr, &uring->nr)) + return -EFAULT; + if (nr == 0) + return -EINVAL; + + do { + if (__get_user(head, &uring->head)) + return -EFAULT; + + pagefault_disable(); + old = futex_atomic_cmpxchg_inatomic(&uring->head, head, + head + 1); + pagefault_enable(); + pr_debug("head %u old %u\n", head, old); + /* XXX handle old = -EFAULT :P. */ + } while (old != head); + + ucomp = &uring->comps[head % nr]; + pr_debug("ucomp %p\n", ucomp); + ret = store_comp(ucomp, return_code, cookie); + if (ret) + return ret; + + /* + * XXX We might want a barrier to order our ring store with our loading + * of acall_mm. We don't want to miss a wake-up. 
+	 */
+	amm = current->mm->acall_mm;
+	if (amm)
+		wake_up(&amm->ring_waiters);
+
+	return 0;
+}
+
+static void process_op(struct acall_mm *amm, struct acall_operation *op)
+{
+	struct acall_completion_ring __user *uring;
+	struct acall_completion __user *ucomp;
+	struct acall_id __user *uid;
+	struct acall_submission *sub = &op->sub;
+	unsigned long flags;
+	u64 rc;
+	int ret;
+
+	rc = arch_call_syscall(sub->nr, sub->args[0], sub->args[1],
+			       sub->args[2], sub->args[3], sub->args[4],
+			       sub->args[5]);
+
+	ucomp = (void __user *)sub->completion_pointer;
+	if (ucomp) {
+		ret = store_comp(ucomp, rc, sub->cookie);
+		if (ret)
+			printk("comp store to %p failed ret %d\n", ucomp, ret);
+	}
+
+	uring = (void __user *)sub->completion_ring_pointer;
+	if (uring) {
+		ret = store_ring(uring, rc, sub->cookie);
+		if (ret)
+			printk("ring store to %p failed ret %d\n", uring, ret);
+	}
+
+	/*
+	 * We're waking and freeing under the lock to avoid races with
+	 * sys_acall_comp_pwait().  Something more efficient is surely
+	 * possible, but this seems like a safe first pass.
+	 */
+	uid = (void __user *)sub->id_pointer;
+	if (uid) {
+		spin_lock_irqsave(&amm->threads.lock, flags);
+		wake_up(&op->waitq);
+		rb_erase(&op->node, &amm->active_ops);
+		spin_unlock_irqrestore(&amm->threads.lock, flags);
+	}
+}
+
+struct thread_wait_private {
+	struct task_struct *task;
+	struct acall_operation *op;
+};
+
+/*
+ * This is called in the submit path to hand off an operation to a waiting
+ * thread.  We also use the wait queue lock to protect the active ops
+ * tracking so that we don't have to add more locking to the submission
+ * path.
+ */
+static int wake_idle_thread(wait_queue_t *wait, unsigned mode, int sync,
+			    void *key)
+{
+	struct thread_wait_private *wp = wait->private;
+	struct acall_operation **caller_op = key;
+	struct acall_mm *amm = current->mm->acall_mm;
+	struct acall_operation *op;
+	int ret;
+
+	/*
+	 * XXX We don't use the generic wake functions because they reference
+	 * wait->private instead of calling helpers which take the task
+	 * struct.  Maybe we should export try_to_wake_up, or wrap it as
+	 * wake_up_state_sync(), or something.  In any case, this is currently
+	 * ignoring the sync argument.
+	 */
+	ret = wake_up_state(wp->task, mode);
+	if (ret) {
+		op = *caller_op;
+		wp->op = op;
+		*caller_op = NULL;
+
+		op->task = wp->task;
+		if (op->sub.id_pointer)
+			insert_op(&amm->active_ops, op);
+
+		list_del_init(&wait->task_list);
+	}
+
+	return ret;
+}
+
+static int acall_thread(void *data)
+{
+	struct acall_operation *op = data;
+	struct acall_mm *amm = current->mm->acall_mm;
+	struct thread_wait_private wp;
+	wait_queue_t wait;
+
+	/*
+	 * XXX We don't want our parent task to know that we've secretly
+	 * created kernel threads working on their behalf.  This at least
+	 * stops us from becoming zombies and waiting for our parent to wait
+	 * on us.  I have no idea if this is the right way to do this.  Halp!
+	 */
+	current->exit_signal = -1;
+
+	/*
+	 * Let cancellation know which task is handling the op.  This isn't so
+	 * great because there's a window where cancellation won't find a
+	 * pending op.  It could be cleaned up if anyone cares.  Cancellation
+	 * is inherently racy and rare to begin with.
+	 */
+	op->task = current;
+
+	/* get the easy case out of the way.. */
+	if (!(op->sub.flags & ACALL_SUBMIT_THREAD_POOL)) {
+		process_op(amm, op);
+		kfree(op);
+		return 0;
+	}
+
+	/*
+	 * We're using our own wait queue entry func so we roll our own
+	 * wait_event_*() :(
+	 */
+	wp.op = op;
+	wp.task = current;
+	init_wait(&wait);
+	wait.private = &wp;
+	wait.func = wake_idle_thread;
+
+	/*
+	 * This is being careful to test wp.op after finish_wait() is
+	 * called in case we got woken up just before removing ourselves
+	 * from the wait queue.
+	 */
+	while (wp.op) {
+		process_op(amm, wp.op);
+		kfree(wp.op);
+		wp.op = NULL;
+
+		prepare_to_wait_exclusive(&amm->threads, &wait,
+					  TASK_INTERRUPTIBLE);
+		if (wp.op == NULL)
+			schedule_timeout(msecs_to_jiffies(200));
+		if (wp.op == NULL)
+			finish_wait(&amm->threads, &wait);
+	}
+
+	return 0;
+}
+
+static int setup_op_id(struct acall_operation *op, struct acall_id __user *uid)
+{
+	int cpu = get_cpu();
+	op->kid.cpu = cpu;
+	op->kid.counter = per_cpu(id_counter, cpu)++;
+	put_cpu();
+
+	init_waitqueue_head(&op->waitq);
+
+	BUILD_BUG_ON(sizeof(struct acall_kernel_id) != sizeof(struct acall_id));
+	if (copy_to_user(uid, &op->kid, sizeof(op->kid)))
+		return -EFAULT;
+	else
+		return 0;
+}
+
+/*
+ * Submits system calls to be executed by kernel threads.
+ *
+ * The submissions array contains a submission structure for each operation.
+ * Each submission struct is copied into the kernel.  It is only referenced
+ * during this submission call and will not be referenced once this
+ * submission call has returned.
+ *
+ * The 'flags' field alters behaviour of a given submission:
+ *
+ *  ACALL_SUBMIT_THREAD_POOL: The submission will be handed off to a waiting
+ *  thread if one is available.  It will not be updated with the submitting
+ *  caller's task state.  If a waiting thread isn't available then one
+ *  will be created.  After servicing the operation the thread will
+ *  wait for 200ms for a chance to service another operation before it
+ *  exits.
+ *
+ * The 'id_pointer' field in the submission struct is a user space pointer to
+ * a 'struct acall_id'.  If the field is non-zero then the kernel writes an id
+ * to that address which identifies the operation.  The id can be used to
+ * cancel the operation or wait for its completion.
+ *
+ * The 'completion_pointer' field in the submission struct is a user space
+ * pointer to a 'struct acall_completion'.  If it is non-zero then the kernel
+ * will write a completion struct to that address when the operation is
+ * complete.
+ *
+ * The 'completion_ring_pointer' field in the submission struct is a user
+ * space pointer to a 'struct acall_completion_ring'.  If it is non-zero then
+ * the kernel will write a completion struct to the next available position
+ * in the given ring.  It is up to the application to ensure that there is
+ * always enough room in the ring by not submitting more operations than
+ * there are entries in the ring.
+ *
+ * All three of these pointers may be null.  The operation will still be
+ * processed.
+ *
+ * A positive return code gives the number of operations which are now
+ * pending.  A return code less than 'nr' is possible if later submissions
+ * contain errors.  A negative return code is the errno of the submission
+ * failure of the first submission struct.  0 will be returned if 'nr' is 0.
+ */
+asmlinkage long sys_acall_submit(struct acall_submission __user *submissions,
+				 unsigned long nr)
+{
+	struct acall_operation *op = NULL;
+	struct acall_id __user *uid;
+	struct acall_mm *amm = NULL;
+	unsigned long flags;
+	unsigned long i = 0;
+	pid_t pid;
+	int ret = 0;
+
+	/*
+	 * We don't strictly need this for all ops.  But it's a small amount
+	 * of work and an unlikely failure case.  Ensuring that it exists
+	 * makes uses later on in the loop cleaner and the majority of ops
+	 * will use it eventually anyway.
+	 */
+	amm = get_amm(current->mm);
+	if (amm == NULL)
+		return -ENOMEM;
+
+	for (; i < nr; i++) {
+		op = kmalloc(sizeof(struct acall_operation), GFP_KERNEL);
+		if (op == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		if (copy_from_user(&op->sub, &submissions[i],
+				   sizeof(struct acall_submission))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		uid = (void __user *)op->sub.id_pointer;
+		if (uid) {
+			ret = setup_op_id(op, uid);
+			if (ret)
+				break;
+		}
+
+		/* the threads' waitq wake func passes it the op */
+		if (op->sub.flags & ACALL_SUBMIT_THREAD_POOL) {
+			__wake_up(&amm->threads, TASK_NORMAL, 1, &op);
+			if (op == NULL)
+				continue;
+		}
+
+		if (uid) {
+			op->task = NULL;
+			spin_lock_irqsave(&amm->threads.lock, flags);
+			insert_op(&amm->active_ops, op);
+			spin_unlock_irqrestore(&amm->threads.lock, flags);
+		}
+
+		pid = kernel_thread(acall_thread, op,
+				    CLONE_VM | CLONE_FS | CLONE_FILES |
+				    CLONE_IO);
+		if (pid < 0) {
+			ret = pid;
+			break;
+		}
+		op = NULL;
+	}
+
+	kfree(op);
+	return i ? i : ret;
+}
+
+static int pwait_prologue(struct timespec __user *utime,
+			  const sigset_t __user *sigmask, size_t sigsetsize,
+			  struct hrtimer_sleeper *sleeper, sigset_t *sigsaved)
+{
+	sigset_t ksigmask;
+	struct timespec ts;
+	unsigned long slack;
+	ktime_t t;
+
+	if (utime) {
+		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+			return -EFAULT;
+		if (!timespec_valid(&ts))
+			return -EINVAL;
+
+		hrtimer_init_on_stack(&sleeper->timer, CLOCK_MONOTONIC,
+				      HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(sleeper, current);
+
+		t = ktime_add_safe(ktime_get(), timespec_to_ktime(ts));
+		if (rt_task(current))
+			slack = 0;
+		else
+			slack = current->timer_slack_ns;
+		hrtimer_set_expires_range_ns(&sleeper->timer, t, slack);
+
+		hrtimer_start_expires(&sleeper->timer, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&sleeper->timer))
+			sleeper->task = NULL;
+	} else
+		sleeper->task = current;
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, sigsaved);
+	}
+
+	return 0;
+}
+
+static void pwait_epilogue(int ret, struct timespec __user *utime,
+			   const sigset_t __user *sigmask,
+			   struct hrtimer_sleeper *sleeper, sigset_t *sigsaved)
+{
+	if (utime) {
+		hrtimer_cancel(&sleeper->timer);
+		destroy_hrtimer_on_stack(&sleeper->timer);
+	}
+
+	if (sigmask) {
+		if (ret == -EINTR) {
+			memcpy(&current->saved_sigmask, sigsaved,
+			       sizeof(*sigsaved));
+			set_restore_sigmask();
+		} else
+			sigprocmask(SIG_SETMASK, sigsaved, NULL);
+	}
+}
+
+struct comp_wait_private {
+	struct task_struct *task;
+	int woken;
+};
+
+/*
+ * We have a wake function which sets a per-waiter boolean so that it can
+ * tell when one of its wait queues has been woken without having to test
+ * all of them.
+ */
+static int wake_comp_waiter(wait_queue_t *wait, unsigned mode, int sync,
+			    void *key)
+{
+	struct comp_wait_private *wp = wait->private;
+	int ret;
+
+	/*
+	 * XXX We don't use the generic wake functions because they reference
+	 * wait->private instead of calling helpers which take the task
+	 * struct.  Maybe we should export try_to_wake_up, or wrap it as
+	 * wake_up_state_sync(), or something.  In any case, this is currently
+	 * ignoring the sync argument.
+	 */
+	ret = wake_up_state(wp->task, mode);
+	if (ret) {
+		wp->woken = 1;
+		list_del_init(&wait->task_list);
+	}
+
+	return ret;
+}
+
+struct acall_wait {
+	struct acall_wait *next;
+	wait_queue_t wait;
+	struct acall_operation *op;
+};
+
+/*
+ * This waits for the given operations to complete.
+ *
+ * A return code of 1 indicates that some number of the operations have
+ * completed.  They might have been completed before this system call was
+ * executed or they might have completed while we were sleeping.
+ *
+ * A return code of 0 indicates that there were no operations to wait for.
+ * 'nr' might have been 0 or all the uid pointers were NULL.
+ *
+ * A negative return code indicates a negative errno which occurred during
+ * processing of one of the specified operations.
+ *
+ * -EINVAL can be returned if the calling memory context has never submitted
+ * operations with the id_pointer set, meaning that we could have no
+ * operations to wait for.
+ *
+ * This call has no way to know if a given id represents a valid id that was
+ * issued in the past.  If it finds an id that does not correspond to an
+ * operation that is currently processing it assumes that the operation has
+ * been completed and returns 1.  Callers that pass in invalid ids will be
+ * told that an operation has completed.
+ */
+asmlinkage long sys_acall_comp_pwait(struct acall_id __user *uids,
+				     unsigned long nr,
+				     struct timespec __user *utime,
+				     const sigset_t __user *sigmask,
+				     size_t sigsetsize)
+{
+	struct acall_mm *amm = current->mm->acall_mm;
+	struct acall_kernel_id kid;
+	struct hrtimer_sleeper sleeper;
+	unsigned long flags;
+	struct acall_operation *op;
+	sigset_t sigsaved;
+	struct acall_wait *aw;
+	struct acall_wait *head = NULL;
+	struct comp_wait_private wp = {
+		.task = current,
+		.woken = 0,
+	};
+	unsigned long i;
+	int ret;
+
+	if (amm == NULL)
+		return -EINVAL;
+
+	ret = pwait_prologue(utime, sigmask, sigsetsize, &sleeper, &sigsaved);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr; i++) {
+		if (copy_from_user(&kid, &uids[i], sizeof(kid))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		aw = kzalloc(sizeof(struct acall_wait), GFP_KERNEL);
+		if (aw == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		init_wait(&aw->wait);
+		aw->wait.private = &wp;
+		aw->wait.func = wake_comp_waiter;
+
+		spin_lock_irqsave(&amm->threads.lock, flags);
+		op = find_op(&amm->active_ops, &kid);
+		if (op) {
+			aw->op = op;
+			add_wait_queue(&op->waitq, &aw->wait);
+		}
+		spin_unlock_irqrestore(&amm->threads.lock, flags);
+		if (op == NULL) {
+			kfree(aw);
+			wp.woken = 1;
+			break;
+		}
+
+		aw->next = head;
+		head = aw;
+	}
+
+	if (head == NULL)
+		goto out;
+
+	/* we need the barrier in set_current_state() */
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	if (!wp.woken && sleeper.task && !signal_pending(current))
+		schedule();
+	if (signal_pending(current))
+		ret = -ERESTARTSYS;
+
+	/*
+	 * The op is freed after waking up the op's waitqueue, removing all its
+	 * wait heads, while holding the lock.  If we acquire the lock, and our
+	 * aw is still on the queue, then the op won't be freed until we
+	 * release the lock.  finish_wait() only dereferences our op pointer
+	 * if the entry is still on the queue.
+	 *
+	 * XXX How much work is too much work to do while holding the lock?
+	 */
+	while (head) {
+		spin_lock_irqsave(&amm->threads.lock, flags);
+		for (i = 0; (aw = head) && i < 100; i++) {
+			head = head->next;
+			finish_wait(&aw->op->waitq, &aw->wait);
+			kfree(aw);
+		}
+		spin_unlock_irqrestore(&amm->threads.lock, flags);
+	}
+
+out:
+	pwait_epilogue(ret, utime, sigmask, &sleeper, &sigsaved);
+	if (wp.woken)
+		ret = 1;
+	return ret;
+}
+
+/*
+ * This returns non-zero if the calling wait_event_*() loop should break
+ * out and fall back to sampling the head with a blocking read.  We want to
+ * do this either if the read faults or if we see enough space in the ring.
+ *
+ * The calling wait_event_*() loop has set our task state.  We need
+ * to be very careful that we don't set it in the process of testing the
+ * userspace pointer.  We could lose a wake-up if we did.
+ */
+static int should_get_user(struct acall_completion_ring __user *uring,
+			   u32 tail, u32 min)
+{
+	u32 head;
+	int ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(&head, &uring->head, sizeof(head));
+	pagefault_enable();
+
+	return ret || (head - tail >= min);
+}
+
+/*
+ * This waits for the given number of completions to appear in the given
+ * ring.
+ *
+ * Userspace specifies the tail value which indicates the index of the last
+ * completion that they've consumed.  We watch the head pointer until it
+ * indicates that 'min' completions are waiting.
+ *
+ * If 'min' is 0 then the call will return 0 immediately without reading
+ * the ring.
+ *
+ * The number of pending events is not returned because it may be larger
+ * than the signed int that many archs use to represent the return code
+ * of a system call.  Userspace is expected to perform u32 math on the
+ * head index and their tail index.
+ *
+ * We'll only be woken by threads which complete into our ring if we share
+ * the same mm context as they do.  We could provide a flag to submission
+ * and waiting to indicate that we should track sleepers outside of a given
+ * mm context.
+ *
+ * This first pass just uses a boring wait queue head in our per-mm data.
+ * We wake when anything hits the ring and re-evaluate our situation.  We
+ * could spend some code and complexity on a more clever data structure
+ * which would allow the completer to only wake us when the ring has the
+ * given number of events that we want.
+ */
+asmlinkage long sys_acall_ring_pwait(struct acall_completion_ring __user *uring,
+				     u32 tail, u32 min,
+				     struct timespec __user *utime,
+				     const sigset_t __user *sigmask,
+				     size_t sigsetsize)
+{
+	struct hrtimer_sleeper sleeper;
+	struct acall_mm *amm;
+	sigset_t sigsaved;
+	u32 head;
+	int ret;
+
+	amm = get_amm(current->mm);
+	if (amm == NULL)
+		return -ENOMEM;
+
+	if (min == 0)
+		return 0;
+
+	ret = pwait_prologue(utime, sigmask, sigsetsize, &sleeper, &sigsaved);
+	if (ret)
+		return ret;
+
+	for (;;) {
+		/* XXX read memory barrier? */
+		if (__get_user(head, &uring->head)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		/* XXX is the wrapping u32 math ok? */
+		if (head - tail >= min)
+			break;
+
+		ret = wait_event_interruptible(amm->ring_waiters,
+					       !sleeper.task ||
+					       should_get_user(uring, tail,
+							       min));
+		if (ret || !sleeper.task)
+			break;
+	}
+
+	pwait_epilogue(ret, utime, sigmask, &sleeper, &sigsaved);
+	return ret;
+}
+
+/*
+ * Cancels the operation specified by the given id.  The id is set by the
+ * kernel as the operation begins processing.  It is so large as to be
+ * effectively unique for the lifetime of the system.
+ *
+ * -EAGAIN can be returned when:
+ *  - the caller's thread group has never issued an acall operation
+ *  - the given id is not pending
+ *  - the operation could not be canceled
+ *
+ * -EINVAL can be returned if the calling memory context has never submitted
+ * operations with the id_pointer set, meaning that we could have no
+ * operations to cancel.
+ *
+ * 0 will be returned if a successful attempt was made to cancel the
+ * operation.  How the operation copes with an attempt to cancel it, sending
+ * its thread SIGKILL, depends on the operation.  Some will abort
+ * immediately.  Some may complete with partial progress.  Some may ignore
+ * the signal.  A completion struct will be generated as usual according to
+ * the event's submission.  Its return code may reflect the cancellation
+ * attempt, or it may not.
+ */
+asmlinkage long sys_acall_cancel(struct acall_id __user *uid)
+{
+	struct acall_mm *amm = current->mm->acall_mm;
+	struct acall_operation *op;
+	struct siginfo info;
+	struct acall_kernel_id kid;
+	unsigned long flags;
+	int ret;
+
+	if (amm == NULL)
+		return -EAGAIN;
+	if (copy_from_user(&kid, uid, sizeof(kid)))
+		return -EFAULT;
+
+	/*
+	 * The target task exits after having removed its op from the tree
+	 * under the lock.  If the op is in the tree then the task won't
+	 * exit until we release the lock.
+	 */
+	ret = -EAGAIN;
+	spin_lock_irqsave(&amm->threads.lock, flags);
+	op = find_op(&amm->active_ops, &kid);
+	if (op && op->task) {
+		info.si_signo = SIGKILL;
+		info.si_errno = 0;
+		info.si_code = SI_KERNEL;
+		info.si_pid = task_tgid_vnr(current);
+		info.si_uid = current_uid();
+		if (force_sig_info(info.si_signo, &info, op->task) == 0)
+			ret = 0;
+	}
+	spin_unlock_irqrestore(&amm->threads.lock, flags);
+
+	return ret;
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..22764085d530 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -100,6 +100,16 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
+static int user_timespec_from_compat(struct timespec __user *uts,
+				     const struct compat_timespec __user *ucts)
+{
+	struct timespec ts;
+	return (get_compat_timespec(&ts, ucts) ||
+		!access_ok(VERIFY_WRITE, uts, sizeof(*uts)) ||
+		__put_user(ts.tv_sec, &uts->tv_sec) ||
+		__put_user(ts.tv_nsec, &uts->tv_nsec)) ? -EFAULT : 0;
+}
+
 static long compat_nanosleep_restart(struct restart_block *restart)
 {
 	struct compat_timespec __user *rmtp;
@@ -1128,3 +1138,79 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
 	return 0;
 }
 
+static int user_sigset_from_compat(sigset_t __user *usigset,
+				   const compat_sigset_t __user *ucsigset,
+				   compat_size_t sigsetsize)
+{
+	compat_sigset_t csigset;
+	sigset_t sigset;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&csigset, ucsigset, sizeof(csigset)))
+		return -EFAULT;
+	sigset_from_compat(&sigset, &csigset);
+	if (copy_to_user(usigset, &sigset, sizeof(sigset)))
+		return -EFAULT;
+	return 0;
+}
+
+asmlinkage long
+compat_sys_acall_ring_pwait(struct acall_completion_ring __user *uring,
+			    u32 tail, u32 min,
+			    struct compat_timespec __user *ucts,
+			    const compat_sigset_t __user *ucsigmask,
+			    compat_size_t sigsetsize)
+{
+	struct timespec __user *uts;
+	sigset_t __user *usigmask;
+	long ret;
+
+	if (ucts) {
+		uts = compat_alloc_user_space(sizeof(struct timespec));
+		if (user_timespec_from_compat(uts, ucts))
+			return -EFAULT;
+	} else
+		uts = NULL;
+
+	if (ucsigmask) {
+		usigmask = compat_alloc_user_space(sizeof(sigset_t));
+		ret = user_sigset_from_compat(usigmask, ucsigmask, sigsetsize);
+		if (ret)
+			return ret;
+	} else
+		usigmask = NULL;
+
+	return sys_acall_ring_pwait(uring, tail, min, uts, usigmask,
+				    sigsetsize);
+}
+
+asmlinkage long
+compat_sys_acall_comp_pwait(struct acall_id __user *uids,
+			    unsigned long nr,
+			    struct compat_timespec __user *ucts,
+			    const compat_sigset_t __user *ucsigmask,
+			    compat_size_t sigsetsize)
+{
+	struct timespec __user *uts;
+	sigset_t __user *usigmask;
+	long ret;
+
+	if (ucts) {
+		uts = compat_alloc_user_space(sizeof(struct timespec));
+		if (user_timespec_from_compat(uts, ucts))
+			return -EFAULT;
+	} else
+		uts = NULL;
+
+	if (ucsigmask) {
+		usigmask = compat_alloc_user_space(sizeof(sigset_t));
+		ret = user_sigset_from_compat(usigmask, ucsigmask, sigsetsize);
+		if (ret)
+			return ret;
+	} else
+		usigmask = NULL;
+
+	return sys_acall_comp_pwait(uids, nr, uts, usigmask, sigsetsize);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 875ffbdd96d0..f5fabc5500e4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -437,6 +437,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
 	mm_init_owner(mm, p);
+	mm->acall_mm = NULL;
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -494,6 +495,7 @@ void mmput(struct mm_struct *mm)
 		list_del(&mm->mmlist);
 		spin_unlock(&mmlist_lock);
 	}
+	kfree(mm->acall_mm);
 	put_swap_token(mm);
 	mmdrop(mm);
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..6dab68a286c6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -16,6 +16,13 @@ asmlinkage long sys_ni_syscall(void)
 	return -ENOSYS;
 }
 
+long __attribute__ ((weak)) arch_call_syscall(unsigned int nr,
+					      long arg0, long arg1, long arg2,
+					      long arg3, long arg4, long arg5)
+{
+	return -ENOSYS;
+}
+
 cond_syscall(sys_nfsservctl);
 cond_syscall(sys_quotactl);
 cond_syscall(sys32_quotactl);
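
Usage sketches (editorial additions, not part of the patch):

To make the submission and completion protocol concrete, here is a minimal
userspace sketch that submits a single read(2) as an acall operation and then
polls its acall_completion using the cookie-then-read-barrier sequence
documented above store_comp().  The syscall number and struct layouts are the
x86-64 ones this patch introduces; everything else (buffer sizes, the chosen
cookie, the busy-poll loop) is illustrative only.

/* sketch only: submit one async read and poll its completion struct */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_acall_submit	297	/* from this patch's unistd_64.h */

/* layouts copied from this patch's include/linux/acall.h */
struct acall_submission {
	uint32_t nr;
	uint32_t flags;
	uint64_t cookie;
	uint64_t completion_ring_pointer;
	uint64_t completion_pointer;
	uint64_t id_pointer;
	uint64_t args[6];
};

struct acall_completion {
	uint64_t return_code;
	uint64_t cookie;
};

int main(void)
{
	static char buf[128];
	struct acall_completion comp = { 0, 0 };
	struct acall_submission sub;
	uint64_t cookie;

	memset(&sub, 0, sizeof(sub));
	sub.nr = SYS_read;		/* run read(0, buf, 128) asynchronously */
	sub.cookie = 1;			/* must be non-zero to be observable */
	sub.completion_pointer = (uintptr_t)&comp;
	sub.args[0] = 0;
	sub.args[1] = (uintptr_t)buf;
	sub.args[2] = sizeof(buf);

	if (syscall(__NR_acall_submit, &sub, 1UL) != 1)
		return 1;

	/* the collection protocol from the store_comp() comment */
	do {
		cookie = comp.cookie;
		__sync_synchronize();	/* read barrier between the two loads */
	} while (!cookie);

	printf("read returned %lld\n", (long long)comp.return_code);
	return 0;
}

A real consumer would do other work and fall back to sys_acall_comp_pwait()
rather than spinning on the cookie, as the store_comp() comment suggests.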
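Similarly, a sketch of a completion-ring consumer, showing the wrapping u32
head/tail arithmetic that sys_acall_ring_pwait() expects userspace to do.
RING_NR and drain_ring() are inventions of this example; the kernel only sees
the head, nr, and comps[] layout from acall.h.  The application must keep at
most RING_NR operations outstanding against the ring and must submit only
non-zero cookies so that a zeroed slot can mean "not yet written".

/* sketch only: drain completions from a ring, tolerating index wrap */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_acall_ring_pwait	299	/* from this patch's unistd_64.h */
#define RING_NR			64	/* example size, the app's choice */

struct acall_completion {
	uint64_t return_code;
	uint64_t cookie;
};

/* layout from this patch's acall.h, with the flexible array sized here */
struct acall_completion_ring {
	uint32_t head;
	uint32_t nr;
	struct acall_completion comps[RING_NR];
};

static struct acall_completion_ring ring = { .head = 0, .nr = RING_NR };
static uint32_t tail;	/* count of completions we have consumed so far */

static void drain_ring(void)
{
	struct acall_completion *comp;

	/* sleep until at least one completion beyond our tail is present */
	syscall(__NR_acall_ring_pwait, &ring, tail, 1U, NULL, NULL, 0UL);

	/* u32 subtraction, so head and tail may freely wrap */
	while (ring.head - tail > 0) {
		comp = &ring.comps[tail % RING_NR];
		if (!comp->cookie)	/* slot claimed but not yet written */
			break;
		__sync_synchronize();	/* rmb: load cookie before return_code */
		printf("cookie %llu rc %lld\n",
		       (unsigned long long)comp->cookie,
		       (long long)comp->return_code);
		comp->cookie = 0;	/* zero the slot so it can be reused */
		tail++;
	}
}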
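Finally, a sketch of waiting on and cancelling specific operations by id.
wait_for_ids() and cancel_id() are hypothetical wrappers around the two
syscalls, with the numbers again taken from this patch's unistd_64.h; the ids
themselves must have come from submissions whose id_pointer was non-NULL.  No
signal mask is passed here, which sidesteps the glibc/kernel sigset_t size
mismatch, since the kernel only validates sigsetsize when a mask is given.

/* sketch only: wait on and cancel operations by their opaque ids */
#include <stddef.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_acall_comp_pwait	298	/* from this patch's unistd_64.h */
#define __NR_acall_cancel	300

struct acall_id {
	unsigned char opaque[16];	/* filled in by the kernel at submit */
};

/*
 * Wait up to one second for any of 'nr' previously submitted operations to
 * complete.  Returns 1 if some completed, 0 if there was nothing to wait
 * for, or -1 with errno set on error.
 */
static long wait_for_ids(struct acall_id *uids, unsigned long nr)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	return syscall(__NR_acall_comp_pwait, uids, nr, &ts, NULL, 0UL);
}

/* Returns 0 after a successful cancel attempt; -1/EAGAIN if not pending. */
static long cancel_id(struct acall_id *uid)
{
	return syscall(__NR_acall_cancel, uid);
}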