Diffstat (limited to 'io_uring')
 -rw-r--r--  io_uring/futex.c      |  2
 -rw-r--r--  io_uring/io-wq.c      | 23
 -rw-r--r--  io_uring/io_uring.c   | 25
 -rw-r--r--  io_uring/kbuf.c       | 16
 -rw-r--r--  io_uring/net.c        |  9
 -rw-r--r--  io_uring/rsrc.h       |  6
 -rw-r--r--  io_uring/rw.c         | 37
 -rw-r--r--  io_uring/timeout.c    | 13
 -rw-r--r--  io_uring/uring_cmd.c  | 28
 -rw-r--r--  io_uring/waitid.c     | 18
 10 files changed, 102 insertions, 75 deletions
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 3159a2b7eeca..43e2143255f5 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -338,7 +338,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
hlist_add_head(&req->hash_node, &ctx->futex_list);
io_ring_submit_unlock(ctx, issue_flags);
- futex_queue(&ifd->q, hb);
+ futex_queue(&ifd->q, hb, NULL);
return IOU_ISSUE_SKIP_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 5d0928f37471..91019b4d0308 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -64,7 +64,7 @@ struct io_worker {
union {
struct rcu_head rcu;
- struct work_struct work;
+ struct delayed_work work;
};
};
@@ -770,6 +770,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
}
}
+static void queue_create_worker_retry(struct io_worker *worker)
+{
+ /*
+ * We only bother retrying because there's a chance that the
+ * failure to create a worker is due to some temporary condition
+ * in the forking task (e.g. outstanding signal); give the task
+ * some time to clear that condition.
+ */
+ schedule_delayed_work(&worker->work,
+ msecs_to_jiffies(worker->init_retries * 5));
+}
+
static void create_worker_cont(struct callback_head *cb)
{
struct io_worker *worker;
@@ -809,12 +821,13 @@ static void create_worker_cont(struct callback_head *cb)
/* re-create attempts grab a new worker ref, drop the existing one */
io_worker_release(worker);
- schedule_work(&worker->work);
+ queue_create_worker_retry(worker);
}
static void io_workqueue_create(struct work_struct *work)
{
- struct io_worker *worker = container_of(work, struct io_worker, work);
+ struct io_worker *worker = container_of(work, struct io_worker,
+ work.work);
struct io_wq_acct *acct = io_wq_get_acct(worker);
if (!io_queue_worker_create(worker, acct, create_worker_cont))
@@ -855,8 +868,8 @@ fail:
kfree(worker);
goto fail;
} else {
- INIT_WORK(&worker->work, io_workqueue_create);
- schedule_work(&worker->work);
+ INIT_DELAYED_WORK(&worker->work, io_workqueue_create);
+ queue_create_worker_retry(worker);
}
return true;
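
The io-wq change above replaces an immediate schedule_work() retry with schedule_delayed_work() and a delay that grows with the number of failed creation attempts, per the comment in queue_create_worker_retry(). Below is a minimal userspace sketch of that backoff schedule, assuming init_retries counts up by one per failed attempt and that WORKER_INIT_LIMIT caps it as in io-wq.c; it only models the delay arithmetic, not the workqueue machinery.

	#include <stdio.h>

	#define WORKER_INIT_LIMIT 3	/* retry cap, mirroring io-wq.c */

	int main(void)
	{
		/* queue_create_worker_retry(): msecs_to_jiffies(init_retries * 5) */
		for (int init_retries = 0; init_retries <= WORKER_INIT_LIMIT; init_retries++)
			printf("attempt %d waits %d ms before retrying\n",
			       init_retries + 1, init_retries * 5);
		return 0;
	}
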
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ceacf6230e34..01d75e5c47aa 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2045,6 +2045,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->opcode = 0;
return io_init_fail_req(req, -EINVAL);
}
+ opcode = array_index_nospec(opcode, IORING_OP_LAST);
+
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
@@ -2421,7 +2423,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
goto out_wake;
}
- iowq->t.function = io_cqring_timer_wakeup;
+ hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
hrtimer_set_expires(timer, iowq->timeout);
return HRTIMER_RESTART;
out_wake:
@@ -3791,29 +3793,36 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
return io_uring_create(entries, &p, params);
}
-static inline bool io_uring_allowed(void)
+static inline int io_uring_allowed(void)
{
int disabled = READ_ONCE(sysctl_io_uring_disabled);
kgid_t io_uring_group;
if (disabled == 2)
- return false;
+ return -EPERM;
if (disabled == 0 || capable(CAP_SYS_ADMIN))
- return true;
+ goto allowed_lsm;
io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
if (!gid_valid(io_uring_group))
- return false;
+ return -EPERM;
- return in_group_p(io_uring_group);
+ if (!in_group_p(io_uring_group))
+ return -EPERM;
+
+allowed_lsm:
+ return security_uring_allowed();
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
struct io_uring_params __user *, params)
{
- if (!io_uring_allowed())
- return -EPERM;
+ int ret;
+
+ ret = io_uring_allowed();
+ if (ret)
+ return ret;
return io_uring_setup(entries, params);
}
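
Two things change in io_uring.c: the opcode is clamped with array_index_nospec() so a mispredicted bounds check cannot speculatively index past io_issue_defs[], and io_uring_allowed() now returns an errno and gives the LSM a final veto via security_uring_allowed() even when sysctl policy would allow the call. A userspace model of the reworked gating follows; the sysctl state and LSM hook are stubbed stand-ins, not kernel interfaces.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	static int sysctl_io_uring_disabled;	/* 0 = enabled, 1 = group-gated, 2 = disabled */
	static bool cap_sys_admin;
	static bool in_io_uring_group;
	static int lsm_verdict;			/* 0 or -EPERM, like security_uring_allowed() */

	static int io_uring_allowed(void)
	{
		if (sysctl_io_uring_disabled == 2)
			return -EPERM;
		if (sysctl_io_uring_disabled == 0 || cap_sys_admin)
			goto allowed_lsm;
		/* the kernel also validates the configured gid; omitted here */
		if (!in_io_uring_group)
			return -EPERM;
	allowed_lsm:
		/* sysctl policy passed; the LSM still gets the final say */
		return lsm_verdict;
	}

	int main(void)
	{
		sysctl_io_uring_disabled = 1;
		in_io_uring_group = true;
		lsm_verdict = -EPERM;
		printf("io_uring_allowed() = %d\n", io_uring_allowed());	/* -1 (-EPERM) */
		return 0;
	}
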
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 04bf493eecae..8e72de7712ac 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -415,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
+static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ scoped_guard(mutex, &ctx->mmap_lock)
+ WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl);
+ io_put_bl(ctx, bl);
+}
+
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
@@ -636,12 +643,13 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST;
- } else {
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
+ io_destroy_bl(ctx, bl);
}
+ free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
+
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
ring_size = flex_array_size(br, bufs, reg.ring_entries);
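
Rather than reusing a pre-existing (non-ring) buffer list object when registering a provided-buffer ring, the code now tears the old list down via the new io_destroy_bl() helper and always allocates a fresh one. A toy model of that replace-don't-reuse flow, using a plain array where the kernel uses an xarray under ctx->mmap_lock:

	#include <stdio.h>
	#include <stdlib.h>

	struct buf_list { unsigned bgid; int flags; };

	#define MAX_BGID 8
	static struct buf_list *table[MAX_BGID];

	static int register_ring(unsigned bgid, int flags)
	{
		struct buf_list *bl = table[bgid];

		if (bl) {
			if (bl->flags)		/* already a mapped ring: refuse */
				return -1 /* -EEXIST */;
			table[bgid] = NULL;	/* io_destroy_bl(): unlink + put */
			free(bl);
		}
		bl = calloc(1, sizeof(*bl));	/* always start from a clean object */
		if (!bl)
			return -2 /* -ENOMEM */;
		bl->bgid = bgid;
		bl->flags = flags;
		table[bgid] = bl;
		return 0;
	}

	int main(void)
	{
		printf("%d\n", register_ring(1, 1));	/* 0: fresh registration */
		printf("%d\n", register_ring(1, 1));	/* -1: ring already exists */
		return 0;
	}
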
diff --git a/io_uring/net.c b/io_uring/net.c
index 17852a6616ff..50e8a3ccc9de 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -148,7 +148,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
+ req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
}
}
@@ -322,7 +322,9 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
if (unlikely(ret))
return ret;
- return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
+ ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
+ sr->msg_control = iomsg->msg.msg_control_user;
+ return ret;
}
#endif
@@ -439,7 +441,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_req_msg_cleanup(struct io_kiocb *req,
unsigned int issue_flags)
{
- req->flags &= ~REQ_F_NEED_CLEANUP;
io_netmsg_recycle(req, issue_flags);
}
@@ -1439,6 +1440,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(zc->notif);
+ zc->notif = NULL;
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
@@ -1499,6 +1501,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(sr->notif);
+ sr->notif = NULL;
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
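
The net.c fixes tie async-data recycling to clearing REQ_F_NEED_CLEANUP, restore the user msg_control pointer on the compat path, and NULL the zerocopy notification pointer once it has been flushed inline. That last pattern is the classic flush-then-drop-reference handoff; a standalone model with toy stand-in structures:

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	struct notif { int flushed; };

	static void notif_flush(struct notif *n) { n->flushed++; }

	struct request { struct notif *notif; };

	static void complete_inline(struct request *req)
	{
		notif_flush(req->notif);
		req->notif = NULL;	/* ownership dropped, as in the diff above */
	}

	static void late_cleanup(struct request *req)
	{
		if (req->notif)		/* without the NULLing, this double-flushes */
			notif_flush(req->notif);
	}

	int main(void)
	{
		struct notif n = { 0 };
		struct request req = { &n };

		complete_inline(&req);
		late_cleanup(&req);
		assert(n.flushed == 1);
		printf("flushed %d time(s)\n", n.flushed);
		return 0;
	}
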
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 190f7ee45de9..89ea0135a1a0 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -4,12 +4,6 @@
#include <linux/lockdep.h>
-#define IO_NODE_ALLOC_CACHE_MAX 32
-
-#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
-#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
-#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 7aa1e4c9f64a..e5528cebcd06 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -23,6 +23,9 @@
#include "poll.h"
#include "rw.h"
+static void io_complete_rw(struct kiocb *kiocb, long res);
+static void io_complete_rw_iopoll(struct kiocb *kiocb, long res);
+
struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
@@ -289,6 +292,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
+ if (req->ctx->flags & IORING_SETUP_IOPOLL)
+ rw->kiocb.ki_complete = io_complete_rw_iopoll;
+ else
+ rw->kiocb.ki_complete = io_complete_rw;
+
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
@@ -552,19 +560,20 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ if (res == -EAGAIN && io_rw_should_reissue(req))
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
- return;
- }
- req->cqe.res = res;
+ else
+ req->cqe.res = res;
}
/* order with io_iopoll_complete() checking ->iopoll_completed */
smp_store_release(&req->iopoll_completed, 1);
}
-static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
+static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
/* IO was queued async, completion will happen later */
if (ret == -EIOCBQUEUED)
return;
@@ -586,8 +595,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
- INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
- io_complete_rw, kiocb, ret);
+ if (req->ctx->flags & IORING_SETUP_IOPOLL)
+ io_complete_rw_iopoll(&rw->kiocb, ret);
+ else
+ io_complete_rw(&rw->kiocb, ret);
}
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
@@ -598,7 +609,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
- if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
+ if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
__io_complete_rw_common(req, ret);
/*
* Safe to call io_end from here as we're inline
@@ -609,7 +620,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
} else {
- io_rw_done(&rw->kiocb, ret);
+ io_rw_done(req, ret);
}
return IOU_ISSUE_SKIP_COMPLETE;
@@ -813,10 +824,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
-
kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI;
- kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
/* make sure every req only blocks once*/
@@ -826,7 +835,6 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
}
if (req->flags & REQ_F_HAS_METADATA) {
@@ -904,7 +912,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
} else if (ret == -EIOCBQUEUED) {
return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
+ (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
+ (issue_flags & IO_URING_F_MULTISHOT)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@@ -977,6 +986,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
if (!io_file_can_poll(req))
return -EBADFD;
+ /* make it sync, multishot doesn't support async execution */
+ rw->kiocb.ki_complete = NULL;
ret = __io_read(req, issue_flags);
/*
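
rw.c stops dispatching completions through the ki_complete function pointer: the handler is picked at prep time from IORING_SETUP_IOPOLL, and io_rw_done() branches on the same flag directly instead of using INDIRECT_CALL_2() (multishot reads clear ki_complete to force synchronous completion). A userspace sketch of trading the indirect call for a flag-based direct call; types and flag values are illustrative stand-ins:

	#include <stdio.h>

	#define SETUP_IOPOLL (1u << 0)

	struct ctx { unsigned flags; };
	struct req { struct ctx *ctx; };

	static void complete_rw(struct req *r, long res)
	{
		printf("req %p inline complete: %ld\n", (void *)r, res);
	}

	static void complete_rw_iopoll(struct req *r, long res)
	{
		printf("req %p iopoll complete: %ld\n", (void *)r, res);
	}

	/* direct branch: cheaper than an indirect call under retpolines */
	static void rw_done(struct req *r, long ret)
	{
		if (r->ctx->flags & SETUP_IOPOLL)
			complete_rw_iopoll(r, ret);
		else
			complete_rw(r, ret);
	}

	int main(void)
	{
		struct ctx poll = { SETUP_IOPOLL }, plain = { 0 };
		struct req a = { &poll }, b = { &plain };

		rw_done(&a, 512);
		rw_done(&b, 512);
		return 0;
	}
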
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 48fc8cf70784..c5fb817b1e28 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -407,8 +407,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) == -1)
return -EALREADY;
- hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
- io->timer.function = io_link_timeout_fn;
+ hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode);
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
return 0;
}
@@ -430,8 +429,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
data->ts = *ts;
list_add_tail(&timeout->list, &ctx->timeout_list);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
- data->timer.function = io_timeout_fn;
+ hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode);
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
return 0;
}
@@ -557,7 +555,6 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL;
data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
if (is_timeout_link) {
struct io_submit_link *link = &req->ctx->submit_state.link;
@@ -568,6 +565,10 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL;
timeout->head = link->last;
link->last->flags |= REQ_F_ARM_LTIMEOUT;
+ hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data),
+ data->mode);
+ } else {
+ hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode);
}
return 0;
}
@@ -627,7 +628,6 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
}
add:
list_add(&timeout->list, entry);
- data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return IOU_ISSUE_SKIP_COMPLETE;
@@ -646,7 +646,6 @@ void io_queue_linked_timeout(struct io_kiocb *req)
if (timeout->head) {
struct io_timeout_data *data = req->async_data;
- data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
list_add_tail(&timeout->list, &ctx->ltimeout_list);
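
The timeout conversions fold hrtimer_init() plus a separate ->function assignment into a single hrtimer_setup() call, so the handler (io_timeout_fn vs. io_link_timeout_fn) is fixed at init time and hrtimer_start() can no longer run with an unset or stale callback. A toy userspace model of why one-step setup is safer; the struct and helper here are illustrative stand-ins, not the kernel hrtimer API:

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	struct timer { void (*fn)(struct timer *); int mode; };

	/* one-shot init: the callback is mandatory at setup time */
	static void timer_setup(struct timer *t, void (*fn)(struct timer *), int mode)
	{
		t->fn = fn;
		t->mode = mode;
	}

	static void fire(struct timer *t) { printf("timer %p fired\n", (void *)t); }

	int main(void)
	{
		struct timer t;

		timer_setup(&t, fire, 0);
		assert(t.fn != NULL);	/* two-step init could start before this held */
		t.fn(&t);
		return 0;
	}
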
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 1f6a82128b47..e6701b7aa147 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -54,9 +54,6 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
continue;
if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
- /* ->sqe isn't available if no async data */
- if (!req_has_async_data(req))
- cmd->sqe = NULL;
file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL |
IO_URING_F_COMPLETE_DEFER);
ret = true;
@@ -179,12 +176,13 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
return -ENOMEM;
cache->op_data = NULL;
- if (!(req->flags & REQ_F_FORCE_ASYNC)) {
- /* defer memcpy until we need it */
- ioucmd->sqe = sqe;
- return 0;
- }
-
+ /*
+ * Unconditionally cache the SQE for now - this is only needed for
+ * requests that go async, but prep handlers must ensure that any
+ * sqe data is stable beyond prep. Since uring_cmd is special in
+ * that it doesn't read in per-op data, play it safe and ensure that
+ * any SQE data is stable beyond prep. This can later get relaxed.
+ */
memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx));
ioucmd->sqe = cache->sqes;
return 0;
@@ -249,16 +247,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
- if (ret == -EAGAIN) {
- struct io_uring_cmd_data *cache = req->async_data;
-
- if (ioucmd->sqe != (void *) cache)
- memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx));
- return -EAGAIN;
- } else if (ret == -EIOCBQUEUED) {
- return -EIOCBQUEUED;
- }
-
+ if (ret == -EAGAIN || ret == -EIOCBQUEUED)
+ return ret;
if (ret < 0)
req_set_fail(req);
io_req_uring_cleanup(req, issue_flags);
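
As the new comment in io_uring_cmd_prep_setup() explains, uring_cmd now copies the SQE into the request's cache unconditionally at prep time, which lets the -EAGAIN path simply return instead of re-copying. A small standalone model of the underlying idea, that submission-ring memory is only stable until submission returns, so request-owned storage must capture it up front (sizes and types are illustrative):

	#include <stdio.h>
	#include <string.h>

	#define SQE_SIZE 64

	struct request { unsigned char sqe_copy[SQE_SIZE]; };

	static void prep(struct request *req, const unsigned char *ring_sqe)
	{
		/* unconditional copy: safe even if the request later goes async */
		memcpy(req->sqe_copy, ring_sqe, SQE_SIZE);
	}

	int main(void)
	{
		unsigned char ring[SQE_SIZE];
		struct request req;

		memset(ring, 0xab, sizeof(ring));
		prep(&req, ring);
		memset(ring, 0, sizeof(ring));		/* app reuses the ring slot... */
		printf("0x%02x\n", req.sqe_copy[0]);	/* ...request still sees 0xab */
		return 0;
	}
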
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 853e97a7b0ec..15a7daf3ff4f 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -118,7 +118,6 @@ static int io_waitid_finish(struct io_kiocb *req, int ret)
static void io_waitid_complete(struct io_kiocb *req, int ret)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
- struct io_tw_state ts = {};
/* anyone completing better be holding a reference */
WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
@@ -131,7 +130,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- io_req_task_complete(req, &ts);
}
static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
@@ -153,6 +151,7 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
list_del_init(&iwa->wo.child_wait.entry);
spin_unlock_irq(&iw->head->lock);
io_waitid_complete(req, -ECANCELED);
+ io_req_queue_tw_complete(req, -ECANCELED);
return true;
}
@@ -258,6 +257,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
}
io_waitid_complete(req, ret);
+ io_req_task_complete(req, ts);
}
static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
@@ -285,10 +285,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa;
if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
return -EINVAL;
+ iwa = io_uring_alloc_async_data(NULL, req);
+	if (unlikely(!iwa))
+ return -ENOMEM;
+ iwa->req = req;
+
iw->which = READ_ONCE(sqe->len);
iw->upid = READ_ONCE(sqe->fd);
iw->options = READ_ONCE(sqe->file_index);
@@ -299,16 +305,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
- struct io_waitid_async *iwa;
int ret;
- iwa = io_uring_alloc_async_data(NULL, req);
- if (!iwa)
- return -ENOMEM;
-
- iwa->req = req;
-
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
iw->options, NULL);
if (ret)
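
The waitid rework moves the io_waitid_async allocation from io_waitid() to io_waitid_prep(), so once prep succeeds the issue path has no allocation-failure mode left, and completion posting is split out of io_waitid_complete() so each caller queues it explicitly. A minimal userspace model of the allocate-at-prep pattern; the structures are toy stand-ins:

	#include <stdio.h>
	#include <stdlib.h>

	struct waitid_async { int which; };
	struct request { struct waitid_async *async_data; };

	static int prep(struct request *req, int which)
	{
		struct waitid_async *iwa = calloc(1, sizeof(*iwa));

		if (!iwa)
			return -1 /* -ENOMEM: reported before the op is armed */;
		iwa->which = which;
		req->async_data = iwa;
		return 0;
	}

	static int issue(struct request *req)
	{
		struct waitid_async *iwa = req->async_data;	/* cannot be NULL here */

		return iwa->which;
	}

	int main(void)
	{
		struct request req = { NULL };

		if (prep(&req, 42) == 0)
			printf("issued: %d\n", issue(&req));
		free(req.async_data);
		return 0;
	}
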