summaryrefslogtreecommitdiff
path: root/io_uring/net.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-13 12:48:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-13 12:48:06 -0700
commit9961a785944601e32f185ea696347b22ffda634c (patch)
tree24362a6360b2a1fa96c7eb562c6a95f96b37e459 /io_uring/net.c
parentf4e8d80292859809ea135e9f4c43bae47e4f58bc (diff)
parentdeb1e496a83557896fe0cca0b8af01c2a97c0dc6 (diff)
Merge tag 'for-6.10/io_uring-20240511' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Greatly improve send zerocopy performance, by enabling coalescing of sent buffers. MSG_ZEROCOPY already does this with send(2) and sendmsg(2), but the io_uring side did not. In local testing, the crossover point for send zerocopy being faster is now around 3000 byte packets, and it performs better than the sync syscall variants as well. This feature relies on a shared branch with net-next, which was pulled into both branches. - Unification of how async preparation is done across opcodes. Previously, opcodes that required extra memory for async retry would allocate that as needed, using on-stack state until that was the case. If async retry was needed, the on-stack state was adjusted appropriately for a retry and then copied to the allocated memory. This led to some fragile and ugly code, particularly for read/write handling, and made storage retries more difficult than they needed to be. Allocate the memory upfront, as it's cheap from our pools, and use that state consistently both initially and also from the retry side. - Move away from using remap_pfn_range() for mapping the rings. This is really not the right interface to use and can cause lifetime issues or leaks. Additionally, it means the ring sq/cq arrays need to be physically contigious, which can cause problems in production with larger rings when services are restarted, as memory can be very fragmented at that point. Move to using vm_insert_page(s) for the ring sq/cq arrays, and apply the same treatment to mapped ring provided buffers. This also helps unify the code we have dealing with allocating and mapping memory. Hard to see in the diffstat as we're adding a few features as well, but this kills about ~400 lines of code from the codebase as well. - Add support for bundles for send/recv. When used with provided buffers, bundles support sending or receiving more than one buffer at the time, improving the efficiency by only needing to call into the networking stack once for multiple sends or receives. - Tweaks for our accept operations, supporting both a DONTWAIT flag for skipping poll arm and retry if we can, and a POLLFIRST flag that the application can use to skip the initial accept attempt and rely purely on poll for triggering the operation. Both of these have identical flags on the receive side already. - Make the task_work ctx locking unconditional. We had various code paths here that would do a mix of lock/trylock and set the task_work state to whether or not it was locked. All of that goes away, we lock it unconditionally and get rid of the state flag indicating whether it's locked or not. The state struct still exists as an empty type, can go away in the future. - Add support for specifying NOP completion values, allowing it to be used for error handling testing. - Use set/test bit for io-wq worker flags. Not strictly needed, but also doesn't hurt and helps silence a KCSAN warning. - Cleanups for io-wq locking and work assignments, closing a tiny race where cancelations would not be able to find the work item reliably. - Misc fixes, cleanups, and improvements * tag 'for-6.10/io_uring-20240511' of git://git.kernel.dk/linux: (97 commits) io_uring: support to inject result for NOP io_uring: fail NOP if non-zero op flags is passed in io_uring/net: add IORING_ACCEPT_POLL_FIRST flag io_uring/net: add IORING_ACCEPT_DONTWAIT flag io_uring/filetable: don't unnecessarily clear/reset bitmap io_uring/io-wq: Use set_bit() and test_bit() at worker->flags io_uring/msg_ring: cleanup posting to IOPOLL vs !IOPOLL ring io_uring: Require zeroed sqe->len on provided-buffers send io_uring/notif: disable LAZY_WAKE for linked notifs io_uring/net: fix sendzc lazy wake polling io_uring/msg_ring: reuse ctx->submitter_task read using READ_ONCE instead of re-reading it io_uring/rw: reinstate thread check for retries io_uring/notif: implement notification stacking io_uring/notif: simplify io_notif_flush() net: add callback for setting a ubuf_info to skb net: extend ubuf_info callback to ops structure io_uring/net: support bundles for recv io_uring/net: support bundles for send io_uring/kbuf: add helpers for getting/peeking multiple buffers io_uring/net: add provided buffer support for IORING_OP_SEND ...
Diffstat (limited to 'io_uring/net.c')
-rw-r--r--io_uring/net.c852
1 files changed, 494 insertions, 358 deletions
diff --git a/io_uring/net.c b/io_uring/net.c
index 4afb475d4197..070dea9a4eda 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -28,6 +28,7 @@ struct io_accept {
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
+ int iou_flags;
u32 file_slot;
unsigned long nofile;
};
@@ -57,7 +58,7 @@ struct io_sr_msg {
struct user_msghdr __user *umsg;
void __user *buf;
};
- unsigned len;
+ int len;
unsigned done_io;
unsigned msg_flags;
unsigned nr_multishot_loops;
@@ -115,80 +116,85 @@ static bool io_net_retry(struct socket *sock, int flags)
return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}
+static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
+{
+ if (kmsg->free_iov) {
+ kfree(kmsg->free_iov);
+ kmsg->free_iov_nr = 0;
+ kmsg->free_iov = NULL;
+ }
+}
+
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr *hdr = req->async_data;
+ struct iovec *iov;
- if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
+ /* can't recycle, ensure we free the iovec if we have one */
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_netmsg_iovec_free(hdr);
return;
+ }
/* Let normal cleanup path reap it if we fail adding to the cache */
- if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
+ iov = hdr->free_iov;
+ if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
+ if (iov)
+ kasan_mempool_poison_object(iov);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
}
}
-static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
- unsigned int issue_flags)
+static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_cache_entry *entry;
struct io_async_msghdr *hdr;
- if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- entry = io_alloc_cache_get(&ctx->netmsg_cache);
- if (entry) {
- hdr = container_of(entry, struct io_async_msghdr, cache);
- hdr->free_iov = NULL;
- req->flags |= REQ_F_ASYNC_DATA;
- req->async_data = hdr;
- return hdr;
+ hdr = io_alloc_cache_get(&ctx->netmsg_cache);
+ if (hdr) {
+ if (hdr->free_iov) {
+ kasan_mempool_unpoison_object(hdr->free_iov,
+ hdr->free_iov_nr * sizeof(struct iovec));
+ req->flags |= REQ_F_NEED_CLEANUP;
}
+ req->flags |= REQ_F_ASYNC_DATA;
+ req->async_data = hdr;
+ return hdr;
}
if (!io_alloc_async_data(req)) {
hdr = req->async_data;
+ hdr->free_iov_nr = 0;
hdr->free_iov = NULL;
return hdr;
}
return NULL;
}
-static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
+/* assign new iovec to kmsg, if we need to */
+static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
+ struct iovec *iov)
{
- /* ->prep_async is always called from the submission context */
- return io_msg_alloc_async(req, 0);
+ if (iov) {
+ req->flags |= REQ_F_NEED_CLEANUP;
+ kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+ kmsg->free_iov = iov;
+ }
+ return 0;
}
-static int io_setup_async_msg(struct io_kiocb *req,
- struct io_async_msghdr *kmsg,
- unsigned int issue_flags)
+static inline void io_mshot_prep_retry(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg)
{
- struct io_async_msghdr *async_msg;
-
- if (req_has_async_data(req))
- return -EAGAIN;
- async_msg = io_msg_alloc_async(req, issue_flags);
- if (!async_msg) {
- kfree(kmsg->free_iov);
- return -ENOMEM;
- }
- req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(async_msg, kmsg, sizeof(*kmsg));
- if (async_msg->msg.msg_name)
- async_msg->msg.msg_name = &async_msg->addr;
-
- if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
- return -EAGAIN;
-
- /* if were using fast_iov, set it to the new one */
- if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
- size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
- async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
- }
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- return -EAGAIN;
+ req->flags &= ~REQ_F_BL_EMPTY;
+ sr->done_io = 0;
+ sr->len = 0; /* get from the provided buffer */
+ req->buf_index = sr->buf_group;
}
#ifdef CONFIG_COMPAT
@@ -198,7 +204,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct compat_iovec __user *uiov;
- int ret;
+ struct iovec *iov;
+ int ret, nr_segs;
+
+ if (iomsg->free_iov) {
+ nr_segs = iomsg->free_iov_nr;
+ iov = iomsg->free_iov;
+ } else {
+ iov = &iomsg->fast_iov;
+ nr_segs = 1;
+ }
if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
return -EFAULT;
@@ -207,9 +222,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
if (req->flags & REQ_F_BUFFER_SELECT) {
compat_ssize_t clen;
- iomsg->free_iov = NULL;
if (msg->msg_iovlen == 0) {
- sr->len = 0;
+ sr->len = iov->iov_len = 0;
+ iov->iov_base = NULL;
} else if (msg->msg_iovlen > 1) {
return -EINVAL;
} else {
@@ -225,14 +240,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
return 0;
}
- iomsg->free_iov = iomsg->fast_iov;
ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
+ nr_segs, &iov, &iomsg->msg.msg_iter, true);
if (unlikely(ret < 0))
return ret;
- return 0;
+ return io_net_vec_assign(req, iomsg, iov);
}
#endif
@@ -240,7 +253,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr *msg, int ddir)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- int ret;
+ struct iovec *iov;
+ int ret, nr_segs;
+
+ if (iomsg->free_iov) {
+ nr_segs = iomsg->free_iov_nr;
+ iov = iomsg->free_iov;
+ } else {
+ iov = &iomsg->fast_iov;
+ nr_segs = 1;
+ }
if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
return -EFAULT;
@@ -256,9 +278,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
if (req->flags & REQ_F_BUFFER_SELECT) {
if (msg->msg_iovlen == 0) {
- sr->len = iomsg->fast_iov[0].iov_len = 0;
- iomsg->fast_iov[0].iov_base = NULL;
- iomsg->free_iov = NULL;
+ sr->len = iov->iov_len = 0;
+ iov->iov_base = NULL;
} else if (msg->msg_iovlen > 1) {
ret = -EINVAL;
goto ua_end;
@@ -266,10 +287,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
/* we only need the length for provided buffers */
if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
goto ua_end;
- unsafe_get_user(iomsg->fast_iov[0].iov_len,
- &msg->msg_iov[0].iov_len, ua_end);
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
+ unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len,
+ ua_end);
+ sr->len = iov->iov_len;
}
ret = 0;
ua_end:
@@ -278,13 +298,12 @@ ua_end:
}
user_access_end();
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter, false);
+ ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
+ &iov, &iomsg->msg.msg_iter, false);
if (unlikely(ret < 0))
return ret;
- return 0;
+ return io_net_vec_assign(req, iomsg, iov);
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@@ -320,60 +339,58 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
return ret;
}
-int io_send_prep_async(struct io_kiocb *req)
+void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
- struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
- int ret;
+ struct io_async_msghdr *io = req->async_data;
- if (req_has_async_data(req))
- return 0;
- zc->done_io = 0;
- if (!zc->addr)
- return 0;
- io = io_msg_alloc_async_prep(req);
- if (!io)
- return -ENOMEM;
- ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
- return ret;
+ io_netmsg_iovec_free(io);
}
-static int io_setup_async_addr(struct io_kiocb *req,
- struct sockaddr_storage *addr_storage,
- unsigned int issue_flags)
+static int io_send_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
+ struct io_async_msghdr *kmsg = req->async_data;
+ int ret;
- if (!sr->addr || req_has_async_data(req))
- return -EAGAIN;
- io = io_msg_alloc_async(req, issue_flags);
- if (!io)
- return -ENOMEM;
- memcpy(&io->addr, addr_storage, sizeof(io->addr));
- return -EAGAIN;
+ kmsg->msg.msg_name = NULL;
+ kmsg->msg.msg_namelen = 0;
+ kmsg->msg.msg_control = NULL;
+ kmsg->msg.msg_controllen = 0;
+ kmsg->msg.msg_ubuf = NULL;
+
+ if (sr->addr) {
+ ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
+ if (unlikely(ret < 0))
+ return ret;
+ kmsg->msg.msg_name = &kmsg->addr;
+ kmsg->msg.msg_namelen = sr->addr_len;
+ }
+ if (!io_do_buffer_select(req)) {
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+ return 0;
}
-int io_sendmsg_prep_async(struct io_kiocb *req)
+static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg;
int ret;
- sr->done_io = 0;
- if (!io_msg_alloc_async_prep(req))
+ kmsg = io_msg_alloc_async(req);
+ if (unlikely(!kmsg))
return -ENOMEM;
- ret = io_sendmsg_copy_hdr(req, req->async_data);
+ if (!is_msg)
+ return io_send_setup(req);
+ ret = io_sendmsg_copy_hdr(req, kmsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
-{
- struct io_async_msghdr *io = req->async_data;
-
- kfree(io->free_iov);
-}
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -393,34 +410,114 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
- if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ if (sr->flags & ~SENDMSG_FLAGS)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ if (req->opcode == IORING_OP_SENDMSG)
+ return -EINVAL;
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+ sr->msg_flags |= MSG_WAITALL;
+ sr->buf_group = req->buf_index;
+ req->buf_list = NULL;
+ }
+ if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
+ return -EINVAL;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return 0;
+ return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
}
static void io_req_msg_cleanup(struct io_kiocb *req,
- struct io_async_msghdr *kmsg,
unsigned int issue_flags)
{
req->flags &= ~REQ_F_NEED_CLEANUP;
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
io_netmsg_recycle(req, issue_flags);
}
+/*
+ * For bundle completions, we need to figure out how many segments we consumed.
+ * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
+ * could be using an ITER_IOVEC. If the latter, then if we consumed all of
+ * the segments, then it's a trivial questiont o answer. If we have residual
+ * data in the iter, then loop the segments to figure out how much we
+ * transferred.
+ */
+static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
+{
+ struct iovec *iov;
+ int nbufs;
+
+ /* no data is always zero segments, and a ubuf is always 1 segment */
+ if (ret <= 0)
+ return 0;
+ if (iter_is_ubuf(&kmsg->msg.msg_iter))
+ return 1;
+
+ iov = kmsg->free_iov;
+ if (!iov)
+ iov = &kmsg->fast_iov;
+
+ /* if all data was transferred, it's basic pointer math */
+ if (!iov_iter_count(&kmsg->msg.msg_iter))
+ return iter_iov(&kmsg->msg.msg_iter) - iov;
+
+ /* short transfer, count segments */
+ nbufs = 0;
+ do {
+ int this_len = min_t(int, iov[nbufs].iov_len, ret);
+
+ nbufs++;
+ ret -= this_len;
+ } while (ret);
+
+ return nbufs;
+}
+
+static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+ struct io_async_msghdr *kmsg,
+ unsigned issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ bool bundle_finished = *ret <= 0;
+ unsigned int cflags;
+
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
+ cflags = io_put_kbuf(req, issue_flags);
+ goto finish;
+ }
+
+ cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+
+ if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+
+ /*
+ * Fill CQE for this receive and see if we should keep trying to
+ * receive from this socket.
+ */
+ if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
+ io_mshot_prep_retry(req, kmsg);
+ return false;
+ }
+
+ /* Otherwise stop bundle and use the current result. */
+finish:
+ io_req_set_res(req, *ret, cflags);
+ *ret = IOU_OK;
+ return true;
+}
+
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -430,19 +527,9 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- kmsg->msg.msg_control_user = sr->msg_control;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
@@ -450,23 +537,25 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+ kmsg->msg.msg_control_user = sr->msg_control;
+
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_control = NULL;
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
- io_req_msg_cleanup(req, kmsg, issue_flags);
+ io_req_msg_cleanup(req, issue_flags);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
@@ -477,65 +566,77 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
- struct sockaddr_storage __address;
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int min_ret = 0;
int ret;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
- msg.msg_ubuf = NULL;
-
- if (sr->addr) {
- if (req_has_async_data(req)) {
- struct io_async_msghdr *io = req->async_data;
-
- msg.msg_name = &io->addr;
- } else {
- ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
- if (unlikely(ret < 0))
- return ret;
- msg.msg_name = (struct sockaddr *)&__address;
- }
- msg.msg_namelen = sr->addr_len;
- }
-
- if (!(req->flags & REQ_F_POLLED) &&
- (sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_addr(req, &__address, issue_flags);
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
- if (unlikely(ret))
- return ret;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+
+retry_bundle:
+ if (io_do_buffer_select(req)) {
+ struct buf_sel_arg arg = {
+ .iovs = &kmsg->fast_iov,
+ .max_len = INT_MAX,
+ .nr_iovs = 1,
+ .mode = KBUF_MODE_EXPAND,
+ };
+
+ if (kmsg->free_iov) {
+ arg.nr_iovs = kmsg->free_iov_nr;
+ arg.iovs = kmsg->free_iov;
+ arg.mode |= KBUF_MODE_FREE;
+ }
+
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE))
+ arg.nr_iovs = 1;
+
+ ret = io_buffers_select(req, &arg, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+
+ sr->len = arg.out_len;
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
+ arg.out_len);
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+ kmsg->free_iov_nr = ret;
+ kmsg->free_iov = arg.iovs;
+ }
+ }
+
+ /*
+ * If MSG_WAITALL is set, or this is a bundle send, then we need
+ * the full amount. If just bundle is set, if we do a short send
+ * then we complete the bundle sequence rather than continue on.
+ */
+ if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
+ kmsg->msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &kmsg->msg);
if (ret < min_ret) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
sr->buf += ret;
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -545,8 +646,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
- io_req_set_res(req, ret, 0);
- return IOU_OK;
+
+ if (!io_send_finish(req, &ret, kmsg, issue_flags))
+ goto retry_bundle;
+
+ io_req_msg_cleanup(req, issue_flags);
+ return ret;
}
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
@@ -611,23 +716,42 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
msg.msg_controllen);
}
-int io_recvmsg_prep_async(struct io_kiocb *req)
+static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *iomsg;
+ struct io_async_msghdr *kmsg;
int ret;
- sr->done_io = 0;
- if (!io_msg_alloc_async_prep(req))
+ kmsg = io_msg_alloc_async(req);
+ if (unlikely(!kmsg))
return -ENOMEM;
- iomsg = req->async_data;
- ret = io_recvmsg_copy_hdr(req, iomsg);
+
+ if (req->opcode == IORING_OP_RECV) {
+ kmsg->msg.msg_name = NULL;
+ kmsg->msg.msg_namelen = 0;
+ kmsg->msg.msg_control = NULL;
+ kmsg->msg.msg_get_inq = 1;
+ kmsg->msg.msg_controllen = 0;
+ kmsg->msg.msg_iocb = NULL;
+ kmsg->msg.msg_ubuf = NULL;
+
+ if (!io_do_buffer_select(req)) {
+ ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
+ }
+
+ ret = io_recvmsg_copy_hdr(req, kmsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
return ret;
}
-#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
+#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
+ IORING_RECVSEND_BUNDLE)
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
@@ -641,21 +765,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
- if (sr->flags & ~(RECVMSG_FLAGS))
+ if (sr->flags & ~RECVMSG_FLAGS)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (sr->msg_flags & MSG_ERRQUEUE)
req->flags |= REQ_F_CLEAR_POLLIN;
- if (sr->flags & IORING_RECV_MULTISHOT) {
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
- if (sr->msg_flags & MSG_WAITALL)
- return -EINVAL;
- if (req->opcode == IORING_OP_RECV && sr->len)
- return -EINVAL;
- req->flags |= REQ_F_APOLL_MULTISHOT;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
/*
* Store the buffer group for this multishot receive separately,
* as if we end up doing an io-wq based issue that selects a
@@ -665,6 +782,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
* restore it.
*/
sr->buf_group = req->buf_index;
+ req->buf_list = NULL;
+ }
+ if (sr->flags & IORING_RECV_MULTISHOT) {
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+ if (sr->msg_flags & MSG_WAITALL)
+ return -EINVAL;
+ if (req->opcode == IORING_OP_RECV && sr->len)
+ return -EINVAL;
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ }
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ if (req->opcode == IORING_OP_RECVMSG)
+ return -EINVAL;
}
#ifdef CONFIG_COMPAT
@@ -672,17 +803,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
sr->nr_multishot_loops = 0;
- return 0;
-}
-
-static inline void io_recv_prep_retry(struct io_kiocb *req)
-{
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-
- req->flags &= ~REQ_F_BL_EMPTY;
- sr->done_io = 0;
- sr->len = 0; /* get from the provided buffer */
- req->buf_index = sr->buf_group;
+ return io_recvmsg_prep_setup(req);
}
/*
@@ -692,28 +813,36 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
* again (for multishot).
*/
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
- struct msghdr *msg, bool mshot_finished,
- unsigned issue_flags)
+ struct io_async_msghdr *kmsg,
+ bool mshot_finished, unsigned issue_flags)
{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
unsigned int cflags;
- cflags = io_put_kbuf(req, issue_flags);
- if (msg->msg_inq > 0)
+ if (sr->flags & IORING_RECVSEND_BUNDLE)
+ cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+ issue_flags);
+ else
+ cflags = io_put_kbuf(req, issue_flags);
+
+ if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+ /* bundle with no more immediate buffers, we're done */
+ if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+
/*
* Fill CQE for this receive and see if we should keep trying to
* receive from this socket.
*/
if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
- io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- *ret, cflags | IORING_CQE_F_MORE)) {
- struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
- io_recv_prep_retry(req);
+ io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
- if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) {
+ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
return false;
/* mshot retries exceeded, force a requeue */
@@ -728,12 +857,14 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
}
/* Finish the request / stop multishot. */
+finish:
io_req_set_res(req, *ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_STOP_MULTISHOT;
else
*ret = IOU_OK;
+ io_req_msg_cleanup(req, issue_flags);
return true;
}
@@ -824,7 +955,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
@@ -835,18 +966,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_recvmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
flags = sr->msg_flags;
if (force_nonblock)
@@ -888,17 +1010,16 @@ retry_multishot:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- ret = io_setup_async_msg(req, kmsg, issue_flags);
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
+ if (issue_flags & IO_URING_F_MULTISHOT) {
io_kbuf_recycle(req, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
- return ret;
+ return -EAGAIN;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -914,21 +1035,79 @@ retry_multishot:
else
io_kbuf_recycle(req, issue_flags);
- if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
+ if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
goto retry_multishot;
- if (mshot_finished)
- io_req_msg_cleanup(req, kmsg, issue_flags);
- else if (ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg, issue_flags);
-
return ret;
}
+static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
+ size_t *len, unsigned int issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ /*
+ * If the ring isn't locked, then don't use the peek interface
+ * to grab multiple buffers as we will lock/unlock between
+ * this selection and posting the buffers.
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ sr->flags & IORING_RECVSEND_BUNDLE) {
+ struct buf_sel_arg arg = {
+ .iovs = &kmsg->fast_iov,
+ .nr_iovs = 1,
+ .mode = KBUF_MODE_EXPAND,
+ };
+
+ if (kmsg->free_iov) {
+ arg.nr_iovs = kmsg->free_iov_nr;
+ arg.iovs = kmsg->free_iov;
+ arg.mode |= KBUF_MODE_FREE;
+ }
+
+ if (kmsg->msg.msg_inq > 0)
+ arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+
+ ret = io_buffers_peek(req, &arg);
+ if (unlikely(ret < 0))
+ return ret;
+
+ /* special case 1 vec, can be a fast path */
+ if (ret == 1) {
+ sr->buf = arg.iovs[0].iov_base;
+ sr->len = arg.iovs[0].iov_len;
+ goto map_ubuf;
+ }
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
+ arg.out_len);
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+ kmsg->free_iov_nr = ret;
+ kmsg->free_iov = arg.iovs;
+ }
+ } else {
+ void __user *buf;
+
+ *len = sr->len;
+ buf = io_buffer_select(req, len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ sr->buf = buf;
+ sr->len = *len;
+map_ubuf:
+ ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
+}
+
int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
@@ -943,40 +1122,25 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_get_inq = 1;
- msg.msg_controllen = 0;
- msg.msg_iocb = NULL;
- msg.msg_ubuf = NULL;
-
flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
retry_multishot:
if (io_do_buffer_select(req)) {
- void __user *buf;
-
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- sr->buf = buf;
- sr->len = len;
+ ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
+ if (unlikely(ret))
+ goto out_free;
+ sr->buf = NULL;
}
- ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
- if (unlikely(ret))
- goto out_free;
-
- msg.msg_inq = -1;
- msg.msg_flags = 0;
+ kmsg->msg.msg_inq = -1;
+ kmsg->msg.msg_flags = 0;
if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- ret = sock_recvmsg(sock, &msg, flags);
+ ret = sock_recvmsg(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
if (issue_flags & IO_URING_F_MULTISHOT) {
@@ -996,7 +1160,7 @@ retry_multishot:
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
@@ -1008,7 +1172,7 @@ out_free:
else
io_kbuf_recycle(req, issue_flags);
- if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
+ if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags))
goto retry_multishot;
return ret;
@@ -1017,14 +1181,10 @@ out_free:
void io_send_zc_cleanup(struct io_kiocb *req)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr *io;
+ struct io_async_msghdr *io = req->async_data;
- if (req_has_async_data(req)) {
- io = req->async_data;
- /* might be ->fast_iov if *msg_copy_hdr failed */
- if (io->free_iov != io->fast_iov)
- kfree(io->free_iov);
- }
+ if (req_has_async_data(req))
+ io_netmsg_iovec_free(io);
if (zc->notif) {
io_notif_flush(zc->notif);
zc->notif = NULL;
@@ -1041,6 +1201,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *notif;
zc->done_io = 0;
+ req->flags |= REQ_F_POLL_NO_LAZY;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
@@ -1061,8 +1222,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (zc->flags & ~IO_ZC_FLAGS_VALID)
return -EINVAL;
if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
- io_notif_set_extended(notif);
- io_notif_to_data(notif)->zc_report = true;
+ struct io_notif_data *nd = io_notif_to_data(notif);
+
+ nd->zc_report = true;
+ nd->zc_used = false;
+ nd->zc_copied = false;
}
}
@@ -1090,7 +1254,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
zc->len = READ_ONCE(sqe->len);
- zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -1098,7 +1262,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return 0;
+ return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
}
static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
@@ -1159,11 +1323,34 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
return ret;
}
+static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ int ret;
+
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
+ ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
+ (u64)(uintptr_t)sr->buf, sr->len);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter;
+ } else {
+ ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+ ret = io_notif_account_mem(sr->notif, sr->len);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+ }
+
+ return ret;
+}
+
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
- struct sockaddr_storage __address;
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct msghdr msg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned msg_flags;
int ret, min_ret = 0;
@@ -1174,67 +1361,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
-
- if (zc->addr) {
- if (req_has_async_data(req)) {
- struct io_async_msghdr *io = req->async_data;
-
- msg.msg_name = &io->addr;
- } else {
- ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
- if (unlikely(ret < 0))
- return ret;
- msg.msg_name = (struct sockaddr *)&__address;
- }
- msg.msg_namelen = zc->addr_len;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(zc->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
- if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
- ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
- if (unlikely(ret))
- return ret;
- msg.sg_from_iter = io_sg_from_iter;
- } else {
- io_notif_set_extended(zc->notif);
- ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
+ if (!zc->done_io) {
+ ret = io_send_zc_import(req, kmsg);
if (unlikely(ret))
return ret;
- ret = io_notif_account_mem(zc->notif, zc->len);
- if (unlikely(ret))
- return ret;
- msg.sg_from_iter = io_sg_from_iter_iovec;
}
- msg_flags = zc->msg_flags | MSG_ZEROCOPY;
+ msg_flags = zc->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
- msg.msg_flags = msg_flags;
- msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
- ret = sock_sendmsg(sock, &msg);
+ kmsg->msg.msg_flags = msg_flags;
+ kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
+ ret = sock_sendmsg(sock, &kmsg->msg);
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
- if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
+ if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
zc->len -= ret;
zc->buf += ret;
zc->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_addr(req, &__address, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1252,7 +1409,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(zc->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
@@ -1261,63 +1418,46 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- struct io_async_msghdr iomsg, *kmsg;
+ struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
unsigned flags;
int ret, min_ret = 0;
- io_notif_set_extended(sr->notif);
-
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
return -EOPNOTSUPP;
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- kmsg->msg.msg_control_user = sr->msg_control;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
if (!(req->flags & REQ_F_POLLED) &&
(sr->flags & IORING_RECVSEND_POLL_FIRST))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
- flags = sr->msg_flags | MSG_ZEROCOPY;
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+ kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (unlikely(ret < min_ret)) {
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return io_setup_async_msg(req, kmsg, issue_flags);
+ return -EAGAIN;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
}
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov) {
- kfree(kmsg->free_iov);
- kmsg->free_iov = NULL;
- }
- io_netmsg_recycle(req, issue_flags);
if (ret >= 0)
ret += sr->done_io;
else if (sr->done_io)
@@ -1329,7 +1469,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(sr->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
@@ -1347,10 +1487,12 @@ void io_sendrecv_fail(struct io_kiocb *req)
req->cqe.flags |= IORING_CQE_F_MORE;
}
+#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
+ IORING_ACCEPT_POLL_FIRST)
+
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
- unsigned flags;
if (sqe->len || sqe->buf_index)
return -EINVAL;
@@ -1359,15 +1501,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
- flags = READ_ONCE(sqe->ioprio);
- if (flags & ~IORING_ACCEPT_MULTISHOT)
+ accept->iou_flags = READ_ONCE(sqe->ioprio);
+ if (accept->iou_flags & ~ACCEPT_FLAGS)
return -EINVAL;
accept->file_slot = READ_ONCE(sqe->file_index);
if (accept->file_slot) {
if (accept->flags & SOCK_CLOEXEC)
return -EINVAL;
- if (flags & IORING_ACCEPT_MULTISHOT &&
+ if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
accept->file_slot != IORING_FILE_INDEX_ALLOC)
return -EINVAL;
}
@@ -1375,8 +1517,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- if (flags & IORING_ACCEPT_MULTISHOT)
+ if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
req->flags |= REQ_F_APOLL_MULTISHOT;
+ if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
return 0;
}
@@ -1389,6 +1533,10 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
+ if (!(req->flags & REQ_F_POLLED) &&
+ accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
+ return -EAGAIN;
+
retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
@@ -1401,7 +1549,8 @@ retry:
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock) {
+ if (ret == -EAGAIN && force_nonblock &&
+ !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
/*
* if it's multishot and polled, we don't need to
* return EAGAIN to arm the poll infra since it
@@ -1429,8 +1578,7 @@ retry:
if (ret < 0)
return ret;
- if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
- ret, IORING_CQE_F_MORE))
+ if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE))
goto retry;
io_req_set_res(req, ret, 0);
@@ -1491,17 +1639,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-int io_connect_prep_async(struct io_kiocb *req)
-{
- struct io_async_connect *io = req->async_data;
- struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
-
- return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
-}
-
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
+ struct io_async_msghdr *io;
if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
@@ -1509,32 +1650,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
conn->in_progress = conn->seen_econnaborted = false;
- return 0;
+
+ io = io_msg_alloc_async(req);
+ if (unlikely(!io))
+ return -ENOMEM;
+
+ return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}
int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
- struct io_async_connect __io, *io;
+ struct io_async_msghdr *io = req->async_data;
unsigned file_flags;
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (req_has_async_data(req)) {
- io = req->async_data;
- } else {
- ret = move_addr_to_kernel(connect->addr,
- connect->addr_len,
- &__io.address);
- if (ret)
- goto out;
- io = &__io;
- }
-
file_flags = force_nonblock ? O_NONBLOCK : 0;
- ret = __sys_connect_file(req->file, &io->address,
- connect->addr_len, file_flags);
+ ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
+ file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
&& force_nonblock) {
if (ret == -EINPROGRESS) {
@@ -1544,13 +1679,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
goto out;
connect->seen_econnaborted = true;
}
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (connect->in_progress) {
@@ -1568,12 +1696,20 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
out:
if (ret < 0)
req_set_fail(req);
+ io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
-void io_netmsg_cache_free(struct io_cache_entry *entry)
+void io_netmsg_cache_free(const void *entry)
{
- kfree(container_of(entry, struct io_async_msghdr, cache));
+ struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
+
+ if (kmsg->free_iov) {
+ kasan_mempool_unpoison_object(kmsg->free_iov,
+ kmsg->free_iov_nr * sizeof(struct iovec));
+ io_netmsg_iovec_free(kmsg);
+ }
+ kfree(kmsg);
}
#endif