summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
authorPavel Begunkov <asml.silence@gmail.com>2022-09-21 12:17:54 +0100
committerJens Axboe <axboe@kernel.dk>2022-09-21 13:15:02 -0600
commit493108d95f1464ccd101d4e5cfa7e93f1fc64d47 (patch)
tree1fe9fd8773f4c8a351a20f3cdc796af6f5a074c3 /io_uring
parentc4c0009e0b56ef9920020bcade1e45be52653bae (diff)
io_uring/net: zerocopy sendmsg
Add a zerocopy version of sendmsg. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> Link: https://lore.kernel.org/r/6aabc4bdfc0ec78df6ec9328137e394af9d4e7ef.1663668091.git.asml.silence@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/net.c91
-rw-r--r--io_uring/net.h1
-rw-r--r--io_uring/opdef.c19
3 files changed, 106 insertions, 5 deletions
diff --git a/io_uring/net.c b/io_uring/net.c
index 209bc69b3707..757a300578f4 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -909,7 +909,12 @@ out_free:
void io_send_zc_cleanup(struct io_kiocb *req)
{
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *io;
+ if (req_has_async_data(req)) {
+ io = req->async_data;
+ kfree(io->free_iov);
+ }
zc->notif->flags |= REQ_F_CQE_SKIP;
io_notif_flush(zc->notif);
zc->notif = NULL;
@@ -921,8 +926,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *notif;
- if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) ||
- READ_ONCE(sqe->__pad3[0]))
+ if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
if (req->flags & REQ_F_CQE_SKIP)
@@ -949,14 +953,24 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
io_req_set_rsrc_node(notif, ctx, 0);
}
+ if (req->opcode == IORING_OP_SEND_ZC) {
+ if (READ_ONCE(sqe->__pad3[0]))
+ return -EINVAL;
+ zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ zc->addr_len = READ_ONCE(sqe->addr_len);
+ } else {
+ if (unlikely(sqe->addr2 || sqe->file_index))
+ return -EINVAL;
+ if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
+ return -EINVAL;
+ }
+
zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
zc->len = READ_ONCE(sqe->len);
zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
- zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- zc->addr_len = READ_ONCE(sqe->addr_len);
zc->done_io = 0;
#ifdef CONFIG_COMPAT
@@ -1118,6 +1132,73 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
+int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr iomsg, *kmsg;
+ struct socket *sock;
+ unsigned flags, cflags;
+ int ret, min_ret = 0;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
+ ret = io_sendmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
+
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg, issue_flags);
+
+ flags = sr->msg_flags | MSG_ZEROCOPY;
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ if (flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
+ kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
+ kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
+
+ if (unlikely(ret < min_ret)) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return io_setup_async_msg(req, kmsg, issue_flags);
+
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg, issue_flags);
+ }
+ if (ret < 0 && !sr->done_io)
+ sr->notif->flags |= REQ_F_CQE_SKIP;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ }
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+
+ io_netmsg_recycle(req, issue_flags);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+
+ io_notif_flush(sr->notif);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ cflags = ret >= 0 ? IORING_CQE_F_MORE : 0;
+ io_req_set_res(req, ret, cflags);
+ return IOU_OK;
+}
+
void io_sendrecv_fail(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -1127,7 +1208,7 @@ void io_sendrecv_fail(struct io_kiocb *req)
if (req->flags & REQ_F_PARTIAL_IO)
res = sr->done_io;
if ((req->flags & REQ_F_NEED_CLEANUP) &&
- req->opcode == IORING_OP_SEND_ZC) {
+ (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) {
/* preserve notification for partial I/O */
if (res < 0)
sr->notif->flags |= REQ_F_CQE_SKIP;
diff --git a/io_uring/net.h b/io_uring/net.h
index 45558e2b0a83..5ffa11bf5d2e 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -57,6 +57,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags);
+int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 0fdeb1bc21de..2330f6da791e 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -505,6 +505,25 @@ const struct io_op_def io_op_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_SENDMSG_ZC] = {
+ .name = "SENDMSG_ZC",
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollout = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .manual_alloc = 1,
+#if defined(CONFIG_NET)
+ .async_size = sizeof(struct io_async_msghdr),
+ .prep = io_send_zc_prep,
+ .issue = io_sendmsg_zc,
+ .prep_async = io_sendmsg_prep_async,
+ .cleanup = io_send_zc_cleanup,
+ .fail = io_sendrecv_fail,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
};
const char *io_uring_get_opcode(u8 opcode)