summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/netlink/specs/netdev.yaml19
-rw-r--r--Documentation/networking/index.rst1
-rw-r--r--Documentation/networking/xdp-rx-metadata.rst2
-rw-r--r--Documentation/networking/xsk-tx-metadata.rst79
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h4
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c72
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h11
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c17
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c1
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac.h12
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c64
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--include/linux/skbuff.h14
-rw-r--r--include/net/xdp_sock.h111
-rw-r--r--include/net/xdp_sock_drv.h34
-rw-r--r--include/net/xsk_buff_pool.h8
-rw-r--r--include/uapi/linux/if_xdp.h47
-rw-r--r--include/uapi/linux/netdev.h16
-rw-r--r--net/core/netdev-genl.c13
-rw-r--r--net/xdp/xdp_umem.c11
-rw-r--r--net/xdp/xsk.c56
-rw-r--r--net/xdp/xsk_buff_pool.c2
-rw-r--r--net/xdp/xsk_queue.h19
-rw-r--r--tools/include/uapi/linux/if_xdp.h61
-rw-r--r--tools/include/uapi/linux/netdev.h16
-rw-r--r--tools/net/ynl/generated/netdev-user.c19
-rw-r--r--tools/net/ynl/generated/netdev-user.h3
-rw-r--r--tools/net/ynl/samples/netdev.c10
-rw-r--r--tools/testing/selftests/bpf/network_helpers.h43
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_metadata.c33
-rw-r--r--tools/testing/selftests/bpf/xdp_hw_metadata.c235
-rw-r--r--tools/testing/selftests/bpf/xsk.c3
-rw-r--r--tools/testing/selftests/bpf/xsk.h1
33 files changed, 969 insertions, 70 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 14511b13f305..00439bcbd2e3 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -45,7 +45,6 @@ definitions:
-
type: flags
name: xdp-rx-metadata
- render-max: true
entries:
-
name: timestamp
@@ -55,6 +54,18 @@ definitions:
name: hash
doc:
Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
+ -
+ type: flags
+ name: xsk-flags
+ entries:
+ -
+ name: tx-timestamp
+ doc:
+ HW timestamping egress packets is supported by the driver.
+ -
+ name: tx-checksum
+ doc:
+ L3 checksum HW offload is supported by the driver.
attribute-sets:
-
@@ -86,6 +97,11 @@ attribute-sets:
See Documentation/networking/xdp-rx-metadata.rst for more details.
type: u64
enum: xdp-rx-metadata
+ -
+ name: xsk-features
+ doc: Bitmask of enabled AF_XDP features.
+ type: u64
+ enum: xsk-flags
operations:
list:
@@ -103,6 +119,7 @@ operations:
- xdp-features
- xdp-zc-max-segs
- xdp-rx-metadata-features
+ - xsk-features
dump:
reply: *dev-all
-
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 683eb42309cc..a297a894b366 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -123,6 +123,7 @@ Contents:
xfrm_sync
xfrm_sysctl
xdp-rx-metadata
+ xsk-tx-metadata
.. only:: subproject and html
diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
index 205696780b78..e3e9420fd817 100644
--- a/Documentation/networking/xdp-rx-metadata.rst
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
===============
XDP RX Metadata
===============
diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst
new file mode 100644
index 000000000000..97ecfa480d00
--- /dev/null
+++ b/Documentation/networking/xsk-tx-metadata.rst
@@ -0,0 +1,79 @@
+==================
+AF_XDP TX Metadata
+==================
+
+This document describes how to enable offloads when transmitting packets
+via :doc:`af_xdp`. Refer to :doc:`xdp-rx-metadata` on how to access similar
+metadata on the receive side.
+
+General Design
+==============
+
+The headroom for the metadata is reserved via ``tx_metadata_len`` in
+``struct xdp_umem_reg``. The metadata length is therefore the same for
+every socket that shares the same umem. The metadata layout is a fixed UAPI,
+refer to ``union xsk_tx_metadata`` in ``include/uapi/linux/if_xdp.h``.
+Thus, generally, the ``tx_metadata_len`` field above should contain
+``sizeof(union xsk_tx_metadata)``.
+
+The headroom and the metadata itself should be located right before
+``xdp_desc->addr`` in the umem frame. Within a frame, the metadata
+layout is as follows::
+
+ tx_metadata_len
+ / \
+ +-----------------+---------+----------------------------+
+ | xsk_tx_metadata | padding | payload |
+ +-----------------+---------+----------------------------+
+ ^
+ |
+ xdp_desc->addr
+
+An AF_XDP application can request headrooms larger than ``sizeof(struct
+xsk_tx_metadata)``. The kernel will ignore the padding (and will still
+use ``xdp_desc->addr - tx_metadata_len`` to locate
+the ``xsk_tx_metadata``). For the frames that shouldn't carry
+any metadata (i.e., the ones that don't have ``XDP_TX_METADATA`` option),
+the metadata area is ignored by the kernel as well.
+
+The flags field enables the particular offload:
+
+- ``XDP_TXMD_FLAGS_TIMESTAMP``: requests the device to put transmission
+ timestamp into ``tx_timestamp`` field of ``union xsk_tx_metadata``.
+- ``XDP_TXMD_FLAGS_CHECKSUM``: requests the device to calculate L4
+ checksum. ``csum_start`` specifies byte offset of where the checksumming
+ should start and ``csum_offset`` specifies byte offset where the
+ device should store the computed checksum.
+
+Besides the flags above, in order to trigger the offloads, the first
+packet's ``struct xdp_desc`` descriptor should set ``XDP_TX_METADATA``
+bit in the ``options`` field. Also note that in a multi-buffer packet
+only the first chunk should carry the metadata.
+
+Software TX Checksum
+====================
+
+For development and testing purposes its possible to pass
+``XDP_UMEM_TX_SW_CSUM`` flag to ``XDP_UMEM_REG`` UMEM registration call.
+In this case, when running in ``XDK_COPY`` mode, the TX checksum
+is calculated on the CPU. Do not enable this option in production because
+it will negatively affect performance.
+
+Querying Device Capabilities
+============================
+
+Every devices exports its offloads capabilities via netlink netdev family.
+Refer to ``xsk-flags`` features bitmask in
+``Documentation/netlink/specs/netdev.yaml``.
+
+- ``tx-timestamp``: device supports ``XDP_TXMD_FLAGS_TIMESTAMP``
+- ``tx-checksum``: device supports ``XDP_TXMD_FLAGS_CHECKSUM``
+
+See ``tools/net/ynl/samples/netdev.c`` on how to query this information.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/xdp_hw_metadata.c`` for an example
+program that handles TX metadata. Also see https://github.com/fomichev/xskgen
+for a more bare-bones example.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index b2a5da9739d2..43f027bf2da3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -484,10 +484,12 @@ struct mlx5e_xdp_info_fifo {
struct mlx5e_xdpsq;
struct mlx5e_xmit_data;
+struct xsk_tx_metadata;
typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *);
typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *,
struct mlx5e_xmit_data *,
- int);
+ int,
+ struct xsk_tx_metadata *);
struct mlx5e_xdpsq {
/* data path */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 7decc81ed33a..e2e7d82cfca4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -103,7 +103,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd->dma_addr = dma_addr;
if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0)))
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL)))
return false;
/* xmit_mode == MLX5E_XDP_XMIT_MODE_FRAME */
@@ -145,7 +145,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd->dma_addr = dma_addr;
if (unlikely(!INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0)))
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL)))
return false;
/* xmit_mode == MLX5E_XDP_XMIT_MODE_PAGE */
@@ -261,6 +261,37 @@ const struct xdp_metadata_ops mlx5e_xdp_metadata_ops = {
.xmo_rx_hash = mlx5e_xdp_rx_hash,
};
+struct mlx5e_xsk_tx_complete {
+ struct mlx5_cqe64 *cqe;
+ struct mlx5e_cq *cq;
+};
+
+static u64 mlx5e_xsk_fill_timestamp(void *_priv)
+{
+ struct mlx5e_xsk_tx_complete *priv = _priv;
+ u64 ts;
+
+ ts = get_cqe_ts(priv->cqe);
+
+ if (mlx5_is_real_time_rq(priv->cq->mdev) || mlx5_is_real_time_sq(priv->cq->mdev))
+ return mlx5_real_time_cyc2time(&priv->cq->mdev->clock, ts);
+
+ return mlx5_timecounter_cyc2time(&priv->cq->mdev->clock, ts);
+}
+
+static void mlx5e_xsk_request_checksum(u16 csum_start, u16 csum_offset, void *priv)
+{
+ struct mlx5_wqe_eth_seg *eseg = priv;
+
+ /* HW/FW is doing parsing, so offsets are largely ignored. */
+ eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+}
+
+const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops = {
+ .tmo_fill_timestamp = mlx5e_xsk_fill_timestamp,
+ .tmo_request_checksum = mlx5e_xsk_request_checksum,
+};
+
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
struct bpf_prog *prog, struct mlx5e_xdp_buff *mxbuf)
@@ -398,11 +429,11 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result);
+ int check_result, struct xsk_tx_metadata *meta);
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result)
+ int check_result, struct xsk_tx_metadata *meta)
{
struct mlx5e_tx_mpwqe *session = &sq->mpwqe;
struct mlx5e_xdpsq_stats *stats = sq->stats;
@@ -420,7 +451,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx
*/
if (unlikely(sq->mpwqe.wqe))
mlx5e_xdp_mpwqe_complete(sq);
- return mlx5e_xmit_xdp_frame(sq, xdptxd, 0);
+ return mlx5e_xmit_xdp_frame(sq, xdptxd, 0, meta);
}
if (!xdptxd->len) {
skb_frag_t *frag = &xdptxdf->sinfo->frags[0];
@@ -450,6 +481,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptx
* and it's safe to complete it at any time.
*/
mlx5e_xdp_mpwqe_session_start(sq);
+ xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, &session->wqe->eth);
}
mlx5e_xdp_mpwqe_add_dseg(sq, p, stats);
@@ -480,7 +512,7 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)
INDIRECT_CALLABLE_SCOPE bool
mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
- int check_result)
+ int check_result, struct xsk_tx_metadata *meta)
{
struct mlx5e_xmit_data_frags *xdptxdf =
container_of(xdptxd, struct mlx5e_xmit_data_frags, xd);
@@ -599,6 +631,8 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
sq->pc++;
}
+ xsk_tx_metadata_request(meta, &mlx5e_xsk_tx_metadata_ops, eseg);
+
sq->doorbell_cseg = cseg;
stats->xmit++;
@@ -608,7 +642,9 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
struct mlx5e_xdp_wqe_info *wi,
u32 *xsk_frames,
- struct xdp_frame_bulk *bq)
+ struct xdp_frame_bulk *bq,
+ struct mlx5e_cq *cq,
+ struct mlx5_cqe64 *cqe)
{
struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
u16 i;
@@ -668,10 +704,24 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
break;
}
- case MLX5E_XDP_XMIT_MODE_XSK:
+ case MLX5E_XDP_XMIT_MODE_XSK: {
/* AF_XDP send */
+ struct xsk_tx_metadata_compl *compl = NULL;
+ struct mlx5e_xsk_tx_complete priv = {
+ .cqe = cqe,
+ .cq = cq,
+ };
+
+ if (xp_tx_metadata_enabled(sq->xsk_pool)) {
+ xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
+ compl = &xdpi.xsk_meta;
+
+ xsk_tx_metadata_complete(compl, &mlx5e_xsk_tx_metadata_ops, &priv);
+ }
+
(*xsk_frames)++;
break;
+ }
default:
WARN_ON_ONCE(true);
}
@@ -720,7 +770,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
sqcc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, cq, cqe);
} while (!last_wqe);
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) {
@@ -767,7 +817,7 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
sq->cc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, &bq, NULL, NULL);
}
xdp_flush_frame_bulk(&bq);
@@ -840,7 +890,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
}
ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
- mlx5e_xmit_xdp_frame, sq, xdptxd, 0);
+ mlx5e_xmit_xdp_frame, sq, xdptxd, 0, NULL);
if (unlikely(!ret)) {
int j;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index ecfe93a479da..e054db1e10f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -33,6 +33,7 @@
#define __MLX5_EN_XDP_H__
#include <linux/indirect_call_wrapper.h>
+#include <net/xdp_sock.h>
#include "en.h"
#include "en/txrx.h"
@@ -82,7 +83,7 @@ enum mlx5e_xdp_xmit_mode {
* num, page_1, page_2, ... , page_num.
*
* MLX5E_XDP_XMIT_MODE_XSK:
- * none.
+ * frame.xsk_meta.
*/
#define MLX5E_XDP_FIFO_ENTRIES2DS_MAX_RATIO 4
@@ -97,6 +98,7 @@ union mlx5e_xdp_info {
u8 num;
struct page *page;
} page;
+ struct xsk_tx_metadata_compl xsk_meta;
};
struct mlx5e_xsk_param;
@@ -112,13 +114,16 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
extern const struct xdp_metadata_ops mlx5e_xdp_metadata_ops;
+extern const struct xsk_tx_metadata_ops mlx5e_xsk_tx_metadata_ops;
INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
struct mlx5e_xmit_data *xdptxd,
- int check_result));
+ int check_result,
+ struct xsk_tx_metadata *meta));
INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq,
struct mlx5e_xmit_data *xdptxd,
- int check_result));
+ int check_result,
+ struct xsk_tx_metadata *meta));
INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq));
INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 597f319d4770..a59199ed590d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -55,12 +55,16 @@ static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq,
nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, *xdpi);
+ if (xp_tx_metadata_enabled(sq->xsk_pool))
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo,
+ (union mlx5e_xdp_info) { .xsk_meta = {} });
sq->doorbell_cseg = &nopwqe->ctrl;
}
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
{
struct xsk_buff_pool *pool = sq->xsk_pool;
+ struct xsk_tx_metadata *meta = NULL;
union mlx5e_xdp_info xdpi;
bool work_done = true;
bool flush = false;
@@ -93,12 +97,13 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr);
xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr);
xdptxd.len = desc.len;
+ meta = xsk_buff_get_metadata(pool, desc.addr);
xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len);
ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe,
mlx5e_xmit_xdp_frame, sq, &xdptxd,
- check_result);
+ check_result, meta);
if (unlikely(!ret)) {
if (sq->mpwqe.wqe)
mlx5e_xdp_mpwqe_complete(sq);
@@ -106,6 +111,16 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
mlx5e_xsk_tx_post_err(sq, &xdpi);
} else {
mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
+ if (xp_tx_metadata_enabled(sq->xsk_pool)) {
+ struct xsk_tx_metadata_compl compl;
+
+ xsk_tx_metadata_to_compl(meta, &compl);
+ XSK_TX_COMPL_FITS(void *);
+
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo,
+ (union mlx5e_xdp_info)
+ { .xsk_meta = compl });
+ }
}
flush = true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3aecdf099a2f..0197eefb2108 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5164,6 +5164,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
netdev->netdev_ops = &mlx5e_netdev_ops;
netdev->xdp_metadata_ops = &mlx5e_xdp_metadata_ops;
+ netdev->xsk_tx_metadata_ops = &mlx5e_xsk_tx_metadata_ops;
mlx5e_dcbnl_build_netdev(netdev);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index cd7a9768de5f..686c94c2e8a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -51,6 +51,7 @@ struct stmmac_tx_info {
bool last_segment;
bool is_jumbo;
enum stmmac_txbuf_type buf_type;
+ struct xsk_tx_metadata_compl xsk_meta;
};
#define STMMAC_TBS_AVAIL BIT(0)
@@ -100,6 +101,17 @@ struct stmmac_xdp_buff {
struct dma_desc *ndesc;
};
+struct stmmac_metadata_request {
+ struct stmmac_priv *priv;
+ struct dma_desc *tx_desc;
+ bool *set_ic;
+};
+
+struct stmmac_xsk_tx_complete {
+ struct stmmac_priv *priv;
+ struct dma_desc *desc;
+};
+
struct stmmac_rx_queue {
u32 rx_count_frames;
u32 queue_index;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8964fc8a955f..c2ac88aaffed 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2430,6 +2430,46 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
}
}
+static void stmmac_xsk_request_timestamp(void *_priv)
+{
+ struct stmmac_metadata_request *meta_req = _priv;
+
+ stmmac_enable_tx_timestamp(meta_req->priv, meta_req->tx_desc);
+ *meta_req->set_ic = true;
+}
+
+static u64 stmmac_xsk_fill_timestamp(void *_priv)
+{
+ struct stmmac_xsk_tx_complete *tx_compl = _priv;
+ struct stmmac_priv *priv = tx_compl->priv;
+ struct dma_desc *desc = tx_compl->desc;
+ bool found = false;
+ u64 ns = 0;
+
+ if (!priv->hwts_tx_en)
+ return 0;
+
+ /* check tx tstamp status */
+ if (stmmac_get_tx_timestamp_status(priv, desc)) {
+ stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns);
+ found = true;
+ } else if (!stmmac_get_mac_tx_timestamp(priv, priv->hw, &ns)) {
+ found = true;
+ }
+
+ if (found) {
+ ns -= priv->plat->cdc_error_adj;
+ return ns_to_ktime(ns);
+ }
+
+ return 0;
+}
+
+static const struct xsk_tx_metadata_ops stmmac_xsk_tx_metadata_ops = {
+ .tmo_request_timestamp = stmmac_xsk_request_timestamp,
+ .tmo_fill_timestamp = stmmac_xsk_fill_timestamp,
+};
+
static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
{
struct netdev_queue *nq = netdev_get_tx_queue(priv->dev, queue);
@@ -2449,6 +2489,8 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
budget = min(budget, stmmac_tx_avail(priv, queue));
while (budget-- > 0) {
+ struct stmmac_metadata_request meta_req;
+ struct xsk_tx_metadata *meta = NULL;
dma_addr_t dma_addr;
bool set_ic;
@@ -2472,6 +2514,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
tx_desc = tx_q->dma_tx + entry;
dma_addr = xsk_buff_raw_get_dma(pool, xdp_desc.addr);
+ meta = xsk_buff_get_metadata(pool, xdp_desc.addr);
xsk_buff_raw_dma_sync_for_device(pool, dma_addr, xdp_desc.len);
tx_q->tx_skbuff_dma[entry].buf_type = STMMAC_TXBUF_T_XSK_TX;
@@ -2499,6 +2542,11 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
else
set_ic = false;
+ meta_req.priv = priv;
+ meta_req.tx_desc = tx_desc;
+ meta_req.set_ic = &set_ic;
+ xsk_tx_metadata_request(meta, &stmmac_xsk_tx_metadata_ops,
+ &meta_req);
if (set_ic) {
tx_q->tx_count_frames = 0;
stmmac_set_tx_ic(priv, tx_desc);
@@ -2511,6 +2559,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
stmmac_enable_dma_transmission(priv, priv->ioaddr);
+ xsk_tx_metadata_to_compl(meta,
+ &tx_q->tx_skbuff_dma[entry].xsk_meta);
+
tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, priv->dma_conf.dma_tx_size);
entry = tx_q->cur_tx;
}
@@ -2620,8 +2671,18 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue,
} else {
tx_packets++;
}
- if (skb)
+ if (skb) {
stmmac_get_tx_hwtstamp(priv, p, skb);
+ } else {
+ struct stmmac_xsk_tx_complete tx_compl = {
+ .priv = priv,
+ .desc = p,
+ };
+
+ xsk_tx_metadata_complete(&tx_q->tx_skbuff_dma[entry].xsk_meta,
+ &stmmac_xsk_tx_metadata_ops,
+ &tx_compl);
+ }
}
if (likely(tx_q->tx_skbuff_dma[entry].buf &&
@@ -7464,6 +7525,7 @@ int stmmac_dvr_probe(struct device *device,
ndev->netdev_ops = &stmmac_netdev_ops;
ndev->xdp_metadata_ops = &stmmac_xdp_metadata_ops;
+ ndev->xsk_tx_metadata_ops = &stmmac_xsk_tx_metadata_ops;
ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
NETIF_F_RXCSUM;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e87caa81f70c..08da8b28c816 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1865,6 +1865,7 @@ enum netdev_stat_type {
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @xdp_metadata_ops: Includes pointers to XDP metadata callbacks.
+ * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks.
* @ethtool_ops: Management operations
* @l3mdev_ops: Layer 3 master device operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
@@ -2128,6 +2129,7 @@ struct net_device {
unsigned long long priv_flags;
const struct net_device_ops *netdev_ops;
const struct xdp_metadata_ops *xdp_metadata_ops;
+ const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
int ifindex;
unsigned short gflags;
unsigned short hard_header_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 27998f73183e..b370eb8d70f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -566,6 +566,15 @@ struct ubuf_info_msgzc {
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
+/* Preserve some data across TX submission and completion.
+ *
+ * Note, this state is stored in the driver. Extending the layout
+ * might need some special care.
+ */
+struct xsk_tx_metadata_compl {
+ __u64 *tx_timestamp;
+};
+
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
@@ -578,7 +587,10 @@ struct skb_shared_info {
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
- struct skb_shared_hwtstamps hwtstamps;
+ union {
+ struct skb_shared_hwtstamps hwtstamps;
+ struct xsk_tx_metadata_compl xsk_meta;
+ };
unsigned int gso_type;
u32 tskey;
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index f83128007fb0..3cb4dc9bd70e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@ struct xdp_umem {
struct user_struct *user;
refcount_t users;
u8 flags;
+ u8 tx_metadata_len;
bool zc;
struct page **pgs;
int id;
@@ -92,12 +93,105 @@ struct xdp_sock {
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};
+/*
+ * AF_XDP TX metadata hooks for network devices.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
+ *
+ * void (*tmo_request_timestamp)(void *priv)
+ * Called when AF_XDP frame requested egress timestamp.
+ *
+ * u64 (*tmo_fill_timestamp)(void *priv)
+ * Called when AF_XDP frame, that had requested egress timestamp,
+ * received a completion. The hook needs to return the actual HW timestamp.
+ *
+ * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv)
+ * Called when AF_XDP frame requested HW checksum offload. csum_start
+ * indicates position where checksumming should start.
+ * csum_offset indicates position where checksum should be stored.
+ *
+ */
+struct xsk_tx_metadata_ops {
+ void (*tmo_request_timestamp)(void *priv);
+ u64 (*tmo_fill_timestamp)(void *priv);
+ void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+};
+
#ifdef CONFIG_XDP_SOCKETS
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
+/**
+ * xsk_tx_metadata_to_compl - Save enough relevant metadata information
+ * to perform tx completion in the future.
+ * @meta: pointer to AF_XDP metadata area
+ * @compl: pointer to output struct xsk_tx_metadata_to_compl
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet. The value of @compl should be stored
+ * and passed to xsk_tx_metadata_complete upon TX completion.
+ */
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+ if (!meta)
+ return;
+
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ compl->tx_timestamp = &meta->completion.tx_timestamp;
+ else
+ compl->tx_timestamp = NULL;
+}
+
+/**
+ * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @meta: pointer to AF_XDP metadata area
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private aread
+ *
+ * This function should be called by the networking device when
+ * it prepares AF_XDP egress packet.
+ */
+static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!meta)
+ return;
+
+ if (ops->tmo_request_timestamp)
+ if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
+ ops->tmo_request_timestamp(priv);
+
+ if (ops->tmo_request_checksum)
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM)
+ ops->tmo_request_checksum(meta->request.csum_start,
+ meta->request.csum_offset, priv);
+}
+
+/**
+ * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion
+ * and call appropriate xsk_tx_metadata_ops operation.
+ * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl
+ * @ops: pointer to struct xsk_tx_metadata_ops
+ * @priv: pointer to driver-private aread
+ *
+ * This function should be called by the networking device upon
+ * AF_XDP egress completion.
+ */
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+ if (!compl)
+ return;
+
+ *compl->tx_timestamp = ops->tmo_fill_timestamp(priv);
+}
+
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -114,6 +208,23 @@ static inline void __xsk_map_flush(void)
{
}
+static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
+ struct xsk_tx_metadata_compl *compl)
+{
+}
+
+static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
+static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl,
+ const struct xsk_tx_metadata_ops *ops,
+ void *priv)
+{
+}
+
#endif /* CONFIG_XDP_SOCKETS */
#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 1f6fc8c7a84c..81e02de3f453 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -165,6 +165,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return xp_raw_get_data(pool, addr);
}
+#define XDP_TXMD_FLAGS_VALID ( \
+ XDP_TXMD_FLAGS_TIMESTAMP | \
+ XDP_TXMD_FLAGS_CHECKSUM | \
+ 0)
+
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ struct xsk_tx_metadata *meta;
+
+ if (!pool->tx_metadata_len)
+ return NULL;
+
+ meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+ return NULL; /* no way to signal the error to the user */
+
+ return meta;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -324,6 +348,16 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return NULL;
}
+static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
+{
+ return false;
+}
+
+static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ return NULL;
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
{
}
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index b0bdff26fc88..8d48d37ab7c0 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -33,6 +33,7 @@ struct xdp_buff_xsk {
};
#define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb))
+#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t))
struct xsk_dma_map {
dma_addr_t *dma_pages;
@@ -77,10 +78,12 @@ struct xsk_buff_pool {
u32 chunk_size;
u32 chunk_shift;
u32 frame_len;
+ u8 tx_metadata_len; /* inherited from umem */
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
bool unaligned;
+ bool tx_sw_csum;
void *addrs;
/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
* NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
@@ -233,4 +236,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb)
return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
+static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
+{
+ return pool->tx_metadata_len > 0;
+}
+
#endif /* XSK_BUFF_POOL_H_ */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 8d48863472b9..d31698410410 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -33,7 +33,13 @@
#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
-#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
+/* Force checksum calculation in software. Can be used for testing or
+ * working around potential HW issues. This option causes performance
+ * degradation and only works in XDP_COPY mode.
+ */
+#define XDP_UMEM_TX_SW_CSUM (1 << 1)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -76,6 +82,7 @@ struct xdp_umem_reg {
__u32 chunk_size;
__u32 headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
struct xdp_statistics {
@@ -105,6 +112,41 @@ struct xdp_options {
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of struct xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of struct
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+ __u64 flags;
+
+ union {
+ struct {
+ /* XDP_TXMD_FLAGS_CHECKSUM */
+
+ /* Offset from desc->addr where checksumming should start. */
+ __u16 csum_start;
+ /* Offset from csum_start where checksum should be stored. */
+ __u16 csum_offset;
+ } request;
+
+ struct {
+ /* XDP_TXMD_FLAGS_TIMESTAMP */
+ __u64 tx_timestamp;
+ } completion;
+ };
+};
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
@@ -121,4 +163,7 @@ struct xdp_desc {
*/
#define XDP_PKT_CONTD (1 << 0)
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
+
#endif /* _LINUX_IF_XDP_H */
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 2943a151d4f1..48d5477a668c 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_MASK = 3,
};
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ * by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ * driver.
+ */
+enum netdev_xsk_flags {
+ NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+ NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+
+ /* private: */
+ NETDEV_XSK_FLAGS_MASK = 3,
+};
+
enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ NETDEV_A_DEV_XSK_FEATURES,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index fe61f85bcf33..10f2124e9e23 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -6,6 +6,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/xdp.h>
+#include <net/xdp_sock.h>
#include "netdev-genl-gen.h"
@@ -13,6 +14,7 @@ static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
+ u64 xsk_features = 0;
u64 xdp_rx_meta = 0;
void *hdr;
@@ -26,11 +28,20 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
+ if (netdev->xsk_tx_metadata_ops) {
+ if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ }
+
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
netdev->xdp_features, NETDEV_A_DEV_PAD) ||
nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
- xdp_rx_meta, NETDEV_A_DEV_PAD)) {
+ xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+ xsk_features, NETDEV_A_DEV_PAD)) {
genlmsg_cancel(rsp, hdr);
return -EINVAL;
}
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 06cead2b8e34..caa340134b0e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -148,6 +148,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
return 0;
}
+#define XDP_UMEM_FLAGS_VALID ( \
+ XDP_UMEM_UNALIGNED_CHUNK_FLAG | \
+ XDP_UMEM_TX_SW_CSUM | \
+ 0)
+
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
@@ -167,7 +172,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+ if (mr->flags & ~XDP_UMEM_FLAGS_VALID)
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
@@ -199,6 +204,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
+ if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8)
+ return -EINVAL;
+
umem->size = size;
umem->headroom = headroom;
umem->chunk_size = chunk_size;
@@ -207,6 +215,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
+ umem->tx_metadata_len = mr->tx_metadata_len;
INIT_LIST_HEAD(&umem->xsk_dma_list);
refcount_set(&umem->users, 1);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ae9f8cb611f6..281d49b4fca4 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -571,6 +571,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb)
static void xsk_destruct_skb(struct sk_buff *skb)
{
+ struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
+
+ if (compl->tx_timestamp) {
+ /* sw completion timestamp, not a real one */
+ *compl->tx_timestamp = ktime_get_tai_fast_ns();
+ }
+
xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
sock_wfree(skb);
}
@@ -655,8 +662,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
+ struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
struct sk_buff *skb = xs->skb;
+ bool first_frag = false;
int err;
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
@@ -687,6 +696,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
kfree_skb(skb);
goto free_err;
}
+
+ first_frag = true;
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct page *page;
@@ -709,12 +720,45 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, 0);
}
+
+ if (first_frag && desc->options & XDP_TX_METADATA) {
+ if (unlikely(xs->pool->tx_metadata_len == 0)) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ meta = buffer - xs->pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+ if (unlikely(meta->request.csum_start +
+ meta->request.csum_offset +
+ sizeof(__sum16) > len)) {
+ err = -EINVAL;
+ goto free_err;
+ }
+
+ skb->csum_start = hr + meta->request.csum_start;
+ skb->csum_offset = meta->request.csum_offset;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (unlikely(xs->pool->tx_sw_csum)) {
+ err = skb_checksum_help(skb);
+ if (err)
+ goto free_err;
+ }
+ }
+ }
}
skb->dev = dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
+ xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
xsk_set_destructor_arg(skb);
return skb;
@@ -1283,6 +1327,14 @@ struct xdp_umem_reg_v1 {
__u32 headroom;
};
+struct xdp_umem_reg_v2 {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 chunk_size;
+ __u32 headroom;
+ __u32 flags;
+};
+
static int xsk_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
@@ -1326,8 +1378,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(struct xdp_umem_reg_v1))
return -EINVAL;
- else if (optlen < sizeof(mr))
+ else if (optlen < sizeof(struct xdp_umem_reg_v2))
mr_size = sizeof(struct xdp_umem_reg_v1);
+ else if (optlen < sizeof(mr))
+ mr_size = sizeof(struct xdp_umem_reg_v2);
if (copy_from_sockptr(&mr, optval, mr_size))
return -EFAULT;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 49cb9f9a09be..4f6f538a5462 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -85,6 +85,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
XDP_PACKET_HEADROOM;
pool->umem = umem;
pool->addrs = umem->addrs;
+ pool->tx_metadata_len = umem->tx_metadata_len;
+ pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM;
INIT_LIST_HEAD(&pool->free_list);
INIT_LIST_HEAD(&pool->xskb_list);
INIT_LIST_HEAD(&pool->xsk_tx_list);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 13354a1e4280..6f2d1621c992 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -137,21 +137,23 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xp_unused_options_set(u32 options)
{
- return options & ~XDP_PKT_CONTD;
+ return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA);
}
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 offset = desc->addr & (pool->chunk_size - 1);
+ u64 addr = desc->addr - pool->tx_metadata_len;
+ u64 len = desc->len + pool->tx_metadata_len;
+ u64 offset = addr & (pool->chunk_size - 1);
if (!desc->len)
return false;
- if (offset + desc->len > pool->chunk_size)
+ if (offset + len > pool->chunk_size)
return false;
- if (desc->addr >= pool->addrs_cnt)
+ if (addr >= pool->addrs_cnt)
return false;
if (xp_unused_options_set(desc->options))
@@ -162,16 +164,17 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+ u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
+ u64 len = desc->len + pool->tx_metadata_len;
if (!desc->len)
return false;
- if (desc->len > pool->chunk_size)
+ if (len > pool->chunk_size)
return false;
- if (addr >= pool->addrs_cnt || addr + desc->len > pool->addrs_cnt ||
- xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
+ if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt ||
+ xp_desc_crosses_non_contig_pg(pool, addr, len))
return false;
if (xp_unused_options_set(desc->options))
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 73a47da885dc..638c606dfa74 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -26,14 +26,20 @@
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
/* By setting this option, userspace application indicates that it can
- * handle multiple descriptors per packet thus enabling xsk core to split
+ * handle multiple descriptors per packet thus enabling AF_XDP to split
* multi-buffer XDP frames into multiple Rx descriptors. Without this set
- * such frames will be dropped by xsk.
+ * such frames will be dropped.
*/
-#define XDP_USE_SG (1 << 4)
+#define XDP_USE_SG (1 << 4)
/* Flags for xsk_umem_config flags */
-#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
+
+/* Force checksum calculation in software. Can be used for testing or
+ * working around potential HW issues. This option causes performance
+ * degradation and only works in XDP_COPY mode.
+ */
+#define XDP_UMEM_TX_SW_CSUM (1 << 1)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -76,6 +82,7 @@ struct xdp_umem_reg {
__u32 chunk_size;
__u32 headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
struct xdp_statistics {
@@ -105,6 +112,41 @@ struct xdp_options {
#define XSK_UNALIGNED_BUF_ADDR_MASK \
((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+/* Request transmit timestamp. Upon completion, put it into tx_timestamp
+ * field of union xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0)
+
+/* Request transmit checksum offload. Checksum start position and offset
+ * are communicated via csum_start and csum_offset fields of union
+ * xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+
+/* AF_XDP offloads request. 'request' union member is consumed by the driver
+ * when the packet is being transmitted. 'completion' union member is
+ * filled by the driver when the transmit completion arrives.
+ */
+struct xsk_tx_metadata {
+ __u64 flags;
+
+ union {
+ struct {
+ /* XDP_TXMD_FLAGS_CHECKSUM */
+
+ /* Offset from desc->addr where checksumming should start. */
+ __u16 csum_start;
+ /* Offset from csum_start where checksum should be stored. */
+ __u16 csum_offset;
+ } request;
+
+ struct {
+ /* XDP_TXMD_FLAGS_TIMESTAMP */
+ __u64 tx_timestamp;
+ } completion;
+ };
+};
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
@@ -112,9 +154,16 @@ struct xdp_desc {
__u32 options;
};
-/* Flag indicating packet constitutes of multiple buffers*/
+/* UMEM descriptor is __u64 */
+
+/* Flag indicating that the packet continues with the buffer pointed out by the
+ * next frame in the ring. The end of the packet is signalled by setting this
+ * bit to zero. For single buffer packets, every descriptor has 'options' set
+ * to 0 and this maintains backward compatibility.
+ */
#define XDP_PKT_CONTD (1 << 0)
-/* UMEM descriptor is __u64 */
+/* TX packet carries valid metadata. */
+#define XDP_TX_METADATA (1 << 1)
#endif /* _LINUX_IF_XDP_H */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 2943a151d4f1..48d5477a668c 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata {
NETDEV_XDP_RX_METADATA_MASK = 3,
};
+/**
+ * enum netdev_xsk_flags
+ * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported
+ * by the driver.
+ * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
+ * driver.
+ */
+enum netdev_xsk_flags {
+ NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
+ NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+
+ /* private: */
+ NETDEV_XSK_FLAGS_MASK = 3,
+};
+
enum {
NETDEV_A_DEV_IFINDEX = 1,
NETDEV_A_DEV_PAD,
NETDEV_A_DEV_XDP_FEATURES,
NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ NETDEV_A_DEV_XSK_FEATURES,
__NETDEV_A_DEV_MAX,
NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/tools/net/ynl/generated/netdev-user.c b/tools/net/ynl/generated/netdev-user.c
index b5ffe8cd1144..6283d87dad37 100644
--- a/tools/net/ynl/generated/netdev-user.c
+++ b/tools/net/ynl/generated/netdev-user.c
@@ -58,6 +58,19 @@ const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value)
return netdev_xdp_rx_metadata_strmap[value];
}
+static const char * const netdev_xsk_flags_strmap[] = {
+ [0] = "tx-timestamp",
+ [1] = "tx-checksum",
+};
+
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value)
+{
+ value = ffs(value) - 1;
+ if (value < 0 || value >= (int)MNL_ARRAY_SIZE(netdev_xsk_flags_strmap))
+ return NULL;
+ return netdev_xsk_flags_strmap[value];
+}
+
/* Policies */
struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = {
[NETDEV_A_DEV_IFINDEX] = { .name = "ifindex", .type = YNL_PT_U32, },
@@ -65,6 +78,7 @@ struct ynl_policy_attr netdev_dev_policy[NETDEV_A_DEV_MAX + 1] = {
[NETDEV_A_DEV_XDP_FEATURES] = { .name = "xdp-features", .type = YNL_PT_U64, },
[NETDEV_A_DEV_XDP_ZC_MAX_SEGS] = { .name = "xdp-zc-max-segs", .type = YNL_PT_U32, },
[NETDEV_A_DEV_XDP_RX_METADATA_FEATURES] = { .name = "xdp-rx-metadata-features", .type = YNL_PT_U64, },
+ [NETDEV_A_DEV_XSK_FEATURES] = { .name = "xsk-features", .type = YNL_PT_U64, },
};
struct ynl_policy_nest netdev_dev_nest = {
@@ -116,6 +130,11 @@ int netdev_dev_get_rsp_parse(const struct nlmsghdr *nlh, void *data)
return MNL_CB_ERROR;
dst->_present.xdp_rx_metadata_features = 1;
dst->xdp_rx_metadata_features = mnl_attr_get_u64(attr);
+ } else if (type == NETDEV_A_DEV_XSK_FEATURES) {
+ if (ynl_attr_validate(yarg, attr))
+ return MNL_CB_ERROR;
+ dst->_present.xsk_features = 1;
+ dst->xsk_features = mnl_attr_get_u64(attr);
}
}
diff --git a/tools/net/ynl/generated/netdev-user.h b/tools/net/ynl/generated/netdev-user.h
index 4fafac879df3..39af1908444b 100644
--- a/tools/net/ynl/generated/netdev-user.h
+++ b/tools/net/ynl/generated/netdev-user.h
@@ -19,6 +19,7 @@ extern const struct ynl_family ynl_netdev_family;
const char *netdev_op_str(int op);
const char *netdev_xdp_act_str(enum netdev_xdp_act value);
const char *netdev_xdp_rx_metadata_str(enum netdev_xdp_rx_metadata value);
+const char *netdev_xsk_flags_str(enum netdev_xsk_flags value);
/* Common nested types */
/* ============== NETDEV_CMD_DEV_GET ============== */
@@ -50,12 +51,14 @@ struct netdev_dev_get_rsp {
__u32 xdp_features:1;
__u32 xdp_zc_max_segs:1;
__u32 xdp_rx_metadata_features:1;
+ __u32 xsk_features:1;
} _present;
__u32 ifindex;
__u64 xdp_features;
__u32 xdp_zc_max_segs;
__u64 xdp_rx_metadata_features;
+ __u64 xsk_features;
};
void netdev_dev_get_rsp_free(struct netdev_dev_get_rsp *rsp);
diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c
index b828225daad0..591b90e21890 100644
--- a/tools/net/ynl/samples/netdev.c
+++ b/tools/net/ynl/samples/netdev.c
@@ -33,17 +33,23 @@ static void netdev_print_device(struct netdev_dev_get_rsp *d, unsigned int op)
return;
printf("xdp-features (%llx):", d->xdp_features);
- for (int i = 0; d->xdp_features > 1U << i; i++) {
+ for (int i = 0; d->xdp_features >= 1U << i; i++) {
if (d->xdp_features & (1U << i))
printf(" %s", netdev_xdp_act_str(1 << i));
}
printf(" xdp-rx-metadata-features (%llx):", d->xdp_rx_metadata_features);
- for (int i = 0; d->xdp_rx_metadata_features > 1U << i; i++) {
+ for (int i = 0; d->xdp_rx_metadata_features >= 1U << i; i++) {
if (d->xdp_rx_metadata_features & (1U << i))
printf(" %s", netdev_xdp_rx_metadata_str(1 << i));
}
+ printf(" xsk-features (%llx):", d->xsk_features);
+ for (int i = 0; d->xsk_features >= 1U << i; i++) {
+ if (d->xsk_features & (1U << i))
+ printf(" %s", netdev_xsk_flags_str(1 << i));
+ }
+
printf(" xdp-zc-max-segs=%u", d->xdp_zc_max_segs);
name = netdev_op_str(op);
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 34f1200a781b..94b9be24e39b 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -71,4 +71,47 @@ struct nstoken;
*/
struct nstoken *open_netns(const char *name);
void close_netns(struct nstoken *token);
+
+static __u16 csum_fold(__u32 csum)
+{
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+
+ return (__u16)~csum;
+}
+
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum csum)
+{
+ __u64 s = csum;
+
+ s += (__u32)saddr;
+ s += (__u32)daddr;
+ s += htons(proto + len);
+ s = (s & 0xffffffff) + (s >> 32);
+ s = (s & 0xffffffff) + (s >> 32);
+
+ return csum_fold((__u32)s);
+}
+
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto,
+ __wsum csum)
+{
+ __u64 s = csum;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ s += (__u32)saddr->s6_addr32[i];
+ for (i = 0; i < 4; i++)
+ s += (__u32)daddr->s6_addr32[i];
+ s += htons(proto + len);
+ s = (s & 0xffffffff) + (s >> 32);
+ s = (s & 0xffffffff) + (s >> 32);
+
+ return csum_fold((__u32)s);
+}
+
#endif
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
index 4439ba9392f8..33cdf88efa6b 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c
@@ -56,7 +56,8 @@ static int open_xsk(int ifindex, struct xsk *xsk)
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
- .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG | XDP_UMEM_TX_SW_CSUM,
+ .tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
__u32 idx;
u64 addr;
@@ -138,6 +139,7 @@ static void ip_csum(struct iphdr *iph)
static int generate_packet(struct xsk *xsk, __u16 dst_port)
{
+ struct xsk_tx_metadata *meta;
struct xdp_desc *tx_desc;
struct udphdr *udph;
struct ethhdr *eth;
@@ -151,10 +153,14 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
return -1;
tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
- tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE;
+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
printf("%p: tx_desc[%u]->addr=%llx\n", xsk, idx, tx_desc->addr);
data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+ memset(meta, 0, sizeof(*meta));
+ meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
+
eth = data;
iph = (void *)(eth + 1);
udph = (void *)(iph + 1);
@@ -178,11 +184,17 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
udph->source = htons(AF_XDP_SOURCE_PORT);
udph->dest = htons(dst_port);
udph->len = htons(sizeof(*udph) + UDP_PAYLOAD_BYTES);
- udph->check = 0;
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
memset(udph + 1, 0xAA, UDP_PAYLOAD_BYTES);
+ meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
+ meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
+ meta->request.csum_offset = offsetof(struct udphdr, check);
+
tx_desc->len = sizeof(*eth) + sizeof(*iph) + sizeof(*udph) + UDP_PAYLOAD_BYTES;
+ tx_desc->options |= XDP_TX_METADATA;
xsk_ring_prod__submit(&xsk->tx, 1);
ret = sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
@@ -194,13 +206,21 @@ static int generate_packet(struct xsk *xsk, __u16 dst_port)
static void complete_tx(struct xsk *xsk)
{
- __u32 idx;
+ struct xsk_tx_metadata *meta;
__u64 addr;
+ void *data;
+ __u32 idx;
if (ASSERT_EQ(xsk_ring_cons__peek(&xsk->comp, 1, &idx), 1, "xsk_ring_cons__peek")) {
addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+
+ data = xsk_umem__get_data(xsk->umem_area, addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+
+ ASSERT_NEQ(meta->completion.tx_timestamp, 0, "tx_timestamp");
+
xsk_ring_cons__release(&xsk->comp, 1);
}
}
@@ -221,6 +241,7 @@ static int verify_xsk_metadata(struct xsk *xsk)
const struct xdp_desc *rx_desc;
struct pollfd fds = {};
struct xdp_meta *meta;
+ struct udphdr *udph;
struct ethhdr *eth;
struct iphdr *iph;
__u64 comp_addr;
@@ -257,6 +278,7 @@ static int verify_xsk_metadata(struct xsk *xsk)
ASSERT_EQ(eth->h_proto, htons(ETH_P_IP), "eth->h_proto");
iph = (void *)(eth + 1);
ASSERT_EQ((int)iph->version, 4, "iph->version");
+ udph = (void *)(iph + 1);
/* custom metadata */
@@ -270,6 +292,9 @@ static int verify_xsk_metadata(struct xsk *xsk)
ASSERT_EQ(meta->rx_hash_type, 0, "rx_hash_type");
+ /* checksum offload */
+ ASSERT_EQ(udph->check, htons(0x721c), "csum");
+
xsk_ring_cons__release(&xsk->rx, 1);
refill_rx(xsk, comp_addr);
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index c3ba40d0b9de..3291625ba4fb 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -10,7 +10,9 @@
* - rx_hash
*
* TX:
- * - TBD
+ * - UDP 9091 packets trigger TX reply
+ * - TX HW timestamp is requested and reported back upon completion
+ * - TX checksum is requested
*/
#include <test_progs.h>
@@ -24,15 +26,18 @@
#include <linux/net_tstamp.h>
#include <linux/udp.h>
#include <linux/sockios.h>
+#include <linux/if_xdp.h>
#include <sys/mman.h>
#include <net/if.h>
#include <ctype.h>
#include <poll.h>
#include <time.h>
+#include <unistd.h>
+#include <libgen.h>
#include "xdp_metadata.h"
-#define UMEM_NUM 16
+#define UMEM_NUM 256
#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
#define XDP_FLAGS (XDP_FLAGS_DRV_MODE | XDP_FLAGS_REPLACE)
@@ -48,11 +53,14 @@ struct xsk {
};
struct xdp_hw_metadata *bpf_obj;
-__u16 bind_flags = XDP_COPY;
+__u16 bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY;
struct xsk *rx_xsk;
const char *ifname;
int ifindex;
int rxq;
+bool skip_tx;
+__u64 last_hw_rx_timestamp;
+__u64 last_xdp_rx_timestamp;
void test__fail(void) { /* for network_helpers.c */ }
@@ -68,7 +76,8 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
- .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
+ .flags = XSK_UMEM__DEFAULT_FLAGS,
+ .tx_metadata_len = sizeof(struct xsk_tx_metadata),
};
__u32 idx;
u64 addr;
@@ -110,7 +119,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
for (i = 0; i < UMEM_NUM / 2; i++) {
addr = (UMEM_NUM / 2 + i) * UMEM_FRAME_SIZE;
printf("%p: rx_desc[%d] -> %lx\n", xsk, i, addr);
- *xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+ *xsk_ring_prod__fill_addr(&xsk->fill, idx + i) = addr;
}
xsk_ring_prod__submit(&xsk->fill, ret);
@@ -131,12 +140,22 @@ static void refill_rx(struct xsk *xsk, __u64 addr)
__u32 idx;
if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
- printf("%p: complete idx=%u addr=%llx\n", xsk, idx, addr);
+ printf("%p: complete rx idx=%u addr=%llx\n", xsk, idx, addr);
*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
xsk_ring_prod__submit(&xsk->fill, 1);
}
}
+static int kick_tx(struct xsk *xsk)
+{
+ return sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+}
+
+static int kick_rx(struct xsk *xsk)
+{
+ return recvfrom(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, NULL);
+}
+
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
static __u64 gettime(clockid_t clock_id)
{
@@ -152,6 +171,17 @@ static __u64 gettime(clockid_t clock_id)
return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
}
+static void print_tstamp_delta(const char *name, const char *refname,
+ __u64 tstamp, __u64 reference)
+{
+ __s64 delta = (__s64)reference - (__s64)tstamp;
+
+ printf("%s: %llu (sec:%0.4f) delta to %s sec:%0.4f (%0.3f usec)\n",
+ name, tstamp, (double)tstamp / NANOSEC_PER_SEC, refname,
+ (double)delta / NANOSEC_PER_SEC,
+ (double)delta / 1000);
+}
+
static void verify_xdp_metadata(void *data, clockid_t clock_id)
{
struct xdp_meta *meta;
@@ -164,25 +194,20 @@ static void verify_xdp_metadata(void *data, clockid_t clock_id)
printf("rx_hash: 0x%X with RSS type:0x%X\n",
meta->rx_hash, meta->rx_hash_type);
- printf("rx_timestamp: %llu (sec:%0.4f)\n", meta->rx_timestamp,
- (double)meta->rx_timestamp / NANOSEC_PER_SEC);
if (meta->rx_timestamp) {
- __u64 usr_clock = gettime(clock_id);
- __u64 xdp_clock = meta->xdp_timestamp;
- __s64 delta_X = xdp_clock - meta->rx_timestamp;
- __s64 delta_X2U = usr_clock - xdp_clock;
-
- printf("XDP RX-time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
- xdp_clock, (double)xdp_clock / NANOSEC_PER_SEC,
- (double)delta_X / NANOSEC_PER_SEC,
- (double)delta_X / 1000);
-
- printf("AF_XDP time: %llu (sec:%0.4f) delta sec:%0.4f (%0.3f usec)\n",
- usr_clock, (double)usr_clock / NANOSEC_PER_SEC,
- (double)delta_X2U / NANOSEC_PER_SEC,
- (double)delta_X2U / 1000);
+ __u64 ref_tstamp = gettime(clock_id);
+
+ /* store received timestamps to calculate a delta at tx */
+ last_hw_rx_timestamp = meta->rx_timestamp;
+ last_xdp_rx_timestamp = meta->xdp_timestamp;
+
+ print_tstamp_delta("HW RX-time", "User RX-time",
+ meta->rx_timestamp, ref_tstamp);
+ print_tstamp_delta("XDP RX-time", "User RX-time",
+ meta->xdp_timestamp, ref_tstamp);
+ } else {
+ printf("No rx_timestamp\n");
}
-
}
static void verify_skb_metadata(int fd)
@@ -230,6 +255,129 @@ static void verify_skb_metadata(int fd)
printf("skb hwtstamp is not found!\n");
}
+static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
+{
+ struct xsk_tx_metadata *meta;
+ __u64 addr;
+ void *data;
+ __u32 idx;
+
+ if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
+ return false;
+
+ addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
+ data = xsk_umem__get_data(xsk->umem_area, addr);
+ meta = data - sizeof(struct xsk_tx_metadata);
+
+ printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
+
+ if (meta->completion.tx_timestamp) {
+ __u64 ref_tstamp = gettime(clock_id);
+
+ print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
+ meta->completion.tx_timestamp, ref_tstamp);
+ print_tstamp_delta("XDP RX-time", "User TX-complete-time",
+ last_xdp_rx_timestamp, ref_tstamp);
+ print_tstamp_delta("HW RX-time", "HW TX-complete-time",
+ last_hw_rx_timestamp, meta->completion.tx_timestamp);
+ } else {
+ printf("No tx_timestamp\n");
+ }
+
+ xsk_ring_cons__release(&xsk->comp, 1);
+
+ return true;
+}
+
+#define swap(a, b, len) do { \
+ for (int i = 0; i < len; i++) { \
+ __u8 tmp = ((__u8 *)a)[i]; \
+ ((__u8 *)a)[i] = ((__u8 *)b)[i]; \
+ ((__u8 *)b)[i] = tmp; \
+ } \
+} while (0)
+
+static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
+{
+ struct xsk_tx_metadata *meta;
+ struct ipv6hdr *ip6h = NULL;
+ struct iphdr *iph = NULL;
+ struct xdp_desc *tx_desc;
+ struct udphdr *udph;
+ struct ethhdr *eth;
+ __sum16 want_csum;
+ void *data;
+ __u32 idx;
+ int ret;
+ int len;
+
+ ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
+ if (ret != 1) {
+ printf("%p: failed to reserve tx slot\n", xsk);
+ return;
+ }
+
+ tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE + sizeof(struct xsk_tx_metadata);
+ data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
+
+ meta = data - sizeof(struct xsk_tx_metadata);
+ memset(meta, 0, sizeof(*meta));
+ meta->flags = XDP_TXMD_FLAGS_TIMESTAMP;
+
+ eth = rx_packet;
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ iph = (void *)(eth + 1);
+ udph = (void *)(iph + 1);
+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
+ ip6h = (void *)(eth + 1);
+ udph = (void *)(ip6h + 1);
+ } else {
+ printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth->h_proto);
+ xsk_ring_prod__cancel(&xsk->tx, 1);
+ return;
+ }
+
+ len = ETH_HLEN;
+ if (ip6h)
+ len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
+ if (iph)
+ len += ntohs(iph->tot_len);
+
+ swap(eth->h_dest, eth->h_source, ETH_ALEN);
+ if (iph)
+ swap(&iph->saddr, &iph->daddr, 4);
+ else
+ swap(&ip6h->saddr, &ip6h->daddr, 16);
+ swap(&udph->source, &udph->dest, 2);
+
+ want_csum = udph->check;
+ if (ip6h)
+ udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
+ else
+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ntohs(udph->len), IPPROTO_UDP, 0);
+
+ meta->flags |= XDP_TXMD_FLAGS_CHECKSUM;
+ if (iph)
+ meta->request.csum_start = sizeof(*eth) + sizeof(*iph);
+ else
+ meta->request.csum_start = sizeof(*eth) + sizeof(*ip6h);
+ meta->request.csum_offset = offsetof(struct udphdr, check);
+
+ printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d csum_offset=%d\n",
+ xsk, ntohs(udph->check), ntohs(want_csum),
+ meta->request.csum_start, meta->request.csum_offset);
+
+ memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
+ tx_desc->options |= XDP_TX_METADATA;
+ tx_desc->len = len;
+
+ xsk_ring_prod__submit(&xsk->tx, 1);
+}
+
static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
{
const struct xdp_desc *rx_desc;
@@ -252,6 +400,13 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
while (true) {
errno = 0;
+
+ for (i = 0; i < rxq; i++) {
+ ret = kick_rx(&rx_xsk[i]);
+ if (ret)
+ printf("kick_rx ret=%d\n", ret);
+ }
+
ret = poll(fds, rxq + 1, 1000);
printf("poll: %d (%d) skip=%llu fail=%llu redir=%llu\n",
ret, errno, bpf_obj->bss->pkts_skip,
@@ -288,6 +443,22 @@ peek:
verify_xdp_metadata(xsk_umem__get_data(xsk->umem_area, addr),
clock_id);
first_seg = false;
+
+ if (!skip_tx) {
+ /* mirror first chunk back */
+ ping_pong(xsk, xsk_umem__get_data(xsk->umem_area, addr),
+ clock_id);
+
+ ret = kick_tx(xsk);
+ if (ret)
+ printf("kick_tx ret=%d\n", ret);
+
+ for (int j = 0; j < 500; j++) {
+ if (complete_tx(xsk, clock_id))
+ break;
+ usleep(10*1000);
+ }
+ }
}
xsk_ring_cons__release(&xsk->rx, 1);
@@ -420,8 +591,10 @@ static void print_usage(void)
{
const char *usage =
"Usage: xdp_hw_metadata [OPTIONS] [IFNAME]\n"
- " -m Enable multi-buffer XDP for larger MTU\n"
+ " -c Run in copy mode (zerocopy is default)\n"
" -h Display this help and exit\n\n"
+ " -m Enable multi-buffer XDP for larger MTU\n"
+ " -r Don't generate AF_XDP reply (rx metadata only)\n"
"Generate test packets on the other machine with:\n"
" echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
@@ -432,14 +605,22 @@ static void read_args(int argc, char *argv[])
{
int opt;
- while ((opt = getopt(argc, argv, "mh")) != -1) {
+ while ((opt = getopt(argc, argv, "chmr")) != -1) {
switch (opt) {
- case 'm':
- bind_flags |= XDP_USE_SG;
+ case 'c':
+ bind_flags &= ~XDP_USE_NEED_WAKEUP;
+ bind_flags &= ~XDP_ZEROCOPY;
+ bind_flags |= XDP_COPY;
break;
case 'h':
print_usage();
exit(0);
+ case 'm':
+ bind_flags |= XDP_USE_SG;
+ break;
+ case 'r':
+ skip_tx = true;
+ break;
case '?':
if (isprint(optopt))
fprintf(stderr, "Unknown option: -%c\n", optopt);
diff --git a/tools/testing/selftests/bpf/xsk.c b/tools/testing/selftests/bpf/xsk.c
index e574711eeb84..25d568abf0f2 100644
--- a/tools/testing/selftests/bpf/xsk.c
+++ b/tools/testing/selftests/bpf/xsk.c
@@ -115,6 +115,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
+ cfg->tx_metadata_len = 0;
return;
}
@@ -123,6 +124,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->frame_size = usr_cfg->frame_size;
cfg->frame_headroom = usr_cfg->frame_headroom;
cfg->flags = usr_cfg->flags;
+ cfg->tx_metadata_len = usr_cfg->tx_metadata_len;
}
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@@ -252,6 +254,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
mr.chunk_size = umem->config.frame_size;
mr.headroom = umem->config.frame_headroom;
mr.flags = umem->config.flags;
+ mr.tx_metadata_len = umem->config.tx_metadata_len;
err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
if (err) {
diff --git a/tools/testing/selftests/bpf/xsk.h b/tools/testing/selftests/bpf/xsk.h
index 771570bc3731..93c2cc413cfc 100644
--- a/tools/testing/selftests/bpf/xsk.h
+++ b/tools/testing/selftests/bpf/xsk.h
@@ -200,6 +200,7 @@ struct xsk_umem_config {
__u32 frame_size;
__u32 frame_headroom;
__u32 flags;
+ __u32 tx_metadata_len;
};
int xsk_attach_xdp_program(struct bpf_program *prog, int ifindex, u32 xdp_flags);