* [PATCH net-next 01/14] net/mlx4_en: Code cleanups in tx path
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
- Remove unused variable ring->poll_cnt
- No need to set some fields if using blueflame
- Add missing const's
- Use unlikely
- Remove unneeded new line
- Make some comments more precise
- struct mlx4_bf @offset field reduced to unsigned int to save space
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 49 +++++++++++++++-------------
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 -
include/linux/mlx4/device.h | 2 +-
3 files changed, 27 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 0c50125..eaf23eb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -191,7 +191,6 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
ring->prod = 0;
ring->cons = 0xffffffff;
ring->last_nr_txbb = 1;
- ring->poll_cnt = 0;
memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
memset(ring->buf, 0, ring->buf_size);
@@ -512,7 +511,8 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
return ring->buf + index * TXBB_SIZE;
}
-static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag)
+static bool is_inline(int inline_thold, const struct sk_buff *skb,
+ void **pfrag)
{
void *ptr;
@@ -535,7 +535,7 @@ static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag)
return 0;
}
-static int inline_size(struct sk_buff *skb)
+static int inline_size(const struct sk_buff *skb)
{
if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
<= MLX4_INLINE_ALIGN)
@@ -546,7 +546,8 @@ static int inline_size(struct sk_buff *skb)
sizeof(struct mlx4_wqe_inline_seg), 16);
}
-static int get_real_size(struct sk_buff *skb, struct net_device *dev,
+static int get_real_size(const struct sk_buff *skb,
+ struct net_device *dev,
int *lso_header_size)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -581,8 +582,10 @@ static int get_real_size(struct sk_buff *skb, struct net_device *dev,
return real_size;
}
-static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb,
- int real_size, u16 *vlan_tag, int tx_ind, void *fragptr)
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
+ const struct sk_buff *skb,
+ int real_size, u16 *vlan_tag,
+ int tx_ind, void *fragptr)
{
struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
@@ -642,7 +645,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
return fallback(dev, skb) % rings_p_up + up * rings_p_up;
}
-static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt)
+static void mlx4_bf_copy(void __iomem *dst, const void *src,
+ unsigned int bytecnt)
{
__iowrite64_copy(dst, src, bytecnt / 8);
}
@@ -736,11 +740,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
tx_info->skb = skb;
tx_info->nr_txbb = nr_txbb;
+ data = &tx_desc->data;
if (lso_header_size)
data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
DS_SIZE));
- else
- data = &tx_desc->data;
/* valid only for none inline segments */
tx_info->data_offset = (void *)data - (void *)tx_desc;
@@ -753,9 +756,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
if (is_inline(ring->inline_thold, skb, &fragptr)) {
tx_info->inl = 1;
} else {
- /* Map fragments */
+ /* Map fragments if any */
for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
- struct skb_frag_struct *frag;
+ const struct skb_frag_struct *frag;
dma_addr_t dma;
frag = &skb_shinfo(skb)->frags[i];
@@ -772,7 +775,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
--data;
}
- /* Map linear part */
+ /* Map linear part if needed */
if (tx_info->linear) {
u32 byte_count = skb_headlen(skb) - lso_header_size;
dma_addr_t dma;
@@ -795,18 +798,14 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
* For timestamping add flag to skb_shinfo and
* set flag for further reference
*/
- if (ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
- skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
- skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+ if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
+ shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
+ shinfo->tx_flags |= SKBTX_IN_PROGRESS;
tx_info->ts_requested = 1;
}
/* Prepare ctrl segement apart opcode+ownership, which depends on
* whether LSO is used */
- tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
- tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
- !!vlan_tx_tag_present(skb);
- tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
@@ -852,7 +851,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
ring->packets++;
-
}
ring->bytes += tx_info->nr_bytes;
netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
@@ -874,7 +872,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
ring->prod += nr_txbb;
/* If we used a bounce buffer then copy descriptor back into place */
- if (bounce)
+ if (unlikely(bounce))
tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
skb_tx_timestamp(skb);
@@ -894,13 +892,18 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
wmb();
- mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
- desc_size);
+ mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
+ desc_size);
wmb();
ring->bf.offset ^= ring->bf.buf_size;
} else {
+ tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+ tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+ !!vlan_tx_tag_present(skb);
+ tx_desc->ctrl.fence_size = real_size;
+
/* Ensure new descriptor hits memory
* before setting ownership of this descriptor to HW
*/
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 84c9d5d..e54b653 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -263,7 +263,6 @@ struct mlx4_en_tx_ring {
u32 buf_size;
u32 doorbell_qpn;
void *buf;
- u16 poll_cnt;
struct mlx4_en_tx_info *tx_info;
u8 *bounce_buf;
u8 queue_index;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b2f8ab9..37e4404 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -583,7 +583,7 @@ struct mlx4_uar {
};
struct mlx4_bf {
- unsigned long offset;
+ unsigned int offset;
int buf_size;
struct mlx4_uar *uar;
void __iomem *reg;
--
1.8.3.4
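The annotation style this cleanup moves toward, shown as a minimal hedged sketch; the structure and function names below are invented for illustration and are not mlx4 code. const parameters plus bool returns document read-only predicates, and unlikely() marks branches that are expected to be cold.

#include <linux/compiler.h>     /* unlikely() */
#include <linux/errno.h>
#include <linux/types.h>        /* bool */

struct pkt_sketch {             /* hypothetical, for illustration only */
        unsigned int len;
};

/* const + bool: the helper promises not to modify its argument and
 * returns a self-documenting yes/no answer.
 */
static bool pkt_is_tiny(const struct pkt_sketch *p, unsigned int thold)
{
        return p->len <= thold;
}

static int pkt_handle(const struct pkt_sketch *p)
{
        /* unlikely() asks the compiler to lay out this error path out
         * of line, keeping the common path hot and branch-predictable.
         */
        if (unlikely(p->len == 0))
                return -EINVAL;

        return pkt_is_tiny(p, 104) ? 1 : 0;
}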
* [PATCH net-next 02/14] net/mlx4_en: Align tx path structures to cache lines
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Reorganize struct mlx4_en_tx_ring to have:
- One cache line containing last_nr_txbb & cons & wake_queue, used by tx
completion.
- One cache line containing fields dirtied by mlx4_en_xmit()
- The following part is read-mostly and shared by CPUs.
Align struct mlx4_en_tx_info to a cache line
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 86 +++++++++++++++-------------
1 file changed, 46 insertions(+), 40 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index e54b653..b7bde95 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -216,13 +216,13 @@ enum cq_type {
struct mlx4_en_tx_info {
struct sk_buff *skb;
- u32 nr_txbb;
- u32 nr_bytes;
- u8 linear;
- u8 data_offset;
- u8 inl;
- u8 ts_requested;
-};
+ u32 nr_txbb;
+ u32 nr_bytes;
+ u8 linear;
+ u8 data_offset;
+ u8 inl;
+ u8 ts_requested;
+} ____cacheline_aligned_in_smp;
#define MLX4_EN_BIT_DESC_OWN 0x80000000
@@ -253,40 +253,46 @@ struct mlx4_en_rx_alloc {
};
struct mlx4_en_tx_ring {
+ /* cache line used and dirtied in tx completion
+ * (mlx4_en_free_tx_buf())
+ */
+ u32 last_nr_txbb;
+ u32 cons;
+ unsigned long wake_queue;
+
+ /* cache line used and dirtied in mlx4_en_xmit() */
+ u32 prod ____cacheline_aligned_in_smp;
+ unsigned long bytes;
+ unsigned long packets;
+ unsigned long tx_csum;
+ unsigned long tso_packets;
+ unsigned long xmit_more;
+ struct mlx4_bf bf;
+ unsigned long queue_stopped;
+
+ /* Following part should be mostly read */
+ cpumask_t affinity_mask;
+ struct mlx4_qp qp;
struct mlx4_hwq_resources wqres;
- u32 size ; /* number of TXBBs */
- u32 size_mask;
- u16 stride;
- u16 cqn; /* index of port CQ associated with this ring */
- u32 prod;
- u32 cons;
- u32 buf_size;
- u32 doorbell_qpn;
- void *buf;
- struct mlx4_en_tx_info *tx_info;
- u8 *bounce_buf;
- u8 queue_index;
- cpumask_t affinity_mask;
- u32 last_nr_txbb;
- struct mlx4_qp qp;
- struct mlx4_qp_context context;
- int qpn;
- enum mlx4_qp_state qp_state;
- struct mlx4_srq dummy;
- unsigned long bytes;
- unsigned long packets;
- unsigned long tx_csum;
- unsigned long queue_stopped;
- unsigned long wake_queue;
- unsigned long tso_packets;
- unsigned long xmit_more;
- struct mlx4_bf bf;
- bool bf_enabled;
- bool bf_alloced;
- struct netdev_queue *tx_queue;
- int hwtstamp_tx_type;
- int inline_thold;
-};
+ u32 size; /* number of TXBBs */
+ u32 size_mask;
+ u16 stride;
+ u16 cqn; /* index of port CQ associated with this ring */
+ u32 buf_size;
+ u32 doorbell_qpn;
+ void *buf;
+ struct mlx4_en_tx_info *tx_info;
+ u8 *bounce_buf;
+ struct mlx4_qp_context context;
+ int qpn;
+ enum mlx4_qp_state qp_state;
+ u8 queue_index;
+ bool bf_enabled;
+ bool bf_alloced;
+ struct netdev_queue *tx_queue;
+ int hwtstamp_tx_type;
+ int inline_thold;
+} ____cacheline_aligned_in_smp;
struct mlx4_en_rx_desc {
/* actual number of entries depends on rx ring stride */
--
1.8.3.4
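The layout principle behind the patch above, as a minimal hedged sketch: the structure and field names below are invented, only ____cacheline_aligned_in_smp and the completion/xmit/read-mostly grouping reflect what the patch does. Fields dirtied by the completion path, fields dirtied by the xmit path, and read-mostly configuration each get their own cache line, so the CPUs running the two paths stop invalidating each other's lines.

#include <linux/cache.h>        /* ____cacheline_aligned_in_smp */
#include <linux/types.h>

struct tx_ring_sketch {
        /* dirtied by the tx completion path only */
        u32 cons;
        u32 last_nr_txbb;
        unsigned long wake_queue;

        /* dirtied by the xmit path only; the annotation starts a new
         * cache line, so producer stores never bounce the line above
         */
        u32 prod ____cacheline_aligned_in_smp;
        unsigned long bytes;
        unsigned long packets;

        /* read-mostly configuration, safely shared by both paths */
        u32 size ____cacheline_aligned_in_smp;
        u32 size_mask;
        void *buf;
} ____cacheline_aligned_in_smp;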
* [PATCH net-next 03/14] net/mlx4_en: Avoid calling bswap in tx fast path
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
- doorbell_qpn is stored already converted with cpu_to_be32() to avoid a
bswap() in the fast path.
- mdev->mr.key is stored in ring->mr_key, which also avoids a bswap() and an
access to a cold cache line.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 17 ++++++++++-------
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++-
2 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index eaf23eb..322cda3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -195,7 +195,8 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
memset(ring->buf, 0, ring->buf_size);
ring->qp_state = MLX4_QP_STATE_RST;
- ring->doorbell_qpn = ring->qp.qpn << 8;
+ ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
+ ring->mr_key = cpu_to_be32(mdev->mr.key);
mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
ring->cqn, user_prio, &ring->context);
@@ -654,7 +655,6 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
- struct mlx4_en_dev *mdev = priv->mdev;
struct device *ddev = priv->ddev;
struct mlx4_en_tx_ring *ring;
struct mlx4_en_tx_desc *tx_desc;
@@ -769,7 +769,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_drop_unmap;
data->addr = cpu_to_be64(dma);
- data->lkey = cpu_to_be32(mdev->mr.key);
+ data->lkey = ring->mr_key;
wmb();
data->byte_count = cpu_to_be32(skb_frag_size(frag));
--data;
@@ -787,7 +787,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_drop_unmap;
data->addr = cpu_to_be64(dma);
- data->lkey = cpu_to_be32(mdev->mr.key);
+ data->lkey = ring->mr_key;
wmb();
data->byte_count = cpu_to_be32(byte_count);
}
@@ -879,9 +879,12 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
+ real_size = (real_size / 16) & 0x3f;
+
if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
!vlan_tx_tag_present(skb) && send_doorbell) {
- tx_desc->ctrl.bf_qpn |= cpu_to_be32(ring->doorbell_qpn);
+ tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
+ cpu_to_be32(real_size);
op_own |= htonl((bf_index & 0xffff) << 8);
/* Ensure new descriptor hits memory
@@ -911,8 +914,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
tx_desc->ctrl.owner_opcode = op_own;
if (send_doorbell) {
wmb();
- iowrite32be(ring->doorbell_qpn,
- ring->bf.uar->map + MLX4_SEND_DOORBELL);
+ iowrite32(ring->doorbell_qpn,
+ ring->bf.uar->map + MLX4_SEND_DOORBELL);
} else {
ring->xmit_more++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index b7bde95..ab34461 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -279,7 +279,8 @@ struct mlx4_en_tx_ring {
u16 stride;
u16 cqn; /* index of port CQ associated with this ring */
u32 buf_size;
- u32 doorbell_qpn;
+ __be32 doorbell_qpn;
+ __be32 mr_key;
void *buf;
struct mlx4_en_tx_info *tx_info;
u8 *bounce_buf;
--
1.8.3.4
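A hedged sketch of the convert-at-setup idea; the ring structure, the descriptor layout and both functions below are invented for illustration. Values that only ever reach the hardware in big-endian form are converted once when the ring is configured, so the per-packet path issues plain stores with no byte swap and no read of a cold management structure.

#include <linux/io.h>
#include <linux/types.h>
#include <asm/byteorder.h>

struct data_seg_sketch {
        __be32 byte_count;
        __be32 lkey;
        __be64 addr;
};

struct tx_ring_sketch {
        __be32 doorbell_qpn;    /* kept pre-swapped */
        __be32 mr_key;          /* kept pre-swapped */
        void __iomem *db_addr;
};

/* setup path, executed once per ring: do the conversions here */
static void ring_setup(struct tx_ring_sketch *ring, u32 qpn, u32 mr_key)
{
        ring->doorbell_qpn = cpu_to_be32(qpn << 8);
        ring->mr_key = cpu_to_be32(mr_key);
}

/* hot path, executed per packet: no cpu_to_be32()/bswap needed */
static void ring_doorbell(struct tx_ring_sketch *ring,
                          struct data_seg_sketch *data)
{
        data->lkey = ring->mr_key;
        iowrite32(ring->doorbell_qpn, ring->db_addr);
}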
* [PATCH net-next 04/14] net/mlx4_en: tx_info allocated with kmalloc() instead of vmalloc()
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Try to allocate using kmalloc_node() first; only on failure fall back to
vmalloc().
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 322cda3..1447906 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -68,7 +68,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
ring->inline_thold = priv->prof->inline_thold;
tmp = size * sizeof(struct mlx4_en_tx_info);
- ring->tx_info = vmalloc_node(tmp, node);
+ ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
if (!ring->tx_info) {
ring->tx_info = vmalloc(tmp);
if (!ring->tx_info) {
@@ -151,7 +151,7 @@ err_bounce:
kfree(ring->bounce_buf);
ring->bounce_buf = NULL;
err_info:
- vfree(ring->tx_info);
+ kvfree(ring->tx_info);
ring->tx_info = NULL;
err_ring:
kfree(ring);
@@ -174,7 +174,7 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
kfree(ring->bounce_buf);
ring->bounce_buf = NULL;
- vfree(ring->tx_info);
+ kvfree(ring->tx_info);
ring->tx_info = NULL;
kfree(ring);
*pring = NULL;
--
1.8.3.4
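The allocation pattern used above, as a self-contained hedged sketch; the two wrappers are invented, while kmalloc_node(), vmalloc() and kvfree() are the kernel APIs the patch actually uses. A physically contiguous, NUMA-local allocation is tried first with __GFP_NOWARN so an expected large-order failure stays silent, vmalloc() is the fallback, and kvfree() frees either kind of pointer.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *alloc_tx_info_sketch(size_t bytes, int node)
{
        /* fast path: contiguous, NUMA-local memory; don't warn on failure */
        void *p = kmalloc_node(bytes, GFP_KERNEL | __GFP_NOWARN, node);

        /* fallback: virtually contiguous memory is good enough here */
        if (!p)
                p = vmalloc(bytes);
        return p;
}

/* kvfree() handles both kmalloc()ed and vmalloc()ed pointers */
static void free_tx_info_sketch(void *p)
{
        kvfree(p);
}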
* [PATCH net-next 05/14] net/mlx4_en: Avoid a cache line miss in TX completion for single frag skb's
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Add map0_dma/map0_byte_count into mlx4_en_tx_info to avoid a cache
line miss in TX completion for frames having one dma element. (We avoid
reading back the tx descriptor)
Note this could be extended to 2/3 dma elements later, as we have free
room in mlx4_en_tx_info
Also, mlx4_en_free_tx_desc() no longer accesses skb_shinfo(). We use a
new nr_maps field in mlx4_en_tx_info to avoid 2 or 3 cache misses.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 83 +++++++++++++++-------------
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 +
2 files changed, 49 insertions(+), 37 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 1447906..edc4a88 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -259,38 +259,40 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring,
int index, u8 owner, u64 timestamp)
{
- struct mlx4_en_dev *mdev = priv->mdev;
struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
- struct sk_buff *skb = tx_info->skb;
- struct skb_frag_struct *frag;
void *end = ring->buf + ring->buf_size;
- int frags = skb_shinfo(skb)->nr_frags;
+ struct sk_buff *skb = tx_info->skb;
+ int nr_maps = tx_info->nr_maps;
int i;
- struct skb_shared_hwtstamps hwts;
- if (timestamp) {
- mlx4_en_fill_hwtstamps(mdev, &hwts, timestamp);
+ if (unlikely(timestamp)) {
+ struct skb_shared_hwtstamps hwts;
+
+ mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
skb_tstamp_tx(skb, &hwts);
}
/* Optimize the common case when there are no wraparounds */
if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
if (!tx_info->inl) {
- if (tx_info->linear) {
+ if (tx_info->linear)
dma_unmap_single(priv->ddev,
- (dma_addr_t) be64_to_cpu(data->addr),
- be32_to_cpu(data->byte_count),
- PCI_DMA_TODEVICE);
- ++data;
- }
-
- for (i = 0; i < frags; i++) {
- frag = &skb_shinfo(skb)->frags[i];
+ tx_info->map0_dma,
+ tx_info->map0_byte_count,
+ PCI_DMA_TODEVICE);
+ else
+ dma_unmap_page(priv->ddev,
+ tx_info->map0_dma,
+ tx_info->map0_byte_count,
+ PCI_DMA_TODEVICE);
+ for (i = 1; i < nr_maps; i++) {
+ data++;
dma_unmap_page(priv->ddev,
- (dma_addr_t) be64_to_cpu(data[i].addr),
- skb_frag_size(frag), PCI_DMA_TODEVICE);
+ (dma_addr_t)be64_to_cpu(data->addr),
+ be32_to_cpu(data->byte_count),
+ PCI_DMA_TODEVICE);
}
}
} else {
@@ -299,23 +301,25 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
data = ring->buf + ((void *)data - end);
}
- if (tx_info->linear) {
+ if (tx_info->linear)
dma_unmap_single(priv->ddev,
- (dma_addr_t) be64_to_cpu(data->addr),
- be32_to_cpu(data->byte_count),
- PCI_DMA_TODEVICE);
- ++data;
- }
-
- for (i = 0; i < frags; i++) {
+ tx_info->map0_dma,
+ tx_info->map0_byte_count,
+ PCI_DMA_TODEVICE);
+ else
+ dma_unmap_page(priv->ddev,
+ tx_info->map0_dma,
+ tx_info->map0_byte_count,
+ PCI_DMA_TODEVICE);
+ for (i = 1; i < nr_maps; i++) {
+ data++;
/* Check for wraparound before unmapping */
if ((void *) data >= end)
data = ring->buf;
- frag = &skb_shinfo(skb)->frags[i];
dma_unmap_page(priv->ddev,
- (dma_addr_t) be64_to_cpu(data->addr),
- skb_frag_size(frag), PCI_DMA_TODEVICE);
- ++data;
+ (dma_addr_t)be64_to_cpu(data->addr),
+ be32_to_cpu(data->byte_count),
+ PCI_DMA_TODEVICE);
}
}
}
@@ -751,19 +755,22 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
tx_info->linear = (lso_header_size < skb_headlen(skb) &&
!is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0;
- data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1;
+ tx_info->nr_maps = skb_shinfo(skb)->nr_frags + tx_info->linear;
+ data += tx_info->nr_maps - 1;
if (is_inline(ring->inline_thold, skb, &fragptr)) {
tx_info->inl = 1;
} else {
+ dma_addr_t dma = 0;
+ u32 byte_count = 0;
+
/* Map fragments if any */
for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
const struct skb_frag_struct *frag;
- dma_addr_t dma;
-
frag = &skb_shinfo(skb)->frags[i];
+ byte_count = skb_frag_size(frag);
dma = skb_frag_dma_map(ddev, frag,
- 0, skb_frag_size(frag),
+ 0, byte_count,
DMA_TO_DEVICE);
if (dma_mapping_error(ddev, dma))
goto tx_drop_unmap;
@@ -771,14 +778,13 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
data->addr = cpu_to_be64(dma);
data->lkey = ring->mr_key;
wmb();
- data->byte_count = cpu_to_be32(skb_frag_size(frag));
+ data->byte_count = cpu_to_be32(byte_count);
--data;
}
/* Map linear part if needed */
if (tx_info->linear) {
- u32 byte_count = skb_headlen(skb) - lso_header_size;
- dma_addr_t dma;
+ byte_count = skb_headlen(skb) - lso_header_size;
dma = dma_map_single(ddev, skb->data +
lso_header_size, byte_count,
@@ -792,6 +798,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
data->byte_count = cpu_to_be32(byte_count);
}
tx_info->inl = 0;
+ /* tx completion can avoid cache line miss for common cases */
+ tx_info->map0_dma = dma;
+ tx_info->map0_byte_count = byte_count;
}
/*
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index ab34461..a904030 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -216,12 +216,15 @@ enum cq_type {
struct mlx4_en_tx_info {
struct sk_buff *skb;
+ dma_addr_t map0_dma;
+ u32 map0_byte_count;
u32 nr_txbb;
u32 nr_bytes;
u8 linear;
u8 data_offset;
u8 inl;
u8 ts_requested;
+ u8 nr_maps;
} ____cacheline_aligned_in_smp;
--
1.8.3.4
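A hedged sketch of the completion-path saving; the structure and helpers below are invented, only the map0_dma/map0_byte_count/nr_maps idea comes from the patch. The xmit path copies the first mapping's DMA address and length into the per-packet tx_info while those values are still hot, so the completion path can unmap a single-mapping packet without reading the TX descriptor back, which by then is usually cold.

#include <linux/dma-mapping.h>
#include <linux/types.h>

struct tx_info_sketch {                 /* illustrative subset */
        dma_addr_t map0_dma;
        u32        map0_byte_count;
        u8         nr_maps;
};

/* xmit path: remember the first mapping while it is still in cache */
static void record_map0(struct tx_info_sketch *ti, dma_addr_t dma,
                        u32 len, u8 maps)
{
        ti->map0_dma = dma;
        ti->map0_byte_count = len;
        ti->nr_maps = maps;
}

/* completion path: the common single-mapping case never touches the
 * descriptor (the real driver picks dma_unmap_single() or
 * dma_unmap_page() depending on what map0 was)
 */
static void unmap_first(struct device *dev, const struct tx_info_sketch *ti)
{
        dma_unmap_single(dev, ti->map0_dma, ti->map0_byte_count,
                         DMA_TO_DEVICE);
}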
* [PATCH net-next 06/14] net/mlx4_en: Use prefetch in tx path
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
mlx4_en_free_tx_desc() uses prefetchw(&skb->users) to speed up consume_skb().
prefetchw(&ring->tx_queue->dql) is used to speed up the BQL update.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index edc4a88..b96627c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -37,6 +37,7 @@
#include <linux/mlx4/qp.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
+#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <linux/tcp.h>
#include <linux/ip.h>
@@ -267,6 +268,11 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
int nr_maps = tx_info->nr_maps;
int i;
+ /* We do not touch skb here, so prefetch skb->users location
+ * to speedup consume_skb()
+ */
+ prefetchw(&skb->users);
+
if (unlikely(timestamp)) {
struct skb_shared_hwtstamps hwts;
@@ -385,6 +391,7 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
if (!priv->port_up)
return true;
+ prefetchw(&ring->tx_queue->dql.limit);
index = cons_index & size_mask;
cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
ring_index = ring->cons & size_mask;
@@ -722,6 +729,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
}
}
+ prefetchw(&ring->tx_queue->dql);
+
/* Track current inflight packets for performance analysis */
AVG_PERF_COUNTER(priv->pstats.inflight_avg,
(u32) (ring->prod - ring->cons - 1));
--
1.8.3.4
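A hedged sketch of the prefetch idea; the surrounding function is invented, only prefetchw(&skb->users) mirrors the patch. prefetchw() issues a prefetch-for-write well before the data is needed, so by the time consume_skb() drops skb->users (or, in the other hunk, BQL touches the dql structure) the line is already in the cache in an exclusive state and the write does not stall.

#include <linux/prefetch.h>
#include <linux/skbuff.h>

static void complete_one_sketch(struct sk_buff *skb)
{
        /* skb->users is written much later, inside consume_skb();
         * start pulling that cache line in for-write now, while there
         * is other work (unmapping, stats) to overlap with the miss.
         */
        prefetchw(&skb->users);

        /* ... unmap buffers, update counters ... */

        consume_skb(skb);
}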
* [PATCH net-next 07/14] net/mlx4_en: Avoid false sharing in mlx4_en_en_process_tx_cq()
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
mlx4_en_process_tx_cq() carefully fetches and writes ring->last_nr_txbb
and ring->cons only one time to avoid false sharing
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index b96627c..d9aaeb2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -387,6 +387,8 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
u64 timestamp = 0;
int done = 0;
int budget = priv->tx_work_limit;
+ u32 last_nr_txbb;
+ u32 ring_cons;
if (!priv->port_up)
return true;
@@ -394,7 +396,9 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
prefetchw(&ring->tx_queue->dql.limit);
index = cons_index & size_mask;
cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
- ring_index = ring->cons & size_mask;
+ last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
+ ring_cons = ACCESS_ONCE(ring->cons);
+ ring_index = ring_cons & size_mask;
stamp_index = ring_index;
/* Process all completed CQEs */
@@ -419,19 +423,19 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
do {
- txbbs_skipped += ring->last_nr_txbb;
- ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
+ txbbs_skipped += last_nr_txbb;
+ ring_index = (ring_index + last_nr_txbb) & size_mask;
if (ring->tx_info[ring_index].ts_requested)
timestamp = mlx4_en_get_cqe_ts(cqe);
/* free next descriptor */
- ring->last_nr_txbb = mlx4_en_free_tx_desc(
+ last_nr_txbb = mlx4_en_free_tx_desc(
priv, ring, ring_index,
- !!((ring->cons + txbbs_skipped) &
+ !!((ring_cons + txbbs_skipped) &
ring->size), timestamp);
mlx4_en_stamp_wqe(priv, ring, stamp_index,
- !!((ring->cons + txbbs_stamp) &
+ !!((ring_cons + txbbs_stamp) &
ring->size));
stamp_index = ring_index;
txbbs_stamp = txbbs_skipped;
@@ -452,7 +456,11 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
mcq->cons_index = cons_index;
mlx4_cq_set_ci(mcq);
wmb();
- ring->cons += txbbs_skipped;
+
+ /* we want to dirty this cache line once */
+ ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
+ ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+
netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
/*
--
1.8.3.4
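A hedged sketch of the read-once/write-once pattern above; the ring structure and the two helpers are invented stand-ins for the real completion processing. The shared fields are snapshotted into locals with ACCESS_ONCE() (READ_ONCE()/WRITE_ONCE() in later kernels), the loop works purely on the locals, and the results are published back with single stores, so the cache line shared with the producer is dirtied once per poll instead of once per completed descriptor.

#include <linux/compiler.h>     /* ACCESS_ONCE() */
#include <linux/types.h>

struct cq_ring_sketch {         /* invented, for illustration only */
        u32 cons;
        u32 last_nr_txbb;
        u32 size_mask;
};

/* invented helpers standing in for the real per-descriptor work */
bool more_completions(struct cq_ring_sketch *ring);
u32 free_one_desc(struct cq_ring_sketch *ring, u32 index);

static void process_completions_sketch(struct cq_ring_sketch *ring)
{
        /* read each shared field exactly once ... */
        u32 ring_cons = ACCESS_ONCE(ring->cons);
        u32 last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
        u32 txbbs_skipped = 0;

        /* ... work only on the local copies inside the loop ... */
        while (more_completions(ring)) {
                txbbs_skipped += last_nr_txbb;
                last_nr_txbb = free_one_desc(ring,
                        (ring_cons + txbbs_skipped) & ring->size_mask);
        }

        /* ... and publish the results once per poll */
        ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
        ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
}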
* [PATCH net-next 08/14] net/mlx4_en: mlx4_en_xmit() reads ring->cons once, and ahead of time to avoid stalls
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index d9aaeb2..99875c8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -691,10 +691,17 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
void *fragptr;
bool bounce = false;
bool send_doorbell;
+ u32 ring_cons;
if (!priv->port_up)
goto tx_drop;
+ tx_ind = skb_get_queue_mapping(skb);
+ ring = priv->tx_ring[tx_ind];
+
+ /* fetch ring->cons far ahead before needing it to avoid stall */
+ ring_cons = ACCESS_ONCE(ring->cons);
+
real_size = get_real_size(skb, dev, &lso_header_size);
if (unlikely(!real_size))
goto tx_drop;
@@ -708,13 +715,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_drop;
}
- tx_ind = skb->queue_mapping;
- ring = priv->tx_ring[tx_ind];
if (vlan_tx_tag_present(skb))
vlan_tag = vlan_tx_tag_get(skb);
/* Check available TXBBs And 2K spare for prefetch */
- if (unlikely(((int)(ring->prod - ring->cons)) >
+ if (unlikely(((int)(ring->prod - ring_cons)) >
ring->size - HEADROOM - MAX_DESC_TXBBS)) {
/* every full Tx ring stops queue */
netif_tx_stop_queue(ring->tx_queue);
@@ -728,7 +733,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
*/
wmb();
- if (unlikely(((int)(ring->prod - ring->cons)) <=
+ ring_cons = ACCESS_ONCE(ring->cons);
+ if (unlikely(((int)(ring->prod - ring_cons)) <=
ring->size - HEADROOM - MAX_DESC_TXBBS)) {
netif_tx_wake_queue(ring->tx_queue);
ring->wake_queue++;
@@ -741,7 +747,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* Track current inflight packets for performance analysis */
AVG_PERF_COUNTER(priv->pstats.inflight_avg,
- (u32) (ring->prod - ring->cons - 1));
+ (u32)(ring->prod - ring_cons - 1));
/* Packet is good - grab an index and transmit it */
index = ring->prod & ring->size_mask;
--
1.8.3.4
* [PATCH net-next 09/14] net/mlx4_en: Use local var in tx flow for skb_shinfo(skb)
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Access skb_shinfo(skb) once in the tx flow.
Also, rename the @i variable to @i_frag to avoid confusion, as the "goto
tx_drop_unmap;" error path relies on this @i variable.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 58 +++++++++++++++++-------------
1 file changed, 34 insertions(+), 24 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 99875c8..aa05b09 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -532,13 +532,14 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
}
static bool is_inline(int inline_thold, const struct sk_buff *skb,
+ const struct skb_shared_info *shinfo,
void **pfrag)
{
void *ptr;
if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) {
- if (skb_shinfo(skb)->nr_frags == 1) {
- ptr = skb_frag_address_safe(&skb_shinfo(skb)->frags[0]);
+ if (shinfo->nr_frags == 1) {
+ ptr = skb_frag_address_safe(&shinfo->frags[0]);
if (unlikely(!ptr))
return 0;
@@ -546,7 +547,7 @@ static bool is_inline(int inline_thold, const struct sk_buff *skb,
*pfrag = ptr;
return 1;
- } else if (unlikely(skb_shinfo(skb)->nr_frags))
+ } else if (unlikely(shinfo->nr_frags))
return 0;
else
return 1;
@@ -567,18 +568,19 @@ static int inline_size(const struct sk_buff *skb)
}
static int get_real_size(const struct sk_buff *skb,
+ const struct skb_shared_info *shinfo,
struct net_device *dev,
int *lso_header_size)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
int real_size;
- if (skb_is_gso(skb)) {
+ if (shinfo->gso_size) {
if (skb->encapsulation)
*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
else
*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
- real_size = CTRL_SIZE + skb_shinfo(skb)->nr_frags * DS_SIZE +
+ real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
ALIGN(*lso_header_size + 4, DS_SIZE);
if (unlikely(*lso_header_size != skb_headlen(skb))) {
/* We add a segment for the skb linear buffer only if
@@ -593,8 +595,8 @@ static int get_real_size(const struct sk_buff *skb,
}
} else {
*lso_header_size = 0;
- if (!is_inline(priv->prof->inline_thold, skb, NULL))
- real_size = CTRL_SIZE + (skb_shinfo(skb)->nr_frags + 1) * DS_SIZE;
+ if (!is_inline(priv->prof->inline_thold, skb, shinfo, NULL))
+ real_size = CTRL_SIZE + (shinfo->nr_frags + 1) * DS_SIZE;
else
real_size = inline_size(skb);
}
@@ -604,6 +606,7 @@ static int get_real_size(const struct sk_buff *skb,
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
const struct sk_buff *skb,
+ const struct skb_shared_info *shinfo,
int real_size, u16 *vlan_tag,
int tx_ind, void *fragptr)
{
@@ -619,9 +622,9 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
MIN_PKT_LEN - skb->len);
}
skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
- if (skb_shinfo(skb)->nr_frags)
+ if (shinfo->nr_frags)
memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr,
- skb_frag_size(&skb_shinfo(skb)->frags[0]));
+ skb_frag_size(&shinfo->frags[0]));
} else {
inl->byte_count = cpu_to_be32(1 << 31 | spc);
@@ -639,9 +642,10 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
inl = (void *) (inl + 1) + spc;
skb_copy_from_linear_data_offset(skb, spc, inl + 1,
skb_headlen(skb) - spc);
- if (skb_shinfo(skb)->nr_frags)
+ if (shinfo->nr_frags)
memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc,
- fragptr, skb_frag_size(&skb_shinfo(skb)->frags[0]));
+ fragptr,
+ skb_frag_size(&shinfo->frags[0]));
}
wmb();
@@ -673,6 +677,7 @@ static void mlx4_bf_copy(void __iomem *dst, const void *src,
netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
struct mlx4_en_priv *priv = netdev_priv(dev);
struct device *ddev = priv->ddev;
struct mlx4_en_tx_ring *ring;
@@ -686,7 +691,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
u32 index, bf_index;
__be32 op_own;
u16 vlan_tag = 0;
- int i;
+ int i_frag;
int lso_header_size;
void *fragptr;
bool bounce = false;
@@ -702,7 +707,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* fetch ring->cons far ahead before needing it to avoid stall */
ring_cons = ACCESS_ONCE(ring->cons);
- real_size = get_real_size(skb, dev, &lso_header_size);
+ real_size = get_real_size(skb, shinfo, dev, &lso_header_size);
if (unlikely(!real_size))
goto tx_drop;
@@ -776,21 +781,22 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
tx_info->data_offset = (void *)data - (void *)tx_desc;
tx_info->linear = (lso_header_size < skb_headlen(skb) &&
- !is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0;
+ !is_inline(ring->inline_thold, skb, shinfo, NULL)) ? 1 : 0;
- tx_info->nr_maps = skb_shinfo(skb)->nr_frags + tx_info->linear;
+ tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
data += tx_info->nr_maps - 1;
- if (is_inline(ring->inline_thold, skb, &fragptr)) {
+ if (is_inline(ring->inline_thold, skb, shinfo, &fragptr)) {
tx_info->inl = 1;
} else {
dma_addr_t dma = 0;
u32 byte_count = 0;
/* Map fragments if any */
- for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
+ for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
const struct skb_frag_struct *frag;
- frag = &skb_shinfo(skb)->frags[i];
+
+ frag = &shinfo->frags[i_frag];
byte_count = skb_frag_size(frag);
dma = skb_frag_dma_map(ddev, frag,
0, byte_count,
@@ -858,6 +864,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* Handle LSO (TSO) packets */
if (lso_header_size) {
+ int i;
+
/* Mark opcode as LSO */
op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
((ring->prod & ring->size) ?
@@ -865,15 +873,16 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* Fill in the LSO prefix */
tx_desc->lso.mss_hdr_size = cpu_to_be32(
- skb_shinfo(skb)->gso_size << 16 | lso_header_size);
+ shinfo->gso_size << 16 | lso_header_size);
/* Copy headers;
* note that we already verified that it is linear */
memcpy(tx_desc->lso.header, skb->data, lso_header_size);
ring->tso_packets++;
- i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) +
- !!((skb->len - lso_header_size) % skb_shinfo(skb)->gso_size);
+
+ i = ((skb->len - lso_header_size) / shinfo->gso_size) +
+ !!((skb->len - lso_header_size) % shinfo->gso_size);
tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
ring->packets += i;
} else {
@@ -889,7 +898,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
if (tx_info->inl) {
- build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr);
+ build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
+ tx_ind, fragptr);
tx_info->inl = 1;
}
@@ -958,8 +968,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
tx_drop_unmap:
en_err(priv, "DMA mapping error\n");
- for (i++; i < skb_shinfo(skb)->nr_frags; i++) {
- data++;
+ while (++i_frag < shinfo->nr_frags) {
+ ++data;
dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
be32_to_cpu(data->byte_count),
PCI_DMA_TODEVICE);
--
1.8.3.4
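The micro-optimization itself is simple; a hedged sketch with an invented helper. skb_shinfo(skb) is a computed address (it is derived from the skb's end pointer), so taking it once into a local avoids re-deriving it at every use, which matters in a function as large as mlx4_en_xmit().

#include <linux/skbuff.h>

static unsigned int count_frag_bytes_sketch(const struct sk_buff *skb)
{
        /* take the pointer once ... */
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        unsigned int bytes = 0;
        unsigned int i;

        /* ... and reuse it, instead of expanding skb_shinfo(skb)
         * (a load plus pointer arithmetic) on every iteration
         */
        for (i = 0; i < shinfo->nr_frags; i++)
                bytes += skb_frag_size(&shinfo->frags[i]);

        return bytes;
}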
* [PATCH net-next 10/14] net/mlx4_en: Use local var for skb_headlen(skb)
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Access skb_headlen() once in tx flow
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index aa05b09..e00841a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -612,6 +612,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
{
struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+ unsigned int hlen = skb_headlen(skb);
if (skb->len <= spc) {
if (likely(skb->len >= MIN_PKT_LEN)) {
@@ -621,19 +622,19 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
memset(((void *)(inl + 1)) + skb->len, 0,
MIN_PKT_LEN - skb->len);
}
- skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
+ skb_copy_from_linear_data(skb, inl + 1, hlen);
if (shinfo->nr_frags)
- memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr,
+ memcpy(((void *)(inl + 1)) + hlen, fragptr,
skb_frag_size(&shinfo->frags[0]));
} else {
inl->byte_count = cpu_to_be32(1 << 31 | spc);
- if (skb_headlen(skb) <= spc) {
- skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb));
- if (skb_headlen(skb) < spc) {
- memcpy(((void *)(inl + 1)) + skb_headlen(skb),
- fragptr, spc - skb_headlen(skb));
- fragptr += spc - skb_headlen(skb);
+ if (hlen <= spc) {
+ skb_copy_from_linear_data(skb, inl + 1, hlen);
+ if (hlen < spc) {
+ memcpy(((void *)(inl + 1)) + hlen,
+ fragptr, spc - hlen);
+ fragptr += spc - hlen;
}
inl = (void *) (inl + 1) + spc;
memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
@@ -641,9 +642,9 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
skb_copy_from_linear_data(skb, inl + 1, spc);
inl = (void *) (inl + 1) + spc;
skb_copy_from_linear_data_offset(skb, spc, inl + 1,
- skb_headlen(skb) - spc);
+ hlen - spc);
if (shinfo->nr_frags)
- memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc,
+ memcpy(((void *)(inl + 1)) + hlen - spc,
fragptr,
skb_frag_size(&shinfo->frags[0]));
}
--
1.8.3.4
* [PATCH net-next 11/14] net/mlx4_en: tx_info->ts_requested was not cleared
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Properly clear tx_info->ts_requested
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index e00841a..2c03b55 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -837,6 +837,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
* For timestamping add flag to skb_shinfo and
* set flag for further reference
*/
+ tx_info->ts_requested = 0;
if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
shinfo->tx_flags |= SKBTX_IN_PROGRESS;
--
1.8.3.4
* [PATCH net-next 12/14] net/mlx4_en: Enable the compiler to make is_inline() inlined
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Reorganize the code to call is_inline() once, so the compiler can inline it.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 67 +++++++++++++++++-------------
1 file changed, 38 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 2c03b55..f0080c5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -531,29 +531,32 @@ static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
return ring->buf + index * TXBB_SIZE;
}
+/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
+ *
+ * It seems strange we do not simply use skb_copy_bits().
+ * This would allow to inline all skbs iff skb->len <= inline_thold
+ *
+ * Note that caller already checked skb was not a gso packet
+ */
static bool is_inline(int inline_thold, const struct sk_buff *skb,
const struct skb_shared_info *shinfo,
void **pfrag)
{
void *ptr;
- if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) {
- if (shinfo->nr_frags == 1) {
- ptr = skb_frag_address_safe(&shinfo->frags[0]);
- if (unlikely(!ptr))
- return 0;
-
- if (pfrag)
- *pfrag = ptr;
+ if (skb->len > inline_thold || !inline_thold)
+ return false;
- return 1;
- } else if (unlikely(shinfo->nr_frags))
- return 0;
- else
- return 1;
+ if (shinfo->nr_frags == 1) {
+ ptr = skb_frag_address_safe(&shinfo->frags[0]);
+ if (unlikely(!ptr))
+ return false;
+ *pfrag = ptr;
+ return true;
}
-
- return 0;
+ if (shinfo->nr_frags)
+ return false;
+ return true;
}
static int inline_size(const struct sk_buff *skb)
@@ -570,12 +573,15 @@ static int inline_size(const struct sk_buff *skb)
static int get_real_size(const struct sk_buff *skb,
const struct skb_shared_info *shinfo,
struct net_device *dev,
- int *lso_header_size)
+ int *lso_header_size,
+ bool *inline_ok,
+ void **pfrag)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
int real_size;
if (shinfo->gso_size) {
+ *inline_ok = false;
if (skb->encapsulation)
*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
else
@@ -595,10 +601,14 @@ static int get_real_size(const struct sk_buff *skb,
}
} else {
*lso_header_size = 0;
- if (!is_inline(priv->prof->inline_thold, skb, shinfo, NULL))
- real_size = CTRL_SIZE + (shinfo->nr_frags + 1) * DS_SIZE;
- else
+ *inline_ok = is_inline(priv->prof->inline_thold, skb,
+ shinfo, pfrag);
+
+ if (*inline_ok)
real_size = inline_size(skb);
+ else
+ real_size = CTRL_SIZE +
+ (shinfo->nr_frags + 1) * DS_SIZE;
}
return real_size;
@@ -694,9 +704,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
u16 vlan_tag = 0;
int i_frag;
int lso_header_size;
- void *fragptr;
+ void *fragptr = NULL;
bool bounce = false;
bool send_doorbell;
+ bool inline_ok;
u32 ring_cons;
if (!priv->port_up)
@@ -708,7 +719,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* fetch ring->cons far ahead before needing it to avoid stall */
ring_cons = ACCESS_ONCE(ring->cons);
- real_size = get_real_size(skb, shinfo, dev, &lso_header_size);
+ real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
+ &inline_ok, &fragptr);
if (unlikely(!real_size))
goto tx_drop;
@@ -781,15 +793,15 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
/* valid only for none inline segments */
tx_info->data_offset = (void *)data - (void *)tx_desc;
+ tx_info->inl = inline_ok;
+
tx_info->linear = (lso_header_size < skb_headlen(skb) &&
- !is_inline(ring->inline_thold, skb, shinfo, NULL)) ? 1 : 0;
+ !inline_ok) ? 1 : 0;
tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
data += tx_info->nr_maps - 1;
- if (is_inline(ring->inline_thold, skb, shinfo, &fragptr)) {
- tx_info->inl = 1;
- } else {
+ if (!tx_info->inl) {
dma_addr_t dma = 0;
u32 byte_count = 0;
@@ -827,7 +839,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
wmb();
data->byte_count = cpu_to_be32(byte_count);
}
- tx_info->inl = 0;
/* tx completion can avoid cache line miss for common cases */
tx_info->map0_dma = dma;
tx_info->map0_byte_count = byte_count;
@@ -899,11 +910,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
- if (tx_info->inl) {
+ if (tx_info->inl)
build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
tx_ind, fragptr);
- tx_info->inl = 1;
- }
if (skb->encapsulation) {
struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
--
1.8.3.4
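A hedged sketch of the restructuring; can_inline() and the caller below are invented stand-ins for is_inline() and its call site, not driver code. The predicate is written with flat early returns, and the caller asks the question exactly once, recording both the yes/no answer and the side output in locals, which keeps a single call site for the compiler to inline and lets later code test a cached bool instead of re-running the check.

#include <linux/skbuff.h>

/* flat early returns: easy to read, easy for the compiler to inline */
static bool can_inline(const struct sk_buff *skb,
                       const struct skb_shared_info *shinfo,
                       int thold, void **pfrag)
{
        if (!thold || skb->len > thold)
                return false;

        if (shinfo->nr_frags == 1) {
                void *ptr = skb_frag_address_safe(&shinfo->frags[0]);

                if (unlikely(!ptr))
                        return false;
                *pfrag = ptr;
                return true;
        }
        return !shinfo->nr_frags;
}

static int build_descriptor_sketch(struct sk_buff *skb, int thold)
{
        void *pfrag = NULL;
        /* the decision and its side output are computed exactly once */
        bool inline_ok = can_inline(skb, skb_shinfo(skb), thold, &pfrag);

        /* later code tests the cached bool instead of re-checking */
        return inline_ok ? 16 : 32;     /* illustrative sizes only */
}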
* [PATCH net-next 13/14] ethtool: Ethtool parameter to dynamically change tx_copybreak
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Use the new ethtool [sg]et_tunable() to set tx_copybreak (the inline threshold).
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
include/uapi/linux/ethtool.h | 1 +
net/core/ethtool.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 7a364f2..99b4305 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -212,6 +212,7 @@ struct ethtool_value {
enum tunable_id {
ETHTOOL_ID_UNSPEC,
ETHTOOL_RX_COPYBREAK,
+ ETHTOOL_TX_COPYBREAK,
};
enum tunable_type_id {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 27e61b8..1600aa2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1625,6 +1625,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
{
switch (tuna->id) {
case ETHTOOL_RX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK:
if (tuna->len != sizeof(u32) ||
tuna->type_id != ETHTOOL_TUNABLE_U32)
return -EINVAL;
--
1.8.3.4
* [PATCH net-next 14/14] net/mlx4_en: Use the new tx_copybreak to set inline threshold
From: Amir Vadai @ 2014-10-05 9:35 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay, Amir Vadai
From: Eric Dumazet <edumazet@google.com>
Instead of setting the inline threshold using a module parameter only at
driver load, use set_tunable() to set it dynamically.
There is no need to store the threshold per ring; use the netdev-global
priv->prof->inline_thold instead.
The initial value is still set using the module parameter, so backward
compatibility is kept.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 44 +++++++++++++++++++++++++
drivers/net/ethernet/mellanox/mlx4/en_tx.c | 1 -
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 -
3 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 42c9f8b..ae83da9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1267,6 +1267,48 @@ static u32 mlx4_en_get_priv_flags(struct net_device *dev)
return priv->pflags;
}
+static int mlx4_en_get_tunable(struct net_device *dev,
+ const struct ethtool_tunable *tuna,
+ void *data)
+{
+ const struct mlx4_en_priv *priv = netdev_priv(dev);
+ int ret = 0;
+
+ switch (tuna->id) {
+ case ETHTOOL_TX_COPYBREAK:
+ *(u32 *)data = priv->prof->inline_thold;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int mlx4_en_set_tunable(struct net_device *dev,
+ const struct ethtool_tunable *tuna,
+ const void *data)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int val, ret = 0;
+
+ switch (tuna->id) {
+ case ETHTOOL_TX_COPYBREAK:
+ val = *(u32 *)data;
+ if (val < MIN_PKT_LEN || val > MAX_INLINE)
+ ret = -EINVAL;
+ else
+ priv->prof->inline_thold = val;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
const struct ethtool_ops mlx4_en_ethtool_ops = {
.get_drvinfo = mlx4_en_get_drvinfo,
@@ -1297,6 +1339,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
.get_ts_info = mlx4_en_get_ts_info,
.set_priv_flags = mlx4_en_set_priv_flags,
.get_priv_flags = mlx4_en_get_priv_flags,
+ .get_tunable = mlx4_en_get_tunable,
+ .set_tunable = mlx4_en_set_tunable,
};
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index f0080c5..92a7cf4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -66,7 +66,6 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
ring->size = size;
ring->size_mask = size - 1;
ring->stride = stride;
- ring->inline_thold = priv->prof->inline_thold;
tmp = size * sizeof(struct mlx4_en_tx_info);
ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index a904030..8fef658 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -295,7 +295,6 @@ struct mlx4_en_tx_ring {
bool bf_alloced;
struct netdev_queue *tx_queue;
int hwtstamp_tx_type;
- int inline_thold;
} ____cacheline_aligned_in_smp;
struct mlx4_en_rx_desc {
--
1.8.3.4
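With patches 13 and 14 applied, the threshold becomes visible from userspace through the tunable interface; a recent enough ethtool can change it with something like "ethtool --set-tunable <dev> tx-copybreak <bytes>". Below is a hedged, minimal userspace sketch of the same operation done directly over the SIOCETHTOOL ioctl; the device name and value are arbitrary examples, and the value must fall inside the range that mlx4_en_set_tunable() above accepts (MIN_PKT_LEN..MAX_INLINE).

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
        struct {
                struct ethtool_tunable hdr;
                uint32_t value;         /* payload follows the header */
        } req = {
                .hdr = {
                        .cmd     = ETHTOOL_STUNABLE,
                        .id      = ETHTOOL_TX_COPYBREAK,
                        .type_id = ETHTOOL_TUNABLE_U32,
                        .len     = sizeof(uint32_t),
                },
                .value = 104,           /* example; must be in the driver's range */
        };
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);    /* arbitrary device */
        ifr.ifr_data = (void *)&req;

        if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0)
                perror("ETHTOOL_STUNABLE");
        close(fd);
        return 0;
}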
* Re: [PATCH net-next 14/14] net/mlx4_en: Use the new tx_copybreak to set inline threshold
From: Sergei Shtylyov @ 2014-10-05 13:03 UTC (permalink / raw)
To: Amir Vadai, David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay
Hello.
On 10/5/2014 1:35 PM, Amir Vadai wrote:
> From: Eric Dumazet <edumazet@google.com>
> Instead of setting inline threshold using module parameter only on
> driver load, use set_tunable() to set it dynamically.
> No need to store the threshold per ring, using instead the netdev global
> priv->prof->inline_thold
> Initial value still is set using the module parameter, therefore
> backward compatability is kept.
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
> ---
> drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 44 +++++++++++++++++++++++++
> drivers/net/ethernet/mellanox/mlx4/en_tx.c | 1 -
> drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 -
> 3 files changed, 44 insertions(+), 2 deletions(-)
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> index 42c9f8b..ae83da9 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
[...]
> @@ -1297,6 +1339,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = {
> .get_ts_info = mlx4_en_get_ts_info,
> .set_priv_flags = mlx4_en_set_priv_flags,
> .get_priv_flags = mlx4_en_get_priv_flags,
> + .get_tunable = mlx4_en_get_tunable,
> + .set_tunable = mlx4_en_set_tunable,
If the above initializers aren't aligned with tabs, why should these two be?
WBR, Sergei
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow
2014-10-05 9:35 [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow Amir Vadai
` (13 preceding siblings ...)
2014-10-05 9:35 ` [PATCH net-next 14/14] net/mlx4_en: Use the new tx_copybreak to set inline threshold Amir Vadai
@ 2014-10-05 11:45 ` Amir Vadai
2014-10-05 15:50 ` Eric Dumazet
2014-10-06 5:04 ` [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow David Miller
15 siblings, 1 reply; 31+ messages in thread
From: Amir Vadai @ 2014-10-05 11:45 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet
Cc: netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay
On 10/5/2014 12:35 PM, Amir Vadai wrote:
> I am sending this patchset now since the merge window is near and don't want to
> miss it.
A small clarification, since I didn't explain myself precisely enough:
This patchset is not a WIP and is ready for merge. The open issues
aren't directly related to the series and will be addressed in an
incremental manner.
Amir
^ permalink raw reply	[flat|nested] 31+ messages in thread
* Re: [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow
2014-10-05 11:45 ` [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow Amir Vadai
@ 2014-10-05 15:50 ` Eric Dumazet
2014-10-05 17:11 ` [PATCH net-next] net: introduce netdevice gso_min_segs attribute Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2014-10-05 15:50 UTC (permalink / raw)
To: Amir Vadai
Cc: David S. Miller, Eric Dumazet, netdev, Yevgeny Petrilin,
Or Gerlitz, Ido Shamay
On Sun, 2014-10-05 at 14:45 +0300, Amir Vadai wrote:
> On 10/5/2014 12:35 PM, Amir Vadai wrote:
> > I am sending this patchset now since the merge window is near and don't want to
> > miss it.
>
> A small clarification, since I didn't explain myself precisely enough:
> This patchset is not a WIP and ready for merge. The open issues
> aren't directly related to the series and will be addressed in
> incremental manner.
+2
Thanks Amir for this hard work.
Note that now that the validate call (GSO segmentation) is performed outside
of the qdisc or device lock (55a93b3ea780 "qdisc: validate skb without
holding lock"), it's very easy to add a dev->gso_min_segs.
I already have a patch for this; I am sending it right now.
Thanks!
^ permalink raw reply [flat|nested] 31+ messages in thread
* [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 15:50 ` Eric Dumazet
@ 2014-10-05 17:11 ` Eric Dumazet
2014-10-05 18:45 ` Tom Herbert
` (3 more replies)
0 siblings, 4 replies; 31+ messages in thread
From: Eric Dumazet @ 2014-10-05 17:11 UTC (permalink / raw)
To: Amir Vadai, David S. Miller
Cc: Eric Dumazet, netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay
From: Eric Dumazet <edumazet@google.com>
Some TSO engines might have too heavy a setup cost, which impacts
performance on hosts sending small bursts (2 MSS per packet).
This patch adds a device gso_min_segs, allowing drivers to set
a minimum segment count for TSO packets, according to the NIC
performance.
Tested on an mlx4 NIC, this gives a ~110% increase in
throughput when sending 2 MSS per packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
The mlx4 patch will be sent later; it's a one-liner.
include/linux/netdevice.h | 4 +++-
net/core/dev.c | 9 ++++++---
2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 22d54b9b700d..2df86f50261c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1416,6 +1416,8 @@ enum netdev_priv_flags {
* @gso_max_size: Maximum size of generic segmentation offload
* @gso_max_segs: Maximum number of segments that can be passed to the
* NIC for GSO
+ * @gso_min_segs: Minimum number of segments that can be passed to the
+ * NIC for GSO
*
* @dcbnl_ops: Data Center Bridging netlink ops
* @num_tc: Number of traffic classes in the net device
@@ -1666,7 +1668,7 @@ struct net_device {
unsigned int gso_max_size;
#define GSO_MAX_SEGS 65535
u16 gso_max_segs;
-
+ u16 gso_min_segs;
#ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 1a90530f83ff..16e8ebbd3316 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
+ const struct net_device *dev = skb->dev;
+ netdev_features_t features = dev->features;
+ u16 gso_segs = skb_shinfo(skb)->gso_segs;
__be16 protocol = skb->protocol;
- netdev_features_t features = skb->dev->features;
- if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+ if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
features &= ~NETIF_F_GSO_MASK;
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
@@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
}
features = netdev_intersect_features(features,
- skb->dev->vlan_features |
+ dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
@@ -6658,6 +6660,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
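The driver-side opt-in that the new gso_min_segs field enables is meant to be
a one-liner at netdev setup time. A hedged sketch follows (the driver and
constant names are made up for illustration; the value 4 matches the
ConnectX-3 cutoff discussed later in the thread):

#include <linux/netdevice.h>

/* Illustrative sketch only -- not a real driver. */
#define MYNIC_MIN_TSO_SEGS 4	/* hypothetical per-NIC cutoff */

static void mynic_setup_gso_limits(struct net_device *dev)
{
	/*
	 * skbs carrying fewer gso_segs than this lose NETIF_F_GSO_MASK in
	 * netif_skb_features(), so they are segmented by software GSO
	 * instead of being handed to the NIC's TSO engine.
	 */
	dev->gso_min_segs = MYNIC_MIN_TSO_SEGS;
}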
^ permalink raw reply related	[flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 17:11 ` [PATCH net-next] net: introduce netdevice gso_min_segs attribute Eric Dumazet
@ 2014-10-05 18:45 ` Tom Herbert
2014-10-05 18:58 ` Eric Dumazet
2014-10-06 6:41 ` Amir Vadai
` (2 subsequent siblings)
3 siblings, 1 reply; 31+ messages in thread
From: Tom Herbert @ 2014-10-05 18:45 UTC (permalink / raw)
To: Eric Dumazet
Cc: Amir Vadai, David S. Miller, Eric Dumazet, Linux Netdev List,
Yevgeny Petrilin, Or Gerlitz, Ido Shamay
On Sun, Oct 5, 2014 at 10:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Some TSO engines might have a too heavy setup cost, that impacts
> performance on hosts sending small bursts (2 MSS per packet).
>
> This patch adds a device gso_min_segs, allowing drivers to set
> a minimum segment size for TSO packets, according to the NIC
> performance.
>
Eric, this seems like another device-specific limitation in TSO we are
exposing to the stack. Can't we just put things like this in
ndo_gso_check so the driver can apply whatever criteria it wants in
deciding whether it is worth it, or even whether to do TSO at all?
> Tested on a mlx4 NIC, this allows to get a ~110% increase of
> throughput when sending 2 MSS per packet.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> mlx4 patch will be sent later, its a one liner.
>
> include/linux/netdevice.h | 4 +++-
> net/core/dev.c | 9 ++++++---
> 2 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 22d54b9b700d..2df86f50261c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1416,6 +1416,8 @@ enum netdev_priv_flags {
> * @gso_max_size: Maximum size of generic segmentation offload
> * @gso_max_segs: Maximum number of segments that can be passed to the
> * NIC for GSO
> + * @gso_min_segs: Minimum number of segments that can be passed to the
> + * NIC for GSO
> *
> * @dcbnl_ops: Data Center Bridging netlink ops
> * @num_tc: Number of traffic classes in the net device
> @@ -1666,7 +1668,7 @@ struct net_device {
> unsigned int gso_max_size;
> #define GSO_MAX_SEGS 65535
> u16 gso_max_segs;
> -
> + u16 gso_min_segs;
> #ifdef CONFIG_DCB
> const struct dcbnl_rtnl_ops *dcbnl_ops;
> #endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1a90530f83ff..16e8ebbd3316 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
>
> netdev_features_t netif_skb_features(struct sk_buff *skb)
> {
> + const struct net_device *dev = skb->dev;
> + netdev_features_t features = dev->features;
> + u16 gso_segs = skb_shinfo(skb)->gso_segs;
> __be16 protocol = skb->protocol;
> - netdev_features_t features = skb->dev->features;
>
> - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
> + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
> features &= ~NETIF_F_GSO_MASK;
>
> if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
> @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
> }
>
> features = netdev_intersect_features(features,
> - skb->dev->vlan_features |
> + dev->vlan_features |
> NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX);
>
> @@ -6658,6 +6660,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
> dev->gso_max_segs = GSO_MAX_SEGS;
> + dev->gso_min_segs = 0;
>
> INIT_LIST_HEAD(&dev->napi_list);
> INIT_LIST_HEAD(&dev->unreg_list);
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 31+ messages in thread* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 18:45 ` Tom Herbert
@ 2014-10-05 18:58 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2014-10-05 18:58 UTC (permalink / raw)
To: Tom Herbert
Cc: Amir Vadai, David S. Miller, Eric Dumazet, Linux Netdev List,
Yevgeny Petrilin, Or Gerlitz, Ido Shamay
On Sun, 2014-10-05 at 11:45 -0700, Tom Herbert wrote:
> On Sun, Oct 5, 2014 at 10:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > Some TSO engines might have a too heavy setup cost, that impacts
> > performance on hosts sending small bursts (2 MSS per packet).
> >
> > This patch adds a device gso_min_segs, allowing drivers to set
> > a minimum segment size for TSO packets, according to the NIC
> > performance.
> >
> Eric, this seems like another device specific limitation in TSO we are
> exposing to the stack. Can't we just put things like this in
> ndo_gso_check so driver can apply whatever criteria it wants in
> deciding if it worth it or even to rather do TSO?
I don't think it really matters where the check is done.
If you want to move the check away from netif_skb_features(), it's fine
with me, but please note you did not suggest this for the current check
against gso_max_segs.
I based my patch on current net-next, where ndo_gso_check() does not
exist.
BTW, I now wonder why we added gso_max_segs support to the bonding driver
(commit 0e376bd0b791ac6ac6bdb051492df0769c840848).
It makes little sense; GSO should be done at the last possible stage.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 17:11 ` [PATCH net-next] net: introduce netdevice gso_min_segs attribute Eric Dumazet
2014-10-05 18:45 ` Tom Herbert
@ 2014-10-06 6:41 ` Amir Vadai
2014-10-06 12:17 ` Eric Dumazet
2014-10-06 10:20 ` David Laight
2014-10-06 21:21 ` David Miller
3 siblings, 1 reply; 31+ messages in thread
From: Amir Vadai @ 2014-10-06 6:41 UTC (permalink / raw)
To: Eric Dumazet, David S. Miller
Cc: Eric Dumazet, netdev, Yevgeny Petrilin, Or Gerlitz, Ido Shamay
On 10/5/2014 8:11 PM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Some TSO engines might have a too heavy setup cost, that impacts
> performance on hosts sending small bursts (2 MSS per packet).
>
> This patch adds a device gso_min_segs, allowing drivers to set
> a minimum segment size for TSO packets, according to the NIC
> performance.
>
> Tested on a mlx4 NIC, this allows to get a ~110% increase of
> throughput when sending 2 MSS per packet.
>
Amazing!
Shouldn't there be a netif_set_gso_min_size() too?
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> mlx4 patch will be sent later, its a one liner.
>
> include/linux/netdevice.h | 4 +++-
> net/core/dev.c | 9 ++++++---
> 2 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 22d54b9b700d..2df86f50261c 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1416,6 +1416,8 @@ enum netdev_priv_flags {
> * @gso_max_size: Maximum size of generic segmentation offload
> * @gso_max_segs: Maximum number of segments that can be passed to the
> * NIC for GSO
> + * @gso_min_segs: Minimum number of segments that can be passed to the
> + * NIC for GSO
> *
> * @dcbnl_ops: Data Center Bridging netlink ops
> * @num_tc: Number of traffic classes in the net device
> @@ -1666,7 +1668,7 @@ struct net_device {
> unsigned int gso_max_size;
> #define GSO_MAX_SEGS 65535
> u16 gso_max_segs;
> -
> + u16 gso_min_segs;
> #ifdef CONFIG_DCB
> const struct dcbnl_rtnl_ops *dcbnl_ops;
> #endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1a90530f83ff..16e8ebbd3316 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2567,10 +2567,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
>
> netdev_features_t netif_skb_features(struct sk_buff *skb)
> {
> + const struct net_device *dev = skb->dev;
> + netdev_features_t features = dev->features;
> + u16 gso_segs = skb_shinfo(skb)->gso_segs;
> __be16 protocol = skb->protocol;
> - netdev_features_t features = skb->dev->features;
>
> - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
> + if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
> features &= ~NETIF_F_GSO_MASK;
>
> if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
> @@ -2581,7 +2583,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
> }
>
> features = netdev_intersect_features(features,
> - skb->dev->vlan_features |
> + dev->vlan_features |
> NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX);
>
> @@ -6658,6 +6660,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>
> dev->gso_max_size = GSO_MAX_SIZE;
> dev->gso_max_segs = GSO_MAX_SEGS;
> + dev->gso_min_segs = 0;
>
> INIT_LIST_HEAD(&dev->napi_list);
> INIT_LIST_HEAD(&dev->unreg_list);
>
>
^ permalink raw reply	[flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 6:41 ` Amir Vadai
@ 2014-10-06 12:17 ` Eric Dumazet
2014-10-06 12:22 ` Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2014-10-06 12:17 UTC (permalink / raw)
To: Amir Vadai
Cc: David S. Miller, Eric Dumazet, netdev, Yevgeny Petrilin,
Or Gerlitz, Ido Shamay
On Mon, 2014-10-06 at 09:41 +0300, Amir Vadai wrote:
> On 10/5/2014 8:11 PM, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > Some TSO engines might have a too heavy setup cost, that impacts
> > performance on hosts sending small bursts (2 MSS per packet).
> >
> > This patch adds a device gso_min_segs, allowing drivers to set
> > a minimum segment size for TSO packets, according to the NIC
> > performance.
> >
> > Tested on a mlx4 NIC, this allows to get a ~110% increase of
> > throughput when sending 2 MSS per packet.
> >
>
> Amazing!
>
> Shouldn't there be a netif_set_gso_min_size() too?
Good point, I'll add it in v2, thanks!
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 12:17 ` Eric Dumazet
@ 2014-10-06 12:22 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2014-10-06 12:22 UTC (permalink / raw)
To: Amir Vadai
Cc: David S. Miller, Eric Dumazet, netdev, Yevgeny Petrilin,
Or Gerlitz, Ido Shamay
On Mon, 2014-10-06 at 05:17 -0700, Eric Dumazet wrote:
> On Mon, 2014-10-06 at 09:41 +0300, Amir Vadai wrote:
> > On 10/5/2014 8:11 PM, Eric Dumazet wrote:
> > > From: Eric Dumazet <edumazet@google.com>
> > >
> > > Some TSO engines might have a too heavy setup cost, that impacts
> > > performance on hosts sending small bursts (2 MSS per packet).
> > >
> > > This patch adds a device gso_min_segs, allowing drivers to set
> > > a minimum segment size for TSO packets, according to the NIC
> > > performance.
> > >
> > > Tested on a mlx4 NIC, this allows to get a ~110% increase of
> > > throughput when sending 2 MSS per packet.
> > >
> >
> > Amazing!
> >
> > Shouldn't there be a netif_set_gso_min_size() too?
>
> Good point, I'll add it in v2, thanks !
Well, no... I thought you were speaking of a hypothetical
netif_set_gso_{max|min}_segs().
But we don't have one: drivers simply change dev->gso_max_segs
drivers/net/ethernet/freescale/fec_main.c:3058: ndev->gso_max_segs = FEC_MAX_TSO_SEGS;
drivers/net/ethernet/marvell/mv643xx_eth.c:3120: dev->gso_max_segs = MV643XX_MAX_TSO_SEGS;
drivers/net/ethernet/marvell/mvneta.c:3057: dev->gso_max_segs = MVNETA_MAX_TSO_SEGS;
drivers/net/ethernet/sfc/efx.c:2300: net_dev->gso_max_segs = EFX_TSO_MAX_SEGS;
So my mlx4 patch was simply doing
dev->gso_min_segs = MLX4_MIN_TSO_SEGS;
^ permalink raw reply [flat|nested] 31+ messages in thread
* RE: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 17:11 ` [PATCH net-next] net: introduce netdevice gso_min_segs attribute Eric Dumazet
2014-10-05 18:45 ` Tom Herbert
2014-10-06 6:41 ` Amir Vadai
@ 2014-10-06 10:20 ` David Laight
2014-10-06 12:14 ` Eric Dumazet
2014-10-06 21:21 ` David Miller
3 siblings, 1 reply; 31+ messages in thread
From: David Laight @ 2014-10-06 10:20 UTC (permalink / raw)
To: 'Eric Dumazet', Amir Vadai, David S. Miller
Cc: Eric Dumazet, netdev@vger.kernel.org, Yevgeny Petrilin,
Or Gerlitz, Ido Shamay
From: Eric Dumazet <edumazet@google.com>
> Some TSO engines might have a too heavy setup cost, that impacts
> performance on hosts sending small bursts (2 MSS per packet).
>
> This patch adds a device gso_min_segs, allowing drivers to set
> a minimum segment size for TSO packets, according to the NIC
> performance.
>
> Tested on a mlx4 NIC, this allows to get a ~110% increase of
> throughput when sending 2 MSS per packet.
Doesn't this all depend on what you need to optimise for?
I can think of three^Wseveral main cases:
1) minimising cpu use while saturating the local network.
2) minimising latency for single packets.
3) maximising throughput for a single connection.
4) minimising cpu use when handling a large number of connections.
plus all the variations in packet size.
I'm not sure what you are trading for what here.
Maybe gso = tx_bursting is almost always faster on some hardware?
(Especially if an idle mac engine is 'kicked' for the first packet
of a burst.)
David
^ permalink raw reply [flat|nested] 31+ messages in thread
* RE: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 10:20 ` David Laight
@ 2014-10-06 12:14 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2014-10-06 12:14 UTC (permalink / raw)
To: David Laight
Cc: Amir Vadai, David S. Miller, Eric Dumazet, netdev@vger.kernel.org,
Yevgeny Petrilin, Or Gerlitz, Ido Shamay
On Mon, 2014-10-06 at 10:20 +0000, David Laight wrote:
> From: Eric Dumazet <edumazet@google.com>
> > Some TSO engines might have a too heavy setup cost, that impacts
> > performance on hosts sending small bursts (2 MSS per packet).
> >
> > This patch adds a device gso_min_segs, allowing drivers to set
> > a minimum segment size for TSO packets, according to the NIC
> > performance.
> >
> > Tested on a mlx4 NIC, this allows to get a ~110% increase of
> > throughput when sending 2 MSS per packet.
>
> Doesn't this all depend on what you need to optimise for.
> I can think of three^Wseveral main cases:
> 1) minimising cpu use while saturating the local network.
> 2) minimising latency for single packets.
> 3) maximising throughput for a single connection.
> 4) minimising cpu use when handling a large number of connections.
> plus all the variations in packet size.
>
> I'm not sure what you are trading for what here.
>
I am not sure you really understood.
What's the point of having a 40Gb NIC and not being able to reach 18 Gb/s on
it, even if you are willing to spend all the CPU cycles you want?
> Maybe gso = tx_bursting is almost always faster on some hardware?
> (Especially if an idle mac engine is 'kicked' for the first packet
> of a burst.)
This has nothing to do with xmit_more.
I start 1200 flows, each rate limited to 3 MBytes/sec.
(1200 netperf -t TCP_STREAM instances, nothing fancy here)
Theoretical total of 3.6 GBytes per second.
Without patch :
# sar -n DEV 5 5 | grep eth0
05:07:56 AM eth0 555621.60 1111203.20 35813.03 1642923.46 0.00 0.00 0.60
05:08:01 AM eth0 555591.00 1111173.80 35810.47 1642877.52 0.00 0.00 0.40
05:08:06 AM eth0 555586.20 1111162.60 35810.06 1642861.03 0.00 0.00 0.60
05:08:11 AM eth0 555624.40 1111235.40 35812.75 1642974.19 0.00 0.00 0.60
05:08:16 AM eth0 555639.60 1111266.80 35813.21 1643017.83 0.00 0.00 0.60
Average: eth0 555612.56 1111208.36 35811.90 1642930.81 0.00 0.00 0.56
With patch :
# sar -n DEV 5 5 | grep eth0
05:07:04 AM eth0 1179478.80 2358940.40 76022.47 3487725.22 0.00 0.00 0.60
05:07:09 AM eth0 1178913.60 2357807.40 75986.60 3486044.00 0.00 0.00 0.40
05:07:14 AM eth0 1178957.40 2357897.60 75988.98 3486177.50 0.00 0.00 0.60
05:07:19 AM eth0 1177556.00 2355064.60 75899.37 3481993.37 0.00 0.00 0.60
05:07:24 AM eth0 1180321.20 2360625.20 76077.15 3490209.58 0.00 0.00 0.40
Average: eth0 1179045.40 2358067.04 75994.92 3486429.94 0.00 0.00 0.52
Ask yourself which one we prefer.
About CPU costs, we hardly see anything caused by GSO
now that we have optimized __copy_skb_header().
6.36% swapper [kernel.kallsyms] [k] _raw_spin_lock
5.24% netperf [kernel.kallsyms] [k] copy_user_enhanced_fast_string
5.03% swapper [kernel.kallsyms] [k] poll_idle
3.73% swapper [kernel.kallsyms] [k] tcp_ack
2.73% swapper [kernel.kallsyms] [k] memcpy
2.49% swapper [kernel.kallsyms] [k] __skb_clone
2.41% swapper [kernel.kallsyms] [k] skb_release_data
2.33% swapper [kernel.kallsyms] [k] intel_idle
2.23% swapper [kernel.kallsyms] [k] tcp_init_tso_segs
1.99% swapper [kernel.kallsyms] [k] fq_dequeue
1.94% netperf [kernel.kallsyms] [k] tcp_sendmsg
1.82% swapper [kernel.kallsyms] [k] tcp_write_xmit
1.28% swapper [kernel.kallsyms] [k] __netif_receive_skb_core
1.23% swapper [kernel.kallsyms] [k] __copy_skb_header
1.14% swapper [kernel.kallsyms] [k] kfree
1.10% swapper [kernel.kallsyms] [k] kmem_cache_free
1.06% swapper [kernel.kallsyms] [k] mlx4_en_xmit
1.05% swapper [kernel.kallsyms] [k] tcp_wfree
1.02% swapper [kernel.kallsyms] [k] inet_gso_segment
1.01% swapper [kernel.kallsyms] [k] put_compound_page
0.98% swapper [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.96% netperf [kernel.kallsyms] [k] __alloc_skb
0.92% netperf [kernel.kallsyms] [k] _raw_spin_lock
0.89% swapper [kernel.kallsyms] [k] skb_segment
0.88% swapper [kernel.kallsyms] [k] tcp_transmit_skb
0.82% swapper [kernel.kallsyms] [k] ip_queue_xmit
0.81% swapper [kernel.kallsyms] [k] __inet_lookup_established
0.76% swapper [kernel.kallsyms] [k] __kfree_skb
0.73% swapper [kernel.kallsyms] [k] ipv4_dst_check
0.66% netperf [kernel.kallsyms] [k] tcp_ack
0.60% swapper [kernel.kallsyms] [k] __alloc_skb
0.56% swapper [kernel.kallsyms] [k] ipt_do_table
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-05 17:11 ` [PATCH net-next] net: introduce netdevice gso_min_segs attribute Eric Dumazet
` (2 preceding siblings ...)
2014-10-06 10:20 ` David Laight
@ 2014-10-06 21:21 ` David Miller
2014-10-06 21:42 ` Eric Dumazet
3 siblings, 1 reply; 31+ messages in thread
From: David Miller @ 2014-10-06 21:21 UTC (permalink / raw)
To: eric.dumazet; +Cc: amirv, edumazet, netdev, yevgenyp, ogerlitz, idos
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 05 Oct 2014 10:11:27 -0700
> From: Eric Dumazet <edumazet@google.com>
>
> Some TSO engines might have a too heavy setup cost, that impacts
> performance on hosts sending small bursts (2 MSS per packet).
>
> This patch adds a device gso_min_segs, allowing drivers to set
> a minimum segment size for TSO packets, according to the NIC
> performance.
>
> Tested on a mlx4 NIC, this allows to get a ~110% increase of
> throughput when sending 2 MSS per packet.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
So exactly what value are you using for mlx4?
Because I wonder if we should just generically forfeit TSO unless
we have > 2 segments, for example.
^ permalink raw reply	[flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 21:21 ` David Miller
@ 2014-10-06 21:42 ` Eric Dumazet
2014-10-06 21:54 ` David Miller
0 siblings, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2014-10-06 21:42 UTC (permalink / raw)
To: David Miller; +Cc: amirv, edumazet, netdev, yevgenyp, ogerlitz, idos
On Mon, 2014-10-06 at 17:21 -0400, David Miller wrote:
> So exactly what value are you using for mlx4?
>
It seems that on the ConnectX-3 family, TSO packets of 2 or 3 MSS are not
worth sending through the TSO engine. The cutoff point seems to be 4 (same throughput).
So I was planning to use gso_min_segs = 4 only for them.
> Because I wonder if we should just generically forfeit TSO unless
> we have > 2 segments, for example.
When I tested on bnx2x, this was not a gain.
bnx2x is faster sending TSO packets, even if they have 2 MSS.
I'll try the experiment on I40E Intel cards.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 21:42 ` Eric Dumazet
@ 2014-10-06 21:54 ` David Miller
2014-10-06 22:26 ` Eric Dumazet
0 siblings, 1 reply; 31+ messages in thread
From: David Miller @ 2014-10-06 21:54 UTC (permalink / raw)
To: eric.dumazet; +Cc: amirv, edumazet, netdev, yevgenyp, ogerlitz, idos
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 06 Oct 2014 14:42:58 -0700
> On Mon, 2014-10-06 at 17:21 -0400, David Miller wrote:
>
>> So exactly what value are you using for mlx4?
>>
>
> It seems that on ConnectX-3 family, TSO packets of 2 or 3 MSS are not
> worth using TSO engine. The cutoff point seems to be 4 (same throughput)
>
> So I was planning to use gso_min_segs = 4 only for them.
>
>> Because I wonder if we should just generically forfeit TSO unless
>> we have > 2 segments, for example.
>
> When I tested on bnx2x, this was not a gain.
>
> bnx2x is faster sending TSO packets, even if they have 2 MSS.
>
> I'll try the experiment on I40E Intel cards.
Ok I'm sold on your patch then if two major chipsets already benefit
from differing values.
I'll apply this, thanks Eric.
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next] net: introduce netdevice gso_min_segs attribute
2014-10-06 21:54 ` David Miller
@ 2014-10-06 22:26 ` Eric Dumazet
0 siblings, 0 replies; 31+ messages in thread
From: Eric Dumazet @ 2014-10-06 22:26 UTC (permalink / raw)
To: David Miller; +Cc: amirv, edumazet, netdev, yevgenyp, ogerlitz, idos
On Mon, 2014-10-06 at 17:54 -0400, David Miller wrote:
> Ok I'm sold on your patch then if two major chipsets already benefit
> from differing values.
>
> I'll apply this, thanks Eric.
I did the experiment on i40e, and saw no difference on this NIC.
(Note it still lacks xmit_more support atm.)
Thanks David
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow
2014-10-05 9:35 [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow Amir Vadai
` (14 preceding siblings ...)
2014-10-05 11:45 ` [PATCH net-next 00/14] net/mlx4_en: Optimizations to TX flow Amir Vadai
@ 2014-10-06 5:04 ` David Miller
15 siblings, 0 replies; 31+ messages in thread
From: David Miller @ 2014-10-06 5:04 UTC (permalink / raw)
To: amirv; +Cc: edumazet, netdev, yevgenyp, ogerlitz, idos
From: Amir Vadai <amirv@mellanox.com>
Date: Sun, 5 Oct 2014 12:35:08 +0300
> This patchset contains optimizations to the TX flow in the mlx4_en driver. It also introduces
> setting/getting tx copybreak, to enable controlling the inline threshold dynamically.
>
> The TX flow optimizations were authored and posted to the mailing list by Eric
> Dumazet [1] as a single patch. I split that patch into smaller patches,
> reviewed and tested them.
> Changes from the original patch:
> - s/iowrite32be/iowrite32/, since ring->doorbell_qpn is stored as be32
>
> The tx copybreak patch was also suggested by Eric Dumazet, and was edited and
> reviewed by me. User space patch will be sent after kernel code is ready.
>
> I am sending this patchset now since the merge window is near and I don't want to
> miss it.
>
> More work still to do:
> - Disable BF when xmit_more is in use
> - Make TSO use xmit_more too. Maybe by splitting small TSO packets in the
> driver itself, to avoid extra cpu/memory costs of GSO before the driver
> - Fix mlx4_en_xmit's buggy handling of the queue becoming full in the middle of a burst
> partially posted to the send queue using xmit_more
>
> Eric, I edited the patches to have you as the author and the first
> Signed-off-by. I hope this is OK with you (I wasn't sure whether it is OK to sign on
> your behalf); anyway, all the credit for these changes should go to you.
>
> The patchset was tested and applied on top of commit 1e203c1 ("net: sched:
> suspicious RCU usage in qdisc_watchdog")
>
> [1] - https://patchwork.ozlabs.org/patch/394256/
Looks great, nice work everyone.
^ permalink raw reply [flat|nested] 31+ messages in thread