From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Paolo Abeni <pabeni@redhat.com>,
"David S. Miller" <davem@davemloft.net>
Subject: [PATCH 4.8 13/35] IB/ipoib: move back IB LL address into the hard header
Date: Sun, 13 Nov 2016 12:27:27 +0100
Message-ID: <20161113112421.382521528@linuxfoundation.org>
In-Reply-To: <20161113112420.863033770@linuxfoundation.org>
4.8-stable review patch. If anyone has any objections, please let me know.
------------------
From: Paolo Abeni <pabeni@redhat.com>
[ Upstream commit fc791b6335152c5278dc4a4991bcb2d329f806f9 ]
After commit 9207f9d45b0a ("net: preserve IP control block
during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
That destroys the IPoIB address information cached there,
causing a severe performance regression, as described in more
detail here:
http://marc.info/?l=linux-kernel&m=146787279825501&w=2
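For context, here is a minimal userspace model of that clash (a
sketch only: the struct names and layouts are illustrative, not the
kernel's). Both paths cast the same 48-byte skb->cb scratch area to
their own structure, so a write from one clobbers the other:

#include <stdio.h>
#include <string.h>

#define CB_SIZE		48	/* sizeof(skb->cb) */
#define INFINIBAND_ALEN	20

struct fake_skb { char cb[CB_SIZE]; };

/* stand-ins for the real control blocks */
struct old_ipoib_cb { unsigned char hwaddr[INFINIBAND_ALEN]; };
struct fake_gso_cb  { unsigned int flags; unsigned int seg_size; };

int main(void)
{
	struct fake_skb skb = { { 0 } };
	struct old_ipoib_cb *icb = (struct old_ipoib_cb *)skb.cb;
	struct fake_gso_cb *gcb = (struct fake_gso_cb *)skb.cb;

	memset(icb->hwaddr, 0xab, INFINIBAND_ALEN); /* IPoIB stashes the LL address */
	gcb->flags = 0;			/* GSO rewrites the same bytes */
	gcb->seg_size = 1400;
	printf("hwaddr[0] = 0x%02x\n", icb->hwaddr[0]); /* clobbered: prints 0x00 */
	return 0;
}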
This change moves the data cached by the IPoIB driver from the
skb control block into the IPoIB hard header, as was done before
commit 936d7de3d736 ("IPoIB: Stop lying about hard_header_len
and use skb->cb to stash LL addresses").
To avoid GRO issues, on packet reception the IPoIB driver
stashes a dummy pseudo header into the skb, so that received
packets actually have a hard header matching the declared
length.
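A minimal sketch of that fixup in plain pointer arithmetic,
mirroring what the new skb_add_pseudo_hdr() below does with the skb
API (the buffer size and initial offset are made up for
illustration):

#include <stdio.h>
#include <string.h>

#define IPOIB_ENCAP_LEN		4
#define IPOIB_PSEUDO_LEN	20
#define IPOIB_HARD_LEN		(IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN)

int main(void)
{
	char buf[128];
	char *data = buf + 64;		/* points at the 4-byte IPoIB header */
	char *mac;

	data -= IPOIB_PSEUDO_LEN;	/* skb_push(skb, IPOIB_PSEUDO_LEN) */
	memset(data, 0, IPOIB_PSEUDO_LEN);
	mac = data;			/* skb_reset_mac_header(skb) */
	data += IPOIB_HARD_LEN;		/* skb_pull(skb, IPOIB_HARD_LEN) */

	/* GRO now sees a hard header matching dev->hard_header_len */
	printf("hard header spans %ld bytes\n", (long)(data - mac)); /* 24 */
	return 0;
}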
To avoid changing the connected-mode maximum MTU, the allocated
head buffer size is increased by the pseudo header length.
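The reserve values this implies on the two receive paths can be
sanity-checked in isolation; a standalone sketch using the constants
from this patch (IB_GRH_BYTES being the 40-byte InfiniBand GRH):

#include <assert.h>
#include <stdio.h>

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define IPOIB_ENCAP_LEN		4
#define IPOIB_PSEUDO_LEN	20
#define IPOIB_HARD_LEN		(IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN)
#define IB_GRH_BYTES		40
#define IPOIB_CM_RX_RESERVE	(ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)

int main(void)
{
	/* UD path: reserve the pseudo header, then GRH + encap follow */
	int ud_ip_off = IPOIB_PSEUDO_LEN + IB_GRH_BYTES + IPOIB_ENCAP_LEN;
	/* CM path: reserve so the encap header ends on a 16-byte boundary */
	int cm_ip_off = IPOIB_CM_RX_RESERVE + IPOIB_ENCAP_LEN;

	printf("UD IP offset %d, CM IP offset %d\n", ud_ip_off, cm_ip_off);
	assert(ud_ip_off % 16 == 0);	/* 64 */
	assert(cm_ip_off % 16 == 0);	/* 32 */
	return 0;
}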
After this commit, IPoIB performance is back to its
pre-regression value.
v2 -> v3: rebased
v1 -> v2: avoid changing the max mtu, increasing the head buf size
Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
drivers/infiniband/ulp/ipoib/ipoib.h | 20 ++++++---
drivers/infiniband/ulp/ipoib/ipoib_cm.c | 15 +++---
drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 ++---
drivers/infiniband/ulp/ipoib/ipoib_main.c | 54 +++++++++++++++----------
drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 +-
5 files changed, 64 insertions(+), 43 deletions(-)
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -63,6 +63,8 @@ enum ipoib_flush_level {
enum {
IPOIB_ENCAP_LEN = 4,
+ IPOIB_PSEUDO_LEN = 20,
+ IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
@@ -134,15 +136,21 @@ struct ipoib_header {
u16 reserved;
};
-struct ipoib_cb {
- struct qdisc_skb_cb qdisc_cb;
- u8 hwaddr[INFINIBAND_ALEN];
+struct ipoib_pseudo_header {
+ u8 hwaddr[INFINIBAND_ALEN];
};
-static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
{
- BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
- return (struct ipoib_cb *)skb->cb;
+ char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
+
+ /*
+ * only the ipoib header is present now, make room for a dummy
+ * pseudo header and set skb field accordingly
+ */
+ memset(data, 0, IPOIB_PSEUDO_LEN);
+ skb_reset_mac_header(skb);
+ skb_pull(skb, IPOIB_HARD_LEN);
}
/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
+
static struct ib_qp_attr ipoib_cm_err_attr = {
.qp_state = IB_QPS_ERR
};
@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx
struct sk_buff *skb;
int i;
- skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+ skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
if (unlikely(!skb))
return NULL;
/*
- * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+ * IPoIB adds an IPOIB_ENCAP_LEN byte header; this will align the
* IP header to a multiple of 16.
*/
- skb_reserve(skb, 12);
+ skb_reserve(skb, IPOIB_CM_RX_RESERVE);
mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
DMA_FROM_DEVICE);
@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_de
if (wc->byte_len < IPOIB_CM_COPYBREAK) {
int dlen = wc->byte_len;
- small_skb = dev_alloc_skb(dlen + 12);
+ small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
if (small_skb) {
- skb_reserve(small_skb, 12);
+ skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
dlen, DMA_FROM_DEVICE);
skb_copy_from_linear_data(skb, small_skb->data, dlen);
@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
copied:
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
- skb_reset_mac_header(skb);
- skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb_add_pseudo_hdr(skb);
++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len;
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_sk
buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
+ skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
if (unlikely(!skb))
return NULL;
/*
- * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
- * header. So we need 4 more bytes to get to 48 and align the
- * IP header to a multiple of 16.
+ * the IP header will be at IPOIB_HARD_LEN + IB_GRH_BYTES, that is
+ * 64 bytes aligned
*/
- skb_reserve(skb, 4);
+ skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
mapping = priv->rx_ring[id].mapping;
mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct
skb_pull(skb, IB_GRH_BYTES);
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
- skb_reset_mac_header(skb);
- skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb_add_pseudo_hdr(skb);
++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len;
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buf
ipoib_neigh_free(neigh);
goto err_drop;
}
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ if (skb_queue_len(&neigh->queue) <
+ IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&neigh->queue, skb);
- else {
+ } else {
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
skb_queue_len(&neigh->queue));
goto err_drop;
@@ -964,7 +967,7 @@ err_drop:
}
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
- struct ipoib_cb *cb)
+ struct ipoib_pseudo_header *phdr)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path;
@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_b
spin_lock_irqsave(&priv->lock, flags);
- path = __path_find(dev, cb->hwaddr + 4);
+ path = __path_find(dev, phdr->hwaddr + 4);
if (!path || !path->valid) {
int new_path = 0;
if (!path) {
- path = path_rec_create(dev, cb->hwaddr + 4);
+ path = path_rec_create(dev, phdr->hwaddr + 4);
new_path = 1;
}
if (path) {
if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&path->queue, skb);
} else {
++dev->stats.tx_dropped;
@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_b
be16_to_cpu(path->pathrec.dlid));
spin_unlock_irqrestore(&priv->lock, flags);
- ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
return;
} else if ((path->query || !path_rec_start(dev, path)) &&
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&path->queue, skb);
} else {
++dev->stats.tx_dropped;
@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_bu
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_neigh *neigh;
- struct ipoib_cb *cb = ipoib_skb_cb(skb);
+ struct ipoib_pseudo_header *phdr;
struct ipoib_header *header;
unsigned long flags;
+ phdr = (struct ipoib_pseudo_header *) skb->data;
+ skb_pull(skb, sizeof(*phdr));
header = (struct ipoib_header *) skb->data;
- if (unlikely(cb->hwaddr[4] == 0xff)) {
+ if (unlikely(phdr->hwaddr[4] == 0xff)) {
/* multicast, arrange "if" according to probability */
if ((header->proto != htons(ETH_P_IP)) &&
(header->proto != htons(ETH_P_IPV6)) &&
@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_bu
return NETDEV_TX_OK;
}
/* Add in the P_Key for multicast*/
- cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
- cb->hwaddr[9] = priv->pkey & 0xff;
+ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ phdr->hwaddr[9] = priv->pkey & 0xff;
- neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
if (likely(neigh))
goto send_using_neigh;
- ipoib_mcast_send(dev, cb->hwaddr, skb);
+ ipoib_mcast_send(dev, phdr->hwaddr, skb);
return NETDEV_TX_OK;
}
@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_bu
case htons(ETH_P_IP):
case htons(ETH_P_IPV6):
case htons(ETH_P_TIPC):
- neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ neigh = ipoib_neigh_get(dev, phdr->hwaddr);
if (unlikely(!neigh)) {
- neigh_add_path(skb, cb->hwaddr, dev);
+ neigh_add_path(skb, phdr->hwaddr, dev);
return NETDEV_TX_OK;
}
break;
case htons(ETH_P_ARP):
case htons(ETH_P_RARP):
/* for unicast ARP and RARP should always perform path find */
- unicast_arp_send(skb, dev, cb);
+ unicast_arp_send(skb, dev, phdr);
return NETDEV_TX_OK;
default:
/* ethertype not supported by IPoIB */
@@ -1086,11 +1095,13 @@ send_using_neigh:
goto unref;
}
} else if (neigh->ah) {
- ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
+ ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
goto unref;
}
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof(*phdr));
spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags);
@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_b
unsigned short type,
const void *daddr, const void *saddr, unsigned len)
{
+ struct ipoib_pseudo_header *phdr;
struct ipoib_header *header;
- struct ipoib_cb *cb = ipoib_skb_cb(skb);
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_b
/*
* we don't rely on dst_entry structure, always stuff the
- * destination address into skb->cb so we can figure out where
+ * destination address into skb hard header so we can figure out where
* to send the packet later.
*/
- memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
+ phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
+ memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
- return sizeof *header;
+ return IPOIB_HARD_LEN;
}
static void ipoib_set_mcast_list(struct net_device *dev)
@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev)
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
- dev->hard_header_len = IPOIB_ENCAP_LEN;
+ dev->hard_header_len = IPOIB_HARD_LEN;
dev->addr_len = INFINIBAND_ALEN;
dev->type = ARPHRD_INFINIBAND;
dev->tx_queue_len = ipoib_sendq_size * 2;
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device
__ipoib_mcast_add(dev, mcast);
list_add_tail(&mcast->list, &priv->multicast_list);
}
- if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof(struct ipoib_pseudo_header));
skb_queue_tail(&mcast->pkt_queue, skb);
- else {
+ } else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}