Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 5/7] s390/qeth: pass full IQD header length to fill_buffer()
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann
In-Reply-To: <20170818081910.48869-1-jwi@linux.vnet.ibm.com>

This is a prerequisite for unifying the code to build header elements.
The TSO header has a different size, so we can no longer rely on implicitly
adding the size of a normal qeth_hdr.

No functional change.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
---
 drivers/s390/net/qeth_core_main.c | 3 +--
 drivers/s390/net/qeth_l2_main.c   | 2 +-
 drivers/s390/net/qeth_l3_main.c   | 3 ++-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 4a5c3028dfb6..cef9f54d0eb9 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3981,8 +3981,7 @@ static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 		is_first_elem = false;
 
 		buffer->element[element].addr = hdr;
-		buffer->element[element].length = sizeof(struct qeth_hdr) +
-							hd_len;
+		buffer->element[element].length = hd_len;
 		buffer->element[element].eflags = SBAL_EFLAGS_FIRST_FRAG;
 		buf->is_header[element] = 1;
 		buf->next_element_to_fill++;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index a6233ab562f0..c85fadf21b38 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -695,7 +695,7 @@ static int qeth_l2_xmit_iqd(struct qeth_card *card, struct sk_buff *skb,
 		goto out;
 	}
 	rc = qeth_do_send_packet_fast(card, queue, skb, hdr, data_offset,
-				      data_offset);
+				      sizeof(*hdr) + data_offset);
 out:
 	if (rc)
 		kmem_cache_free(qeth_core_header_cache, hdr);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 02400bbcb610..ab661a431f7c 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2670,6 +2670,7 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 	if (card->info.type == QETH_CARD_TYPE_IQD) {
 		new_skb = skb;
 		data_offset = ETH_HLEN;
+		hd_len = sizeof(*hdr);
 		hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
 		if (!hdr)
 			goto tx_drop;
@@ -2771,7 +2772,7 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 					 hd_len, elements);
 	} else
 		rc = qeth_do_send_packet_fast(card, queue, new_skb, hdr,
-					      data_offset, 0);
+					      data_offset, hd_len);
 
 	if (!rc) {
 		card->stats.tx_packets++;
-- 
2.11.2

^ permalink raw reply related

* [PATCH net-next 4/7] s390/qeth: pass TSO data offset to fill_buffer()
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann
In-Reply-To: <20170818081910.48869-1-jwi@linux.vnet.ibm.com>

For TSO we need to skip the skb's qeth/IP/TCP headers when mapping
it into buffer elements. Instead of (mis)using skb_pull(), pass a
corresponding offset to fill_buffer() like we already do for IQDs.

No actual change in the resulting TSO buffers.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
---
 drivers/s390/net/qeth_core.h      |  2 +-
 drivers/s390/net/qeth_core_main.c | 10 ++++------
 drivers/s390/net/qeth_l2_main.c   |  4 ++--
 drivers/s390/net/qeth_l3_main.c   |  2 +-
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 2f5673812810..5753fbc485d5 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -952,7 +952,7 @@ int qeth_do_send_packet_fast(struct qeth_card *card,
 			     unsigned int hd_len);
 int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 			struct sk_buff *skb, struct qeth_hdr *hdr,
-			unsigned int hd_len, int elements);
+			unsigned int hd_len, unsigned int offset, int elements);
 int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int qeth_core_get_sset_count(struct net_device *, int);
 void qeth_core_get_ethtool_stats(struct net_device *,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 6cafeceea3ce..4a5c3028dfb6 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3975,11 +3975,8 @@ static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 		buffer->element[element].length = hd_len;
 		buffer->element[element].eflags = SBAL_EFLAGS_FIRST_FRAG;
 		buf->next_element_to_fill++;
-		skb_pull(skb, hd_len);
-	}
-
 	/* IQD */
-	if (offset > 0) {
+	} else if (offset) {
 		int element = buf->next_element_to_fill;
 		is_first_elem = false;
 
@@ -4049,7 +4046,8 @@ EXPORT_SYMBOL_GPL(qeth_do_send_packet_fast);
 
 int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 			struct sk_buff *skb, struct qeth_hdr *hdr,
-			unsigned int hd_len, int elements_needed)
+			unsigned int offset, unsigned int hd_len,
+			int elements_needed)
 {
 	struct qeth_qdio_out_buffer *buffer;
 	int start_index;
@@ -4098,7 +4096,7 @@ int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 			}
 		}
 	}
-	tmp = qeth_fill_buffer(queue, buffer, skb, hdr, 0, hd_len);
+	tmp = qeth_fill_buffer(queue, buffer, skb, hdr, offset, hd_len);
 	queue->next_buf_to_fill = (queue->next_buf_to_fill + tmp) %
 				  QDIO_MAX_BUFFERS_PER_Q;
 	flush_count += tmp;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index c78d9fadb9c8..a6233ab562f0 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -746,7 +746,7 @@ static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = qeth_do_send_packet(card, queue, skb_copy, hdr, 0, elements);
+	rc = qeth_do_send_packet(card, queue, skb_copy, hdr, 0, 0, elements);
 out:
 	if (!rc) {
 		/* tx success, free dangling original */
@@ -778,7 +778,7 @@ static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
 		return -E2BIG;
 	if (qeth_hdr_chk_and_bounce(skb, &hdr, sizeof(*hdr)))
 		return -EINVAL;
-	return qeth_do_send_packet(card, queue, skb, hdr, 0, elements);
+	return qeth_do_send_packet(card, queue, skb, hdr, 0, 0, elements);
 }
 
 static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index fa8b638e3842..02400bbcb610 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2768,7 +2768,7 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 		if (qeth_hdr_chk_and_bounce(new_skb, &hdr, len))
 			goto tx_drop;
 		rc = qeth_do_send_packet(card, queue, new_skb, hdr, hd_len,
-					 elements);
+					 hd_len, elements);
 	} else
 		rc = qeth_do_send_packet_fast(card, queue, new_skb, hdr,
 					      data_offset, 0);
-- 
2.11.2

^ permalink raw reply related

* [PATCH net-next 1/7] s390/qeth: split L2 xmit paths
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann
In-Reply-To: <20170818081910.48869-1-jwi@linux.vnet.ibm.com>

l2_hard_start_xmit() actually doesn't contain much shared code,
and having device-specific paths makes isolated changes a lot easier.
So split it into three routines for IQD, OSN and OSD/OSM/OSX.

No functional change.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
---
 drivers/s390/net/qeth_l2_main.c | 225 ++++++++++++++++++++++------------------
 1 file changed, 123 insertions(+), 102 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 438a7f29e99f..310bfa225e20 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -676,143 +676,164 @@ static void qeth_l2_set_rx_mode(struct net_device *dev)
 		qeth_promisc_to_bridge(card);
 }
 
-static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
-					   struct net_device *dev)
+static int qeth_l2_xmit_iqd(struct qeth_card *card, struct sk_buff *skb,
+			    struct qeth_qdio_out_q *queue, int cast_type)
 {
+	unsigned int data_offset = ETH_HLEN;
+	struct qeth_hdr *hdr;
 	int rc;
-	struct qeth_hdr *hdr = NULL;
-	int elements = 0;
-	struct qeth_card *card = dev->ml_priv;
-	struct sk_buff *new_skb = skb;
-	int cast_type = qeth_l2_get_cast_type(card, skb);
-	struct qeth_qdio_out_q *queue;
-	int tx_bytes = skb->len;
-	int data_offset = -1;
-	int elements_needed = 0;
-	int hd_len = 0;
-	unsigned int nr_frags;
 
-	if (card->qdio.do_prio_queueing || (cast_type &&
-					card->info.is_multicast_different))
-		queue = card->qdio.out_qs[qeth_get_priority_queue(card, skb,
-					qeth_get_ip_version(skb), cast_type)];
-	else
-		queue = card->qdio.out_qs[card->qdio.default_out_queue];
+	hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
+	if (!hdr)
+		return -ENOMEM;
+	qeth_l2_fill_header(card, hdr, skb, cast_type);
+	hdr->hdr.l2.pkt_length = skb->len;
+	skb_copy_from_linear_data(skb, ((char *)hdr) + sizeof(*hdr),
+				  data_offset);
 
-	if ((card->state != CARD_STATE_UP) || !card->lan_online) {
-		card->stats.tx_carrier_errors++;
-		goto tx_drop;
+	if (!qeth_get_elements_no(card, skb, 1, data_offset)) {
+		rc = -E2BIG;
+		goto out;
 	}
+	rc = qeth_do_send_packet_fast(card, queue, skb, hdr, data_offset,
+				      data_offset);
+out:
+	if (rc)
+		kmem_cache_free(qeth_core_header_cache, hdr);
+	return rc;
+}
 
-	if ((card->info.type == QETH_CARD_TYPE_OSN) &&
-	    (skb->protocol == htons(ETH_P_IPV6)))
-		goto tx_drop;
-
-	if (card->options.performance_stats) {
-		card->perf_stats.outbound_cnt++;
-		card->perf_stats.outbound_start_time = qeth_get_micros();
-	}
-	netif_stop_queue(dev);
+static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
+			    struct qeth_qdio_out_q *queue, int cast_type)
+{
+	unsigned int elements, nr_frags;
+	struct sk_buff *skb_copy;
+	struct qeth_hdr *hdr;
+	int rc;
 
 	/* fix hardware limitation: as long as we do not have sbal
 	 * chaining we can not send long frag lists
 	 */
-	if ((card->info.type != QETH_CARD_TYPE_IQD) &&
-	    !qeth_get_elements_no(card, new_skb, 0, 0)) {
-		int lin_rc = skb_linearize(new_skb);
+	if (!qeth_get_elements_no(card, skb, 0, 0)) {
+		rc = skb_linearize(skb);
 
 		if (card->options.performance_stats) {
-			if (lin_rc)
+			if (rc)
 				card->perf_stats.tx_linfail++;
 			else
 				card->perf_stats.tx_lin++;
 		}
-		if (lin_rc)
-			goto tx_drop;
+		if (rc)
+			return rc;
 	}
-	nr_frags = skb_shinfo(new_skb)->nr_frags;
+	nr_frags = skb_shinfo(skb)->nr_frags;
 
-	if (card->info.type == QETH_CARD_TYPE_OSN)
-		hdr = (struct qeth_hdr *)skb->data;
-	else {
-		if (card->info.type == QETH_CARD_TYPE_IQD) {
-			new_skb = skb;
-			data_offset = ETH_HLEN;
-			hd_len = ETH_HLEN;
-			hdr = kmem_cache_alloc(qeth_core_header_cache,
-						GFP_ATOMIC);
-			if (!hdr)
-				goto tx_drop;
-			elements_needed++;
-			qeth_l2_fill_header(card, hdr, new_skb, cast_type);
-			hdr->hdr.l2.pkt_length = new_skb->len;
-			skb_copy_from_linear_data(new_skb,
-						  ((char *)hdr) + sizeof(*hdr),
-						  ETH_HLEN);
-		} else {
-			/* create a clone with writeable headroom */
-			new_skb = skb_realloc_headroom(skb,
-						sizeof(struct qeth_hdr));
-			if (!new_skb)
-				goto tx_drop;
-			hdr = skb_push(new_skb, sizeof(struct qeth_hdr));
-			qeth_l2_fill_header(card, hdr, new_skb, cast_type);
-			if (new_skb->ip_summed == CHECKSUM_PARTIAL)
-				qeth_l2_hdr_csum(card, hdr, new_skb);
-		}
-	}
+	/* create a copy with writeable headroom */
+	skb_copy = skb_realloc_headroom(skb, sizeof(struct qeth_hdr));
+	if (!skb_copy)
+		return -ENOMEM;
+	hdr = skb_push(skb_copy, sizeof(struct qeth_hdr));
+	qeth_l2_fill_header(card, hdr, skb_copy, cast_type);
+	if (skb_copy->ip_summed == CHECKSUM_PARTIAL)
+		qeth_l2_hdr_csum(card, hdr, skb_copy);
 
-	elements = qeth_get_elements_no(card, new_skb, elements_needed,
-					(data_offset > 0) ? data_offset : 0);
+	elements = qeth_get_elements_no(card, skb_copy, 0, 0);
 	if (!elements) {
-		if (data_offset >= 0)
-			kmem_cache_free(qeth_core_header_cache, hdr);
-		goto tx_drop;
+		rc = -E2BIG;
+		goto out;
 	}
-
-	if (card->info.type != QETH_CARD_TYPE_IQD) {
-		if (qeth_hdr_chk_and_bounce(new_skb, &hdr,
-		    sizeof(struct qeth_hdr_layer2)))
-			goto tx_drop;
-		rc = qeth_do_send_packet(card, queue, new_skb, hdr,
-					 elements);
-	} else
-		rc = qeth_do_send_packet_fast(card, queue, new_skb, hdr,
-					      data_offset, hd_len);
+	if (qeth_hdr_chk_and_bounce(skb_copy, &hdr, sizeof(*hdr))) {
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = qeth_do_send_packet(card, queue, skb_copy, hdr, elements);
+out:
 	if (!rc) {
-		card->stats.tx_packets++;
-		card->stats.tx_bytes += tx_bytes;
+		/* tx success, free dangling original */
+		dev_kfree_skb_any(skb);
 		if (card->options.performance_stats && nr_frags) {
 			card->perf_stats.sg_skbs_sent++;
 			/* nr_frags + skb->data */
 			card->perf_stats.sg_frags_sent += nr_frags + 1;
 		}
-		if (new_skb != skb)
-			dev_kfree_skb_any(skb);
-		rc = NETDEV_TX_OK;
 	} else {
-		if (data_offset >= 0)
-			kmem_cache_free(qeth_core_header_cache, hdr);
+		/* tx fail, free copy */
+		dev_kfree_skb_any(skb_copy);
+	}
+	return rc;
+}
 
-		if (rc == -EBUSY) {
-			if (new_skb != skb)
-				dev_kfree_skb_any(new_skb);
-			return NETDEV_TX_BUSY;
-		} else
-			goto tx_drop;
+static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
+			    struct qeth_qdio_out_q *queue)
+{
+	unsigned int elements;
+	struct qeth_hdr *hdr;
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return -EPROTONOSUPPORT;
+
+	hdr = (struct qeth_hdr *)skb->data;
+	elements = qeth_get_elements_no(card, skb, 0, 0);
+	if (!elements)
+		return -E2BIG;
+	if (qeth_hdr_chk_and_bounce(skb, &hdr, sizeof(*hdr)))
+		return -EINVAL;
+	return qeth_do_send_packet(card, queue, skb, hdr, elements);
+}
+
+static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct qeth_card *card = dev->ml_priv;
+	int cast_type = qeth_l2_get_cast_type(card, skb);
+	struct qeth_qdio_out_q *queue;
+	int tx_bytes = skb->len;
+	int rc;
+
+	if (card->qdio.do_prio_queueing || (cast_type &&
+					card->info.is_multicast_different))
+		queue = card->qdio.out_qs[qeth_get_priority_queue(card, skb,
+					qeth_get_ip_version(skb), cast_type)];
+	else
+		queue = card->qdio.out_qs[card->qdio.default_out_queue];
+
+	if ((card->state != CARD_STATE_UP) || !card->lan_online) {
+		card->stats.tx_carrier_errors++;
+		goto tx_drop;
 	}
 
-	netif_wake_queue(dev);
-	if (card->options.performance_stats)
-		card->perf_stats.outbound_time += qeth_get_micros() -
-			card->perf_stats.outbound_start_time;
-	return rc;
+	if (card->options.performance_stats) {
+		card->perf_stats.outbound_cnt++;
+		card->perf_stats.outbound_start_time = qeth_get_micros();
+	}
+	netif_stop_queue(dev);
+
+	switch (card->info.type) {
+	case QETH_CARD_TYPE_OSN:
+		rc = qeth_l2_xmit_osn(card, skb, queue);
+		break;
+	case QETH_CARD_TYPE_IQD:
+		rc = qeth_l2_xmit_iqd(card, skb, queue, cast_type);
+		break;
+	default:
+		rc = qeth_l2_xmit_osa(card, skb, queue, cast_type);
+	}
+
+	if (!rc) {
+		card->stats.tx_packets++;
+		card->stats.tx_bytes += tx_bytes;
+		if (card->options.performance_stats)
+			card->perf_stats.outbound_time += qeth_get_micros() -
+				card->perf_stats.outbound_start_time;
+		netif_wake_queue(dev);
+		return NETDEV_TX_OK;
+	} else if (rc == -EBUSY) {
+		return NETDEV_TX_BUSY;
+	} /* else fall through */
 
 tx_drop:
 	card->stats.tx_dropped++;
 	card->stats.tx_errors++;
-	if ((new_skb != skb) && new_skb)
-		dev_kfree_skb_any(new_skb);
 	dev_kfree_skb_any(skb);
 	netif_wake_queue(dev);
 	return NETDEV_TX_OK;
-- 
2.11.2

^ permalink raw reply related

* [PATCH net-next 2/7] s390/qeth: pass full data length to l2_fill_header()
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann
In-Reply-To: <20170818081910.48869-1-jwi@linux.vnet.ibm.com>

For IQD we already need to fix up the qeth_hdr's length field, and
future changes will require more flexibility for OSA as well. The
device-specific path knows best what header length it requires, so just
pass it from there.
While at it, remove the unused qeth_card parameter.

No functional change.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
---
 drivers/s390/net/qeth_l2_main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 310bfa225e20..3f5b852408d3 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -259,13 +259,14 @@ static void qeth_l2_hdr_csum(struct qeth_card *card, struct qeth_hdr *hdr,
 		card->perf_stats.tx_csum++;
 }
 
-static void qeth_l2_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
-			struct sk_buff *skb, int cast_type)
+static void qeth_l2_fill_header(struct qeth_hdr *hdr, struct sk_buff *skb,
+				int cast_type, unsigned int data_len)
 {
 	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 
 	memset(hdr, 0, sizeof(struct qeth_hdr));
 	hdr->hdr.l2.id = QETH_HEADER_TYPE_LAYER2;
+	hdr->hdr.l2.pkt_length = data_len;
 
 	/* set byte byte 3 to casting flags */
 	if (cast_type == RTN_MULTICAST)
@@ -275,7 +276,6 @@ static void qeth_l2_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
 	else
 		hdr->hdr.l2.flags[2] |= QETH_LAYER2_FLAG_UNICAST;
 
-	hdr->hdr.l2.pkt_length = skb->len - sizeof(struct qeth_hdr);
 	/* VSWITCH relies on the VLAN
 	 * information to be present in
 	 * the QDIO header */
@@ -686,8 +686,7 @@ static int qeth_l2_xmit_iqd(struct qeth_card *card, struct sk_buff *skb,
 	hdr = kmem_cache_alloc(qeth_core_header_cache, GFP_ATOMIC);
 	if (!hdr)
 		return -ENOMEM;
-	qeth_l2_fill_header(card, hdr, skb, cast_type);
-	hdr->hdr.l2.pkt_length = skb->len;
+	qeth_l2_fill_header(hdr, skb, cast_type, skb->len);
 	skb_copy_from_linear_data(skb, ((char *)hdr) + sizeof(*hdr),
 				  data_offset);
 
@@ -733,7 +732,8 @@ static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
 	if (!skb_copy)
 		return -ENOMEM;
 	hdr = skb_push(skb_copy, sizeof(struct qeth_hdr));
-	qeth_l2_fill_header(card, hdr, skb_copy, cast_type);
+	qeth_l2_fill_header(hdr, skb_copy, cast_type,
+			    skb_copy->len - sizeof(*hdr));
 	if (skb_copy->ip_summed == CHECKSUM_PARTIAL)
 		qeth_l2_hdr_csum(card, hdr, skb_copy);
 
-- 
2.11.2

^ permalink raw reply related

* [PATCH net-next 3/7] s390/qeth: pass TSO header length to fill_buffer()
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann
In-Reply-To: <20170818081910.48869-1-jwi@linux.vnet.ibm.com>

The TSO code already calculates the length of its header element,
no need to duplicate this in the low-level code again.

Use this opportunity to make hd_len unsigned, and for TSO match
its calculation to what tso_fill_header() does.

No functional change.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
---
 drivers/s390/net/qeth_core.h      |  7 ++++---
 drivers/s390/net/qeth_core_main.c | 18 ++++++++----------
 drivers/s390/net/qeth_l2_main.c   |  4 ++--
 drivers/s390/net/qeth_l3_main.c   | 15 +++++++++------
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 4a4ca5cb37a0..2f5673812810 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -949,9 +949,10 @@ int qeth_get_elements_for_frags(struct sk_buff *);
 int qeth_do_send_packet_fast(struct qeth_card *card,
 			     struct qeth_qdio_out_q *queue, struct sk_buff *skb,
 			     struct qeth_hdr *hdr, unsigned int offset,
-			     int hd_len);
-int qeth_do_send_packet(struct qeth_card *, struct qeth_qdio_out_q *,
-		    struct sk_buff *, struct qeth_hdr *, int);
+			     unsigned int hd_len);
+int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
+			struct sk_buff *skb, struct qeth_hdr *hdr,
+			unsigned int hd_len, int elements);
 int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int qeth_core_get_sset_count(struct net_device *, int);
 void qeth_core_get_ethtool_stats(struct net_device *,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 415424e618ad..6cafeceea3ce 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3956,11 +3956,11 @@ static void __qeth_fill_buffer(struct sk_buff *skb,
 static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 			    struct qeth_qdio_out_buffer *buf,
 			    struct sk_buff *skb, struct qeth_hdr *hdr,
-			    unsigned int offset, int hd_len)
+			    unsigned int offset, unsigned int hd_len)
 {
 	struct qdio_buffer *buffer;
-	int flush_cnt = 0, hdr_len;
 	bool is_first_elem = true;
+	int flush_cnt = 0;
 
 	buffer = buf->buffer;
 	refcount_inc(&skb->users);
@@ -3970,14 +3970,12 @@ static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 		int element = buf->next_element_to_fill;
 		is_first_elem = false;
 
-		hdr_len = sizeof(struct qeth_hdr_tso) +
-			((struct qeth_hdr_tso *)hdr)->ext.dg_hdr_len;
 		/*fill first buffer entry only with header information */
 		buffer->element[element].addr = skb->data;
-		buffer->element[element].length = hdr_len;
+		buffer->element[element].length = hd_len;
 		buffer->element[element].eflags = SBAL_EFLAGS_FIRST_FRAG;
 		buf->next_element_to_fill++;
-		skb_pull(skb, hdr_len);
+		skb_pull(skb, hd_len);
 	}
 
 	/* IQD */
@@ -4020,7 +4018,7 @@ static int qeth_fill_buffer(struct qeth_qdio_out_q *queue,
 int qeth_do_send_packet_fast(struct qeth_card *card,
 			     struct qeth_qdio_out_q *queue, struct sk_buff *skb,
 			     struct qeth_hdr *hdr, unsigned int offset,
-			     int hd_len)
+			     unsigned int hd_len)
 {
 	struct qeth_qdio_out_buffer *buffer;
 	int index;
@@ -4050,8 +4048,8 @@ int qeth_do_send_packet_fast(struct qeth_card *card,
 EXPORT_SYMBOL_GPL(qeth_do_send_packet_fast);
 
 int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
-		struct sk_buff *skb, struct qeth_hdr *hdr,
-		int elements_needed)
+			struct sk_buff *skb, struct qeth_hdr *hdr,
+			unsigned int hd_len, int elements_needed)
 {
 	struct qeth_qdio_out_buffer *buffer;
 	int start_index;
@@ -4100,7 +4098,7 @@ int qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
 			}
 		}
 	}
-	tmp = qeth_fill_buffer(queue, buffer, skb, hdr, 0, 0);
+	tmp = qeth_fill_buffer(queue, buffer, skb, hdr, 0, hd_len);
 	queue->next_buf_to_fill = (queue->next_buf_to_fill + tmp) %
 				  QDIO_MAX_BUFFERS_PER_Q;
 	flush_count += tmp;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 3f5b852408d3..c78d9fadb9c8 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -746,7 +746,7 @@ static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = qeth_do_send_packet(card, queue, skb_copy, hdr, elements);
+	rc = qeth_do_send_packet(card, queue, skb_copy, hdr, 0, elements);
 out:
 	if (!rc) {
 		/* tx success, free dangling original */
@@ -778,7 +778,7 @@ static int qeth_l2_xmit_osn(struct qeth_card *card, struct sk_buff *skb,
 		return -E2BIG;
 	if (qeth_hdr_chk_and_bounce(skb, &hdr, sizeof(*hdr)))
 		return -EINVAL;
-	return qeth_do_send_packet(card, queue, skb, hdr, elements);
+	return qeth_do_send_packet(card, queue, skb, hdr, 0, elements);
 }
 
 static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 0a3dc14a1381..fa8b638e3842 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2637,6 +2637,7 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 			qeth_get_priority_queue(card, skb, ipv, cast_type) :
 			card->qdio.default_out_queue];
 	int tx_bytes = skb->len;
+	unsigned int hd_len = 0;
 	bool use_tso;
 	int data_offset = -1;
 	unsigned int nr_frags;
@@ -2756,16 +2757,18 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 
 	if (card->info.type != QETH_CARD_TYPE_IQD) {
 		int len;
-		if (use_tso)
-			len = ((unsigned long)tcp_hdr(new_skb) +
-				tcp_hdrlen(new_skb)) -
-				(unsigned long)new_skb->data;
-		else
+		if (use_tso) {
+			hd_len = sizeof(struct qeth_hdr_tso) +
+				 ip_hdrlen(new_skb) + tcp_hdrlen(new_skb);
+			len = hd_len;
+		} else {
 			len = sizeof(struct qeth_hdr_layer3);
+		}
 
 		if (qeth_hdr_chk_and_bounce(new_skb, &hdr, len))
 			goto tx_drop;
-		rc = qeth_do_send_packet(card, queue, new_skb, hdr, elements);
+		rc = qeth_do_send_packet(card, queue, new_skb, hdr, hd_len,
+					 elements);
 	} else
 		rc = qeth_do_send_packet_fast(card, queue, new_skb, hdr,
 					      data_offset, 0);
-- 
2.11.2

^ permalink raw reply related

* [PATCH net-next 0/7] s390/net: more updates for 4.14
From: Julian Wiedmann @ 2017-08-18  8:19 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-s390, Martin Schwidefsky, Heiko Carstens,
	Stefan Raspl, Ursula Braun, Julian Wiedmann

Hi Dave,

please apply another batch of qeth patches for net-next.
This reworks the xmit path for L2 OSAs to use skb_cow_head() instead of
skb_realloc_headroom().

Thanks,
Julian

Julian Wiedmann (7):
  s390/qeth: split L2 xmit paths
  s390/qeth: pass full data length to l2_fill_header()
  s390/qeth: pass TSO header length to fill_buffer()
  s390/qeth: pass TSO data offset to fill_buffer()
  s390/qeth: pass full IQD header length to fill_buffer()
  s390/qeth: unify code to build header elements
  s390/qeth: use skb_cow_head() for L2 OSA xmit

 drivers/s390/net/qeth_core.h      |   8 +-
 drivers/s390/net/qeth_core_main.c |  78 +++++++-----
 drivers/s390/net/qeth_l2_main.c   | 243 ++++++++++++++++++++++----------------
 drivers/s390/net/qeth_l3_main.c   |  18 +--
 4 files changed, 206 insertions(+), 141 deletions(-)

-- 
2.11.2

^ permalink raw reply

* [patch net] net: sched: fix p_filter_chain check in tcf_chain_flush
From: Jiri Pirko @ 2017-08-18  8:10 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

The dereference before check is wrong and leads to an oops when
p_filter_chain is NULL. The check needs to be done on the pointer to
prevent NULL dereference.

Fixes: f93e1cdcf42c ("net/sched: fix filter flushing")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ebeeb87..eef6b07 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -190,7 +190,7 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 {
 	struct tcf_proto *tp;
 
-	if (*chain->p_filter_chain)
+	if (chain->p_filter_chain)
 		RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
 	while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH net-next] bpf: Fix map-in-map checking in the verifier
From: Daniel Borkmann @ 2017-08-18  8:10 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: Alexei Starovoitov, kernel-team, John Fastabend
In-Reply-To: <20170818011443.562793-1-kafai@fb.com>

On 08/18/2017 03:14 AM, Martin KaFai Lau wrote:
> In check_map_func_compatibility(), a 'break' has been accidentally
> removed for the BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS
> cases.  This patch adds it back.
>
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Cc: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>

Thanks for spotting!

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

^ permalink raw reply

* Re: [PATCH iproute2 json v2 00/27] ip: add -json support to 'ip link show'
From: Julien Fortin @ 2017-08-18  7:56 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Roopa Prabhu, Nikolay Aleksandrov, David Ahern
In-Reply-To: <20170817180459.7e75d650@xeon-e3>

Ack, thanks Stephen.

On Thu, Aug 17, 2017 at 6:04 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Thu, 17 Aug 2017 10:35:47 -0700
> Julien Fortin <julien@cumulusnetworks.com> wrote:
>
>> From: Julien Fortin <julien@cumulusnetworks.com>
>>
>> This patch series adds json support to 'ip [-details] link show [dev DEV]'
>> Each patch describes the json schema it adds and provides some examples.
>>
>> Julien Fortin (27):
>>   color: add new COLOR_NONE and disable_color function
>>   ip: add new command line argument -json (mutually exclusive with
>>     -color)
>>   json_writer: add new json handlers (null, float with format, lluint,
>>     hu)
>>   ip: ip_print: add new API to print JSON or regular format output
>>   ip: ipaddress.c: add support for json output
>>   ip: iplink.c: open/close json obj for ip -brief -json link show dev
>>     DEV
>>   ip: iplink_bond.c: add json output support
>>   ip: iplink_bond_slave.c: add json output support (info_slave_data)
>>   ip: iplink_hsr.c: add json output support
>>   ip: iplink_bridge.c: add json output support
>>   ip: iplink_bridge_slave.c: add json output support
>>   ip: iplink_can.c: add json output support
>>   ip: iplink_geneve.c: add json output support
>>   ip: iplink_ipoib.c: add json output support
>>   ip: iplink_ipvlan.c: add json output support
>>   ip: iplink_vrf.c: add json output support
>>   ip: iplink_vxlan.c: add json output support
>>   ip: iplink_xdp.c: add json output support
>>   ip: ipmacsec.c: add json output support
>>   ip: link_gre.c: add json output support
>>   ip: link_gre6.c: add json output support
>>   ip: link_ip6tnl.c: add json output support
>>   ip: link_iptnl.c: add json output support
>>   ip: link_vti.c: add json output support
>>   ip: link_vti6.c: add json output support
>>   ip: link_macvlan.c: add json output support
>>   ip: iplink_vlan.c: add json output support
>>
>>  include/color.h          |    2 +
>>  include/json_writer.h    |    9 +
>>  include/utils.h          |    1 +
>>  ip/Makefile              |    2 +-
>>  ip/ip.c                  |    6 +
>>  ip/ip_common.h           |   56 +++
>>  ip/ip_print.c            |  233 ++++++++++
>>  ip/ipaddress.c           | 1064 ++++++++++++++++++++++++++++++++--------------
>>  ip/iplink.c              |    2 +
>>  ip/iplink_bond.c         |  231 ++++++----
>>  ip/iplink_bond_slave.c   |   57 ++-
>>  ip/iplink_bridge.c       |  293 ++++++++-----
>>  ip/iplink_bridge_slave.c |  185 ++++----
>>  ip/iplink_can.c          |  282 ++++++++----
>>  ip/iplink_geneve.c       |   86 +++-
>>  ip/iplink_hsr.c          |   36 +-
>>  ip/iplink_ipoib.c        |   30 +-
>>  ip/iplink_ipvlan.c       |    8 +-
>>  ip/iplink_macvlan.c      |   37 +-
>>  ip/iplink_vlan.c         |   62 ++-
>>  ip/iplink_vrf.c          |   13 +-
>>  ip/iplink_vxlan.c        |  161 ++++---
>>  ip/iplink_xdp.c          |   31 +-
>>  ip/ipmacsec.c            |   84 +++-
>>  ip/link_gre.c            |  147 ++++---
>>  ip/link_gre6.c           |  142 +++++--
>>  ip/link_ip6tnl.c         |  172 +++++---
>>  ip/link_iptnl.c          |  155 ++++---
>>  ip/link_vti.c            |   24 +-
>>  ip/link_vti6.c           |   22 +-
>>  lib/color.c              |   12 +-
>>  lib/json_writer.c        |   44 +-
>>  32 files changed, 2663 insertions(+), 1026 deletions(-)
>>  create mode 100644 ip/ip_print.c
>>
>
> This looks good, Thanks Julien et all.
> Applied to net-next branch.
> You may need to make sure that as new features get added that json support
> continues to work.
>
>

^ permalink raw reply

* Re: [PATCH net-next] bpf: fix a return in sockmap_get_from_fd()
From: John Fastabend @ 2017-08-18  7:40 UTC (permalink / raw)
  To: Dan Carpenter, Alexei Starovoitov
  Cc: Daniel Borkmann, netdev, kernel-janitors
In-Reply-To: <20170818071210.wyq37kura6wz6bx6@mwanda>

On 08/18/2017 12:27 AM, Dan Carpenter wrote:
> "map" is a valid pointer.  We wanted to return "err" instead.  Also
> let's return a zero literal at the end.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> 
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index d2f2bdf71ffa..b8cb1b3c9bfb 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c

Thanks, just a note though the sockmap attach call can not fail so
this branch is effectively never used. We could just remove the
branch but having an unchecked 'err' code seems fragile if we ever
do find a reason to fail in the attach path.

Acked-by: John Fastabend <john.fastabend@gmail.com>

> @@ -1125,11 +1125,11 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
>  		fdput(f);
>  		bpf_prog_put(prog1);
>  		bpf_prog_put(prog2);
> -		return PTR_ERR(map);
> +		return err;
>  	}
>  
>  	fdput(f);
> -	return err;
> +	return 0;
>  }
>  
>  static int bpf_prog_attach(const union bpf_attr *attr)
> 

^ permalink raw reply

* Re: [net-next PATCH 06/10] bpf: sockmap with sk redirect support
From: John Fastabend @ 2017-08-18  7:35 UTC (permalink / raw)
  To: Alexei Starovoitov, davem, daniel; +Cc: tgraf, netdev, tom
In-Reply-To: <a6dafa0a-9cb2-6d70-5db7-fe44108a6c2c@fb.com>

On 08/17/2017 03:28 PM, Alexei Starovoitov wrote:
> On 8/17/17 11:58 AM, John Fastabend wrote:
>>>> +    /* reserve BPF programs early so can abort easily on failures */
>>>> +    if (map_flags & BPF_SOCKMAP_STRPARSER) {
>>> why have two 'flags' arguments and new helper just for this?
>>> can normal update() be used and extra bits of flag there?
>>>
>> The new helper is needed regardless to handle consuming the skops ctx
>> pointer from programs attached to cgroups. This way we can attach sockets
>> in cgroups when they enter specified TCP states.
>>
>> The map_flags arg was because I expect we may end up with a few more flags
>> in sockmap and thought it was reasonable to keep separate namespaces for
>> the two flag types, BPF_ and BPF_SOCKMAP_*. It does however have the one
>> issue that when doing the update via syscall the flags are not available.
>>
>> If there is no objection to consuming some bits of the normal flags a small
>> patch could do that. I think we will need at least two more bits going forward
>> for additional features. I guess though the map flags is not pressed for
>> bit space yet though.
>>
> 
> hmm. looking at the patch further. I'm not excited with this new api.

OK first thanks for looking it over.

> Why this BPF_SOCKMAP_STRPARSER flag is needed at all?
> The sample code doesn't use it. What could be the use case?

The use case is to allow 'redirect' to a socket but without having the
receive side use strparser/verdict programs. If the flags not set we
wont run the strparser.

> Some future proofing? but the code seems to work properly only in
> strparser mode... like 'verdict' bpf prog is called only
> from strparser callback... and no other way to call it...
> and normal map_update_elem() implies BPF_SOCKMAP_STRPARSER too...
> I don't see detach api either..
> .

The detach is not yet implemented, I'll add it shortly.

> Why BPF_CGROUP_SMAP_INGRESS has 'cgroup' suffix in there?
> It doesn't deal with cgroups. It attaches a pair of prog_fds to sockmap.
> I guess it's ok-ish to use BPF_PROG_ATTACH syscall command for that,
> but using BPF_CGROUP_SMAP_INGRESS as 'attach type' is very confusing.

Point taken, bad naming scheme but easy enough to fix.

> Why couldn't you use multiple normal bpf_map_update() commands to
> store two prog_fds into stockmap ?
> You could have reserved two special numerical key values 0xdead and
> 0xbeef or -1/-2 for these two progs?
> If I read it correctly this new prog_attach sub-command doesn't attach
> to any event, it only stores two hidden bpf programs inside sockmap.
> 

see proposal below.

> Later proper prog_attach to actual cgroup is done with normal cgroup_fd
> and BPF_CGROUP_SOCK_OPS prog type which looks completely independent
> of sockmap, no?

Yes agreed. Although I expect sockmap will primarily be used with
BPF_CGROUP_SOCK_OPS with population of a sock map done through the
SOCK_OPS helpers. But, I agree sockmap fundamentally can stand on its
own without cgroups.

> It seems right now there is psock <-> one sockmap restriction
> which looks implementation related. If we ever want to remove that
> restriction such uapi will prevent us from doing it, since it's
> doing too much stuff under the cover.

The psock <-> sockmap binding is a result of the programs being
attached to the sockmap and not to the sock object. I think the
proposal below resolves this.

> 
> i like the sock to sock redirect concept of the patch, but uapi needs
> to be improved before it goes to released kernel.
> 
> How about we break it down into smaller chunks of work and
> expose what's happening underneath as explicit user driven actions?
> Like:
> - map_fd = create_map(BPF_MAP_TYPE_SOCKMAP) will create an empty map
> - add new explicit helper to create psock in given map_fd or
>   use prog_attach() cmd to attach to socket instead of sockmap ?
> - two map_udpate_elem(map_fd, -1, strparser_prog_fd)
>   map_update_elem(map_fd, -2, verdict_prog_fd)
>   will store the progs in there which are currently hidden.
>   map_delete_elem(-1) and (-2) would clear them.
>   Alternatively pass both FDs at once as 8-byte key into single
>   map_update() cmd
>   or create new psock object that will keep strpaser and verdict progs?
> - keep prog_attach(BPF_CGROUP_SOCK_OPS) as-is for
>   BPF_CGROUP_SOCK_OPS prog type which can call new helper
>   bpf_foo(ctx, ...) that will convert ctx socket into strparser mode
>   without making association to sockmap.
> 
> My main objection to the api is that hidden psock object makes
> the api confusing to use and I'm struggling to see how it will
> be extensible. Like what if we want more than two strparser+verdict
> progs in sockmap? or multiple sockmaps out of the same verdict prog?
> I'm also struggling to wrap my head how prog_attach cmd attaches
> to a map and hidden creation of strparser in a socket.
> It seems right now psock == one rx socket that works in strparser
> mode while sockmap is secondary set of tx sockets to redirect to.
> If so, psock and sockmap should be independent in uapi.
> Like prog_attach would attach strpaser and verdict programs to
> a socket switching it to strparser mode.
> If SK_REDIRECT is not returned it would be dropping skbs (like now),
> if redirect requested, the smap_do_verdict() will pick the next
> socket from sockmap and there will be no rigid connection
> between sockmap and psock and we can use multiple sockmaps
> from the same verdict program. smap_do_verdict() only needs
> to know peer socket to redirect to.
> Thoughts?
> 

OK I think there are a couple things we can do here.

>From an API perspective having all socks in a sockmap inherit the same
BPF programs is useful when working with cgroups. It keeps things consistent
and is pretty effective for applying policy to cgroup sockets. But,
in some cases it breaks down a bit and that is where the map_flags
and BPF_SOCKMAP_STRPARSER entered the picture. After this discussion
I think we can clean this up. Here is my proposal, let me know what you
think.

First I would like to continue allowing socks to inherit BPF programs
from sock map if users set this up. To clean it up and make it extensible
though,

  - instead of doing the attach_fd2 which would break quickly if we need
    fd(3,4,...) use two separate attach types,

         BPF_SMAP_STREAM_PARSER
         BPF_SMAP_STREAM_VERDICT

    the target fd is the map fd just as before.

    This allows us to easily extend as needed by adding another type and
    the map space is a u32 so we have plenty of room for extensions.

 - implement the detach for the above to remove the programs

Next lets just remove the map_flags BPF_SOCKMAP_STRPARSER. The UAPI is
simplified this way and the inheritance rule is clear. If BPF programs
are attached to the map they are inherited. If there is no BPF program
attached the socks do not use strparser/verdict logic and are purely
for redirect actions.

A sock may be in multiple maps but can only inherit a single BPF
stream/verdict program. Otherwise we would have no way to "know"
which stream parser to run.

Future extensions could provide an API for doing per sock attach operations
and I see no reason they would not be compatible. By adding two more
attach types,

        BPF_SOCK_STREAM_PARSER
        BPF_SOCK_STREAM_VERDICT

we can provide specific sock BPF programs. With verifier work we could
even make bpf helpers

        bpf_sock_prog_attach(skops, prog, type, flags)
        bpf_sock_map_attach(sockmap, key, prog, type, flags)

I think both this and the above work together nicely also the code can
support this with some additional work. To summarize the API then with
above changes,

 syscall:

  bpf_create_map(BPF_MAP_TYPE_SOCKMAP, .... )
  bpf_prog_attach(verdict_prog, map_fd, BPF_SMAP_STREAM_VERDICT, 0);
  bpf_prog_attach(parse_prog, map_fd, BPF_SMAP_STREAM_PARSER, 0);
  bpf_map_update_elem(map_fd, key, sock_fd, BPF_ANY)
  bpf_map_delete_elem(map_fd, key)

 helpers:
  to insert sock from sock ops progrm
      bpf_sock_map_update(skops, map, key, flags);
  to redirect skb to a sock in a sockmap
      bpf_sk_redirect_map(map, key, flags)

 future work:
  bpf_prog_attach(verdict_prog, map_fd, BPF_SOCK_STREAM_VERDICT, 0)
  bpf_prog_attach(parse_prog, map_fd, BPF_SOCK_STREAM_PARSER, 0)

How does this look? I think it will be both extensible and very usable
now.

Thanks,
John

^ permalink raw reply

* [PATCH net-next] bpf: fix a return in sockmap_get_from_fd()
From: Dan Carpenter @ 2017-08-18  7:27 UTC (permalink / raw)
  To: Alexei Starovoitov, John Fastabend
  Cc: Daniel Borkmann, netdev, kernel-janitors

"map" is a valid pointer.  We wanted to return "err" instead.  Also
let's return a zero literal at the end.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d2f2bdf71ffa..b8cb1b3c9bfb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1125,11 +1125,11 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype)
 		fdput(f);
 		bpf_prog_put(prog1);
 		bpf_prog_put(prog2);
-		return PTR_ERR(map);
+		return err;
 	}
 
 	fdput(f);
-	return err;
+	return 0;
 }
 
 static int bpf_prog_attach(const union bpf_attr *attr)

^ permalink raw reply related

* [PATCH net-next v4] openvswitch: enable NSH support
From: Yi Yang @ 2017-08-18  7:24 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, jbenc-H+wXaHxf7aLQT0dZR+AlfA, e

v3->v4
 - Add new NSH match field ttl
 - Update NSH header to the latest format
   which will be final format and won't change
   per its author's confirmation.
 - Fix comments for v3.

v2->v3
 - Change OVS_KEY_ATTR_NSH to nested key to handle
   length-fixed attributes and length-variable
   attriubte more flexibly.
 - Remove struct ovs_action_push_nsh completely
 - Add code to handle nested attribute for SET_MASKED
 - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
   to transfer NSH header data.
 - Fix comments and coding style issues by Jiri and Eric

v1->v2
 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
 - Dynamically allocate struct ovs_action_push_nsh for
   length-variable metadata.

OVS master and 2.8 branch has merged NSH userspace
patch series, this patch is to enable NSH support
in kernel data path in order that OVS can support
NSH in 2.8 release in compat mode by porting this.

Signed-off-by: Yi Yang <yi.y.yang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/net/vxlan.c              |   7 +
 include/net/nsh.h                | 325 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_ether.h    |   1 +
 include/uapi/linux/openvswitch.h |  30 ++++
 net/openvswitch/actions.c        | 177 +++++++++++++++++++
 net/openvswitch/flow.c           |  52 ++++++
 net/openvswitch/flow.h           |  11 ++
 net/openvswitch/flow_netlink.c   | 361 ++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/flow_netlink.h   |   3 +
 9 files changed, 966 insertions(+), 1 deletion(-)
 create mode 100644 include/net/nsh.h

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ae3a1da..a36c41e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -27,6 +27,7 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/vxlan.h>
+#include <net/nsh.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ip6_tunnel.h>
@@ -1268,6 +1269,9 @@ static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
 	case VXLAN_GPE_NP_IPV6:
 		*protocol = htons(ETH_P_IPV6);
 		break;
+	case VXLAN_GPE_NP_NSH:
+		*protocol = htons(ETH_P_NSH);
+		break;
 	case VXLAN_GPE_NP_ETHERNET:
 		*protocol = htons(ETH_P_TEB);
 		break;
@@ -1807,6 +1811,9 @@ static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
 	case htons(ETH_P_IPV6):
 		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
 		return 0;
+	case htons(ETH_P_NSH):
+		gpe->next_protocol = VXLAN_GPE_NP_NSH;
+		return 0;
 	case htons(ETH_P_TEB):
 		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
 		return 0;
diff --git a/include/net/nsh.h b/include/net/nsh.h
new file mode 100644
index 0000000..5186fff
--- /dev/null
+++ b/include/net/nsh.h
@@ -0,0 +1,325 @@
+#ifndef __NET_NSH_H
+#define __NET_NSH_H 1
+
+/*
+ * Network Service Header:
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |Ver|O|U|    TTL    |   Length  |U|U|U|U|MD Type| Next Protocol |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |          Service Path Identifier (SPI)        | Service Index |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                                                               |
+ * ~               Mandatory/Optional Context Headers              ~
+ * |                                                               |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Version: The version field is used to ensure backward compatibility
+ * going forward with future NSH specification updates.  It MUST be set
+ * to 0x0 by the sender, in this first revision of NSH.  Given the
+ * widespread implementation of existing hardware that uses the first
+ * nibble after an MPLS label stack for ECMP decision processing, this
+ * document reserves version 01b and this value MUST NOT be used in
+ * future versions of the protocol.  Please see [RFC7325] for further
+ * discussion of MPLS-related forwarding requirements.
+ *
+ * O bit: Setting this bit indicates an Operations, Administration, and
+ * Maintenance (OAM) packet.  The actual format and processing of SFC
+ * OAM packets is outside the scope of this specification (see for
+ * example [I-D.ietf-sfc-oam-framework] for one approach).
+ *
+ * The O bit MUST be set for OAM packets and MUST NOT be set for non-OAM
+ * packets.  The O bit MUST NOT be modified along the SFP.
+ *
+ * SF/SFF/SFC Proxy/Classifier implementations that do not support SFC
+ * OAM procedures SHOULD discard packets with O bit set, but MAY support
+ * a configurable parameter to enable forwarding received SFC OAM
+ * packets unmodified to the next element in the chain.  Forwarding OAM
+ * packets unmodified by SFC elements that do not support SFC OAM
+ * procedures may be acceptable for a subset of OAM functions, but can
+ * result in unexpected outcomes for others, thus it is recommended to
+ * analyze the impact of forwarding an OAM packet for all OAM functions
+ * prior to enabling this behavior.  The configurable parameter MUST be
+ * disabled by default.
+ *
+ * TTL: Indicates the maximum SFF hops for an SFP.  This field is used
+ * for service plane loop detection.  The initial TTL value SHOULD be
+ * configurable via the control plane; the configured initial value can
+ * be specific to one or more SFPs.  If no initial value is explicitly
+ * provided, the default initial TTL value of 63 MUST be used.  Each SFF
+ * involved in forwarding an NSH packet MUST decrement the TTL value by
+ * 1 prior to NSH forwarding lookup.  Decrementing by 1 from an incoming
+ * value of 0 shall result in a TTL value of 63.  The packet MUST NOT be
+ * forwarded if TTL is, after decrement, 0.
+ *
+ * All other flag fields, marked U, are unassigned and available for
+ * future use, see Section 11.2.1.  Unassigned bits MUST be set to zero
+ * upon origination, and MUST be ignored and preserved unmodified by
+ * other NSH supporting elements.  Elements which do not understand the
+ * meaning of any of these bits MUST NOT modify their actions based on
+ * those unknown bits.
+ *
+ * Length: The total length, in 4-byte words, of NSH including the Base
+ * Header, the Service Path Header, the Fixed Length Context Header or
+ * Variable Length Context Header(s).  The length MUST be 0x6 for MD
+ * Type equal to 0x1, and MUST be 0x2 or greater for MD Type equal to
+ * 0x2.  The length of the NSH header MUST be an integer multiple of 4
+ * bytes, thus variable length metadata is always padded out to a
+ * multiple of 4 bytes.
+ *
+ * MD Type: Indicates the format of NSH beyond the mandatory Base Header
+ * and the Service Path Header.  MD Type defines the format of the
+ * metadata being carried.
+ *
+ * 0x0 - This is a reserved value.  Implementations SHOULD silently
+ * discard packets with MD Type 0x0.
+ *
+ * 0x1 - This indicates that the format of the header includes a fixed
+ * length Context Header (see Figure 4 below).
+ *
+ * 0x2 - This does not mandate any headers beyond the Base Header and
+ * Service Path Header, but may contain optional variable length Context
+ * Header(s).  The semantics of the variable length Context Header(s)
+ * are not defined in this document.  The format of the optional
+ * variable length Context Headers is provided in Section 2.5.1.
+ *
+ * 0xF - This value is reserved for experimentation and testing, as per
+ * [RFC3692].  Implementations not explicitly configured to be part of
+ * an experiment SHOULD silently discard packets with MD Type 0xF.
+ *
+ * Next Protocol: indicates the protocol type of the encapsulated data.
+ * NSH does not alter the inner payload, and the semantics on the inner
+ * protocol remain unchanged due to NSH service function chaining.
+ * Please see the IANA Considerations section below, Section 11.2.5.
+ *
+ * This document defines the following Next Protocol values:
+ *
+ * 0x1: IPv4
+ * 0x2: IPv6
+ * 0x3: Ethernet
+ * 0x4: NSH
+ * 0x5: MPLS
+ * 0xFE: Experiment 1
+ * 0xFF: Experiment 2
+ *
+ * Packets with Next Protocol values not supported SHOULD be silently
+ * dropped by default, although an implementation MAY provide a
+ * configuration parameter to forward them.  Additionally, an
+ * implementation not explicitly configured for a specific experiment
+ * [RFC3692] SHOULD silently drop packets with Next Protocol values 0xFE
+ * and 0xFF.
+ *
+ * Service Path Identifier (SPI): Identifies a service path.
+ * Participating nodes MUST use this identifier for Service Function
+ * Path selection.  The initial classifier MUST set the appropriate SPI
+ * for a given classification result.
+ *
+ * Service Index (SI): Provides location within the SFP.  The initial
+ * classifier for a given SFP SHOULD set the SI to 255, however the
+ * control plane MAY configure the initial value of SI as appropriate
+ * (i.e., taking into account the length of the service function path).
+ * The Service Index MUST be decremented by a value of 1 by Service
+ * Functions or by SFC Proxy nodes after performing required services
+ * and the new decremented SI value MUST be used in the egress packet's
+ * NSH.  The initial Classifier MUST send the packet to the first SFF in
+ * the identified SFP for forwarding along an SFP.  If re-classification
+ * occurs, and that re-classification results in a new SPI, the
+ * (re)classifier is, in effect, the initial classifier for the
+ * resultant SPI.
+ *
+ * The SI is used in conjunction the with Service Path Identifier for
+ * Service Function Path Selection and for determining the next SFF/SF
+ * in the path.  The SI is also valuable when troubleshooting or
+ * reporting service paths.  Additionally, while the TTL field is the
+ * main mechanism for service plane loop detection, the SI can also be
+ * used for detecting service plane loops.
+ *
+ * When the Base Header specifies MD Type = 0x1, a Fixed Length Context
+ * Header (16-bytes) MUST be present immediately following the Service
+ * Path Header. The value of a Fixed Length Context
+ * Header that carries no metadata MUST be set to zero.
+ *
+ * When the base header specifies MD Type = 0x2, zero or more Variable
+ * Length Context Headers MAY be added, immediately following the
+ * Service Path Header (see Figure 5).  Therefore, Length = 0x2,
+ * indicates that only the Base Header followed by the Service Path
+ * Header are present.  The optional Variable Length Context Headers
+ * MUST be of an integer number of 4-bytes.  The base header Length
+ * field MUST be used to determine the offset to locate the original
+ * packet or frame for SFC nodes that require access to that
+ * information.
+ *
+ * The format of the optional variable length Context Headers
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |          Metadata Class       |      Type     |U|    Length   |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                      Variable Metadata                        |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Metadata Class (MD Class): Defines the scope of the 'Type' field to
+ * provide a hierarchical namespace.  The IANA Considerations
+ * Section 11.2.4 defines how the MD Class values can be allocated to
+ * standards bodies, vendors, and others.
+ *
+ * Type: Indicates the explicit type of metadata being carried.  The
+ * definition of the Type is the responsibility of the MD Class owner.
+ *
+ * Unassigned bit: One unassigned bit is available for future use. This
+ * bit MUST NOT be set, and MUST be ignored on receipt.
+ *
+ * Length: Indicates the length of the variable metadata, in bytes.  In
+ * case the metadata length is not an integer number of 4-byte words,
+ * the sender MUST add pad bytes immediately following the last metadata
+ * byte to extend the metadata to an integer number of 4-byte words.
+ * The receiver MUST round up the length field to the nearest 4-byte
+ * word boundary, to locate and process the next field in the packet.
+ * The receiver MUST access only those bytes in the metadata indicated
+ * by the length field (i.e., actual number of bytes) and MUST ignore
+ * the remaining bytes up to the nearest 4-byte word boundary.  The
+ * Length may be 0 or greater.
+ *
+ * A value of 0 denotes a Context Header without a Variable Metadata
+ * field.
+ *
+ * [0] https://datatracker.ietf.org/doc/draft-ietf-sfc-nsh/
+ */
+
+/**
+ * struct nsh_md1_ctx - Keeps track of NSH context data
+ * @nshc<1-4>: NSH Contexts.
+ */
+struct nsh_md1_ctx {
+	__be32 context[4];
+};
+
+struct nsh_md2_tlv {
+	__be16 md_class;
+	u8 type;
+	u8 length;
+	/* Followed by variable-length data. */
+};
+
+struct nsh_hdr {
+	__be16 ver_flags_ttl_len;
+	u8 md_type;
+	u8 next_proto;
+	__be32 path_hdr;
+	union {
+	    struct nsh_md1_ctx md1;
+	    struct nsh_md2_tlv md2;
+	};
+};
+
+#define NSH_M_TYPE2_MAX_LEN 256
+
+/* Masking NSH header fields. */
+#define NSH_VER_MASK       0xc000
+#define NSH_VER_SHIFT      14
+#define NSH_FLAGS_MASK     0x3000
+#define NSH_FLAGS_SHIFT    12
+#define NSH_TTL_MASK       0x0fc0
+#define NSH_TTL_SHIFT      6
+#define NSH_LEN_MASK       0x003f
+#define NSH_LEN_SHIFT      0
+
+#define NSH_MDTYPE_MASK    0x0f
+#define NSH_MDTYPE_SHIFT   0
+
+#define NSH_SPI_MASK       0xffffff00
+#define NSH_SPI_SHIFT      8
+#define NSH_SI_MASK        0x000000ff
+#define NSH_SI_SHIFT       0
+
+#define NSH_DST_PORT    4790     /* UDP Port for NSH on VXLAN. */
+
+/* NSH Base Header Next Protocol. */
+#define NSH_P_IPV4        0x01
+#define NSH_P_IPV6        0x02
+#define NSH_P_ETHERNET    0x03
+#define NSH_P_NSH         0x04
+#define NSH_P_MPLS        0x05
+
+/* MD Type Registry. */
+#define NSH_M_TYPE1     0x01
+#define NSH_M_TYPE2     0x02
+#define NSH_M_EXP1      0xFE
+#define NSH_M_EXP2      0xFF
+
+/* NSH Metadata Length. */
+#define NSH_M_TYPE1_MDLEN 16
+
+/* NSH Base Header Length */
+#define NSH_BASE_HDR_LEN  8
+
+/* NSH MD Type 1 header Length. */
+#define NSH_M_TYPE1_LEN   24
+
+/* NSH MD Type 2 header maximum Length. */
+#define NSH_M_TYPE2_MAX_LEN 256
+
+/* NSH MD Type 2 Metadata maximum Length. */
+#define NSH_M_TYPE2_MD_MAX_LEN (NSH_M_TYPE2_MAX_LEN - NSH_BASE_HDR_LEN)
+
+#define NSH_MD1_CTX(nsh_hdr_ptr) (&(nsh_hdr_ptr)->md1)
+
+#define NSH_MD2_CTX(nsh_hdr_ptr) (&(nsh_hdr_ptr)->md2)
+
+static inline u16 nsh_hdr_len(const struct nsh_hdr *nsh)
+{
+	return ((ntohs(nsh->ver_flags_ttl_len) & NSH_LEN_MASK)
+		>> NSH_LEN_SHIFT) << 2;
+}
+
+static inline struct nsh_md1_ctx *nsh_md1_ctx(struct nsh_hdr *nsh)
+{
+	return &nsh->md1;
+}
+
+static inline struct nsh_md2_tlv *nsh_md2_ctx(struct nsh_hdr *nsh)
+{
+	return &nsh->md2;
+}
+
+static inline u8 nsh_get_ver(const struct nsh_hdr *nsh)
+{
+	return (ntohs(nsh->ver_flags_ttl_len) & NSH_VER_MASK)
+		>> NSH_VER_SHIFT;
+}
+
+static inline u8 nsh_get_flags(const struct nsh_hdr *nsh)
+{
+	return (ntohs(nsh->ver_flags_ttl_len) & NSH_FLAGS_MASK)
+		>> NSH_FLAGS_SHIFT;
+}
+
+static inline u8 nsh_get_ttl(const struct nsh_hdr *nsh)
+{
+	return (ntohs(nsh->ver_flags_ttl_len) & NSH_TTL_MASK)
+		>> NSH_TTL_SHIFT;
+}
+
+static inline void nsh_set_flags_and_ttl(struct nsh_hdr *nsh, u8 flags, u8 ttl)
+{
+	nsh->ver_flags_ttl_len
+		= htons((ntohs(nsh->ver_flags_ttl_len)
+			& ~(NSH_FLAGS_MASK | NSH_TTL_MASK))
+			| ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK)
+			| ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK));
+}
+
+static inline void nsh_set_flags_ttl_len(struct nsh_hdr *nsh, u8 flags,
+					 u8 ttl, u8 len)
+{
+	nsh->ver_flags_ttl_len
+		= htons((ntohs(nsh->ver_flags_ttl_len)
+			& ~(NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK))
+			| ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK)
+			| ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK)
+			| ((len << NSH_LEN_SHIFT) & NSH_LEN_MASK));
+}
+
+#endif /* __NET_NSH_H */
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5bc9bfd..e7f1a61 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -137,6 +137,7 @@
 #define ETH_P_IEEE802154 0x00F6		/* IEEE802.15.4 frame		*/
 #define ETH_P_CAIF	0x00F7		/* ST-Ericsson CAIF protocol	*/
 #define ETH_P_XDSA	0x00F8		/* Multiplexed DSA protocol	*/
+#define ETH_P_NSH	0x894F		/* Ethertype for NSH. */
 
 /*
  *	This is an Ethernet frame header.
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 156ee4c..1a1f1d0 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -333,6 +333,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_CT_LABELS,	/* 16-octet connection tracking label */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,   /* struct ovs_key_ct_tuple_ipv4 */
 	OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,   /* struct ovs_key_ct_tuple_ipv6 */
+	OVS_KEY_ATTR_NSH,       /* Nested set of ovs_nsh_key_* */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -491,6 +492,29 @@ struct ovs_key_ct_tuple_ipv6 {
 	__u8   ipv6_proto;
 };
 
+enum ovs_nsh_key_attr {
+	OVS_NSH_KEY_ATTR_BASE,  /* struct ovs_nsh_key_base. */
+	OVS_NSH_KEY_ATTR_MD1,   /* struct ovs_nsh_key_md1. */
+	OVS_NSH_KEY_ATTR_MD2,   /* variable-length octets for MD type 2. */
+	__OVS_NSH_KEY_ATTR_MAX
+};
+
+#define OVS_NSH_KEY_ATTR_MAX (__OVS_NSH_KEY_ATTR_MAX - 1)
+
+struct ovs_nsh_key_base {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+};
+
+#define NSH_MD1_CONTEXT_SIZE 4
+
+struct ovs_nsh_key_md1 {
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -769,6 +793,8 @@ struct ovs_action_push_eth {
 	struct ovs_key_ethernet addresses;
 };
 
+#define OVS_PUSH_NSH_MAX_MD_LEN 248
+
 /**
  * enum ovs_action_attr - Action types.
  *
@@ -806,6 +832,8 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_PUSH_NSH: push NSH header to the packet.
+ * @OVS_ACTION_ATTR_POP_NSH: pop the outermost NSH header off the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +863,8 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
 	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
 	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+	OVS_ACTION_ATTR_PUSH_NSH,     /* Nested OVS_NSH_KEY_ATTR_*. */
+	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e461067..f6de49b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -43,6 +43,7 @@
 #include "flow.h"
 #include "conntrack.h"
 #include "vport.h"
+#include "flow_netlink.h"
 
 struct deferred_action {
 	struct sk_buff *skb;
@@ -380,6 +381,98 @@ static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
+		    const struct nsh_hdr *nsh_src)
+{
+	struct nsh_hdr *nsh;
+	size_t length = nsh_hdr_len(nsh_src);
+	u8 next_proto;
+
+	if (key->mac_proto == MAC_PROTO_ETHERNET) {
+		next_proto = NSH_P_ETHERNET;
+	} else {
+		switch (ntohs(skb->protocol)) {
+		case ETH_P_IP:
+			next_proto = NSH_P_IPV4;
+			break;
+		case ETH_P_IPV6:
+			next_proto = NSH_P_IPV6;
+			break;
+		case ETH_P_NSH:
+			next_proto = NSH_P_NSH;
+			break;
+		default:
+			return -ENOTSUPP;
+		}
+	}
+
+	/* Add the NSH header */
+	if (skb_cow_head(skb, length) < 0)
+		return -ENOMEM;
+
+	skb_push(skb, length);
+	nsh = (struct nsh_hdr *)(skb->data);
+	memcpy(nsh, nsh_src, length);
+	nsh->next_proto = next_proto;
+	nsh->md_type &= NSH_MDTYPE_MASK;
+
+	if (!skb->inner_protocol)
+		skb_set_inner_protocol(skb, skb->protocol);
+
+	skb->protocol = htons(ETH_P_NSH);
+	key->eth.type = htons(ETH_P_NSH);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+
+	/* safe right before invalidate_flow_key */
+	key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
+static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nsh_hdr *nsh = (struct nsh_hdr *)(skb->data);
+	size_t length;
+	u16 inner_proto;
+
+	if (ovs_key_mac_proto(key) != MAC_PROTO_NONE ||
+	    skb->protocol != htons(ETH_P_NSH)) {
+		return -EINVAL;
+	}
+
+	switch (nsh->next_proto) {
+	case NSH_P_ETHERNET:
+		inner_proto = htons(ETH_P_TEB);
+		break;
+	case NSH_P_IPV4:
+		inner_proto = htons(ETH_P_IP);
+		break;
+	case NSH_P_IPV6:
+		inner_proto = htons(ETH_P_IPV6);
+		break;
+	case NSH_P_NSH:
+		inner_proto = htons(ETH_P_NSH);
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	length = nsh_hdr_len(nsh);
+	skb_pull(skb, length);
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	skb->protocol = inner_proto;
+
+	/* safe right before invalidate_flow_key */
+	if (inner_proto == htons(ETH_P_TEB))
+		key->mac_proto = MAC_PROTO_ETHERNET;
+	else
+		key->mac_proto = MAC_PROTO_NONE;
+	invalidate_flow_key(key);
+	return 0;
+}
+
 static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
 				  __be32 addr, __be32 new_addr)
 {
@@ -602,6 +695,53 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
 	return 0;
 }
 
+static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
+		   const struct ovs_key_nsh *key,
+		   const struct ovs_key_nsh *mask)
+{
+	struct nsh_hdr *nsh;
+	int err;
+	u8 flags;
+	u8 ttl;
+	int i;
+
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(struct nsh_hdr));
+	if (unlikely(err))
+		return err;
+
+	nsh = (struct nsh_hdr *)skb_network_header(skb);
+
+	flags = nsh_get_flags(nsh);
+	flags = OVS_MASKED(flags, key->flags, mask->flags);
+	flow_key->nsh.flags = flags;
+	ttl = nsh_get_ttl(nsh);
+	ttl = OVS_MASKED(ttl, key->ttl, mask->ttl);
+	flow_key->nsh.ttl = ttl;
+	nsh_set_flags_and_ttl(nsh, flags, ttl);
+	nsh->path_hdr = OVS_MASKED(nsh->path_hdr, key->path_hdr,
+				   mask->path_hdr);
+	flow_key->nsh.path_hdr = nsh->path_hdr;
+	switch (nsh->md_type) {
+	case NSH_M_TYPE1:
+		for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
+			nsh->md1.context[i] =
+			    OVS_MASKED(nsh->md1.context[i], key->context[i],
+				       mask->context[i]);
+		}
+		memcpy(flow_key->nsh.context, nsh->md1.context,
+		       sizeof(nsh->md1.context));
+		break;
+	case NSH_M_TYPE2:
+		memset(flow_key->nsh.context, 0,
+		       sizeof(flow_key->nsh.context));
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* Must follow skb_ensure_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			__be16 new_port, __sum16 *check)
@@ -1024,6 +1164,29 @@ static int execute_masked_set_action(struct sk_buff *skb,
 				   get_mask(a, struct ovs_key_ethernet *));
 		break;
 
+	case OVS_KEY_ATTR_NSH: {
+		struct ovs_key_nsh nsh;
+		struct ovs_key_nsh nsh_mask;
+		size_t size = nla_len(a) / 2;
+		struct nlattr attr[1 + size / sizeof(struct nlattr) + 1];
+		struct nlattr mask[1 + size / sizeof(struct nlattr) + 1];
+
+		attr->nla_type = nla_type(a);
+		mask->nla_type = attr->nla_type;
+		attr->nla_len = NLA_HDRLEN + size;
+		mask->nla_len = attr->nla_len;
+		memcpy(attr + 1, (char *)(a + 1), size);
+		memcpy(mask + 1, (char *)(a + 1) + size, size);
+		err = nsh_key_from_nlattr(attr, &nsh);
+		if (err)
+			break;
+		err = nsh_key_from_nlattr(mask, &nsh_mask);
+		if (err)
+			break;
+		err = set_nsh(skb, flow_key, &nsh, &nsh_mask);
+		break;
+	}
+
 	case OVS_KEY_ATTR_IPV4:
 		err = set_ipv4(skb, flow_key, nla_data(a),
 			       get_mask(a, struct ovs_key_ipv4 *));
@@ -1210,6 +1373,20 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		case OVS_ACTION_ATTR_POP_ETH:
 			err = pop_eth(skb, key);
 			break;
+
+		case OVS_ACTION_ATTR_PUSH_NSH: {
+			u8 buffer[256];
+			struct nsh_hdr *nsh_hdr = (struct nsh_hdr *)buffer;
+			const struct nsh_hdr *nsh_src = nsh_hdr;
+
+			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr);
+			err = push_nsh(skb, key, nsh_src);
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			err = pop_nsh(skb, key);
+			break;
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 8c94cef..1e8d860 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -46,6 +46,7 @@
 #include <net/ipv6.h>
 #include <net/mpls.h>
 #include <net/ndisc.h>
+#include <net/nsh.h>
 
 #include "conntrack.h"
 #include "datapath.h"
@@ -490,6 +491,53 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 	return 0;
 }
 
+static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct nsh_hdr *nsh = (struct nsh_hdr *)skb_network_header(skb);
+	u8 version, length;
+	int err;
+
+	err = check_header(skb, NSH_BASE_HDR_LEN);
+	if (unlikely(err))
+		return err;
+
+	memset(&key->nsh, 0, sizeof(struct ovs_key_nsh));
+	version = nsh_get_ver(nsh);
+	length = nsh_hdr_len(nsh);
+
+	if (version != 0)
+		return -EINVAL;
+
+	if (nsh->md_type == NSH_M_TYPE1 && length != NSH_M_TYPE1_LEN)
+		return -EINVAL;
+
+	if (nsh->md_type == NSH_M_TYPE2 && length < NSH_BASE_HDR_LEN)
+		return -EINVAL;
+
+	err = check_header(skb, length);
+	if (unlikely(err))
+		return err;
+
+	key->nsh.flags = nsh_get_flags(nsh);
+	key->nsh.ttl = nsh_get_ttl(nsh);
+	key->nsh.mdtype = nsh->md_type;
+	key->nsh.np = nsh->next_proto;
+	key->nsh.path_hdr = nsh->path_hdr;
+	switch (key->nsh.mdtype) {
+	case NSH_M_TYPE1:
+		memcpy(key->nsh.context, nsh->md1.context,
+		       sizeof(nsh->md1));
+		break;
+	case NSH_M_TYPE2:
+		/* Don't support MD type 2 metedata parsing yet */
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * key_extract - extracts a flow key from an Ethernet frame.
  * @skb: sk_buff that contains the frame, with skb->data pointing to the
@@ -735,6 +783,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 				memset(&key->tp, 0, sizeof(key->tp));
 			}
 		}
+	} else if (key->eth.type == htons(ETH_P_NSH)) {
+		error = parse_nsh(skb, key);
+		if (error)
+			return error;
 	}
 	return 0;
 }
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1875bba..d613fdc 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -35,6 +35,7 @@
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
+#include <net/nsh.h>
 
 struct sk_buff;
 
@@ -66,6 +67,15 @@ struct vlan_head {
 	(offsetof(struct sw_flow_key, recirc_id) +	\
 	FIELD_SIZEOF(struct sw_flow_key, recirc_id))
 
+struct ovs_key_nsh {
+	__u8 flags;
+	__u8 ttl;
+	__u8 mdtype;
+	__u8 np;
+	__be32 path_hdr;
+	__be32 context[NSH_MD1_CONTEXT_SIZE];
+};
+
 struct sw_flow_key {
 	u8 tun_opts[IP_TUNNEL_OPTS_MAX];
 	u8 tun_opts_len;
@@ -144,6 +154,7 @@ struct sw_flow_key {
 			};
 		} ipv6;
 	};
+	struct ovs_key_nsh nsh;         /* network service header */
 	struct {
 		/* Connection tracking fields not packed above. */
 		struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index e8eb427..c62bdac 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -78,9 +78,11 @@ static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_HASH:
 		case OVS_ACTION_ATTR_POP_ETH:
 		case OVS_ACTION_ATTR_POP_MPLS:
+		case OVS_ACTION_ATTR_POP_NSH:
 		case OVS_ACTION_ATTR_POP_VLAN:
 		case OVS_ACTION_ATTR_PUSH_ETH:
 		case OVS_ACTION_ATTR_PUSH_MPLS:
+		case OVS_ACTION_ATTR_PUSH_NSH:
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 		case OVS_ACTION_ATTR_SAMPLE:
 		case OVS_ACTION_ATTR_SET:
@@ -322,12 +324,27 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
 }
 
+size_t ovs_nsh_key_attr_size(void)
+{
+	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
+	 * updating this function.
+	 */
+	return  nla_total_size(8)      /* OVS_NSH_KEY_ATTR_BASE */
+		/* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are
+		 * mutually exclusive, so the bigger one can cover
+		 * the small one.
+		 *
+		 * OVS_NSH_KEY_ATTR_MD2
+		 */
+		+ nla_total_size(NSH_M_TYPE2_MD_MAX_LEN);
+}
+
 size_t ovs_key_attr_size(void)
 {
 	/* Whenever adding new OVS_KEY_ FIELDS, we should consider
 	 * updating this function.
 	 */
-	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
+	BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 29);
 
 	return    nla_total_size(4)   /* OVS_KEY_ATTR_PRIORITY */
 		+ nla_total_size(0)   /* OVS_KEY_ATTR_TUNNEL */
@@ -341,6 +358,8 @@ size_t ovs_key_attr_size(void)
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */
 		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */
 		+ nla_total_size(40)  /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+		+ nla_total_size(0)   /* OVS_KEY_ATTR_NSH */
+		  + ovs_nsh_key_attr_size()
 		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */
 		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */
 		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */
@@ -373,6 +392,13 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 	[OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
 };
 
+static const struct ovs_len_tbl
+ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = {
+	[OVS_NSH_KEY_ATTR_BASE]     = { .len = 8 },
+	[OVS_NSH_KEY_ATTR_MD1]      = { .len = 16 },
+	[OVS_NSH_KEY_ATTR_MD2]      = { .len = OVS_ATTR_VARIABLE },
+};
+
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
 static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ENCAP]	 = { .len = OVS_ATTR_NESTED },
@@ -405,6 +431,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv4) },
 	[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
 		.len = sizeof(struct ovs_key_ct_tuple_ipv6) },
+	[OVS_KEY_ATTR_NSH]       = { .len = OVS_ATTR_NESTED,
+				     .next = ovs_nsh_key_attr_lens, },
 };
 
 static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -1179,6 +1207,264 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
 	return 0;
 }
 
+int nsh_hdr_from_nlattr(const struct nlattr *attr,
+			struct nsh_hdr *nsh)
+{
+	struct nlattr *a;
+	int rem;
+	u8 flags = 0;
+	u8 ttl = 0;
+	int mdlen = 0;
+	bool has_md1 = false;
+	bool has_md2 = false;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(1, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    1,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+			flags = base->flags;
+			ttl = base->ttl;
+			nsh->next_proto = base->np;
+			nsh->md_type = base->mdtype;
+			nsh->path_hdr = base->path_hdr;
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+			struct nsh_md1_ctx *md1_dst = nsh_md1_ctx(nsh);
+
+			has_md1 = true;
+			mdlen = nla_len(a);
+			memcpy(md1_dst, md1, mdlen);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2: {
+			const struct u8 *md2 = nla_data(a);
+			struct nsh_md2_tlv *md2_dst = nsh_md2_ctx(nsh);
+
+			has_md2 = true;
+			mdlen = nla_len(a);
+			if ((mdlen > NSH_M_TYPE2_MD_MAX_LEN) ||
+			    (mdlen == 0)) {
+				OVS_NLERR(
+				    1,
+				    "length %d of nsh attr %d is invalid",
+				    mdlen,
+				    type
+				);
+				return -EINVAL;
+			}
+			memcpy(md2_dst, md2, mdlen);
+			break;
+		}
+		default:
+			OVS_NLERR(1, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if ((has_md1 && nsh->md_type != NSH_M_TYPE1) ||
+	    (has_md2 && nsh->md_type != NSH_M_TYPE2)) {
+		OVS_NLERR(1,
+			  "nsh attribute has unmatched MD type %d.",
+			  nsh->md_type);
+		return -EINVAL;
+	}
+
+	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
+	nsh_set_flags_ttl_len(nsh, flags, ttl,
+			      (NSH_BASE_HDR_LEN + mdlen) >> 2);
+
+	return 0;
+}
+
+int nsh_key_from_nlattr(const struct nlattr *attr,
+			struct ovs_key_nsh *nsh)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_md1 = false;
+	bool has_md2 = false;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(1, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    1,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+
+			memcpy(nsh, base, sizeof(*base));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+
+			has_md1 = true;
+			memcpy(nsh->context, md1->context, sizeof(*md1));
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			/* Not supported yet */
+			has_md2 = true;
+			break;
+		default:
+			OVS_NLERR(1, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if ((has_md1 && nsh->mdtype != NSH_M_TYPE1) ||
+	    (has_md2 && nsh->mdtype != NSH_M_TYPE2)) {
+		OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+			  nsh->mdtype);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nsh_key_put_from_nlattr(const struct nlattr *attr,
+				   struct sw_flow_match *match, bool is_mask,
+				   bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool has_md1 = false;
+	bool has_md2 = false;
+	u8 mdtype = 0;
+
+	nla_for_each_nested(a, attr, rem) {
+		int type = nla_type(a);
+		int i;
+
+		if (type > OVS_NSH_KEY_ATTR_MAX) {
+			OVS_NLERR(log, "nsh attr %d is out of range max %d",
+				  type, OVS_NSH_KEY_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (!check_attr_len(nla_len(a),
+				    ovs_nsh_key_attr_lens[type].len)) {
+			OVS_NLERR(
+			    log,
+			    "nsh attr %d has unexpected len %d expected %d",
+			    type,
+			    nla_len(a),
+			    ovs_nsh_key_attr_lens[type].len
+			);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NSH_KEY_ATTR_BASE: {
+			const struct ovs_nsh_key_base *base =
+				(struct ovs_nsh_key_base *)nla_data(a);
+
+			mdtype = base->mdtype;
+			SW_FLOW_KEY_PUT(match, nsh.flags,
+					base->flags, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.ttl,
+					base->ttl, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.mdtype,
+					base->mdtype, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.np,
+					base->np, is_mask);
+			SW_FLOW_KEY_PUT(match, nsh.path_hdr,
+					base->path_hdr, is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD1: {
+			const struct ovs_nsh_key_md1 *md1 =
+				(struct ovs_nsh_key_md1 *)nla_data(a);
+
+			has_md1 = true;
+			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
+				SW_FLOW_KEY_PUT(match, nsh.context[i],
+						md1->context[i], is_mask);
+			break;
+		}
+		case OVS_NSH_KEY_ATTR_MD2:
+			/* Not supported yet */
+			has_md2 = true;
+			break;
+		default:
+			OVS_NLERR(log, "Unknown nsh attribute %d",
+				  type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
+		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
+		    (has_md2 && mdtype != NSH_M_TYPE2)) {
+			OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
+				  mdtype);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 				u64 attrs, const struct nlattr **a,
 				bool is_mask, bool log)
@@ -1306,6 +1592,13 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_NSH)) {
+		if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match,
+					    is_mask, log) < 0)
+			return -EINVAL;
+		attrs &= ~(1 << OVS_KEY_ATTR_NSH);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
 		const struct ovs_key_mpls *mpls_key;
 
@@ -1622,6 +1915,40 @@ static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh,
 	return 0;
 }
 
+static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask,
+			     struct sk_buff *skb)
+{
+	struct nlattr *start;
+	struct ovs_nsh_key_base base;
+	struct ovs_nsh_key_md1 md1;
+
+	memcpy(&base, nsh, sizeof(base));
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1)
+		memcpy(md1.context, nsh->context, sizeof(md1));
+
+	start = nla_nest_start(skb, OVS_KEY_ATTR_NSH);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(base), &base))
+		goto nla_put_failure;
+
+	if (is_mask || nsh->mdtype == NSH_M_TYPE1) {
+		if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, sizeof(md1), &md1))
+			goto nla_put_failure;
+	}
+
+	/* Don't support MD type 2 yet */
+
+	nla_nest_end(skb, start);
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 			     const struct sw_flow_key *output, bool is_mask,
 			     struct sk_buff *skb)
@@ -1750,6 +2077,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 		ipv6_key->ipv6_tclass = output->ip.tos;
 		ipv6_key->ipv6_hlimit = output->ip.ttl;
 		ipv6_key->ipv6_frag = output->ip.frag;
+	} else if (swkey->eth.type == htons(ETH_P_NSH)) {
+		if (nsh_key_to_nlattr(&output->nsh, is_mask, skb))
+			goto nla_put_failure;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -2242,6 +2572,17 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	return err;
 }
 
+static bool validate_nsh(const struct nlattr *attr, bool is_mask, bool log)
+{
+	struct sw_flow_match match;
+	struct sw_flow_key key;
+	int ret = 0;
+
+	ovs_match_init(&match, &key, true, NULL);
+	ret = nsh_key_put_from_nlattr(nla_data(attr), &match, is_mask, log);
+	return ((ret != 0) ? false : true);
+}
+
 /* Return false if there are any non-masked bits set.
  * Mask follows data immediately, before any netlink padding.
  */
@@ -2384,6 +2725,11 @@ static int validate_set(const struct nlattr *a,
 
 		break;
 
+	case OVS_KEY_ATTR_NSH:
+		if (!validate_nsh(a, masked, log))
+			return -EINVAL;
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -2482,6 +2828,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
 			[OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
 			[OVS_ACTION_ATTR_POP_ETH] = 0,
+			[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
+			[OVS_ACTION_ATTR_POP_NSH] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -2636,6 +2984,17 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			mac_proto = MAC_PROTO_ETHERNET;
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_NSH:
+			mac_proto = MAC_PROTO_NONE;
+			break;
+
+		case OVS_ACTION_ATTR_POP_NSH:
+			if (key->nsh.np == NSH_P_ETHERNET)
+				mac_proto = MAC_PROTO_ETHERNET;
+			else
+				mac_proto = MAC_PROTO_NONE;
+			break;
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 929c665..68e1e66 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -79,4 +79,7 @@ int ovs_nla_put_actions(const struct nlattr *attr,
 void ovs_nla_free_flow_actions(struct sw_flow_actions *);
 void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
 
+int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh);
+int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nsh_hdr *nsh_src);
+
 #endif /* flow_netlink.h */
-- 
2.5.5

^ permalink raw reply related

* [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find
From: gfree.wind @ 2017-08-18  7:23 UTC (permalink / raw)
  To: davem, netdev; +Cc: Gao Feng

From: Gao Feng <gfree.wind@vip.163.com>

Add the invalid handle "0" check to avoid unnecessary search, because
the qdisc uses the skb->priority as the handle value to look up, and
it is "0" usually.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
---
 include/net/sch_generic.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 5865db9..107c524 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -393,6 +393,9 @@ static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
 	struct Qdisc_class_common *cl;
 	unsigned int h;
 
+	if (!id)
+		return NULL;
+
 	h = qdisc_class_hash(id, hash->hashmask);
 	hlist_for_each_entry(cl, &hash->hash[h], hnode) {
 		if (cl->classid == id)
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH RFC v2 3/5] samples/bpf: Fix inline asm issues building samples on arm64
From: Joel Fernandes @ 2017-08-18  6:55 UTC (permalink / raw)
  To: David Miller
  Cc: LKML, Chenbo Feng, Alison Chaiken, Juri Lelli, Alexei Starovoitov,
	Daniel Borkmann, open list:BPF (Safe dynamic programs and tools)
In-Reply-To: <20170808.203540.2238751604302920304.davem@davemloft.net>

On Tue, Aug 8, 2017 at 8:35 PM, David Miller <davem@davemloft.net> wrote:
> From: Joel Fernandes <joelaf@google.com>
> Date: Mon, 7 Aug 2017 18:20:49 -0700
>
>> On Mon, Aug 7, 2017 at 11:28 AM, David Miller <davem@davemloft.net> wrote:
>>> The amount of hellish hacks we are adding to deal with this is getting
>>> way out of control.
>>
>> I agree with you that hellish hacks are being added which is why it
>> keeps breaking. I think one of the things my series does is to add
>> back inclusion of asm headers that were previously removed (that is
>> the worst hellish hack in my opinion that existing in mainline). So in
>> that respect my patch is an improvement and makes it possible to build
>> for arm64 platforms (which is currently broken in mainline).
>
> Yeah that is a problem.
>
> Perhaps another avenue of attack is to separate "type" header files from
> stuff that has functiond declarations and inline assembler code.

I was thinking that's probably a huge undertaking if you meant doing
the above for every arch?

Also another way could be to modify clang to ignore inline asm
directives during compilation? Do you have any comments about such
approach?

thanks,

-Joel

^ permalink raw reply

* [PATCH] net: stmmac: socfgpa: Ensure emac bit set in sys manager for MII/GMII/SGMII.
From: Stephan Gatzka @ 2017-08-18  6:55 UTC (permalink / raw)
  To: peppe.cavallaro, alexandre.torgue, netdev, linux-kernel; +Cc: Stephan Gatzka

When using MII/GMII/SGMII in the Altera SoC, the phy needs to be
wired through the FPGA. To ensure correct behavior, the appropriate
bit in the System Manager FPGA Interface Group register needs to be
set.

Signed-off-by: Stephan Gatzka <stephan.gatzka@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 17d4bba..d7c231b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -269,7 +269,10 @@ static int socfpga_dwmac_set_phy_mode(struct socfpga_dwmac *dwmac)
 	ctrl &= ~(SYSMGR_EMACGRP_CTRL_PHYSEL_MASK << reg_shift);
 	ctrl |= val << reg_shift;

-	if (dwmac->f2h_ptp_ref_clk) {
+	if ((dwmac->f2h_ptp_ref_clk) ||
+	    (phymode == PHY_INTERFACE_MODE_MII) ||
+	    (phymode == PHY_INTERFACE_MODE_GMII) ||
+	    (phymode == PHY_INTERFACE_MODE_SGMII)) {
 		ctrl |= SYSMGR_EMACGRP_CTRL_PTP_REF_CLK_MASK << (reg_shift / 2);
 		regmap_read(sys_mgr_base_addr, SYSMGR_FPGAGRP_MODULE_REG,
 			    &module);
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 net 2/2] net: ixgbe: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag
From: Ding Tianhong @ 2017-08-18  6:21 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, keescook, linux-kernel, sparclinux,
	intel-wired-lan, alexander.duyck, netdev, linuxarm
  Cc: Ding Tianhong
In-Reply-To: <1503037265-11144-1-git-send-email-dingtianhong@huawei.com>

The ixgbe driver use the compile check to determine if it can
send TLPs to Root Port with the Relaxed Ordering Attribute set,
this is too inconvenient, now the new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING
has been added to the kernel and we could check the bit4 in the PCIe
Device Control register to determine whether we should use the Relaxed
Ordering Attributes or not, so use this new way in the ixgbe driver.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 22 ----------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 19 -------------------
 2 files changed, 41 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
index 523f9d0..8a32eb7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
@@ -175,31 +175,9 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw)
  **/
 static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
 {
-#ifndef CONFIG_SPARC
-	u32 regval;
-	u32 i;
-#endif
 	s32 ret_val;
 
 	ret_val = ixgbe_start_hw_generic(hw);
-
-#ifndef CONFIG_SPARC
-	/* Disable relaxed ordering */
-	for (i = 0; ((i < hw->mac.max_tx_queues) &&
-	     (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
-		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
-	}
-
-	for (i = 0; ((i < hw->mac.max_rx_queues) &&
-	     (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-	}
-#endif
 	if (ret_val)
 		return ret_val;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index d4933d2..96c324f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -350,25 +350,6 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
 	}
 	IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_SPARC
-	/* Disable relaxed ordering */
-	for (i = 0; i < hw->mac.max_tx_queues; i++) {
-		u32 regval;
-
-		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
-		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
-	}
-
-	for (i = 0; i < hw->mac.max_rx_queues; i++) {
-		u32 regval;
-
-		regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-		regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-			    IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-		IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-	}
-#endif
 	return 0;
 }
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net 1/2] Revert commit 1a8b6d76dc5b ("net:add one common config...")
From: Ding Tianhong @ 2017-08-18  6:21 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, keescook, linux-kernel, sparclinux,
	intel-wired-lan, alexander.duyck, netdev, linuxarm
  Cc: Ding Tianhong
In-Reply-To: <1503037265-11144-1-git-send-email-dingtianhong@huawei.com>

The new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added
to indicate that Relaxed Ordering Attributes (RO) should not
be used for Transaction Layer Packets (TLP) targeted toward
these affected Root Port, it will clear the bit4 in the PCIe
Device Control register, so the PCIe device drivers could
query PCIe configuration space to determine if it can send
TLPs to Root Port with the Relaxed Ordering Attributes set.

With this new flag  we don't need the config ARCH_WANT_RELAX_ORDER
to control the Relaxed Ordering Attributes for the ixgbe drivers
just like the commit 1a8b6d76dc5b ("net:add one common config...") did,
so revert this commit.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 arch/Kconfig                                    | 3 ---
 arch/sparc/Kconfig                              | 1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089..00cfc63 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -928,9 +928,6 @@ config STRICT_MODULE_RWX
 	  and non-text memory will be made non-executable. This provides
 	  protection against certain security exploits (e.g. writing to text)
 
-config ARCH_WANT_RELAX_ORDER
-	bool
-
 config REFCOUNT_FULL
 	bool "Perform full reference count validation at the expense of speed"
 	help
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a4a6261..987a575 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -44,7 +44,6 @@ config SPARC
 	select ARCH_HAS_SG_CHAIN
 	select CPU_NO_EFFICIENT_FFS
 	select LOCKDEP_SMALL if LOCKDEP
-	select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
 	def_bool !64BIT
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 4e35e70..d4933d2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -350,7 +350,7 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
 	}
 	IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_ARCH_WANT_RELAX_ORDER
+#ifndef CONFIG_SPARC
 	/* Disable relaxed ordering */
 	for (i = 0; i < hw->mac.max_tx_queues; i++) {
 		u32 regval;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v3 net 0/2] net: ixgbe: Use new flag to disable Relaxed Ordering
From: Ding Tianhong @ 2017-08-18  6:21 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, keescook, linux-kernel, sparclinux,
	intel-wired-lan, alexander.duyck, netdev, linuxarm
  Cc: Mao Wenan

From: Mao Wenan <maowenan@huawei.com>

The new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added
to indicate that Relaxed Ordering Attributes (RO) should not
be used for Transaction Layer Packets (TLP) targeted toward
these affected Root Port, it will clear the bit4 in the PCIe
Device Control register, so the PCIe device drivers could
query PCIe configuration space to determine if it can send
TLPs to Root Port with the Relaxed Ordering Attributes set.

The ixgbe driver could use this flag to determine if it can
send TLPs to Root Port with the Relaxed Ordering Attributes set.

v2: Simplify the original program according Alex's suggestion,
    remove the new ixgbe flag2 and only check the bit4 in the
    PCIe Device Control register. 

v3: Remove the code that clears the bits in DCA_T/RXCTRL, relaxed
    ordering should be enabled by the HW when the bus allow it.

Ding Tianhong (2):
  Revert commit 1a8b6d76dc5b ("net:add one common config...")
  net: ixgbe: Use new IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING flag

 arch/Kconfig                                    |  3 --
 arch/sparc/Kconfig                              |  1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 37 ++++++++++++-------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 32 +++++++++++----------
 4 files changed, 35 insertions(+), 38 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH net-next 2/2] liquidio: with embedded f/w, issue droq credits before enablement
From: Felix Manlunas @ 2017-08-18  6:11 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	ricardo.farrington
In-Reply-To: <20170818061037.GA4024@felix-thinkpad.cavium.com>

From: Rick Farrington <ricardo.farrington@cavium.com>

1. Issue credits BEFORE enabling DROQ's; this prevents PKTPF_ERR interrupt.

Signed-off-by: Rick Farrington <ricardo.farrington@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index dccd447..89d4bbc 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -4040,6 +4040,18 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 
 	atomic_set(&octeon_dev->status, OCT_DEV_INTR_SET_DONE);
 
+	/* Send Credit for Octeon Output queues. Credits are always sent BEFORE
+	 * the output queue is enabled.
+	 * This ensures that we'll receive the f/w CORE DRV_ACTIVE message in
+	 * case we've configured CN23XX_SLI_GBL_CONTROL[NOPTR_D] = 0.
+	 * Otherwise, it is possible that the DRV_ACTIVE message will be sent
+	 * before any credits have been issued, causing the ring to be reset
+	 * (and the f/w appear to never have started).
+	 */
+	for (j = 0; j < octeon_dev->num_oqs; j++)
+		writel(octeon_dev->droq[j]->max_count,
+		       octeon_dev->droq[j]->pkts_credit_reg);
+
 	/* Enable the input and output queues for this Octeon device */
 	ret = octeon_dev->fn_list.enable_io_queues(octeon_dev);
 	if (ret) {
@@ -4124,14 +4136,6 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 
 	atomic_set(&octeon_dev->status, OCT_DEV_HOST_OK);
 
-	/* Send Credit for Octeon Output queues. Credits are always sent after
-	 * the output queue is enabled.
-	 */
-	for (j = 0; j < octeon_dev->num_oqs; j++)
-		writel(octeon_dev->droq[j]->max_count,
-		       octeon_dev->droq[j]->pkts_credit_reg);
-
-	/* Packets can start arriving on the output queues from this point. */
 	return 0;
 }
 
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 1/2] liquidio: with embedded f/w, don't reload f/w, issue pf flr at exit
From: Felix Manlunas @ 2017-08-18  6:11 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	ricardo.farrington
In-Reply-To: <20170818061037.GA4024@felix-thinkpad.cavium.com>

From: Rick Farrington <ricardo.farrington@cavium.com>

1. Add support for PF FLR when exiting
   (enables CORE_DRV_ACTIVE upon next driver init)
2. Skip some initialization (don't try to load f/w, activate consoles).

Signed-off-by: Rick Farrington <ricardo.farrington@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c | 70 ++++++++++++++++---------
 1 file changed, 46 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index db85f8f..dccd447 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1122,6 +1122,33 @@ static bool fw_type_is_none(void)
 }
 
 /**
+ * \brief PCI FLR for each Octeon device.
+ * @param oct octeon device
+ */
+static void octeon_pci_flr(struct octeon_device *oct)
+{
+	int rc;
+
+	pci_save_state(oct->pci_dev);
+
+	pci_cfg_access_lock(oct->pci_dev);
+
+	/* Quiesce the device completely */
+	pci_write_config_word(oct->pci_dev, PCI_COMMAND,
+			      PCI_COMMAND_INTX_DISABLE);
+
+	rc = __pci_reset_function_locked(oct->pci_dev);
+
+	if (rc != 0)
+		dev_err(&oct->pci_dev->dev, "Error %d resetting PCI function %d\n",
+			rc, oct->pf_num);
+
+	pci_cfg_access_unlock(oct->pci_dev);
+
+	pci_restore_state(oct->pci_dev);
+}
+
+/**
  *\brief Destroy resources associated with octeon device
  * @param pdev PCI device structure
  * @param ent unused
@@ -1269,14 +1296,16 @@ static void octeon_destroy_resources(struct octeon_device *oct)
 	case OCT_DEV_PCI_MAP_DONE:
 		refcount = octeon_deregister_device(oct);
 
-		if (!fw_type_is_none()) {
-			/* Soft reset the octeon device before exiting.
-			 * Implementation note: here, we reset the device
-			 * if it is a CN6XXX OR the last CN23XX device.
-			 */
-			if (OCTEON_CN6XXX(oct) || !refcount)
-				oct->fn_list.soft_reset(oct);
-		}
+		/* Soft reset the octeon device before exiting.
+		 * However, if fw was loaded from card (i.e. autoboot),
+		 * perform an FLR instead.
+		 * Implementation note: only soft-reset the device
+		 * if it is a CN6XXX OR the LAST CN23XX device.
+		 */
+		if (fw_type_is_none())
+			octeon_pci_flr(oct);
+		else if (OCTEON_CN6XXX(oct) || !refcount)
+			oct->fn_list.soft_reset(oct);
 
 		octeon_unmap_pci_barx(oct, 0);
 		octeon_unmap_pci_barx(oct, 1);
@@ -1903,11 +1932,6 @@ static int load_firmware(struct octeon_device *oct)
 	char fw_name[LIO_MAX_FW_FILENAME_LEN];
 	char *tmp_fw_type;
 
-	if (fw_type_is_none()) {
-		dev_info(&oct->pci_dev->dev, "Skipping firmware load\n");
-		return ret;
-	}
-
 	if (fw_type[0] == '\0')
 		tmp_fw_type = LIO_FW_NAME_TYPE_NIC;
 	else
@@ -3891,18 +3915,16 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 	octeon_dev->app_mode = CVM_DRV_INVALID_APP;
 
 	if (OCTEON_CN23XX_PF(octeon_dev)) {
-		if (!cn23xx_fw_loaded(octeon_dev)) {
+		if (!cn23xx_fw_loaded(octeon_dev) && !fw_type_is_none()) {
 			fw_loaded = 0;
-			if (!fw_type_is_none()) {
-				/* Do a soft reset of the Octeon device. */
-				if (octeon_dev->fn_list.soft_reset(octeon_dev))
-					return 1;
-				/* things might have changed */
-				if (!cn23xx_fw_loaded(octeon_dev))
-					fw_loaded = 0;
-				else
-					fw_loaded = 1;
-			}
+			/* Do a soft reset of the Octeon device. */
+			if (octeon_dev->fn_list.soft_reset(octeon_dev))
+				return 1;
+			/* things might have changed */
+			if (!cn23xx_fw_loaded(octeon_dev))
+				fw_loaded = 0;
+			else
+				fw_loaded = 1;
 		} else {
 			fw_loaded = 1;
 		}
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 0/2] liquidio: initialization fixes for embedded firmware
From: Felix Manlunas @ 2017-08-18  6:10 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	ricardo.farrington

From: Rick Farrington <ricardo.farrington@cavium.com>

Fix problems when using an adapter w/embedded f/w (param "fw_type=none").

1. Add support for PF FLR when exiting.
2. Skip some initialization (don't try to load f/w, activate consoles).
3. Issue credits BEFORE enabling DROQs.

Rick Farrington (2):
  liquidio: with embedded f/w, don't reload f/w, issue pf flr at exit
  liquidio: with embedded f/w, issue droq credits before enablement

 drivers/net/ethernet/cavium/liquidio/lio_main.c | 90 ++++++++++++++++---------
 1 file changed, 58 insertions(+), 32 deletions(-)

-- 
2.9.0

^ permalink raw reply

* Re: [PATCH net v2 2/2] net: ixgbe: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag
From: Ding Tianhong @ 2017-08-18  5:50 UTC (permalink / raw)
  To: Tantilov, Emil S, davem@davemloft.net, Kirsher, Jeffrey T,
	keescook@chromium.org, linux-kernel@vger.kernel.org,
	sparclinux@vger.kernel.org, intel-wired-lan@lists.osuosl.org,
	alexander.duyck@gmail.com, netdev@vger.kernel.org,
	linuxarm@huawei.com
In-Reply-To: <87618083B2453E4A8714035B62D67992B4095363@FMSMSX105.amr.corp.intel.com>



On 2017/8/18 13:04, Tantilov, Emil S wrote:
>> -----Original Message-----
>> From: Ding Tianhong [mailto:dingtianhong@huawei.com]
>> Sent: Thursday, August 17, 2017 5:39 PM
>> To: Tantilov, Emil S <emil.s.tantilov@intel.com>; davem@davemloft.net;
>> Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; keescook@chromium.org;
>> linux-kernel@vger.kernel.org; sparclinux@vger.kernel.org; intel-wired-
>> lan@lists.osuosl.org; alexander.duyck@gmail.com; netdev@vger.kernel.org;
>> linuxarm@huawei.com
>> Subject: Re: [PATCH net v2 2/2] net: ixgbe: Use new
>> PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag
>>
>>
>>
>> On 2017/8/17 22:17, Tantilov, Emil S wrote:
>>
>>>> 	ret_val = ixgbe_start_hw_generic(hw);
>>>>
>>>> -#ifndef CONFIG_SPARC
>>>> -	/* Disable relaxed ordering */
>>>> -	for (i = 0; ((i < hw->mac.max_tx_queues) &&
>>>> -	     (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
>>>> -		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
>>>> -		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
>>>> -		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
>>>> -	}
>>>> +	if (!pcie_relaxed_ordering_enabled(adapter->pdev)) {
>>>
>>> As Alex mentioned there is no need for this check in any form.
>>>
>>> The HW defaults to Relaxed Ordering enabled unless it is disabled in
>>> the PCIe Device Control Register. So the above logic is already done by
>> HW.
>>>
>>> All you have to do is strip the code disabling relaxed ordering.
>>>
>>
>> Hi Tantilov:
>>
>> I misunderstood Alex's suggestion, But I still couldn't find the logic
>> where
>> the HW disable the Relaxed Ordering when the PCIe Device Control Register
>> disable it, can you point it out?
> 
> If you look at the datasheet (82599) - the description of CTRL_EXT.RO_DIS (bit 17, 0b):
> 
> Relaxed Ordering Disable. When set to 1b, the device does not request any relaxed
> ordering transactions. When this bit is cleared and the Enable Relaxed Ordering bit in
> the Device Control register is set, the device requests relaxed ordering transactions per queues as configured in the DCA_RXCTRL[n] and DCA_TXCTRL[n] registers.
> 
> So if you remove the code that clears the bits in DCA_T/RXCTRL relaxed ordering should
> be enabled by HW when the bus allows it.
> 

Great, Thanks for your explanation.

> Thanks,
> Emil
> 
> 
> .
> 


^ permalink raw reply

* Re: [RFC] about net: Fix inconsistent teardown and release of private netdev state.
From: David Miller @ 2017-08-18  5:21 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1503009040.4936.161.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 17 Aug 2017 15:30:40 -0700

> So we do not really know if we need to clean up or not.

We always know, the answer is that whenever register_netdev() fails we
never need to perform any cleanup which is done by priv_destructor.

> Any idea how to fix the issue ?

Your patch is exactly how we should fix this, but without the comment.
The logic is straightforward.

If register_netdevice() fails any resources handled by priv_destructor
are cleaned up, it is guaranteed.

^ permalink raw reply

* RE: [PATCH net v2 2/2] net: ixgbe: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag
From: Tantilov, Emil S @ 2017-08-18  5:04 UTC (permalink / raw)
  To: Ding Tianhong, davem@davemloft.net, Kirsher, Jeffrey T,
	keescook@chromium.org, linux-kernel@vger.kernel.org,
	sparclinux@vger.kernel.org, intel-wired-lan@lists.osuosl.org,
	alexander.duyck@gmail.com, netdev@vger.kernel.org,
	linuxarm@huawei.com
In-Reply-To: <2a7fc27b-c2f4-a7a1-9318-3a93531e7670@huawei.com>

>-----Original Message-----
>From: Ding Tianhong [mailto:dingtianhong@huawei.com]
>Sent: Thursday, August 17, 2017 5:39 PM
>To: Tantilov, Emil S <emil.s.tantilov@intel.com>; davem@davemloft.net;
>Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; keescook@chromium.org;
>linux-kernel@vger.kernel.org; sparclinux@vger.kernel.org; intel-wired-
>lan@lists.osuosl.org; alexander.duyck@gmail.com; netdev@vger.kernel.org;
>linuxarm@huawei.com
>Subject: Re: [PATCH net v2 2/2] net: ixgbe: Use new
>PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag
>
>
>
>On 2017/8/17 22:17, Tantilov, Emil S wrote:
>
>>> 	ret_val = ixgbe_start_hw_generic(hw);
>>>
>>> -#ifndef CONFIG_SPARC
>>> -	/* Disable relaxed ordering */
>>> -	for (i = 0; ((i < hw->mac.max_tx_queues) &&
>>> -	     (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
>>> -		regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
>>> -		regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
>>> -		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
>>> -	}
>>> +	if (!pcie_relaxed_ordering_enabled(adapter->pdev)) {
>>
>> As Alex mentioned there is no need for this check in any form.
>>
>> The HW defaults to Relaxed Ordering enabled unless it is disabled in
>> the PCIe Device Control Register. So the above logic is already done by
>HW.
>>
>> All you have to do is strip the code disabling relaxed ordering.
>>
>
>Hi Tantilov:
>
>I misunderstood Alex's suggestion, But I still couldn't find the logic
>where
>the HW disable the Relaxed Ordering when the PCIe Device Control Register
>disable it, can you point it out?

If you look at the datasheet (82599) - the description of CTRL_EXT.RO_DIS (bit 17, 0b):

Relaxed Ordering Disable. When set to 1b, the device does not request any relaxed
ordering transactions. When this bit is cleared and the Enable Relaxed Ordering bit in
the Device Control register is set, the device requests relaxed ordering transactions per queues as configured in the DCA_RXCTRL[n] and DCA_TXCTRL[n] registers.

So if you remove the code that clears the bits in DCA_T/RXCTRL relaxed ordering should
be enabled by HW when the bus allows it.

Thanks,
Emil

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox