Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net 3/4] net:ethernet:aquantia: Fix transient invalid link down/up indications
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Due to a bug in aquantia atlantic card firmware, it sometimes reports
invalid link speed bits. That caused driver to report link down events,
although link itself is totally fine.

This patch ignores such out-of-the-blue readings.

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index 4f5ec9a..ab5d3cb 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -351,8 +351,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self)
 			break;
 
 		default:
-			link_status->mbps = 0U;
-			break;
+			return -1;
 		}
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 2/4] net:ethernet:aquantia: Fix Tx queue hangups
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

The driver did a poor job of managing its Tx queues: sometimes it could stop
tx queues due to a link down condition in aq_nic_xmit - but never woke them
up again. That led to a total stall of the Tx path.
This patch fixes this and improves generic queue management:
- introduces queue restart counter
- uses generic netif_ interface to disable and enable tx path
- refactors link up/down condition and introduces dmesg log event when
  link changes.
- introduces new constant for minimum descriptors count required for queue
  wakeup

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h  |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 91 +++++++++++-------------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h  |  2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c  |  8 +--
 6 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 2149864..0fdaaa6 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -51,6 +51,10 @@
 
 #define AQ_CFG_SKB_FRAGS_MAX   32U
 
+/* Number of descriptors available in one ring to resume this ring queue
+ */
+#define AQ_CFG_RESTART_DESC_THRES   (AQ_CFG_SKB_FRAGS_MAX * 2)
+
 #define AQ_CFG_NAPI_WEIGHT     64U
 
 #define AQ_CFG_MULTICAST_ADDRESS_MAX     32U
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index f281392..24f573c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self)
 	return 0;
 }
 
+static int aq_nic_update_link_status(struct aq_nic_s *self)
+{
+	int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
+
+	if (err < 0)
+		return -1;
+
+	if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps)
+		pr_info("%s: link change old %d new %d\n",
+			AQ_CFG_DRV_NAME, self->link_status.mbps,
+			self->aq_hw->aq_link_status.mbps);
+
+	self->link_status = self->aq_hw->aq_link_status;
+	if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) {
+		aq_utils_obj_set(&self->header.flags,
+				 AQ_NIC_FLAG_STARTED);
+		aq_utils_obj_clear(&self->header.flags,
+				   AQ_NIC_LINK_DOWN);
+		netif_carrier_on(self->ndev);
+		netif_tx_wake_all_queues(self->ndev);
+	}
+	if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
+		netif_carrier_off(self->ndev);
+		netif_tx_disable(self->ndev);
+		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
+	}
+	return 0;
+}
+
 static void aq_nic_service_timer_cb(unsigned long param)
 {
 	struct aq_nic_s *self = (struct aq_nic_s *)param;
@@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param)
 	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
 		goto err_exit;
 
-	err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
-	if (err < 0)
+	err = aq_nic_update_link_status(self);
+	if (err)
 		goto err_exit;
 
-	self->link_status = self->aq_hw->aq_link_status;
-
 	self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw,
 		    self->aq_nic_cfg.is_interrupt_moderation);
 
-	if (self->link_status.mbps) {
-		aq_utils_obj_set(&self->header.flags,
-				 AQ_NIC_FLAG_STARTED);
-		aq_utils_obj_clear(&self->header.flags,
-				   AQ_NIC_LINK_DOWN);
-		netif_carrier_on(self->ndev);
-	} else {
-		netif_carrier_off(self->ndev);
-		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
-	}
-
 	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
 	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
 	for (i = AQ_DIMOF(self->aq_vec); i--;) {
@@ -240,7 +256,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 int aq_nic_ndev_register(struct aq_nic_s *self)
 {
 	int err = 0;
-	unsigned int i = 0U;
 
 	if (!self->ndev) {
 		err = -EINVAL;
@@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self)
 
 	netif_carrier_off(self->ndev);
 
-	for (i = AQ_CFG_VECS_MAX; i--;)
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	err = register_netdev(self->ndev);
 	if (err < 0)
@@ -319,12 +333,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev)
 		err = -EINVAL;
 		goto err_exit;
 	}
-	if (netif_running(ndev)) {
-		unsigned int i;
-
-		for (i = AQ_CFG_VECS_MAX; i--;)
-			netif_stop_subqueue(ndev, i);
-	}
+	if (netif_running(ndev))
+		netif_tx_disable(ndev);
 
 	for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs;
 		self->aq_vecs++) {
@@ -384,16 +394,6 @@ int aq_nic_init(struct aq_nic_s *self)
 	return err;
 }
 
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_start_subqueue(self->ndev, idx);
-}
-
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_stop_subqueue(self->ndev, idx);
-}
-
 int aq_nic_start(struct aq_nic_s *self)
 {
 	struct aq_vec_s *aq_vec = NULL;
@@ -452,10 +452,6 @@ int aq_nic_start(struct aq_nic_s *self)
 			goto err_exit;
 	}
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_start(self, i);
-
 	err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
 	if (err < 0)
 		goto err_exit;
@@ -464,6 +460,8 @@ int aq_nic_start(struct aq_nic_s *self)
 	if (err < 0)
 		goto err_exit;
 
+	netif_tx_start_all_queues(self->ndev);
+
 err_exit:
 	return err;
 }
@@ -603,7 +601,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 	unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
 	unsigned int tc = 0U;
 	int err = NETDEV_TX_OK;
-	bool is_nic_in_bad_state;
 
 	frags = skb_shinfo(skb)->nr_frags + 1;
 
@@ -614,13 +611,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 		goto err_exit;
 	}
 
-	is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags,
-						AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
-						(aq_ring_avail_dx(ring) <
-						AQ_CFG_SKB_FRAGS_MAX);
+	aq_ring_update_queue_state(ring);
 
-	if (is_nic_in_bad_state) {
-		aq_nic_ndev_queue_stop(self, ring->idx);
+	/* Above status update may stop the queue. Check this. */
+	if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
 		err = NETDEV_TX_BUSY;
 		goto err_exit;
 	}
@@ -632,9 +626,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 						      ring,
 						      frags);
 		if (err >= 0) {
-			if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1)
-				aq_nic_ndev_queue_stop(self, ring->idx);
-
 			++ring->stats.tx.packets;
 			ring->stats.tx.bytes += skb->len;
 		}
@@ -906,9 +897,7 @@ int aq_nic_stop(struct aq_nic_s *self)
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	del_timer_sync(&self->service_timer);
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 7fc2a5e..0ddd556 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self);
 int aq_nic_init(struct aq_nic_s *self);
 int aq_nic_cfg_start(struct aq_nic_s *self);
 int aq_nic_ndev_register(struct aq_nic_s *self);
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx);
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx);
 void aq_nic_ndev_free(struct aq_nic_s *self);
 int aq_nic_start(struct aq_nic_s *self);
 int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 4eee199..02f79b0 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self)
 	return 0;
 }
 
+void aq_ring_update_queue_state(struct aq_ring_s *ring)
+{
+	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
+		aq_ring_queue_stop(ring);
+	else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES)
+		aq_ring_queue_wake(ring);
+}
+
+void aq_ring_queue_wake(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (__netif_subqueue_stopped(ndev, ring->idx)) {
+		netif_wake_subqueue(ndev, ring->idx);
+		ring->stats.tx.queue_restarts++;
+	}
+}
+
+void aq_ring_queue_stop(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (!__netif_subqueue_stopped(ndev, ring->idx))
+		netif_stop_subqueue(ndev, ring->idx);
+}
+
 void aq_ring_tx_clean(struct aq_ring_s *self)
 {
 	struct device *dev = aq_nic_get_dev(self->aq_nic);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
index 782176c..24523b5 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
@@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s {
 	u64 errors;
 	u64 packets;
 	u64 bytes;
+	u64 queue_restarts;
 };
 
 union aq_ring_stats_s {
@@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
 int aq_ring_init(struct aq_ring_s *self);
 void aq_ring_rx_deinit(struct aq_ring_s *self);
 void aq_ring_free(struct aq_ring_s *self);
+void aq_ring_update_queue_state(struct aq_ring_s *ring);
+void aq_ring_queue_wake(struct aq_ring_s *ring);
+void aq_ring_queue_stop(struct aq_ring_s *ring);
 void aq_ring_tx_clean(struct aq_ring_s *self);
 int aq_ring_rx_clean(struct aq_ring_s *self,
 		     struct napi_struct *napi,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index ebf5880..305ff8f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
 			if (ring[AQ_VEC_TX_ID].sw_head !=
 			    ring[AQ_VEC_TX_ID].hw_head) {
 				aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
-
-				if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) >
-				    AQ_CFG_SKB_FRAGS_MAX) {
-					aq_nic_ndev_queue_start(self->aq_nic,
-						ring[AQ_VEC_TX_ID].idx);
-				}
+				aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]);
 				was_tx_cleaned = true;
 			}
 
@@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self,
 		stats_tx->packets += tx->packets;
 		stats_tx->bytes += tx->bytes;
 		stats_tx->errors += tx->errors;
+		stats_tx->queue_restarts += tx->queue_restarts;
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/4] net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Although the hardware is capable of almost 16K MTU, without the max_mtu field
correctly set only the standard MTU can be used.
This patch enables max MTU, calculating it from hardware maximum frame size
of 16352 octets (including FCS).

Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu")

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c                    | 5 +++--
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 6ac9e26..f281392 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -214,7 +214,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 	SET_NETDEV_DEV(ndev, dev);
 
 	ndev->if_port = port;
-	ndev->min_mtu = ETH_MIN_MTU;
 	self->ndev = ndev;
 
 	self->aq_pci_func = aq_pci_func;
@@ -283,6 +282,8 @@ int aq_nic_ndev_init(struct aq_nic_s *self)
 	self->ndev->features = aq_hw_caps->hw_features;
 	self->ndev->priv_flags = aq_hw_caps->hw_priv_flags;
 	self->ndev->mtu = aq_nic_cfg->mtu - ETH_HLEN;
+	self->ndev->min_mtu = ETH_MIN_MTU;
+	self->ndev->max_mtu = self->aq_hw_caps.mtu - ETH_FCS_LEN - ETH_HLEN;
 
 	return 0;
 }
@@ -695,7 +696,7 @@ int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu)
 {
 	int err = 0;
 
-	if (new_mtu > self->aq_hw_caps.mtu) {
+	if (new_mtu + ETH_FCS_LEN > self->aq_hw_caps.mtu) {
 		err = -EINVAL;
 		goto err_exit;
 	}
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
index f3957e93..fcf89e2 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
@@ -16,7 +16,7 @@
 
 #include "../aq_common.h"
 
-#define HW_ATL_B0_MTU_JUMBO (16000U)
+#define HW_ATL_B0_MTU_JUMBO  16352U
 #define HW_ATL_B0_MTU        1514U
 
 #define HW_ATL_B0_TX_RINGS 4U
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 0/4] net:ethernet:aquantia: Atlantic driver bugfixes and improvements
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh

This series contains bugfixes for aQuantia Atlantic driver.

Igor Russkikh (3):
  net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
  net:ethernet:aquantia: Fix Tx queue hangups
  net:ethernet:aquantia: Fix transient invalid link down/up indications

Pavel Belous (1):
  net:ethernet:atlantic: fix iommu errors

 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h    |   4 +
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c    | 139 ++++++++++-----------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h    |   2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c   |  53 ++++++--
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h   |  10 +-
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c    |   8 +-
 .../aquantia/atlantic/hw_atl/hw_atl_b0_internal.h  |   2 +-
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c        |   3 +-
 8 files changed, 130 insertions(+), 91 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Eric Dumazet @ 2017-09-21 10:37 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: David Miller, netdev, pablo, fw, edumazet, hannes
In-Reply-To: <1505986931.2560.51.camel@redhat.com>

On Thu, 2017-09-21 at 11:42 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thanks for the feedback!
> 
> On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Wed, 20 Sep 2017 18:54:00 +0200
> > 
> > > This series introduces the infrastructure to store inside the skb a socket
> > > pointer without carrying a refcount to the socket.
> > > 
> > > Such infrastructure is then used in the network receive path - and
> > > specifically the early demux operation.
> > > 
> > > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > > with many benefits:
> > > 
> > > - the UDP early demux code is now much simpler
> > > - the early demux does not hit any performance penalties in case of UDP hash
> > >   table collision - previously the early demux performed a partial, unsuccessful,
> > >   lookup
> > > - early demux is now operational also for unconnected sockets.
> > > 
> > > This infrastructure will be used in follow-up series to allow dst caching for
> > > unconnected UDP sockets, and then to extend the same features to TCP listening
> > > sockets.
> > 
> > Like Eric, I find this series (while exciting) quite scary :-)
> > 
> > You really have to post some kind of performance numbers in your
> > header posting in order to justify something with these ramifications
> > and scale.
> 
> This is actually a preparatory work for the next series which will
> bring in the real gain. The next patches are still to be polished so we
>  posted this separately to get some early feedback. 
> 
> If that would help, I can post the follow-up soon as RFC. Overall -
> with the follow-up applied, too - when using a single rx ingress
> queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
> rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
> ingress queues, the gain is smaller but still measurable (roughly 5%). 
> 
> Please let me know if you prefer to see the full work early.

I want to see the full work yes. Ipv6, and everything.

I do not want ~1000 lines of changed code in the stack for some corner
cases, where people do not properly use existing infra, like proper
SO_REUSEPORT with proper BPF filter to have as many clean siloes (proper
CPU/NUMA affinities to avoid QPI traffic)

The complexity of your patches reached a point where I am extremely
nervous.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 1/5] net: add support for noref skb->sk
From: Eric Dumazet @ 2017-09-21 10:35 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <1505985297.2560.39.camel@redhat.com>

On Thu, 2017-09-21 at 11:14 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thank you for looking at it!
> 
> On Wed, 2017-09-20 at 10:41 -0700, Eric Dumazet wrote:
> > On Wed, 2017-09-20 at 18:54 +0200, Paolo Abeni wrote:
> > > Noref sk do not carry a socket refcount, are valid
> > > only inside the current RCU section and must be
> > > explicitly cleared before exiting such section.
> > > 
> > > They will be used in a later patch to allow early demux
> > > without sock refcounting.
> > 
> > 
> > 
> > 
> > > +/* dummy destructor used by noref sockets */
> > > +void sock_dummyfree(struct sk_buff *skb)
> > > +{
> > 
> > BUG();
> > 
> > > +}
> > > +EXPORT_SYMBOL(sock_dummyfree);
> > > +
> 
> We can call sock_dummyfree() in legitimate paths, see below, but we can
> add a:
> 
> WARN_ON_ONCE(!rcu_read_lock_held());

This won't be enough; see below.

> 
> here and in  skb_clear_noref_sk(). That should help much to catch
> possible bugs.
> 
> > I do not see how you ensure we do not leave RCU section with an skb
> > destructor pointing to this sock_dummyfree()
> > 
> > This patch series looks quite dangerous to me.
> 
> The idea is to explicitly clear the sknoref references before leaving
> the RCU section. Quite alike what we currently do for dst noref, but
> here the only place where we get a noref socket is the socket early
> demux, thus the scope of this change is more limited to what we have
> with noref dst_entries.
> 
> The relevant code is in the next 2 patches; after the demux we preserve
> the sknoref only if the skb has a local destination. The UDP socket
> will then set the noref on early demux lookup, and the skb will either:
> 
> * land on the corresponding UDP socket, the receive function will steal
> the sknoref
> * be dropped by some nft/iptables target - the dummy destructor is
> called
> * forwarded by some nft/iptables target outside the input path; we
> clear the skref explicitly in such targets. 
> 
> Currently there are a handful of places affected, and we can simplify
> the code dropping the early demux result for locally terminated
> multicast sockets on a host acting as a multicast router, please see
> the comment on the next patch.
> 
> > Do we really have real applications using connected UDP sockets and
> > wanting very high pps throughput ?
> 
> The ultimate goal is to improve the unconnected UDP sockets scenario,
> we do actually have use cases for that - DNS servers and VoIP SBCs.

Unconnected UDP traffic does not use refcounting on sk _already_.

And SO_REUSEPORT already allows us to handle all the traffic we want
_already_.


Please take a look at 71563f3414e917c62acd8e0fb0edf8ed6af63e4b

This might tell you why I am so nervous about your changes.

Checking WARN_ON_ONCE(!rcu_read_lock_held());
is not enough.

rcu_read_lock()
skb->destructor = sock_dummyfree;

queue the packet into an intermediate queue.
rcu_read_unlock();

....

rcu_read_lock()
...
if (skb->sk && skb->sk->state == ...) // crash

Also you covered IPv4, but really we need to forget about IPv4 and focus
on IPv6 only. And _then_ take care of IPv4 compat.

^ permalink raw reply

* [net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl
From: Vincent Bernat @ 2017-09-21 10:05 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern, David Miller, bridge, netdev
  Cc: Vincent Bernat
In-Reply-To: <20170920162140.369bb198@xeon-e3>

Currently, there is a difference in netlink events received when an
interface is modified through bridge ioctl() or through netlink. This
patch generates additional events when an interface is added to or
removed from a bridge via ioctl().

When adding then removing an interface from a bridge with netlink, we
get:

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

When using ioctl():

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

Without this patch, the last netlink notification is not sent.

Signed-off-by: Vincent Bernat <vincent@bernat.im>
---
 net/bridge/br_ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..66cd98772051 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 	else
 		ret = br_del_if(br, dev);
 
+	if (!ret)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
 	return ret;
 }
 
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
From: Vincent Bernat @ 2017-09-21 10:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Ahern, David Miller, bridge, netdev
In-Reply-To: <20170920162140.369bb198@xeon-e3>

 ❦ 20 septembre 2017 16:21 -0700, Stephen Hemminger <stephen@networkplumber.org> :

> The one concern is that ports added or removed through ioctl should
> cause same events as doing the same thing via netlink. Some users use
> brctl (ioctl) and others use newer bridge (netlink) API.

I'll make a third iteration to have the same notifications when using
ioctl() with details in the commit message.
-- 
When in doubt, tell the truth.
		-- Mark Twain

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Paolo Abeni @ 2017-09-21  9:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, pablo, fw, edumazet, hannes
In-Reply-To: <20170920.202022.2073272587145961156.davem@davemloft.net>

Hi,

Thanks for the feedback!

On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Wed, 20 Sep 2017 18:54:00 +0200
> 
> > This series introduces the infrastructure to store inside the skb a socket
> > pointer without carrying a refcount to the socket.
> > 
> > Such infrastructure is then used in the network receive path - and
> > specifically the early demux operation.
> > 
> > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > with many benefits:
> > 
> > - the UDP early demux code is now much simpler
> > - the early demux does not hit any performance penalties in case of UDP hash
> >   table collision - previously the early demux performed a partial, unsuccessful,
> >   lookup
> > - early demux is now operational also for unconnected sockets.
> > 
> > This infrastructure will be used in follow-up series to allow dst caching for
> > unconnected UDP sockets, and then to extend the same features to TCP listening
> > sockets.
> 
> Like Eric, I find this series (while exciting) quite scary :-)
> 
> You really have to post some kind of performance numbers in your
> header posting in order to justify something with these ramifications
> and scale.

This is actually a preparatory work for the next series which will
bring in the real gain. The next patches are still to be polished so we
 posted this separately to get some early feedback. 

If that would help, I can post the follow-up soon as RFC. Overall -
with the follow-up applied, too - when using a single rx ingress
queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
ingress queues, the gain is smaller but still measurable (roughly 5%). 

Please let me know if you prefer to see the full work early.

Thanks,

Paolo

^ permalink raw reply

* [PATCH net-next 2/2] net: dsa: lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20170921094139.4250-1-privat@egil-hjelmeland.no>

When both user ports are joined to the same bridge, the normal
HW MAC learning is enabled. This means that unicast traffic is forwarded
in HW.

If one of the user ports leaves the bridge,
the ports go back to the initial separated operation.

Port separation relies on disabled HW MAC learning. Hence the condition
that both ports must join same bridge.

Add bridge methods port_bridge_join, port_bridge_leave and
port_stp_state_set.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
 drivers/net/dsa/lan9303-core.c | 88 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/lan9303.h      |  1 +
 2 files changed, 89 insertions(+)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index bba875e114e7..76112674fe6a 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <linux/if_bridge.h>
 
 #include "lan9303.h"
 
@@ -146,6 +147,7 @@
 # define LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 (0)
 # define LAN9303_SWE_PORT_STATE_LEARNING_PORT0 BIT(1)
 # define LAN9303_SWE_PORT_STATE_BLOCKING_PORT0 BIT(0)
+# define LAN9303_SWE_PORT_STATE_DISABLED_PORT0 (3)
 #define LAN9303_SWE_PORT_MIRROR 0x1846
 # define LAN9303_SWE_PORT_MIRROR_SNIFF_ALL BIT(8)
 # define LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT2 BIT(7)
@@ -431,6 +433,20 @@ static int lan9303_read_switch_reg(struct lan9303 *chip, u16 regnum, u32 *val)
 	return ret;
 }
 
+static int lan9303_write_switch_reg_mask(
+	struct lan9303 *chip, u16 regnum, u32 val, u32 mask)
+{
+	int ret;
+	u32 reg;
+
+	ret = lan9303_read_switch_reg(chip, regnum, &reg);
+	if (ret)
+		return ret;
+	reg = (reg & ~mask) | val;
+
+	return lan9303_write_switch_reg(chip, regnum, reg);
+}
+
 static int lan9303_write_switch_port(struct lan9303 *chip, int port,
 				     u16 regnum, u32 val)
 {
@@ -556,6 +572,12 @@ static int lan9303_separate_ports(struct lan9303 *chip)
 				LAN9303_SWE_PORT_STATE_BLOCKING_PORT2);
 }
 
+static void lan9303_bridge_ports(struct lan9303 *chip)
+{
+	/* ports bridged: remove mirroring */
+	lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, 0);
+}
+
 static int lan9303_handle_reset(struct lan9303 *chip)
 {
 	if (!chip->reset_gpio)
@@ -844,6 +866,69 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port,
 	}
 }
 
+static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
+				    struct net_device *br)
+{
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+	if (ds->ports[1].bridge_dev ==  ds->ports[2].bridge_dev) {
+		lan9303_bridge_ports(chip);
+		chip->is_bridged = true;  /* unleash stp_state_set() */
+	}
+
+	return 0;
+}
+
+static void lan9303_port_bridge_leave(struct dsa_switch *ds, int port,
+				      struct net_device *br)
+{
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+	if (chip->is_bridged) {
+		lan9303_separate_ports(chip);
+		chip->is_bridged = false;
+	}
+}
+
+static void lan9303_port_stp_state_set(struct dsa_switch *ds, int port,
+				       u8 state)
+{
+	int portmask, portstate;
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d, state %d)\n",
+		__func__, port, state);
+	if (!chip->is_bridged)
+		return; /* touching SWE_PORT_STATE will break port separation */
+
+	switch (state) {
+	case BR_STATE_DISABLED:
+		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+		break;
+	case BR_STATE_BLOCKING:
+	case BR_STATE_LISTENING:
+		portstate = LAN9303_SWE_PORT_STATE_BLOCKING_PORT0;
+		break;
+	case BR_STATE_LEARNING:
+		portstate = LAN9303_SWE_PORT_STATE_LEARNING_PORT0;
+		break;
+	case BR_STATE_FORWARDING:
+		portstate = LAN9303_SWE_PORT_STATE_FORWARDING_PORT0;
+		break;
+	default:
+		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+		dev_err(chip->dev, "unknown stp state: port %d, state %d\n",
+			port, state);
+	}
+
+	portmask = 0x3 << (port * 2);
+	portstate     <<= (port * 2);
+	lan9303_write_switch_reg_mask(chip, LAN9303_SWE_PORT_STATE,
+				      portstate, portmask);
+}
+
 static const struct dsa_switch_ops lan9303_switch_ops = {
 	.get_tag_protocol = lan9303_get_tag_protocol,
 	.setup = lan9303_setup,
@@ -855,6 +940,9 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
 	.get_sset_count = lan9303_get_sset_count,
 	.port_enable = lan9303_port_enable,
 	.port_disable = lan9303_port_disable,
+	.port_bridge_join       = lan9303_port_bridge_join,
+	.port_bridge_leave      = lan9303_port_bridge_leave,
+	.port_stp_state_set     = lan9303_port_stp_state_set,
 };
 
 static int lan9303_register_switch(struct lan9303 *chip)
diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h
index 4d8be555ff4d..5be246f05965 100644
--- a/drivers/net/dsa/lan9303.h
+++ b/drivers/net/dsa/lan9303.h
@@ -21,6 +21,7 @@ struct lan9303 {
 	struct dsa_switch *ds;
 	struct mutex indirect_mutex; /* protect indexed register access */
 	const struct lan9303_phy_ops *ops;
+	bool is_bridged; /* true if port 1 and 2 is bridged */
 };
 
 extern const struct regmap_access_table lan9303_register_set;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20170921094139.4250-1-privat@egil-hjelmeland.no>

Prepare for next patch:
Move tag setup from lan9303_separate_ports() to new function
lan9303_setup_tagging()

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
 drivers/net/dsa/lan9303-core.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index 07355db2ad81..bba875e114e7 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -157,6 +157,7 @@
 # define LAN9303_SWE_PORT_MIRROR_ENABLE_RX_MIRRORING BIT(1)
 # define LAN9303_SWE_PORT_MIRROR_ENABLE_TX_MIRRORING BIT(0)
 #define LAN9303_SWE_INGRESS_PORT_TYPE 0x1847
+#define  LAN9303_SWE_INGRESS_PORT_TYPE_VLAN 3
 #define LAN9303_BM_CFG 0x1c00
 #define LAN9303_BM_EGRSS_PORT_TYPE 0x1c0c
 # define LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT2 (BIT(17) | BIT(16))
@@ -510,11 +511,30 @@ static int lan9303_enable_processing_port(struct lan9303 *chip,
 				LAN9303_MAC_TX_CFG_X_TX_ENABLE);
 }
 
+/* forward special tagged packets from port 0 to port 1 *or* port 2 */
+static int lan9303_setup_tagging(struct lan9303 *chip)
+{
+	int ret;
+	/* enable defining the destination port via special VLAN tagging
+	 * for port 0
+	 */
+	ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
+				       LAN9303_SWE_INGRESS_PORT_TYPE_VLAN);
+	if (ret)
+		return ret;
+
+	/* tag incoming packets at port 1 and 2 on their way to port 0 to be
+	 * able to discover their source port
+	 */
+	return lan9303_write_switch_reg(
+		chip, LAN9303_BM_EGRSS_PORT_TYPE,
+		LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0);
+}
+
 /* We want a special working switch:
  * - do not forward packets between port 1 and 2
  * - forward everything from port 1 to port 0
  * - forward everything from port 2 to port 0
- * - forward special tagged packets from port 0 to port 1 *or* port 2
  */
 static int lan9303_separate_ports(struct lan9303 *chip)
 {
@@ -529,22 +549,6 @@ static int lan9303_separate_ports(struct lan9303 *chip)
 	if (ret)
 		return ret;
 
-	/* enable defining the destination port via special VLAN tagging
-	 * for port 0
-	 */
-	ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
-				       0x03);
-	if (ret)
-		return ret;
-
-	/* tag incoming packets at port 1 and 2 on their way to port 0 to be
-	 * able to discover their source port
-	 */
-	ret = lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE,
-			LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0);
-	if (ret)
-		return ret;
-
 	/* prevent port 1 and 2 from forwarding packets by their own */
 	return lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE,
 				LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 |
@@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
 		return -EINVAL;
 	}
 
+	ret = lan9303_setup_tagging(chip);
+	if (ret)
+		dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
+
 	ret = lan9303_separate_ports(chip);
 	if (ret)
 		dev_err(chip->dev, "failed to separate ports %d\n", ret);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/2] lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland

This series adds basic offloading of unicast traffic to the lan9303
DSA driver.

Comments welcome!

Egil Hjelmeland (2):
  net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
  net: dsa: lan9303: Add basic offloading of unicast traffic

 drivers/net/dsa/lan9303-core.c | 130 +++++++++++++++++++++++++++++++++++------
 drivers/net/dsa/lan9303.h      |   1 +
 2 files changed, 114 insertions(+), 17 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH v3 10/31] befs: Define usercopy region in befs_inode_cache slab cache
From: Luis de Bethencourt @ 2017-09-21  9:34 UTC (permalink / raw)
  To: Kees Cook, linux-kernel
  Cc: David Windsor, Salah Triki, linux-fsdevel, netdev, linux-mm,
	kernel-hardening
In-Reply-To: <1505940337-79069-11-git-send-email-keescook@chromium.org>

On 09/20/2017 09:45 PM, Kees Cook wrote:
> From: David Windsor <dave@nullcore.net>
> 
> befs symlink pathnames, stored in struct befs_inode_info.i_data.symlink
> and therefore contained in the befs_inode_cache slab cache, need to be
> copied to/from userspace.
> 
> cache object allocation:
>      fs/befs/linuxvfs.c:
>          befs_alloc_inode(...):
>              ...
>              bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
>              ...
>              return &bi->vfs_inode;
> 
>          befs_iget(...):
>              ...
>              strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
>                      BEFS_SYMLINK_LEN);
>              ...
>              inode->i_link = befs_ino->i_data.symlink;
> 
> example usage trace:
>      readlink_copy+0x43/0x70
>      vfs_readlink+0x62/0x110
>      SyS_readlinkat+0x100/0x130
> 
>      fs/namei.c:
>          readlink_copy(..., link):
>              ...
>              copy_to_user(..., link, len);
> 
>          (inlined in vfs_readlink)
>          generic_readlink(dentry, ...):
>              struct inode *inode = d_inode(dentry);
>              const char *link = inode->i_link;
>              ...
>              readlink_copy(..., link);
> 
> In support of usercopy hardening, this patch defines a region in the
> befs_inode_cache slab cache in which userspace copy operations are
> allowed.
> 
> This region is known as the slab cache's usercopy region. Slab caches can
> now check that each copy operation involving cache-managed memory falls
> entirely within the slab's usercopy region.
> 
> This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> whitelisting code in the last public patch of grsecurity/PaX based on my
> understanding of the code. Changes or omissions from the original code are
> mine and don't reflect the original grsecurity/PaX code.
> 
> Signed-off-by: David Windsor <dave@nullcore.net>
> [kees: adjust commit log, provide usage trace]
> Cc: Luis de Bethencourt <luisbg@kernel.org>
> Cc: Salah Triki <salah.triki@gmail.com>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Acked-by: Luis de Bethencourt <luisbg@kernel.org>
> ---
>   fs/befs/linuxvfs.c | 14 +++++++++-----
>   1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
> index a92355cc453b..e5dcd26003dc 100644
> --- a/fs/befs/linuxvfs.c
> +++ b/fs/befs/linuxvfs.c
> @@ -444,11 +444,15 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
>   static int __init
>   befs_init_inodecache(void)
>   {
> -	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
> -					      sizeof (struct befs_inode_info),
> -					      0, (SLAB_RECLAIM_ACCOUNT|
> -						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
> -					      init_once);
> +	befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache",
> +				sizeof(struct befs_inode_info), 0,
> +				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
> +					SLAB_ACCOUNT),
> +				offsetof(struct befs_inode_info,
> +					i_data.symlink),
> +				sizeof_field(struct befs_inode_info,
> +					i_data.symlink),
> +				init_once);
>   	if (befs_inode_cachep == NULL)
>   		return -ENOMEM;
>   
> 

No changes in the befs patch in v3. It goes without saying I continue to 
Ack this.

Thanks Kees and David,
Luis

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 2/2] ip_tunnel: add mpls over gre encapsulation
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox
In-Reply-To: <1505985924-12479-1-git-send-email-amine.kherbouche@6wind.com>

This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel
API.

Encap:
  - Add a new iptunnel type mpls.

Decap:
  - pull gre hdr and call mpls_forward().

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
---
 include/net/gre.h              |  3 +++
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/gre_demux.c           | 22 ++++++++++++++++++++++
 net/ipv4/ip_gre.c              |  9 +++++++++
 net/ipv6/ip6_gre.c             |  7 +++++++
 net/mpls/af_mpls.c             | 37 +++++++++++++++++++++++++++++++++++++
 6 files changed, 79 insertions(+)

diff --git a/include/net/gre.h b/include/net/gre.h
index d25d836..88a8343 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -35,6 +35,9 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 				       u8 name_assign_type);
 int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 		     bool *csum_err, __be16 proto, int nhs);
+#if IS_ENABLED(CONFIG_MPLS)
+int mpls_gre_rcv(struct sk_buff *skb, int gre_hdr_len);
+#endif
 
 static inline int gre_calc_hlen(__be16 o_flags)
 {
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 2e52088..a2f48c0 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
 	TUNNEL_ENCAP_NONE,
 	TUNNEL_ENCAP_FOU,
 	TUNNEL_ENCAP_GUE,
+	TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862..a6a937e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -23,6 +23,9 @@
 #include <linux/netdevice.h>
 #include <linux/if_tunnel.h>
 #include <linux/spinlock.h>
+#if IS_ENABLED(CONFIG_MPLS)
+#include <linux/mpls.h>
+#endif
 #include <net/protocol.h>
 #include <net/gre.h>
 
@@ -122,6 +125,25 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 }
 EXPORT_SYMBOL(gre_parse_header);
 
+#if IS_ENABLED(CONFIG_MPLS)
+int mpls_gre_rcv(struct sk_buff *skb, int gre_hdr_len)
+{
+	if (unlikely(!pskb_may_pull(skb, gre_hdr_len)))
+		goto drop;
+
+	/* Pop GRE hdr and reset the skb */
+	skb_pull(skb, gre_hdr_len);
+	skb_reset_network_header(skb);
+
+	mpls_forward(skb, skb->dev, NULL, NULL);
+
+	return 0;
+drop:
+	return NET_RX_DROP;
+}
+EXPORT_SYMBOL(mpls_gre_rcv);
+#endif
+
 static int gre_rcv(struct sk_buff *skb)
 {
 	const struct gre_protocol *proto;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9cee986..dd4431c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -412,10 +412,19 @@ static int gre_rcv(struct sk_buff *skb)
 			return 0;
 	}
 
+#if IS_ENABLED(CONFIG_MPLS)
+	if (unlikely(tpi.proto == htons(ETH_P_MPLS_UC))) {
+		if (mpls_gre_rcv(skb, hdr_len))
+			goto drop;
+		return 0;
+	}
+#endif
+
 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 		return 0;
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
 drop:
 	kfree_skb(skb);
 	return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c82d41e..e52396d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -476,6 +476,13 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
+#if IS_ENABLED(CONFIG_MPLS)
+	if (unlikely(tpi.proto == htons(ETH_P_MPLS_UC))) {
+		if (mpls_gre_rcv(skb, hdr_len))
+			goto drop;
+		return 0;
+	}
+#endif
 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
 		goto drop;
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 36ea2ad..060ed07 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -16,6 +16,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/netevent.h>
+#include <net/ip_tunnels.h>
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -39,6 +40,40 @@ static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
+size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e)
+{
+	return sizeof(struct mpls_shim_hdr);
+}
+
+int ipgre_mpls_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			    u8 *protocol, struct flowi4 *fl4)
+{
+	return 0;
+}
+
+static const struct ip_tunnel_encap_ops mpls_iptun_ops = {
+	.encap_hlen = ipgre_mpls_encap_hlen,
+	.build_header = ipgre_mpls_build_header,
+};
+
+int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+	int ret;
+
+	ret = ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+	if (ret < 0) {
+		pr_err("can't add mplsgre ops\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+	ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 		       struct nlmsghdr *nlh, struct net *net, u32 portid,
 		       unsigned int nlm_flags);
@@ -2486,6 +2521,7 @@ static int __init mpls_init(void)
 		      0);
 	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
 		      mpls_netconf_dump_devconf, 0);
+	ipgre_tunnel_encap_add_mpls_ops();
 	err = 0;
 out:
 	return err;
@@ -2503,6 +2539,7 @@ static void __exit mpls_exit(void)
 	dev_remove_pack(&mpls_packet_type);
 	unregister_netdevice_notifier(&mpls_dev_notifier);
 	unregister_pernet_subsys(&mpls_net_ops);
+	ipgre_tunnel_encap_del_mpls_ops();
 }
 module_exit(mpls_exit);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 1/2] mpls: expose stack entry function
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox
In-Reply-To: <1505985924-12479-1-git-send-email-amine.kherbouche@6wind.com>

Expose the mpls_forward() function so that it can be called from
elsewhere, such as by MPLS over GRE in the next commit.

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
---
 include/linux/mpls.h | 3 +++
 net/mpls/af_mpls.c   | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mpls.h b/include/linux/mpls.h
index 384fb22..d5c7599 100644
--- a/include/linux/mpls.h
+++ b/include/linux/mpls.h
@@ -2,10 +2,13 @@
 #define _LINUX_MPLS_H
 
 #include <uapi/linux/mpls.h>
+#include <linux/netdevice.h>
 
 #define MPLS_TTL_MASK		(MPLS_LS_TTL_MASK >> MPLS_LS_TTL_SHIFT)
 #define MPLS_BOS_MASK		(MPLS_LS_S_MASK >> MPLS_LS_S_SHIFT)
 #define MPLS_TC_MASK		(MPLS_LS_TC_MASK >> MPLS_LS_TC_SHIFT)
 #define MPLS_LABEL_MASK		(MPLS_LS_LABEL_MASK >> MPLS_LS_LABEL_SHIFT)
 
+int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+		 struct packet_type *pt, struct net_device *orig_dev);
 #endif  /* _LINUX_MPLS_H */
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c5b9ce4..36ea2ad 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -307,8 +307,8 @@ static bool mpls_egress(struct net *net, struct mpls_route *rt,
 	return success;
 }
 
-static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
-			struct packet_type *pt, struct net_device *orig_dev)
+int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+		 struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct net *net = dev_net(dev);
 	struct mpls_shim_hdr *hdr;
@@ -442,6 +442,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
+EXPORT_SYMBOL(mpls_forward);
 
 static struct packet_type mpls_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_UC),
-- 
2.1.4

^ permalink raw reply related

* [RFC PATCH 0/0] Introduce MPLS over GRE
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox

This series introduces the MPLS over GRE encapsulation (RFC 4023).

Various applications of MPLS make use of label stacks with multiple
entries.  In some cases, it is possible to replace the top label of
the stack with an IP-based encapsulation, thereby, it is possible for
two LSRs that are adjacent on an LSP to be separated by an IP network,
even if that IP network does not provide MPLS.

An example of configuration:


         node1                LER1                       LER2                node2
        +-----+             +------+                   +------+             +-----+
        |     |             |      |                   |      |             |     |
        |     |             |      |p3  GRE tunnel   p4|      |             |     |
        |     |p1         p2|      +-------------------+      |p5         p6|     |
        |     +-------------+      +-------------------+      +-------------+     |
        |     |10.100.0.0/24|      |                   |      |10.200.0.0/24|     |
        |     |fd00:100::/64|      |  10.125.0.0/24    |      |fd00:200::/64|     |
        |     |             |      |  fd00:125::/64    |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        +-----+             +------+                   +------+             +-----+


		###	node1	###

ip link set p1 up
ip addr add 10.100.0.1/24 dev p1

		###	LER1	###

ip link set p2 up
ip addr add 10.100.0.2/24 dev p2

ip link set p3 up
ip addr add 10.125.0.1/24 dev p3

modprobe mpls_router
sysctl -w net.mpls.conf.p2.input=1
sysctl -w net.mpls.conf.p3.input=1
sysctl -w net.mpls.platform_labels=1000

ip link add gre1 type gre ttl 64 local 10.125.0.1 remote 10.125.0.2 dev p3
ip link set dev gre1 up

ip -M route add 111 as 222 dev gre1
ip -M route add 555 as 666 via inet 10.100.0.1 dev p2

		###	LER2	###

ip link set p5 up
ip addr add 10.200.0.2/24 dev p5

ip link set p4 up
ip addr add 10.125.0.2/24 dev p4

modprobe mpls_router
sysctl -w net.mpls.conf.p4.input=1
sysctl -w net.mpls.conf.p5.input=1
sysctl -w net.mpls.platform_labels=1000

ip link add gre1 type gre ttl 64 local 10.125.0.2 remote 10.125.0.1 dev p4
ip link set dev gre1 up

ip -M route add 444 as 555 dev gre1
ip -M route add 222 as 333 via inet 10.200.0.1 dev p5

		###	node2	###

ip link set p6 up
ip addr add 10.200.0.1/24 dev p6


Now use the following scapy script to forge and send packets from port p1 of node1:

p = Ether(src='de:ed:01:0c:41:09', dst='de:ed:01:2f:3b:ba')
p /= MPLS(s=1, ttl=64, label=111)/Raw(load='\xde')
sendp(p, iface="p1", count=20, inter=0.1)

^ permalink raw reply

* Re: [PATCH net-next 1/5] net: add support for noref skb->sk
From: Paolo Abeni @ 2017-09-21  9:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <1505929295.29839.103.camel@edumazet-glaptop3.roam.corp.google.com>

Hi,

Thank you for looking at it!

On Wed, 2017-09-20 at 10:41 -0700, Eric Dumazet wrote:
> On Wed, 2017-09-20 at 18:54 +0200, Paolo Abeni wrote:
> > Noref sk do not carry a socket refcount, are valid
> > only inside the current RCU section and must be
> > explicitly cleared before exiting such section.
> > 
> > They will be used in a later patch to allow early demux
> > without sock refcounting.
> 
> 
> 
> 
> > +/* dummy destructor used by noref sockets */
> > +void sock_dummyfree(struct sk_buff *skb)
> > +{
> 
> BUG();
> 
> > +}
> > +EXPORT_SYMBOL(sock_dummyfree);
> > +

We can call sock_dummyfree() in legitimate paths, see below, but we can
add a:

WARN_ON_ONCE(!rcu_read_lock_held());

here and in  skb_clear_noref_sk(). That should help much to catch
possible bugs.

> I do not see how you ensure we do not leave RCU section with an skb
> destructor pointing to this sock_dummyfree()
> 
> This patch series looks quite dangerous to me.

The idea is to explicitly clear the sknoref references before leaving
the RCU section. Quite alike what we currently do for dst noref, but
here the only place where we get a noref socket is the socket early
demux, thus the scope of this change is more limited than what we have
with noref dst_entries.

The relevant code is in the next 2 patches; after the demux we preserve
the sknoref only if the skb has a local destination. The UDP socket
will then set the noref on early demux lookup, and the skb will either:

* land on the corresponding UDP socket, the receive function will steal
the sknoref
* be dropped by some nft/iptables target - the dummy destructor is
called
* be forwarded by some nft/iptables target outside the input path; we
clear the skref explicitly in such targets. 

Currently there are a handful of places affected, and we can simplify
the code dropping the early demux result for locally terminated
multicast sockets on a host acting as a multicast router, please see
the comment on the next patch.

> Do we really have real applications using connected UDP sockets and
> wanting very high pps throughput ?

The ultimate goal is to improve the unconnected UDP sockets scenario,
we do actually have use cases for that - DNS servers and VoIP SBCs.

Thanks,

Paolo

^ permalink raw reply

* Re: [PATCH net-next 2/5] net: allow early demux to fetch noref socket
From: Paolo Abeni @ 2017-09-21  9:13 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <db75c6a6872040712a9ab97b0bac04b697c42a4c.1505926196.git.pabeni@redhat.com>

On Wed, 2017-09-20 at 18:54 +0200, Paolo Abeni wrote:
> We must be careful to avoid leaking such sockets outside
> the RCU section containing the early demux call; we clear
> them on nonlocal delivery.
> 
> For ipv4 we must take care of local mcast delivery, too,
> since udp early demux works also for mcast addresses.
> 
> Also update all iptables/nftables extension that can
> happen in the input chain and can transmit the skb outside
> such patch, namely TEE, nft_dup and nfqueue.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/ipv4/ip_input.c              | 12 ++++++++++++
>  net/ipv4/ipmr.c                  | 18 ++++++++++++++----
>  net/ipv4/netfilter/nf_dup_ipv4.c |  3 +++
>  net/ipv6/ip6_input.c             |  7 ++++++-
>  net/ipv6/netfilter/nf_dup_ipv6.c |  3 +++
>  net/netfilter/nf_queue.c         |  3 +++
>  6 files changed, 41 insertions(+), 5 deletions(-)
> 
> diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> index fa2dc8f692c6..e71abc8b698c 100644
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -349,6 +349,18 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>  				__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
>  			goto drop;
>  		}
> +
> +		/* Since the sk has no reference to the socket, we must
> +		 * clear it before escaping this RCU section.
> +		 * The sk is just an hint and we know we are not going to use
> +		 * it outside the input path.
> +		 */
> +		if (skb_dst(skb)->input != ip_local_deliver
> +#ifdef CONFIG_IP_MROUTE
> +		    && skb_dst(skb)->input != ip_mr_input
> +#endif
> +		    )
> +			skb_clear_noref_sk(skb);
>  	}

The above is to allow early demux for multicast sockets even on hosts
acting as multicast routers. This is probably overkill: a host will
probably either act as a multicast router or receive a large amount of
locally terminated mcast traffic.

We can instead preserve the sknoref only for ip_local_deliver(),
dropping the early demux optimization in the above scenario, which
should not be very relevant. Will simplify the above chunk and drop the
need for the ipmr.c changes below; overall this patch will become much
simpler.

Paolo

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-21  9:06 UTC (permalink / raw)
  To: Eric Dumazet, Wei Wang
  Cc: Cong Wang, Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <1505956639.29839.108.camel@edumazet-glaptop3.roam.corp.google.com>



W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
> On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
>>> Thanks very much Pawel for the feedback.
>>>
>>> I was looking into the code (specifically IPv4 part) and found that in
>>> free_fib_info_rcu(), we call free_nh_exceptions() without holding the
>>> fnhe_lock. I am wondering if that could cause some race condition on
>>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
>>> same dst could be happening.
>>>
>>> But as we call free_fib_info_rcu() only after the grace period, and
>>> the lookup code which could potentially modify
>>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
>>> fine...
>>>
>> Hi Pawel,
>>
>> Could you try the following debug patch on top of net-next branch and
>> reproduce the issue check if there are warning msg showing?
>>
>> diff --git a/include/net/dst.h b/include/net/dst.h
>> index 93568bd0a352..82aff41c6f63 100644
>> --- a/include/net/dst.h
>> +++ b/include/net/dst.h
>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
>> *dst, unsigned long time)
>>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>   {
>>          if (dst)
>> -               atomic_inc(&dst->__refcnt);
>> +               dst_hold(dst);
>>          return dst;
>>   }
>>
>> Thanks.
>> Wei
>>
>
> Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
> unified  (to the 'safe' version)
>
> We no longer have gc to protect from 0 -> 1 transition of dst refcount.
>
>
>
>

After adding patch from Wei
https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14

^ permalink raw reply

* Re: [PATCH net-next 0/4] cxgb4: add support to offload tc flower
From: Jiri Pirko @ 2017-09-21  8:56 UTC (permalink / raw)
  To: Rahul Lakkireddy; +Cc: netdev, davem, kumaras, ganeshgr, nirranjan, indranil
In-Reply-To: <cover.1505977744.git.rahul.lakkireddy@chelsio.com>

Thu, Sep 21, 2017 at 09:33:33AM CEST, rahul.lakkireddy@chelsio.com wrote:
>This series of patches add support to offload tc flower onto Chelsio
>NICs.
>
>Patch 1 adds basic skeleton to prepare for offloading tc flower flows.
>
>Patch 2 adds support to add/remove flows for offload.  Flows can have
>accompanying masks.  Following match and action are currently supported
>for offload:
>Match:  ether-protocol, IPv4/IPv6 addresses, L4 ports (TCP/UDP)
>Action: drop, redirect to another port on the device.
>
>Patch 3 adds support to offload tc-flower flows having
>vlan actions: pop, push, and modify.
>
>Patch 4 adds support to fetch stats for the offloaded tc flower flows
>from hardware.
>
>Support for offloading more match and action types are to be followed
>in subsequent series.

Looks good to me. Thanks!

^ permalink raw reply

* Re: [PATCH net-next 3/4] cxgb4: add support to offload action vlan
From: Jiri Pirko @ 2017-09-21  8:55 UTC (permalink / raw)
  To: Rahul Lakkireddy; +Cc: netdev, davem, kumaras, ganeshgr, nirranjan, indranil
In-Reply-To: <016c3bf21a7bfe45e73275d3191cf61cceffd362.1505977744.git.rahul.lakkireddy@chelsio.com>

Thu, Sep 21, 2017 at 09:33:36AM CEST, rahul.lakkireddy@chelsio.com wrote:
>From: Kumar Sanghvi <kumaras@chelsio.com>
>
>Add support for offloading tc-flower flows having
>vlan actions: pop, push and modify.
>
>Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
>Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
>Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
>---
> .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 43 ++++++++++++++++++++++
> 1 file changed, 43 insertions(+)
>
>diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c

[...]


>+			switch (vlan_action) {
>+			case TCA_VLAN_ACT_POP:
>+				break;
>+			case TCA_VLAN_ACT_PUSH:
>+			case TCA_VLAN_ACT_MODIFY:
>+				if (proto != ETH_P_8021Q) {
>+					netdev_err(dev,
>+						   "%s: Unsupp. vlan proto\n",

Don't wrap this. Also "Unsupp." vs "Unsupported". Please be consistent.


>+						   __func__);
>+					return -EOPNOTSUPP;
>+				}
>+				break;
>+			default:
>+				netdev_err(dev, "%s: Unsupported vlan action\n",
>+					   __func__);
>+				return -EOPNOTSUPP;
>+			}
> 		} else {
> 			netdev_err(dev, "%s: Unsupported action\n", __func__);
> 			return -EOPNOTSUPP;
>-- 
>2.14.1
>

^ permalink raw reply

* [PATCH iproute2 master 2/2] bpf: properly output json for xdp
From: Daniel Borkmann @ 2017-09-21  8:42 UTC (permalink / raw)
  To: stephen; +Cc: ast, netdev, Daniel Borkmann
In-Reply-To: <cover.1505956723.git.daniel@iogearbox.net>

After merging net-next branch into master, Stephen asked
to fix up json dump for XDP. Thus, rework the json dump a
bit, such that 'ip -json l' looks as below.

  [{
        "ifindex": 1,
        "ifname": "lo",
        "flags": ["LOOPBACK","UP","LOWER_UP"],
        "mtu": 65536,
        "xdp": {
            "mode": 2,
            "prog": {
                "id": 5,
                "tag": "e1e9d0ec0f55d638",
                "jited": 1
            }
        },
        "qdisc": "noqueue",
        "operstate": "UNKNOWN",
        "linkmode": "DEFAULT",
        "group": "default",
        "txqlen": 1000,
        "link_type": "loopback",
        "address": "00:00:00:00:00:00",
        "broadcast": "00:00:00:00:00:00"
    },[...]
  ]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 ip/iplink_xdp.c | 74 ++++++++++++++++++++++++++++++++++-----------------------
 lib/bpf.c       | 19 ++++++++++-----
 2 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/ip/iplink_xdp.c b/ip/iplink_xdp.c
index 71f7798..2d2953a 100644
--- a/ip/iplink_xdp.c
+++ b/ip/iplink_xdp.c
@@ -14,9 +14,9 @@
 
 #include <linux/bpf.h>
 
+#include "json_print.h"
 #include "xdp.h"
 #include "bpf_util.h"
-#include "ip_common.h"
 
 extern int force;
 
@@ -82,6 +82,22 @@ int xdp_parse(int *argc, char ***argv, struct iplink_req *req, bool generic,
 	return 0;
 }
 
+static void xdp_dump_json(struct rtattr *tb[IFLA_XDP_MAX + 1])
+{
+	__u32 prog_id = 0;
+	__u8 mode;
+
+	mode = rta_getattr_u8(tb[IFLA_XDP_ATTACHED]);
+	if (tb[IFLA_XDP_PROG_ID])
+		prog_id = rta_getattr_u32(tb[IFLA_XDP_PROG_ID]);
+
+	open_json_object("xdp");
+	print_uint(PRINT_JSON, "mode", NULL, mode);
+	if (prog_id)
+		bpf_dump_prog_info(NULL, prog_id);
+	close_json_object();
+}
+
 void xdp_dump(FILE *fp, struct rtattr *xdp, bool link, bool details)
 {
 	struct rtattr *tb[IFLA_XDP_MAX + 1];
@@ -94,34 +110,32 @@ void xdp_dump(FILE *fp, struct rtattr *xdp, bool link, bool details)
 		return;
 
 	mode = rta_getattr_u8(tb[IFLA_XDP_ATTACHED]);
-	if (is_json_context()) {
-		print_uint(PRINT_JSON, "attached", NULL, mode);
-	} else {
-		if (mode == XDP_ATTACHED_NONE)
-			return;
-		else if (details && link)
-			fprintf(fp, "%s    prog/xdp", _SL_);
-		else if (mode == XDP_ATTACHED_DRV)
-			fprintf(fp, "xdp");
-		else if (mode == XDP_ATTACHED_SKB)
-			fprintf(fp, "xdpgeneric");
-		else if (mode == XDP_ATTACHED_HW)
-			fprintf(fp, "xdpoffload");
-		else
-			fprintf(fp, "xdp[%u]", mode);
-
-		if (tb[IFLA_XDP_PROG_ID])
-			prog_id = rta_getattr_u32(tb[IFLA_XDP_PROG_ID]);
-		if (!details) {
-			if (prog_id && !link)
-				fprintf(fp, "/id:%u", prog_id);
-			fprintf(fp, " ");
-			return;
-		}
-
-		if (prog_id) {
-			fprintf(fp, " ");
-			bpf_dump_prog_info(fp, prog_id);
-		}
+	if (mode == XDP_ATTACHED_NONE)
+		return;
+	else if (is_json_context())
+		return details ? (void)0 : xdp_dump_json(tb);
+	else if (details && link)
+		fprintf(fp, "%s    prog/xdp", _SL_);
+	else if (mode == XDP_ATTACHED_DRV)
+		fprintf(fp, "xdp");
+	else if (mode == XDP_ATTACHED_SKB)
+		fprintf(fp, "xdpgeneric");
+	else if (mode == XDP_ATTACHED_HW)
+		fprintf(fp, "xdpoffload");
+	else
+		fprintf(fp, "xdp[%u]", mode);
+
+	if (tb[IFLA_XDP_PROG_ID])
+		prog_id = rta_getattr_u32(tb[IFLA_XDP_PROG_ID]);
+	if (!details) {
+		if (prog_id && !link)
+			fprintf(fp, "/id:%u", prog_id);
+		fprintf(fp, " ");
+		return;
+	}
+
+	if (prog_id) {
+		fprintf(fp, " ");
+		bpf_dump_prog_info(fp, prog_id);
 	}
 }
diff --git a/lib/bpf.c b/lib/bpf.c
index cfa1f79..10ea23a 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -40,6 +40,7 @@
 #include <arpa/inet.h>
 
 #include "utils.h"
+#include "json_print.h"
 
 #include "bpf_util.h"
 #include "bpf_elf.h"
@@ -186,23 +187,29 @@ int bpf_dump_prog_info(FILE *f, uint32_t id)
 	int fd, ret, dump_ok = 0;
 	SPRINT_BUF(tmp);
 
-	fprintf(f, "id %u ", id);
+	open_json_object("prog");
+	print_uint(PRINT_ANY, "id", "id %u ", id);
 
 	fd = bpf_prog_fd_by_id(id);
 	if (fd < 0)
-		return dump_ok;
+		goto out;
 
 	ret = bpf_prog_info_by_fd(fd, &info, &len);
 	if (!ret && len) {
-		fprintf(f, "tag %s ",
-			hexstring_n2a(info.tag, sizeof(info.tag),
-				      tmp, sizeof(tmp)));
-		if (info.jited_prog_len)
+		int jited = !!info.jited_prog_len;
+
+		print_string(PRINT_ANY, "tag", "tag %s ",
+			     hexstring_n2a(info.tag, sizeof(info.tag),
+					   tmp, sizeof(tmp)));
+		print_uint(PRINT_JSON, "jited", NULL, jited);
+		if (jited && !is_json_context())
 			fprintf(f, "jited ");
 		dump_ok = 1;
 	}
 
 	close(fd);
+out:
+	close_json_object();
 	return dump_ok;
 }
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH iproute2 master 1/2] json: move json printer to common library
From: Daniel Borkmann @ 2017-09-21  8:42 UTC (permalink / raw)
  To: stephen; +Cc: ast, netdev, Daniel Borkmann
In-Reply-To: <cover.1505956723.git.daniel@iogearbox.net>

Move the json printer which is based on json writer into the
iproute2 library, so it can be used by library code and tools
other than ip. This should probably have been done that way from the
beginning, given that the json writer is already in the library anyway.
No functional changes.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/json_print.h |  71 ++++++++++++++++
 ip/Makefile          |   2 +-
 ip/ip_common.h       |  65 ++------------
 ip/ip_print.c        | 233 ---------------------------------------------------
 lib/Makefile         |   2 +-
 lib/json_print.c     | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 312 insertions(+), 292 deletions(-)
 create mode 100644 include/json_print.h
 delete mode 100644 ip/ip_print.c
 create mode 100644 lib/json_print.c

diff --git a/include/json_print.h b/include/json_print.h
new file mode 100644
index 0000000..44cf5ac
--- /dev/null
+++ b/include/json_print.h
@@ -0,0 +1,71 @@
+/*
+ * json_print.h		"print regular or json output, based on json_writer".
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Julien Fortin, <julien@cumulusnetworks.com>
+ */
+
+#ifndef _JSON_PRINT_H_
+#define _JSON_PRINT_H_
+
+#include "json_writer.h"
+#include "color.h"
+
+json_writer_t *get_json_writer(void);
+
+/*
+ * use:
+ *      - PRINT_ANY for context based output
+ *      - PRINT_FP for non json specific output
+ *      - PRINT_JSON for json specific output
+ */
+enum output_type {
+	PRINT_FP = 1,
+	PRINT_JSON = 2,
+	PRINT_ANY = 4,
+};
+
+void new_json_obj(int json, FILE *fp);
+void delete_json_obj(void);
+
+bool is_json_context(void);
+
+void set_current_fp(FILE *fp);
+
+void fflush_fp(void);
+
+void open_json_object(const char *str);
+void close_json_object(void);
+void open_json_array(enum output_type type, const char *delim);
+void close_json_array(enum output_type type, const char *delim);
+
+#define _PRINT_FUNC(type_name, type)					\
+	void print_color_##type_name(enum output_type t,		\
+				     enum color_attr color,		\
+				     const char *key,			\
+				     const char *fmt,			\
+				     type value);			\
+									\
+	static inline void print_##type_name(enum output_type t,	\
+					     const char *key,		\
+					     const char *fmt,		\
+					     type value)		\
+	{								\
+		print_color_##type_name(t, -1, key, fmt, value);	\
+	}
+_PRINT_FUNC(int, int);
+_PRINT_FUNC(bool, bool);
+_PRINT_FUNC(null, const char*);
+_PRINT_FUNC(string, const char*);
+_PRINT_FUNC(uint, uint64_t);
+_PRINT_FUNC(hu, unsigned short);
+_PRINT_FUNC(hex, unsigned int);
+_PRINT_FUNC(0xhex, unsigned int);
+_PRINT_FUNC(lluint, unsigned long long int);
+#undef _PRINT_FUNC
+
+#endif /* _JSON_PRINT_H_ */
diff --git a/ip/Makefile b/ip/Makefile
index 52c9a2e..5a1c7ad 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -9,7 +9,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
     iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
     iplink_geneve.o iplink_vrf.o iproute_lwtunnel.o ipmacsec.o ipila.o \
-    ipvrf.o iplink_xstats.o ipseg6.o ip_print.o
+    ipvrf.o iplink_xstats.o ipseg6.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip_common.h b/ip/ip_common.h
index efc789c..4b8b0a7 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -1,3 +1,10 @@
+#ifndef _IP_COMMON_H_
+#define _IP_COMMON_H_
+
+#include <stdbool.h>
+
+#include "json_print.h"
+
 struct link_filter {
 	int ifindex;
 	int family;
@@ -101,8 +108,6 @@ static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
 
 extern struct rtnl_handle rth;
 
-#include <stdbool.h>
-
 struct link_util {
 	struct link_util	*next;
 	const char		*id;
@@ -141,58 +146,4 @@ int name_is_vrf(const char *name);
 
 void print_num(FILE *fp, unsigned int width, uint64_t count);
 
-#include "json_writer.h"
-
-json_writer_t   *get_json_writer(void);
-/*
- * use:
- *      - PRINT_ANY for context based output
- *      - PRINT_FP for non json specific output
- *      - PRINT_JSON for json specific output
- */
-enum output_type {
-	PRINT_FP = 1,
-	PRINT_JSON = 2,
-	PRINT_ANY = 4,
-};
-
-void new_json_obj(int json, FILE *fp);
-void delete_json_obj(void);
-
-bool is_json_context(void);
-
-void set_current_fp(FILE *fp);
-
-void fflush_fp(void);
-
-void open_json_object(const char *str);
-void close_json_object(void);
-void open_json_array(enum output_type type, const char *delim);
-void close_json_array(enum output_type type, const char *delim);
-
-#include "color.h"
-
-#define _PRINT_FUNC(type_name, type)					\
-	void print_color_##type_name(enum output_type t,		\
-				     enum color_attr color,		\
-				     const char *key,			\
-				     const char *fmt,			\
-				     type value);			\
-									\
-	static inline void print_##type_name(enum output_type t,	\
-					     const char *key,		\
-					     const char *fmt,		\
-					     type value)		\
-	{								\
-		print_color_##type_name(t, -1, key, fmt, value);	\
-	}
-_PRINT_FUNC(int, int);
-_PRINT_FUNC(bool, bool);
-_PRINT_FUNC(null, const char*);
-_PRINT_FUNC(string, const char*);
-_PRINT_FUNC(uint, uint64_t);
-_PRINT_FUNC(hu, unsigned short);
-_PRINT_FUNC(hex, unsigned int);
-_PRINT_FUNC(0xhex, unsigned int);
-_PRINT_FUNC(lluint, unsigned long long int);
-#undef _PRINT_FUNC
+#endif /* _IP_COMMON_H_ */
diff --git a/ip/ip_print.c b/ip/ip_print.c
deleted file mode 100644
index 4cd6a0b..0000000
--- a/ip/ip_print.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * ip_print.c          "ip print regular or json output".
- *
- *             This program is free software; you can redistribute it and/or
- *             modify it under the terms of the GNU General Public License
- *             as published by the Free Software Foundation; either version
- *             2 of the License, or (at your option) any later version.
- *
- * Authors:    Julien Fortin, <julien@cumulusnetworks.com>
- *
- */
-
-#include <stdarg.h>
-#include <stdio.h>
-
-#include "utils.h"
-#include "ip_common.h"
-#include "json_writer.h"
-
-static json_writer_t *_jw;
-static FILE *_fp;
-
-#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
-#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
-
-void new_json_obj(int json, FILE *fp)
-{
-	if (json) {
-		_jw = jsonw_new(fp);
-		if (!_jw) {
-			perror("json object");
-			exit(1);
-		}
-		jsonw_pretty(_jw, true);
-		jsonw_start_array(_jw);
-	}
-	set_current_fp(fp);
-}
-
-void delete_json_obj(void)
-{
-	if (_jw) {
-		jsonw_end_array(_jw);
-		jsonw_destroy(&_jw);
-	}
-}
-
-bool is_json_context(void)
-{
-	return _jw != NULL;
-}
-
-void set_current_fp(FILE *fp)
-{
-	if (!fp) {
-		fprintf(stderr, "Error: invalid file pointer.\n");
-		exit(1);
-	}
-	_fp = fp;
-}
-
-json_writer_t *get_json_writer(void)
-{
-	return _jw;
-}
-
-void open_json_object(const char *str)
-{
-	if (_IS_JSON_CONTEXT(PRINT_JSON)) {
-		if (str)
-			jsonw_name(_jw, str);
-		jsonw_start_object(_jw);
-	}
-}
-
-void close_json_object(void)
-{
-	if (_IS_JSON_CONTEXT(PRINT_JSON))
-		jsonw_end_object(_jw);
-}
-
-/*
- * Start json array or string array using
- * the provided string as json key (if not null)
- * or as array delimiter in non-json context.
- */
-void open_json_array(enum output_type type, const char *str)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		if (str)
-			jsonw_name(_jw, str);
-		jsonw_start_array(_jw);
-	} else if (_IS_FP_CONTEXT(type)) {
-		fprintf(_fp, "%s", str);
-	}
-}
-
-/*
- * End json array or string array
- */
-void close_json_array(enum output_type type, const char *str)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		jsonw_pretty(_jw, false);
-		jsonw_end_array(_jw);
-		jsonw_pretty(_jw, true);
-	} else if (_IS_FP_CONTEXT(type)) {
-		fprintf(_fp, "%s", str);
-	}
-}
-
-/*
- * pre-processor directive to generate similar
- * functions handling different types
- */
-#define _PRINT_FUNC(type_name, type)					\
-	void print_color_##type_name(enum output_type t,		\
-				     enum color_attr color,		\
-				     const char *key,			\
-				     const char *fmt,			\
-				     type value)			\
-	{								\
-		if (_IS_JSON_CONTEXT(t)) {				\
-			if (!key)					\
-				jsonw_##type_name(_jw, value);		\
-			else						\
-				jsonw_##type_name##_field(_jw, key, value); \
-		} else if (_IS_FP_CONTEXT(t)) {				\
-			color_fprintf(_fp, color, fmt, value);          \
-		}							\
-	}
-_PRINT_FUNC(int, int);
-_PRINT_FUNC(hu, unsigned short);
-_PRINT_FUNC(uint, uint64_t);
-_PRINT_FUNC(lluint, unsigned long long int);
-#undef _PRINT_FUNC
-
-void print_color_string(enum output_type type,
-			enum color_attr color,
-			const char *key,
-			const char *fmt,
-			const char *value)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		if (key && !value)
-			jsonw_name(_jw, key);
-		else if (!key && value)
-			jsonw_string(_jw, value);
-		else
-			jsonw_string_field(_jw, key, value);
-	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value);
-	}
-}
-
-/*
- * value's type is bool. When using this function in FP context you can't pass
- * a value to it, you will need to use "is_json_context()" to have different
- * branch for json and regular output. grep -r "print_bool" for example
- */
-void print_color_bool(enum output_type type,
-		      enum color_attr color,
-		      const char *key,
-		      const char *fmt,
-		      bool value)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		if (key)
-			jsonw_bool_field(_jw, key, value);
-		else
-			jsonw_bool(_jw, value);
-	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value ? "true" : "false");
-	}
-}
-
-/*
- * In JSON context uses hardcode %#x format: 42 -> 0x2a
- */
-void print_color_0xhex(enum output_type type,
-		       enum color_attr color,
-		       const char *key,
-		       const char *fmt,
-		       unsigned int hex)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		SPRINT_BUF(b1);
-
-		snprintf(b1, sizeof(b1), "%#x", hex);
-		print_string(PRINT_JSON, key, NULL, b1);
-	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, hex);
-	}
-}
-
-void print_color_hex(enum output_type type,
-		     enum color_attr color,
-		     const char *key,
-		     const char *fmt,
-		     unsigned int hex)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		SPRINT_BUF(b1);
-
-		snprintf(b1, sizeof(b1), "%x", hex);
-		if (key)
-			jsonw_string_field(_jw, key, b1);
-		else
-			jsonw_string(_jw, b1);
-	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, hex);
-	}
-}
-
-/*
- * In JSON context we don't use the argument "value" we simply call jsonw_null
- * whereas FP context can use "value" to output anything
- */
-void print_color_null(enum output_type type,
-		      enum color_attr color,
-		      const char *key,
-		      const char *fmt,
-		      const char *value)
-{
-	if (_IS_JSON_CONTEXT(type)) {
-		if (key)
-			jsonw_null_field(_jw, key);
-		else
-			jsonw_null(_jw);
-	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value);
-	}
-}
diff --git a/lib/Makefile b/lib/Makefile
index 5e9f72f..0fbdf4c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -3,7 +3,7 @@ include ../config.mk
 CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o \
-	inet_proto.o namespace.o json_writer.o \
+	inet_proto.o namespace.o json_writer.o json_print.o \
 	names.o color.o bpf.o exec.o fs.o
 
 NLOBJ=libgenl.o ll_map.o libnetlink.o
diff --git a/lib/json_print.c b/lib/json_print.c
new file mode 100644
index 0000000..93b4119
--- /dev/null
+++ b/lib/json_print.c
@@ -0,0 +1,231 @@
+/*
+ * json_print.c		"print regular or json output, based on json_writer".
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Julien Fortin, <julien@cumulusnetworks.com>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "utils.h"
+#include "json_print.h"
+
+static json_writer_t *_jw;
+static FILE *_fp;
+
+#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
+#define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
+
+void new_json_obj(int json, FILE *fp)
+{
+	if (json) {
+		_jw = jsonw_new(fp);
+		if (!_jw) {
+			perror("json object");
+			exit(1);
+		}
+		jsonw_pretty(_jw, true);
+		jsonw_start_array(_jw);
+	}
+	set_current_fp(fp);
+}
+
+void delete_json_obj(void)
+{
+	if (_jw) {
+		jsonw_end_array(_jw);
+		jsonw_destroy(&_jw);
+	}
+}
+
+bool is_json_context(void)
+{
+	return _jw != NULL;
+}
+
+void set_current_fp(FILE *fp)
+{
+	if (!fp) {
+		fprintf(stderr, "Error: invalid file pointer.\n");
+		exit(1);
+	}
+	_fp = fp;
+}
+
+json_writer_t *get_json_writer(void)
+{
+	return _jw;
+}
+
+void open_json_object(const char *str)
+{
+	if (_IS_JSON_CONTEXT(PRINT_JSON)) {
+		if (str)
+			jsonw_name(_jw, str);
+		jsonw_start_object(_jw);
+	}
+}
+
+void close_json_object(void)
+{
+	if (_IS_JSON_CONTEXT(PRINT_JSON))
+		jsonw_end_object(_jw);
+}
+
+/*
+ * Start json array or string array using
+ * the provided string as json key (if not null)
+ * or as array delimiter in non-json context.
+ */
+void open_json_array(enum output_type type, const char *str)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		if (str)
+			jsonw_name(_jw, str);
+		jsonw_start_array(_jw);
+	} else if (_IS_FP_CONTEXT(type)) {
+		fprintf(_fp, "%s", str);
+	}
+}
+
+/*
+ * End json array or string array
+ */
+void close_json_array(enum output_type type, const char *str)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		jsonw_pretty(_jw, false);
+		jsonw_end_array(_jw);
+		jsonw_pretty(_jw, true);
+	} else if (_IS_FP_CONTEXT(type)) {
+		fprintf(_fp, "%s", str);
+	}
+}
+
+/*
+ * pre-processor directive to generate similar
+ * functions handling different types
+ */
+#define _PRINT_FUNC(type_name, type)					\
+	void print_color_##type_name(enum output_type t,		\
+				     enum color_attr color,		\
+				     const char *key,			\
+				     const char *fmt,			\
+				     type value)			\
+	{								\
+		if (_IS_JSON_CONTEXT(t)) {				\
+			if (!key)					\
+				jsonw_##type_name(_jw, value);		\
+			else						\
+				jsonw_##type_name##_field(_jw, key, value); \
+		} else if (_IS_FP_CONTEXT(t)) {				\
+			color_fprintf(_fp, color, fmt, value);          \
+		}							\
+	}
+_PRINT_FUNC(int, int);
+_PRINT_FUNC(hu, unsigned short);
+_PRINT_FUNC(uint, uint64_t);
+_PRINT_FUNC(lluint, unsigned long long int);
+#undef _PRINT_FUNC
+
+void print_color_string(enum output_type type,
+			enum color_attr color,
+			const char *key,
+			const char *fmt,
+			const char *value)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		if (key && !value)
+			jsonw_name(_jw, key);
+		else if (!key && value)
+			jsonw_string(_jw, value);
+		else
+			jsonw_string_field(_jw, key, value);
+	} else if (_IS_FP_CONTEXT(type)) {
+		color_fprintf(_fp, color, fmt, value);
+	}
+}
+
+/*
+ * value's type is bool. When using this function in FP context you can't pass
+ * a value to it, you will need to use "is_json_context()" to have different
+ * branch for json and regular output. grep -r "print_bool" for example
+ */
+void print_color_bool(enum output_type type,
+		      enum color_attr color,
+		      const char *key,
+		      const char *fmt,
+		      bool value)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		if (key)
+			jsonw_bool_field(_jw, key, value);
+		else
+			jsonw_bool(_jw, value);
+	} else if (_IS_FP_CONTEXT(type)) {
+		color_fprintf(_fp, color, fmt, value ? "true" : "false");
+	}
+}
+
+/*
+ * In JSON context uses hardcode %#x format: 42 -> 0x2a
+ */
+void print_color_0xhex(enum output_type type,
+		       enum color_attr color,
+		       const char *key,
+		       const char *fmt,
+		       unsigned int hex)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		SPRINT_BUF(b1);
+
+		snprintf(b1, sizeof(b1), "%#x", hex);
+		print_string(PRINT_JSON, key, NULL, b1);
+	} else if (_IS_FP_CONTEXT(type)) {
+		color_fprintf(_fp, color, fmt, hex);
+	}
+}
+
+void print_color_hex(enum output_type type,
+		     enum color_attr color,
+		     const char *key,
+		     const char *fmt,
+		     unsigned int hex)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		SPRINT_BUF(b1);
+
+		snprintf(b1, sizeof(b1), "%x", hex);
+		if (key)
+			jsonw_string_field(_jw, key, b1);
+		else
+			jsonw_string(_jw, b1);
+	} else if (_IS_FP_CONTEXT(type)) {
+		color_fprintf(_fp, color, fmt, hex);
+	}
+}
+
+/*
+ * In JSON context we don't use the argument "value" we simply call jsonw_null
+ * whereas FP context can use "value" to output anything
+ */
+void print_color_null(enum output_type type,
+		      enum color_attr color,
+		      const char *key,
+		      const char *fmt,
+		      const char *value)
+{
+	if (_IS_JSON_CONTEXT(type)) {
+		if (key)
+			jsonw_null_field(_jw, key);
+		else
+			jsonw_null(_jw);
+	} else if (_IS_FP_CONTEXT(type)) {
+		color_fprintf(_fp, color, fmt, value);
+	}
+}
-- 
1.9.3

^ permalink raw reply related

* [PATCH iproute2 master 0/2] BPF/XDP json follow-up
From: Daniel Borkmann @ 2017-09-21  8:42 UTC (permalink / raw)
  To: stephen; +Cc: ast, netdev, Daniel Borkmann

After merging net-next branch into master, Stephen asked to
fix up json dump for XDP as there were some merge conflicts,
so here it is.

Thanks!

Daniel Borkmann (2):
  json: move json printer to common library
  bpf: properly output json for xdp

 include/json_print.h |  71 ++++++++++++++++
 ip/Makefile          |   2 +-
 ip/ip_common.h       |  65 ++------------
 ip/ip_print.c        | 233 ---------------------------------------------------
 ip/iplink_xdp.c      |  74 +++++++++-------
 lib/Makefile         |   2 +-
 lib/bpf.c            |  19 +++--
 lib/json_print.c     | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 369 insertions(+), 328 deletions(-)
 create mode 100644 include/json_print.h
 delete mode 100644 ip/ip_print.c
 create mode 100644 lib/json_print.c

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next 4/4] cxgb4: fetch stats for offloaded tc flower flows
From: Rahul Lakkireddy @ 2017-09-21  7:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, kumaras, ganeshgr, nirranjan, indranil, Rahul Lakkireddy
In-Reply-To: <cover.1505977744.git.rahul.lakkireddy@chelsio.com>

From: Kumar Sanghvi <kumaras@chelsio.com>

Add support to retrieve stats from hardware for offloaded tc flower
flows.  Also, poll for the stats of offloaded flows via timer callback.

Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  | 76 +++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  1 +
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 79 +++++++++++++++++++++-
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h     |  2 +
 6 files changed, 161 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 26eac599ab2c..8a94d97df025 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -907,6 +907,7 @@ struct adapter {
 
 	/* TC flower offload */
 	DECLARE_HASHTABLE(flower_anymatch_tbl, 9);
+	struct timer_list flower_stats_timer;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 07a4619e2164..c09c4de8c9fb 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -148,6 +148,82 @@ static int get_filter_steerq(struct net_device *dev,
 	return iq;
 }
 
+static int get_filter_count(struct adapter *adapter, unsigned int fidx,
+			    u64 *pkts, u64 *bytes)
+{
+	unsigned int tcb_base, tcbaddr;
+	unsigned int word_offset;
+	struct filter_entry *f;
+	__be64 be64_byte_count;
+	int ret;
+
+	tcb_base = t4_read_reg(adapter, TP_CMM_TCB_BASE_A);
+	if ((fidx != (adapter->tids.nftids + adapter->tids.nsftids - 1)) &&
+	    fidx >= adapter->tids.nftids)
+		return -E2BIG;
+
+	f = &adapter->tids.ftid_tab[fidx];
+	if (!f->valid)
+		return -EINVAL;
+
+	tcbaddr = tcb_base + f->tid * TCB_SIZE;
+
+	spin_lock(&adapter->win0_lock);
+	if (is_t4(adapter->params.chip)) {
+		__be64 be64_count;
+
+		/* T4 doesn't maintain byte counts in hw */
+		*bytes = 0;
+
+		/* Get pkts */
+		word_offset = 4;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be64_count),
+				   (__be32 *)&be64_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*pkts = be64_to_cpu(be64_count);
+	} else {
+		__be32 be32_count;
+
+		/* Get bytes */
+		word_offset = 4;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be64_byte_count),
+				   &be64_byte_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*bytes = be64_to_cpu(be64_byte_count);
+
+		/* Get pkts */
+		word_offset = 6;
+		ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0,
+				   tcbaddr + (word_offset * sizeof(__be32)),
+				   sizeof(be32_count),
+				   &be32_count,
+				   T4_MEMORY_READ);
+		if (ret < 0)
+			goto out;
+		*pkts = (u64)be32_to_cpu(be32_count);
+	}
+
+out:
+	spin_unlock(&adapter->win0_lock);
+	return ret;
+}
+
+int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx,
+			      u64 *hitcnt, u64 *bytecnt)
+{
+	struct adapter *adapter = netdev2adap(dev);
+
+	return get_filter_count(adapter, fidx, hitcnt, bytecnt);
+}
+
 int cxgb4_get_free_ftid(struct net_device *dev, int family)
 {
 	struct adapter *adap = netdev2adap(dev);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 3ba4e1ff8486..d634098d52ab 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4637,6 +4637,7 @@ static void free_some_resources(struct adapter *adapter)
 	kvfree(adapter->l2t);
 	t4_cleanup_sched(adapter);
 	kvfree(adapter->tids.tid_tab);
+	cxgb4_cleanup_tc_flower(adapter);
 	cxgb4_cleanup_tc_u32(adapter);
 	kfree(adapter->sge.egr_map);
 	kfree(adapter->sge.ingr_map);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index fddb0c419edc..7a47d4e88a57 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -39,9 +39,12 @@
 #include "cxgb4.h"
 #include "cxgb4_tc_flower.h"
 
+#define STATS_CHECK_PERIOD (HZ / 2)
+
 static struct ch_tc_flower_entry *allocate_flower_entry(void)
 {
 	struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL);
+	spin_lock_init(&new->lock);
 	return new;
 }
 
@@ -369,13 +372,87 @@ int cxgb4_tc_flower_destroy(struct net_device *dev,
 	return ret;
 }
 
+void ch_flower_stats_cb(unsigned long data)
+{
+	struct adapter *adap = (struct adapter *)data;
+	struct ch_tc_flower_entry *flower_entry;
+	struct ch_tc_flower_stats *ofld_stats;
+	unsigned int i;
+	u64 packets;
+	u64 bytes;
+	int ret;
+
+	rcu_read_lock();
+	hash_for_each_rcu(adap->flower_anymatch_tbl, i, flower_entry, link) {
+		ret = cxgb4_get_filter_counters(adap->port[0],
+						flower_entry->filter_id,
+						&packets, &bytes);
+		if (!ret) {
+			spin_lock(&flower_entry->lock);
+			ofld_stats = &flower_entry->stats;
+
+			if (ofld_stats->prev_packet_count != packets) {
+				ofld_stats->prev_packet_count = packets;
+				ofld_stats->last_used = jiffies;
+			}
+			spin_unlock(&flower_entry->lock);
+		}
+	}
+	rcu_read_unlock();
+	mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
+}
+
 int cxgb4_tc_flower_stats(struct net_device *dev,
 			  struct tc_cls_flower_offload *cls)
 {
-	return -EOPNOTSUPP;
+	struct adapter *adap = netdev2adap(dev);
+	struct ch_tc_flower_stats *ofld_stats;
+	struct ch_tc_flower_entry *ch_flower;
+	u64 packets;
+	u64 bytes;
+	int ret;
+
+	ch_flower = ch_flower_lookup(adap, cls->cookie);
+	if (!ch_flower) {
+		ret = -ENOENT;
+		goto err;
+	}
+
+	ret = cxgb4_get_filter_counters(dev, ch_flower->filter_id,
+					&packets, &bytes);
+	if (ret < 0)
+		goto err;
+
+	spin_lock_bh(&ch_flower->lock);
+	ofld_stats = &ch_flower->stats;
+	if (ofld_stats->packet_count != packets) {
+		if (ofld_stats->prev_packet_count != packets)
+			ofld_stats->last_used = jiffies;
+		tcf_exts_stats_update(cls->exts, bytes - ofld_stats->byte_count,
+				      packets - ofld_stats->packet_count,
+				      ofld_stats->last_used);
+
+		ofld_stats->packet_count = packets;
+		ofld_stats->byte_count = bytes;
+		ofld_stats->prev_packet_count = packets;
+	}
+	spin_unlock_bh(&ch_flower->lock);
+	return 0;
+
+err:
+	return ret;
 }
 
 void cxgb4_init_tc_flower(struct adapter *adap)
 {
 	hash_init(adap->flower_anymatch_tbl);
+	setup_timer(&adap->flower_stats_timer, ch_flower_stats_cb,
+		    (unsigned long)adap);
+	mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD);
+}
+
+void cxgb4_cleanup_tc_flower(struct adapter *adap)
+{
+	if (adap->flower_stats_timer.function)
+		del_timer_sync(&adap->flower_stats_timer);
 }
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
index 6145a9e056eb..604feffc752e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
@@ -38,6 +38,7 @@
 #include <net/pkt_cls.h>
 
 struct ch_tc_flower_stats {
+	u64 prev_packet_count;
 	u64 packet_count;
 	u64 byte_count;
 	u64 last_used;
@@ -49,6 +50,7 @@ struct ch_tc_flower_entry {
 	unsigned long tc_flower_cookie;
 	struct hlist_node link;
 	struct rcu_head rcu;
+	spinlock_t lock; /* lock for stats */
 	u32 filter_id;
 };
 
@@ -60,4 +62,5 @@ int cxgb4_tc_flower_stats(struct net_device *dev,
 			  struct tc_cls_flower_offload *cls);
 
 void cxgb4_init_tc_flower(struct adapter *adap);
+void cxgb4_cleanup_tc_flower(struct adapter *adap);
 #endif /* __CXGB4_TC_FLOWER_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 88487095d14f..52324c77a4fe 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -221,6 +221,8 @@ int __cxgb4_del_filter(struct net_device *dev, int filter_id,
 int cxgb4_set_filter(struct net_device *dev, int filter_id,
 		     struct ch_filter_specification *fs);
 int cxgb4_del_filter(struct net_device *dev, int filter_id);
+int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx,
+			      u64 *hitcnt, u64 *bytecnt);
 
 static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue)
 {
-- 
2.14.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox