Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 1/2] net: systemport: Implement adaptive interrupt coalescing
From: Florian Fainelli @ 2018-03-23  1:19 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, davem, jaedon.shin, pgynther, opendmb,
	michal.chan, gospo, talgi, saeedm
In-Reply-To: <20180323011933.29748-1-f.fainelli@gmail.com>

Implement support for adaptive RX and TX interrupt coalescing using
net_dim. We have each of our TX ring and our single RX ring implement a
bcm_sysport_net_dim structure which holds an interrupt counter, number
of packets, bytes, and a container for a net_dim instance.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 141 ++++++++++++++++++++++++++---
 drivers/net/ethernet/broadcom/bcmsysport.h |  14 +++
 2 files changed, 140 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index f15a8fc6dfc9..5a5a726bafa4 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/net_dim.h>
 #include <linux/etherdevice.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
@@ -574,21 +575,55 @@ static int bcm_sysport_set_wol(struct net_device *dev,
 	return 0;
 }
 
+static void bcm_sysport_set_rx_coalesce(struct bcm_sysport_priv *priv)
+{
+	u32 reg;
+
+	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
+	reg &= ~(RDMA_INTR_THRESH_MASK |
+		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
+	reg |= priv->dim.coal_pkts;
+	reg |= DIV_ROUND_UP(priv->dim.coal_usecs * 1000, 8192) <<
+			    RDMA_TIMEOUT_SHIFT;
+	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
+}
+
+static void bcm_sysport_set_tx_coalesce(struct bcm_sysport_tx_ring *ring)
+{
+	struct bcm_sysport_priv *priv = ring->priv;
+	u32 reg;
+
+	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(ring->index));
+	reg &= ~(RING_INTR_THRESH_MASK |
+		 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
+	reg |= ring->dim.coal_pkts;
+	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192) <<
+			    RING_TIMEOUT_SHIFT;
+	tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(ring->index));
+}
+
 static int bcm_sysport_get_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
+	struct bcm_sysport_tx_ring *ring;
+	unsigned int i;
 	u32 reg;
 
 	reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(0));
 
 	ec->tx_coalesce_usecs = (reg >> RING_TIMEOUT_SHIFT) * 8192 / 1000;
 	ec->tx_max_coalesced_frames = reg & RING_INTR_THRESH_MASK;
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		ring = &priv->tx_rings[i];
+		ec->use_adaptive_tx_coalesce |= ring->dim.use_dim;
+	}
 
 	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
 
 	ec->rx_coalesce_usecs = (reg >> RDMA_TIMEOUT_SHIFT) * 8192 / 1000;
 	ec->rx_max_coalesced_frames = reg & RDMA_INTR_THRESH_MASK;
+	ec->use_adaptive_rx_coalesce = priv->dim.use_dim;
 
 	return 0;
 }
@@ -597,8 +632,8 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 				    struct ethtool_coalesce *ec)
 {
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
+	struct bcm_sysport_tx_ring *ring;
 	unsigned int i;
-	u32 reg;
 
 	/* Base system clock is 125Mhz, DMA timeout is this reference clock
 	 * divided by 1024, which yield roughly 8.192 us, our maximum value has
@@ -615,22 +650,26 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 		return -EINVAL;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
-		reg = tdma_readl(priv, TDMA_DESC_RING_INTR_CONTROL(i));
-		reg &= ~(RING_INTR_THRESH_MASK |
-			 RING_TIMEOUT_MASK << RING_TIMEOUT_SHIFT);
-		reg |= ec->tx_max_coalesced_frames;
-		reg |= DIV_ROUND_UP(ec->tx_coalesce_usecs * 1000, 8192) <<
-			 RING_TIMEOUT_SHIFT;
-		tdma_writel(priv, reg, TDMA_DESC_RING_INTR_CONTROL(i));
+		ring = &priv->tx_rings[i];
+		ring->dim.coal_pkts = ec->tx_max_coalesced_frames;
+		ring->dim.coal_usecs = ec->tx_coalesce_usecs;
+		if (!ec->use_adaptive_tx_coalesce && ring->dim.use_dim) {
+			ring->dim.coal_pkts = 1;
+			ring->dim.coal_usecs = 0;
+		}
+		ring->dim.use_dim = ec->use_adaptive_tx_coalesce;
+		bcm_sysport_set_tx_coalesce(ring);
 	}
 
-	reg = rdma_readl(priv, RDMA_MBDONE_INTR);
-	reg &= ~(RDMA_INTR_THRESH_MASK |
-		 RDMA_TIMEOUT_MASK << RDMA_TIMEOUT_SHIFT);
-	reg |= ec->rx_max_coalesced_frames;
-	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192) <<
-			    RDMA_TIMEOUT_SHIFT;
-	rdma_writel(priv, reg, RDMA_MBDONE_INTR);
+	priv->dim.coal_usecs = ec->rx_coalesce_usecs;
+	priv->dim.coal_pkts = ec->rx_max_coalesced_frames;
+
+	if (!ec->use_adaptive_rx_coalesce && priv->dim.use_dim) {
+		priv->dim.coal_pkts = 1;
+		priv->dim.coal_usecs = 0;
+	}
+	priv->dim.use_dim = ec->use_adaptive_rx_coalesce;
+	bcm_sysport_set_rx_coalesce(priv);
 
 	return 0;
 }
@@ -709,6 +748,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 	struct bcm_sysport_stats64 *stats64 = &priv->stats64;
 	struct net_device *ndev = priv->netdev;
 	unsigned int processed = 0, to_process;
+	unsigned int processed_bytes = 0;
 	struct bcm_sysport_cb *cb;
 	struct sk_buff *skb;
 	unsigned int p_index;
@@ -800,6 +840,7 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 		 */
 		skb_pull(skb, sizeof(*rsb) + 2);
 		len -= (sizeof(*rsb) + 2);
+		processed_bytes += len;
 
 		/* UniMAC may forward CRC */
 		if (priv->crc_fwd) {
@@ -824,6 +865,9 @@ static unsigned int bcm_sysport_desc_rx(struct bcm_sysport_priv *priv,
 			priv->rx_read_ptr = 0;
 	}
 
+	priv->dim.packets = processed;
+	priv->dim.bytes = processed_bytes;
+
 	return processed;
 }
 
@@ -900,6 +944,8 @@ static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
 	ring->packets += pkts_compl;
 	ring->bytes += bytes_compl;
 	u64_stats_update_end(&priv->syncp);
+	ring->dim.packets = pkts_compl;
+	ring->dim.bytes = bytes_compl;
 
 	ring->c_index = c_index;
 
@@ -945,6 +991,7 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_tx_ring *ring =
 		container_of(napi, struct bcm_sysport_tx_ring, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_tx_reclaim(ring->priv, ring);
@@ -961,6 +1008,12 @@ static int bcm_sysport_tx_poll(struct napi_struct *napi, int budget)
 		return 0;
 	}
 
+	if (ring->dim.use_dim) {
+		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
+			       ring->dim.bytes, &dim_sample);
+		net_dim(&ring->dim.dim, dim_sample);
+	}
+
 	return budget;
 }
 
@@ -976,6 +1029,7 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 {
 	struct bcm_sysport_priv *priv =
 		container_of(napi, struct bcm_sysport_priv, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done = 0;
 
 	work_done = bcm_sysport_desc_rx(priv, budget);
@@ -998,6 +1052,12 @@ static int bcm_sysport_poll(struct napi_struct *napi, int budget)
 		intrl2_0_mask_clear(priv, INTRL2_0_RDMA_MBDONE);
 	}
 
+	if (priv->dim.use_dim) {
+		net_dim_sample(priv->dim.event_ctr, priv->dim.packets,
+			       priv->dim.bytes, &dim_sample);
+		net_dim(&priv->dim.dim, dim_sample);
+	}
+
 	return work_done;
 }
 
@@ -1016,6 +1076,40 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 	netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n");
 }
 
+static void bcm_sysport_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcm_sysport_net_dim *ndim =
+			container_of(dim, struct bcm_sysport_net_dim, dim);
+	struct bcm_sysport_priv *priv =
+			container_of(ndim, struct bcm_sysport_priv, dim);
+	struct net_dim_cq_moder cur_profile =
+				net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	priv->dim.coal_usecs = cur_profile.usec;
+	priv->dim.coal_pkts = cur_profile.pkts;
+
+	bcm_sysport_set_rx_coalesce(priv);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
+static void bcm_sysport_dim_tx_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcm_sysport_net_dim *ndim =
+			container_of(dim, struct bcm_sysport_net_dim, dim);
+	struct bcm_sysport_tx_ring *ring =
+			container_of(ndim, struct bcm_sysport_tx_ring, dim);
+	struct net_dim_cq_moder cur_profile =
+				net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	ring->dim.coal_usecs = cur_profile.usec;
+	ring->dim.coal_pkts = cur_profile.pkts;
+
+	bcm_sysport_set_tx_coalesce(ring);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
 /* RX and misc interrupt routine */
 static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 {
@@ -1034,6 +1128,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 	}
 
 	if (priv->irq0_stat & INTRL2_0_RDMA_MBDONE) {
+		priv->dim.event_ctr++;
 		if (likely(napi_schedule_prep(&priv->napi))) {
 			/* disable RX interrupts */
 			intrl2_0_mask_set(priv, INTRL2_0_RDMA_MBDONE);
@@ -1061,6 +1156,7 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id)
 			continue;
 
 		txr = &priv->tx_rings[ring];
+		txr->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&txr->napi))) {
 			intrl2_0_mask_set(priv, ring_bit);
@@ -1093,6 +1189,7 @@ static irqreturn_t bcm_sysport_tx_isr(int irq, void *dev_id)
 			continue;
 
 		txr = &priv->tx_rings[ring];
+		txr->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&txr->napi))) {
 			intrl2_1_mask_set(priv, BIT(ring));
@@ -1358,6 +1455,16 @@ static void bcm_sysport_adj_link(struct net_device *dev)
 		phy_print_status(phydev);
 }
 
+static void bcm_sysport_init_dim(struct bcm_sysport_net_dim *dim,
+				 void (*cb)(struct work_struct *work))
+{
+	INIT_WORK(&dim->dim.work, cb);
+	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->event_ctr = 0;
+	dim->packets = 0;
+	dim->bytes = 0;
+}
+
 static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
 				    unsigned int index)
 {
@@ -1447,6 +1554,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
 	reg |= (1 << index);
 	tdma_writel(priv, reg, TDMA_TIER1_ARB_0_QUEUE_EN);
 
+	bcm_sysport_init_dim(&ring->dim, bcm_sysport_dim_tx_work);
 	napi_enable(&ring->napi);
 
 	netif_dbg(priv, hw, priv->netdev,
@@ -1477,6 +1585,7 @@ static void bcm_sysport_fini_tx_ring(struct bcm_sysport_priv *priv,
 		return;
 
 	napi_disable(&ring->napi);
+	cancel_work_sync(&ring->dim.dim.work);
 	netif_napi_del(&ring->napi);
 
 	bcm_sysport_tx_clean(priv, ring);
@@ -1766,6 +1875,7 @@ static void bcm_sysport_netif_start(struct net_device *dev)
 	struct bcm_sysport_priv *priv = netdev_priv(dev);
 
 	/* Enable NAPI */
+	bcm_sysport_init_dim(&priv->dim, bcm_sysport_dim_work);
 	napi_enable(&priv->napi);
 
 	/* Enable RX interrupt and TX ring full interrupt */
@@ -1951,6 +2061,7 @@ static void bcm_sysport_netif_stop(struct net_device *dev)
 	/* stop all software from updating hardware */
 	netif_tx_stop_all_queues(dev);
 	napi_disable(&priv->napi);
+	cancel_work_sync(&priv->dim.dim.work);
 	phy_stop(dev->phydev);
 
 	/* mask all interrupts */
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index f5a984c1c986..9f48ad3cc38d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -12,6 +12,7 @@
 #define __BCM_SYSPORT_H
 
 #include <linux/if_vlan.h>
+#include <linux/net_dim.h>
 
 /* Receive/transmit descriptor format */
 #define DESC_ADDR_HI_STATUS_LEN	0x00
@@ -695,6 +696,16 @@ struct bcm_sysport_hw_params {
 	unsigned int	num_rx_desc_words;
 };
 
+struct bcm_sysport_net_dim {
+	u16			use_dim;
+	u16			event_ctr;
+	unsigned long		packets;
+	unsigned long		bytes;
+	u32			coal_usecs;
+	u32			coal_pkts;
+	struct net_dim		dim;
+};
+
 /* Software view of the TX ring */
 struct bcm_sysport_tx_ring {
 	spinlock_t	lock;		/* Ring lock for tx reclaim/xmit */
@@ -712,6 +723,7 @@ struct bcm_sysport_tx_ring {
 	struct bcm_sysport_priv *priv;	/* private context backpointer */
 	unsigned long	packets;	/* packets statistics */
 	unsigned long	bytes;		/* bytes statistics */
+	struct bcm_sysport_net_dim dim;	/* Net DIM context */
 	unsigned int	switch_queue;	/* switch port queue number */
 	unsigned int	switch_port;	/* switch port queue number */
 	bool		inspect;	/* inspect switch port and queue */
@@ -743,6 +755,8 @@ struct bcm_sysport_priv {
 	unsigned int		rx_read_ptr;
 	unsigned int		rx_c_index;
 
+	struct bcm_sysport_net_dim	dim;
+
 	/* PHY device */
 	struct device_node	*phy_dn;
 	phy_interface_t		phy_interface;
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next 2/2] net: bcmgenet: Add support for adaptive RX coalescing
From: Florian Fainelli @ 2018-03-23  1:19 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, davem, jaedon.shin, pgynther, opendmb,
	michal.chan, gospo, talgi, saeedm
In-Reply-To: <20180323011933.29748-1-f.fainelli@gmail.com>

Unlike the moder modern SYSTEMPORT hardware, we do not have a
configurable TDMA timeout, which limits us to implement adaptive RX
interrupt coalescing only. We have each of our RX rings implement a
bcmgenet_net_dim structure which holds an interrupt counter, number of
packets, bytes, and a container for a net_dim instance.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 109 +++++++++++++++++++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  12 +++
 2 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index b1e35a9accf1..7db8edc643ec 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -603,6 +603,8 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
 				 struct ethtool_coalesce *ec)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bcmgenet_rx_ring *ring;
+	unsigned int i;
 
 	ec->tx_max_coalesced_frames =
 		bcmgenet_tdma_ring_readl(priv, DESC_INDEX,
@@ -613,15 +615,37 @@ static int bcmgenet_get_coalesce(struct net_device *dev,
 	ec->rx_coalesce_usecs =
 		bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT) * 8192 / 1000;
 
+	for (i = 0; i < priv->hw_params->rx_queues; i++) {
+		ring = &priv->rx_rings[i];
+		ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
+	}
+	ring = &priv->rx_rings[DESC_INDEX];
+	ec->use_adaptive_rx_coalesce |= ring->dim.use_dim;
+
 	return 0;
 }
 
+static void bcmgenet_set_rx_coalesce(struct bcmgenet_rx_ring *ring)
+{
+	struct bcmgenet_priv *priv = ring->priv;
+	unsigned int i = ring->index;
+	u32 reg;
+
+	bcmgenet_rdma_ring_writel(priv, i, ring->dim.coal_pkts,
+				  DMA_MBUF_DONE_THRESH);
+
+	reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
+	reg &= ~DMA_TIMEOUT_MASK;
+	reg |= DIV_ROUND_UP(ring->dim.coal_usecs * 1000, 8192);
+	bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
+}
+
 static int bcmgenet_set_coalesce(struct net_device *dev,
 				 struct ethtool_coalesce *ec)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bcmgenet_rx_ring *ring;
 	unsigned int i;
-	u32 reg;
 
 	/* Base system clock is 125Mhz, DMA timeout is this reference clock
 	 * divided by 1024, which yields roughly 8.192us, our maximum value
@@ -641,7 +665,8 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
 	 * transmitted, or when the ring is empty.
 	 */
 	if (ec->tx_coalesce_usecs || ec->tx_coalesce_usecs_high ||
-	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low)
+	    ec->tx_coalesce_usecs_irq || ec->tx_coalesce_usecs_low ||
+	    ec->use_adaptive_tx_coalesce)
 		return -EOPNOTSUPP;
 
 	/* Program all TX queues with the same values, as there is no
@@ -656,24 +681,26 @@ static int bcmgenet_set_coalesce(struct net_device *dev,
 				  DMA_MBUF_DONE_THRESH);
 
 	for (i = 0; i < priv->hw_params->rx_queues; i++) {
-		bcmgenet_rdma_ring_writel(priv, i,
-					  ec->rx_max_coalesced_frames,
-					  DMA_MBUF_DONE_THRESH);
-
-		reg = bcmgenet_rdma_readl(priv, DMA_RING0_TIMEOUT + i);
-		reg &= ~DMA_TIMEOUT_MASK;
-		reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
-		bcmgenet_rdma_writel(priv, reg, DMA_RING0_TIMEOUT + i);
+		ring = &priv->rx_rings[i];
+		ring->dim.coal_usecs = ec->rx_coalesce_usecs;
+		ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
+		if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
+			ring->dim.coal_pkts = 1;
+			ring->dim.coal_usecs = 0;
+		}
+		ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
+		bcmgenet_set_rx_coalesce(ring);
 	}
 
-	bcmgenet_rdma_ring_writel(priv, DESC_INDEX,
-				  ec->rx_max_coalesced_frames,
-				  DMA_MBUF_DONE_THRESH);
-
-	reg = bcmgenet_rdma_readl(priv, DMA_RING16_TIMEOUT);
-	reg &= ~DMA_TIMEOUT_MASK;
-	reg |= DIV_ROUND_UP(ec->rx_coalesce_usecs * 1000, 8192);
-	bcmgenet_rdma_writel(priv, reg, DMA_RING16_TIMEOUT);
+	ring = &priv->rx_rings[DESC_INDEX];
+	ring->dim.coal_usecs = ec->rx_coalesce_usecs;
+	ring->dim.coal_pkts = ec->rx_max_coalesced_frames;
+	if (!ec->use_adaptive_rx_coalesce && ring->dim.use_dim) {
+		ring->dim.coal_pkts = 1;
+		ring->dim.coal_usecs = 0;
+	}
+	ring->dim.use_dim = ec->use_adaptive_rx_coalesce;
+	bcmgenet_set_rx_coalesce(ring);
 
 	return 0;
 }
@@ -1713,6 +1740,7 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	unsigned long dma_flag;
 	int len;
 	unsigned int rxpktprocessed = 0, rxpkttoprocess;
+	unsigned int bytes_processed = 0;
 	unsigned int p_index, mask;
 	unsigned int discards;
 	unsigned int chksum_ok = 0;
@@ -1832,6 +1860,8 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 			len -= ETH_FCS_LEN;
 		}
 
+		bytes_processed += len;
+
 		/*Finish setting up the received SKB and send it to the kernel*/
 		skb->protocol = eth_type_trans(skb, priv->dev);
 		ring->packets++;
@@ -1854,6 +1884,9 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 		bcmgenet_rdma_ring_writel(priv, ring->index, ring->c_index, RDMA_CONS_INDEX);
 	}
 
+	ring->dim.bytes = bytes_processed;
+	ring->dim.packets = rxpktprocessed;
+
 	return rxpktprocessed;
 }
 
@@ -1862,6 +1895,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_rx_ring *ring = container_of(napi,
 			struct bcmgenet_rx_ring, napi);
+	struct net_dim_sample dim_sample;
 	unsigned int work_done;
 
 	work_done = bcmgenet_desc_rx(ring, budget);
@@ -1871,9 +1905,32 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int budget)
 		ring->int_enable(ring);
 	}
 
+	if (ring->dim.use_dim) {
+		net_dim_sample(ring->dim.event_ctr, ring->dim.packets,
+			       ring->dim.bytes, &dim_sample);
+		net_dim(&ring->dim.dim, dim_sample);
+	}
+
 	return work_done;
 }
 
+static void bcmgenet_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct bcmgenet_net_dim *ndim =
+			container_of(dim, struct bcmgenet_net_dim, dim);
+	struct bcmgenet_rx_ring *ring =
+			container_of(ndim, struct bcmgenet_rx_ring, dim);
+	struct net_dim_cq_moder cur_profile =
+			net_dim_get_profile(dim->mode, dim->profile_ix);
+
+	ring->dim.coal_usecs = cur_profile.usec;
+	ring->dim.coal_pkts = cur_profile.pkts;
+
+	bcmgenet_set_rx_coalesce(ring);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
 /* Assign skb to RX DMA descriptor. */
 static int bcmgenet_alloc_rx_buffers(struct bcmgenet_priv *priv,
 				     struct bcmgenet_rx_ring *ring)
@@ -2022,6 +2079,16 @@ static void init_umac(struct bcmgenet_priv *priv)
 	dev_dbg(kdev, "done init umac\n");
 }
 
+static void bcmgenet_init_dim(struct bcmgenet_net_dim *dim,
+			      void (*cb)(struct work_struct *work))
+{
+	INIT_WORK(&dim->dim.work, cb);
+	dim->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	dim->event_ctr = 0;
+	dim->packets = 0;
+	dim->bytes = 0;
+}
+
 /* Initialize a Tx ring along with corresponding hardware registers */
 static void bcmgenet_init_tx_ring(struct bcmgenet_priv *priv,
 				  unsigned int index, unsigned int size,
@@ -2111,6 +2178,8 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
 	if (ret)
 		return ret;
 
+	bcmgenet_init_dim(&ring->dim, bcmgenet_dim_work);
+
 	/* Initialize Rx NAPI */
 	netif_napi_add(priv->dev, &ring->napi, bcmgenet_rx_poll,
 		       NAPI_POLL_WEIGHT);
@@ -2276,10 +2345,12 @@ static void bcmgenet_disable_rx_napi(struct bcmgenet_priv *priv)
 	for (i = 0; i < priv->hw_params->rx_queues; ++i) {
 		ring = &priv->rx_rings[i];
 		napi_disable(&ring->napi);
+		cancel_work_sync(&ring->dim.dim.work);
 	}
 
 	ring = &priv->rx_rings[DESC_INDEX];
 	napi_disable(&ring->napi);
+	cancel_work_sync(&ring->dim.dim.work);
 }
 
 static void bcmgenet_fini_rx_napi(struct bcmgenet_priv *priv)
@@ -2557,6 +2628,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
 			continue;
 
 		rx_ring = &priv->rx_rings[index];
+		rx_ring->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&rx_ring->napi))) {
 			rx_ring->int_disable(rx_ring);
@@ -2601,6 +2673,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 
 	if (status & UMAC_IRQ_RXDMA_DONE) {
 		rx_ring = &priv->rx_rings[DESC_INDEX];
+		rx_ring->dim.event_ctr++;
 
 		if (likely(napi_schedule_prep(&rx_ring->napi))) {
 			rx_ring->int_disable(rx_ring);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 3c50431ccd2a..22c41e0430fb 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,6 +16,7 @@
 #include <linux/mii.h>
 #include <linux/if_vlan.h>
 #include <linux/phy.h>
+#include <linux/net_dim.h>
 
 /* total number of Buffer Descriptors, same for Rx/Tx */
 #define TOTAL_DESC				256
@@ -572,6 +573,16 @@ struct bcmgenet_tx_ring {
 	struct bcmgenet_priv *priv;
 };
 
+struct bcmgenet_net_dim {
+	u16		use_dim;
+	u16		event_ctr;
+	unsigned long	packets;
+	unsigned long	bytes;
+	u32		coal_usecs;
+	u32		coal_pkts;
+	struct net_dim	dim;
+};
+
 struct bcmgenet_rx_ring {
 	struct napi_struct napi;	/* Rx NAPI struct */
 	unsigned long	bytes;
@@ -586,6 +597,7 @@ struct bcmgenet_rx_ring {
 	unsigned int	cb_ptr;		/* Rx ring initial CB ptr */
 	unsigned int	end_ptr;	/* Rx ring end CB ptr */
 	unsigned int	old_discards;
+	struct bcmgenet_net_dim dim;
 	void (*int_enable)(struct bcmgenet_rx_ring *);
 	void (*int_disable)(struct bcmgenet_rx_ring *);
 	struct bcmgenet_priv *priv;
-- 
2.14.1

^ permalink raw reply related

* Re: [net-next:master 304/314] drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3878:8: error: too few arguments to function 'devlink_resource_register'
From: David Ahern @ 2018-03-23  1:53 UTC (permalink / raw)
  To: kbuild test robot; +Cc: kbuild-all, netdev
In-Reply-To: <201803230852.Y4PG4qcn%fengguang.wu@intel.com>

On 3/22/18 6:47 PM, kbuild test robot wrote:
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
> head:   6686c459e1449a3ee5f3fd313b0a559ace7a700e
> commit: 145307460ba9c11489807de7acd3f4c7395f60b7 [304/314] devlink: Remove top_hierarchy arg to devlink_resource_register
> config: x86_64-randconfig-s1-03230751 (attached as .config)
> compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
> reproduce:
>         git checkout 145307460ba9c11489807de7acd3f4c7395f60b7
>         # save the attached .config to linux build tree
>         make ARCH=x86_64 
> 
> All error/warnings (new ones prefixed by >>):
> 
>    drivers/net/ethernet/mellanox/mlxsw/spectrum.c: In function 'mlxsw_sp_resources_register':
>>> drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3881:6: warning: passing argument 6 of 'devlink_resource_register' makes integer from pointer without a cast [-Wint-conversion]
>          &kvd_size_params,
>          ^
>    In file included from drivers/net/ethernet/mellanox/mlxsw/core.h:47:0,
>                     from drivers/net/ethernet/mellanox/mlxsw/spectrum.h:54,
>                     from drivers/net/ethernet/mellanox/mlxsw/spectrum.c:64:
>    include/net/devlink.h:560:1: note: expected 'u64 {aka long long unsigned int}' but argument is of type 'struct devlink_resource_size_params *'
>     devlink_resource_register(struct devlink *devlink,
>     ^~~~~~~~~~~~~~~~~~~~~~~~~

I just did another full build (allmodconfig) on net-next and did not hit
this error.

^ permalink raw reply

* Re: [PATCH net-next] modules: allow modprobe load regular elf binaries
From: Luis R. Rodriguez @ 2018-03-23  2:47 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Jessica Yu, Linus Torvalds, Mimi Zohar,
	Djalal Harouni, David Miller, Kees Cook, Alexei Starovoitov,
	Al Viro, Daniel Borkmann, Greg Kroah-Hartman, Network Development,
	Linux Kernel Mailing List, kernel-team, Linux API, Michal Hocko,
	Hannes Reinecke, werner
In-Reply-To: <CALCETrWFhTurv0gwxRcdLn4CD-1ySE=v4ZgvbsUYJqruZNMptQ@mail.gmail.com>

On Thu, Mar 22, 2018 at 3:15 PM, Andy Lutomirski <luto@kernel.org> wrote:
>  All we need to do is to make sure that, if this is
> distributed as a module, that it's init routine doesn't wait for a
> long time, right?

Yeap.

 Luis

^ permalink raw reply

* RE: [PATCH] qed: Use true and false for boolean values
From: Kalluru, Sudarsana @ 2018-03-23  3:20 UTC (permalink / raw)
  To: Gustavo A. R. Silva, Elior, Ariel, Dept-Eng Everest Linux L2
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20180322200849.GA28033@embeddedgus>

-----Original Message-----
From: Gustavo A. R. Silva [mailto:gustavo@embeddedor.com] 
Sent: 23 March 2018 01:39
To: Elior, Ariel <Ariel.Elior@cavium.com>; Dept-Eng Everest Linux L2 <Dept-EngEverestLinuxL2@cavium.com>
Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Gustavo A. R. Silva <gustavo@embeddedor.com>
Subject: [PATCH] qed: Use true and false for boolean values

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 553a6d1..cdb3eec 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -298,8 +298,8 @@ static void qed_init_qm_params(struct qed_hwfn *p_hwfn)
 	qm_info->start_vport = (u8) RESC_START(p_hwfn, QED_VPORT);
 
 	/* rate limiting and weighted fair queueing are always enabled */
-	qm_info->vport_rl_en = 1;
-	qm_info->vport_wfq_en = 1;
+	qm_info->vport_rl_en = true;
+	qm_info->vport_wfq_en = true;
 
 	/* TC config is different for AH 4 port */
 	four_port = p_hwfn->cdev->num_ports_in_engine == MAX_NUM_PORTS_K2; @@ -1276,9 +1276,9 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
 
 	if (p_hwfn->mcp_info) {
 		if (p_hwfn->mcp_info->func_info.bandwidth_max)
-			qm_info->pf_rl_en = 1;
+			qm_info->pf_rl_en = true;
 		if (p_hwfn->mcp_info->func_info.bandwidth_min)
-			qm_info->pf_wfq_en = 1;
+			qm_info->pf_wfq_en = true;
 	}
 
 	memset(&params, 0, sizeof(params));
@@ -1630,7 +1630,7 @@ static int qed_vf_start(struct qed_hwfn *p_hwfn,
 		qed_vf_pf_tunnel_param_update(p_hwfn, p_params->p_tunn);
 	}
 
-	p_hwfn->b_int_enabled = 1;
+	p_hwfn->b_int_enabled = true;
 
 	return 0;
 }
--
2.7.4

Acked-by: Sudarsana Kalluru <Sudarsana.Kalluru@cavium.com>

^ permalink raw reply related

* Re: [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
From: Jakub Kicinski @ 2018-03-23  3:34 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-1-jiri@resnulli.us>

On Thu, 22 Mar 2018 11:55:10 +0100, Jiri Pirko wrote:
> Also, there is one extra port that I don't understand what
> is the purpose for it - something nfp specific perhaps.

Do you mean the PF netdev?  There can be multiple of those on
multi-host cards.  There is one pf_repr from ASIC's perspective and a
full-blown PF netdev which should be used by applications.  pf_repr is
only for switch config.

^ permalink raw reply

* Re: [patch net-next RFC 02/12] devlink: extend attrs_set for setting port flavours
From: Jakub Kicinski @ 2018-03-23  3:36 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-3-jiri@resnulli.us>

On Thu, 22 Mar 2018 11:55:12 +0100, Jiri Pirko wrote:
>  enum devlink_attr {
>  	/* don't change the order or add anything between, this is ABI! */
>  	DEVLINK_ATTR_UNSPEC,
> @@ -224,6 +242,7 @@ enum devlink_attr {
>  	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
>  	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
>  
> +	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
>  	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
>  	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */

nit: why is it OK to add the attr in the middle of enum? 

^ permalink raw reply

* Re: [patch net-next RFC 10/12] nfp: flower: create port for flower vnic
From: Jakub Kicinski @ 2018-03-23  3:38 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322105522.8186-11-jiri@resnulli.us>

On Thu, 22 Mar 2018 11:55:20 +0100, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  drivers/net/ethernet/netronome/nfp/flower/main.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
> index aed8df0e9d41..1890af7e6196 100644
> --- a/drivers/net/ethernet/netronome/nfp/flower/main.c
> +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
> @@ -427,10 +427,9 @@ static int nfp_flower_vnic_alloc(struct nfp_app *app, struct nfp_net *nn,
>  		goto err_invalid_port;
>  	}
>  
> -	eth_hw_addr_random(nn->dp.netdev);
>  	netif_keep_dst(nn->dp.netdev);
>  
> -	return 0;
> +	return nfp_app_nic_vnic_alloc(app, nn, id);
>  
>  err_invalid_port:
>  	nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);

This will associate the PF netdev with physical port, incl. all ethtool
information.  Im not sure we want to do that.  phy_repr carries this
functionality.

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the rdma-fixes tree
From: David Miller @ 2018-03-23  4:19 UTC (permalink / raw)
  To: dledford; +Cc: sfr, netdev, jgg, linux-next, linux-kernel, markb, leonro
In-Reply-To: <1521163082.18703.191.camel@redhat.com>

From: Doug Ledford <dledford@redhat.com>
Date: Thu, 15 Mar 2018 21:18:02 -0400

> Here's the commit (from the rdma git repo) with the proper merge fix
> (although it also has other minor merge stuff that needs to be ignored):
> 
> 2d873449a202 (Merge branch 'k.o/wip/dl-for-rc' into k.o/wip/dl-for-next)

Really?

[davem@localhost GIT]$ git clone --reference linux/.git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
Cloning into 'rdma'...
remote: Counting objects: 32, done.        
remote: Compressing objects: 100% (31/31), done.        
remote: Total 32 (delta 0), reused 32 (delta 0)        
Receiving objects: 100% (32/32), 186.52 KiB | 3.01 MiB/s, done.
Checking connectivity: 5706198, done.
Checking out files: 100% (61622/61622), done.
[davem@localhost GIT]$ cd rdma
[davem@localhost rdma]$ git show 2d873449a202
fatal: ambiguous argument '2d873449a202': unknown revision or path not in the working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'
[davem@localhost rdma]$

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the rdma-fixes tree
From: Jason Gunthorpe @ 2018-03-23  4:33 UTC (permalink / raw)
  To: David Miller
  Cc: dledford, sfr, netdev, linux-next, linux-kernel, markb, leonro
In-Reply-To: <20180323.001900.849305099701511845.davem@davemloft.net>

On Fri, Mar 23, 2018 at 12:19:00AM -0400, David Miller wrote:
> From: Doug Ledford <dledford@redhat.com>
> Date: Thu, 15 Mar 2018 21:18:02 -0400
> 
> > Here's the commit (from the rdma git repo) with the proper merge fix
> > (although it also has other minor merge stuff that needs to be ignored):
> > 
> > 2d873449a202 (Merge branch 'k.o/wip/dl-for-rc' into k.o/wip/dl-for-next)
> 
> Really?
> 
> [davem@localhost GIT]$ git clone --reference linux/.git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
> Cloning into 'rdma'...

Doug and I moved to a shared repo location when we started maintain it
as a team:

git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git

The commit is here:

https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?h=for-next&id=2d873449a202d02e0c4d90009fb2beb7013ac575

Jason

^ permalink raw reply

* Re: [PATCH net-next] bridge: Allow max MTU when multiple VLANs present
From: Roopa Prabhu @ 2018-03-23  4:53 UTC (permalink / raw)
  To: Chas Williams; +Cc: David Miller, netdev, Stephen Hemminger
In-Reply-To: <20180322153406.17760-1-3chas3@gmail.com>

On Thu, Mar 22, 2018 at 8:34 AM, Chas Williams <3chas3@gmail.com> wrote:
> If the bridge is allowing multiple VLANs, some VLANs may have
> different MTUs.  Instead of choosing the minimum MTU for the
> bridge interface, choose the maximum MTU of the bridge members.
> With this the user only needs to set a larger MTU on the member
> ports that are participating in the large MTU VLANS.
>
> Signed-off-by: Chas Williams <3chas3@gmail.com>
> ---

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>

This or an equivalent fix is necessary: as stated above, today the
bridge mtu capped at min port mtu limits all
vlan devices on top of the vlan filtering bridge to min port mtu.

^ permalink raw reply

* Re: [patch net-next RFC 10/12] nfp: flower: create port for flower vnic
From: Jiri Pirko @ 2018-03-23  6:29 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322203828.5167c0ac@cakuba.netronome.com>

Fri, Mar 23, 2018 at 04:38:28AM CET, jakub.kicinski@netronome.com wrote:
>On Thu, 22 Mar 2018 11:55:20 +0100, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  drivers/net/ethernet/netronome/nfp/flower/main.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>> 
>> diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
>> index aed8df0e9d41..1890af7e6196 100644
>> --- a/drivers/net/ethernet/netronome/nfp/flower/main.c
>> +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
>> @@ -427,10 +427,9 @@ static int nfp_flower_vnic_alloc(struct nfp_app *app, struct nfp_net *nn,
>>  		goto err_invalid_port;
>>  	}
>>  
>> -	eth_hw_addr_random(nn->dp.netdev);
>>  	netif_keep_dst(nn->dp.netdev);
>>  
>> -	return 0;
>> +	return nfp_app_nic_vnic_alloc(app, nn, id);
>>  
>>  err_invalid_port:
>>  	nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);
>
>This will associate the PF netdev with physical port, incl. all ethtool
>information.  Im not sure we want to do that.  phy_repr carries this
>functionality.

I was not sure originally what this port is. Okay, what I would like to
see is another port flavour for "pf" and "vf". I guess that since the pf
has the same pci address, it would fall under the same devlink instance.
For vfs, which have each separate pci address, I would like to create
devlink instance for each and associate with one devlink port flavour
"vf".

^ permalink raw reply

* Re: [patch net-next RFC 02/12] devlink: extend attrs_set for setting port flavours
From: Jiri Pirko @ 2018-03-23  6:30 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322203624.36e8b693@cakuba.netronome.com>

Fri, Mar 23, 2018 at 04:36:24AM CET, jakub.kicinski@netronome.com wrote:
>On Thu, 22 Mar 2018 11:55:12 +0100, Jiri Pirko wrote:
>>  enum devlink_attr {
>>  	/* don't change the order or add anything between, this is ABI! */
>>  	DEVLINK_ATTR_UNSPEC,
>> @@ -224,6 +242,7 @@ enum devlink_attr {
>>  	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
>>  	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
>>  
>> +	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
>>  	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
>>  	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
>
>nit: why is it OK to add the attr in the middle of enum? 

Well, I added the next 2 in previous the patch.

^ permalink raw reply

* Re: [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
From: Jiri Pirko @ 2018-03-23  6:34 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, davem, idosch, jakub.kicinski, mlxsw, andrew,
	vivien.didelot, f.fainelli, michael.chan, ganeshgr, saeedm,
	simon.horman, pieter.jansenvanvuuren, john.hurley,
	dirk.vandermerwe, alexander.h.duyck, ogerlitz, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <fde6feca-9616-c3a9-2ab5-1ecfe8741ca6@gmail.com>

Thu, Mar 22, 2018 at 08:10:38PM CET, dsahern@gmail.com wrote:
>On 3/22/18 11:49 AM, Jiri Pirko wrote:
>> Thu, Mar 22, 2018 at 04:34:07PM CET, dsahern@gmail.com wrote:
>>> On 3/22/18 4:55 AM, Jiri Pirko wrote:
>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> This patchset resolves 2 issues we have right now:
>>>> 1) There are many netdevices / ports in the system, for port, pf, vf
>>>>    represenatation but the user has no way to see which is which
>>>> 2) The ndo_get_phys_port_name is implemented in each driver separatelly,
>>>>    which may lead to inconsistent names between drivers.
>>>
>>> Similar to ndo_get_phys_port_{name,id}, devlink requires drivers to opt
>>> in with an implementation right, so you can't really force a solution to
>>> the consistent naming.
>> 
>> Yeah, drivers would still have free choice to implemen the ndo
>> themselves. But most of them, like all sriov switch drivers should use
>> the devlink helper to have consistent naming. In other words, devlink
>> helper should be the standard way, in weird cases (like rocker), driver
>> implements it himself.
>
>That's an assumption that somehow the devlink API will be better
>supported than ndo_get_phys_port_{name,id}. Don't get me wrong -- an API
>to show the kind of device is needed, but I do not think this enforces
>any kind of consistency in naming.

So you say that we need to enforce it somehow?


>
>> 
>> 
>>>
>>>>
>>>> This patchset introduces port flavours which should address the first
>>>> problem. I'm testing this with Netronome nfp hardware. When the user
>>>> has 2 physical ports, 1 pf, and 4 vfs, he should see something like this:
>>>> # devlink port
>>>> pci/0000:05:00.0/0: type eth netdev enp5s0np0 flavour physical number 0
>>>> pci/0000:05:00.0/268435456: type eth netdev eth0 flavour physical number 0
>>>> pci/0000:05:00.0/268435460: type eth netdev enp5s0np1 flavour physical number 1
>>>> pci/0000:05:00.0/536875008: type eth netdev eth2 flavour pf_rep number 536875008
>>>> pci/0000:05:00.0/536870912: type eth netdev eth1 flavour vf_rep number 0
>>>> pci/0000:05:00.0/536870976: type eth netdev eth3 flavour vf_rep number 1
>>>> pci/0000:05:00.0/536871040: type eth netdev eth4 flavour vf_rep number 2
>>>> pci/0000:05:00.0/536871104: type eth netdev eth5 flavour vf_rep number 3
>>>
>>> How about 'kind' instead of flavo{u}r?
>> 
>> Yeah, kind is often used in kernel already with different meaning
>> git grep kind net/core
>> I wanted to avoid confusions
>
>Roopa's amendment works as well; I just think flavor / flavour is the
>wrong word. Make me thinks of food ... ice cream vs netdevices.

Ok

^ permalink raw reply

* [patch iproute2 rfc 1/2] devlink: introduce support for showing port flavours
From: Jiri Pirko @ 2018-03-23  6:35 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322233550.GE6544@lunn.ch>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 devlink/devlink.c            | 16 ++++++++++++++++
 include/uapi/linux/devlink.h | 20 ++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 69c3c5d9dd31..952b7cabbccf 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1685,6 +1685,16 @@ static const char *port_type_name(uint32_t type)
 	}
 }
 
+static const char *port_flavour_name(uint16_t flavour)
+{
+	switch (flavour) {
+	case DEVLINK_PORT_FLAVOUR_PHYSICAL: return "physical";
+	case DEVLINK_PORT_FLAVOUR_PF_REP: return "pf_rep";
+	case DEVLINK_PORT_FLAVOUR_VF_REP: return "vf_rep";
+	default: return "<unknown flavour>";
+	}
+}
+
 static void pr_out_port(struct dl *dl, struct nlattr **tb)
 {
 	struct nlattr *pt_attr = tb[DEVLINK_ATTR_PORT_TYPE];
@@ -1709,6 +1719,12 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb)
 	if (tb[DEVLINK_ATTR_PORT_IBDEV_NAME])
 		pr_out_str(dl, "ibdev",
 			   mnl_attr_get_str(tb[DEVLINK_ATTR_PORT_IBDEV_NAME]));
+	if (tb[DEVLINK_ATTR_PORT_FLAVOUR]) {
+		uint16_t port_flavour =
+				mnl_attr_get_u16(tb[DEVLINK_ATTR_PORT_FLAVOUR]);
+
+		pr_out_str(dl, "flavour", port_flavour_name(port_flavour));
+	}
 	if (tb[DEVLINK_ATTR_PORT_SPLIT_GROUP])
 		pr_out_uint(dl, "split_group",
 			    mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_SPLIT_GROUP]));
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 9f17286ec89f..5ea871c2739d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -132,6 +132,24 @@ enum devlink_eswitch_encap_mode {
 	DEVLINK_ESWITCH_ENCAP_MODE_BASIC,
 };
 
+enum devlink_port_flavour {
+	DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically
+					* facing the user.
+					*/
+	DEVLINK_PORT_FLAVOUR_PF_REP, /* Port represents a SR-IOV physical
+				      *	function counterpart port of
+				      *	embedded switch.
+				      */
+	DEVLINK_PORT_FLAVOUR_VF_REP, /* Port represents a SR-IOV virtual
+				      *	function counterpart port of
+				      *	embedded switch.
+				      */
+	DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */
+	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
+				   * interconnect port.
+				   */
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -224,6 +242,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
-- 
2.14.3

^ permalink raw reply related

* [patch iproute2 rfc 2/2] devlink: introduce support for showing port number and split subport number
From: Jiri Pirko @ 2018-03-23  6:35 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322233550.GE6544@lunn.ch>

From: Jiri Pirko <jiri@mellanox.com>

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 devlink/devlink.c            | 6 ++++++
 include/uapi/linux/devlink.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 952b7cabbccf..50fec26781f4 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -1725,9 +1725,15 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb)
 
 		pr_out_str(dl, "flavour", port_flavour_name(port_flavour));
 	}
+	if (tb[DEVLINK_ATTR_PORT_NUMBER])
+		pr_out_uint(dl, "number",
+			    mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_NUMBER]));
 	if (tb[DEVLINK_ATTR_PORT_SPLIT_GROUP])
 		pr_out_uint(dl, "split_group",
 			    mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_SPLIT_GROUP]));
+	if (tb[DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER])
+		pr_out_uint(dl, "subport",
+			    mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER]));
 	pr_out_port_handle_end(dl);
 }
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5ea871c2739d..4a39ad2ecd08 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -243,6 +243,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
 	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
+	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
-- 
2.14.3

^ permalink raw reply related

* Re: [patch net-next RFC 00/12] devlink: introduce port flavours and common phys_port_name generation
From: Jiri Pirko @ 2018-03-23  6:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, davem, idosch, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, dsahern, vijaya.guvva,
	satananda.burla, raghu.vatsavayi, felix.manlunas, gospo,
	sathya.perla, vasundhara-v.volam, tariqt, eranbe,
	jeffrey.t.kirsher
In-Reply-To: <20180322203447.1e399382@cakuba.netronome.com>

Fri, Mar 23, 2018 at 04:34:47AM CET, jakub.kicinski@netronome.com wrote:
>On Thu, 22 Mar 2018 11:55:10 +0100, Jiri Pirko wrote:
>> Also, there is one extra port that I don't understand what
>> is the purpose for it - something nfp specific perhaps.
>
>Do you mean the PF netdev?  There can be multiple of those on
>multi-host cards.  There is one pf_repr from ASIC's perspective and a
>full-blown PF netdev which should be used by applications.  pf_repr is
>only for switch config.

Got it.

^ permalink raw reply

* [PATCH net-next] XDP router for veth
From: Md. Islam @ 2018-03-23  6:43 UTC (permalink / raw)
  To: ebiederm, xemul, netdev, shemminger, Eric Dumazet, dsahern, roopa,
	tom, alexei.starovoitov, f.fainelli, brouer

[-- Attachment #1: Type: text/plain, Size: 1493 bytes --]

Hi

This patch implements IPv4 forwarding on xdp_buff. Currently it only
works with VETH. It forwards packets as soon as a veth receives a
packet. Currently VETH uses slow path for packet forwarding which
requires packet to go through upper layers. However this patch
forwards the packet as soon as it is received by L2.

The patch accomplishes following:

1. xdp_buff support is added to VETH. This will allow us to implement
XDP forwarding logic without having physical NIC.
2. Ipv4 forwarding is implemented for xdp_buff. So router fast path is
completely implemented in kernel (unlike eBPF)

I tested my implementation in Mininet [1]. Mininet is based on veth. I
tested the patch for following topologies.


h1 -----s1-------h2


   h                      h
     \                     /
       \                  /
h -----s1-------------s2-------h
       /                 \
     /                    \
   h                      h

I generated packets using iperf and a custom video streaming
application. IPv4 forwarding seems to be working properly. I'm also
getting higher throughput and lower latency than current veth. For
instance, on my machine, the throughput of iperf improved from
53.8Gb/s to around 56Gb/s. Median RTT improved from around .055 ms to
around .045 ms.

1. http://mininet.org/

The patch has been generated with kernel 4.15.0+. Please let me know
any question or suggestions.

Many thanks
Tamim
PhD Candidate,
Kent State University
http://web.cs.kent.edu/~mislam4/

[-- Attachment #2: xdp-fastpath.patch --]
[-- Type: text/x-patch, Size: 9847 bytes --]

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 944ec3c..8c39128 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -328,6 +328,15 @@ config VETH
 	  When one end receives the packet it appears on its pair and vice
 	  versa.
 
+config XDP_ROUTER
+	bool "IP forwarding on XDP"
+	depends on IP_ADVANCED_ROUTER
+        depends on VETH
+        default y
+	help
+	  This option will enable IP forwarding on incoming xdp_buff. 
+          Currently it is only supported by VETH  
+
 config VIRTIO_NET
 	tristate "Virtio network driver"
 	depends on VIRTIO
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39..30876e3 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -110,7 +110,24 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		kfree_skb(skb);
 		goto drop;
 	}
-
+#ifdef CONFIG_XDP_ROUTER
+        //if IP forwarding is enabled on the receiver, create xdp_buff
+        //from skb and call xdp_router_forward()
+        if(is_forwarding_enabled(rcv)){
+                struct xdp_buff *xdp = kmalloc(sizeof(*xdp), GFP_KERNEL);        
+                xdp->data = skb->data;        
+                xdp->data_end = skb->data + (skb->len - skb->data_len);
+                xdp->data_meta = skb;
+                if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)){
+                        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+                        u64_stats_update_begin(&stats->syncp);
+                        stats->bytes += length;
+                        stats->packets++;
+                        u64_stats_update_end(&stats->syncp);
+                        goto success;
+                }
+        }
+#endif
 	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
@@ -122,6 +139,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 drop:
 		atomic64_inc(&priv->dropped);
 	}
+success:        
 	rcu_read_unlock();
 	return NETDEV_TX_OK;
 }
@@ -276,6 +294,57 @@ static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_XDP_ROUTER
+//Called holding RCU lock
+int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp){
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *rcv;
+        struct ethhdr *ethh;
+        struct sk_buff *skb;
+	int length = xdp->data_end - xdp->data;
+
+	rcv = rcu_dereference(priv->peer);
+	if (unlikely(!rcv)) {
+		kfree(xdp);
+		goto drop;
+	}
+        
+        //Update MAC address and checksum
+	ethh = eth_hdr_xdp(xdp);
+        ether_addr_copy(ethh->h_source, dev->dev_addr);
+        ether_addr_copy(ethh->h_dest, rcv->dev_addr);
+        
+
+        //if IP forwarding is enabled on the receiver, call xdp_router_forward()
+        if(is_forwarding_enabled(rcv)){
+                if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)){
+                        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+                        u64_stats_update_begin(&stats->syncp);
+                        stats->bytes += length;
+                        stats->packets++;
+                        u64_stats_update_end(&stats->syncp);
+                        return NETDEV_TX_OK;
+                }
+        }
+
+        //Local deliver
+        skb = (struct sk_buff *)xdp->data_meta;
+	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
+		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->bytes += length;
+		stats->packets++;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+drop:
+		atomic64_inc(&priv->dropped);
+	}
+        return NETDEV_TX_OK;
+
+}
+#endif
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -290,6 +359,9 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
+#ifdef CONFIG_XDP_ROUTER  
+        .ndo_xdp_xmit           = veth_xdp_xmit,
+#endif        
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 492bc65..7ade66e 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -20,6 +20,22 @@
 #include <linux/skbuff.h>
 #include <uapi/linux/ip.h>
 
+#include <linux/filter.h>
+
+#ifdef CONFIG_XDP_ROUTER
+
+static inline struct iphdr *ip_hdr_xdp(const struct xdp_buff *xdp)
+{
+	return (struct iphdr *)(xdp->data+ETH_HLEN);
+}
+
+static inline struct ethhdr *eth_hdr_xdp(const struct xdp_buff *xdp)
+{
+	return (struct ethhdr *)(xdp->data);
+}
+
+#endif
+
 static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
 {
 	return (struct iphdr *)skb_network_header(skb);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4c77f39..25db780 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3290,6 +3290,12 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }
 
+#ifdef CONFIG_XDP_ROUTER
+bool is_xdp_forwardable(const struct net_device *dev, const struct xdp_buff *xdp);
+bool is_forwarding_enabled(struct net_device *dev);
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp);
+#endif
+
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f805243..623b2de 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -369,6 +369,12 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
+#ifdef CONFIG_XDP_ROUTER
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+			       u8 tos, struct net_device *dev,
+			       struct fib_result *res);
+#endif
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 		       const struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b..f97818c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4090,6 +4090,85 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);
 
+#ifdef CONFIG_XDP_ROUTER
+
+bool is_xdp_forwardable(const struct net_device *dev, const struct xdp_buff *xdp)
+{
+	unsigned int len;
+        unsigned int pk_len = xdp->data_end - xdp->data;
+	if (!(dev->flags & IFF_UP))
+		return false;
+
+	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+	if (pk_len <= len)
+		return true;
+
+	return false;
+}
+
+bool is_forwarding_enabled(struct net_device *dev){
+        struct in_device *in_dev;
+	
+        /* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))                
+                return false;        
+        
+        return true;
+}
+EXPORT_SYMBOL_GPL(is_forwarding_enabled);
+
+
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp)
+{
+        int err;
+        bool forwarded = false;
+        struct fib_result res;       
+        struct iphdr *iph;
+        struct ethhdr *ethh;
+        struct net_device *rcv;
+        
+        if(!is_xdp_forwardable(dev, xdp))
+            return NET_RX_DROP;       
+        
+        //Verify the MAC address of xdp_buff    
+	ethh = eth_hdr_xdp(xdp);
+        if(!ether_addr_equal_64bits(ethh->h_dest, dev->dev_addr))
+                return NET_RX_DROP;        
+
+        iph = ip_hdr_xdp(xdp);
+        
+        preempt_disable();
+        rcu_read_lock();
+
+        err = ip_route_lookup(iph->daddr, iph->saddr, 
+                iph->tos, dev, &res);
+        if(err)
+                goto exit;
+        
+        if(res.type == RTN_LOCAL)
+            goto exit;
+
+        rcv = FIB_RES_DEV(res);
+        if(rcv){
+            if (likely(rcv->netdev_ops->ndo_xdp_xmit(rcv, xdp) == NETDEV_TX_OK)) {
+                    forwarded = true;
+                    goto exit;                    
+            }                                   
+        }        
+
+exit:        
+        rcu_read_unlock();        
+        preempt_enable();
+        if(forwarded)
+            return NET_RX_SUCCESS;
+
+        return NET_RX_DROP;
+}
+EXPORT_SYMBOL_GPL(xdp_router_forward);
+
+#endif
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
 	int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1..58a5adb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1866,6 +1866,38 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }
 
+#ifdef CONFIG_XDP_ROUTER
+
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+			       u8 tos, struct net_device *dev,
+			       struct fib_result *res)
+{
+	struct flowi4	fl4;
+	int		err;
+	struct net    *net = dev_net(dev);  
+        
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = dev->ifindex;
+        fl4.flowi4_mark = 0;
+	fl4.flowi4_tos = tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+        
+	err = fib_lookup(net, &fl4, res, 0);       
+	if (err != 0)
+		return err;
+	
+        if(res->type != RTN_UNICAST && res->type != RTN_LOCAL)
+            return -EINVAL;    
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_route_lookup);
+#endif
+
+
 /*
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet

^ permalink raw reply related

* Re: [PATCH RFC net-next 7/7] netdevsim: Add simple FIB resource controller via devlink
From: Jiri Pirko @ 2018-03-23  6:50 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, davem, roopa, shm, jiri, idosch, jakub.kicinski,
	David Ahern
In-Reply-To: <20180322225757.10377-8-dsa@cumulusnetworks.com>

Thu, Mar 22, 2018 at 11:57:57PM CET, dsa@cumulusnetworks.com wrote:
>From: David Ahern <dsahern@gmail.com>

[...]


>+void nsim_devlink_teardown(struct netdevsim *ns)
>+{
>+	if (ns->devlink) {
>+		struct net *net = dev_net(ns->netdev);
>+		bool *reg_devlink = net_generic(net, nsim_devlink_id);
>+
>+		devlink_unregister(ns->devlink);
>+		devlink_free(ns->devlink);
>+		ns->devlink = NULL;
>+
>+		nsim_devlink_net_reset(net);
>+		*reg_devlink = true;
>+	}
>+}
>+
>+void nsim_devlink_setup(struct netdevsim *ns)
>+{
>+	struct net *net = dev_net(ns->netdev);
>+	bool *reg_devlink = net_generic(net, nsim_devlink_id);
>+	struct devlink *devlink;
>+	int err = -ENOMEM;
>+
>+	/* only one device per namespace controls devlink */
>+	if (!*reg_devlink) {
>+		ns->devlink = NULL;
>+		return;
>+	}
>+
>+	devlink = devlink_alloc(&nsim_devlink_ops, 0);
>+	if (!devlink)
>+		return;
>+
>+	devlink_net_set(devlink, net);
>+	err = devlink_register(devlink, &ns->dev);

This reg_devlink construct looks odd. Why don't you leave the devlink
instance in init_ns?



>+	if (err)
>+		goto err_devlink_free;
>+
>+	err = devlink_resources_register(devlink);
>+	if (err)
>+		goto err_dl_unregister;
>+
>+	ns->devlink = devlink;
>+
>+	*reg_devlink = false;
>+
>+	return;
>+
>+err_dl_unregister:
>+	devlink_unregister(devlink);
>+err_devlink_free:
>+	devlink_free(devlink);
>+}
>+
>+/* Initialize per network namespace state */
>+static int __net_init nsim_devlink_netns_init(struct net *net)
>+{
>+	bool *reg_devlink = net_generic(net, nsim_devlink_id);
>+
>+	*reg_devlink = true;
>+
>+	return 0;
>+}
>+
>+static struct pernet_operations nsim_devlink_net_ops __net_initdata = {
>+	.init = nsim_devlink_netns_init,
>+	.id   = &nsim_devlink_id,
>+	.size = sizeof(bool),
>+	.async = true,
>+};

^ permalink raw reply

* Re: [net-next:master 304/314] drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3878:8: error: too few arguments to function 'devlink_resource_register'
From: Jiri Pirko @ 2018-03-23  6:53 UTC (permalink / raw)
  To: David Ahern; +Cc: kbuild test robot, kbuild-all, netdev
In-Reply-To: <42d94112-06fe-77ea-1eba-b0da591dda88@gmail.com>

Fri, Mar 23, 2018 at 02:53:38AM CET, dsahern@gmail.com wrote:
>On 3/22/18 6:47 PM, kbuild test robot wrote:
>> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
>> head:   6686c459e1449a3ee5f3fd313b0a559ace7a700e
>> commit: 145307460ba9c11489807de7acd3f4c7395f60b7 [304/314] devlink: Remove top_hierarchy arg to devlink_resource_register
>> config: x86_64-randconfig-s1-03230751 (attached as .config)
>> compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
>> reproduce:
>>         git checkout 145307460ba9c11489807de7acd3f4c7395f60b7
>>         # save the attached .config to linux build tree
>>         make ARCH=x86_64 
>> 
>> All error/warnings (new ones prefixed by >>):
>> 
>>    drivers/net/ethernet/mellanox/mlxsw/spectrum.c: In function 'mlxsw_sp_resources_register':
>>>> drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3881:6: warning: passing argument 6 of 'devlink_resource_register' makes integer from pointer without a cast [-Wint-conversion]
>>          &kvd_size_params,
>>          ^
>>    In file included from drivers/net/ethernet/mellanox/mlxsw/core.h:47:0,
>>                     from drivers/net/ethernet/mellanox/mlxsw/spectrum.h:54,
>>                     from drivers/net/ethernet/mellanox/mlxsw/spectrum.c:64:
>>    include/net/devlink.h:560:1: note: expected 'u64 {aka long long unsigned int}' but argument is of type 'struct devlink_resource_size_params *'
>>     devlink_resource_register(struct devlink *devlink,
>>     ^~~~~~~~~~~~~~~~~~~~~~~~~
>
>I just did another full build (allmodconfig) on net-next and did not hit
>this error.

The "else branch" in "#if IS_ENABLED(CONFIG_NET_DEVLINK)" is the
problem:

static inline int
devlink_resource_register(struct devlink *devlink,
                          const char *resource_name,
                          bool top_hierarchy,
                          u64 resource_size,
                          u64 resource_id,
                          u64 parent_resource_id,
                          const struct devlink_resource_size_params *size_params,
                          const struct devlink_resource_ops *resource_ops)
{
        return 0;
}

^ permalink raw reply

* RE: [PATCH net 1/3] lan78xx: Set ASD in MAC_CR when EEE is enabled.
From: RaghuramChary.Jallipalli @ 2018-03-23  7:21 UTC (permalink / raw)
  To: sergei.shtylyov, davem; +Cc: netdev, UNGLinuxDriver, Woojung.Huh
In-Reply-To: <82838095-9790-c2f7-2065-94ec633ae7dd@cogentembedded.com>

Hi Sergei,

> Hello!
> 
>     Only stylistic comments.

Thanks for the comments. Will address them and submit in v1 patch.

Thanks,
-Raghu

^ permalink raw reply

* [PATCH net-next 0/2] kernel: add support to collect hardware logs in crash recovery kernel
From: Rahul Lakkireddy @ 2018-03-23  8:30 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	kexec-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: indranil-ut6Up61K2wZBDgjK7y7TUQ, nirranjan-ut6Up61K2wZBDgjK7y7TUQ,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	ganeshgr-ut6Up61K2wZBDgjK7y7TUQ, Rahul Lakkireddy,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn

On production servers running variety of workloads over time, kernel
panic can happen sporadically after days or even months. It is
important to collect as much debug logs as possible to root cause
and fix the problem, that may not be easy to reproduce. Snapshot of
underlying hardware/firmware state (like register dump, firmware
logs, adapter memory, etc.), at the time of kernel panic will be very
helpful while debugging the culprit device driver.

This series of patches add new generic framework that enable device
drivers to collect device specific snapshot of the hardware/firmware
state of the underlying device in the crash recovery kernel. In crash
recovery kernel, the collected logs are exposed via /sys/kernel/crashdd/
directory, which is copied by user space scripts for post-analysis.

A kernel module crashdd is newly added. In crash recovery kernel,
crashdd exposes /sys/kernel/crashdd/ directory containing device
specific hardware/firmware logs.

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
as follows:

1. During probe (before hardware is initialized), device drivers
register to the crashdd module (via crashdd_add_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. Crashdd creates a driver's directory under /sys/kernel/crashdd/<driver>.
Then, it allocates the buffer with requested size and invokes the
device driver's registered callback function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to crashdd.

4. Crashdd exposes the buffer as a file via
/sys/kernel/crashdd/<driver>/<dump_file>.

5. User space script (/usr/lib/kdump/kdump-lib-initramfs.sh) copies
the entire /sys/kernel/crashdd/ directory to /var/crash/ directory.

Patch 1 adds crashdd module to allow drivers to register callback to
collect the device specific hardware/firmware logs.  The module also
exports /sys/kernel/crashdd/ directory containing the hardware/firmware
logs.

Patch 2 shows a cxgb4 driver example using the API to collect
hardware/firmware logs in crash recovery kernel, before hardware is
initialized.  The logs for the devices are made available under
/sys/kernel/crashdd/cxgb4/ directory.

Thanks,
Rahul

RFC v1: https://lkml.org/lkml/2018/3/2/542
RFC v2: https://lkml.org/lkml/2018/3/16/326

---
Changes since rfc v2:
- Moved exporting crashdd from procfs to sysfs. Suggested by
  Stephen Hemminger <stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ@public.gmane.org>
- Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
- Replaced all proc API with sysfs API and updated comments.
- Calling driver callback before creating the binary file under
  crashdd sysfs.
- Changed binary dump file permission from S_IRUSR to S_IRUGO.
- Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.

rfc v2:
- Collecting logs in 2nd kernel instead of during kernel panic.
  Suggested by Eric Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>.
- Added new crashdd module that exports /proc/crashdd/ containing
  driver's registered hardware/firmware logs in patch 1.
- Replaced the API to allow drivers to register their hardware/firmware
  log collect routine in crash recovery kernel in patch 1.
- Updated patch 2 to use the new API in patch 1.

Rahul Lakkireddy (2):
  fs/crashdd: add API to collect hardware dump in second kernel
  cxgb4: collect hardware dump in second kernel

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |   4 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c |  25 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |   3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  |  12 ++
 fs/Kconfig                                       |   1 +
 fs/Makefile                                      |   1 +
 fs/crashdd/Kconfig                               |  10 +
 fs/crashdd/Makefile                              |   3 +
 fs/crashdd/crashdd.c                             | 234 +++++++++++++++++++++++
 fs/crashdd/crashdd_internal.h                    |  24 +++
 include/linux/crashdd.h                          |  24 +++
 11 files changed, 341 insertions(+)
 create mode 100644 fs/crashdd/Kconfig
 create mode 100644 fs/crashdd/Makefile
 create mode 100644 fs/crashdd/crashdd.c
 create mode 100644 fs/crashdd/crashdd_internal.h
 create mode 100644 include/linux/crashdd.h

-- 
2.14.1

^ permalink raw reply

* [PATCH net-next 1/2] fs/crashdd: add API to collect hardware dump in second kernel
From: Rahul Lakkireddy @ 2018-03-23  8:31 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	kexec-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: indranil-ut6Up61K2wZBDgjK7y7TUQ, nirranjan-ut6Up61K2wZBDgjK7y7TUQ,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	ganeshgr-ut6Up61K2wZBDgjK7y7TUQ, Rahul Lakkireddy,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn
In-Reply-To: <cover.1521793455.git.rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

Add a new module crashdd that exports the /sys/kernel/crashdd/
directory in second kernel, containing collected hardware/firmware
dumps.

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
as follows:

1. During probe (before hardware is initialized), device drivers
register to the crashdd module (via crashdd_add_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. Crashdd creates a driver's directory under
/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
requested size and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to crashdd.

4. Crashdd exposes the buffer as a binary file via
/sys/kernel/crashdd/<driver>/<dump_file>.

Suggested-by: Eric Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>.
Suggested-by: Stephen Hemminger <stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ@public.gmane.org>
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Ganesh Goudar <ganeshgr-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
Changes since rfc v2:
- Moved exporting crashdd from procfs to sysfs.  Suggested by
  Stephen Hemminger <stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ@public.gmane.org>
- Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
- Replaced all proc API with sysfs API and updated comments.
- Calling driver callback before creating the binary file under
  crashdd sysfs.
- Changed binary dump file permission from S_IRUSR to S_IRUGO.
- Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.

rfc v2:
- Collecting logs in 2nd kernel instead of during kernel panic.
  Suggested by Eric Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>.
- Patch added in this series.

 fs/Kconfig                    |   1 +
 fs/Makefile                   |   1 +
 fs/crashdd/Kconfig            |  10 ++
 fs/crashdd/Makefile           |   3 +
 fs/crashdd/crashdd.c          | 234 ++++++++++++++++++++++++++++++++++++++++++
 fs/crashdd/crashdd_internal.h |  24 +++++
 include/linux/crashdd.h       |  24 +++++
 7 files changed, 297 insertions(+)
 create mode 100644 fs/crashdd/Kconfig
 create mode 100644 fs/crashdd/Makefile
 create mode 100644 fs/crashdd/crashdd.c
 create mode 100644 fs/crashdd/crashdd_internal.h
 create mode 100644 include/linux/crashdd.h

diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a86d965..aae1c55a7dad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -208,6 +208,7 @@ config ARCH_HAS_GIGANTIC_PAGE
 
 source "fs/configfs/Kconfig"
 source "fs/efivarfs/Kconfig"
+source "fs/crashdd/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index add789ea270a..ff398a44f611 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -128,3 +128,4 @@ obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_CRASH_DEVICE_DUMP)	+= crashdd/
diff --git a/fs/crashdd/Kconfig b/fs/crashdd/Kconfig
new file mode 100644
index 000000000000..5db9c7c98c17
--- /dev/null
+++ b/fs/crashdd/Kconfig
@@ -0,0 +1,10 @@
+config CRASH_DEVICE_DUMP
+	bool "Crash Kernel Device Hardware/Firmware Logs"
+	depends on SYSFS && CRASH_DUMP
+	default y
+	---help---
+	  Device drivers can collect the device specific snapshot of
+	  their hardware or firmware before they are initialized in
+	  crash recovery kernel. If you say Y here a tree of device
+	  specific dumps will be made available under /sys/kernel/crashdd/
+	  directory.
diff --git a/fs/crashdd/Makefile b/fs/crashdd/Makefile
new file mode 100644
index 000000000000..8dbf946c0ea4
--- /dev/null
+++ b/fs/crashdd/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := crashdd.o
diff --git a/fs/crashdd/crashdd.c b/fs/crashdd/crashdd.c
new file mode 100644
index 000000000000..73882ff7722e
--- /dev/null
+++ b/fs/crashdd/crashdd.c
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
+#include <linux/crashdd.h>
+
+#include "crashdd_internal.h"
+
+static LIST_HEAD(crashdd_list);
+static DEFINE_MUTEX(crashdd_mutex);
+
+#define CRASHDD_SYSFS_MODE 444 /* S_IRUGO */
+static struct kobject *crashdd_kobj;
+
+static ssize_t crashdd_read(struct file *filp, struct kobject *kobj,
+			    struct bin_attribute *bin_attr,
+			    char *buf, loff_t fpos, size_t count)
+{
+	struct crashdd_dump_node *dump = bin_attr->private;
+
+	memcpy(buf, dump->buf + fpos, count);
+	return count;
+}
+
+static struct kobject *crashdd_mkdir(const char *name)
+{
+	return kobject_create_and_add(name, crashdd_kobj);
+}
+
+static int crashdd_add_file(struct kobject *kobj, const char *name,
+			    struct crashdd_dump_node *dump)
+{
+	dump->bin_attr.attr.name = name;
+	dump->bin_attr.attr.mode = CRASHDD_SYSFS_MODE;
+	dump->bin_attr.size = dump->size;
+	dump->bin_attr.read = crashdd_read;
+	dump->bin_attr.private = dump;
+
+	return sysfs_create_bin_file(kobj, &dump->bin_attr);
+}
+
+static void crashdd_rmdir(struct kobject *kobj)
+{
+	kobject_put(kobj);
+}
+
+/**
+ * crashdd_init_driver - create a sysfs driver entry.
+ * @name: Name of the directory.
+ *
+ * Creates a directory under /sys/kernel/crashdd/ with @name.  Allocates
+ * and saves the sysfs entry.  The sysfs entry is added to the global
+ * list and then returned to the caller. On failure, returns NULL.
+ */
+static struct crashdd_driver_node *crashdd_init_driver(const char *name)
+{
+	struct crashdd_driver_node *node;
+
+	node = vzalloc(sizeof(*node));
+	if (!node)
+		return NULL;
+
+	/* Create a driver's directory under /sys/kernel/crashdd/ */
+	node->kobj = crashdd_mkdir(name);
+	if (!node->kobj) {
+		vfree(node);
+		return NULL;
+	}
+
+	atomic_set(&node->refcnt, 1);
+
+	/* Initialize the list of dumps that go under this driver's
+	 * directory.
+	 */
+	INIT_LIST_HEAD(&node->dump_list);
+
+	/* Add the driver's entry to global list */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&node->list, &crashdd_list);
+	mutex_unlock(&crashdd_mutex);
+
+	return node;
+}
+
+/**
+ * crashdd_get_driver - get an exisiting sysfs driver entry.
+ * @name: Name of the directory.
+ *
+ * Searches and fetches a sysfs entry having @name.  If @name is
+ * found, then the reference count is incremented and the entry
+ * is returned.  If @name is not found, NULL is returned.
+ */
+static struct crashdd_driver_node *crashdd_get_driver(const char *name)
+{
+	struct crashdd_driver_node *node;
+	int found = 0;
+
+	/* Search for an existing driver sysfs entry having @name */
+	mutex_lock(&crashdd_mutex);
+	list_for_each_entry(node, &crashdd_list, list) {
+		if (!strcmp(node->kobj->name, name)) {
+			atomic_inc(&node->refcnt);
+			found = 1;
+			break;
+		}
+	}
+	mutex_unlock(&crashdd_mutex);
+
+	if (found)
+		return node;
+
+	/* No driver with @name found */
+	return NULL;
+}
+
+/**
+ * crashdd_put_driver - put an exisiting sysfs driver entry.
+ * @node: driver sysfs entry.
+ *
+ * Decrement @node reference count.  If there are no dumps left under it,
+ * delete the sysfs directory and remove it from the global list.
+ */
+static void crashdd_put_driver(struct crashdd_driver_node *node)
+{
+	mutex_lock(&crashdd_mutex);
+	if (atomic_dec_and_test(&node->refcnt)) {
+		/* Delete @node driver entry if it has no dumps under it */
+		crashdd_rmdir(node->kobj);
+		list_del(&node->list);
+	}
+	mutex_unlock(&crashdd_mutex);
+}
+
+/**
+ * crashdd_add_dump - Allocate a directory under /sys/kernel/crashdd/ and
+ * add the dump to it.
+ * @driver_name: directory name under which the dump should be added.
+ * @data: dump info.
+ *
+ * Search for /sys/kernel/crashdd/<@driver_name>/ directory.  If not found,
+ * allocate a new directory under /sys/kernel/crashdd/ with @driver_name.
+ * Allocate the dump file's context and invoke the calling driver's dump
+ * collect routine.  Once collection is done, add the dump under
+ * /sys/kernel/crashdd/<@driver_name>/ directory.
+ */
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data)
+{
+	struct crashdd_driver_node *node;
+	struct crashdd_dump_node *dump;
+	void *buf = NULL;
+	int ret;
+
+	if (!driver_name || !strlen(driver_name) ||
+	    !data || !strlen(data->name) ||
+	    !data->crashdd_callback || !data->size)
+		return -EINVAL;
+
+	/* Get a driver sysfs entry with specified name. */
+	node = crashdd_get_driver(driver_name);
+	if (!node) {
+		/* No driver sysfs entry found with specified name.
+		 * So create a new one
+		 */
+		node = crashdd_init_driver(driver_name);
+		if (!node)
+			return -ENOMEM;
+	}
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Allocate buffer for driver's to write their dumps */
+	buf = vzalloc(data->size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Invoke the driver's dump collection routing */
+	ret = data->crashdd_callback(data, buf);
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data->size;
+
+	/* Add a binary file under /sys/kernel/crashdd/@driver_name/ */
+	ret = crashdd_add_file(node->kobj, data->name, dump);
+	if (ret)
+		goto out_err;
+
+	/* Add the dump to driver sysfs list */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&dump->list, &node->dump_list);
+	atomic_inc(&node->refcnt);
+	mutex_unlock(&crashdd_mutex);
+
+	/* Return back the driver sysfs reference */
+	crashdd_put_driver(node);
+	return 0;
+
+out_err:
+	if (buf)
+		vfree(buf);
+
+	if (dump)
+		vfree(dump);
+
+	crashdd_put_driver(node);
+	return ret;
+}
+EXPORT_SYMBOL(crashdd_add_dump);
+
+/* Init function for crash device dump module. */
+static int __init crashdd_init(void)
+{
+	/*
+	 * Only export this directory in 2nd kernel.
+	 */
+	if (!is_kdump_kernel())
+		return 0;
+
+	/* Create /sys/kernel/crashdd/ directory */
+	crashdd_kobj = kobject_create_and_add("crashdd", kernel_kobj);
+	if (!crashdd_kobj)
+		return -ENOMEM;
+
+	return 0;
+}
+fs_initcall(crashdd_init);
diff --git a/fs/crashdd/crashdd_internal.h b/fs/crashdd/crashdd_internal.h
new file mode 100644
index 000000000000..9162d1a4264b
--- /dev/null
+++ b/fs/crashdd/crashdd_internal.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#ifndef CRASH_DEVICE_DUMP_INTERNAL_H
+#define CRASH_DEVICE_DUMP_INTERNAL_H
+
+/* Binary dump file's context internal to crashdd */
+struct crashdd_dump_node {
+	/* Pointer to list of dumps under the driver sysfs entry */
+	struct list_head list;
+	void *buf;                     /* Buffer containing device's dump */
+	unsigned long size;            /* Size of the buffer */
+	struct bin_attribute bin_attr; /* Binary dump file's attributes */
+};
+
+/* Driver sysfs entry internal to crashdd */
+struct crashdd_driver_node {
+	/* Pointer to global list of driver sysfs entries */
+	struct list_head list;
+	struct list_head dump_list; /* List of dumps under this driver */
+	atomic_t refcnt;            /* Number of dumps under this directory */
+	struct kobject *kobj;       /* Pointer to driver sysfs kobject */
+};
+#endif /* CRASH_DEVICE_DUMP_INTERNAL_H */
diff --git a/include/linux/crashdd.h b/include/linux/crashdd.h
new file mode 100644
index 000000000000..edaba8424019
--- /dev/null
+++ b/include/linux/crashdd.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#ifndef CRASH_DEVICE_DUMP_H
+#define CRASH_DEVICE_DUMP_H
+
+/* Max dump name length */
+#define CRASHDD_NAME_LENGTH 32
+
+/* Device Dump information to be filled by drivers */
+struct crashdd_data {
+	char name[CRASHDD_NAME_LENGTH]; /* Unique name of the dump */
+	unsigned long size;             /* Size of the dump */
+	/* Driver's registered callback to be invoked to collect dump */
+	int (*crashdd_callback)(struct crashdd_data *data, void *buf);
+};
+
+#ifdef CONFIG_CRASH_DEVICE_DUMP
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data);
+#else
+#define crashdd_add_dump(x, y) 0
+#endif /* CONFIG_CRASH_DEVICE_DUMP */
+
+#endif /* CRASH_DEVICE_DUMP_H */
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next 2/2] cxgb4: collect hardware dump in second kernel
From: Rahul Lakkireddy @ 2018-03-23  8:31 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	kexec-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: indranil-ut6Up61K2wZBDgjK7y7TUQ, nirranjan-ut6Up61K2wZBDgjK7y7TUQ,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	ganeshgr-ut6Up61K2wZBDgjK7y7TUQ, Rahul Lakkireddy,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn
In-Reply-To: <cover.1521793455.git.rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

Register callback to collect hardware/firmware dumps in second kernel
before hardware/firmware is initialized.  The dumps for each device
will be available under /sys/kernel/crashdd/cxgb4/ directory in second
kernel.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Ganesh Goudar <ganeshgr-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
Changes since rfc v2:
- Update comments and commit message for sysfs change.

rfc v2:
- Updated dump registration to the new API in patch 1.

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |  4 ++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 ++++++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |  3 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  | 12 ++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 688f95440af2..79a123bfb628 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -50,6 +50,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_classify.h>
+#include <linux/crashdd.h>
 #include <asm/io.h>
 #include "t4_chip_type.h"
 #include "cxgb4_uld.h"
@@ -964,6 +965,9 @@ struct adapter {
 	struct hma_data hma;
 
 	struct srq_data *srq;
+
+	/* Dump buffer for collecting logs in kdump kernel */
+	struct crashdd_data crashdd_buf;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 143686c60234..ce9f544781af 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -488,3 +488,28 @@ void cxgb4_init_ethtool_dump(struct adapter *adapter)
 	adapter->eth_dump.version = adapter->params.fw_vers;
 	adapter->eth_dump.len = 0;
 }
+
+static int cxgb4_cudbg_crashdd_collect(struct crashdd_data *data, void *buf)
+{
+	struct adapter *adap = container_of(data, struct adapter, crashdd_buf);
+	u32 len = data->size;
+
+	return cxgb4_cudbg_collect(adap, buf, &len, CXGB4_ETH_DUMP_ALL);
+}
+
+int cxgb4_cudbg_crashdd_add_dump(struct adapter *adap)
+{
+	struct crashdd_data *data = &adap->crashdd_buf;
+	u32 len;
+
+	len = sizeof(struct cudbg_hdr) +
+	      sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY;
+	len += CUDBG_DUMP_BUFF_SIZE;
+
+	data->size = len;
+	snprintf(data->name, sizeof(data->name), "%s_%s", cxgb4_driver_name,
+		 adap->name);
+	data->crashdd_callback = cxgb4_cudbg_crashdd_collect;
+
+	return crashdd_add_dump(cxgb4_driver_name, data);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
index ce1ac9a1c878..095c6f04357e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
@@ -41,8 +41,11 @@ enum CXGB4_ETHTOOL_DUMP_FLAGS {
 	CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */
 };
 
+#define CXGB4_ETH_DUMP_ALL (CXGB4_ETH_DUMP_MEM | CXGB4_ETH_DUMP_HW)
+
 u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag);
 int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size,
 			u32 flag);
 void cxgb4_init_ethtool_dump(struct adapter *adapter);
+int cxgb4_cudbg_crashdd_add_dump(struct adapter *adap);
 #endif /* __CXGB4_CUDBG_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 99c9b88d6d34..c4e0e4705cda 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5526,6 +5526,18 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto out_free_adapter;
 
+	if (is_kdump_kernel()) {
+		/* Collect hardware state and append to
+		 * /sys/kernel/crashdd/cxgb4/ directory
+		 */
+		err = cxgb4_cudbg_crashdd_add_dump(adapter);
+		if (err) {
+			dev_warn(adapter->pdev_dev,
+				 "Fail collecting crash device dump, err: %d. Continuing\n",
+				 err);
+			err = 0;
+		}
+	}
 
 	if (!is_t4(adapter->params.chip)) {
 		s_qpp = (QUEUESPERPAGEPF0_S +
-- 
2.14.1

^ permalink raw reply related

* [PATCH iproute2] ss: Fix rendering of continuous output (-E, --events)
From: Stefano Brivio @ 2018-03-23  8:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Roman Mashak, netdev

Roman Mashak reported that ss currently shows no output when it
should continuously report information about terminated sockets
(-E, --events switch).

This happens because I missed this case in 691bd854bf4a ("ss:
Buffer raw fields first, then render them as a table") and the
rendering function is simply not called.

To fix this, we need to:

- call render() every time we need to display new socket events
  from generic_show_sock(), which is only used to follow events.
  Always call it even if specific socket display functions
  return errors to ensure we clean up buffers

- get the screen width every time we have new events to display,
  thus factor out getting the screen width from main() into a
  function we'll call whenever we calculate columns width

- reset the current field pointer after rendering, more output
  might come after render() is called

Reported-by: Roman Mashak <mrv@mojatatu.com>
Fixes: 691bd854bf4a ("ss: Buffer raw fields first, then render them as a table")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 misc/ss.c | 61 ++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index e087bef739b0..6338820bf4a0 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1106,15 +1106,33 @@ static void buf_free_all(void)
 	buffer.head = NULL;
 }
 
+/* Get current screen width, default to 80 columns if TIOCGWINSZ fails */
+static int render_screen_width(void)
+{
+	int width = 80;
+
+	if (isatty(STDOUT_FILENO)) {
+		struct winsize w;
+
+		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) {
+			if (w.ws_col > 0)
+				width = w.ws_col;
+		}
+	}
+
+	return width;
+}
+
 /* Calculate column width from contents length. If columns don't fit on one
  * line, break them into the least possible amount of lines and keep them
  * aligned across lines. Available screen space is equally spread between fields
  * as additional spacing.
  */
-static void render_calc_width(int screen_width)
+static void render_calc_width(void)
 {
-	int first, len = 0, linecols = 0;
+	int screen_width = render_screen_width();
 	struct column *c, *eol = columns - 1;
+	int first, len = 0, linecols = 0;
 
 	/* First pass: set width for each column to measured content length */
 	for (first = 1, c = columns; c - columns < COL_MAX; c++) {
@@ -1195,7 +1213,7 @@ newline:
 }
 
 /* Render buffered output with spacing and delimiters, then free up buffers */
-static void render(int screen_width)
+static void render(void)
 {
 	struct buf_token *token;
 	int printed, line_started = 0;
@@ -1209,7 +1227,7 @@ static void render(int screen_width)
 	/* Ensure end alignment of last token, it wasn't necessarily flushed */
 	buffer.tail->end += buffer.cur->len % 2;
 
-	render_calc_width(screen_width);
+	render_calc_width();
 
 	/* Rewind and replay */
 	buffer.tail = buffer.head;
@@ -1245,6 +1263,7 @@ static void render(int screen_width)
 	}
 
 	buf_free_all();
+	current_field = columns;
 }
 
 static void sock_state_print(struct sockstat *s)
@@ -4264,23 +4283,33 @@ static int generic_show_sock(const struct sockaddr_nl *addr,
 {
 	struct sock_diag_msg *r = NLMSG_DATA(nlh);
 	struct inet_diag_arg inet_arg = { .f = arg, .protocol = IPPROTO_MAX };
+	int ret;
 
 	switch (r->sdiag_family) {
 	case AF_INET:
 	case AF_INET6:
 		inet_arg.rth = inet_arg.f->rth_for_killing;
-		return show_one_inet_sock(addr, nlh, &inet_arg);
+		ret = show_one_inet_sock(addr, nlh, &inet_arg);
+		break;
 	case AF_UNIX:
-		return unix_show_sock(addr, nlh, arg);
+		ret = unix_show_sock(addr, nlh, arg);
+		break;
 	case AF_PACKET:
-		return packet_show_sock(addr, nlh, arg);
+		ret = packet_show_sock(addr, nlh, arg);
+		break;
 	case AF_NETLINK:
-		return netlink_show_sock(addr, nlh, arg);
+		ret = netlink_show_sock(addr, nlh, arg);
+		break;
 	case AF_VSOCK:
-		return vsock_show_sock(addr, nlh, arg);
+		ret = vsock_show_sock(addr, nlh, arg);
+		break;
 	default:
-		return -1;
+		ret = -1;
 	}
+
+	render();
+
+	return ret;
 }
 
 static int handle_follow_request(struct filter *f)
@@ -4647,7 +4676,6 @@ int main(int argc, char *argv[])
 	FILE *filter_fp = NULL;
 	int ch;
 	int state_filter = 0;
-	int screen_width = 80;
 
 	while ((ch = getopt_long(argc, argv,
 				 "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHS",
@@ -4949,15 +4977,6 @@ int main(int argc, char *argv[])
 	if (!(current_filter.states & (current_filter.states - 1)))
 		columns[COL_STATE].disabled = 1;
 
-	if (isatty(STDOUT_FILENO)) {
-		struct winsize w;
-
-		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) {
-			if (w.ws_col > 0)
-				screen_width = w.ws_col;
-		}
-	}
-
 	if (show_header)
 		print_header();
 
@@ -4988,7 +5007,7 @@ int main(int argc, char *argv[])
 	if (show_users || show_proc_ctx || show_sock_ctx)
 		user_ent_destroy();
 
-	render(screen_width);
+	render();
 
 	return 0;
 }
-- 
2.15.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox