Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next  08/11] ixgbevf: implement CONFIG_NET_RX_BUSY_POLL
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch enables CONFIG_NET_RX_BUSY_POLL support in the VF code. This enables
sockets which have enabled the SO_BUSY_POLL socket option to use the
ndo_busy_poll_recv operation which could result in lower latency, at the cost
of higher CPU utilization, and increased power usage. This support is similar
to how the ixgbe driver works.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      | 109 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  68 ++++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index d7837dcc..85c7560 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -38,6 +38,10 @@
 
 #include "vf.h"
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#include <net/busy_poll.h>
+#endif
+
 /* wrapper around a pointer to a socket buffer,
  * so a DMA handle can be stored along with the buffer */
 struct ixgbevf_tx_buffer {
@@ -145,7 +149,112 @@ struct ixgbevf_q_vector {
 	struct napi_struct napi;
 	struct ixgbevf_ring_container rx, tx;
 	char name[IFNAMSIZ + 9];
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int state;
+#define IXGBEVF_QV_STATE_IDLE		0
+#define IXGBEVF_QV_STATE_NAPI		1    /* NAPI owns this QV */
+#define IXGBEVF_QV_STATE_POLL		2    /* poll owns this QV */
+#define IXGBEVF_QV_STATE_DISABLED	4    /* QV is disabled */
+#define IXGBEVF_QV_OWNED (IXGBEVF_QV_STATE_NAPI | IXGBEVF_QV_STATE_POLL)
+#define IXGBEVF_QV_LOCKED (IXGBEVF_QV_OWNED | IXGBEVF_QV_STATE_DISABLED)
+#define IXGBEVF_QV_STATE_NAPI_YIELD	8    /* NAPI yielded this QV */
+#define IXGBEVF_QV_STATE_POLL_YIELD	16   /* poll yielded this QV */
+#define IXGBEVF_QV_YIELD (IXGBEVF_QV_STATE_NAPI_YIELD | IXGBEVF_QV_STATE_POLL_YIELD)
+#define IXGBEVF_QV_USER_PEND (IXGBEVF_QV_STATE_POLL | IXGBEVF_QV_STATE_POLL_YIELD)
+	spinlock_t lock;
+#endif /* CONFIG_NET_RX_BUSY_POLL */
 };
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void ixgbevf_qv_init_lock(struct ixgbevf_q_vector *q_vector)
+{
+
+	spin_lock_init(&q_vector->lock);
+	q_vector->state = IXGBEVF_QV_STATE_IDLE;
+}
+
+/* called from the device poll routine to get ownership of a q_vector */
+static inline bool ixgbevf_qv_lock_napi(struct ixgbevf_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock_bh(&q_vector->lock);
+	if (q_vector->state & IXGBEVF_QV_LOCKED) {
+		WARN_ON(q_vector->state & IXGBEVF_QV_STATE_NAPI);
+		q_vector->state |= IXGBEVF_QV_STATE_NAPI_YIELD;
+		rc = false;
+	} else {
+		/* we don't care if someone yielded */
+		q_vector->state = IXGBEVF_QV_STATE_NAPI;
+	}
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* returns true is someone tried to get the qv while napi had it */
+static inline bool ixgbevf_qv_unlock_napi(struct ixgbevf_q_vector *q_vector)
+{
+	int rc = false;
+	spin_lock_bh(&q_vector->lock);
+	WARN_ON(q_vector->state & (IXGBEVF_QV_STATE_POLL |
+				   IXGBEVF_QV_STATE_NAPI_YIELD));
+
+	if (q_vector->state & IXGBEVF_QV_STATE_POLL_YIELD)
+		rc = true;
+	/* reset state to idle, unless QV is disabled */
+	q_vector->state &= IXGBEVF_QV_STATE_DISABLED;
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* called from ixgbevf_low_latency_poll() */
+static inline bool ixgbevf_qv_lock_poll(struct ixgbevf_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock_bh(&q_vector->lock);
+	if ((q_vector->state & IXGBEVF_QV_LOCKED)) {
+		q_vector->state |= IXGBEVF_QV_STATE_POLL_YIELD;
+		rc = false;
+	} else {
+		/* preserve yield marks */
+		q_vector->state |= IXGBEVF_QV_STATE_POLL;
+	}
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the qv while it was locked */
+static inline bool ixgbevf_qv_unlock_poll(struct ixgbevf_q_vector *q_vector)
+{
+	int rc = false;
+	spin_lock_bh(&q_vector->lock);
+	WARN_ON(q_vector->state & (IXGBEVF_QV_STATE_NAPI));
+
+	if (q_vector->state & IXGBEVF_QV_STATE_POLL_YIELD)
+		rc = true;
+	/* reset state to idle, unless QV is disabled */
+	q_vector->state &= IXGBEVF_QV_STATE_DISABLED;
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+/* true if a socket is polling, even if it did not get the lock */
+static inline bool ixgbevf_qv_busy_polling(struct ixgbevf_q_vector *q_vector)
+{
+	WARN_ON(!(q_vector->state & IXGBEVF_QV_OWNED));
+	return q_vector->state & IXGBEVF_QV_USER_PEND;
+}
+
+/* false if QV is currently owned */
+static inline bool ixgbevf_qv_disable(struct ixgbevf_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock_bh(&q_vector->lock);
+	if (q_vector->state & IXGBEVF_QV_OWNED)
+		rc = false;
+	spin_unlock_bh(&q_vector->lock);
+	return rc;
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
  * microsecond values for various ITR rates shifted by 2 to fit itr register
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 06e29bd..bc03a2e 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -310,6 +310,16 @@ static void ixgbevf_rx_skb(struct ixgbevf_q_vector *q_vector,
 			   struct sk_buff *skb, u8 status,
 			   union ixgbe_adv_rx_desc *rx_desc)
 {
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	skb_mark_napi_id(skb, &q_vector->napi);
+
+	if (ixgbevf_qv_busy_polling(q_vector)) {
+		netif_receive_skb(skb);
+		/* exit early if we busy polled */
+		return;
+	}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 	ixgbevf_receive_skb(q_vector, skb, status, rx_desc);
 }
 
@@ -563,6 +573,11 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget)
 	ixgbevf_for_each_ring(ring, q_vector->tx)
 		clean_complete &= ixgbevf_clean_tx_irq(q_vector, ring);
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (!ixgbevf_qv_lock_napi(q_vector))
+		return budget;
+#endif
+
 	/* attempt to distribute budget to each queue fairly, but don't allow
 	 * the budget to go below 1 because we'll exit polling */
 	if (q_vector->rx.count > 1)
@@ -577,6 +592,10 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget)
 				   < per_ring_budget);
 	adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL;
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	ixgbevf_qv_unlock_napi(q_vector);
+#endif
+
 	/* If all work not completed, return budget and keep polling */
 	if (!clean_complete)
 		return budget;
@@ -611,6 +630,34 @@ void ixgbevf_write_eitr(struct ixgbevf_q_vector *q_vector)
 	IXGBE_WRITE_REG(hw, IXGBE_VTEITR(v_idx), itr_reg);
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+/* must be called with local_bh_disable()d */
+static int ixgbevf_busy_poll_recv(struct napi_struct *napi)
+{
+	struct ixgbevf_q_vector *q_vector =
+			container_of(napi, struct ixgbevf_q_vector, napi);
+	struct ixgbevf_adapter *adapter = q_vector->adapter;
+	struct ixgbevf_ring  *ring;
+	int found = 0;
+
+	if (test_bit(__IXGBEVF_DOWN, &adapter->state))
+		return LL_FLUSH_FAILED;
+
+	if (!ixgbevf_qv_lock_poll(q_vector))
+		return LL_FLUSH_BUSY;
+
+	ixgbevf_for_each_ring(ring, q_vector->rx) {
+		found = ixgbevf_clean_rx_irq(q_vector, ring, 4);
+		if (found)
+			break;
+	}
+
+	ixgbevf_qv_unlock_poll(q_vector);
+
+	return found;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 /**
  * ixgbevf_configure_msix - Configure MSI-X hardware
  * @adapter: board private structure
@@ -1297,6 +1344,9 @@ static void ixgbevf_napi_enable_all(struct ixgbevf_adapter *adapter)
 
 	for (q_idx = 0; q_idx < q_vectors; q_idx++) {
 		q_vector = adapter->q_vector[q_idx];
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		ixgbevf_qv_init_lock(adapter->q_vector[q_idx]);
+#endif
 		napi_enable(&q_vector->napi);
 	}
 }
@@ -1310,6 +1360,12 @@ static void ixgbevf_napi_disable_all(struct ixgbevf_adapter *adapter)
 	for (q_idx = 0; q_idx < q_vectors; q_idx++) {
 		q_vector = adapter->q_vector[q_idx];
 		napi_disable(&q_vector->napi);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		while (!ixgbevf_qv_disable(adapter->q_vector[q_idx])) {
+			pr_info("QV %d locked\n", q_idx);
+			usleep_range(1000, 20000);
+		}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
 	}
 }
 
@@ -1960,6 +2016,9 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter)
 		q_vector->v_idx = q_idx;
 		netif_napi_add(adapter->netdev, &q_vector->napi,
 			       ixgbevf_poll, 64);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		napi_hash_add(&q_vector->napi);
+#endif
 		adapter->q_vector[q_idx] = q_vector;
 	}
 
@@ -1969,6 +2028,9 @@ err_out:
 	while (q_idx) {
 		q_idx--;
 		q_vector = adapter->q_vector[q_idx];
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		napi_hash_del(&q_vector->napi);
+#endif
 		netif_napi_del(&q_vector->napi);
 		kfree(q_vector);
 		adapter->q_vector[q_idx] = NULL;
@@ -1992,6 +2054,9 @@ static void ixgbevf_free_q_vectors(struct ixgbevf_adapter *adapter)
 		struct ixgbevf_q_vector *q_vector = adapter->q_vector[q_idx];
 
 		adapter->q_vector[q_idx] = NULL;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		napi_hash_del(&q_vector->napi);
+#endif
 		netif_napi_del(&q_vector->napi);
 		kfree(q_vector);
 	}
@@ -3323,6 +3388,9 @@ static const struct net_device_ops ixgbevf_netdev_ops = {
 	.ndo_tx_timeout		= ixgbevf_tx_timeout,
 	.ndo_vlan_rx_add_vid	= ixgbevf_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= ixgbevf_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	.ndo_busy_poll		= ixgbevf_busy_poll_recv,
+#endif
 };
 
 static void ixgbevf_assign_netdev_ops(struct net_device *dev)
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  07/11] ixgbevf: have clean_rx_irq return total_rx_packets cleaned
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

Rather than return true/false indicating whether there was budget left, return
the total packets cleaned. This currently has no use, but will be used in a
following patch which enables CONFIG_NET_RX_BUSY_POLL support in order to track
how many packets were cleaned during the busy poll as part of the extended
statistics.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index a2cc6bb..06e29bd 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -410,9 +410,9 @@ static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter,
 	IXGBE_WRITE_REG(hw, IXGBE_VTEIMS, qmask);
 }
 
-static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
-				 struct ixgbevf_ring *rx_ring,
-				 int budget)
+static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
+				struct ixgbevf_ring *rx_ring,
+				int budget)
 {
 	struct ixgbevf_adapter *adapter = q_vector->adapter;
 	struct pci_dev *pdev = adapter->pdev;
@@ -540,7 +540,7 @@ next_desc:
 	q_vector->rx.total_packets += total_rx_packets;
 	q_vector->rx.total_bytes += total_rx_bytes;
 
-	return !!budget;
+	return total_rx_packets;
 }
 
 /**
@@ -572,8 +572,9 @@ static int ixgbevf_poll(struct napi_struct *napi, int budget)
 
 	adapter->flags |= IXGBE_FLAG_IN_NETPOLL;
 	ixgbevf_for_each_ring(ring, q_vector->rx)
-		clean_complete &= ixgbevf_clean_rx_irq(q_vector, ring,
-						       per_ring_budget);
+		clean_complete &= (ixgbevf_clean_rx_irq(q_vector, ring,
+							per_ring_budget)
+				   < per_ring_budget);
 	adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL;
 
 	/* If all work not completed, return budget and keep polling */
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  06/11] ixgbevf: add ixgbevf_rx_skb
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch adds ixgbevf_rx_skb in line with how ixgbe handles the variations on
how packets can be received. It will be extended in a following patch for
CONFIG_NET_RX_BUSY_POLL support.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 87279c8a..a2cc6bb 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -300,6 +300,20 @@ static void ixgbevf_receive_skb(struct ixgbevf_q_vector *q_vector,
 }
 
 /**
+ * ixgbevf_rx_skb - Helper function to determine proper Rx method
+ * @q_vector: structure containing interrupt and ring information
+ * @skb: packet to send up
+ * @status: hardware indication of status of receive
+ * @rx_desc: rx descriptor
+ **/
+static void ixgbevf_rx_skb(struct ixgbevf_q_vector *q_vector,
+			   struct sk_buff *skb, u8 status,
+			   union ixgbe_adv_rx_desc *rx_desc)
+{
+	ixgbevf_receive_skb(q_vector, skb, status, rx_desc);
+}
+
+/**
  * ixgbevf_rx_checksum - indicate in skb if hw indicated a good cksum
  * @ring: pointer to Rx descriptor ring structure
  * @status_err: hardware indication of status of receive
@@ -494,7 +508,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			goto next_desc;
 		}
 
-		ixgbevf_receive_skb(q_vector, skb, staterr, rx_desc);
+		ixgbevf_rx_skb(q_vector, skb, staterr, rx_desc);
 
 next_desc:
 		rx_desc->wb.upper.status_error = 0;
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  05/11] ixgbe: remove unnecessary duplication of PCIe bandwidth display
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch removes the unnecessary display of PCIe bandwidth twice. Since the
ixgbe_check_minimum_link does a better job, and ensures accurate detection on
even complex chains, this older check is no longer necessary.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 36 ++++++++++-----------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 9753c8a..a7d1a1c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7752,29 +7752,6 @@ skip_sriov:
 	if (ixgbe_pcie_from_parent(hw))
 		ixgbe_get_parent_bus_info(adapter);
 
-	/* print bus type/speed/width info */
-	e_dev_info("(PCI Express:%s:%s) %pM\n",
-		   (hw->bus.speed == ixgbe_bus_speed_8000 ? "8.0GT/s" :
-		    hw->bus.speed == ixgbe_bus_speed_5000 ? "5.0GT/s" :
-		    hw->bus.speed == ixgbe_bus_speed_2500 ? "2.5GT/s" :
-		    "Unknown"),
-		   (hw->bus.width == ixgbe_bus_width_pcie_x8 ? "Width x8" :
-		    hw->bus.width == ixgbe_bus_width_pcie_x4 ? "Width x4" :
-		    hw->bus.width == ixgbe_bus_width_pcie_x1 ? "Width x1" :
-		    "Unknown"),
-		   netdev->dev_addr);
-
-	err = ixgbe_read_pba_string_generic(hw, part_str, IXGBE_PBANUM_LENGTH);
-	if (err)
-		strncpy(part_str, "Unknown", IXGBE_PBANUM_LENGTH);
-	if (ixgbe_is_sfp(hw) && hw->phy.sfp_type != ixgbe_sfp_type_not_present)
-		e_dev_info("MAC: %d, PHY: %d, SFP+: %d, PBA No: %s\n",
-			   hw->mac.type, hw->phy.type, hw->phy.sfp_type,
-		           part_str);
-	else
-		e_dev_info("MAC: %d, PHY: %d, PBA No: %s\n",
-			   hw->mac.type, hw->phy.type, part_str);
-
 	/* calculate the expected PCIe bandwidth required for optimal
 	 * performance. Note that some older parts will never have enough
 	 * bandwidth due to being older generation PCIe parts. We clamp these
@@ -7790,6 +7767,19 @@ skip_sriov:
 	}
 	ixgbe_check_minimum_link(adapter, expected_gts);
 
+	err = ixgbe_read_pba_string_generic(hw, part_str, IXGBE_PBANUM_LENGTH);
+	if (err)
+		strncpy(part_str, "Unknown", IXGBE_PBANUM_LENGTH);
+	if (ixgbe_is_sfp(hw) && hw->phy.sfp_type != ixgbe_sfp_type_not_present)
+		e_dev_info("MAC: %d, PHY: %d, SFP+: %d, PBA No: %s\n",
+			   hw->mac.type, hw->phy.type, hw->phy.sfp_type,
+		           part_str);
+	else
+		e_dev_info("MAC: %d, PHY: %d, PBA No: %s\n",
+			   hw->mac.type, hw->phy.type, part_str);
+
+	e_dev_info("%pM\n", netdev->dev_addr);
+
 	/* reset the hardware with the new settings */
 	err = hw->mac.ops.start_hw(hw);
 	if (err == IXGBE_ERR_EEPROM_VERSION) {
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  03/11] ixgbe: fix qv_lock_napi call in ixgbe_napi_disable_all
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, gospo, sassmann, Eliezer Tamir,
	Alexander Duyck, Hyong-Youb Kim, Amir Vadai, Dmitry Kravkov,
	Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

ixgbe_napi_disable_all calls napi_disable on each queue, however the busy
polling code introduced a local_bh_disable()d context around the napi_disable.
The original author did not realize that napi_disable might sleep, which would
cause a sleep while atomic BUG. In addition, on a single processor system, the
ixgbe_qv_lock_napi loop shouldn't have to mdelay. This patch adds an
ixgbe_qv_disable along with a new IXGBE_QV_STATE_DISABLED bit, which it uses to
indicate to the poll and napi routines that the q_vector has been disabled. Now
the ixgbe_napi_disable_all function will wait until all pending work has been
finished and prevent any future work from being started.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Cc: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Cc: Alexander Duyck <alexander.duyck@intel.com>
Cc: Hyong-Youb Kim <hykim@myri.com>
Cc: Amir Vadai <amirv@mellanox.com>
Cc: Dmitry Kravkov <dmitry@broadcom.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      | 48 ++++++++++++++++++++-------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  6 ++--
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index dc1588e..f51fd1f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -369,11 +369,13 @@ struct ixgbe_q_vector {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int state;
 #define IXGBE_QV_STATE_IDLE        0
-#define IXGBE_QV_STATE_NAPI	   1    /* NAPI owns this QV */
-#define IXGBE_QV_STATE_POLL	   2    /* poll owns this QV */
-#define IXGBE_QV_LOCKED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
-#define IXGBE_QV_STATE_NAPI_YIELD  4    /* NAPI yielded this QV */
-#define IXGBE_QV_STATE_POLL_YIELD  8    /* poll yielded this QV */
+#define IXGBE_QV_STATE_NAPI	   1     /* NAPI owns this QV */
+#define IXGBE_QV_STATE_POLL	   2     /* poll owns this QV */
+#define IXGBE_QV_STATE_DISABLED	   4     /* QV is disabled */
+#define IXGBE_QV_OWNED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
+#define IXGBE_QV_LOCKED (IXGBE_QV_OWNED | IXGBE_QV_STATE_DISABLED)
+#define IXGBE_QV_STATE_NAPI_YIELD  8     /* NAPI yielded this QV */
+#define IXGBE_QV_STATE_POLL_YIELD  16    /* poll yielded this QV */
 #define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD)
 #define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD)
 	spinlock_t lock;
@@ -394,7 +396,7 @@ static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
 static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 {
 	int rc = true;
-	spin_lock(&q_vector->lock);
+	spin_lock_bh(&q_vector->lock);
 	if (q_vector->state & IXGBE_QV_LOCKED) {
 		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
 		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
@@ -405,7 +407,7 @@ static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 	} else
 		/* we don't care if someone yielded */
 		q_vector->state = IXGBE_QV_STATE_NAPI;
-	spin_unlock(&q_vector->lock);
+	spin_unlock_bh(&q_vector->lock);
 	return rc;
 }
 
@@ -413,14 +415,15 @@ static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
 {
 	int rc = false;
-	spin_lock(&q_vector->lock);
+	spin_lock_bh(&q_vector->lock);
 	WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL |
 			       IXGBE_QV_STATE_NAPI_YIELD));
 
 	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
 		rc = true;
-	q_vector->state = IXGBE_QV_STATE_IDLE;
-	spin_unlock(&q_vector->lock);
+	/* will reset state to idle, unless QV is disabled */
+	q_vector->state &= IXGBE_QV_STATE_DISABLED;
+	spin_unlock_bh(&q_vector->lock);
 	return rc;
 }
 
@@ -451,7 +454,8 @@ static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
 
 	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
 		rc = true;
-	q_vector->state = IXGBE_QV_STATE_IDLE;
+	/* will reset state to idle, unless QV is disabled */
+	q_vector->state &= IXGBE_QV_STATE_DISABLED;
 	spin_unlock_bh(&q_vector->lock);
 	return rc;
 }
@@ -459,9 +463,23 @@ static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
 /* true if a socket is polling, even if it did not get the lock */
 static inline bool ixgbe_qv_busy_polling(struct ixgbe_q_vector *q_vector)
 {
-	WARN_ON(!(q_vector->state & IXGBE_QV_LOCKED));
+	WARN_ON(!(q_vector->state & IXGBE_QV_OWNED));
 	return q_vector->state & IXGBE_QV_USER_PEND;
 }
+
+/* false if QV is currently owned */
+static inline bool ixgbe_qv_disable(struct ixgbe_q_vector *q_vector)
+{
+	int rc = true;
+	spin_lock_bh(&q_vector->lock);
+	if (q_vector->state & IXGBE_QV_OWNED)
+		rc = false;
+	q_vector->state |= IXGBE_QV_STATE_DISABLED;
+	spin_unlock_bh(&q_vector->lock);
+
+	return rc;
+}
+
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
 {
@@ -491,6 +509,12 @@ static inline bool ixgbe_qv_busy_polling(struct ixgbe_q_vector *q_vector)
 {
 	return false;
 }
+
+static inline bool ixgbe_qv_disable(struct ixgbe_q_vector *q_vector)
+{
+	return true;
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 #ifdef CONFIG_IXGBE_HWMON
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ce3eb60..ee90dfb 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3891,15 +3891,13 @@ static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter)
 {
 	int q_idx;
 
-	local_bh_disable(); /* for ixgbe_qv_lock_napi() */
 	for (q_idx = 0; q_idx < adapter->num_q_vectors; q_idx++) {
 		napi_disable(&adapter->q_vector[q_idx]->napi);
-		while (!ixgbe_qv_lock_napi(adapter->q_vector[q_idx])) {
+		while (!ixgbe_qv_disable(adapter->q_vector[q_idx])) {
 			pr_info("QV %d locked\n", q_idx);
-			mdelay(1);
+			usleep_range(1000, 20000);
 		}
 	}
-	local_bh_enable();
 }
 
 #ifdef CONFIG_IXGBE_DCB
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  04/11] ixgbe: show <2% for encoding loss on PCIe Gen3
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch updates the ixgbe_check_minimum_link function to correctly show that
there is some minor loss of encoding, even though we don't calculate it in the
max GT/s equation. It is small enough to not bother, but is better to report it
than not.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ee90dfb..9753c8a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -245,7 +245,7 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
 		max_gts = 4 * width;
 		break;
 	case PCIE_SPEED_8_0GT:
-		/* 128b/130b encoding only reduces throughput by 1% */
+		/* 128b/130b encoding reduces throughput by less than 2% */
 		max_gts = 8 * width;
 		break;
 	default:
@@ -263,7 +263,7 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
 		   width,
 		   (speed == PCIE_SPEED_2_5GT ? "20%" :
 		    speed == PCIE_SPEED_5_0GT ? "20%" :
-		    speed == PCIE_SPEED_8_0GT ? "N/a" :
+		    speed == PCIE_SPEED_8_0GT ? "<2%" :
 		    "Unknown"));
 
 	if (max_gts < expected_gts) {
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  02/11] net: add might_sleep() call to napi_disable
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, gospo, sassmann, Eliezer Tamir,
	Alexander Duyck, Hyong-Youb Kim, Amir Vadai, Dmitry Kravkov,
	Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

napi_disable uses an msleep() call to wait for outstanding napi work to be
finished after setting the disable bit. It does not always sleep incase there
was no outstanding work. This resulted in a rare bug in ixgbe_down operation
where a napi_disable call took place inside of a local_bh_disable()d context.
In order to enable easier detection of future sleep while atomic BUGs, this
patch adds a might_sleep() call, so that every use of napi_disable during
atomic context will be visible.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Cc: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Cc: Alexander Duyck <alexander.duyck@intel.com>
Cc: Hyong-Youb Kim <hykim@myri.com>
Cc: Amir Vadai <amirv@mellanox.com>
Cc: Dmitry Kravkov <dmitry@broadcom.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 27f62f7..cb1d918 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -483,6 +483,7 @@ void napi_hash_del(struct napi_struct *napi);
  */
 static inline void napi_disable(struct napi_struct *n)
 {
+	might_sleep();
 	set_bit(NAPI_STATE_DISABLE, &n->state);
 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 		msleep(1);
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  01/11] vxlan: Have the NIC drivers do less work for offloads
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Joseph Gasparakis, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1383048151-15002-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Joseph Gasparakis <joseph.gasparakis@intel.com>

This patch removes the burden from the NIC drivers to check if the
vxlan driver is enabled in the kernel and also makes available
the vxlan headrooms to them.

Signed-off-by: Joseph Gasparakis <joseph.gasparakis@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/vxlan.c |  4 ----
 include/net/vxlan.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 75a3a74..24260ce 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -60,10 +60,6 @@
 
 #define VXLAN_N_VID	(1u << 24)
 #define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
-/* IP header + UDP + VXLAN + Ethernet header */
-#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
-/* IPv6 header + UDP + VXLAN + Ethernet header */
-#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
 #define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 2d64d3c..6b6d180 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -36,5 +36,16 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);
 
+/* IP header + UDP + VXLAN + Ethernet header */
+#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+/* IPv6 header + UDP + VXLAN + Ethernet header */
+#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
+
+#if IS_ENABLED(CONFIG_VXLAN)
 void vxlan_get_rx_port(struct net_device *netdev);
+#else
+static inline void vxlan_get_rx_port(struct net_device *netdev)
+{
+}
+#endif
 #endif
-- 
1.8.3.1

^ permalink raw reply related

* [net-next  00/11][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2013-10-29 12:02 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

This series contains updates to vxlan, net, ixgbe, ixgbevf, and i40e.

Joseph provides a single patch against vxlan which removes the burden
from the NIC drivers to check if the vxlan driver is enabled in the
kernel and also makes available the vxlan headrooms to the drivers.

Jacob provides majority of the patches, with patches against net, ixgbe
and ixgbevf.  His net patch adds might_sleep() call to napi_disable so
that every use of napi_disable during atomic context will be visible.
Then Jacob provides a patch to fix qv_lock_napi call in
ixgbe_napi_disable_all.  The other ixgbe patches cleanup
ixgbe_check_minimum_link function to correctly show that there are some
minor loss of encoding, even though we don't calculate it and remove
unnecessary duplication of PCIe bandwidth display.  Lastly, Jacob
provides 4 patches against ixgbevf to add ixgbevf_rx_skb in line with
how ixgbe handles the variations on how packets can be received, adds
support in order to track how many packets were cleaned during busy poll
as part of the extended statistics.

Wei Yongjun provides a fix for i40e to return -ENOMEN in the memory
allocation error handling case instead of returning 0, as done
elsewhere in this function.

The following are changes since commit cdfb97bc010d9e9d994eb68f2cbac3a8ada26104:
  net, mc: fix the incorrect comments in two mc-related functions
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next master

Don Skidmore (1):
  ixgbevf: Add zero_base handler to network statistics

Jacob Keller (8):
  net: add might_sleep() call to napi_disable
  ixgbe: fix qv_lock_napi call in ixgbe_napi_disable_all
  ixgbe: show <2% for encoding loss on PCIe Gen3
  ixgbe: remove unnecessary duplication of PCIe bandwidth display
  ixgbevf: add ixgbevf_rx_skb
  ixgbevf: have clean_rx_irq return total_rx_packets cleaned
  ixgbevf: implement CONFIG_NET_RX_BUSY_POLL
  ixgbevf: add BP_EXTENDED_STATS for CONFIG_NET_RX_BUSY_POLL

Joseph Gasparakis (1):
  vxlan: Have the NIC drivers do less work for offloads

Wei Yongjun (1):
  i40e: fix error return code in i40e_probe()

 drivers/net/ethernet/intel/i40e/i40e_main.c       |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe.h          |  48 ++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  46 +++-----
 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |  98 +++++++++++-----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      | 132 +++++++++++++++++++++-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 103 +++++++++++++++--
 drivers/net/vxlan.c                               |   4 -
 include/linux/netdevice.h                         |   1 +
 include/net/vxlan.h                               |  11 ++
 9 files changed, 366 insertions(+), 81 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: IPV6 nf defrag does not work
From: Florian Westphal @ 2013-10-29 11:56 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, pablo, netfilter-devel, yoshfuji, kadlec, kaber
In-Reply-To: <20131029105208.GA18526@minipsycho.orion>

Jiri Pirko <jiri@resnulli.us> wrote:
> On the current net-next if you on HOSTA do:
> ip6tables -I INPUT -p icmpv6 -j DROP
> ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT
> 
> and on HOSTB you do:
> ping6 HOSTA -s2000    (MTU is 1500)
> 
> Only the first ICMP echo request will be passed through, the rest is not
> passed on HOSTA. This issue does not occur with smaller packets than MTU (where
> fragmentation does not happen).
>
> I'm trying to find out where the problem is.

Are you sure this is new behaviour? As far back as I can remember
it was always like this.

in ip6tables, the individual fragments are sent through the ruleset,
iow. you'll need to make use of '-m conntrack' to match the fragments
belonging to an existing connection.

I don't know why this is, and I don't like this either.
But this is how it was implemented, see

net/ipv6/netfilter/nf_defrag_ipv6_hooks.c, ipv6_defrag() ->
nf_ct_frag6_output()

^ permalink raw reply

* Re: [PATCH] x86: Run checksumming in parallel accross multiple alu's
From: Neil Horman @ 2013-10-29 11:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Dumazet, linux-kernel, sebastien.dugue, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, x86, netdev
In-Reply-To: <20131029113031.GA16897@gmail.com>

On Tue, Oct 29, 2013 at 12:30:31PM +0100, Ingo Molnar wrote:
> 
> * Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> > Sure it was this:
> > for i in `seq 0 1 3`
> > do
> > echo $i > /sys/module/csum_test/parameters/module_test_mode
> > taskset -c 0 perf stat --repeat 20 -C 0 -ddd perf bench sched messaging -- /root/test.sh
> > done >> counters.txt 2>&1
> > 
> > where test.sh is:
> > #!/bin/sh
> > echo 1 > /sys/module/csum_test/parameters/test_fire
> 
> What does '-- /root/test.sh' do?
> 
> Unless I'm missing something, the line above will run:
> 
>   perf bench sched messaging -- /root/test.sh
> 
> which should be equivalent to:
> 
>   perf bench sched messaging
> 
> i.e. /root/test.sh won't be run.
> 
According to the perf man page, I'm supposed to be able to use -- to separate
perf command line parameters from the command I want to run.  And it definately
executed test.sh, I added an echo to stdout in there as a test run and observed
them get captured in counters.txt

Neil

> Thanks,
> 
> 	Ingo
> 

^ permalink raw reply

* Re: [PATCH] x86: Run checksumming in parallel accross multiple alu's
From: Ingo Molnar @ 2013-10-29 11:30 UTC (permalink / raw)
  To: Neil Horman
  Cc: Eric Dumazet, linux-kernel, sebastien.dugue, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, x86, netdev
In-Reply-To: <20131029112022.GA24477@neilslaptop.think-freely.org>


* Neil Horman <nhorman@tuxdriver.com> wrote:

> Sure it was this:
> for i in `seq 0 1 3`
> do
> echo $i > /sys/module/csum_test/parameters/module_test_mode
> taskset -c 0 perf stat --repeat 20 -C 0 -ddd perf bench sched messaging -- /root/test.sh
> done >> counters.txt 2>&1
> 
> where test.sh is:
> #!/bin/sh
> echo 1 > /sys/module/csum_test/parameters/test_fire

What does '-- /root/test.sh' do?

Unless I'm missing something, the line above will run:

  perf bench sched messaging -- /root/test.sh

which should be equivalent to:

  perf bench sched messaging

i.e. /root/test.sh won't be run.

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH] x86: Run checksumming in parallel accross multiple alu's
From: Neil Horman @ 2013-10-29 11:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Dumazet, linux-kernel, sebastien.dugue, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, x86, netdev
In-Reply-To: <20131029082542.GA24625@gmail.com>

On Tue, Oct 29, 2013 at 09:25:42AM +0100, Ingo Molnar wrote:
> 
> * Neil Horman <nhorman@tuxdriver.com> wrote:
> 
> > Heres my data for running the same test with taskset restricting 
> > execution to only cpu0.  I'm not quite sure whats going on here, 
> > but doing so resulted in a 10x slowdown of the runtime of each 
> > iteration which I can't explain.  As before however, both the 
> > parallel alu run and the prefetch run resulted in speedups, but 
> > the two together were not in any way addative.  I'm going to keep 
> > playing with the prefetch stride, unless you have an alternate 
> > theory.
> 
> Could you please cite the exact command-line you used for running 
> the test?
> 
> Thanks,
> 
> 	Ingo
> 

Sure it was this:
for i in `seq 0 1 3`
do
echo $i > /sys/module/csum_test/parameters/module_test_mode
taskset -c 0 perf stat --repeat 20 -C 0 -ddd perf bench sched messaging -- /root/test.sh
done >> counters.txt 2>&1

where test.sh is:
#!/bin/sh
echo 1 > /sys/module/csum_test/parameters/test_fire


As before, module_test_mode selects a case in a switch statement I added in
do_csum to test one of the 4 csum variants we've been discusing (base, prefetch,
parallel ALU or both), and test_fire is a callback trigger I use in the test
module to run 100000 iterations of a checksum operation.  As you requested, I
ran the above on cpu 0 (-C 0 on perf and -c 0 on taskset), and I removed all irq
affinity to cpu 0.

Regards
Neil

^ permalink raw reply

* RE: [PATCH net-next] tipc: remove two indentation levels in tipc_recv_msg routine
From: David Laight @ 2013-10-29 11:13 UTC (permalink / raw)
  To: Ying Xue, davem, maloy
  Cc: Paul.Gortmaker, jon.maloy, erik.hugne, tipc-discussion, netdev
In-Reply-To: <1383044471-12982-1-git-send-email-ying.xue@windriver.com>

> The message dispatching part of tipc_recv_msg() is wrapped layers of
> while/if/if/switch, causing out-of-control indentation and does not
> look very good. We reduce two indentation levels by separating the
> message dispatching from the blocks that checks link state and
> sequence numbers, allowing longer function and arg names to be
> consistently indented without wrapping. This is a cosmetic change
> that does not alter the operation of TIPC in any way.
...
> +			tipc_node_unlock(n_ptr);
> +			goto cont;
> +		}
...
>		continue;
>  cont:
>  		kfree_skb(buf);
>  	}

I can only see one 'goto cont', an explicit kfree_skb() and
continue would be clearer.

	David

^ permalink raw reply

* Re: [PATCH] bridge: pass correct vlan id to multicast code
From: Toshiaki Makita @ 2013-10-29 11:08 UTC (permalink / raw)
  To: Amos Kong; +Cc: Vlad Yasevich, netdev, shemminger
In-Reply-To: <20131029023646.GA2795@amosk.info>

On Tue, 2013-10-29 at 10:36 +0800, Amos Kong wrote:
> On Mon, Oct 28, 2013 at 03:45:07PM -0400, Vlad Yasevich wrote:
> > Currently multicast code attempts to extrace the vlan id from
> > the skb even when vlan filtering is disabled.  This can lead
> > to mdb entries being created with the wrong vlan id.
> > Pass the already extracted vlan id to the multicast
> > filtering code to make the correct id is used in
> > creation as well as lookup.

Thanks!

Acked-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

>  
> Hi Vlad,
> 
> Can we just update br_vlan_get_tag() to set vid to 0 if dev->vlan is
> disabled? I guess it would effect br_handle_local_finish().

br_handle_local_finish() looks also buggy.
But adding vlan enabled checking would not fix it completely because
vlan_bitmap and PVID are not taken into account in that function.

Since we cannot pass vid as an argument from br_dev_xmit() to
br_handle_[local/frame]_finish() because of NF_HOOK,
br_handle_local_finish() seems to have to check vlan_enabled,
vlan_bitmap, and pvid by itself.

IMHO it can be addressed by another patch.

> 
> > Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
> > ---
> >  net/bridge/br_device.c    |  2 +-
> >  net/bridge/br_input.c     |  2 +-
> >  net/bridge/br_multicast.c | 44 +++++++++++++++++++-------------------------
> >  net/bridge/br_private.h   |  6 ++++--
> >  4 files changed, 25 insertions(+), 29 deletions(-)
> 
> ...
> 
> > diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> > index 8b0b610..686284f 100644
> > --- a/net/bridge/br_multicast.c
> > +++ b/net/bridge/br_multicast.c
> > @@ -947,7 +947,8 @@ void br_multicast_disable_port(struct net_bridge_port *port)
> >  
> >  static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
> >  					 struct net_bridge_port *port,
> > -					 struct sk_buff *skb)
> > +					 struct sk_buff *skb,
> > +					 u16 vid)
> >  {
> >  	struct igmpv3_report *ih;
> >  	struct igmpv3_grec *grec;
> > @@ -957,12 +958,10 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
> >  	int type;
> >  	int err = 0;
> >  	__be32 group;
> > -	u16 vid = 0;
> >  
> >  	if (!pskb_may_pull(skb, sizeof(*ih)))
> >  		return -EINVAL;
> >  
> > -	br_vlan_get_tag(skb, &vid);
> 
> After applied the patch, we always use vid in br_dev_xmit()->br_allowed_ingress(),
> is it possible that the vlan of bridge is re-enabled when other
> changed functions are called?
> 
> We can just add a enabled checking before this kind of br_vlan_get_tag()?
> 
> if (!br->vlan_enabled)
>     br_vlan_get_tag(skb2, &vid);

Maybe this leads to a wrong way to update mdb in some cases like
  Vlan_filtering is disabled (by default).
  Add some vids we want to allow.
  Receive a frame whose vid wouldn't be allowed with vlan_filtering enabled.
  The frame passes br_allowed_ingress().
  Enable vlan_filtering.
  The frame reaches br_ip4_multicast_igmp3_report().
  Mdb is updated with disabled vid.


Thanks,

Toshiaki Makita

> 
> 
> >  	ih = igmpv3_report_hdr(skb);
> >  	num = ntohs(ih->ngrec);
> >  	len = sizeof(*ih);
> 
> ...
> 

^ permalink raw reply

* [PATCH net-next] tipc: remove two indentation levels in tipc_recv_msg routine
From: Ying Xue @ 2013-10-29 11:01 UTC (permalink / raw)
  To: davem, maloy; +Cc: jon.maloy, netdev, Paul.Gortmaker, tipc-discussion

The message dispatching part of tipc_recv_msg() is wrapped layers of
while/if/if/switch, causing out-of-control indentation and does not
look very good. We reduce two indentation levels by separating the
message dispatching from the blocks that checks link state and
sequence numbers, allowing longer function and arg names to be
consistently indented without wrapping. This is a cosmetic change
that does not alter the operation of TIPC in any way.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
---
 net/tipc/link.c |  147 +++++++++++++++++++++++++++----------------------------
 1 file changed, 72 insertions(+), 75 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index e8153f6..1bfee8c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1593,97 +1593,94 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr)
 
 		/* Now (finally!) process the incoming message */
 protocol_check:
-		if (likely(link_working_working(l_ptr))) {
-			if (likely(seq_no == mod(l_ptr->next_in_no))) {
-				l_ptr->next_in_no++;
-				if (unlikely(l_ptr->oldest_deferred_in))
-					head = link_insert_deferred_queue(l_ptr,
-									  head);
-deliver:
-				if (likely(msg_isdata(msg))) {
-					tipc_node_unlock(n_ptr);
-					tipc_port_recv_msg(buf);
-					continue;
-				}
-				switch (msg_user(msg)) {
-					int ret;
-				case MSG_BUNDLER:
-					l_ptr->stats.recv_bundles++;
-					l_ptr->stats.recv_bundled +=
-						msg_msgcnt(msg);
-					tipc_node_unlock(n_ptr);
-					tipc_link_recv_bundle(buf);
-					continue;
-				case NAME_DISTRIBUTOR:
-					n_ptr->bclink.recv_permitted = true;
-					tipc_node_unlock(n_ptr);
-					tipc_named_recv(buf);
-					continue;
-				case BCAST_PROTOCOL:
-					tipc_link_recv_sync(n_ptr, buf);
-					tipc_node_unlock(n_ptr);
-					continue;
-				case CONN_MANAGER:
-					tipc_node_unlock(n_ptr);
-					tipc_port_recv_proto_msg(buf);
-					continue;
-				case MSG_FRAGMENTER:
-					l_ptr->stats.recv_fragments++;
-					ret = tipc_link_recv_fragment(
-						&l_ptr->defragm_buf,
-						&buf, &msg);
-					if (ret == 1) {
-						l_ptr->stats.recv_fragmented++;
-						goto deliver;
-					}
-					if (ret == -1)
-						l_ptr->next_in_no--;
-					break;
-				case CHANGEOVER_PROTOCOL:
-					type = msg_type(msg);
-					if (link_recv_changeover_msg(&l_ptr,
-								     &buf)) {
-						msg = buf_msg(buf);
-						seq_no = msg_seqno(msg);
-						if (type == ORIGINAL_MSG)
-							goto deliver;
-						goto protocol_check;
-					}
-					break;
-				default:
-					kfree_skb(buf);
-					buf = NULL;
-					break;
-				}
+		if (unlikely(!link_working_working(l_ptr))) {
+			if (msg_user(msg) == LINK_PROTOCOL) {
+				link_recv_proto_msg(l_ptr, buf);
+				head = link_insert_deferred_queue(l_ptr, head);
+				tipc_node_unlock(n_ptr);
+				continue;
+			}
+
+			/* Traffic message. Conditionally activate link */
+			link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+
+			if (link_working_working(l_ptr)) {
+				/* Re-insert buffer in front of queue */
+				buf->next = head;
+				head = buf;
 				tipc_node_unlock(n_ptr);
-				tipc_net_route_msg(buf);
 				continue;
 			}
+			tipc_node_unlock(n_ptr);
+			goto cont;
+		}
+
+		/* Link is now in state WORKING_WORKING */
+		if (unlikely(seq_no != mod(l_ptr->next_in_no))) {
 			link_handle_out_of_seq_msg(l_ptr, buf);
 			head = link_insert_deferred_queue(l_ptr, head);
 			tipc_node_unlock(n_ptr);
 			continue;
 		}
-
-		/* Link is not in state WORKING_WORKING */
-		if (msg_user(msg) == LINK_PROTOCOL) {
-			link_recv_proto_msg(l_ptr, buf);
+		l_ptr->next_in_no++;
+		if (unlikely(l_ptr->oldest_deferred_in))
 			head = link_insert_deferred_queue(l_ptr, head);
+deliver:
+		if (likely(msg_isdata(msg))) {
 			tipc_node_unlock(n_ptr);
+			tipc_port_recv_msg(buf);
 			continue;
 		}
-
-		/* Traffic message. Conditionally activate link */
-		link_state_event(l_ptr, TRAFFIC_MSG_EVT);
-
-		if (link_working_working(l_ptr)) {
-			/* Re-insert buffer in front of queue */
-			buf->next = head;
-			head = buf;
+		switch (msg_user(msg)) {
+			int ret;
+		case MSG_BUNDLER:
+			l_ptr->stats.recv_bundles++;
+			l_ptr->stats.recv_bundled += msg_msgcnt(msg);
+			tipc_node_unlock(n_ptr);
+			tipc_link_recv_bundle(buf);
+			continue;
+		case NAME_DISTRIBUTOR:
+			n_ptr->bclink.recv_permitted = true;
+			tipc_node_unlock(n_ptr);
+			tipc_named_recv(buf);
+			continue;
+		case BCAST_PROTOCOL:
+			tipc_link_recv_sync(n_ptr, buf);
+			tipc_node_unlock(n_ptr);
+			continue;
+		case CONN_MANAGER:
 			tipc_node_unlock(n_ptr);
+			tipc_port_recv_proto_msg(buf);
 			continue;
+		case MSG_FRAGMENTER:
+			l_ptr->stats.recv_fragments++;
+			ret = tipc_link_recv_fragment(&l_ptr->defragm_buf,
+						      &buf, &msg);
+			if (ret == 1) {
+				l_ptr->stats.recv_fragmented++;
+				goto deliver;
+			}
+			if (ret == -1)
+				l_ptr->next_in_no--;
+			break;
+		case CHANGEOVER_PROTOCOL:
+			type = msg_type(msg);
+			if (link_recv_changeover_msg(&l_ptr, &buf)) {
+				msg = buf_msg(buf);
+				seq_no = msg_seqno(msg);
+				if (type == ORIGINAL_MSG)
+					goto deliver;
+				goto protocol_check;
+			}
+			break;
+		default:
+			kfree_skb(buf);
+			buf = NULL;
+			break;
 		}
 		tipc_node_unlock(n_ptr);
+		tipc_net_route_msg(buf);
+		continue;
 cont:
 		kfree_skb(buf);
 	}
-- 
1.7.9.5


------------------------------------------------------------------------------
Android is increasing in popularity, but the open development platform that
developers love is also attractive to malware creators. Download this white
paper to learn more about secure code signing practices that can help keep
Android apps secure.
http://pubads.g.doubleclick.net/gampad/clk?id=65839951&iu=/4140/ostg.clktrk

^ permalink raw reply related

* IPV6 nf defrag does not work
From: Jiri Pirko @ 2013-10-29 10:52 UTC (permalink / raw)
  To: netdev; +Cc: pablo, netfilter-devel, yoshfuji, kadlec, kaber

Hi All.

On the current net-next if you on HOSTA do:
ip6tables -I INPUT -p icmpv6 -j DROP
ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT

and on HOSTB you do:
ping6 HOSTA -s2000    (MTU is 1500)

Only the first ICMP echo request will be passed through, the rest is not
passed on HOSTA. This issue does not occur with smaller packets than MTU (where
fragmentation does not happen).

I'm trying to find out where the problem is.

Any quick ideas?

Thanks

Jiri

^ permalink raw reply

* Re: [PATCH 00/19] Enable various Renesas drivers on all ARM platforms
From: Laurent Pinchart @ 2013-10-29  9:46 UTC (permalink / raw)
  To: Guennadi Liakhovetski
  Cc: linux-fbdev-u79uwXL29TY76Z2rM5mHXA,
	linux-sh-u79uwXL29TY76Z2rM5mHXA, Linus Walleij,
	Guennadi Liakhovetski, Thierry Reding,
	linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA, Laurent Pinchart, Vinod Koul,
	Wolfram Sang, Magnus Damm, Eduardo Valentin, Tomi Valkeinen,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-input-u79uwXL29TY76Z2rM5mHXA, Zhang Rui, Chris Ball,
	Jean-Christophe Plagniol-Villard,
	linux-media-u79uwXL29TY76Z2rM5mHXA,
	linux-pwm-u79uwXL29TY76Z2rM5mHXA, Samuel Ortiz,
	linux-pm-u79uwXL29TY76Z2rM5mHXA, Ian Molton, Mark Brown,
	linux-arm-k
In-Reply-To: <Pine.LNX.4.64.1310291009121.8404-0199iw4Nj15frtckUFj5Ag@public.gmane.org>

Hi Simon and Guennadi,

On Tuesday 29 October 2013 10:12:17 Guennadi Liakhovetski wrote:
> On Tue, 29 Oct 2013, Laurent Pinchart wrote:
> > Hello,
> > 
> > This patch series, based on v3.12-rc7, prepares various Renesas drivers
> > for migration to multiplatform kernels by enabling their compilation or
> > otherwise fixing them on all ARM platforms. The patches are pretty
> > straightforward and are described in their commit message.
> > 
> > I'd like to get all these patches merged in v3.14. As they will need to go
> > through their respective subsystems' trees, I would appreciate if all
> > maintainers involved could notify me when they merge patches from this
> > series in their tree to help me tracking the merge status. I don't plan
> > to send pull requests individually for these patches, and I will repost
> > patches individually if changes are requested during review.
> > 
> > If you believe the issue should be solved in a different way (for instance
> > by removing the architecture dependency completely) please reply to the
> > cover letter to let other maintainers chime in.
> 
> Exactly this was my doubt. If we let these drivers build on all ARM
> platforms... Maybe we should just let them build everywhere? Unless there
> are real ARM dependencies. Maybe you could try to remove the restriction
> and try to build them all on x86?

My concern is that they might not build on some architectures for which I 
don't have a cross-compiler. For instance not all architecture define ioread32 
and iowrite32, which some drivers might rely on. What about depending on 
SUPERH || ARM || COMPILE_TEST to start with, in order not to push compilation 
breakages to mainline ?

> > Cc: Chris Ball <cjb-2X9k7bc8m7Mdnm+yROfE0A@public.gmane.org>
> > Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> > Cc: David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
> > Cc: dmaengine-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: Dmitry Torokhov <dmitry.torokhov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Cc: Eduardo Valentin <eduardo.valentin-l0cyMroinI0@public.gmane.org>
> > Cc: Greg Kroah-Hartman <gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>
> > Cc: Guennadi Liakhovetski <g.liakhovetski+renesas-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Cc: Ian Molton <ian-zdned+2MO1+9FHfhHBbuYA@public.gmane.org>
> > Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> > Cc: Jean-Christophe Plagniol-Villard <plagnioj-sclMFOaUSTBWk0Htik3J/w@public.gmane.org>
> > Cc: Joerg Roedel <joro-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
> > Cc: Linus Walleij <linus.walleij-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
> > Cc: linux-fbdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-i2c-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-input-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-mmc-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org
> > Cc: linux-pm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-pwm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-serial-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: linux-spi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: Magnus Damm <magnus.damm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Cc: Mark Brown <broonie-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> > Cc: Mauro Carvalho Chehab <m.chehab-Sze3O3UU22JBDgjK7y7TUQ@public.gmane.org>
> > Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Cc: Samuel Ortiz <samuel-jcdQHdrhKHMdnm+yROfE0A@public.gmane.org>
> > Cc: Sergei Shtylyov <sergei.shtylyov-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8@public.gmane.org>
> > Cc: Thierry Reding <thierry.reding-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> > Cc: Tomi Valkeinen <tomi.valkeinen-l0cyMroinI0@public.gmane.org>
> > Cc: Vinod Koul <vinod.koul-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > Cc: Wolfram Sang <wsa-z923LK4zBo2bacvFa/9K2g@public.gmane.org>
> > Cc: Zhang Rui <rui.zhang-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> > 
> > Laurent Pinchart (19):
> >   serial: sh-sci: Enable the driver on all ARM platforms
> >   DMA: shdma: Enable the driver on all ARM platforms
> >   i2c: sh_mobile: Enable the driver on all ARM platforms
> >   input: sh_keysc: Enable the driver on all ARM platforms
> >   iommu: shmobile: Enable the driver on all ARM platforms
> >   i2c: rcar: Enable the driver on all ARM platforms
> >   v4l: sh_vou: Enable the driver on all ARM platforms
> >   mmc: sdhi: Enable the driver on all ARM platforms
> >   mmc: sh_mmcif: Enable the driver on all ARM platforms
> >   mtd: sh_flctl: Enable the driver on all ARM platforms
> >   net: sh_eth: Set receive alignment correctly on all ARM platforms
> >   irda: sh_irda: Enable the driver on all ARM platforms
> >   pinctrl: sh-pfc: Enable the driver on all ARM platforms
> >   pwm: pwm-renesas-tpu: Enable the driver on all ARM platforms
> >   sh: intc: Enable the driver on all ARM platforms
> >   spi: sh_msiof: Enable the driver on all ARM platforms
> >   spi: sh_hspi: Enable the driver on all ARM platforms
> >   thermal: rcar-thermal: Enable the driver on all ARM platforms
> >   fbdev: sh-mobile-lcdcfb: Enable the driver on all ARM platforms
> >  
> >  drivers/dma/sh/Kconfig                | 2 +-
> >  drivers/dma/sh/shdmac.c               | 6 +++---
> >  drivers/i2c/busses/Kconfig            | 4 ++--
> >  drivers/input/keyboard/Kconfig        | 2 +-
> >  drivers/iommu/Kconfig                 | 2 +-
> >  drivers/media/platform/Kconfig        | 2 +-
> >  drivers/mmc/host/Kconfig              | 4 ++--
> >  drivers/mmc/host/tmio_mmc_dma.c       | 2 +-
> >  drivers/mtd/nand/Kconfig              | 2 +-
> >  drivers/net/ethernet/renesas/sh_eth.c | 2 +-
> >  drivers/net/ethernet/renesas/sh_eth.h | 2 +-
> >  drivers/net/irda/Kconfig              | 2 +-
> >  drivers/pinctrl/Makefile              | 2 +-
> >  drivers/pinctrl/sh-pfc/Kconfig        | 2 +-
> >  drivers/pwm/Kconfig                   | 2 +-
> >  drivers/sh/intc/Kconfig               | 2 +-
> >  drivers/spi/Kconfig                   | 4 ++--
> >  drivers/thermal/Kconfig               | 2 +-
> >  drivers/tty/serial/Kconfig            | 2 +-
> >  drivers/video/Kconfig                 | 6 +++---
> >  20 files changed, 27 insertions(+), 27 deletions(-)

-- 
Regards,

Laurent Pinchart

^ permalink raw reply

* RE: [patch net-next] ipv6: allow userspace to create address with IFLA_F_TEMPORARY flag
From: David Laight @ 2013-10-29  9:37 UTC (permalink / raw)
  To: David Miller, hannes
  Cc: jiri, vyasevich, netdev, kuznet, jmorris, yoshfuji, kaber,
	thaller, stephen
In-Reply-To: <20131028.204306.2213130677400093266.davem@davemloft.net>

> Note that you don't even need to put the DHCP protocol core into the
> kernel to fix the promiscuous problem.  You just have to use the
> current kernel interfaces correctly.
> 
> It used to be the case a very long time ago that you couldn't even
> receive broadcast UDP datagrams on a socket until an address was
> configured on it.
> 
> So everyone turns on promiscuous mode and uses RAW sockets or
> AF_PACKET.
> 
> Stupid?  yes.

Not only that, but the dhcp client could use a normal UDP socket
to keep the lease renewed - I suspect it has only ever needed
to use the BPF interface (I didn't think it set promiscuous)
when acquiring the initial lease.

	David
 

^ permalink raw reply

* Re: [PATCH 1/4 net-next] net: phy: add Generic Netlink Ethernet switch configuration API
From: Felix Fietkau @ 2013-10-29  9:34 UTC (permalink / raw)
  To: Jamal Hadi Salim, Florian Fainelli, Neil Horman
  Cc: John Fastabend, netdev, David Miller, Sascha Hauer, John Crispin,
	Jonas Gorski, Gary Thomas, Vlad Yasevich, Stephen Hemminger
In-Reply-To: <526EEAE9.6090508@mojatatu.com>

On 2013-10-28 23:53, Jamal Hadi Salim wrote:
> On 10/27/13 15:51, Felix Fietkau wrote:
>> On 2013-10-27 6:19 PM, Jamal Hadi Salim wrote:
> 
>> That question does not make any sense to me. Aside from low level
>> control frames like pause frames for flow control, the switch has no
>> need to send packets to the CPU port on its own.
>> Remember what I told you about the switch being a *separate* entity from
>> the NIC that connects it to the CPU.
>>
> 
> I am assuming there is a MAC address which is identified to be that of a
> switch. Something responds to ARP for example for that MAC. I think
> you are  saying that for a certain class of switch chips, there is no
> concept of "cpu port" - therefore there cannot be unicast from the
> chip to the cpu.
These are simple switches, why would they respond to ARP?
I suspect that you're attributing too much functionality to the switch
itself. Think of it as a device similar to the cheap unmanaged ones you
can buy in a shop and hook up to your machine via Ethernet.
Add to that some very limited VLAN grouping functionality, and you're
pretty close to the limits of what these switches can do.
They don't do ARP, IP or other things. They learn about MAC addresses
from incoming packets to build their forwarding path.
The CPU port in this case is whatever port on the switch that you plug
the cable of your machine into :)

>> One of the currently very common switches in many embedded devices is
>> the RTL8366/RTL8367. It has some flexibility when it comes to
>> configuring VLANs, and it's one of the few ones where you can configure
>> a forwarding table for a VLAN (which spans multiple ports), which allows
>> software bridging between multiple VLANs.
>> However, what this switch does *not* support is adding a header/trailer
>> to packets to indicate the originating port.
>> This means that all per-port netdevs will be dummy ports which don't
>> include the data path.
> 
> My view is that netdevs are still valuable even if only they get used 
> for control path. Like you said earlier - you can still pull stats, flow 
> control messages still make it through etc. They provide you
> the consistent api to configure the switch above, ex:
> If i was to use the FDB api for this switch as long as i can
> abstract it in software as a bridge, I could send it a switch config
> via its ops which says:
> "I am giving you this entry with vland 400 for port 2, but i want you to
> send it to the hardware not to your local entry"
The FDB related abstraction that you're describing will not work with
the hardware that I'm talking about. Let's leave that one out of this
discussion.
As for per-port netdevs: Yes, you could pull stats.
No, flow control messages would not make it through.
No idea how it would provide a *consistent* API.
Either way, if adding netdevs just for stats and link state, that could
be easily added on top of swconfig (or whatever name we pick for it)
later. I just don't think it's worth it at this point.

>> So let's say you have a configuration where you're using VLAN ID 4 on
>> port 1, and you want to bridge it to VLAN ID 400 on port 2.
>>
>> Sounds easy enough, you can easily create a bridge that spans port1.4
>> and port2.400. Except, this particular switch (like pretty much any
>> other switch supported by swconfig) isn't actually able to handle such a
>> configuration on its own.
> 
> Makes sense.
> Let me point that even the Linux bridge cant handle this on its own
> either.
> You would need two bridges instantiated. The "cpu port" (we should call
> it the "L3 port" really) is implicit in the case of the bridge i.e it
> is the Linux network stack.
> You would need to set the vlan filters on the bridge to strip the vlan
> on egress of the first bridge etc ..
> 
>> It needs two VLAN configurations, with different forwarding table IDs,
>> and then the software bridge on the CPU port needs to forward between
>> the two different VLANs.
>> To be able to handle such a configuration, the code would have to detect
>> this kind of special case scenario, somehow hook itself via rx handler
>> into the NIC connected to the CPU port and emulate that VLAN ID
>> replacement behavior.
> 
> IMO: You dont need to muck with rx handler if you used bridge
> abstraction. It becomes a config issue.
If we don't need to muck with an rx handler, how are packets intercepted
from the NIC that connects to the switch?
That NIC is run by a driver that knows nothing about switch stuff.

>> With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
>> VLAN 400, containing CPU and port2. You then create a software bridge
>> between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
>> switch).
> 
> Can we call that "L3" instead of software bridge?
L3? Why?

> Understood.
> I think that discovery is a must - so you can apply different behavior
> to different switches.
> But you seem to have solved this already. Linux as is does not.
> You can either have the driver tell you what it can/cant do or you
> can attempt to fire and miss and get a return code that will tell
> you that it cant achieve what you are asking it to do. I prefer the
> former.
I think that's way more confusing to users than presenting a consistent
model that properly reflects what you can do with the hardware.

But I sense a pattern here. I've long had my beef with quite a few Linux
network related APIs for being inconsistent, having no decent error
reporting when you're trying to configure things (errno doesn't count,
it's just too ambiguous), and just making it hard to figure out the
capabilities. Of course, none of this can be easily fixed due to ABI
stability constraints.
I do NOT wish to follow that pattern!

>> Those are just two simple scenarios from the top of my head - I'm pretty
>> sure I could come up with a long list of further corner cases and
>> quirks, which are simply either difficult to deal with, or completely
>> unnatural in the model that you're describing.
> I think these are the kind of things that need to be enumerated to come
> to some conclusion.
I'm not going to try to enumerate all the case; I have other projects
that I need to work on. :)

>> Trying to make all of these cases work in the code will make the whole
>> thing a lot more difficult to deal with and maintain. It will also make
>> it much harder for the user to figure out, what configurations work, and
>> what configurations don't.
>>
>>
>> Especially the case with reusing VLANs on different ports (but not
>> connecting them to each other) is something that can easily work with
>> software devices, but cannot be emulated on most embedded device
>> switches. The software bridge configuration model raises a lot of
>> expectations that these switches simply cannot meet.
> I wouldnt expect every thing a software bridge does would be met by
> a random switch.S/w bridge would be the super-set. But this is
> not a new concept, example: Netdev itself is an abstraction - we have
> USB, ethernet, wireless, variety of virtual interfaces etc.
> Sometimes we dont even have the concept of a "link" in some of these
> devices; infiniband would have a huge MAC address but i can still
> use ifconfig on it etc.
Only a *tiny* part of the software bridge configuration model can be
emulated, the rest does not fit and has to be handled through extensions
or different APIs anyway. That's why I am convinced that it's a really
bad model to try to make these switches fit into it.

You gain a tiny advantage with writing scripts, but at the same time,
the code gets more complex, the configuration interface gets more
confusing, there are more nasty corner cases to take care of.
Why do you insist on making so many things worse just for one tiny
advantage? Where's the pragmatic cost/benefit tradeoff?

>> If you look at the swconfig model, you will see that the abstraction
>> clearly communicates the limitations of these typical switches.
>>
> 
> I will have to go back and look - but like i said earlier seems to me
> you have solved this problem. Of the switch hardware i am familiar with
> (high end pricey stuff), the capabilities tend to fall into the
> following components:
> -flooding control (i.e what should happen on destination failure)
> -learning control (i.e what should happen on the source lookup failure)
> (Ive seen knobs for "drop", "send to portX" where "X" could be cpu etc)
> -fdb capacity
> -whether it can do vlans, filtering pvids etc
> -multicast snooping capability
Right, with most of the switches that we support, almost none of these
things work in a way that can be integrated with the network stack.

> To add to the above a few more based on talking to you:
> - cpu port (in what ive come across this is always present, but
> as you point out this cannot be assumed)
I'm not even sure what you mean when you say 'cpu port cannot be
assumed'. On pretty much all devices that we work with, one of the ports
connects to a NIC in the CPU. It's just that the switch cannot be
assumed to have special treatment for that CPU port. As far as it is
concerned, it is just another port like the others.

> - ingress port tag (you point out that some cases this may never be
> present even when the cpu port is present)
> - ive never seen table id, but i think this is another one; in which
> case the number of table ids becomes something one needs to discover..
Yes, and this is something that doesn't even map directly to something
in the software bridge world.

- Felix

^ permalink raw reply

* [PATCH v3 3/4] net: mvmdio: slight optimisation of orion_mdio_write
From: Leigh Brown @ 2013-10-29  9:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Leigh Brown, Thomas Petazzoni, Sebastian Hesselbarth, netdev,
	linux-arm-kernel, Jason Cooper
In-Reply-To: <cover.1383038973.git.leigh@solinno.co.uk>

Make only a single call to mutex_unlock in orion_mdio_write.

Signed-off-by: Leigh Brown <leigh@solinno.co.uk>
---
 drivers/net/ethernet/marvell/mvmdio.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index e3898b3..0cfa0c8 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -150,10 +150,8 @@ static int orion_mdio_write(struct mii_bus *bus, int mii_id,
 	mutex_lock(&dev->lock);
 
 	ret = orion_mdio_wait_ready(bus);
-	if (ret < 0) {
-		mutex_unlock(&dev->lock);
-		return ret;
-	}
+	if (ret < 0)
+		goto out;
 
 	writel(((mii_id << MVMDIO_SMI_PHY_ADDR_SHIFT) |
 		(regnum << MVMDIO_SMI_PHY_REG_SHIFT)  |
@@ -161,9 +159,9 @@ static int orion_mdio_write(struct mii_bus *bus, int mii_id,
 		(value << MVMDIO_SMI_DATA_SHIFT)),
 	       dev->regs);
 
+out:
 	mutex_unlock(&dev->lock);
-
-	return 0;
+	return ret;
 }
 
 static int orion_mdio_reset(struct mii_bus *bus)
-- 
1.8.4.rc3

^ permalink raw reply related

* [PATCH v3 2/4] net: mvmdio: orion_mdio_ready: remove manual poll
From: Leigh Brown @ 2013-10-29  9:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Leigh Brown, Thomas Petazzoni, Sebastian Hesselbarth, netdev,
	linux-arm-kernel, Jason Cooper
In-Reply-To: <cover.1383038973.git.leigh@solinno.co.uk>

Replace manual poll of MVMDIO_SMI_READ_VALID with a call to
orion_mdio_wait_ready.  This ensures a consistent timeout,
eliminates a busy loop, and allows for use of interrupts on
systems that support them.

Signed-off-by: Leigh Brown <leigh@solinno.co.uk>
---
 drivers/net/ethernet/marvell/mvmdio.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index 971a4c1..e3898b3 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -110,43 +110,35 @@ static int orion_mdio_read(struct mii_bus *bus, int mii_id,
 			   int regnum)
 {
 	struct orion_mdio_dev *dev = bus->priv;
-	int count;
 	u32 val;
 	int ret;
 
 	mutex_lock(&dev->lock);
 
 	ret = orion_mdio_wait_ready(bus);
-	if (ret < 0) {
-		mutex_unlock(&dev->lock);
-		return ret;
-	}
+	if (ret < 0)
+		goto out;
 
 	writel(((mii_id << MVMDIO_SMI_PHY_ADDR_SHIFT) |
 		(regnum << MVMDIO_SMI_PHY_REG_SHIFT)  |
 		MVMDIO_SMI_READ_OPERATION),
 	       dev->regs);
 
-	/* Wait for the value to become available */
-	count = 0;
-	while (1) {
-		val = readl(dev->regs);
-		if (val & MVMDIO_SMI_READ_VALID)
-			break;
-
-		if (count > 100) {
-			dev_err(bus->parent, "Timeout when reading PHY\n");
-			mutex_unlock(&dev->lock);
-			return -ETIMEDOUT;
-		}
+	ret = orion_mdio_wait_ready(bus);
+	if (ret < 0)
+		goto out;
 
-		udelay(10);
-		count++;
+	val = readl(dev->regs);
+	if (!(val & MVMDIO_SMI_READ_VALID)) {
+		dev_err(bus->parent, "SMI bus read not valid\n");
+		ret = -ENODEV;
+		goto out;
 	}
 
+	ret = val & 0xFFFF;
+out:
 	mutex_unlock(&dev->lock);
-
-	return val & 0xFFFF;
+	return ret;
 }
 
 static int orion_mdio_write(struct mii_bus *bus, int mii_id,
-- 
1.8.4.rc3

^ permalink raw reply related

* [PATCH v3 1/4] net: mvmdio: make orion_mdio_wait_ready consistent
From: Leigh Brown @ 2013-10-29  9:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Leigh Brown, Thomas Petazzoni, Sebastian Hesselbarth, netdev,
	linux-arm-kernel, Jason Cooper
In-Reply-To: <cover.1383038973.git.leigh@solinno.co.uk>

Amend orion_mdio_wait_ready so that the same timeout is used when
polling or using wait_event_timeout.  Set the timeout to 1ms.

Replace udelay with usleep_range to avoid a busy loop, and set the
polling interval range as 45us to 55us, so that the first sleep
will be enough in almost all cases.

Generate the same log message at timeout when polling or using
wait_event_timeout.

Signed-off-by: Leigh Brown <leigh@solinno.co.uk>
---
 drivers/net/ethernet/marvell/mvmdio.c | 52 ++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index e2f6626..971a4c1 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -44,6 +44,15 @@
 #define  MVMDIO_ERR_INT_SMI_DONE	   0x00000010
 #define MVMDIO_ERR_INT_MASK		   0x0080
 
+/*
+ * SMI Timeout measurements:
+ * - Kirkwood 88F6281 (Globalscale Dreamplug): 45us to 95us (Interrupt)
+ * - Armada 370       (Globalscale Mirabox):   41us to 43us (Polled)
+ */
+#define MVMDIO_SMI_TIMEOUT		   1000 /* 1000us = 1ms */
+#define MVMDIO_SMI_POLL_INTERVAL_MIN	   45
+#define MVMDIO_SMI_POLL_INTERVAL_MAX	   55
+
 struct orion_mdio_dev {
 	struct mutex lock;
 	void __iomem *regs;
@@ -68,34 +77,33 @@ static int orion_mdio_smi_is_done(struct orion_mdio_dev *dev)
 static int orion_mdio_wait_ready(struct mii_bus *bus)
 {
 	struct orion_mdio_dev *dev = bus->priv;
-	int count;
+	unsigned long timeout = usecs_to_jiffies(MVMDIO_SMI_TIMEOUT);
+	unsigned long end = jiffies + timeout;
+	int timedout = 0;
 
-	if (dev->err_interrupt <= 0) {
-		count = 0;
-		while (1) {
-			if (orion_mdio_smi_is_done(dev))
-				break;
+	while (1) {
+	        if (orion_mdio_smi_is_done(dev))
+			return 0;
+	        else if (timedout)
+			break;
 
-			if (count > 100) {
-				dev_err(bus->parent,
-					"Timeout: SMI busy for too long\n");
-				return -ETIMEDOUT;
-			}
+	        if (dev->err_interrupt <= 0) {
+			usleep_range(MVMDIO_SMI_POLL_INTERVAL_MIN,
+				     MVMDIO_SMI_POLL_INTERVAL_MAX);
 
-			udelay(10);
-			count++;
-		}
-	} else {
-		if (!orion_mdio_smi_is_done(dev)) {
+			if (time_is_before_jiffies(end))
+				++timedout;
+	        } else {
 			wait_event_timeout(dev->smi_busy_wait,
-				orion_mdio_smi_is_done(dev),
-				msecs_to_jiffies(100));
-			if (!orion_mdio_smi_is_done(dev))
-				return -ETIMEDOUT;
-		}
+				           orion_mdio_smi_is_done(dev),
+				           timeout);
+
+			++timedout;
+	        }
 	}
 
-	return 0;
+	dev_err(bus->parent, "Timeout: SMI busy for too long\n");
+	return  -ETIMEDOUT;
 }
 
 static int orion_mdio_read(struct mii_bus *bus, int mii_id,
-- 
1.8.4.rc3

^ permalink raw reply related

* [PATCH v3 0/4] Fix MDIO bus timeout issues on Dreamplug
From: Leigh Brown @ 2013-10-29  9:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Leigh Brown, Thomas Petazzoni, Sebastian Hesselbarth, netdev,
	linux-arm-kernel, Jason Cooper
In-Reply-To: <cover.1382637156.git.leigh@solinno.co.uk>

I think I have addressed all comments.

Changes since v2:
 - Fix coding style in ordio_mdio_wait_ready, as identified by David
 - Remove un-needed operator change in orion_mdio_write, as identified by David

This patchset fixes an issue with the Dreamplug MDIO bus timeout since the
mv643xx_eth driver was converted to use the mvmdio driver to access the bus.

Leigh Brown (4):
  net: mvmdio: make orion_mdio_wait_ready consistent
  net: mvmdio: orion_mdio_ready: remove manual poll
  net: mvmdio: slight optimisation of orion_mdio_write
  net: mvmdio: doc: mvmdio now used by mv643xx_eth

 drivers/net/ethernet/marvell/mvmdio.c | 110 ++++++++++++++++------------------
 1 file changed, 53 insertions(+), 57 deletions(-)

-- 
1.8.4.rc3

^ permalink raw reply

* [PATCH v3 4/4] net: mvmdio: doc: mvmdio now used by mv643xx_eth
From: Leigh Brown @ 2013-10-29  9:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: Leigh Brown, Thomas Petazzoni, Sebastian Hesselbarth, netdev,
	linux-arm-kernel, Jason Cooper
In-Reply-To: <cover.1383038973.git.leigh@solinno.co.uk>

Amend the documentation in the mvmdio driver to note the fact
that it is now used by both the mvneta and mv643xx_eth drivers.

Signed-off-by: Leigh Brown <leigh@solinno.co.uk>
---
 drivers/net/ethernet/marvell/mvmdio.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index 0cfa0c8..0d0311c 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -4,11 +4,9 @@
  * Since the MDIO interface of Marvell network interfaces is shared
  * between all network interfaces, having a single driver allows to
  * handle concurrent accesses properly (you may have four Ethernet
- * ports, but they in fact share the same SMI interface to access the
- * MDIO bus). Moreover, this MDIO interface code is similar between
- * the mv643xx_eth driver and the mvneta driver. For now, it is only
- * used by the mvneta driver, but it could later be used by the
- * mv643xx_eth driver as well.
+ * ports, but they in fact share the same SMI interface to access
+ * the MDIO bus). This driver is currently used by the mvneta and 
+ * mv643xx_eth drivers.
  *
  * Copyright (C) 2012 Marvell
  *
-- 
1.8.4.rc3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox