Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 6/6] ixgbevf: Update descriptor macros to accept pointers and drop _ADV suffix
From: Jeff Kirsher @ 2012-07-18  2:20 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Greg Rose, Jeff Kirsher
In-Reply-To: <1342578045-17778-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change updates the descriptor macros to accept pointers, updates the
name to drop the _ADV suffix, and includes the IXGBEVF name in the macro.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |   12 ++++++------
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index f92daca..1f13765 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -164,12 +164,12 @@ struct ixgbevf_q_vector {
 	((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
 	(R)->next_to_clean - (R)->next_to_use - 1)
 
-#define IXGBE_RX_DESC_ADV(R, i)	    \
-	(&(((union ixgbe_adv_rx_desc *)((R).desc))[i]))
-#define IXGBE_TX_DESC_ADV(R, i)	    \
-	(&(((union ixgbe_adv_tx_desc *)((R).desc))[i]))
-#define IXGBE_TX_CTXTDESC_ADV(R, i)	    \
-	(&(((struct ixgbe_adv_tx_context_desc *)((R).desc))[i]))
+#define IXGBEVF_RX_DESC(R, i)	    \
+	(&(((union ixgbe_adv_rx_desc *)((R)->desc))[i]))
+#define IXGBEVF_TX_DESC(R, i)	    \
+	(&(((union ixgbe_adv_tx_desc *)((R)->desc))[i]))
+#define IXGBEVF_TX_CTXTDESC(R, i)	    \
+	(&(((struct ixgbe_adv_tx_context_desc *)((R)->desc))[i]))
 
 #define IXGBE_MAX_JUMBO_FRAME_SIZE        16128
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 8e022c6..c98cdf7 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -195,7 +195,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 	i = tx_ring->next_to_clean;
 	eop = tx_ring->tx_buffer_info[i].next_to_watch;
-	eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
+	eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
 
 	while ((eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)) &&
 	       (count < tx_ring->count)) {
@@ -206,7 +206,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 			goto cont_loop;
 		for ( ; !cleaned; count++) {
 			struct sk_buff *skb;
-			tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);
+			tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 			tx_buffer_info = &tx_ring->tx_buffer_info[i];
 			cleaned = (i == eop);
 			skb = tx_buffer_info->skb;
@@ -235,7 +235,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 
 cont_loop:
 		eop = tx_ring->tx_buffer_info[i].next_to_watch;
-		eop_desc = IXGBE_TX_DESC_ADV(*tx_ring, eop);
+		eop_desc = IXGBEVF_TX_DESC(tx_ring, eop);
 	}
 
 	tx_ring->next_to_clean = i;
@@ -339,7 +339,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
 	bi = &rx_ring->rx_buffer_info[i];
 
 	while (cleaned_count--) {
-		rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i);
+		rx_desc = IXGBEVF_RX_DESC(rx_ring, i);
 		skb = bi->skb;
 		if (!skb) {
 			skb = netdev_alloc_skb(adapter->netdev,
@@ -405,7 +405,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 
 	i = rx_ring->next_to_clean;
-	rx_desc = IXGBE_RX_DESC_ADV(*rx_ring, i);
+	rx_desc = IXGBEVF_RX_DESC(rx_ring, i);
 	staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
 	rx_buffer_info = &rx_ring->rx_buffer_info[i];
 
@@ -432,7 +432,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 		if (i == rx_ring->count)
 			i = 0;
 
-		next_rxd = IXGBE_RX_DESC_ADV(*rx_ring, i);
+		next_rxd = IXGBEVF_RX_DESC(rx_ring, i);
 		prefetch(next_rxd);
 		cleaned_count++;
 
@@ -2437,7 +2437,7 @@ static int ixgbevf_tso(struct ixgbevf_adapter *adapter,
 		i = tx_ring->next_to_use;
 
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		context_desc = IXGBE_TX_CTXTDESC_ADV(*tx_ring, i);
+		context_desc = IXGBEVF_TX_CTXTDESC(tx_ring, i);
 
 		/* VLAN MACLEN IPLEN */
 		if (tx_flags & IXGBE_TX_FLAGS_VLAN)
@@ -2497,7 +2497,7 @@ static bool ixgbevf_tx_csum(struct ixgbevf_adapter *adapter,
 	    (tx_flags & IXGBE_TX_FLAGS_VLAN)) {
 		i = tx_ring->next_to_use;
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		context_desc = IXGBE_TX_CTXTDESC_ADV(*tx_ring, i);
+		context_desc = IXGBEVF_TX_CTXTDESC(tx_ring, i);
 
 		if (tx_flags & IXGBE_TX_FLAGS_VLAN)
 			vlan_macip_lens |= (tx_flags &
@@ -2700,7 +2700,7 @@ static void ixgbevf_tx_queue(struct ixgbevf_adapter *adapter,
 	i = tx_ring->next_to_use;
 	while (count--) {
 		tx_buffer_info = &tx_ring->tx_buffer_info[i];
-		tx_desc = IXGBE_TX_DESC_ADV(*tx_ring, i);
+		tx_desc = IXGBEVF_TX_DESC(tx_ring, i);
 		tx_desc->read.buffer_addr = cpu_to_le64(tx_buffer_info->dma);
 		tx_desc->read.cmd_type_len =
 			cpu_to_le32(cmd_type_len | tx_buffer_info->length);
-- 
1.7.10.4

^ permalink raw reply related

* [net] ixgbevf: fix VF untagging when 802.1 prio is set
From: Jeff Kirsher @ 2012-07-18  2:23 UTC (permalink / raw)
  To: davem; +Cc: Pascal Bouchareine, netdev, gospo, sassmann, Jeff Kirsher

From: Pascal Bouchareine <pascal@gandi.net>

We have had an issue when using ixgbe+ixgbevf and 802.1 VLAN tagging.

When attaching a VLAN to a VF, frames with an 802.1q priority appeared
untagged on the VF, hence not reaching the VLAN, whereas frames with
priority 0 were tagged as expected and seen by the VLAN device.

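This seems to be due to the way ixgbevf looks up the full tag
(prio+cfi+vlan) against the adapter active_vlans, as a condition to mark
the skb tagged.  Fix it by masking the tag with VLAN_VID_MASK before the
lookup, so that only the VLAN ID is tested.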

Signed-off-by: Pascal Bouchareine <pascal@gandi.net>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 41e3225..c16d32f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -304,7 +304,7 @@ static void ixgbevf_receive_skb(struct ixgbevf_q_vector *q_vector,
 	bool is_vlan = (status & IXGBE_RXD_STAT_VP);
 	u16 tag = le16_to_cpu(rx_desc->wb.upper.vlan);
 
-	if (is_vlan && test_bit(tag, adapter->active_vlans))
+	if (is_vlan && test_bit(tag & VLAN_VID_MASK, adapter->active_vlans))
 		__vlan_hwaccel_put_tag(skb, tag);
 
 	if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL))
-- 
1.7.10.4

^ permalink raw reply related

* Re: [net] ixgbevf: fix VF untagging when 802.1 prio is set
From: Jeff Kirsher @ 2012-07-18  2:31 UTC (permalink / raw)
  To: davem; +Cc: Pascal Bouchareine, netdev, gospo, sassmann
In-Reply-To: <1342578236-20036-1-git-send-email-jeffrey.t.kirsher@intel.com>

[-- Attachment #1: Type: text/plain, Size: 981 bytes --]

On Tue, 2012-07-17 at 19:23 -0700, Jeff Kirsher wrote:
> From: Pascal Bouchareine <pascal@gandi.net>
> 
> We have had an issue when using ixgbe+ixgbevf and 802.1 VLAN tagging.
> 
> When attaching a VLAN to a VF, frames with a 802.1q priority appeared
> untagged on the VF hence not reaching the VLAN, where frames with
> priority 0 where tagged as expected and seen by the VLAN device.
> 
> This seems due to the way ixgbevf is looking up the full tag
> (prio+cfi+vlan) against the adapter active_vlans, as a condition to
> mark
> the skb tagged.
> 
> Signed-off-by: Pascal Bouchareine <pascal@gandi.net>
> Tested-by: Sibai Li <sibai.li@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
>  drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-) 

Dave-

Disregard, I just read your message about "that's it for net".  I will
push this into my net-next tree.

Cheers,
Jeff

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH v2] b44: add 64 bit stats
From: Eric Dumazet @ 2012-07-18  3:18 UTC (permalink / raw)
  To: Kevin Groeneveld; +Cc: netdev
In-Reply-To: <CABF+-6WmaLjgTTX_ot=QCzsM-tYMJhVk5oRYFAVj66cKG-z31A@mail.gmail.com>

On Tue, 2012-07-17 at 22:02 -0400, Kevin Groeneveld wrote:
> On Tue, Jul 17, 2012 at 2:08 AM, David Miller <davem@davemloft.net> wrote:
> > This patch was corrupted by your email client and is therefore
> > unusable.
> 
> If I resend the patch should I bump the version number in the subject?
> 
It doesn't matter in this case; it's a formatting issue.

> Should I include "Acked-by" lines that people have posted?
> 

You can, if no semantic change has been made.

> I keep sending myself test messages with the patch but the white space
> is always mangled.  I am not sure if Thunderbird is mangling it in the
> sent message or the received message... :(

Documentation/email-clients.txt

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Thunderbird (GUI)

Thunderbird is an Outlook clone that likes to mangle text, but there are ways
to coerce it into behaving.

- Allows use of an external editor:
  The easiest thing to do with Thunderbird and patches is to use an
  "external editor" extension and then just use your favorite $EDITOR
  for reading/merging patches into the body text.  To do this, download
  and install the extension, then add a button for it using
  View->Toolbars->Customize... and finally just click on it when in the
  Compose dialog.

To beat some sense out of the internal editor, do this:

- Edit your Thunderbird config settings so that it won't use format=flowed.
  Go to "edit->preferences->advanced->config editor" to bring up the
  thunderbird's registry editor, and set "mailnews.send_plaintext_flowed" to
  "false".

- Disable HTML Format: Set "mail.identity.id1.compose_html" to "false".

- Enable "preformat" mode: Set "editor.quotesPreformatted" to "true".

- Enable UTF8: Set "prefs.converted-to-utf8" to "true".

- Install the "toggle wordwrap" extension.  Download the file from:
    https://addons.mozilla.org/thunderbird/addon/2351/
  Then go to "tools->add ons", select "install" at the bottom of the screen,
  and browse to where you saved the .xul file.  This adds an "Enable
  Wordwrap" entry under the Options menu of the message composer.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

^ permalink raw reply

* Re: [PATCH v2] b44: add 64 bit stats
From: Kevin Groeneveld @ 2012-07-18  3:46 UTC (permalink / raw)
  To: netdev; +Cc: Kevin Groeneveld
In-Reply-To: <20120716.230806.242760837075045729.davem@davemloft.net>

From: Kevin Groeneveld <kgroeneveld@gmail.com>

Add support for 64 bit stats to Broadcom b44 ethernet driver.

Signed-off-by: Kevin Groeneveld <kgroeneveld@gmail.com>
---
v2: use u64_stats_fetch_begin_bh/u64_stats_fetch_retry_bh instead of
    u64_stats_fetch_begin/u64_stats_fetch_retry as stats update happens in a
    timer interrupt

 drivers/net/ethernet/broadcom/b44.c |   96 ++++++++++++++++++++---------------
 drivers/net/ethernet/broadcom/b44.h |    3 +-
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index d09c6b5..9786c0e 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -483,9 +483,11 @@ out:
 static void b44_stats_update(struct b44 *bp)
 {
 	unsigned long reg;
-	u32 *val;
+	u64 *val;
 
 	val = &bp->hw_stats.tx_good_octets;
+	u64_stats_update_begin(&bp->hw_stats.syncp);
+
 	for (reg = B44_TX_GOOD_O; reg <= B44_TX_PAUSE; reg += 4UL) {
 		*val++ += br32(bp, reg);
 	}
@@ -496,6 +498,8 @@ static void b44_stats_update(struct b44 *bp)
 	for (reg = B44_RX_GOOD_O; reg <= B44_RX_NPAUSE; reg += 4UL) {
 		*val++ += br32(bp, reg);
 	}
+
+	u64_stats_update_end(&bp->hw_stats.syncp);
 }
 
 static void b44_link_report(struct b44 *bp)
@@ -1635,44 +1639,49 @@ static int b44_close(struct net_device *dev)
 	return 0;
 }
 
-static struct net_device_stats *b44_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *b44_get_stats64(struct net_device *dev,
+					struct rtnl_link_stats64 *nstat)
 {
 	struct b44 *bp = netdev_priv(dev);
-	struct net_device_stats *nstat = &dev->stats;
 	struct b44_hw_stats *hwstat = &bp->hw_stats;
-
-	/* Convert HW stats into netdevice stats. */
-	nstat->rx_packets = hwstat->rx_pkts;
-	nstat->tx_packets = hwstat->tx_pkts;
-	nstat->rx_bytes   = hwstat->rx_octets;
-	nstat->tx_bytes   = hwstat->tx_octets;
-	nstat->tx_errors  = (hwstat->tx_jabber_pkts +
-			     hwstat->tx_oversize_pkts +
-			     hwstat->tx_underruns +
-			     hwstat->tx_excessive_cols +
-			     hwstat->tx_late_cols);
-	nstat->multicast  = hwstat->tx_multicast_pkts;
-	nstat->collisions = hwstat->tx_total_cols;
-
-	nstat->rx_length_errors = (hwstat->rx_oversize_pkts +
-				   hwstat->rx_undersize);
-	nstat->rx_over_errors   = hwstat->rx_missed_pkts;
-	nstat->rx_frame_errors  = hwstat->rx_align_errs;
-	nstat->rx_crc_errors    = hwstat->rx_crc_errs;
-	nstat->rx_errors        = (hwstat->rx_jabber_pkts +
-				   hwstat->rx_oversize_pkts +
-				   hwstat->rx_missed_pkts +
-				   hwstat->rx_crc_align_errs +
-				   hwstat->rx_undersize +
-				   hwstat->rx_crc_errs +
-				   hwstat->rx_align_errs +
-				   hwstat->rx_symbol_errs);
-
-	nstat->tx_aborted_errors = hwstat->tx_underruns;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin_bh(&hwstat->syncp);
+
+		/* Convert HW stats into rtnl_link_stats64 stats. */
+		nstat->rx_packets = hwstat->rx_pkts;
+		nstat->tx_packets = hwstat->tx_pkts;
+		nstat->rx_bytes   = hwstat->rx_octets;
+		nstat->tx_bytes   = hwstat->tx_octets;
+		nstat->tx_errors  = (hwstat->tx_jabber_pkts +
+				     hwstat->tx_oversize_pkts +
+				     hwstat->tx_underruns +
+				     hwstat->tx_excessive_cols +
+				     hwstat->tx_late_cols);
+		nstat->multicast  = hwstat->tx_multicast_pkts;
+		nstat->collisions = hwstat->tx_total_cols;
+
+		nstat->rx_length_errors = (hwstat->rx_oversize_pkts +
+					   hwstat->rx_undersize);
+		nstat->rx_over_errors   = hwstat->rx_missed_pkts;
+		nstat->rx_frame_errors  = hwstat->rx_align_errs;
+		nstat->rx_crc_errors    = hwstat->rx_crc_errs;
+		nstat->rx_errors        = (hwstat->rx_jabber_pkts +
+					   hwstat->rx_oversize_pkts +
+					   hwstat->rx_missed_pkts +
+					   hwstat->rx_crc_align_errs +
+					   hwstat->rx_undersize +
+					   hwstat->rx_crc_errs +
+					   hwstat->rx_align_errs +
+					   hwstat->rx_symbol_errs);
+
+		nstat->tx_aborted_errors = hwstat->tx_underruns;
 #if 0
-	/* Carrier lost counter seems to be broken for some devices */
-	nstat->tx_carrier_errors = hwstat->tx_carrier_lost;
+		/* Carrier lost counter seems to be broken for some devices */
+		nstat->tx_carrier_errors = hwstat->tx_carrier_lost;
 #endif
+	} while (u64_stats_fetch_retry_bh(&hwstat->syncp, start));
 
 	return nstat;
 }
@@ -1993,17 +2002,24 @@ static void b44_get_ethtool_stats(struct net_device *dev,
 				  struct ethtool_stats *stats, u64 *data)
 {
 	struct b44 *bp = netdev_priv(dev);
-	u32 *val = &bp->hw_stats.tx_good_octets;
+	struct b44_hw_stats *hwstat = &bp->hw_stats;
+	u64 *data_src, *data_dst;
+	unsigned int start;
 	u32 i;
 
 	spin_lock_irq(&bp->lock);
-
 	b44_stats_update(bp);
+	spin_unlock_irq(&bp->lock);
 
-	for (i = 0; i < ARRAY_SIZE(b44_gstrings); i++)
-		*data++ = *val++;
+	do {
+		data_src = &hwstat->tx_good_octets;
+		data_dst = data;
+		start = u64_stats_fetch_begin_bh(&hwstat->syncp);
 
-	spin_unlock_irq(&bp->lock);
+		for (i = 0; i < ARRAY_SIZE(b44_gstrings); i++)
+			*data_dst++ = *data_src++;
+
+	} while (u64_stats_fetch_retry_bh(&hwstat->syncp, start));
 }
 
 static void b44_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
@@ -2113,7 +2129,7 @@ static const struct net_device_ops b44_netdev_ops = {
 	.ndo_open		= b44_open,
 	.ndo_stop		= b44_close,
 	.ndo_start_xmit		= b44_start_xmit,
-	.ndo_get_stats		= b44_get_stats,
+	.ndo_get_stats64	= b44_get_stats64,
 	.ndo_set_rx_mode	= b44_set_rx_mode,
 	.ndo_set_mac_address	= b44_set_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
diff --git a/drivers/net/ethernet/broadcom/b44.h b/drivers/net/ethernet/broadcom/b44.h
index e1905a4..8993d72 100644
--- a/drivers/net/ethernet/broadcom/b44.h
+++ b/drivers/net/ethernet/broadcom/b44.h
@@ -338,9 +338,10 @@ struct ring_info {
  * the layout
  */
 struct b44_hw_stats {
-#define _B44(x)	u32 x;
+#define _B44(x)	u64 x;
 B44_STAT_REG_DECLARE
 #undef _B44
+	struct u64_stats_sync	syncp;
 };
 
 struct ssb_device;
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: Eric Dumazet @ 2012-07-18  3:46 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: David Miller, netdev
In-Reply-To: <alpine.LFD.2.00.1207180358190.2128@ja.ssi.bg>

On Wed, 2012-07-18 at 04:06 +0300, Julian Anastasov wrote:

> 
> 	I created patch with seqlock usage. This version
> is with global seqlock because I'm not sure if 2048 locks
> per NH are good idea. This is only compile tested.
> After comments may be I have to resubmit in separate message.
> 
> 
> Subject: [PATCH] ipv4: use seqlock for nh_exceptions
> 
> From: Julian Anastasov <ja@ssi.bg>
> 
> 	Use global seqlock for the nh_exceptions. Call
> fnhe_oldest with the right hash chain. Correct the diff
> value for dst_set_expires.
> 
> Signed-off-by: Julian Anastasov <ja@ssi.bg>
> ---
>  include/net/ip_fib.h |    2 +-
>  net/ipv4/route.c     |  117 ++++++++++++++++++++++++++++++++------------------
>  2 files changed, 76 insertions(+), 43 deletions(-)
> 

...

> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f67e702..e037c73 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1334,8 +1334,9 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
>  }
>  
>  static DEFINE_SPINLOCK(fnhe_lock);
> +static DEFINE_SEQLOCK(fnhe_seqlock);

Hi Julian

I find this patch too complex.

You could only change fnhe_lock to a seqlock

 net/ipv4/route.c |   35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..a96fc9d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1333,7 +1333,7 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 		build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SPINLOCK(fnhe_lock);
+static DEFINE_SEQLOCK(fnhe_seqlock);
 
 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
 {
@@ -1454,11 +1454,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 				struct fib_nh *nh = &FIB_RES_NH(res);
 				struct fib_nh_exception *fnhe;
 
-				spin_lock_bh(&fnhe_lock);
+				write_seqlock_bh(&fnhe_seqlock);
 				fnhe = find_or_create_fnhe(nh, fl4->daddr);
 				if (fnhe)
 					fnhe->fnhe_gw = new_gw;
-				spin_unlock_bh(&fnhe_lock);
+				write_sequnlock_bh(&fnhe_seqlock);
 			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
@@ -1665,13 +1665,13 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		struct fib_nh *nh = &FIB_RES_NH(res);
 		struct fib_nh_exception *fnhe;
 
-		spin_lock_bh(&fnhe_lock);
+		write_seqlock_bh(&fnhe_seqlock);
 		fnhe = find_or_create_fnhe(nh, fl4->daddr);
 		if (fnhe) {
 			fnhe->fnhe_pmtu = mtu;
 			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
 		}
-		spin_unlock_bh(&fnhe_lock);
+		write_sequnlock_bh(&fnhe_seqlock);
 	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
@@ -1904,18 +1904,29 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
+		unsigned int seq;
+		__be32 fnhe_daddr, gw;
+		u32 pmtu;
+		unsigned long expires;
+
+		do {
+			seq = read_seqbegin(&fnhe_seqlock);
+			fnhe_daddr = fnhe->fnhe_daddr;
+			gw = fnhe->fnhe_gw;
+			pmtu = fnhe->fnhe_pmtu;
+			expires = fnhe->fnhe_expires;
+		} while (read_seqretry(&fnhe_seqlock, seq));
+		if (fnhe_daddr == daddr) {
+			if (pmtu) {
+				unsigned long diff = expires - jiffies;
 
 				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
+					rt->rt_pmtu = pmtu;
 					dst_set_expires(&rt->dst, diff);
 				}
 			}
-			if (fnhe->fnhe_gw)
-				rt->rt_gateway = fnhe->fnhe_gw;
+			if (gw)
+				rt->rt_gateway = gw;
 			fnhe->fnhe_stamp = jiffies;
 			break;
 		}

^ permalink raw reply related

* Re: [PATCH v2] b44: add 64 bit stats
From: Eric Dumazet @ 2012-07-18  3:50 UTC (permalink / raw)
  To: Kevin Groeneveld; +Cc: netdev
In-Reply-To: <1342583161-1184-1-git-send-email-kgroeneveld@gmail.com>

On Tue, 2012-07-17 at 23:46 -0400, Kevin Groeneveld wrote:
> From: Kevin Groeneveld <kgroeneveld@gmail.com>
> 
> Add support for 64 bit stats to Broadcom b44 ethernet driver.
> 
> Signed-off-by: Kevin Groeneveld <kgroeneveld@gmail.com>
> ---
> v2: use u64_stats_fetch_begin_bh/u64_stats_fetch_retry_bh instead of
>     u64_stats_fetch_begin/u64_stats_fetch_retry as stats update happens in a
>     timer interrupt

Seems good this time, thanks

Signed-off-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* net-next and IPv6
From: Eric Dumazet @ 2012-07-18  4:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120717.110314.1984735460095688066.davem@davemloft.net>


IPv6 doesn't work anymore for me; at least TCP doesn't work.

ssh ::1

ping6 is ok

tcpdump shows garbled source IP and ip-proto

^ permalink raw reply

* Re: [RFC PATCH] net: cgroup: null ptr dereference in netprio cgroup during init
From: John Fastabend @ 2012-07-18  5:50 UTC (permalink / raw)
  To: Gao feng; +Cc: davem, nhorman, mark.d.rustad, netdev, eric.dumazet
In-Reply-To: <5006188B.7060606@cn.fujitsu.com>

On 7/17/2012 6:59 PM, Gao feng wrote:
> 于 2012年07月18日 08:33, John Fastabend 写道:
>> When the netprio cgroup is built in the kernel cgroup_init will call
>> cgrp_create which eventually calls update_netdev_tables. This is
>> being called before do_initcalls() so a null ptr dereference occurs
>> on init_net.
>>

[...]

>
>
> Thanks John.
> It's my mistake.
>
> Can we make sure init_net.count is zero here?
> I can't find some places to initialize it to zero.
>

It's defined in net_namespace.c, so it's zeroed by virtue
of being global. It is then initialized in setup_net via
pure_initcall(), always after cgroup_init(), if I've done
my accounting correctly.

.John

^ permalink raw reply

* RE: [RFC] r8169 : why SG / TX checksum are default disabled
From: hayeswang @ 2012-07-18  6:45 UTC (permalink / raw)
  To: 'Francois Romieu', 'Eric Dumazet'; +Cc: netdev
In-Reply-To: <20120717234037.GA26972@electric-eye.fr.zoreil.com>

Francois Romieu [mailto:romieu@fr.zoreil.com] 
[...]

> Hayes, should we not add into the kernel driver something similar to
> the rtl8168_start_xmit::skb_checksum_help stuff in Realtek's 
> 8168 driver ?
> There seems to be a bug for (skb->len < 60 && RTL_GIGA_MAC_VER_34.

For RTL8168E-VL (RTL_GIGA_MAC_VER_34), the hardware doesn't send a packet
whose length is less than 60 bytes. The hardware should pad such a packet
to 60 bytes, but it doesn't. Therefore, the software has to pad the packet to
60 bytes. However, the hw checksum would be incorrect for the modified packet,
so a software checksum is necessary. That is, for a packet shorter than 60
bytes, the software has to pad the packet and calculate the checksum, and the hw
checksum has to be disabled.

Best Regards,
Hayes

^ permalink raw reply

* Re: net-next and IPv6
From: Eric Dumazet @ 2012-07-18  7:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1342587513.2626.1518.camel@edumazet-glaptop>

On Wed, 2012-07-18 at 06:58 +0200, Eric Dumazet wrote:
> IPv6 doesnt work anymore for me, at least TCP doesnt work
> 
> ssh ::1
> 
> ping6 is ok
> 
> tcpdump shows garbled source IP and ip-proto
> 
> 

Probable bug coming from

commit 35ad9b9cf7d8a2e6259a0d24022e910adb6f3489
Author: David S. Miller <davem@davemloft.net>
Date:   Mon Jul 16 03:44:56 2012 -0700

    ipv6: Add helper inet6_csk_update_pmtu().
    
    This is the ipv6 version of inet_csk_update_pmtu().
    
    Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: Julian Anastasov @ 2012-07-18  7:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1342583166.2626.1367.camel@edumazet-glaptop>


	Hello,

On Wed, 18 Jul 2012, Eric Dumazet wrote:

> Hi Julian
> 
> I find this patch too complex.
> 
> You could only change fnhe_lock to a seqlock

	I'll change it, I was not sure if we are going to
use some array of seqlocks and also without adding locks in
the struct fib_nh.

> -static DEFINE_SPINLOCK(fnhe_lock);
> +static DEFINE_SEQLOCK(fnhe_seqlock);
>  
>  static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
>  {
> @@ -1454,11 +1454,11 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
>  				struct fib_nh *nh = &FIB_RES_NH(res);
>  				struct fib_nh_exception *fnhe;
>  
> -				spin_lock_bh(&fnhe_lock);
> +				write_seqlock_bh(&fnhe_seqlock);
>  				fnhe = find_or_create_fnhe(nh, fl4->daddr);
>  				if (fnhe)
>  					fnhe->fnhe_gw = new_gw;
> -				spin_unlock_bh(&fnhe_lock);
> +				write_sequnlock_bh(&fnhe_seqlock);
>  			}
>  			rt->rt_gateway = new_gw;
>  			rt->rt_flags |= RTCF_REDIRECTED;
> @@ -1665,13 +1665,13 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
>  		struct fib_nh *nh = &FIB_RES_NH(res);
>  		struct fib_nh_exception *fnhe;
>  
> -		spin_lock_bh(&fnhe_lock);
> +		write_seqlock_bh(&fnhe_seqlock);
>  		fnhe = find_or_create_fnhe(nh, fl4->daddr);
>  		if (fnhe) {
>  			fnhe->fnhe_pmtu = mtu;
>  			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
>  		}
> -		spin_unlock_bh(&fnhe_lock);
> +		write_sequnlock_bh(&fnhe_seqlock);
>  	}
>  	rt->rt_pmtu = mtu;
>  	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
> @@ -1904,18 +1904,29 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
>  
>  	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
>  	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
> -		if (fnhe->fnhe_daddr == daddr) {
> -			if (fnhe->fnhe_pmtu) {
> -				unsigned long expires = fnhe->fnhe_expires;
> -				unsigned long diff = jiffies - expires;
> +		unsigned int seq;
> +		__be32 fnhe_daddr, gw;
> +		u32 pmtu;
> +		unsigned long expires;
> +
> +		do {
> +			seq = read_seqbegin(&fnhe_seqlock);
> +			fnhe_daddr = fnhe->fnhe_daddr;
> +			gw = fnhe->fnhe_gw;
> +			pmtu = fnhe->fnhe_pmtu;
> +			expires = fnhe->fnhe_expires;
> +		} while (read_seqretry(&fnhe_seqlock, seq));

	Is this going to read all values in the chain
before reaching daddr? Or maybe FNHE_RECLAIM_DEPTH is
small and nobody will increase it. Maybe I can create
some func that searches for daddr in the chain instead. Do you
still prefer to remove the first daddr check, or is it only
that the code is indented too much?

> +		if (fnhe_daddr == daddr) {

	Also, do we need some rcu locking in
__ip_rt_update_pmtu, or maybe ipv4_update_pmtu is
always called under rcu lock?

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: net-next and IPv6
From: Eric Dumazet @ 2012-07-18  7:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1342595099.2626.1774.camel@edumazet-glaptop>

On Wed, 2012-07-18 at 09:05 +0200, Eric Dumazet wrote:
> On Wed, 2012-07-18 at 06:58 +0200, Eric Dumazet wrote:
> > IPv6 doesnt work anymore for me, at least TCP doesnt work
> > 
> > ssh ::1
> > 
> > ping6 is ok
> > 
> > tcpdump shows garbled source IP and ip-proto
> > 
> > 
> 
> Probable bug coming from
> 
> commit 35ad9b9cf7d8a2e6259a0d24022e910adb6f3489
> Author: David S. Miller <davem@davemloft.net>
> Date:   Mon Jul 16 03:44:56 2012 -0700
> 
>     ipv6: Add helper inet6_csk_update_pmtu().
>     
>     This is the ipv6 version of inet_csk_update_pmtu().
>     
>     Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> 

OK, I am testing a fix

^ permalink raw reply

* Re: [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: Eric Dumazet @ 2012-07-18  7:30 UTC (permalink / raw)
  To: Julian Anastasov; +Cc: David Miller, netdev
In-Reply-To: <alpine.LFD.2.00.1207181014050.1652@ja.ssi.bg>

On Wed, 2012-07-18 at 10:28 +0300, Julian Anastasov wrote:

> 	This is going to read all values in the chain
> before reaching daddr? Or may be FNHE_RECLAIM_DEPTH is
> small and nobody will increase it. May be I can create
> some func that searches daddr in chain instead. Do you still
> prefer to remove the first daddr check or it is only
> that the code is intended too much?
> 

I would not bother, since the real cost is the initial cache line miss.
Once you read one field, reading the others is really fast.

> > +		if (fnhe_daddr == daddr) {
> 
> 	Also, do we need some rcu locking in
> __ip_rt_update_pmtu or may be ipv4_update_pmtu is
> called always under rcu lock?

Sorry, I don't understand; we use the full lock
write_seqlock_bh(&fnhe_seqlock);/write_sequnlock_bh(&fnhe_seqlock);

^ permalink raw reply

* [PATCH net-next] ipv6: fix inet6_csk_xmit()
From: Eric Dumazet @ 2012-07-18  7:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Neal Cardwell, Yuchung Cheng
In-Reply-To: <1342596218.2626.1813.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

We should pass a struct flowi6 pointer to inet6_csk_route_socket(),
so that inet6_csk_xmit() works correctly instead of sending garbage.

Also add some consts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
---
 include/linux/ipv6.h             |    4 +-
 include/net/ip6_route.h          |    3 +-
 net/ipv6/inet6_connection_sock.c |   40 +++++++++++++++--------------
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index bc6c8fd..379e433 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -299,9 +299,9 @@ struct ipv6_pinfo {
 	struct in6_addr 	rcv_saddr;
 	struct in6_addr		daddr;
 	struct in6_pktinfo	sticky_pktinfo;
-	struct in6_addr		*daddr_cache;
+	const struct in6_addr		*daddr_cache;
 #ifdef CONFIG_IPV6_SUBTREES
-	struct in6_addr		*saddr_cache;
+	const struct in6_addr		*saddr_cache;
 #endif
 
 	__be32			flow_label;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b6b6f7d..5fa2af0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -158,7 +158,8 @@ extern void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
  *	Store a destination cache entry in a socket
  */
 static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				   struct in6_addr *daddr, struct in6_addr *saddr)
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct rt6_info *rt = (struct rt6_info *) dst;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 4a0c4d2..0251a60 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -171,7 +171,8 @@ EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
 
 static inline
 void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
-			   struct in6_addr *daddr, struct in6_addr *saddr)
+			   const struct in6_addr *daddr,
+			   const struct in6_addr *saddr)
 {
 	__ip6_dst_store(sk, dst, daddr, saddr);
 
@@ -203,31 +204,31 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 	return dst;
 }
 
-static struct dst_entry *inet6_csk_route_socket(struct sock *sk)
+static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
+						struct flowi6 *fl6)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_addr *final_p, final;
 	struct dst_entry *dst;
-	struct flowi6 fl6;
 
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_proto = sk->sk_protocol;
-	fl6.daddr = np->daddr;
-	fl6.saddr = np->saddr;
-	fl6.flowlabel = np->flow_label;
-	IP6_ECN_flow_xmit(sk, fl6.flowlabel);
-	fl6.flowi6_oif = sk->sk_bound_dev_if;
-	fl6.flowi6_mark = sk->sk_mark;
-	fl6.fl6_sport = inet->inet_sport;
-	fl6.fl6_dport = inet->inet_dport;
-	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+	memset(fl6, 0, sizeof(*fl6));
+	fl6->flowi6_proto = sk->sk_protocol;
+	fl6->daddr = np->daddr;
+	fl6->saddr = np->saddr;
+	fl6->flowlabel = np->flow_label;
+	IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+	fl6->flowi6_oif = sk->sk_bound_dev_if;
+	fl6->flowi6_mark = sk->sk_mark;
+	fl6->fl6_sport = inet->inet_sport;
+	fl6->fl6_dport = inet->inet_dport;
+	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
 
-	final_p = fl6_update_dst(&fl6, np->opt, &final);
+	final_p = fl6_update_dst(fl6, np->opt, &final);
 
 	dst = __inet6_csk_dst_check(sk, np->dst_cookie);
 	if (!dst) {
-		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
 
 		if (!IS_ERR(dst))
 			__inet6_csk_dst_store(sk, dst, NULL, NULL);
@@ -243,7 +244,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	struct dst_entry *dst;
 	int res;
 
-	dst = inet6_csk_route_socket(sk);
+	dst = inet6_csk_route_socket(sk, &fl6);
 	if (IS_ERR(dst)) {
 		sk->sk_err_soft = -PTR_ERR(dst);
 		sk->sk_route_caps = 0;
@@ -265,12 +266,13 @@ EXPORT_SYMBOL_GPL(inet6_csk_xmit);
 
 struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
 {
-	struct dst_entry *dst = inet6_csk_route_socket(sk);
+	struct flowi6 fl6;
+	struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6);
 
 	if (IS_ERR(dst))
 		return NULL;
 	dst->ops->update_pmtu(dst, sk, NULL, mtu);
 
-	return inet6_csk_route_socket(sk);
+	return inet6_csk_route_socket(sk, &fl6);
 }
 EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);

^ permalink raw reply related

* Re: [RFC PATCH] net: cgroup: null ptr dereference in netprio cgroup during init
From: Gao feng @ 2012-07-18  7:58 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, nhorman, mark.d.rustad, netdev, eric.dumazet
In-Reply-To: <50064E95.7020503@intel.com>

于 2012年07月18日 13:50, John Fastabend 写道:
> On 7/17/2012 6:59 PM, Gao feng wrote:
>> 于 2012年07月18日 08:33, John Fastabend 写道:
>>> When the netprio cgroup is built in the kernel cgroup_init will call
>>> cgrp_create which eventually calls update_netdev_tables. This is
>>> being called before do_initcalls() so a null ptr dereference occurs
>>> on init_net.
>>>
> 
> [...]
> 
>>
>>
>> Thanks John.
>> It's my mistake.
>>
>> Can we make sure init_net.count is zero here?
>> I can't find some places to initialize it to zero.
>>
> 
> Its defined in net_namespace.c so it's zeroed by virtue
> of being global. And initialized in setup_net via
> pure_initcall() always after cgroup_init() if I've done
> my accounting correctly.

This looks fine to me.
Thanks John.

^ permalink raw reply

* Re: [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: Julian Anastasov @ 2012-07-18  8:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1342596648.2626.1831.camel@edumazet-glaptop>


	Hello,

On Wed, 18 Jul 2012, Eric Dumazet wrote:

> On Wed, 2012-07-18 at 10:28 +0300, Julian Anastasov wrote:
> 
> > 	This is going to read all values in the chain
> > before reaching daddr? Or may be FNHE_RECLAIM_DEPTH is
> > small and nobody will increase it. May be I can create
> > some func that searches daddr in chain instead. Do you still
> > prefer to remove the first daddr check or it is only
> > that the code is intended too much?
> > 
> 
> I would not bother, since real cost is the initial cache line miss.
> Once you read one field, reading others is really fast.

	Is the cost of read_seqbegin a problem? Here is a
2nd version, I still keep this first check for now.

> > > +		if (fnhe_daddr == daddr) {
> > 
> > 	Also, do we need some rcu locking in
> > __ip_rt_update_pmtu or may be ipv4_update_pmtu is
> > called always under rcu lock?
> 
> Sorry, I dont understand, we use the full lock
> write_seqlock_bh(&fnhe_seqlock);/write_sequnlock_bh(&fnhe_seqlock);

	No, it is not related to the fnhe locking, I mean:

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c7a40c3..c911caf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1686,12 +1686,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	rcu_read_lock();
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
+	rcu_read_unlock();
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }


	How about the following, is the first daddr check
still a problem?

Subject: [PATCH v2] ipv4: use seqlock for nh_exceptions

From: Julian Anastasov <ja@ssi.bg>

	Use global seqlock for the nh_exceptions. Call
fnhe_oldest with the right hash chain. Correct the diff
value for dst_set_expires.

v2: after suggestions from Eric Dumazet:
* get rid of spin lock fnhe_lock, rearrange update_or_create_fnhe
* continue daddr search in rt_bind_exception

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/ip_fib.h |    2 +-
 net/ipv4/route.c     |  119 +++++++++++++++++++++++++++++---------------------
 2 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e9ee1ca..2daf096 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -51,7 +51,7 @@ struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
-	u32				fnhe_gw;
+	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	unsigned long			fnhe_stamp;
 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..1485db1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1333,9 +1333,9 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 		build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SPINLOCK(fnhe_lock);
+static DEFINE_SEQLOCK(fnhe_seqlock);
 
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
 
@@ -1358,47 +1358,63 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 	return hval & (FNHE_HASH_SIZE - 1);
 }
 
-static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+				  u32 pmtu, unsigned long expires)
 {
-	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
 	int depth;
-	u32 hval;
+	u32 hval = fnhe_hashfun(daddr);
+
+	write_seqlock_bh(&fnhe_seqlock);
 
+	hash = nh->nh_exceptions;
 	if (!hash) {
-		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
-						   GFP_ATOMIC);
+		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 		if (!hash)
-			return NULL;
+			goto out_unlock;
+		nh->nh_exceptions = hash;
 	}
 
-	hval = fnhe_hashfun(daddr);
 	hash += hval;
 
 	depth = 0;
 	for (fnhe = rcu_dereference(hash->chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 		if (fnhe->fnhe_daddr == daddr)
-			goto out;
+			break;
 		depth++;
 	}
 
-	if (depth > FNHE_RECLAIM_DEPTH) {
-		fnhe = fnhe_oldest(hash + hval, daddr);
-		goto out_daddr;
+	if (fnhe) {
+		if (gw)
+			fnhe->fnhe_gw = gw;
+		if (pmtu) {
+			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_expires = expires;
+		}
+	} else {
+		if (depth > FNHE_RECLAIM_DEPTH)
+			fnhe = fnhe_oldest(hash);
+		else {
+			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+			if (!fnhe)
+				goto out_unlock;
+
+			fnhe->fnhe_next = hash->chain;
+			rcu_assign_pointer(hash->chain, fnhe);
+		}
+		fnhe->fnhe_daddr = daddr;
+		fnhe->fnhe_gw = gw;
+		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_expires = expires;
 	}
-	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
-	if (!fnhe)
-		return NULL;
-
-	fnhe->fnhe_next = hash->chain;
-	rcu_assign_pointer(hash->chain, fnhe);
 
-out_daddr:
-	fnhe->fnhe_daddr = daddr;
-out:
 	fnhe->fnhe_stamp = jiffies;
-	return fnhe;
+
+out_unlock:
+	write_sequnlock_bh(&fnhe_seqlock);
+	return;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
@@ -1452,13 +1468,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		} else {
 			if (fib_lookup(net, fl4, &res) == 0) {
 				struct fib_nh *nh = &FIB_RES_NH(res);
-				struct fib_nh_exception *fnhe;
 
-				spin_lock_bh(&fnhe_lock);
-				fnhe = find_or_create_fnhe(nh, fl4->daddr);
-				if (fnhe)
-					fnhe->fnhe_gw = new_gw;
-				spin_unlock_bh(&fnhe_lock);
+				update_or_create_fnhe(nh, fl4->daddr, new_gw,
+						      0, 0);
 			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
@@ -1663,15 +1675,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
-		struct fib_nh_exception *fnhe;
 
-		spin_lock_bh(&fnhe_lock);
-		fnhe = find_or_create_fnhe(nh, fl4->daddr);
-		if (fnhe) {
-			fnhe->fnhe_pmtu = mtu;
-			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
-		}
-		spin_unlock_bh(&fnhe_lock);
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+				      jiffies + ip_rt_mtu_expires);
 	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
@@ -1904,21 +1910,34 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
-
-				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
-					dst_set_expires(&rt->dst, diff);
-				}
+		__be32 fnhe_daddr, gw;
+		u32 pmtu;
+		unsigned long expires;
+		unsigned int seq;
+
+		if (fnhe->fnhe_daddr != daddr)
+			continue;
+		do {
+			seq = read_seqbegin(&fnhe_seqlock);
+			fnhe_daddr = fnhe->fnhe_daddr;
+			gw = fnhe->fnhe_gw;
+			pmtu = fnhe->fnhe_pmtu;
+			expires = fnhe->fnhe_expires;
+		} while (read_seqretry(&fnhe_seqlock, seq));
+		if (daddr != fnhe_daddr)
+			continue;
+		if (pmtu) {
+			unsigned long diff = expires - jiffies;
+
+			if (time_before(jiffies, expires)) {
+				rt->rt_pmtu = pmtu;
+				dst_set_expires(&rt->dst, diff);
 			}
-			if (fnhe->fnhe_gw)
-				rt->rt_gateway = fnhe->fnhe_gw;
-			fnhe->fnhe_stamp = jiffies;
-			break;
 		}
+		if (gw)
+			rt->rt_gateway = gw;
+		fnhe->fnhe_stamp = jiffies;
+		break;
 	}
 }
 
-- 
1.7.3.4

^ permalink raw reply related

* Re: [RFC] r8169 : why SG / TX checksum are default disabled
From: Eric Dumazet @ 2012-07-18  8:55 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev, Hayes Wang
In-Reply-To: <20120717234037.GA26972@electric-eye.fr.zoreil.com>

On Wed, 2012-07-18 at 01:40 +0200, Francois Romieu wrote:

> > (I found that activating them with ethtool automatically enables GSO,
> >  and performance with GSO is not good)
> 
> It's still an improvement though, isn't it ?
> 

On an old AMD machine, I can get line rate with default conf, but using
nearly all cpu cycles.

Following test is only partial, a real one should use forwarding for
example...


# perf stat netperf -H eric -C -c -t OMNI
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to eric () port 0 AF_INET
tcpi_rto 201000 tcpi_ato 0 tcpi_pmtu 1500 tcpi_rcv_ssthresh 14600
tcpi_rtt 1000 tcpi_rttvar 750 tcpi_snd_ssthresh 16 tpci_snd_cwnd 62
tcpi_reordering 3 tcpi_total_retrans 0
Local       Remote      Local  Elapsed Throughput Throughput  Local  Local  Remote Remote Local   Remote  Service  
Send Socket Recv Socket Send   Time               Units       CPU    CPU    CPU    CPU    Service Service Demand   
Size        Size        Size   (sec)                          Util   Util   Util   Util   Demand  Demand  Units    
Final       Final                                             %      Method %      Method                          
290160      549032      16384  10.00   915.44     10^6bits/s  44.93  S      3.61   S      8.042   7.755   usec/KB  

 Performance counter stats for 'netperf -H eric -C -c -t OMNI':

       5206,301186 task-clock                #    0,520 CPUs utilized          
            16 568 context-switches          #    0,003 M/sec                  
                 2 CPU-migrations            #    0,000 K/sec                  
               366 page-faults               #    0,070 K/sec                  
    12 362 775 266 cycles                    #    2,375 GHz                     [66,99%]
     2 529 275 760 stalled-cycles-frontend   #   20,46% frontend cycles idle    [67,00%]
     6 878 915 080 stalled-cycles-backend    #   55,64% backend  cycles idle    [66,24%]
     5 272 222 150 instructions              #    0,43  insns per cycle        
                                             #    1,30  stalled cycles per insn [66,85%]
       819 922 185 branches                  #  157,487 M/sec                   [66,79%]
        50 135 423 branch-misses             #    6,11% of all branches         [66,15%]

      10,019141027 seconds time elapsed


If I switch to SG+TX (GSO is automatically enabled), bandwidth is lower.

# ethtool -K eth1 tx on sg on
Actual changes:
tx-checksumming: on
	tx-checksum-ipv4: on
scatter-gather: on
	tx-scatter-gather: on
generic-segmentation-offload: on

# perf stat netperf -H eric -C -c -t OMNI
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to eric () port 0 AF_INET
tcpi_rto 201000 tcpi_ato 0 tcpi_pmtu 1500 tcpi_rcv_ssthresh 14600
tcpi_rtt 1875 tcpi_rttvar 750 tcpi_snd_ssthresh 21 tpci_snd_cwnd 169
tcpi_reordering 3 tcpi_total_retrans 0
Local       Remote      Local  Elapsed Throughput Throughput  Local  Local  Remote Remote Local   Remote  Service  
Send Socket Recv Socket Send   Time               Units       CPU    CPU    CPU    CPU    Service Service Demand   
Size        Size        Size   (sec)                          Util   Util   Util   Util   Demand  Demand  Units    
Final       Final                                             %      Method %      Method                          
790920      704640      16384  10.01   762.29     10^6bits/s  38.00  S      3.38   S      8.167   8.720   usec/KB  

 Performance counter stats for 'netperf -H eric -C -c -t OMNI':

       4526,838736 task-clock                #    0,452 CPUs utilized          
             2 031 context-switches          #    0,449 K/sec                  
                 3 CPU-migrations            #    0,001 K/sec                  
               366 page-faults               #    0,081 K/sec                  
     4 476 876 825 cycles                    #    0,989 GHz                     [66,41%]
       899 080 378 stalled-cycles-frontend   #   20,08% frontend cycles idle    [66,56%]
     2 430 763 937 stalled-cycles-backend    #   54,30% backend  cycles idle    [66,87%]
     1 685 481 163 instructions              #    0,38  insns per cycle        
                                             #    1,44  stalled cycles per insn [66,93%]
       280 404 977 branches                  #   61,943 M/sec                   [66,73%]
        15 608 497 branch-misses             #    5,57% of all branches         [66,54%]

      10,025486268 seconds time elapsed

Since most frames need between 2 and 3 segments
(one for the ip/tcp headers, and one or two frags for the payload), this
might be a MMIO issue, that Alexander tried to solve recently...

If I only switch to SG+TX, it's ok

# ethtool -K eth1 gso off

# perf stat netperf -H eric -C -c -t OMNI
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to eric () port 0 AF_INET
tcpi_rto 201000 tcpi_ato 0 tcpi_pmtu 1500 tcpi_rcv_ssthresh 14600
tcpi_rtt 1000 tcpi_rttvar 750 tcpi_snd_ssthresh 18 tpci_snd_cwnd 60
tcpi_reordering 3 tcpi_total_retrans 0
Local       Remote      Local  Elapsed Throughput Throughput  Local  Local  Remote Remote Local   Remote  Service  
Send Socket Recv Socket Send   Time               Units       CPU    CPU    CPU    CPU    Service Service Demand   
Size        Size        Size   (sec)                          Util   Util   Util   Util   Demand  Demand  Units    
Final       Final                                             %      Method %      Method                          
280800      549032      16384  10.00   916.61     10^6bits/s  40.05  S      3.62   S      7.159   7.774   usec/KB  

 Performance counter stats for 'netperf -H eric -C -c -t OMNI':

       4827,259625 task-clock                #    0,482 CPUs utilized          
            17 988 context-switches          #    0,004 M/sec                  
                 3 CPU-migrations            #    0,001 K/sec                  
               366 page-faults               #    0,076 K/sec                  
    11 448 148 411 cycles                    #    2,372 GHz                     [66,57%]
     2 278 563 777 stalled-cycles-frontend   #   19,90% frontend cycles idle    [66,38%]
     6 420 123 655 stalled-cycles-backend    #   56,08% backend  cycles idle    [66,38%]
     4 471 468 064 instructions              #    0,39  insns per cycle        
                                             #    1,44  stalled cycles per insn [67,48%]
       757 302 269 branches                  #  156,880 M/sec                   [67,08%]
        44 320 435 branch-misses             #    5,85% of all branches         [66,16%]

      10,020331031 seconds time elapsed

^ permalink raw reply

* [PATCH v2] ipvs: fixed sparse warning
From: Claudiu Ghioc @ 2012-07-18  9:10 UTC (permalink / raw)
  To: netdev
  Cc: davem, netfilter-devel, linux-kernel, wensong, horms, ja, pablo,
	kaber, claudiu.ghioc, daniel.baluta

Removed the following sparse warnings, whether CONFIG_SYSCTL
is defined or not:
*       warning: symbol 'ip_vs_control_net_init_sysctl' was not
	declared. Should it be static?
*       warning: symbol 'ip_vs_control_net_cleanup_sysctl' was
	not declared. Should it be static?

Signed-off-by: Claudiu Ghioc <claudiu.ghioc@gmail.com>
---
 net/netfilter/ipvs/ip_vs_ctl.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d43e3c1..044e845 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3674,7 +3674,7 @@ static void ip_vs_genl_unregister(void)
  * per netns intit/exit func.
  */
 #ifdef CONFIG_SYSCTL
-int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 {
 	int idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);
@@ -3742,7 +3742,7 @@ int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	return 0;
 }
 
-void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
@@ -3753,8 +3753,8 @@ void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
 
 #else
 
-int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
-void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
 
 #endif
 
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH net-next] asix: Fix return value in AX88172A driver bind function
From: Christian Riesch @ 2012-07-18 10:56 UTC (permalink / raw)
  To: netdev; +Cc: Christian Riesch

Return -ENOTSUPP if the initialization fails because the
device is configured for a mode that is not supported by the driver.

Signed-off-by: Christian Riesch <christian.riesch@omicron.at>
---
 drivers/net/usb/ax88172a.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c
index 534a144..3d0f8fa 100644
--- a/drivers/net/usb/ax88172a.c
+++ b/drivers/net/usb/ax88172a.c
@@ -274,6 +274,7 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf)
 		break;
 	default:
 		netdev_err(dev->net, "Interface mode not supported by driver\n");
+		ret = -ENOTSUPP;
 		goto free;
 	}
 
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH net-next V1 2/9] include/linux: Add private flags for IPoIB interfaces
From: Or Gerlitz @ 2012-07-18 10:59 UTC (permalink / raw)
  To: davem; +Cc: roland, netdev, ali, sean.hefty, shlomop, Erez Shitrit,
	Or Gerlitz
In-Reply-To: <1342609202-32427-1-git-send-email-ogerlitz@mellanox.com>

From: Erez Shitrit <erezsh@mellanox.co.il>

The two new bits indicate whether a device is considered a PIF interface,
i.e. one of the "main" interfaces (ib0, ib1 etc.), or a VIF interface, i.e. a
cloned interface (ib0.1, ib1.2 etc.) that is now in use by the eIPoIB driver.

Signed-off-by: Erez Shitrit <erezsh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 include/linux/if.h |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/include/linux/if.h b/include/linux/if.h
index 1ec407b..f50dbf2 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -84,6 +84,8 @@
 #define IFF_LIVE_ADDR_CHANGE 0x100000	/* device supports hardware address
 					 * change when it's running */
 
+#define IFF_EIPOIB_PIF  0x200000       /* IPoIB PIF intf (ib0, ib1 etc.) */
+#define IFF_EIPOIB_VIF  0x400000       /* IPoIB VIF intf (ib0.x, ib1.x etc.) */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V1 3/9] IB/ipoib: Add support for acting as VIF
From: Or Gerlitz @ 2012-07-18 10:59 UTC (permalink / raw)
  To: davem; +Cc: roland, netdev, ali, sean.hefty, shlomop, Erez Shitrit,
	Or Gerlitz
In-Reply-To: <1342609202-32427-1-git-send-email-ogerlitz@mellanox.com>

From: Erez Shitrit <erezsh@mellanox.co.il>

When IPoIB interface acts as a VIF for an eIPoIB interface, it uses
the skb cb storage area on the RX flow, to place information which
can be of use to the upper layer device.

One such usage example, is when an eIPoIB interface needs to generate
a source mac for incoming Ethernet frames.

The IPoIB code checks the VIF private flag on the RX path, and according
to the value of the flag prepares the skb CB data, etc.

Signed-off-by: Erez Shitrit <erezsh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |    5 +++
 drivers/infiniband/ulp/ipoib/ipoib_cm.c   |    9 +++++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    8 ++++-
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   21 +++++++++++
 include/rdma/e_ipoib.h                    |   54 +++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 1 deletions(-)
 create mode 100644 include/rdma/e_ipoib.h

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index a57db27..0416e8f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -52,6 +52,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_sa.h>
 #include <linux/sched.h>
+#include <rdma/e_ipoib.h>
 
 /* constants */
 
@@ -209,6 +210,7 @@ struct ipoib_cm_rx {
 	unsigned long		jiffies;
 	enum ipoib_cm_state	state;
 	int			recv_count;
+	u32			qpn;
 };
 
 struct ipoib_cm_tx {
@@ -695,6 +697,9 @@ extern int ipoib_recvq_size;
 
 extern struct ib_sa_client ipoib_sa_client;
 
+void set_skb_oob_cb_data(struct sk_buff *skb, struct ib_wc *wc,
+			 struct napi_struct *napi);
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 extern int ipoib_debug_level;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 1ca7322..6042905 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -440,6 +440,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	struct net_device *dev = cm_id->context;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_rx *p;
+	struct ipoib_cm_data *data = event->private_data;
 	unsigned psn;
 	int ret;
 
@@ -452,6 +453,10 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	cm_id->context = p;
 	p->state = IPOIB_CM_RX_LIVE;
 	p->jiffies = jiffies;
+
+	/* used to keep track of base qpn in CM mode */
+	p->qpn = be32_to_cpu(data->qpn);
+
 	INIT_LIST_HEAD(&p->list);
 
 	p->qp = ipoib_cm_create_rx_qp(dev, p);
@@ -669,6 +674,10 @@ copied:
 	skb->dev = dev;
 	/* XXX get correct PACKET_ type here */
 	skb->pkt_type = PACKET_HOST;
+	/* if handler is registered on top of ipoib, set skb oob data. */
+	if (skb->dev->priv_flags & IFF_EIPOIB_VIF)
+		set_skb_oob_cb_data(skb, wc, NULL);
+
 	netif_receive_skb(skb);
 
 repost:
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f10221f..f248e6e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -304,7 +304,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 			likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	napi_gro_receive(&priv->napi, skb);
+	/* if handler is registered on top of ipoib, set skb oob data */
+	if (dev->priv_flags & IFF_EIPOIB_VIF) {
+		set_skb_oob_cb_data(skb, wc, &priv->napi);
+		/* the registered handler will take care of the skb */
+		netif_receive_skb(skb);
+	} else
+		napi_gro_receive(&priv->napi, skb);
 
 repost:
 	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index d0cb5cc..8575fa7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -91,6 +91,24 @@ static struct ib_client ipoib_client = {
 	.remove = ipoib_remove_one
 };
 
+void set_skb_oob_cb_data(struct sk_buff *skb, struct ib_wc *wc,
+			 struct napi_struct *napi)
+{
+	struct ipoib_cm_rx *p_cm_ctx = NULL;
+	struct eipoib_cb_data *data = NULL;
+
+	p_cm_ctx = wc->qp->qp_context;
+	data = IPOIB_HANDLER_CB(skb);
+
+	data->rx.slid = wc->slid;
+	data->rx.sqpn = wc->src_qp;
+	data->rx.napi = napi;
+
+	/* in CM mode, use the "base" qpn as sqpn */
+	if (p_cm_ctx)
+		data->rx.sqpn = p_cm_ctx->qpn;
+}
+
 int ipoib_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1277,6 +1295,9 @@ static struct net_device *ipoib_add_port(const char *format,
 		goto event_failed;
 	}
 
+	/* indicates pif port */
+	priv->dev->priv_flags |= IFF_EIPOIB_PIF;
+
 	result = register_netdev(priv->dev);
 	if (result) {
 		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
diff --git a/include/rdma/e_ipoib.h b/include/rdma/e_ipoib.h
new file mode 100644
index 0000000..7249334
--- /dev/null
+++ b/include/rdma/e_ipoib.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * openfabric.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _LINUX_ETH_IB_IPOIB_H
+#define _LINUX_ETH_IB_IPOIB_H
+
+#include <net/sch_generic.h>
+
+struct eipoib_cb_data {
+	/*
+	 * extra care taken not to collide with the usage done
+	 * by the qdisc layer in struct skb cb data.
+	 */
+	struct qdisc_skb_cb	qdisc_cb;
+	struct { /* must be <= 20 bytes */
+		u32 sqpn;
+		struct napi_struct *napi;
+		u16 slid;
+		u8 data[6];
+	} __packed rx;
+};
+
+#define IPOIB_HANDLER_CB(skb) ((struct eipoib_cb_data *)(skb)->cb)
+
+#endif /* _LINUX_ETH_IB_IPOIB_H */
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V1 8/9] net/eipoib: Add Makefile, Kconfig and MAINTAINERS entries
From: Or Gerlitz @ 2012-07-18 11:00 UTC (permalink / raw)
  To: davem; +Cc: roland, netdev, ali, sean.hefty, shlomop, Erez Shitrit,
	Or Gerlitz
In-Reply-To: <1342609202-32427-1-git-send-email-ogerlitz@mellanox.com>

From: Erez Shitrit <erezsh@mellanox.co.il>

Add Kconfig entry under drivers/net and MAINTAINERS entry for eIPoIB, also
add the driver makefile.

Signed-off-by: Erez Shitrit <erezsh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 MAINTAINERS                 |    6 ++++++
 drivers/net/Kconfig         |   15 +++++++++++++++
 drivers/net/Makefile        |    1 +
 drivers/net/eipoib/Makefile |    4 ++++
 4 files changed, 26 insertions(+), 0 deletions(-)
 create mode 100644 drivers/net/eipoib/Makefile

diff --git a/MAINTAINERS b/MAINTAINERS
index b4321fb..52f35ba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2618,6 +2618,12 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/ibm/ehea/
 
+EIPoIB (Ethernet services over IPoIB) DRIVER
+M:	Erez Shitrit <erezsh@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/eipoib/
+
 EMBEDDED LINUX
 M:	Paul Gortmaker <paul.gortmaker@windriver.com>
 M:	Matt Mackall <mpm@selenic.com>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c2bd80..ba98f61 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -68,6 +68,21 @@ config DUMMY
 	  To compile this driver as a module, choose M here: the module
 	  will be called dummy.
 
+config E_IPOIB
+	tristate "Ethernet Services over IPoIB"
+	depends on INFINIBAND_IPOIB
+	---help---
+	  This driver supports Ethernet protocol over InfiniBand IPoIB devices.
+	  Some services can run only on top of Ethernet L2 interfaces, and
+	  cannot be bound to an IPoIB interface.
+	  With this new driver, these services can run seamlessly.
+
+	  Main use case of the driver is the Ethernet Virtual Switching used in
+	  virtualized environments, where an eipoib netdevice can be used as a
+	  Physical Interface (PIF) in the hypervisor domain, and allow other guests
+	  Virtual Interfaces (VIF) connected to the same Virtual Switch to run over
+	  the InfiniBand fabric.
+
 config EQUALIZER
 	tristate "EQL (serial line load balancing) support"
 	---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 3d375ca..2c3409e 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_CAIF) += caif/
 obj-$(CONFIG_CAN) += can/
 obj-$(CONFIG_ETRAX_ETHERNET) += cris/
 obj-$(CONFIG_NET_DSA) += dsa/
+obj-$(CONFIG_E_IPOIB) += eipoib/
 obj-$(CONFIG_ETHERNET) += ethernet/
 obj-$(CONFIG_FDDI) += fddi/
 obj-$(CONFIG_HIPPI) += hippi/
diff --git a/drivers/net/eipoib/Makefile b/drivers/net/eipoib/Makefile
new file mode 100644
index 0000000..b64e96e
--- /dev/null
+++ b/drivers/net/eipoib/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_E_IPOIB)                         := eth_ipoib.o
+eth_ipoib-y                                    := eth_ipoib_main.o \
+                                                  eth_ipoib_sysfs.o \
+                                                  eth_ipoib_ethtool.o
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V1 1/9] IB/ipoib: Add support for clones / multiple childs on the same partition
From: Or Gerlitz @ 2012-07-18 10:59 UTC (permalink / raw)
  To: davem; +Cc: roland, netdev, ali, sean.hefty, shlomop, Or Gerlitz,
	Erez Shitrit
In-Reply-To: <1342609202-32427-1-git-send-email-ogerlitz@mellanox.com>

Allow creating "clone" child interfaces which further partition an
IPoIB interface into sub-interfaces that use either the same pkey as
their parent or the same pkey as an already created child interface.

Each child now has a child index, which together with the pkey is
used as the identifier of the created network device.

All sorts of childs are still created/deleted through sysfs, in a
similar manner to the way legacy child interfaces are.

A major use case for clone childs is for virtualization purposes, where
a per VM NIC is desired at the hypervisor level, such as the solution
provided by the newly introduced Ethernet IPoIB driver.

Signed-off-by: Erez Shitrit <erezsh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 Documentation/infiniband/ipoib.txt         |   23 +++++++++++++
 drivers/infiniband/ulp/ipoib/ipoib.h       |    7 +++-
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |   48 +++++++++++++++++++++-------
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c  |   46 ++++++++++++++++++--------
 5 files changed, 98 insertions(+), 29 deletions(-)

diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
index 64eeb55..601f78f 100644
--- a/Documentation/infiniband/ipoib.txt
+++ b/Documentation/infiniband/ipoib.txt
@@ -24,6 +24,29 @@ Partitions and P_Keys
   The P_Key for any interface is given by the "pkey" file, and the
   main interface for a subinterface is in "parent."
 
+Clones
+  It's possible to further partition an IPoIB interface, and create
+  "clone" child interfaces which either use the same pkey as their
+  parent, or as an already created child interface. Each child now has
+  a child index, which together with the pkey is used as the identifier
+  of the created network device.
+
+  All kinds of child interfaces are still created/deleted through sysfs,
+  in a similar manner to the way legacy child interfaces are, for example:
+
+    echo 0x8001.1 > /sys/class/net/ib0/create_child
+
+  will create an interface named ib0.8001.1 with P_Key 0x8001 and index 1
+
+    echo .1 > /sys/class/net/ib0/create_child
+
+  will create an interface named ib0.1 with same P_Key as ib0 and index 1
+
+  To remove a subinterface, use the "delete_child" file:
+
+    echo 0x8001.1 > /sys/class/net/ib0/delete_child
+    echo .1  > /sys/class/net/ib0/delete_child
+
 Datagram vs Connected modes
 
   The IPoIB driver supports two modes of operation: datagram and
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 86df632..a57db27 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -332,6 +332,7 @@ struct ipoib_dev_priv {
 	struct net_device *parent;
 	struct list_head child_intfs;
 	struct list_head list;
+	int child_index;
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	struct ipoib_cm_dev_priv cm;
@@ -490,8 +491,10 @@ void ipoib_transport_dev_cleanup(struct net_device *dev);
 void ipoib_event(struct ib_event_handler *handler,
 		 struct ib_event *record);
 
-int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
-int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey,
+						unsigned char clone_index);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey,
+						unsigned char clone_index);
 
 void ipoib_pkey_poll(struct work_struct *work);
 int ipoib_pkey_dev_delay_open(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index bbee4b2..d0cb5cc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1095,17 +1095,44 @@ int ipoib_add_umcast_attr(struct net_device *dev)
 	return device_create_file(&dev->dev, &dev_attr_umcast);
 }
 
+static int parse_child(struct device *dev, const char *buf, int *pkey,
+		       int *child_index)
+{
+	int ret;
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+	*pkey = *child_index = -1;
+
+	/* 'pkey' or 'pkey.child_index' or '.child_index' are allowed */
+	ret = sscanf(buf, "%i.%i", pkey, child_index);
+	if (ret == 1)  /* just pkey, implicit child index is 0 */
+		*child_index = 0;
+	else  if (ret != 2) { /* pkey same as parent, specified child index */
+		*pkey = priv->pkey;
+		ret  = sscanf(buf, ".%i", child_index);
+		if (ret != 1 || *child_index == 0)
+			return -EINVAL;
+	}
+
+	if (*child_index < 0 || *child_index > 0xff)
+		return -EINVAL;
+
+	if (*pkey < 0 || *pkey > 0xffff)
+		return -EINVAL;
+
+	ipoib_dbg(priv, "parse_child inp %s out pkey %04x index %d\n",
+		buf, *pkey, *child_index);
+	return 0;
+}
+
 static ssize_t create_child(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	int pkey;
+	int pkey, child_index;
 	int ret;
 
-	if (sscanf(buf, "%i", &pkey) != 1)
-		return -EINVAL;
-
-	if (pkey < 0 || pkey > 0xffff)
+	if (parse_child(dev, buf, &pkey, &child_index))
 		return -EINVAL;
 
 	/*
@@ -1114,7 +1141,7 @@ static ssize_t create_child(struct device *dev,
 	 */
 	pkey |= 0x8000;
 
-	ret = ipoib_vlan_add(to_net_dev(dev), pkey);
+	ret = ipoib_vlan_add(to_net_dev(dev), pkey, child_index);
 
 	return ret ? ret : count;
 }
@@ -1124,16 +1151,13 @@ static ssize_t delete_child(struct device *dev,
 			    struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	int pkey;
+	int pkey, child_index;
 	int ret;
 
-	if (sscanf(buf, "%i", &pkey) != 1)
-		return -EINVAL;
-
-	if (pkey < 0 || pkey > 0xffff)
+	if (parse_child(dev, buf, &pkey, &child_index))
 		return -EINVAL;
 
-	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
+	ret = ipoib_vlan_delete(to_net_dev(dev), pkey, child_index);
 
 	return ret ? ret : count;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 049a997..2131772 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -167,7 +167,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 			size += ipoib_recvq_size * ipoib_max_conn_qp;
 	}
 
-	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size,
+				     priv->child_index % priv->ca->num_comp_vectors);
 	if (IS_ERR(priv->recv_cq)) {
 		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
 		goto out_free_mr;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index d7e9740..2d35cb4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -49,7 +49,8 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr,
 }
 static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
 
-int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey,
+		unsigned char child_index)
 {
 	struct ipoib_dev_priv *ppriv, *priv;
 	char intf_name[IFNAMSIZ];
@@ -65,25 +66,40 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 	mutex_lock(&ppriv->vlan_mutex);
 
 	/*
-	 * First ensure this isn't a duplicate. We check the parent device and
-	 * then all of the child interfaces to make sure the Pkey doesn't match.
+	 * First ensure this isn't a duplicate. We check all of the child
+	 * interfaces to make sure the Pkey AND the child index
+	 * don't match.
 	 */
-	if (ppriv->pkey == pkey) {
-		result = -ENOTUNIQ;
-		priv = NULL;
-		goto err;
-	}
-
 	list_for_each_entry(priv, &ppriv->child_intfs, list) {
-		if (priv->pkey == pkey) {
+		if (priv->pkey == pkey && priv->child_index == child_index) {
 			result = -ENOTUNIQ;
 			priv = NULL;
 			goto err;
 		}
 	}
 
-	snprintf(intf_name, sizeof intf_name, "%s.%04x",
-		 ppriv->dev->name, pkey);
+	/*
+	 * for the case of non-legacy and same pkey childs we wanted to use
+	 * a notation of ibN.pkey:index and ibN:index but this is problematic
+	 * with tools like ifconfig who treat devices with ":" in their names
+	 * as aliases which are restricted, e.g. w.r.t. counters, etc.
+	 */
+	if (ppriv->pkey != pkey && child_index == 0) /* legacy child */
+		snprintf(intf_name, sizeof intf_name, "%s.%04x",
+			 ppriv->dev->name, pkey);
+	else if (ppriv->pkey != pkey && child_index != 0) /* non-legacy child */
+		snprintf(intf_name, sizeof intf_name, "%s.%04x.%d",
+			 ppriv->dev->name, pkey, child_index);
+	else if (ppriv->pkey == pkey && child_index != 0) /* same pkey child */
+		snprintf(intf_name, sizeof intf_name, "%s.%d",
+			 ppriv->dev->name, child_index);
+	else  {
+		ipoib_warn(ppriv, "wrong pkey/child_index pairing %04x %d\n",
+			   pkey, child_index);
+		result = -EINVAL;
+		goto err;
+	}
+
 	priv = ipoib_intf_alloc(intf_name);
 	if (!priv) {
 		result = -ENOMEM;
@@ -101,6 +117,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 		goto err;
 
 	priv->pkey = pkey;
+	priv->child_index = child_index;
 
 	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
 	priv->dev->broadcast[8] = pkey >> 8;
@@ -157,7 +174,8 @@ err:
 	return result;
 }
 
-int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey,
+		unsigned char child_index)
 {
 	struct ipoib_dev_priv *ppriv, *priv, *tpriv;
 	struct net_device *dev = NULL;
@@ -171,7 +189,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
 		return restart_syscall();
 	mutex_lock(&ppriv->vlan_mutex);
 	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
-		if (priv->pkey == pkey) {
+		if (priv->pkey == pkey && priv->child_index == child_index) {
 			unregister_netdevice(priv->dev);
 			ipoib_dev_cleanup(priv->dev);
 			list_del(&priv->list);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next V1 0/9] Add Ethernet IPoIB driver
From: Or Gerlitz @ 2012-07-18 10:59 UTC (permalink / raw)
  To: davem; +Cc: roland, netdev, ali, sean.hefty, shlomop, Or Gerlitz

changes from V0:
 - applied feedback from Eric/Dave - RX flow uses only the last 20 bytes of skb->cb[]
 - applied feedback from Ben H. on ethtool changes
 - fix sparse error on function which should be made static
 - made the netdev features related code of the driver more elegant/robust
 - used _bh locking in some paths which used plain rw locking in V0
 - some code rearrangements in flows that send ARPs

The eIPoIB driver provides a standard Ethernet netdevice over 
the InfiniBand IPoIB interface.

Some services can run only on top of Ethernet L2 interfaces, and cannot be
bound to an IPoIB interface. With this new driver, these services can run
seamlessly.

Main use case of the driver is the Ethernet Virtual Switching used in
virtualized environments, where an eipoib netdevice can be used as a 
Physical Interface (PIF) in the hypervisor domain, and allow other 
guests Virtual Interfaces (VIF) connected to the same Virtual Switch 
to run over the InfiniBand fabric.

This driver supports L2 Switching (Direct Bridging) as well as other L3
Switching modes (e.g. NAT).

Whenever an IPoIB interface is created, one eIPoIB PIF netdevice
will be created. The default naming scheme is as in other Ethernet
interfaces: ethX. For example, on a system with two IPoIB interfaces,
ib0 and ib1, two interfaces will be created, ethX and ethX+1, where "X"
is the next free Ethernet number in the system.

Running "ethtool -i ethX" on the new interface shows which IPoIB PIF
interface it sits on top of. For example, "driver: eth_ipoib:ib0" in the
output for eth3 indicates that eth3 is the Ethernet interface over the
ib0 IPoIB interface.

The driver can be used as independent interface or to serve in
virtualization environment as the physical layer for the virtual
interfaces on the virtual guest.

The driver interface (the eipoib interface, also referred to as the parent)
uses slave interfaces, IPoIB clones, which are the VIFs described above.

VIF interfaces are enslaved to, or released from, the eipoib driver on demand,
as directed through the management interface provided to user space.

The management interface for the driver uses sysfs entries. Via these sysfs
entries the driver gets details on new VIFs to manage. The driver can
enslave a new VIF (IPoIB cloned interface) or detach from it.

Here are a few sysfs commands that are used to manage the driver,
covering a few common scenarios:

1. create new clone of IPoIB interface:

	$ echo .Y > /sys/class/net/ibX/create_child

create new clone ibX.Y with the same pkey as ibX, for example:

	$ echo .1 > /sys/class/net/ib0/create_child

will create new interface ib0.1

2. notify parent interface on new VIF to enslave:

	$ echo +ibX.Y > /sys/class/net/ethZ/eth/slaves

where ethZ is the driver interface, for example:

	$ echo +ib0.1 > /sys/class/net/eth4/eth/slaves

will enslave ib0.1 to eth4

3. notify parent interface on VIF details (MAC and VLAN):

	$ echo +ibX.Y <MAC address> > /sys/class/net/ethZ/eth/vifs

for example:

	$ echo +ib0.1 00:02:c9:43:3b:f1 > /sys/class/net/eth4/eth/vifs

4. notify parent to release VIF:

	$ echo -ibX.Y > /sys/class/net/ethZ/eth/slaves

where ethZ is the driver interface, for example:

        $ echo -ib0.1 > /sys/class/net/eth4/eth/slaves

will release ib0.1 from eth4

5. see the list of IPoIB interfaces enslaved under an eipoib interface:

	$ cat /sys/class/net/ethX/eth/vifs

for example:

	$ cat /sys/class/net/eth4/eth/vifs

	SLAVE=ib0.1      MAC=9a:c2:1f:d7:3b:63 VLAN=N/A
	SLAVE=ib0.2      MAC=52:54:00:60:55:88 VLAN=N/A
	SLAVE=ib0.3      MAC=52:54:00:60:55:89 VLAN=N/A

Note: Each ethX interface has at least one ibX.Y slave to serve the PIF
itself. In the VIFs list of ethX you'll notice that ibX.1 is always created
to serve applications running in the hypervisor directly on top of the ethX
interface.

For IB applications that require native IPoIB interfaces (e.g. RDMA-CM), the
original ipoib interfaces ibX can still be used.  For example, RDMA-CM and
eth_ipoib drivers can co-exist and both make use of IPoIB.

The last patch of this series was made so that the series works as is over
net-next. In parallel to the submission of this driver, a patch was posted
that modifies IPoIB such that it doesn't assume a dst/neighbour on the skb.

Or.

Erez Shitrit (8):
  include/linux: Add private flags for IPoIB interfaces
  IB/ipoib: Add support for acting as VIF
  net/eipoib: Add private header file
  net/eipoib: Add ethtool file support
  net/eipoib: Add sysfs support
  net/eipoib: Add main driver functionality
  net/eipoib: Add Makefile, Kconfig and MAINTAINERS entries
  IB/ipoib: Add support for transmission of skbs w.o dst/neighbour

Or Gerlitz (1):
  IB/ipoib: Add support for clones / multiple childs on the same partition

 Documentation/infiniband/ipoib.txt         |   23 +
 MAINTAINERS                                |    6 +
 drivers/infiniband/ulp/ipoib/ipoib.h       |   12 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c    |    9 +
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |    8 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |   76 +-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c  |   46 +-
 drivers/net/Kconfig                        |   15 +
 drivers/net/Makefile                       |    1 +
 drivers/net/eipoib/Makefile                |    4 +
 drivers/net/eipoib/eth_ipoib.h             |  227 ++++
 drivers/net/eipoib/eth_ipoib_ethtool.c     |  126 ++
 drivers/net/eipoib/eth_ipoib_main.c        | 1915 ++++++++++++++++++++++++++++
 drivers/net/eipoib/eth_ipoib_sysfs.c       |  640 ++++++++++
 include/linux/if.h                         |    2 +
 include/rdma/e_ipoib.h                     |   54 +
 17 files changed, 3134 insertions(+), 33 deletions(-)
 create mode 100644 drivers/net/eipoib/Makefile
 create mode 100644 drivers/net/eipoib/eth_ipoib.h
 create mode 100644 drivers/net/eipoib/eth_ipoib_ethtool.c
 create mode 100644 drivers/net/eipoib/eth_ipoib_main.c
 create mode 100644 drivers/net/eipoib/eth_ipoib_sysfs.c
 create mode 100644 include/rdma/e_ipoib.h

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox