Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] TCP: remove TCP_DEBUG
From: Flavio Leitner @ 2011-10-24 18:15 UTC (permalink / raw)
  To: netdev
  Cc: David Miller, Dan McGee, kuznet, jmorris, yoshfuji, kaber,
	linux-kernel
In-Reply-To: <CAEik5nPciVMwOE-hC+oPP1gMEvRzbhFwAZHKNQnHEc0YOUXuWA@mail.gmail.com>

It was enabled by default and the messages guarded
by the define are useful.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
---
 include/net/tcp.h    |    1 -
 net/ipv4/tcp.c       |    2 --
 net/ipv4/tcp_timer.c |    2 --
 3 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ed0e814..e147f42 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -18,7 +18,6 @@
 #ifndef _TCP_H
 #define _TCP_H
 
-#define TCP_DEBUG 1
 #define FASTRETRANS_DEBUG 1
 
 #include <linux/list.h>
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eefc61e..34f5db1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1193,13 +1193,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
 
-#if TCP_DEBUG
 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
 	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
 	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-#endif
 
 	if (inet_csk_ack_scheduled(sk)) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ecd44b0..2e0f0af 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -334,7 +334,6 @@ void tcp_retransmit_timer(struct sock *sk)
 		 * connection. If the socket is an orphan, time it out,
 		 * we cannot allow such beasts to hang infinitely.
 		 */
-#ifdef TCP_DEBUG
 		struct inet_sock *inet = inet_sk(sk);
 		if (sk->sk_family == AF_INET) {
 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
@@ -349,7 +348,6 @@ void tcp_retransmit_timer(struct sock *sk)
 			       inet->inet_num, tp->snd_una, tp->snd_nxt);
 		}
 #endif
-#endif
 		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
 			tcp_write_err(sk);
 			goto out;
-- 
1.7.6

^ permalink raw reply related

* Re: [net-next-2.6 PATCH 0/8 RFC v2] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-10-24 18:15 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, sri, dragos.tatulea, arnd, kvm, davem, mchan, dwang2,
	shemminger, eric.dumazet, kaber, benve, Rose, Gregory V
In-Reply-To: <20111024054710.GB24528@redhat.com>

On 10/23/11 10:47 PM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Oct 18, 2011 at 11:25:54PM -0700, Roopa Prabhu wrote:
>> v1 version of this RFC patch was posted at
>> http://www.spinics.net/lists/netdev/msg174245.html
>> 
>> Today macvtap used in virtualized environment does not have support to
>> propagate MAC, VLAN and interface flags from guest to lowerdev.
>> Which means to be able to register additional VLANs, unicast and multicast
>> addresses or change pkt filter flags in the guest, the lowerdev has to be
>> put in promiscuous mode. Today the only macvlan mode that supports this is
>> the PASSTHRU mode and it puts the lower dev in promiscuous mode.
>> 
>> PASSTHRU mode was added primarily for the SRIOV usecase. In PASSTHRU mode
>> there is a 1-1 mapping between macvtap and physical NIC or VF.
>> 
>> There are two problems with putting the lowerdev in promiscuous mode (ie SRIOV
>> VF's):
>> - Some SRIOV cards don't support promiscuous mode today (Thread on Intel
>> driver indicates that http://lists.openwall.net/netdev/2011/09/27/6)
>> - For the SRIOV NICs that support it, putting the lowerdev in
>> promiscuous mode leads to additional traffic being sent up to the
>> guest virtio-net to filter, resulting in extra overhead.
>> 
>> Both the above problems can be solved by offloading filtering to the
>> lowerdev hw. ie lowerdev does not need to be in promiscuous mode as
>> long as the guest filters are passed down to the lowerdev.
>> 
>> This patch basically adds the infrastructure to set and get MAC and VLAN
>> filters on an interface via rtnetlink. And adds support in macvlan and
>> macvtap
>> to allow set and get filter operations.
> 
> Looks sane to me. Some minor comments below.
> 
>> Earlier version of this patch provided the TUNSETTXFILTER macvtap interface
>> for setting address filtering. In response to feedback, This version
>> introduces a netlink interface for the same.
>> 
>> Response to some of the questions raised during v1:
>> 
>> - Netlink interface:
>> This patch provides the following netlink interface to set mac and vlan
>> filters :
>> [IFLA_RX_FILTER] = {
>> [IFLA_ADDR_FILTER] = {
>> [IFLA_ADDR_FILTER_FLAGS]
>> [IFLA_ADDR_FILTER_UC_LIST] = {
>> [IFLA_ADDR_LIST_ENTRY]
>> }
>> [IFLA_ADDR_FILTER_MC_LIST] = {
>> [IFLA_ADDR_LIST_ENTRY]
>> }
>> }
>> [IFLA_VLAN_FILTER] = {
>> [IFLA_VLAN_BITMAP]
>> }
>> }
>> 
>> Note: The IFLA_VLAN_FILTER is a nested attribute and contains only
>> IFLA_VLAN_BITMAP today. The idea is that the IFLA_VLAN_FILTER can
>> be extended tomorrow to use a vlan list option if some implementations
>> prefer a list instead.
>> 
>> And it provides the following rtnl_link_ops to set/get MAC/VLAN filters:
>> 
>>        int                     (*set_rx_addr_filter)(struct net_device *dev,
>>                                                struct nlattr *tb[]);
>>        int                     (*set_rx_vlan_filter)(struct net_device *dev,
>>                                                 struct nlattr *tb[]);
>>        size_t                  (*get_rx_addr_filter_size)(const struct
>> net_device *dev);
>>        size_t                  (*get_rx_vlan_filter_size)(const struct
>> net_device *dev);
>>        int                     (*fill_rx_addr_filter)(struct sk_buff *skb,
>>                                                 const struct net_device
>> *dev);
>>        int                     (*fill_rx_vlan_filter)(struct sk_buff *skb,
>>                                                 const struct net_device
>> *dev);
>> 
>> 
>> Note: The choice of rtnl_link_ops was because I saw the use case for
>> this in virtual devices that need  to do filtering in sw like macvlan
>> and tun. Hw devices usually have filtering in hw with netdev->uc and
>> mc lists to indicate active filters. But I can move from rtnl_link_ops
>> to netdev_ops if that is the preferred way to go and if there is a
>> need to support this interface on all kinds of interfaces.
>> Please suggest.
>> 
>> - Protection against address spoofing:
>> - This patch adds filtering support only for macvtap PASSTHRU
>> Mode. PASSTHRU mode is used mainly with SRIOV VF's. And SRIOV VF's
>> come with anti mac/vlan spoofing support. (Recently added
>> IFLA_VF_SPOOFCHK). In 802.1Qbh case the port profile has a knob to
>> enable/disable anti spoof check. Lowerdevice drivers also enforce limits
>> on the number of address registrations allowed.
>> 
>> - Support for multiqueue devices: Enable filtering on individual queues (?):
>> AFAIK, there is no netdev interface to install per queue hw
>> filters for a multi queue interface. And also I dont know of any hw
>> that provides an interface to set hw filters on a per queue basis.
> 
> VMDq hardware would support this, no?
> 
I am not really sure. This patch uses netdev to pass filters to hw. And I
don't see any netdev infrastructure that would support per queue filters.
Maybe Greg (CC'ed) or anyone else from Intel can answer this.
Greg, Michael had brought up this question during the first version of these
patches as well. It would be nice to get the VMDq requirements for propagating
guest filters to hw clarified. Do you see any special VMDq NIC requirement
we can cover in this patch? This is for VMDq queues directly connected to
guest NICs. Thanks.


>> A multi queue device appears as a single lowerdev (ie netdev) and
>> uses the same uc and mc lists to setup unicast and multicast hw filters.
>> So i dont see a huge problem with this patch coming in the way for
>> multi queue devices.
>> 
>> - Support for non-PASSTHRU mode:
>> I started implementing this. But there are a couple of problems. 
>> - The lowerdev may not be a SRIOV VF and may not have
>> anti spoof capability
> 
> Anti-spoofing is really a separate feature, isn't it?
> 
Yes that is correct. It really should not be a concern with implementing
support for non-PASSTHRU mode. The only intent of adding the above line was
that eventually we should probably think of supporting anti-spoof feature on
Non-sriov devices if they are accepting filters from the guest.
I think I will move the above line to some place else more appropriate in
the comment log instead of covering it as part of the non-passthru macvlan
implementation.
 

>> - Today, in non-PASSTHRU cases macvlan_handle_frame assumes that
>> every macvlan device on top of the lowerdev has a single unique mac.
>> And the macvlans are hashed on that single mac address.
>> To support filtering for non-PASSTHRU mode in addition to this
>> patch the following needs to be done:
>> - non-passthru mode with a single macvlan over a lower dev
>> can be treated as PASSTHRU case
>> - For non-PASSTHRU mode with multiple macvlans over a single
>> lower dev:  
>> - Multiple unicast mac's now need to be hashed to the
>> same macvlan device. The macvlan hash needs to change
>> for lookup based on any one of the multiple unicast
>> addresses a macvlan is interested in
>> - We need to consider vlans during the lookup too
>> - So the macvlan device hash needs to hash on both mac
>> and vlan
> 
> It might be useful to expose the filters to the device.
> 

Yes

>> - But the support for filtering in non-PASSTHRU mode can be
>> built on this patch
> 
> 
> Agree, this can be added gradually.
> 

OK, thanks. I am currently testing a newer version of these patches and will
post them sometime this week.

^ permalink raw reply

* e.m-ail
From: -Drw(REFF) @ 2011-10-24 18:03 UTC (permalink / raw)



KINDLY PROVIDE YOUR NAME-  ADDRESS-  SEX-  AGE-  MOBILE-  FOR YOUR ONE-MILLION POUNDS WHICH
YOU HAVE WON.  

^ permalink raw reply

* Re: Kernel Panic every 2 weeks on ISP server (NULL pointer dereference)
From: Luciano Ruete @ 2011-10-24 18:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1319346989.6180.71.camel@edumazet-laptop>

On Sunday, October 23, 2011 02:16:29 am Eric Dumazet wrote:
> Le samedi 22 octobre 2011 à 22:18 -0300, Luciano Ruete a écrit :
> > Hi,
> > 
> > I'm the sysadmin at a 3500 customers ISP, which runs an iptables+tc
> > solution for load balancing and QoS.
> > 
> > Every 2 or 3 weeks the server panics with a "NULL pointer dereference"
> > and with IP at "dev_queue_xmit"
> > 
> > It is curious that if I disable MSI on the network card driver these
> > panics seem to disappear; does this ring a bell?
> > 
> > The server is an IBM, previously with Broadcom NetXtreme II BCM5709 nics
> > and now with Intel 82576. I change the nics thinking that maybe the bug
> > was in Broadcom Driver but it seems to affect MSI in general.
> > 
> > The tc+iptables rules are auto-generated with sequreisp[1] an ISP
> > solution that i wrote and is open sourced under AGPLv3.
> > 
> > Tell me if you need any further information, and please CC me because I'm not
> > subscribed.
> > 
> > 
> > root@server:~# uname -a
> > Linux server 2.6.35-30-server #60~lucid1-Ubuntu SMP Tue Sep 20 22:28:40
> > UTC 2011 x86_64 GNU/Linux
> > 
> > 
> > [1]https://github.com/sequre/sequreisp
> 
> Hi Luciano

Hi Eric!

Thanks for your answer...

> 
> [694250.472081] Code: f6
> 49 c1 e6 07          shl    $0x7,%r14
> 66 89 93 ac 00 00 00 mov    %dx,0xac(%rbx)
>[...]
> This looks like a dev_pick_tx() bug, using an out of bound
> queue_index number and returning a txq pointing after
> the device allocated array.

Clear explanation. Is there a tool to map the trace to kernel code, or did you
do this by hand?

> With recent kernels, this cannot happen anymore because
> we added fixes in this area.
> 
> You could try Ubuntu 11.10 (based on linux 3.0) kernel
> on your server, or apply following patch :
> 
> commit df32cc193ad88f7b1326b90af799c927b27f7654
> Author: Tom Herbert <therbert@google.com>
> Date:   Mon Nov 1 12:55:52 2010 -0700
> 
>     net: check queue_index from sock is valid for device
> 
>     In dev_pick_tx recompute the queue index if the value stored in the
>     socket is greater than or equal to the number of real queues for the
>     device.  The saved index in the sock structure is not guaranteed to
>     be appropriate for the egress device (this could happen on a route
>     change or in presence of tunnelling).  The result of the queue index
>     being bad would be to return a bogus queue (crash could presumably
>     follow).

There are lots of route changes on this server: there are 30 upstream providers
(15 are dynamic IP ADSLs) load balanced using VLANs and a VLAN switch.

Thanks again, I will try the kernel upgrade and post the results in this thread.

Regards!
-- 
Luciano Ruete
Sequre - Sys Admin
Mitre 617, piso 7, of. 1 
+54 261 4254894
Mendoza - Argentina
http://www.sequreisp.com/
http://www.sequre.com.ar/

^ permalink raw reply

* [PATCH v3 2/2] dp83640: free packet queues on remove
From: Richard Cochran @ 2011-10-24 17:55 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg, stable
In-Reply-To: <cover.1319478544.git.richard.cochran@omicron.at>

If the PHY should disappear (for example, on an USB Ethernet MAC), then
the driver would leak any undelivered time stamp packets. This commit
fixes the issue by calling the appropriate functions to free any packets
left in the transmit and receive queues.

The driver first appeared in v3.0.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: <stable@vger.kernel.org>
---
 drivers/net/phy/dp83640.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 311f5cb..dc44b73 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -875,6 +875,7 @@ static void dp83640_remove(struct phy_device *phydev)
 	struct dp83640_clock *clock;
 	struct list_head *this, *next;
 	struct dp83640_private *tmp, *dp83640 = phydev->priv;
+	struct sk_buff *skb;
 
 	if (phydev->addr == BROADCAST_ADDR)
 		return;
@@ -882,6 +883,12 @@ static void dp83640_remove(struct phy_device *phydev)
 	enable_status_frames(phydev, false);
 	cancel_work_sync(&dp83640->ts_work);
 
+	while ((skb = skb_dequeue(&dp83640->rx_queue)) != NULL)
+		kfree_skb(skb);
+
+	while ((skb = skb_dequeue(&dp83640->tx_queue)) != NULL)
+		skb_complete_tx_timestamp(skb, NULL);
+
 	clock = dp83640_clock_get(dp83640->clock);
 
 	if (dp83640 == clock->chosen) {
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH v3 1/2] dp83640: use proper function to free transmit time stamping packets
From: Richard Cochran @ 2011-10-24 17:55 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg, stable
In-Reply-To: <cover.1319478544.git.richard.cochran@omicron.at>

Commit da92b194 introduced a new rule for handling the cloned packets
for transmit time stamping. These packets must not be freed using any other
function than skb_complete_tx_timestamp. This commit fixes the one and only
driver using this API.

The driver first appeared in v3.0.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: <stable@vger.kernel.org>
---
 drivers/net/phy/dp83640.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index edd7304..311f5cb 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1060,7 +1060,7 @@ static void dp83640_txtstamp(struct phy_device *phydev,
 	struct dp83640_private *dp83640 = phydev->priv;
 
 	if (!dp83640->hwts_tx_en) {
-		kfree_skb(skb);
+		skb_complete_tx_timestamp(skb, NULL);
 		return;
 	}
 	skb_queue_tail(&dp83640->tx_queue, skb);
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH v3 0/2] net: time stamping fixes
From: Richard Cochran @ 2011-10-24 17:55 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg

[ Changes in v3: omit accepted patch and fixup the dp83640 patches. ]

These two patches depend on commit da92b194 and fix two bugs in a
PTP Hardware Clock driver. This driver was first introduced in Linux
version 3.0.

Richard Cochran (2):
  dp83640: use proper function to free transmit time stamping packets
  dp83640: free packet queues on remove

 drivers/net/phy/dp83640.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

-- 
1.7.2.5

^ permalink raw reply

* [PATCH net-next] tg3: add tx_dropped counter
From: Eric Dumazet @ 2011-10-24 17:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Matt Carlson, Michael Chan

If a frame can't be transmitted, it is silently discarded.

Add a counter to report these errors to user.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
Note : merge errors expected because of pending tg3 patch in net tree, I
can respin if needed.

 drivers/net/ethernet/broadcom/tg3.c |   23 +++++++++++------------
 drivers/net/ethernet/broadcom/tg3.h |    1 +
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index b89027c..3447585 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -6671,10 +6671,8 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		u32 tcp_opt_len, hdr_len;
 
 		if (skb_header_cloned(skb) &&
-		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
-			dev_kfree_skb(skb);
-			goto out_unlock;
-		}
+		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto drop;
 
 		iph = ip_hdr(skb);
 		tcp_opt_len = tcp_optlen(skb);
@@ -6746,10 +6744,9 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	len = skb_headlen(skb);
 
 	mapping = pci_map_single(tp->pdev, skb->data, len, PCI_DMA_TODEVICE);
-	if (pci_dma_mapping_error(tp->pdev, mapping)) {
-		dev_kfree_skb(skb);
-		goto out_unlock;
-	}
+	if (pci_dma_mapping_error(tp->pdev, mapping))
+		goto drop;
+
 
 	tnapi->tx_buffers[entry].skb = skb;
 	dma_unmap_addr_set(&tnapi->tx_buffers[entry], mapping, mapping);
@@ -6805,7 +6802,7 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		budget = tg3_tx_avail(tnapi);
 		if (tigon3_dma_hwbug_workaround(tnapi, skb, &entry, &budget,
 						base_flags, mss, vlan))
-			goto out_unlock;
+			goto drop_nofree;
 	}
 
 	skb_tx_timestamp(skb);
@@ -6827,15 +6824,16 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			netif_tx_wake_queue(txq);
 	}
 
-out_unlock:
 	mmiowb();
-
 	return NETDEV_TX_OK;
 
 dma_error:
 	tg3_tx_skb_unmap(tnapi, tnapi->tx_prod, i);
-	dev_kfree_skb(skb);
 	tnapi->tx_buffers[tnapi->tx_prod].skb = NULL;
+drop:
+	dev_kfree_skb(skb);
+drop_nofree:
+	tp->tx_dropped++;
 	return NETDEV_TX_OK;
 }
 
@@ -10009,6 +10007,7 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
 		get_stat64(&hw_stats->rx_discards);
 
 	stats->rx_dropped = tp->rx_dropped;
+	stats->tx_dropped = tp->tx_dropped;
 
 	return stats;
 }
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index d2976f3..f32f288 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2990,6 +2990,7 @@ struct tg3 {
 
 	/* begin "everything else" cacheline(s) section */
 	unsigned long			rx_dropped;
+	unsigned long			tx_dropped;
 	struct rtnl_link_stats64	net_stats_prev;
 	struct tg3_ethtool_stats	estats;
 	struct tg3_ethtool_stats	estats_prev;

^ permalink raw reply related

* Re: [PATCH v2 2/3] dp83640: use proper function to free transmit time stamping packets
From: Richard Cochran @ 2011-10-24 17:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, eric.dumazet, johannes, stable
In-Reply-To: <20111024.025555.265760947744724258.davem@davemloft.net>

On Mon, Oct 24, 2011 at 02:55:55AM -0400, David Miller wrote:
> From: Richard Cochran <richardcochran@gmail.com>
> Date: Fri, 21 Oct 2011 12:49:16 +0200
> 
> > The previous commit enforces a new rule for handling the cloned packets
> > for transmit time stamping. These packets must not be freed using any other
> > function than skb_complete_tx_timestamp. This commit fixes the one and only
> > driver using this API.
> > 
> > The driver first appeared in v3.0.
> > 
> > Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
> > Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
> > Cc: <stable@vger.kernel.org>
> 
> In the 'net' tree, which is where you should be targeting these dp83640
> driver patches, the code looks nothing like what you're patching against.
> 
> Please respin patches #2 and #3 against current sources.

Okay, but #2 will conflict with 

    dccaa9e0 dp83640: add time stamp insertion for sync messages

in net-next. Should I also submit a fix for that one?

Thanks,
Richard

^ permalink raw reply

* Re: [patch net-next V4] net: introduce ethernet teaming device
From: Michał Mirosław @ 2011-10-24 17:22 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, kaber, greearb, jesse, fbl, benjamin.poirier,
	jzupka
In-Reply-To: <1319444005-1281-1-git-send-email-jpirko@redhat.com>

2011/10/24 Jiri Pirko <jpirko@redhat.com>:
> This patch introduces a new network device called team. It is supposed to be a
> very fast, simple, userspace-driven alternative to the existing bonding
> driver.
[...]
>  drivers/net/team/team.c                   | 1573 +++++++++++++++++++++++++++++
>  drivers/net/team/team_mode_activebackup.c |  152 +++
>  drivers/net/team/team_mode_roundrobin.c   |  107 ++

I think this mode-modularity is overkill. One mode will compile to at
most a few hundred bytes of code+data, but will use at least 10 times
that to get loaded and tracked properly. How many more modes do you
anticipate being introduced, and how often? You could just keep the modular
design but drop the kernel module separation and maybe have modes
conditionally compiled (for those from the embedded world squeezing
every byte).

Best Regards,
Michał Mirosław

^ permalink raw reply

* Confidential/How are you
From: Barrister  Jacque Charles @ 2011-10-24 15:21 UTC (permalink / raw)





Dearest,


My name is Barrister Jacque Charles, a personal Attorney to a late client who died in car crash without a will.
For more information please contact via email: (jcchamber@rocketmail.com) upon your response, I shall then provide you with more details and relevant documents that will help you understand this transaction well.


Kindest Regards 
Barrister Jacque Charles,

^ permalink raw reply

* Re: [non-quoted-printable PATCH] Fix caif BUG() with network namespaces
From: Sjur Brændeland @ 2011-10-24 15:51 UTC (permalink / raw)
  To: Woodhouse, David; +Cc: davem@redhat.com, netdev@vger.kernel.org
In-Reply-To: <1319405079.13738.72.camel@shinybook.infradead.org>

Hi David,

> The caif code will register its own pernet_operations, and then register
> a netdevice_notifier. Each time the netdevice_notifier is triggered,
> it'll do some stuff... including a lookup of its own pernet stuff with
> net_generic().
>
> If the net_generic() call ever returns NULL, the caif code will BUG().
> That doesn't seem *so* unreasonable, I suppose — it does seem like it
> should never happen.
>
> However, it *does* happen. When we clone a network namespace,
> setup_net() runs through all the pernet_operations one at a time. It
> gets to loopback before it gets to caif. And loopback_net_init()
> registers a netdevice... while caif hasn't been initialised. So the caif
> netdevice notifier triggers, and immediately goes BUG().
>
> I'm not entirely sure how best to fix this in the general case. Perhaps
> the netdevice_notifier registration should be pernet too, rather than
> global? Or perhaps we should suppress the notifier calls during
> setup_net() and flush them at the end after everything has been
> initialised?
>
> But really, I'm inclined to just take the simple approach. Make
> caif_device_notify() *not* go looking for its pernet data structures if
> the device it's being notified about isn't a caif device in the first
> place. This simple patch is sufficient to avoid the problem, and is
> probably good enough.
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

Thank you for analyzing and fixing this David, this looks good to me.
Acked-by: Sjur Brændeland <sjur.brandeland@stericsson.com>

^ permalink raw reply

* Re: [PATCH V2 2/4] MIPS: Add board support for Loongson1B
From: Giuseppe CAVALLARO @ 2011-10-24 15:35 UTC (permalink / raw)
  To: Kelvin Cheung
  Cc: Wu Zhangjin, linux-mips, linux-kernel, ralf, r0bertz, netdev
In-Reply-To: <CAJhJPsXxUAuF9HdivLd66MQC45mz-iYAuF1SdGdU=-duxJJ5bQ@mail.gmail.com>

On 10/24/2011 4:05 PM, Kelvin Cheung wrote:
> 2011/10/24, Giuseppe CAVALLARO <peppe.cavallaro@st.com>:
>> Hello Kelvin.
>>
>> On 10/24/2011 12:36 PM, Kelvin Cheung wrote:
>>
>> [snip]
>>
>>> According to datasheet of Loongson 1B, the buffer size in RX/TX
>>> descriptor is only 2KB. So the Loongson1B's GMAC could not handle
>>> jumbo frames. And the second buffer is useless in this case. Am I
>>> right? Is there a better way than ifdef CONFIG_MACH_LOONGSON1 to
>>> avoid duplicate code?
>>
>> Sorry for my misunderstanding.
>>
>> I think you have to use the normal descriptor and remove the enh_desc
>> from the platform w/o modifying the driver at all.
>>
>> The driver will be able to select/configure all automatically (also jumbo).
>>
>> Let me know.
> 
> That's the problem.
> The bitfield definition of Loongson1B is also different from normal descriptor.

The problem is not in the Loongson1B gmac.

The normal descriptor fields in the stmmac refer to an old synopsys
databook.
New chips have the same structure you have added; so we should fix this
in the driver w/o breaking the compatibility for old chips.
I kindly ask you to confirm whether the current normal descriptor structure
(w/o your changes) doesn't work on your platform.
Did you test it?

> Moreover, I want to enable the TX checksum offload function which is
> not supported in normal descriptor.
> Any suggestions?

It is supported but you have to pass from the platform: tx_coe = 1.

Peppe
> 
>> Note:
>> IIRC, there is a slight difference in the case of normal descriptors for
>> Synopsys databook newer than the 1.91 (I used for testing this mode).
>> In any case, I remember that, on some platforms, the normal descriptors
>> have been used w/o problems also on these new chip generations.
>>
>> Peppe
>>
>>
> 
> 

^ permalink raw reply

* Re: [PATCH 07/10] RDMA/cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Vipul Pandya @ 2011-10-24 15:16 UTC (permalink / raw)
  To: David Miller
  Cc: swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW,
	roland-BHEL68pLQRGGvPXPguhicg, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, divy-ut6Up61K2wZBDgjK7y7TUQ,
	dm-ut6Up61K2wZBDgjK7y7TUQ, kumaras-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <20111020.165703.1713724038045504243.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>



On 21-10-2011 02:27, David Miller wrote:

> From: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
> Date: Thu, 20 Oct 2011 12:28:07 -0500
> 
>> On 10/20/2011 12:17 PM, Roland Dreier wrote:
>>>> I believe 5 and 7 have build dependencies.
>>> Right, missed that one too.
>>>
>>> But it seems 4,6,8,9,10 are independent of the rest of the series?
>>>
>>> ie I can trivially apply them and then worry about working out
>>> the drivers/net / drivers/infiniband interdependency a bit later?
>>>
>>
>> Some of these might be dependent on prior patches the series.  But if
>> they aren't, yes, you could do that.
> 
> So, how do you guys want to do this?  If you give me a list of which
> patches I should put into net-next and leave the rest to the infiniband
> tree, that'd work fine for me as long as net-next is left in a working
> state independent of the infiniband tree.


Hi Dave Miller/Roland,

With respect to above dependencies we did some experiments and found
following things

1. We can apply three cxgb4 patches, 01 02 and 03, on net-next tree
successfully and build it.

2. Out of 7 RDMA/cxgb4 patches only 04, 08 and 10 can be applied
trivially and driver can be built successfully. If we try to apply
remaining patches, 05 06 07 and 09, either they will fail to apply or
give build failure. Moreover patches 05, 06, 07 and 09 can be applied on
top of 04, 08 and 10 cleanly.

Based on above results we would like to propose following two things.

1. We would like to recommend that all the patches get included in
Roland's infiniband tree since it has build dependencies.

2. Alternatively,
- Patches 01, 02 and 03 can be included in net-next tree.
- Patches 04, 08 and 10 can be included in Roland's infiniband tree at
present.
- Patches 05, 06, 07 and 09 have to wait till the net-next hits the
3.2-rc1.

Please let us know if you have any other suggestions.

Thanks,
Vipul
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH V2 05/10] RDMA/cxgb4: Add DB Overflow Avoidance.
From: Vipul Pandya @ 2011-10-24 15:12 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW, Vipul Pandya

        - get FULL/EMPTY/DROP events from LLD

        - on FULL event, disable normal user mode DB rings.

        - add modify_qp semantics to allow user processes to call into
        the kernel to ring doorbells without overflowing.

        Add DB Full/Empty/Drop stats.

        Mark queues when created indicating the doorbell state.

        If we're in the middle of db overflow avoidance, then newly created
        queues should start out in this mode.

        Bump the C4IW_UVERBS_ABI_VERSION to 2 so the user mode library can
        know if the driver supports the kernel mode db ringing.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
V2: Bump C4IW_UVERBS_ABI_VERSION to 2

 drivers/infiniband/hw/cxgb4/device.c   |   84 +++++++++++++++++++++++++++++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |   37 ++++++++++++--
 drivers/infiniband/hw/cxgb4/qp.c       |   51 +++++++++++++++++++-
 drivers/infiniband/hw/cxgb4/user.h     |    2 +-
 4 files changed, 162 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 8483111..9062ed9 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -44,6 +44,12 @@ MODULE_DESCRIPTION("Chelsio T4 RDMA Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
+struct uld_ctx {
+	struct list_head entry;
+	struct cxgb4_lld_info lldi;
+	struct c4iw_dev *dev;
+};
+
 static LIST_HEAD(uld_ctx_list);
 static DEFINE_MUTEX(dev_mutex);
 
@@ -263,6 +269,9 @@ static int stats_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "  OCQPMEM: %10llu %10llu %10llu\n",
 			dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur,
 			dev->rdev.stats.ocqp.max);
+	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);
+	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
+	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop);
 	return 0;
 }
 
@@ -283,6 +292,9 @@ static ssize_t stats_clear(struct file *file, const char __user *buf,
 	dev->rdev.stats.pbl.max = 0;
 	dev->rdev.stats.rqt.max = 0;
 	dev->rdev.stats.ocqp.max = 0;
+	dev->rdev.stats.db_full = 0;
+	dev->rdev.stats.db_empty = 0;
+	dev->rdev.stats.db_drop = 0;
 	mutex_unlock(&dev->rdev.stats.lock);
 	return count;
 }
@@ -443,12 +455,6 @@ static void c4iw_rdev_close(struct c4iw_rdev *rdev)
 	c4iw_destroy_resource(&rdev->resource);
 }
 
-struct uld_ctx {
-	struct list_head entry;
-	struct cxgb4_lld_info lldi;
-	struct c4iw_dev *dev;
-};
-
 static void c4iw_dealloc(struct uld_ctx *ctx)
 {
 	c4iw_rdev_close(&ctx->dev->rdev);
@@ -514,6 +520,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
 	idr_init(&devp->mmidr);
 	spin_lock_init(&devp->lock);
 	mutex_init(&devp->rdev.stats.lock);
+	mutex_init(&devp->db_mutex);
 
 	if (c4iw_debugfs_root) {
 		devp->debugfs_root = debugfs_create_dir(
@@ -659,11 +666,76 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
 	return 0;
 }
 
+static int disable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_disable_wq_db(&qp->wq);
+	return 0;
+}
+
+static void stop_queues(struct uld_ctx *ctx)
+{
+	spin_lock_irq(&ctx->dev->lock);
+	ctx->dev->db_state = FLOW_CONTROL;
+	idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+	spin_unlock_irq(&ctx->dev->lock);
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_enable_wq_db(&qp->wq);
+	return 0;
+}
+
+static void resume_queues(struct uld_ctx *ctx)
+{
+	spin_lock_irq(&ctx->dev->lock);
+	ctx->dev->db_state = NORMAL;
+	idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
+	spin_unlock_irq(&ctx->dev->lock);
+}
+
+static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
+{
+	struct uld_ctx *ctx = handle;
+
+	switch (control) {
+	case CXGB4_CONTROL_DB_FULL:
+		stop_queues(ctx);
+		mutex_lock(&ctx->dev->rdev.stats.lock);
+		ctx->dev->rdev.stats.db_full++;
+		mutex_unlock(&ctx->dev->rdev.stats.lock);
+		break;
+	case CXGB4_CONTROL_DB_EMPTY:
+		resume_queues(ctx);
+		mutex_lock(&ctx->dev->rdev.stats.lock);
+		ctx->dev->rdev.stats.db_empty++;
+		mutex_unlock(&ctx->dev->rdev.stats.lock);
+		break;
+	case CXGB4_CONTROL_DB_DROP:
+		printk(KERN_WARNING MOD "%s: Fatal DB DROP\n",
+		       pci_name(ctx->lldi.pdev));
+		mutex_lock(&ctx->dev->rdev.stats.lock);
+		ctx->dev->rdev.stats.db_drop++;
+		mutex_unlock(&ctx->dev->rdev.stats.lock);
+		break;
+	default:
+		printk(KERN_WARNING MOD "%s: unknown control cmd %u\n",
+		       pci_name(ctx->lldi.pdev), control);
+		break;
+	}
+	return 0;
+}
+
 static struct cxgb4_uld_info c4iw_uld_info = {
 	.name = DRV_NAME,
 	.add = c4iw_uld_add,
 	.rx_handler = c4iw_uld_rx_handler,
 	.state_change = c4iw_uld_state_change,
+	.control = c4iw_uld_control,
 };
 
 static int __init c4iw_init_module(void)
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index ec7c848..1924c19 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -117,6 +117,9 @@ struct c4iw_stats {
 	struct c4iw_stat pbl;
 	struct c4iw_stat rqt;
 	struct c4iw_stat ocqp;
+	u64  db_full;
+	u64  db_empty;
+	u64  db_drop;
 };
 
 struct c4iw_rdev {
@@ -192,6 +195,12 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,
 	return wr_waitp->ret;
 }
 
+enum db_state {
+	NORMAL = 0,
+	FLOW_CONTROL = 1,
+	RECOVERY = 2
+};
+
 struct c4iw_dev {
 	struct ib_device ibdev;
 	struct c4iw_rdev rdev;
@@ -200,7 +209,9 @@ struct c4iw_dev {
 	struct idr qpidr;
 	struct idr mmidr;
 	spinlock_t lock;
+	struct mutex db_mutex;
 	struct dentry *debugfs_root;
+	enum db_state db_state;
 };
 
 static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
@@ -228,8 +239,8 @@ static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
 	return idr_find(&rhp->mmidr, mmid);
 }
 
-static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
-				void *handle, u32 id)
+static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+				 void *handle, u32 id, int lock)
 {
 	int ret;
 	int newid;
@@ -237,15 +248,29 @@ static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
 	do {
 		if (!idr_pre_get(idr, GFP_KERNEL))
 			return -ENOMEM;
-		spin_lock_irq(&rhp->lock);
+		if (lock)
+			spin_lock_irq(&rhp->lock);
 		ret = idr_get_new_above(idr, handle, id, &newid);
 		BUG_ON(newid != id);
-		spin_unlock_irq(&rhp->lock);
+		if (lock)
+			spin_unlock_irq(&rhp->lock);
 	} while (ret == -EAGAIN);
 
 	return ret;
 }
 
+static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+				void *handle, u32 id)
+{
+	return _insert_handle(rhp, idr, handle, id, 1);
+}
+
+static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
+				       void *handle, u32 id)
+{
+	return _insert_handle(rhp, idr, handle, id, 0);
+}
+
 static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
 {
 	spin_lock_irq(&rhp->lock);
@@ -369,6 +394,8 @@ struct c4iw_qp_attributes {
 	struct c4iw_ep *llp_stream_handle;
 	u8 layer_etype;
 	u8 ecode;
+	u16 sq_db_inc;
+	u16 rq_db_inc;
 };
 
 struct c4iw_qp {
@@ -443,6 +470,8 @@ static inline void insert_mmap(struct c4iw_ucontext *ucontext,
 
 enum c4iw_qp_attr_mask {
 	C4IW_QP_ATTR_NEXT_STATE = 1 << 0,
+	C4IW_QP_ATTR_SQ_DB = 1<<1,
+	C4IW_QP_ATTR_RQ_DB = 1<<2,
 	C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
 	C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
 	C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 74df98e..36fc94d 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -34,6 +34,10 @@
 
 #include "iw_cxgb4.h"
 
+static int db_delay_usecs = 1;
+module_param(db_delay_usecs, int, 0644);
+MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain");
+
 static int ocqp_support = 1;
 module_param(ocqp_support, int, 0644);
 MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)");
@@ -1117,6 +1121,29 @@ out:
 	return ret;
 }
 
+/*
+ * Called by the library when the qp has user dbs disabled due to
+ * a DB_FULL condition.  This function will single-thread all user
+ * DB rings to avoid overflowing the hw db-fifo.
+ */
+static int ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 inc)
+{
+	int delay = db_delay_usecs;
+
+	mutex_lock(&qhp->rhp->db_mutex);
+	do {
+		if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) < 768) {
+			writel(V_QID(qid) | V_PIDX(inc), qhp->wq.db);
+			break;
+		}
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(delay));
+		delay = min(delay << 1, 200000);
+	} while (1);
+	mutex_unlock(&qhp->rhp->db_mutex);
+	return 0;
+}
+
 int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 		   enum c4iw_qp_attr_mask mask,
 		   struct c4iw_qp_attributes *attrs,
@@ -1165,6 +1192,15 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 		qhp->attr = newattr;
 	}
 
+	if (mask & C4IW_QP_ATTR_SQ_DB) {
+		ret = ring_kernel_db(qhp, qhp->wq.sq.qid, attrs->sq_db_inc);
+		goto out;
+	}
+	if (mask & C4IW_QP_ATTR_RQ_DB) {
+		ret = ring_kernel_db(qhp, qhp->wq.rq.qid, attrs->rq_db_inc);
+		goto out;
+	}
+
 	if (!(mask & C4IW_QP_ATTR_NEXT_STATE))
 		goto out;
 	if (qhp->attr.state == attrs->next_state)
@@ -1454,7 +1490,11 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	init_waitqueue_head(&qhp->wait);
 	atomic_set(&qhp->refcnt, 1);
 
-	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+	spin_lock_irq(&rhp->lock);
+	if (rhp->db_state != NORMAL)
+		t4_disable_wq_db(&qhp->wq);
+	ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+	spin_unlock_irq(&rhp->lock);
 	if (ret)
 		goto err2;
 
@@ -1598,6 +1638,15 @@ int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			 C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
 			 C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0;
 
+	/*
+	 * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for
+	 * ringing the queue db when we're in DB_FULL mode.
+	 */
+	attrs.sq_db_inc = attr->sq_psn;
+	attrs.rq_db_inc = attr->rq_psn;
+	mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0;
+	mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0;
+
 	return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0);
 }
 
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
index e6669d5..32b754c 100644
--- a/drivers/infiniband/hw/cxgb4/user.h
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -32,7 +32,7 @@
 #ifndef __C4IW_USER_H__
 #define __C4IW_USER_H__
 
-#define C4IW_UVERBS_ABI_VERSION	1
+#define C4IW_UVERBS_ABI_VERSION	2
 
 /*
  * Make sure that all structs defined in this file remain laid out so
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] bnx2x: Adding FW 7.0.29.0
From: David Woodhouse @ 2011-10-24 14:33 UTC (permalink / raw)
  To: dmitry; +Cc: ben@decadent.org.uk, netdev@vger.kernel.org, Eilon Greenstein
In-Reply-To: <1319464676.6155.3.camel@lb-tlvb-dmitry>

[-- Attachment #1: Type: text/plain, Size: 1088 bytes --]

On Mon, 2011-10-24 at 15:57 +0200, Dmitry Kravkov wrote:
> On Mon, 2011-10-17 at 09:12 -0700, Dmitry Kravkov wrote:
> > On Mon, 2011-10-17 at 07:00 -0700, Dmitry Kravkov wrote:
> > > Includes fixes for the following issues:
> > >   1. (iSCSI) Arrival of un-solicited ASYNC message causes
> > >      firmware to abort the connection with RST.
> > >   2. (FCoE) There is a probability that truncated FCoE packet on
> > >      RX path won't get detected which might lead to FW assert.
> > >   3. (iSCSI) Arrival of target-initiated NOP-IN during intense
> > >      ISCSI traffic might lead to FW assert.
> > >   4. (iSCSI) Chip hangs when in case of retransmission not aligned
> > >      to 4-bytes from the beginning of iSCSI PDU.
> > >   5. (FCoE) Arrival of packets beyond task IO size can lead to crash.
> > > 
> 
> David, do you have an estimate for handling the request? We have a pending
> patch for net-next. Thanks.

Pushed to git.infradead.org; thanks. I'll work on getting the tree back
onto git.kernel.org now I'm sitting at a table with the admin...

-- 
dwmw2

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5818 bytes --]

^ permalink raw reply

* Re: [patch net-next V4] net: introduce ethernet teaming device
From: Paul E. McKenney @ 2011-10-24 14:11 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: Jiri Pirko, netdev, davem, eric.dumazet, bhutchings, shemminger,
	fubar, andy, tgraf, ebiederm, mirqus, kaber, greearb, jesse, fbl,
	jzupka, Dipankar Sarma
In-Reply-To: <20111024130918.GB24473@synalogic.ca>

On Mon, Oct 24, 2011 at 09:09:19AM -0400, Benjamin Poirier wrote:
> On 11/10/24 10:13, Jiri Pirko wrote:
> > This patch introduces new network device called team. It supposes to be
> > very fast, simple, userspace-driven alternative to existing bonding
> > driver.
> > 
> > Userspace library called libteam with couple of demo apps is available
> > here:
> > https://github.com/jpirko/libteam
> > Note it's still in its diapers atm.
> > 
> > team<->libteam use generic netlink for communication. That and rtnl
> > suppose to be the only way to configure team device, no sysfs etc.
> > 
> > Python binding basis for libteam was recently introduced (some work
> > still needs to be done on it though). Daemon providing arpmon/miimon
> > active-backup functionality will be introduced shortly.
> > All what's necessary is already implemented in kernel team driver.
> > 
> > Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> > 
> > v3->v4:
> > 	- remove redundant synchronize_rcu from __team_change_mode()
> > 	- revert "set and clear of mode_ops happens per pointer, not per
> > 	  byte"
> > 	- extend comment of function __team_change_mode()
> > 
> > v2->v3:
> > 	- team_change_mtu() user rcu version of list traversal to unwind
> > 	- set and clear of mode_ops happens per pointer, not per byte
> > 	- port hashlist changed to be embedded into team structure
> > 	- error branch in team_port_enter() does cleanup now
> > 	- fixed rtln->rtnl
> > 
> > v1->v2:
> > 	- modes are made as modules. Makes team more modular and
> > 	  extendable.
> > 	- several commenters' nitpicks found on v1 were fixed
> > 	- several other bugs were fixed.
> > 	- note I ignored Eric's comment about roundrobin port selector
> > 	  as Eric's way may be easily implemented as another mode (mode
> > 	  "random") in future.
> > ---
> >  Documentation/networking/team.txt         |    2 +
> >  MAINTAINERS                               |    7 +
> >  drivers/net/Kconfig                       |    2 +
> >  drivers/net/Makefile                      |    1 +
> >  drivers/net/team/Kconfig                  |   38 +
> >  drivers/net/team/Makefile                 |    7 +
> >  drivers/net/team/team.c                   | 1573 +++++++++++++++++++++++++++++
> >  drivers/net/team/team_mode_activebackup.c |  152 +++
> >  drivers/net/team/team_mode_roundrobin.c   |  107 ++
> >  include/linux/Kbuild                      |    1 +
> >  include/linux/if.h                        |    1 +
> >  include/linux/if_team.h                   |  231 +++++
> >  include/linux/rculist.h                   |   14 +
> 
> I think you're missing some CC's for the modifications to this file.
> I've taken the liberty of adding Dipankar and Paul to the discussion.

Thank you, and please see below.

> >  13 files changed, 2136 insertions(+), 0 deletions(-)
> >  create mode 100644 Documentation/networking/team.txt
> >  create mode 100644 drivers/net/team/Kconfig
> >  create mode 100644 drivers/net/team/Makefile
> >  create mode 100644 drivers/net/team/team.c
> >  create mode 100644 drivers/net/team/team_mode_activebackup.c
> >  create mode 100644 drivers/net/team/team_mode_roundrobin.c
> >  create mode 100644 include/linux/if_team.h
> > 
> 
> [...]
> 
> > diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> > new file mode 100644
> > index 0000000..acfef4c
> > --- /dev/null
> > +++ b/drivers/net/team/team.c
> > +
> [...]
> > +static int team_change_mtu(struct net_device *dev, int new_mtu)
> > +{
> > +	struct team *team = netdev_priv(dev);
> > +	struct team_port *port;
> > +	int err;
> > +
> > +	rcu_read_lock();
> > +	list_for_each_entry_rcu(port, &team->port_list, list) {
> > +		err = dev_set_mtu(port->dev, new_mtu);
> > +		if (err) {
> > +			netdev_err(dev, "Device %s failed to change mtu",
> > +				   port->dev->name);
> > +			goto unwind;
> > +		}
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	dev->mtu = new_mtu;
> > +
> > +	return 0;
> > +
> > +unwind:
> > +	list_for_each_entry_continue_reverse_rcu(port, &team->port_list, list)
> > +		dev_set_mtu(port->dev, dev->mtu);
> > +
> > +	rcu_read_unlock();
> > +	return err;
> > +}
> > +
> > +
> 
> [...]
> 
> > diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> > index d079290..7586b2c 100644
> > --- a/include/linux/rculist.h
> > +++ b/include/linux/rculist.h
> > @@ -288,6 +288,20 @@ static inline void list_splice_init_rcu(struct list_head *list,
> >  	     pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
> >  
> >  /**
> > + * list_for_each_entry_continue_reverse_rcu - iterate backwards from the given point
> > + * @pos:	the type * to use as a loop cursor.
> > + * @head:	the head for your list.
> > + * @member:	the name of the list_struct within the struct.
> > + *
> > + * Start to iterate over list of given type backwards, continuing after
> > + * the current position.
> > + */
> > +#define list_for_each_entry_continue_reverse_rcu(pos, head, member)	\
> > +	for (pos = list_entry_rcu(pos->member.prev, typeof(*pos), member); \
> > +	     &pos->member != (head);	\
> > +	     pos = list_entry_rcu(pos->member.prev, typeof(*pos), member))
> > +
> 
> rcu lists can be modified while they are traversed with *_rcu()
> primitives. This benefit comes with the constraint that they may only be
> traversed forwards. This is implicit in the choice of *_rcu()
> list-traversal primitives: they only go forwards.
> 
> You suggest to add a backwards rcu list-traversal primitive. But
> consider what happens in this sequence:
> 
> CPU0					CPU1
> list_for_each_entry_continue_reverse_rcu(...)
> pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
> 					list_del_rcu(&pos->member)
> 					{ (&pos->member)->prev = LIST_POISON2 }
> pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
>     = container_of(LIST_POISON2, typeof(*pos), member)
> do_something(*pos)
>     BAM!
> 
> Going back to the problem you're trying to solve in team_change_mtu(),
> I think you could either:
> 1) take team->lock instead of rcu_read_lock() throughout this particular
> function
> 2) save each deleted element in a separate list on the side in case it's
> necessary to roll back
> 3) remove the rcu double locking, rely on rtnl and add some
> ASSERT_RTNL() if desired. You've said that you don't want to rely on
> rtnl and you want to use separate locking but I fail to see what
> advantage that brings to balance out the extra complexity in code and
> execution? Please clarify this.

Indeed -- the list_for_each_entry_continue_reverse_rcu() implementation
above would only work if elements were never deleted from the list.
But people would miss that fact, resulting in list-poison oopses.
Furthermore, even if you avoid the poisoning, there is no guarantee
that you will see the same objects in reverse that you saw going
forward because some might have been added or deleted in the meantime.

So please take some other approach.  For example, if the list has a
fixed upper bound, perhaps just keeping track of what elements you
visited would be workable.

						Thanx, Paul

> Thanks,
> -Ben
> 
> > +/**
> >   * hlist_del_rcu - deletes entry from hash list without re-initialization
> >   * @n: the element to delete from the hash list.
> >   *
> > -- 
> > 1.7.6
> > 
> 

^ permalink raw reply

* Re: [Xen-devel] Re: [PATCH 5/6] xen/netback: Enable netback on HVM guests
From: Konrad Rzeszutek Wilk @ 2011-10-24 14:19 UTC (permalink / raw)
  To: David Miller; +Cc: Ian.Campbell, netdev, dgdegra, xen-devel, david.vrabel
In-Reply-To: <20111024.053419.1995560587557035685.davem@davemloft.net>

On Mon, Oct 24, 2011 at 05:34:19AM -0400, David Miller wrote:
> From: Ian Campbell <Ian.Campbell@citrix.com>
> Date: Mon, 24 Oct 2011 10:31:08 +0100
> 
> > On Thu, 2011-10-20 at 16:35 +0100, Daniel De Graaf wrote:
> >> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
> > 
> > Acked-by: Ian Campbell <ian.campbell@citrix.com>
> > 
> > Normally netback patches would go in via the networking subsystem
> > maintainer's tree but since this depends on core Xen patches from this
> > series and is unlikely to conflict with anything in the net-next tree I
> > suspect it would make more sense for Konrad to take this one.
> > 
> > David (Miller) does that work for you?
> 
> Yes, it does.

OK, Can I stick Acked-by: David Miller on that patch?

Thank you.

^ permalink raw reply

* Re: bridge: HSR support
From: Arvid Brodin @ 2011-10-24 14:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <4E94D67A.9060207@enea.com>

Stephen Hemminger wrote:
> On Tue, 11 Oct 2011 20:25:08 +0200
> Arvid Brodin <arvid.brodin@enea.com> wrote:
> 
>> Hi,
>>
>> I want to add support for HSR ("High-availability Seamless Redundancy",
>> IEC-62439-3) to the bridge code. With HSR, all connected units have two network
>> ports and are connected in a ring. All new Ethernet packets are sent on both
>> ports (or passed through if the current unit is not the originating unit). The
>> same packet is never passed twice. Non-HSR units are not allowed in the ring.
>>
>> This gives instant, reconfiguration-free failover.
>>
>> I'd like your input on how to design the user interface. To me it seems natural
>> to use bridge-utils, which of course today supports STP.
>>
>> One solution is to simply add an "hsr" command:
>>
>> # brctl hsr <bridge> on|off
>>
>> But HSR is mutually exclusive to other modes, and I think that STP and standard
>> bridge mode are mutually exclusive, too? Perhaps it would be better (more user-
>> friendly) to 
>>
>> # brctl type <bridge> standard|stp|hsr
>>
>> ?
>>
>> 'brctl stp <bridge> on|off' would have to be kept for compatibility, but could
>> be a simple wrapper for 'brctl type <bridge> stp|standard'
>>
>> What do you think about this?
>>
> 
> Why is it a bridge thing and not a standalone or bonding (or the new team
> device) feature? Wouldn't users want to use it without all the stuff
> related to bridging. The fact that it doesn't work with STP is a big
> red flag that it doesn't belong in the bridge.

Ok, having read up some more on this it looks like STP is a standardised part of
bridging, so I guess you're right. 


I need to do two things:

1) Bind two network interfaces into one (say, eth0 & eth1 => hsr0). Frames sent on
   hsr0 should get an HSR tag (including the correct EtherType) and go out on both
   eth0 and eth1.

2) Ingress frames on eth0 & eth1, with EtherType 0x88fb, should be captured and 
   handled specially (either received on hsr0 or forwarded to the other bound 
   physical interface).

Any ideas on the best way to implement this -- what's the nicest place to "hook
into" for this?


-- 
Arvid Brodin
Enea Services Stockholm AB

^ permalink raw reply

* Re: [PATCH V2 2/4] MIPS: Add board support for Loongson1B
From: Kelvin Cheung @ 2011-10-24 14:05 UTC (permalink / raw)
  To: Giuseppe CAVALLARO
  Cc: Wu Zhangjin, linux-mips, linux-kernel, ralf, r0bertz, netdev
In-Reply-To: <4EA557B2.4020504@st.com>

2011/10/24, Giuseppe CAVALLARO <peppe.cavallaro@st.com>:
> Hello Kelvin.
>
> On 10/24/2011 12:36 PM, Kelvin Cheung wrote:
>
> [snip]
>
>> According to datasheet of Loongson 1B, the buffer size in RX/TX
>> descriptor is only 2KB. So the Loongson1B's GMAC could not handle
>> jumbo frames. And the second buffer is useless in this case. Am I
>> right? Is there a better way than ifdef CONFIG_MACH_LOONGSON1 to
>> avoid duplicate code?
>
> Sorry for my misunderstanding.
>
> I think you have to use the normal descriptor and remove the enh_desc
> from the platform w/o modifying the driver at all.
>
> The driver will be able to select/configure all automatically (also jumbo).
>
> Let me know.

That's the problem.
The bitfield definition of Loongson1B is also different from normal descriptor.

Moreover, I want to enable the TX checksum offload function which is
not supported in normal descriptor.

Any suggestions?

> Note:
> IIRC, there is a bit difference in case of normal descriptors for
> Synopsys databook newer than the 1.91 (I used for testing this mode).
> In any case, I remember that, on some platforms, the normal descriptors
> have been used w/o problems also on these new chip generations.
>
> Peppe
>
>


-- 
Best Regards!
Kelvin

^ permalink raw reply

* Re: [PATCH] bnx2x: Adding FW 7.0.29.0
From: Dmitry Kravkov @ 2011-10-24 13:57 UTC (permalink / raw)
  To: dwmw2@infradead.org
  Cc: ben@decadent.org.uk, netdev@vger.kernel.org, Eilon Greenstein
In-Reply-To: <1318867968.5817.1.camel@lb-tlvb-dmitry>


On Mon, 2011-10-17 at 09:12 -0700, Dmitry Kravkov wrote:
> On Mon, 2011-10-17 at 07:00 -0700, Dmitry Kravkov wrote:
> > Includes fixes for the following issues:
> >   1. (iSCSI) Arrival of un-solicited ASYNC message causes
> >      firmware to abort the connection with RST.
> >   2. (FCoE) There is a probability that truncated FCoE packet on
> >      RX path won't get detected which might lead to FW assert.
> >   3. (iSCSI) Arrival of target-initiated NOP-IN during intense
> >      ISCSI traffic might lead to FW assert.
> >   4. (iSCSI) Chip hangs when in case of retransmission not aligned
> >      to 4-bytes from the beginning of iSCSI PDU.
> >   5. (FCoE) Arrival of packets beyond task IO size can lead to crash.
> > 

David, do you have an estimate for handling the request? We have a pending
patch for net-next. Thanks.

^ permalink raw reply

* Re: [patch net-next V4] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-10-24 13:50 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, mirqus, kaber, greearb, jesse, fbl, jzupka,
	Dipankar Sarma, Paul E. McKenney
In-Reply-To: <20111024130918.GB24473@synalogic.ca>

Mon, Oct 24, 2011 at 03:09:19PM CEST, benjamin.poirier@gmail.com wrote:
>On 11/10/24 10:13, Jiri Pirko wrote:
>> This patch introduces new network device called team. It supposes to be
>> very fast, simple, userspace-driven alternative to existing bonding
>> driver.
>> 
>> Userspace library called libteam with couple of demo apps is available
>> here:
>> https://github.com/jpirko/libteam
>> Note it's still in its diapers atm.
>> 
>> team<->libteam use generic netlink for communication. That and rtnl
>> suppose to be the only way to configure team device, no sysfs etc.
>> 
>> Python binding basis for libteam was recently introduced (some work
>> still needs to be done on it though). Daemon providing arpmon/miimon
>> active-backup functionality will be introduced shortly.
>> All what's necessary is already implemented in kernel team driver.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> 
>> v3->v4:
>> 	- remove redundant synchronize_rcu from __team_change_mode()
>> 	- revert "set and clear of mode_ops happens per pointer, not per
>> 	  byte"
>> 	- extend comment of function __team_change_mode()
>> 
>> v2->v3:
>> 	- team_change_mtu() user rcu version of list traversal to unwind
>> 	- set and clear of mode_ops happens per pointer, not per byte
>> 	- port hashlist changed to be embedded into team structure
>> 	- error branch in team_port_enter() does cleanup now
>> 	- fixed rtln->rtnl
>> 
>> v1->v2:
>> 	- modes are made as modules. Makes team more modular and
>> 	  extendable.
>> 	- several commenters' nitpicks found on v1 were fixed
>> 	- several other bugs were fixed.
>> 	- note I ignored Eric's comment about roundrobin port selector
>> 	  as Eric's way may be easily implemented as another mode (mode
>> 	  "random") in future.
>> ---
>>  Documentation/networking/team.txt         |    2 +
>>  MAINTAINERS                               |    7 +
>>  drivers/net/Kconfig                       |    2 +
>>  drivers/net/Makefile                      |    1 +
>>  drivers/net/team/Kconfig                  |   38 +
>>  drivers/net/team/Makefile                 |    7 +
>>  drivers/net/team/team.c                   | 1573 +++++++++++++++++++++++++++++
>>  drivers/net/team/team_mode_activebackup.c |  152 +++
>>  drivers/net/team/team_mode_roundrobin.c   |  107 ++
>>  include/linux/Kbuild                      |    1 +
>>  include/linux/if.h                        |    1 +
>>  include/linux/if_team.h                   |  231 +++++
>>  include/linux/rculist.h                   |   14 +
>
>I think you're missing some CC's for the modifications to this file.
>I've taken the liberty of adding Dipankar and Paul to the discussion.
>
>>  13 files changed, 2136 insertions(+), 0 deletions(-)
>>  create mode 100644 Documentation/networking/team.txt
>>  create mode 100644 drivers/net/team/Kconfig
>>  create mode 100644 drivers/net/team/Makefile
>>  create mode 100644 drivers/net/team/team.c
>>  create mode 100644 drivers/net/team/team_mode_activebackup.c
>>  create mode 100644 drivers/net/team/team_mode_roundrobin.c
>>  create mode 100644 include/linux/if_team.h
>> 
>
>[...]
>
>> diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
>> new file mode 100644
>> index 0000000..acfef4c
>> --- /dev/null
>> +++ b/drivers/net/team/team.c
>> +
>[...]
>> +static int team_change_mtu(struct net_device *dev, int new_mtu)
>> +{
>> +	struct team *team = netdev_priv(dev);
>> +	struct team_port *port;
>> +	int err;
>> +
>> +	rcu_read_lock();
>> +	list_for_each_entry_rcu(port, &team->port_list, list) {
>> +		err = dev_set_mtu(port->dev, new_mtu);
>> +		if (err) {
>> +			netdev_err(dev, "Device %s failed to change mtu",
>> +				   port->dev->name);
>> +			goto unwind;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	dev->mtu = new_mtu;
>> +
>> +	return 0;
>> +
>> +unwind:
>> +	list_for_each_entry_continue_reverse_rcu(port, &team->port_list, list)
>> +		dev_set_mtu(port->dev, dev->mtu);
>> +
>> +	rcu_read_unlock();
>> +	return err;
>> +}
>> +
>> +
>
>[...]
>
>> diff --git a/include/linux/rculist.h b/include/linux/rculist.h
>> index d079290..7586b2c 100644
>> --- a/include/linux/rculist.h
>> +++ b/include/linux/rculist.h
>> @@ -288,6 +288,20 @@ static inline void list_splice_init_rcu(struct list_head *list,
>>  	     pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
>>  
>>  /**
>> + * list_for_each_entry_continue_reverse_rcu - iterate backwards from the given point
>> + * @pos:	the type * to use as a loop cursor.
>> + * @head:	the head for your list.
>> + * @member:	the name of the list_struct within the struct.
>> + *
>> + * Start to iterate over list of given type backwards, continuing after
>> + * the current position.
>> + */
>> +#define list_for_each_entry_continue_reverse_rcu(pos, head, member)	\
>> +	for (pos = list_entry_rcu(pos->member.prev, typeof(*pos), member); \
>> +	     &pos->member != (head);	\
>> +	     pos = list_entry_rcu(pos->member.prev, typeof(*pos), member))
>> +
>
>rcu lists can be modified while they are traversed with *_rcu()
>primitives. This benefit comes with the constraint that they may only be
>traversed forwards. This is implicit in the choice of *_rcu()
>list-traversal primitives: they only go forwards.
>
>You suggest to add a backwards rcu list-traversal primitive. But
>consider what happens in this sequence:
>
>CPU0					CPU1
>list_for_each_entry_continue_reverse_rcu(...)
>pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
>					list_del_rcu(&pos->member)
>					{ (&pos->member)->prev = LIST_POISON2 }
>pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
>    = container_of(LIST_POISON2, typeof(*pos), member)
>do_something(*pos)
>    BAM!

Doh, you are right. :(

>
>Going back to the problem you're trying to solve in team_change_mtu(),
>I think you could either:
>1) take team->lock instead of rcu_read_lock() throughout this particular
>function
>2) save each deleted element in a separate list on the side in case it's
>necessary to roll back

I would go with this one.

>3) remove the rcu double locking, rely on rtnl and add some
>ASSERT_RTNL() if desired. You've said that you don't want to rely on
>rtnl and you want to use separate locking but I fail to see what
>advantage that brings to balance out the extra complexity in code and
>execution? Please clarify this.

I do not want to use rtnl. For example, in the genetlink code rtnl is not
held, so I need to depend on my own lock.


>
>Thanks,
>-Ben
>
>> +/**
>>   * hlist_del_rcu - deletes entry from hash list without re-initialization
>>   * @n: the element to delete from the hash list.
>>   *
>> -- 
>> 1.7.6
>> 

^ permalink raw reply

* Re: [patch net-next V4] net: introduce ethernet teaming device
From: Benjamin Poirier @ 2011-10-24 13:09 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, mirqus, kaber, greearb, jesse, fbl, jzupka,
	Dipankar Sarma, Paul E. McKenney
In-Reply-To: <1319444005-1281-1-git-send-email-jpirko@redhat.com>

On 11/10/24 10:13, Jiri Pirko wrote:
> This patch introduces new network device called team. It supposes to be
> very fast, simple, userspace-driven alternative to existing bonding
> driver.
> 
> Userspace library called libteam with couple of demo apps is available
> here:
> https://github.com/jpirko/libteam
> Note it's still in its diapers atm.
> 
> team<->libteam use generic netlink for communication. That and rtnl are
> supposed to be the only ways to configure a team device, no sysfs etc.
> 
> Python binding basis for libteam was recently introduced (some work
> still needs to be done on it though). A daemon providing arpmon/miimon
> active-backup functionality will be introduced shortly.
> Everything that is necessary is already implemented in the kernel team driver.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> v3->v4:
> 	- remove redundant synchronize_rcu from __team_change_mode()
> 	- revert "set and clear of mode_ops happens per pointer, not per
> 	  byte"
> 	- extend comment of function __team_change_mode()
> 
> v2->v3:
> 	- team_change_mtu() uses rcu version of list traversal to unwind
> 	- set and clear of mode_ops happens per pointer, not per byte
> 	- port hashlist changed to be embedded into team structure
> 	- error branch in team_port_enter() does cleanup now
> 	- fixed rtln->rtnl
> 
> v1->v2:
> 	- modes are made as modules. Makes team more modular and
> 	  extendable.
> 	- several commenters' nitpicks found on v1 were fixed
> 	- several other bugs were fixed.
> 	- note I ignored Eric's comment about roundrobin port selector
> 	  as Eric's way may be easily implemented as another mode (mode
> 	  "random") in future.
> ---
>  Documentation/networking/team.txt         |    2 +
>  MAINTAINERS                               |    7 +
>  drivers/net/Kconfig                       |    2 +
>  drivers/net/Makefile                      |    1 +
>  drivers/net/team/Kconfig                  |   38 +
>  drivers/net/team/Makefile                 |    7 +
>  drivers/net/team/team.c                   | 1573 +++++++++++++++++++++++++++++
>  drivers/net/team/team_mode_activebackup.c |  152 +++
>  drivers/net/team/team_mode_roundrobin.c   |  107 ++
>  include/linux/Kbuild                      |    1 +
>  include/linux/if.h                        |    1 +
>  include/linux/if_team.h                   |  231 +++++
>  include/linux/rculist.h                   |   14 +

I think you're missing some CC's for the modifications to this file.
I've taken the liberty of adding Dipankar and Paul to the discussion.

>  13 files changed, 2136 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/networking/team.txt
>  create mode 100644 drivers/net/team/Kconfig
>  create mode 100644 drivers/net/team/Makefile
>  create mode 100644 drivers/net/team/team.c
>  create mode 100644 drivers/net/team/team_mode_activebackup.c
>  create mode 100644 drivers/net/team/team_mode_roundrobin.c
>  create mode 100644 include/linux/if_team.h
> 

[...]

> diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> new file mode 100644
> index 0000000..acfef4c
> --- /dev/null
> +++ b/drivers/net/team/team.c
> +
[...]
> +static int team_change_mtu(struct net_device *dev, int new_mtu)
> +{
> +	struct team *team = netdev_priv(dev);
> +	struct team_port *port;
> +	int err;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(port, &team->port_list, list) {
> +		err = dev_set_mtu(port->dev, new_mtu);
> +		if (err) {
> +			netdev_err(dev, "Device %s failed to change mtu",
> +				   port->dev->name);
> +			goto unwind;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	dev->mtu = new_mtu;
> +
> +	return 0;
> +
> +unwind:
> +	list_for_each_entry_continue_reverse_rcu(port, &team->port_list, list)
> +		dev_set_mtu(port->dev, dev->mtu);
> +
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +

[...]

> diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> index d079290..7586b2c 100644
> --- a/include/linux/rculist.h
> +++ b/include/linux/rculist.h
> @@ -288,6 +288,20 @@ static inline void list_splice_init_rcu(struct list_head *list,
>  	     pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
>  
>  /**
> + * list_for_each_entry_continue_reverse_rcu - iterate backwards from the given point
> + * @pos:	the type * to use as a loop cursor.
> + * @head:	the head for your list.
> + * @member:	the name of the list_struct within the struct.
> + *
> + * Start to iterate over list of given type backwards, continuing after
> + * the current position.
> + */
> +#define list_for_each_entry_continue_reverse_rcu(pos, head, member)	\
> +	for (pos = list_entry_rcu(pos->member.prev, typeof(*pos), member); \
> +	     &pos->member != (head);	\
> +	     pos = list_entry_rcu(pos->member.prev, typeof(*pos), member))
> +

rcu lists can be modified while they are traversed with *_rcu()
primitives. This benefit comes with the constraint that they may only be
traversed forwards. This is implicit in the choice of *_rcu()
list-traversal primitives: they only go forwards.

You suggest to add a backwards rcu list-traversal primitive. But
consider what happens in this sequence:

CPU0					CPU1
list_for_each_entry_continue_reverse_rcu(...)
pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
					list_del_rcu(&pos->member)
					{ (&pos->member)->prev = LIST_POISON2 }
pos = list_entry_rcu(pos->member.prev, typeof(*pos), member)
    = container_of(LIST_POISON2, typeof(*pos), member)
do_something(*pos)
    BAM!

Going back to the problem you're trying to solve in team_change_mtu(),
I think you could either:
1) take team->lock instead of rcu_read_lock() throughout this particular
function
2) save each deleted element in a separate list on the side in case it's
necessary to roll back
3) remove the rcu double locking, rely on rtnl and add some
ASSERT_RTNL() if desired. You've said that you don't want to rely on
rtnl and you want to use separate locking but I fail to see what
advantage that brings to balance out the extra complexity in code and
execution? Please clarify this.

Thanks,
-Ben

> +/**
>   * hlist_del_rcu - deletes entry from hash list without re-initialization
>   * @n: the element to delete from the hash list.
>   *
> -- 
> 1.7.6
> 

^ permalink raw reply

* Re: [RFD] Network configuration data in sysfs
From: Kay Sievers @ 2011-10-24 12:46 UTC (permalink / raw)
  To: David Miller
  Cc: kirill, netdev, kuznet, jmorris, yoshfuji, kaber, gregkh,
	gladkov.alexey
In-Reply-To: <20111024.005900.1091819103500072631.davem@davemloft.net>

On Mon, Oct 24, 2011 at 06:59, David Miller <davem@davemloft.net> wrote:
> From: "Kirill A. Shutemov" <kirill@shutemov.name>
> Date: Mon, 24 Oct 2011 07:24:00 +0300
>
>> On Sun, Oct 23, 2011 at 11:24:16PM -0400, David Miller wrote:
>>> From: "Kirill A. Shutemov" <kirill@shutemov.name>
>>> Date: Mon, 24 Oct 2011 04:34:07 +0300
>>>
>>> You can use netlink to perform any configuration change you want, or
>>> to view any network configuration setting.
>>
>> You need /sbin/ip or similar tool to do this, right?
>
> I'm talking about udev using netlink natively.

Kirill, what exactly is the use case? And what does udev support
mean in that context?

I doubt that "not having /sbin/ip installed" should be a reason to add
and expose complex interfaces in /sys, while we already have a
perfectly working native way to do it.

Kay

^ permalink raw reply

* [PATCH net-next 4/4] be2net: don't create multiple RX/TX rings in multi channel mode
From: Sathya Perla @ 2011-10-24 12:45 UTC (permalink / raw)
  To: netdev; +Cc: Suresh Reddy
In-Reply-To: <1319460303-25560-1-git-send-email-sathya.perla@emulex.com>

When the HW is in multi-channel mode based on the skew/IPL, there are 4 functions per port and so not enough resources to create multiple RX/TX rings for each function.

Signed-off-by: Suresh Reddy <suresh.reddy@emulex.com>
Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_cmds.h |    6 ++++++
 drivers/net/ethernet/emulex/benet/be_main.c |   16 ++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
index 75b7574..a35cd03 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -1046,6 +1046,12 @@ struct be_cmd_resp_modify_eq_delay {
 
 /******************** Get FW Config *******************/
 #define BE_FUNCTION_CAPS_RSS			0x2
+/* The HW can come up in either of the following multi-channel modes
+ * based on the skew/IPL.
+ */
+#define FLEX10_MODE				0x400
+#define VNIC_MODE				0x20000
+#define UMC_ENABLED				0x1000000
 struct be_cmd_req_query_fw_cfg {
 	struct be_cmd_req_hdr hdr;
 	u32 rsvd[31];
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 77ce24b..91fe12a 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -114,6 +114,13 @@ static const char * const ue_status_hi_desc[] = {
 	"Unknown"
 };
 
+/* Is BE in a multi-channel mode */
+static inline bool be_is_mc(struct be_adapter *adapter) {
+	return (adapter->function_mode & FLEX10_MODE ||
+		adapter->function_mode & VNIC_MODE ||
+		adapter->function_mode & UMC_ENABLED);
+}
+
 static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q)
 {
 	struct be_dma_mem *mem = &q->dma_mem;
@@ -1289,7 +1296,7 @@ static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo)
 	if (rxcp->vlanf) {
 		/* vlanf could be wrongly set in some cards.
 		 * ignore if vtm is not set */
-		if ((adapter->function_mode & 0x400) && !rxcp->vtm)
+		if ((adapter->function_mode & FLEX10_MODE) && !rxcp->vtm)
 			rxcp->vlanf = 0;
 
 		if (!lancer_chip(adapter))
@@ -1636,7 +1643,7 @@ static void be_tx_queues_destroy(struct be_adapter *adapter)
 static int be_num_txqs_want(struct be_adapter *adapter)
 {
 	if ((num_vfs && adapter->sriov_enabled) ||
-		(adapter->function_mode & 0x400) ||
+		be_is_mc(adapter) ||
 		lancer_chip(adapter) || !be_physfn(adapter) ||
 		adapter->generation == BE_GEN2)
 		return 1;
@@ -1718,7 +1725,8 @@ static void be_rx_queues_destroy(struct be_adapter *adapter)
 static u32 be_num_rxqs_want(struct be_adapter *adapter)
 {
 	if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) &&
-		!adapter->sriov_enabled && !(adapter->function_mode & 0x400)) {
+		!adapter->sriov_enabled && be_physfn(adapter) &&
+		!be_is_mc(adapter)) {
 		return 1 + MAX_RSS_QS; /* one default non-RSS queue */
 	} else {
 		dev_warn(&adapter->pdev->dev,
@@ -3187,7 +3195,7 @@ static int be_get_config(struct be_adapter *adapter)
 	if (status)
 		return status;
 
-	if (adapter->function_mode & 0x400)
+	if (adapter->function_mode & FLEX10_MODE)
 		adapter->max_vlans = BE_NUM_VLANS_SUPPORTED/4;
 	else
 		adapter->max_vlans = BE_NUM_VLANS_SUPPORTED;
-- 
1.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox