Netdev List
 help / color / mirror / Atom feed
* [net-next PATCH 3/3] igb: set vf rlpml wasn't taking vlan tag into account
From: Jeff Kirsher @ 2009-09-04  0:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20090904004828.14537.72187.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch updates things so that vlan tags are taken into account when
setting the receive large packet maximum length.  This allows the VF driver
to correctly receive full sized frames when vlans are enabled.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb.h      |    1 +
 drivers/net/igb/igb_main.c |   39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index b2c98de..7126fea 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -70,6 +70,7 @@ struct vf_data_storage {
 	unsigned char vf_mac_addresses[ETH_ALEN];
 	u16 vf_mc_hashes[IGB_MAX_VF_MC_ENTRIES];
 	u16 num_vf_mc_hashes;
+	u16 vlans_enabled;
 	bool clear_to_send;
 };
 
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 1d03fcb..90b0b1b 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -154,6 +154,12 @@ static inline int igb_set_vf_rlpml(struct igb_adapter *adapter, int size,
 	struct e1000_hw *hw = &adapter->hw;
 	u32 vmolr;
 
+	/* if it isn't the PF check to see if VFs are enabled and
+	 * increase the size to support vlan tags */
+	if (vfn < adapter->vfs_allocated_count &&
+	    adapter->vf_data[vfn].vlans_enabled)
+		size += VLAN_TAG_SIZE;
+
 	vmolr = rd32(E1000_VMOLR(vfn));
 	vmolr &= ~E1000_VMOLR_RLPML_MASK;
 	vmolr |= size | E1000_VMOLR_LPE;
@@ -4006,6 +4012,8 @@ static void igb_clear_vf_vfta(struct igb_adapter *adapter, u32 vf)
 
 		wr32(E1000_VLVF(i), reg);
 	}
+
+	adapter->vf_data[vf].vlans_enabled = 0;
 }
 
 static s32 igb_vlvf_set(struct igb_adapter *adapter, u32 vid, bool add, u32 vf)
@@ -4054,6 +4062,22 @@ static s32 igb_vlvf_set(struct igb_adapter *adapter, u32 vid, bool add, u32 vf)
 			reg |= vid;
 
 			wr32(E1000_VLVF(i), reg);
+
+			/* do not modify RLPML for PF devices */
+			if (vf >= adapter->vfs_allocated_count)
+				return 0;
+
+			if (!adapter->vf_data[vf].vlans_enabled) {
+				u32 size;
+				reg = rd32(E1000_VMOLR(vf));
+				size = reg & E1000_VMOLR_RLPML_MASK;
+				size += 4;
+				reg &= ~E1000_VMOLR_RLPML_MASK;
+				reg |= size;
+				wr32(E1000_VMOLR(vf), reg);
+			}
+			adapter->vf_data[vf].vlans_enabled++;
+
 			return 0;
 		}
 	} else {
@@ -4066,6 +4090,21 @@ static s32 igb_vlvf_set(struct igb_adapter *adapter, u32 vid, bool add, u32 vf)
 				igb_vfta_set(hw, vid, false);
 			}
 			wr32(E1000_VLVF(i), reg);
+
+			/* do not modify RLPML for PF devices */
+			if (vf >= adapter->vfs_allocated_count)
+				return 0;
+
+			adapter->vf_data[vf].vlans_enabled--;
+			if (!adapter->vf_data[vf].vlans_enabled) {
+				u32 size;
+				reg = rd32(E1000_VMOLR(vf));
+				size = reg & E1000_VMOLR_RLPML_MASK;
+				size -= 4;
+				reg &= ~E1000_VMOLR_RLPML_MASK;
+				reg |= size;
+				wr32(E1000_VMOLR(vf), reg);
+			}
 			return 0;
 		}
 	}


^ permalink raw reply related

* [net-next PATCH 2/3] igb: only disable/enable interrupt bits for igb physical function
From: Jeff Kirsher @ 2009-09-04  0:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20090904004828.14537.72187.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

The igb_irq_disable/enable calls were causing virtual functions associated
with the igb physical function to have their interrupts disabled.  In order
to prevent this from occuring we should only clear/set the bits related to
the physical function.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |   14 +++++++++-----
 1 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 95089a8..1d03fcb 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -821,9 +821,11 @@ static void igb_irq_disable(struct igb_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 
 	if (adapter->msix_entries) {
-		wr32(E1000_EIAM, 0);
-		wr32(E1000_EIMC, ~0);
-		wr32(E1000_EIAC, 0);
+		u32 regval = rd32(E1000_EIAM);
+		wr32(E1000_EIAM, regval & ~adapter->eims_enable_mask);
+		wr32(E1000_EIMC, adapter->eims_enable_mask);
+		regval = rd32(E1000_EIAC);
+		wr32(E1000_EIAC, regval & ~adapter->eims_enable_mask);
 	}
 
 	wr32(E1000_IAM, 0);
@@ -841,8 +843,10 @@ static void igb_irq_enable(struct igb_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 
 	if (adapter->msix_entries) {
-		wr32(E1000_EIAC, adapter->eims_enable_mask);
-		wr32(E1000_EIAM, adapter->eims_enable_mask);
+		u32 regval = rd32(E1000_EIAC);
+		wr32(E1000_EIAC, regval | adapter->eims_enable_mask);
+		regval = rd32(E1000_EIAM);
+		wr32(E1000_EIAM, regval | adapter->eims_enable_mask);
 		wr32(E1000_EIMS, adapter->eims_enable_mask);
 		if (adapter->vfs_allocated_count)
 			wr32(E1000_MBVFIMR, 0xFF);


^ permalink raw reply related

* [net-next PATCH 1/3] igb: add support for set_rx_mode netdevice operation
From: Jeff Kirsher @ 2009-09-04  0:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for the set_rx_mode netdevice operation so that igb
can better support multiple unicast addresses.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/igb/igb_main.c |   73 +++++++++++++++++++++++++++++---------------
 1 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 7a054d9..95089a8 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -94,7 +94,7 @@ static void igb_clean_all_tx_rings(struct igb_adapter *);
 static void igb_clean_all_rx_rings(struct igb_adapter *);
 static void igb_clean_tx_ring(struct igb_ring *);
 static void igb_clean_rx_ring(struct igb_ring *);
-static void igb_set_multi(struct net_device *);
+static void igb_set_rx_mode(struct net_device *);
 static void igb_update_phy_info(unsigned long);
 static void igb_watchdog(unsigned long);
 static void igb_watchdog_task(struct work_struct *);
@@ -928,7 +928,7 @@ static void igb_configure(struct igb_adapter *adapter)
 	int i;
 
 	igb_get_hw_control(adapter);
-	igb_set_multi(netdev);
+	igb_set_rx_mode(netdev);
 
 	igb_restore_vlan(adapter);
 
@@ -1169,7 +1169,8 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_stop		= igb_close,
 	.ndo_start_xmit		= igb_xmit_frame_adv,
 	.ndo_get_stats		= igb_get_stats,
-	.ndo_set_multicast_list	= igb_set_multi,
+	.ndo_set_rx_mode	= igb_set_rx_mode,
+	.ndo_set_multicast_list	= igb_set_rx_mode,
 	.ndo_set_mac_address	= igb_set_mac,
 	.ndo_change_mtu		= igb_change_mtu,
 	.ndo_do_ioctl		= igb_ioctl,
@@ -2519,48 +2520,70 @@ static int igb_set_mac(struct net_device *netdev, void *p)
 	memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len);
 	memcpy(hw->mac.addr, addr->sa_data, netdev->addr_len);
 
-	hw->mac.ops.rar_set(hw, hw->mac.addr, 0);
-
+	igb_rar_set(hw, hw->mac.addr, 0);
 	igb_set_rah_pool(hw, adapter->vfs_allocated_count, 0);
 
 	return 0;
 }
 
 /**
- * igb_set_multi - Multicast and Promiscuous mode set
+ * igb_set_rx_mode - Secondary Unicast, Multicast and Promiscuous mode set
  * @netdev: network interface device structure
  *
- * The set_multi entry point is called whenever the multicast address
- * list or the network interface flags are updated.  This routine is
- * responsible for configuring the hardware for proper multicast,
+ * The set_rx_mode entry point is called whenever the unicast or multicast
+ * address lists or the network interface flags are updated.  This routine is
+ * responsible for configuring the hardware for proper unicast, multicast,
  * promiscuous mode, and all-multi behavior.
  **/
-static void igb_set_multi(struct net_device *netdev)
+static void igb_set_rx_mode(struct net_device *netdev)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
-	struct dev_mc_list *mc_ptr;
+	unsigned int rar_entries = hw->mac.rar_entry_count -
+	                           (adapter->vfs_allocated_count + 1);
+	struct dev_mc_list *mc_ptr = netdev->mc_list;
 	u8  *mta_list = NULL;
 	u32 rctl;
 	int i;
 
 	/* Check for Promiscuous and All Multicast modes */
-
 	rctl = rd32(E1000_RCTL);
 
 	if (netdev->flags & IFF_PROMISC) {
 		rctl |= (E1000_RCTL_UPE | E1000_RCTL_MPE);
 		rctl &= ~E1000_RCTL_VFE;
 	} else {
-		if (netdev->flags & IFF_ALLMULTI) {
+		if (netdev->flags & IFF_ALLMULTI)
 			rctl |= E1000_RCTL_MPE;
+		else
+			rctl &= ~E1000_RCTL_MPE;
+
+		if (netdev->uc.count > rar_entries)
+			rctl |= E1000_RCTL_UPE;
+		else
 			rctl &= ~E1000_RCTL_UPE;
-		} else
-			rctl &= ~(E1000_RCTL_UPE | E1000_RCTL_MPE);
 		rctl |= E1000_RCTL_VFE;
 	}
 	wr32(E1000_RCTL, rctl);
 
+	if (netdev->uc.count && rar_entries) {
+		struct netdev_hw_addr *ha;
+		list_for_each_entry(ha, &netdev->uc.list, list) {
+			if (!rar_entries)
+				break;
+			igb_rar_set(hw, ha->addr, rar_entries);
+			igb_set_rah_pool(hw, adapter->vfs_allocated_count,
+			                 rar_entries);
+			rar_entries--;
+		}
+	}
+	/* write the addresses in reverse order to avoid write combining */
+	for (; rar_entries > 0 ; rar_entries--) {
+		wr32(E1000_RAH(rar_entries), 0);
+		wr32(E1000_RAL(rar_entries), 0);
+	}
+	wrfl();
+
 	if (!netdev->mc_count) {
 		/* nothing to program, so clear mc list */
 		igb_update_mc_addr_list(hw, NULL, 0);
@@ -2576,8 +2599,6 @@ static void igb_set_multi(struct net_device *netdev)
 	}
 
 	/* The shared function expects a packed array of only addresses. */
-	mc_ptr = netdev->mc_list;
-
 	for (i = 0; i < netdev->mc_count; i++) {
 		if (!mc_ptr)
 			break;
@@ -3938,7 +3959,7 @@ static int igb_set_vf_multicasts(struct igb_adapter *adapter,
 		vf_data->vf_mc_hashes[i] = hash_list[i];;
 
 	/* Flush and reset the mta with the new values */
-	igb_set_multi(adapter->netdev);
+	igb_set_rx_mode(adapter->netdev);
 
 	return 0;
 }
@@ -4072,13 +4093,14 @@ static inline void igb_vf_reset_event(struct igb_adapter *adapter, u32 vf)
 	adapter->vf_data[vf].num_vf_mc_hashes = 0;
 
 	/* Flush and reset the mta with the new values */
-	igb_set_multi(adapter->netdev);
+	igb_set_rx_mode(adapter->netdev);
 }
 
 static inline void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	unsigned char *vf_mac = adapter->vf_data[vf].vf_mac_addresses;
+	int rar_entry = hw->mac.rar_entry_count - (vf + 1);
 	u32 reg, msgbuf[3];
 	u8 *addr = (u8 *)(&msgbuf[1]);
 
@@ -4086,8 +4108,8 @@ static inline void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf)
 	igb_vf_reset_event(adapter, vf);
 
 	/* set vf mac address */
-	igb_rar_set(hw, vf_mac, vf + 1);
-	igb_set_rah_pool(hw, vf, vf + 1);
+	igb_rar_set(hw, vf_mac, rar_entry);
+	igb_set_rah_pool(hw, vf, rar_entry);
 
 	/* enable transmit and receive for vf */
 	reg = rd32(E1000_VFTE);
@@ -5228,7 +5250,7 @@ static int __igb_shutdown(struct pci_dev *pdev, bool *enable_wake)
 
 	if (wufc) {
 		igb_setup_rctl(adapter);
-		igb_set_multi(netdev);
+		igb_set_rx_mode(netdev);
 
 		/* turn on all-multi mode if wake on multicast is enabled */
 		if (wufc & E1000_WUFC_MC) {
@@ -5482,12 +5504,13 @@ static int igb_set_vf_mac(struct igb_adapter *adapter,
                           int vf, unsigned char *mac_addr)
 {
 	struct e1000_hw *hw = &adapter->hw;
-	int rar_entry = vf + 1; /* VF MAC addresses start at entry 1 */
-
-	igb_rar_set(hw, mac_addr, rar_entry);
+	/* VF MAC addresses start at end of receive addresses and moves
+	 * torwards the first, as a result a collision should not be possible */
+	int rar_entry = hw->mac.rar_entry_count - (vf + 1);
 
 	memcpy(adapter->vf_data[vf].vf_mac_addresses, mac_addr, ETH_ALEN);
 
+	igb_rar_set(hw, mac_addr, rar_entry);
 	igb_set_rah_pool(hw, vf, rar_entry);
 
 	return 0;


^ permalink raw reply related

* Re: [NET] Add proc file to display the state of all qdiscs.
From: David Miller @ 2009-09-03 23:31 UTC (permalink / raw)
  To: kaber; +Cc: hawk, cl, eric.dumazet, jarkao2, netdev
In-Reply-To: <4AA00351.4090601@trash.net>

From: Patrick McHardy <kaber@trash.net>
Date: Thu, 03 Sep 2009 19:56:33 +0200

> It would probably also be possible to use TC classifiers for queue
> selection.

You mean just like Intel's multiqueue scheduler and multiqueue
classifiers which we already have in the tree :-)

^ permalink raw reply

* Re: [NET] Add proc file to display the state of all qdiscs.
From: David Miller @ 2009-09-03 23:28 UTC (permalink / raw)
  To: hawk; +Cc: kaber, cl, eric.dumazet, jarkao2, netdev
In-Reply-To: <Pine.LNX.4.64.0909031911400.3490@ask.diku.dk>

From: Jesper Dangaard Brouer <hawk@diku.dk>
Date: Thu, 3 Sep 2009 19:30:00 +0200 (CEST)

> I especially like the possibility to access each qdisc seperately.
> Does it then support having seperate qdisc per TX queue?  (I'm toying
> with the idea of transmitting our multicast traffic into/via a
> seperate TX hardware queue, and making a special qdisc for IPTV
> MPEG2-TS shaping)

Indeed it sounds interesting.

But I want to warn everyone against adding support for anything
that adds a way to do operations "on every device qdisc" at the
kernel level.

I tried to do that and the error unwinding complexity is unacceptable.

We can make the tools support things like this.  Keep the kernel
simple and allow it to only operate on one queue at a time for
a given individual config request.


^ permalink raw reply

* Re: skb header allocation
From: Stephen Hemminger @ 2009-09-03 23:23 UTC (permalink / raw)
  To: Frank Blaschka; +Cc: Chris Ross, netdev
In-Reply-To: <4A94CFB3.4020706@linux.vnet.ibm.com>

On Wed, 26 Aug 2009 08:01:23 +0200
Frank Blaschka <blaschka@linux.vnet.ibm.com> wrote:

> Is it save to change dev->hard_header_len to a value other than ETH_HLEN for
> an ethernet device?
> 
> I rememeber we run in trouble with some kernel components (netfilter?, I'm
> not sure can't remember the details ...) when we did this is the past.
> 

You can add extra data but need a minimum with ETH_HLEN of space above
packet when receiving.


-- 

^ permalink raw reply

* Re: [NET] Add proc file to display the state of all qdiscs.
From: David Miller @ 2009-09-03 23:22 UTC (permalink / raw)
  To: hawk; +Cc: cl, eric.dumazet, jarkao2, kaber, netdev
In-Reply-To: <Pine.LNX.4.64.0909031617060.28935@ask.diku.dk>

From: Jesper Dangaard Brouer <hawk@diku.dk>
Date: Thu, 3 Sep 2009 16:19:44 +0200 (CEST)

> 
> On Wed, 2 Sep 2009, Christoph Lameter wrote:
>> On Wed, 2 Sep 2009, Eric Dumazet wrote:
>>
>>> Same name "eth0" is displayed, that might confuse parsers...
>>>
>>> What naming convention should we choose for multiqueue devices ?
>>
>> eth0/tx<number> ?
> 
> Remember that we already have a naming convention in /proc/interrupts
> 
>  eth0-tx-<number>
> 
> Lets not introduce too many new once ;-)

Yes, please :-)

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Paul E. McKenney @ 2009-09-03 23:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <alpine.DEB.1.10.0909031731540.24199@V090114053VZO-1>

On Thu, Sep 03, 2009 at 05:44:54PM -0500, Christoph Lameter wrote:
> On Thu, 3 Sep 2009, Eric Dumazet wrote:
> 
> > It seems very smart, but needs review of all callers to make sure no slabs
> > are waiting for final freeing in call_rcu queue on some cpu.
> 
> Yes. Again this is the first time we encounter a situation where a
> DESTROY_BY_RCU slab has to be destroyed. So the review is quite short.
> 
> > I suspect most of them will then have to use rcu_barrier() before calling
> > kmem_cache_destroy(), so why not factorizing code in one place ?
> 
> There are different forms of RCU which require different forms of
> barriers. Its best to leave that up to the user. Again the user must make
> sure that no objects are in use before a slab is destroyed. For
> SLAB_DESTROY_BY_RCU this means that there are no potential outstanding
> reads on the structure. You may need an rcu_barrier() to accomplish that.
> 
> Slight variations in the use of RCU could require different method. Better
> reduce the entanglement of slabs to RCU to a mininum possible.

If it were the user of the slab who was invoking some variant of
call_rcu(), then I would agree with you.

However, call_rcu() is instead being invoked by the slab itself in the
case of SLAB_DESTROY_BY_RCU, so that there is no variation in usage.
Requiring that the user call rcu_barrier() is asking for subtle bugs.
Therefore, the best approach is to have kmem_cache_destroy() handle
the RCU cleanup, given that this cleanup is for actions taken by
kmem_cache_free(), not by the user.

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-03 22:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Pekka Enberg, Zdenek Kabelac, Patrick McHardy, Robin Holt,
	Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers, paulmck
In-Reply-To: <4AA00400.1030005@gmail.com>

On Thu, 3 Sep 2009, Eric Dumazet wrote:

> It seems very smart, but needs review of all callers to make sure no slabs
> are waiting for final freeing in call_rcu queue on some cpu.

Yes. Again this is the first time we encounter a situation where a
DESTROY_BY_RCU slab has to be destroyed. So the review is quite short.

> I suspect most of them will then have to use rcu_barrier() before calling
> kmem_cache_destroy(), so why not factorizing code in one place ?

There are different forms of RCU which require different forms of
barriers. Its best to leave that up to the user. Again the user must make
sure that no objects are in use before a slab is destroyed. For
SLAB_DESTROY_BY_RCU this means that there are no potential outstanding
reads on the structure. You may need an rcu_barrier() to accomplish that.

Slight variations in the use of RCU could require different method. Better
reduce the entanglement of slabs to RCU to a mininum possible.



^ permalink raw reply

* [PATCH NEXT 3/3] netxen: fix infinite loop on dma mapping failure
From: Dhananjay Phadke @ 2009-09-03 23:10 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1252019455-30683-1-git-send-email-dhananjay@netxen.com>

Fix a perpetual while() loop in unwinding partial
mapped tx skb on dma mapping failure.

Reported-by: "Juha Leppanen" <juha_motorsportcom@luukku.com>
Signed-off-by: Dhananjay Phadke <dhananjay@netxen.com>
---
 drivers/net/netxen/netxen_nic_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 53d57dc..41b2967 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1577,8 +1577,8 @@ netxen_map_tx_skb(struct pci_dev *pdev,
 	return 0;
 
 unwind:
-	while (i > 0) {
-		nf = &pbuf->frag_array[i];
+	while (--i >= 0) {
+		nf = &pbuf->frag_array[i+1];
 		pci_unmap_page(pdev, nf->dma, nf->length, PCI_DMA_TODEVICE);
 	}
 
-- 
1.6.0.2


^ permalink raw reply related

* [PATCH NEXT 2/3] netxen: remove duplicate napi_add
From: Dhananjay Phadke @ 2009-09-03 23:10 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1252019455-30683-1-git-send-email-dhananjay@netxen.com>

Remove duplicate calls to netxen_napi_add().

Signed-off-by: Dhananjay Phadke <dhananjay@netxen.com>
---
 drivers/net/netxen/netxen_nic_main.c |    6 ------
 1 files changed, 0 insertions(+), 6 deletions(-)

diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index f824a39..53d57dc 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1095,10 +1095,6 @@ netxen_setup_netdev(struct netxen_adapter *adapter,
 
 	netdev->irq = adapter->msix_entries[0].vector;
 
-	err = netxen_napi_add(adapter, netdev);
-	if (err)
-		return err;
-
 	init_timer(&adapter->watchdog_timer);
 	adapter->watchdog_timer.function = &netxen_watchdog;
 	adapter->watchdog_timer.data = (unsigned long)adapter;
@@ -2038,8 +2034,6 @@ netxen_remove_sysfs_entries(struct netxen_adapter *adapter)
 		device_remove_file(dev, &dev_attr_bridged_mode);
 }
 
-static void netxen_watchdog(unsigned long);
-
 #ifdef CONFIG_INET
 
 #define is_netxen_netdev(dev) (dev->netdev_ops == &netxen_netdev_ops)
-- 
1.6.0.2


^ permalink raw reply related

* [PATCH NEXT 1/3] netxen: fix lro buffer allocation
From: Dhananjay Phadke @ 2009-09-03 23:10 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dhananjay Phadke

From: Dhananjay Phadke <dhananjay@qlogic.com>

Alloc 12k skbuffs so that firmware can aggregate more
packets into one buffer. This doesn't raise memory
consumption since 9k skbs use 16k slab cache anyway.

Signed-off-by: Dhananjay Phadke <dhananjay@netxen.com>
---
 drivers/net/netxen/netxen_nic.h      |    1 +
 drivers/net/netxen/netxen_nic_init.c |    8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h
index 224a746..ede2fa7 100644
--- a/drivers/net/netxen/netxen_nic.h
+++ b/drivers/net/netxen/netxen_nic.h
@@ -149,6 +149,7 @@
 #define NX_P2_RX_JUMBO_BUF_MAX_LEN     (NX_MAX_ETHERHDR + P2_MAX_MTU)
 #define NX_P3_RX_JUMBO_BUF_MAX_LEN     (NX_MAX_ETHERHDR + P3_MAX_MTU)
 #define NX_CT_DEFAULT_RX_BUF_LEN	2048
+#define NX_LRO_BUFFER_EXTRA		2048
 
 #define NX_RX_LRO_BUFFER_LENGTH		(8060)
 
diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index 8d4aa6f..04e36f2 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -265,6 +265,10 @@ int netxen_alloc_sw_resources(struct netxen_adapter *adapter)
 			else
 				rds_ring->dma_size =
 					NX_P2_RX_JUMBO_BUF_MAX_LEN;
+
+			if (adapter->capabilities & NX_CAP0_HW_LRO)
+				rds_ring->dma_size += NX_LRO_BUFFER_EXTRA;
+
 			rds_ring->skb_size =
 				rds_ring->dma_size + NET_IP_ALIGN;
 			break;
@@ -1217,6 +1221,7 @@ netxen_process_rcv(struct netxen_adapter *adapter,
 	if (pkt_offset)
 		skb_pull(skb, pkt_offset);
 
+	skb->truesize = skb->len + sizeof(struct sk_buff);
 	skb->protocol = eth_type_trans(skb, netdev);
 
 	napi_gro_receive(&sds_ring->napi, skb);
@@ -1278,8 +1283,7 @@ netxen_process_lro(struct netxen_adapter *adapter,
 
 	skb_put(skb, lro_length + data_offset);
 
-	skb->truesize = (skb->len + sizeof(struct sk_buff) +
-			((unsigned long)skb->data - (unsigned long)skb->head));
+	skb->truesize = skb->len + sizeof(struct sk_buff) + skb_headroom(skb);
 
 	skb_pull(skb, l2_hdr_offset);
 	skb->protocol = eth_type_trans(skb, netdev);
-- 
1.6.0.2


^ permalink raw reply related

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-03 22:43 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <20090903174435.GF6761@linux.vnet.ibm.com>

On Thu, 3 Sep 2009, Paul E. McKenney wrote:

> 2.	CPU 0 discovers that the slab cache can now be destroyed.
>
> 	It determines that there are no users, and has guaranteed
> 	that there will be no future users.  So it knows that it
> 	can safely do kmem_cache_destroy().
>
> 3.	In absence of rcu_barrier(), kmem_cache_destroy() would
> 	immediately tear down the slab data structures.

Of course. This has been discussed before.

You need to ensure that no objects are in use before destroying a slab. In
case of DESTROY_BY_RCU you must ensure that there are no potential
readers. So use a suitable rcu barrier or something else like a
synchronize_rcu...

> > But going through the RCU period is pointless since no user of the cache
> > remains.
>
> Which is irrelevant.  The outstanding RCU callback was posted by the
> slab cache itself, -not- by the user of the slab cache.

There will be no rcu callbacks generated at kmem_cache_destroy with the
patch I posted.

> > The dismantling does not need RCU since there are no operations on the
> > objects in progress. So simply switch DESTROY_BY_RCU off for close.
>
> Unless I am missing something, this patch re-introduces the bug that
> the rcu_barrier() was added to prevent.  So, in absence of a better
> explanation of what I am missing:

The "fix" was ill advised. Slab users must ensure that no objects are in
use before destroying a slab. Only the slab users know how the objects
are being used. The slab allocator itself cannot know how to ensure that
there are no pending references. Putting a rcu_barrier in there creates an
inconsistency in the operation of kmem_cache_destroy() and an expectation
of functionality that the function cannot provide.


^ permalink raw reply

* Re: [stable] [Patch] Fix commit 63d9950b08184e6531adceb65f64b429909cc101 (ipv6: Make v4-mapped bindings consistent with IPv4)
From: Greg KH @ 2009-09-03 22:25 UTC (permalink / raw)
  To: Bruno Prémont; +Cc: stable, vladislav.yasevich, netdev, David Miller
In-Reply-To: <20090825222042.4d13aacd@neptune.home>

On Tue, Aug 25, 2009 at 10:20:42PM +0200, Bruno Prémont wrote:
> On Sun, 23 August 2009 David Miller <davem@davemloft.net> wrote:
> > From: Bruno Prémont <bonbons@linux-vserver.org>
> > Date: Sat, 22 Aug 2009 01:45:23 +0200
> > 
> > > Commit 63d9950b08184e6531adceb65f64b429909cc101
> > >   (ipv6: Make v4-mapped bindings consistent with IPv4)
> > > changes behavior of inet6_bind() for v4-mapped addresses so it
> > > should behave the same way as inet_bind().
> > > 
> > > During this change setting of err to -EADDRNOTAVAIL got lost:
> >  ...
> > > Signed-off-by Bruno Prémont <bonbons@linux-vserver.org>
> > 
> > Thanks for finding and fixing this bug, applied, thanks!
> 
> Please consider queueing this for 2.6.30.x stable, upstream commit is
> ca6982b858e1d08010c1d29d8e8255b2ac2ad70a

Now queued up.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Eric Dumazet @ 2009-09-03 22:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Paul E. McKenney, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <4AA03E6A.7070800@gmail.com>

Eric Dumazet a écrit :
> 
> 
> 
> Problem is not _objects_ Christoph, but _slabs_, and your patch is not working.
> 
> Its true that when User calls kmem_cache_destroy(), all _objects_ were previously freed.
> This is mandatory, with or withou SLAB_DESTROY_BY_RCU thing
> 
> Problem is that slub has some internal state, including some to-be-freed _slabs_,
> that User have no control at all on it.
> 
> User cannot "know" slabs are freed, inuse, or whatever state in cache or call_rcu queues.
> 
> Face it, SLAB_DESTROY_BY_RCU is internal affair (to slub/slab/... allocators)
> 
> We absolutely need a rcu_barrier() somewhere, believe it or not. You can argue that it should
> be done *before*, but it gives no speedup, only potential bugs.
> 
> Only case User should do its rcu_barrier() itself is if it knows some call_rcu() are pending
> and are delaying _objects_ freeing (typical !SLAB_DESTROY_RCU usage in RCU algos).
> 
> I dont even understand why you care so much about kmem_cache_destroy(SLAB_DESTROY_BY_RCU),
> given that almost nobody use it. We took almost one month to find out what the bug was in first
> place...


So maybe the safest thing would be to include the rcu_barrier() to insure all objects where freed

And another one for SLAB_DESTROY_BY_RCU to make sure slabs where freed

void kmem_cache_destroy(struct kmem_cache *s)
{
	/*
	 * Make sure no objects are waiting in call_rcu queues to be freed
	 */
	rcu_barrier();

	down_write(&slub_lock);
	s->refcount--;
	if (!s->refcount) {
                list_del(&s->list);
                up_write(&slub_lock);
                if (kmem_cache_close(s)) {
                        printk(KERN_ERR "SLUB %s: %s called for cache that "
                                "still has objects.\n", s->name, __func__);
                        dump_stack();
                }
		/*
		 * Make sure no slabs are waiting in call_rcu queues to be freed
		 */
                if (s->flags & SLAB_DESTROY_BY_RCU)
                        rcu_barrier();
                sysfs_slab_remove(s);
        } else
                up_write(&slub_lock);
}

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Eric Dumazet @ 2009-09-03 22:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Paul E. McKenney, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <alpine.DEB.1.10.0909031736340.24199@V090114053VZO-1>

Christoph Lameter a écrit :
> On Thu, 3 Sep 2009, Paul E. McKenney wrote:
> 
>> 2.	CPU 0 discovers that the slab cache can now be destroyed.
>>
>> 	It determines that there are no users, and has guaranteed
>> 	that there will be no future users.  So it knows that it
>> 	can safely do kmem_cache_destroy().
>>
>> 3.	In absence of rcu_barrier(), kmem_cache_destroy() would
>> 	immediately tear down the slab data structures.
> 
> Of course. This has been discussed before.
> 
> You need to ensure that no objects are in use before destroying a slab. In
> case of DESTROY_BY_RCU you must ensure that there are no potential
> readers. So use a suitable rcu barrier or something else like a
> synchronize_rcu...
> 
>>> But going through the RCU period is pointless since no user of the cache
>>> remains.
>> Which is irrelevant.  The outstanding RCU callback was posted by the
>> slab cache itself, -not- by the user of the slab cache.
> 
> There will be no rcu callbacks generated at kmem_cache_destroy with the
> patch I posted.
> 
>>> The dismantling does not need RCU since there are no operations on the
>>> objects in progress. So simply switch DESTROY_BY_RCU off for close.
>> Unless I am missing something, this patch re-introduces the bug that
>> the rcu_barrier() was added to prevent.  So, in absence of a better
>> explanation of what I am missing:
> 
> The "fix" was ill advised. Slab users must ensure that no objects are in
> use before destroying a slab. Only the slab users know how the objects
> are being used. The slab allocator itself cannot know how to ensure that
> there are no pending references. Putting a rcu_barrier in there creates an
> inconsistency in the operation of kmem_cache_destroy() and an expectation
> of functionality that the function cannot provide.
> 



Problem is not _objects_ Christoph, but _slabs_, and your patch is not working.

Its true that when User calls kmem_cache_destroy(), all _objects_ were previously freed.
This is mandatory, with or withou SLAB_DESTROY_BY_RCU thing

Problem is that slub has some internal state, including some to-be-freed _slabs_,
that User have no control at all on it.

User cannot "know" slabs are freed, inuse, or whatever state in cache or call_rcu queues.

Face it, SLAB_DESTROY_BY_RCU is internal affair (to slub/slab/... allocators)

We absolutely need a rcu_barrier() somewhere, believe it or not. You can argue that it should
be done *before*, but it gives no speedup, only potential bugs.

Only case User should do its rcu_barrier() itself is if it knows some call_rcu() are pending
and are delaying _objects_ freeing (typical !SLAB_DESTROY_RCU usage in RCU algos).

I dont even understand why you care so much about kmem_cache_destroy(SLAB_DESTROY_BY_RCU),
given that almost nobody use it. We took almost one month to find out what the bug was in first
place...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Paul E. McKenney @ 2009-09-03 22:03 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <alpine.DEB.1.10.0909031736340.24199@V090114053VZO-1>

On Thu, Sep 03, 2009 at 05:43:12PM -0500, Christoph Lameter wrote:
> On Thu, 3 Sep 2009, Paul E. McKenney wrote:
> 
> > 2.	CPU 0 discovers that the slab cache can now be destroyed.
> >
> > 	It determines that there are no users, and has guaranteed
> > 	that there will be no future users.  So it knows that it
> > 	can safely do kmem_cache_destroy().
> >
> > 3.	In absence of rcu_barrier(), kmem_cache_destroy() would
> > 	immediately tear down the slab data structures.
> 
> Of course. This has been discussed before.
> 
> You need to ensure that no objects are in use before destroying a slab. In
> case of DESTROY_BY_RCU you must ensure that there are no potential
> readers. So use a suitable rcu barrier or something else like a
> synchronize_rcu...

If by "you must ensure" you mean "kmem_cache_destroy must ensure", then
we are in complete agreement.  Otherwise, not a chance.

> > > But going through the RCU period is pointless since no user of the cache
> > > remains.
> >
> > Which is irrelevant.  The outstanding RCU callback was posted by the
> > slab cache itself, -not- by the user of the slab cache.
> 
> There will be no rcu callbacks generated at kmem_cache_destroy with the
> patch I posted.

That is true.  However, there well might be RCU callbacks generated by
kmem_cache_free() that have not yet been invoked.  Since kmem_cache_free()
generated them, it is ridiculous to insist that the user account for them.
That responsibility must instead fall on kmem_cache_destroy().

> > > The dismantling does not need RCU since there are no operations on the
> > > objects in progress. So simply switch DESTROY_BY_RCU off for close.
> >
> > Unless I am missing something, this patch re-introduces the bug that
> > the rcu_barrier() was added to prevent.  So, in absence of a better
> > explanation of what I am missing:
> 
> The "fix" was ill advised. Slab users must ensure that no objects are in
> use before destroying a slab. Only the slab users know how the objects
> are being used. The slab allocator itself cannot know how to ensure that
> there are no pending references. Putting a rcu_barrier in there creates an
> inconsistency in the operation of kmem_cache_destroy() and an expectation
> of functionality that the function cannot provide.

If by "must ensure that no objects are in use", you mean "must have
no further references to them", then we are in agreement.  And in my
scenario above, it is not the -user- who later references the memory,
but rather the slab code itself.

Put the rcu_barrier() in kmem_cache_free().  Please.

							Thanx, Paul

^ permalink raw reply

* [RFC] Inconsistency in MDIO ioctl behaviour
From: Ben Hutchings @ 2009-09-03 21:04 UTC (permalink / raw)
  To: netdev

While comparing the various implementations of SIOC{G,S}MIIREG and
SIOCGMIIPHY, I found some differences in behaviour that could make some
applications unreliable across different drivers.

1. Implementations of SIOCGMIIPHY must set phy_id to the PHY's MDIO
address (PRTAD) or to a dummy address if there is no real MDIO bus.
Many implementations, including the generic ones (mii, phylib,
pci-skeleton and now mdio) then proceed as for SIOCGMIIPHY.  Others
return without accessing any registers.  Which behaviour is right?

2. Implementations of SIOC{G,S}MIIREG that do not support access to
arbitary MDIO addresses handle non-matching values of phy_id in at least
three different ways:
(a) ignore it and always use the expected PHY address
(b) ignore writes and return a dummy value for reads
(c) fail
I favour behaviour (c) but I'm not sure what the error code should be.
ENODEV, EINVAL or EIO?

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [PATCH 3/3] netdev: Convert MDIO ioctl implementation to use struct mii_ioctl_data
From: Ben Hutchings @ 2009-09-03 20:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers

A few drivers still access the arguments to MDIO ioctls as an array of
u16.  Convert them to use struct mii_ioctl_data.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
The PCMCIA drivers are compile-tested.  The emac driver is not since
it's for PPC only, but I'm fairly confident that it will work.

Ben.

 drivers/net/ibm_newemac/core.c  |   10 ++++++----
 drivers/net/pcmcia/3c574_cs.c   |   13 ++++++++-----
 drivers/net/pcmcia/axnet_cs.c   |    9 +++++----
 drivers/net/pcmcia/pcnet_cs.c   |    9 +++++----
 drivers/net/pcmcia/xirc2ps_cs.c |   13 ++++++++-----
 5 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 8a95234..1d7d7fe 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -2209,7 +2209,7 @@ static const struct ethtool_ops emac_ethtool_ops = {
 static int emac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 {
 	struct emac_instance *dev = netdev_priv(ndev);
-	uint16_t *data = (uint16_t *) & rq->ifr_ifru;
+	struct mii_ioctl_data *data = if_mii(rq);
 
 	DBG(dev, "ioctl %08x" NL, cmd);
 
@@ -2218,14 +2218,16 @@ static int emac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 
 	switch (cmd) {
 	case SIOCGMIIPHY:
-		data[0] = dev->phy.address;
+		data->phy_id = dev->phy.address;
 		/* Fall through */
 	case SIOCGMIIREG:
-		data[3] = emac_mdio_read(ndev, dev->phy.address, data[1]);
+		data->val_out = emac_mdio_read(ndev, dev->phy.address,
+					       data->reg_num);
 		return 0;
 
 	case SIOCSMIIREG:
-		emac_mdio_write(ndev, dev->phy.address, data[1], data[2]);
+		emac_mdio_write(ndev, dev->phy.address, data->reg_num,
+				data->val_in);
 		return 0;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index d836af1..ee8ad3e 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -85,6 +85,7 @@ earlier 3Com products.
 #include <linux/ioport.h>
 #include <linux/ethtool.h>
 #include <linux/bitops.h>
+#include <linux/mii.h>
 
 #include <pcmcia/cs_types.h>
 #include <pcmcia/cs.h>
@@ -1096,16 +1097,16 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct el3_private *lp = netdev_priv(dev);
 	unsigned int ioaddr = dev->base_addr;
-	u16 *data = (u16 *)&rq->ifr_ifru;
+	struct mii_ioctl_data *data = if_mii(rq);
 	int phy = lp->phys & 0x1f;
 
 	DEBUG(2, "%s: In ioct(%-.6s, %#4.4x) %4.4x %4.4x %4.4x %4.4x.\n",
 		  dev->name, rq->ifr_ifrn.ifrn_name, cmd,
-		  data[0], data[1], data[2], data[3]);
+		  data->phy_id, data->reg_num, data->val_in, data->val_out);
 
 	switch(cmd) {
 	case SIOCGMIIPHY:		/* Get the address of the PHY in use. */
-		data[0] = phy;
+		data->phy_id = phy;
 	case SIOCGMIIREG:		/* Read the specified MII register. */
 		{
 			int saved_window;
@@ -1114,7 +1115,8 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 			spin_lock_irqsave(&lp->window_lock, flags);
 			saved_window = inw(ioaddr + EL3_CMD) >> 13;
 			EL3WINDOW(4);
-			data[3] = mdio_read(ioaddr, data[0] & 0x1f, data[1] & 0x1f);
+			data->val_out = mdio_read(ioaddr, data->phy_id & 0x1f,
+						  data->reg_num & 0x1f);
 			EL3WINDOW(saved_window);
 			spin_unlock_irqrestore(&lp->window_lock, flags);
 			return 0;
@@ -1127,7 +1129,8 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 			spin_lock_irqsave(&lp->window_lock, flags);
 			saved_window = inw(ioaddr + EL3_CMD) >> 13;
 			EL3WINDOW(4);
-			mdio_write(ioaddr, data[0] & 0x1f, data[1] & 0x1f, data[2]);
+			mdio_write(ioaddr, data->phy_id & 0x1f,
+				   data->reg_num & 0x1f, data->val_in);
 			EL3WINDOW(saved_window);
 			spin_unlock_irqrestore(&lp->window_lock, flags);
 			return 0;
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 4f2fef6..3131a59 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -37,6 +37,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/crc32.h>
+#include <linux/mii.h>
 #include "../8390.h"
 
 #include <pcmcia/cs_types.h>
@@ -697,16 +698,16 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     axnet_dev_t *info = PRIV(dev);
-    u16 *data = (u16 *)&rq->ifr_ifru;
+    struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + AXNET_MII_EEP;
     switch (cmd) {
     case SIOCGMIIPHY:
-	data[0] = info->phy_id;
+	data->phy_id = info->phy_id;
     case SIOCGMIIREG:		/* Read MII PHY register. */
-	data[3] = mdio_read(mii_addr, data[0], data[1] & 0x1f);
+	data->val_out = mdio_read(mii_addr, data->phy_id, data->reg_num & 0x1f);
 	return 0;
     case SIOCSMIIREG:		/* Write MII PHY register. */
-	mdio_write(mii_addr, data[0], data[1] & 0x1f, data[2]);
+	mdio_write(mii_addr, data->phy_id, data->reg_num & 0x1f, data->val_in);
 	return 0;
     }
     return -EOPNOTSUPP;
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 8996b45..90a94d2 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -40,6 +40,7 @@
 #include <linux/netdevice.h>
 #include <linux/log2.h>
 #include <linux/etherdevice.h>
+#include <linux/mii.h>
 #include "../8390.h"
 
 #include <pcmcia/cs_types.h>
@@ -1191,7 +1192,7 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     pcnet_dev_t *info = PRIV(dev);
-    u16 *data = (u16 *)&rq->ifr_ifru;
+    struct mii_ioctl_data *data = if_mii(rq);
     unsigned int mii_addr = dev->base_addr + DLINK_GPIO;
 
     if (!(info->flags & (IS_DL10019|IS_DL10022)))
@@ -1199,12 +1200,12 @@ static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
     switch (cmd) {
     case SIOCGMIIPHY:
-	data[0] = info->phy_id;
+	data->phy_id = info->phy_id;
     case SIOCGMIIREG:		/* Read MII PHY register. */
-	data[3] = mdio_read(mii_addr, data[0], data[1] & 0x1f);
+	data->val_out = mdio_read(mii_addr, data->phy_id, data->reg_num & 0x1f);
 	return 0;
     case SIOCSMIIREG:		/* Write MII PHY register. */
-	mdio_write(mii_addr, data[0], data[1] & 0x1f, data[2]);
+	mdio_write(mii_addr, data->phy_id, data->reg_num & 0x1f, data->val_in);
 	return 0;
     }
     return -EOPNOTSUPP;
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index 9709dd1..cf84231 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -80,6 +80,7 @@
 #include <linux/if_arp.h>
 #include <linux/ioport.h>
 #include <linux/bitops.h>
+#include <linux/mii.h>
 
 #include <pcmcia/cs_types.h>
 #include <pcmcia/cs.h>
@@ -1558,24 +1559,26 @@ do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
     local_info_t *local = netdev_priv(dev);
     unsigned int ioaddr = dev->base_addr;
-    u16 *data = (u16 *)&rq->ifr_ifru;
+    struct mii_ioctl_data *data = if_mii(rq);
 
     DEBUG(1, "%s: ioctl(%-.6s, %#04x) %04x %04x %04x %04x\n",
 	  dev->name, rq->ifr_ifrn.ifrn_name, cmd,
-	  data[0], data[1], data[2], data[3]);
+	  data->phy_id, data->reg_num, data->val_in, data->val_out);
 
     if (!local->mohawk)
 	return -EOPNOTSUPP;
 
     switch(cmd) {
       case SIOCGMIIPHY:		/* Get the address of the PHY in use. */
-	data[0] = 0;		/* we have only this address */
+	data->phy_id = 0;	/* we have only this address */
 	/* fall through */
       case SIOCGMIIREG:		/* Read the specified MII register. */
-	data[3] = mii_rd(ioaddr, data[0] & 0x1f, data[1] & 0x1f);
+	data->val_out = mii_rd(ioaddr, data->phy_id & 0x1f,
+			       data->reg_num & 0x1f);
 	break;
       case SIOCSMIIREG:		/* Write the specified MII register */
-	mii_wr(ioaddr, data[0] & 0x1f, data[1] & 0x1f, data[2], 16);
+	mii_wr(ioaddr, data->phy_id & 0x1f, data->reg_num & 0x1f, data->val_in,
+	       16);
 	break;
       default:
 	return -EOPNOTSUPP;

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH 2/3] netdev: Remove redundant checks for CAP_NET_ADMIN in MDIO implementations
From: Ben Hutchings @ 2009-09-03 20:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers

dev_ioctl() already checks capable(CAP_NET_ADMIN) before calling the
driver's implementation of MDIO ioctls.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
I wrote:
> Currently most drivers[1] that support the "MII" ioctls allow all users
> to read MDIO registers, while MDIO writes require CAP_NET_ADMIN.

Thankfully this was not true, so I can remove code instead of adding it.

Ben.

 drivers/net/amd8111e.c          |    3 ---
 drivers/net/atl1c/atl1c_main.c  |    8 --------
 drivers/net/atl1e/atl1e_main.c  |    8 --------
 drivers/net/atlx/atl2.c         |    4 ----
 drivers/net/bnx2.c              |    3 ---
 drivers/net/cassini.c           |    4 ----
 drivers/net/e1000/e1000_main.c  |    4 ----
 drivers/net/e1000e/netdev.c     |    2 --
 drivers/net/ibm_newemac/core.c  |    2 --
 drivers/net/igb/igb_main.c      |    2 --
 drivers/net/mdio.c              |    3 ---
 drivers/net/mii.c               |    3 ---
 drivers/net/natsemi.c           |    2 --
 drivers/net/pci-skeleton.c      |    5 -----
 drivers/net/pcmcia/3c574_cs.c   |    2 --
 drivers/net/pcmcia/axnet_cs.c   |    2 --
 drivers/net/pcmcia/pcnet_cs.c   |    2 --
 drivers/net/pcmcia/xirc2ps_cs.c |    2 --
 drivers/net/phy/phy.c           |    3 ---
 drivers/net/r8169.c             |    2 --
 drivers/net/sis900.c            |    2 --
 drivers/net/skge.c              |    3 ---
 drivers/net/sky2.c              |    3 ---
 drivers/net/sungem.c            |    4 +---
 drivers/net/tg3.c               |    3 ---
 drivers/net/tlan.c              |    2 --
 drivers/net/tulip/tulip_core.c  |    2 --
 drivers/net/tulip/winbond-840.c |    2 --
 drivers/net/via-velocity.c      |    4 ----
 drivers/net/yellowfin.c         |    2 --
 30 files changed, 1 insertions(+), 92 deletions(-)

diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c
index 98b5f46..4e6359f 100644
--- a/drivers/net/amd8111e.c
+++ b/drivers/net/amd8111e.c
@@ -1524,9 +1524,6 @@ static int amd8111e_ioctl(struct net_device * dev , struct ifreq *ifr, int cmd)
 	int err;
 	u32 mii_regval;
 
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	switch(cmd) {
 	case SIOCGMIIPHY:
 		data->phy_id = lp->ext_phy_addr;
diff --git a/drivers/net/atl1c/atl1c_main.c b/drivers/net/atl1c/atl1c_main.c
index e46bf92..be2c6cf 100644
--- a/drivers/net/atl1c/atl1c_main.c
+++ b/drivers/net/atl1c/atl1c_main.c
@@ -534,10 +534,6 @@ static int atl1c_mii_ioctl(struct net_device *netdev,
 		break;
 
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN)) {
-			retval = -EPERM;
-			goto out;
-		}
 		if (atl1c_read_phy_reg(&adapter->hw, data->reg_num & 0x1F,
 				    &data->val_out)) {
 			retval = -EIO;
@@ -546,10 +542,6 @@ static int atl1c_mii_ioctl(struct net_device *netdev,
 		break;
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN)) {
-			retval = -EPERM;
-			goto out;
-		}
 		if (data->reg_num & ~(0x1F)) {
 			retval = -EFAULT;
 			goto out;
diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c
index bca127e..69b830f 100644
--- a/drivers/net/atl1e/atl1e_main.c
+++ b/drivers/net/atl1e/atl1e_main.c
@@ -453,10 +453,6 @@ static int atl1e_mii_ioctl(struct net_device *netdev,
 		break;
 
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN)) {
-			retval = -EPERM;
-			goto out;
-		}
 		if (atl1e_read_phy_reg(&adapter->hw, data->reg_num & 0x1F,
 				    &data->val_out)) {
 			retval = -EIO;
@@ -465,10 +461,6 @@ static int atl1e_mii_ioctl(struct net_device *netdev,
 		break;
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN)) {
-			retval = -EPERM;
-			goto out;
-		}
 		if (data->reg_num & ~(0x1F)) {
 			retval = -EFAULT;
 			goto out;
diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c
index 10c06b9..ab68886 100644
--- a/drivers/net/atlx/atl2.c
+++ b/drivers/net/atlx/atl2.c
@@ -966,8 +966,6 @@ static int atl2_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 		data->phy_id = 0;
 		break;
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		spin_lock_irqsave(&adapter->stats_lock, flags);
 		if (atl2_read_phy_reg(&adapter->hw,
 			data->reg_num & 0x1F, &data->val_out)) {
@@ -977,8 +975,6 @@ static int atl2_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 		spin_unlock_irqrestore(&adapter->stats_lock, flags);
 		break;
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (data->reg_num & ~(0x1F))
 			return -EFAULT;
 		spin_lock_irqsave(&adapter->stats_lock, flags);
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 1357d54..08cddb6 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -7480,9 +7480,6 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		if (bp->phy_flags & BNX2_PHY_FLAG_REMOTE_PHY_CAP)
 			return -EOPNOTSUPP;
 
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 7517dc1..05916aa 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -4875,10 +4875,6 @@ static int cas_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN)) {
-			rc = -EPERM;
-			break;
-		}
 		spin_lock_irqsave(&cp->lock, flags);
 		cas_mif_poll(cp, 0);
 		rc = cas_phy_write(cp, data->reg_num & 0x1f, data->val_in);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 2c723b1..c66dd4f 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -4714,8 +4714,6 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr,
 		data->phy_id = hw->phy_addr;
 		break;
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		spin_lock_irqsave(&adapter->stats_lock, flags);
 		if (e1000_read_phy_reg(hw, data->reg_num & 0x1F,
 				   &data->val_out)) {
@@ -4725,8 +4723,6 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr,
 		spin_unlock_irqrestore(&adapter->stats_lock, flags);
 		break;
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (data->reg_num & ~(0x1F))
 			return -EFAULT;
 		mii_reg = data->val_in;
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 0f8d961..16c193a 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -4346,8 +4346,6 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr,
 		data->phy_id = adapter->hw.phy.addr;
 		break;
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		switch (data->reg_num & 0x1F) {
 		case MII_BMCR:
 			data->val_out = adapter->phy_regs.bmcr;
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 24ae671..8a95234 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -2225,8 +2225,6 @@ static int emac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		emac_mdio_write(ndev, dev->phy.address, data[1], data[2]);
 		return 0;
 	default:
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 7a054d9..56c2228 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4849,8 +4849,6 @@ static int igb_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 		data->phy_id = adapter->hw.phy.addr;
 		break;
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (igb_read_phy_reg(&adapter->hw, data->reg_num & 0x1F,
 		                     &data->val_out))
 			return -EIO;
diff --git a/drivers/net/mdio.c b/drivers/net/mdio.c
index 7d2e610..21f8754 100644
--- a/drivers/net/mdio.c
+++ b/drivers/net/mdio.c
@@ -380,10 +380,7 @@ int mdio_mii_ioctl(const struct mdio_if_info *mdio,
 		cmd = SIOCGMIIREG;
 		break;
 	case SIOCGMIIREG:
-		break;
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		break;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/mii.c b/drivers/net/mii.c
index d81a5d2..210b2b1 100644
--- a/drivers/net/mii.c
+++ b/drivers/net/mii.c
@@ -433,9 +433,6 @@ int generic_mii_ioctl(struct mii_if_info *mii_if,
 	case SIOCSMIIREG: {
 		u16 val = mii_data->val_in;
 
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		if (mii_data->phy_id == mii_if->phy_id) {
 			switch(mii_data->reg_num) {
 			case MII_BMCR: {
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index b52e5d9..b2722c4 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -3075,8 +3075,6 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (dev->if_port == PORT_TP) {
 			if ((data->phy_id & 0x1f) == np->phy_addr_external) {
  				if ((data->reg_num & 0x1f) == MII_ADVERTISE)
diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c
index c47ba36..0c44b48 100644
--- a/drivers/net/pci-skeleton.c
+++ b/drivers/net/pci-skeleton.c
@@ -1784,11 +1784,6 @@ static int netdrv_ioctl (struct net_device *dev, struct ifreq *rq, int cmd)
 		break;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable (CAP_NET_ADMIN)) {
-			rc = -EPERM;
-			break;
-		}
-
 		spin_lock_irqsave (&tp->lock, flags);
 		mdio_write (dev, data->phy_id & 0x1f, data->reg_num & 0x1f, data->val_in);
 		spin_unlock_irqrestore (&tp->lock, flags);
diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c
index 382d265..d836af1 100644
--- a/drivers/net/pcmcia/3c574_cs.c
+++ b/drivers/net/pcmcia/3c574_cs.c
@@ -1124,8 +1124,6 @@ static int el3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 			int saved_window;
                        unsigned long flags;
 
-			if (!capable(CAP_NET_ADMIN))
-				return -EPERM;
 			spin_lock_irqsave(&lp->window_lock, flags);
 			saved_window = inw(ioaddr + EL3_CMD) >> 13;
 			EL3WINDOW(4);
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 3b681c1..4f2fef6 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -706,8 +706,6 @@ static int axnet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	data[3] = mdio_read(mii_addr, data[0], data[1] & 0x1f);
 	return 0;
     case SIOCSMIIREG:		/* Write MII PHY register. */
-	if (!capable(CAP_NET_ADMIN))
-	    return -EPERM;
 	mdio_write(mii_addr, data[0], data[1] & 0x1f, data[2]);
 	return 0;
     }
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 9ef1c1b..8996b45 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -1204,8 +1204,6 @@ static int ei_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	data[3] = mdio_read(mii_addr, data[0], data[1] & 0x1f);
 	return 0;
     case SIOCSMIIREG:		/* Write MII PHY register. */
-	if (!capable(CAP_NET_ADMIN))
-	    return -EPERM;
 	mdio_write(mii_addr, data[0], data[1] & 0x1f, data[2]);
 	return 0;
     }
diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c
index 68de891..9709dd1 100644
--- a/drivers/net/pcmcia/xirc2ps_cs.c
+++ b/drivers/net/pcmcia/xirc2ps_cs.c
@@ -1575,8 +1575,6 @@ do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	data[3] = mii_rd(ioaddr, data[0] & 0x1f, data[1] & 0x1f);
 	break;
       case SIOCSMIIREG:		/* Write the specified MII register */
-	if (!capable(CAP_NET_ADMIN))
-	    return -EPERM;
 	mii_wr(ioaddr, data[0] & 0x1f, data[1] & 0x1f, data[2], 16);
 	break;
       default:
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index eda94fc..6b71b00 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -324,9 +324,6 @@ int phy_mii_ioctl(struct phy_device *phydev,
 		break;
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		if (mii_data->phy_id == phydev->addr) {
 			switch(mii_data->reg_num) {
 			case MII_BMCR:
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index ec0092a..a91e9b3 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1991,8 +1991,6 @@ static int rtl_xmii_ioctl(struct rtl8169_private *tp, struct mii_ioctl_data *dat
 		return 0;
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		mdio_write(tp->mmio_addr, data->reg_num & 0x1f, data->val_in);
 		return 0;
 	}
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index d882732..97949d0 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -2128,8 +2128,6 @@ static int mii_ioctl(struct net_device *net_dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		mdio_write(net_dev, data->phy_id & 0x1f, data->reg_num & 0x1f, data->val_in);
 		return 0;
 	default:
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 1a1e685..62e852e 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -2496,9 +2496,6 @@ static int skge_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		spin_lock_bh(&hw->phy_lock);
 		if (hw->chip_id == CHIP_ID_GENESIS)
 			err = xm_phy_write(hw, skge->port, data->reg_num & 0x1f,
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index d1f3b46..ecce1df 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1213,9 +1213,6 @@ static int sky2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		spin_lock_bh(&sky2->phy_lock);
 		err = gm_phy_write(hw, sky2->port, data->reg_num & 0x1f,
 				   data->val_in);
diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index e0dfdd2..305ec3d 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -2852,9 +2852,7 @@ static int gem_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN))
-			rc = -EPERM;
-		else if (!gp->running)
+		if (!gp->running)
 			rc = -EAGAIN;
 		else {
 			__phy_write(gp, data->phy_id & 0x1f, data->reg_num & 0x1f,
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index bd49810..7cf8000 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -10610,9 +10610,6 @@ static int tg3_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (tp->tg3_flags2 & TG3_FLG2_PHY_SERDES)
 			break;			/* We have no PHY */
 
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-
 		if (tp->link_config.phy_is_low_power)
 			return -EAGAIN;
 
diff --git a/drivers/net/tlan.c b/drivers/net/tlan.c
index 49e273b..3d31b47 100644
--- a/drivers/net/tlan.c
+++ b/drivers/net/tlan.c
@@ -1004,8 +1004,6 @@ static int TLan_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 

 	case SIOCSMIIREG:		/* Write MII PHY register. */
-			if (!capable(CAP_NET_ADMIN))
-				return -EPERM;
 			TLan_MiiWriteReg(dev, data->phy_id & 0x1f,
 					 data->reg_num & 0x1f, data->val_in);
 			return 0;
diff --git a/drivers/net/tulip/tulip_core.c b/drivers/net/tulip/tulip_core.c
index b89b73c..6b2330e 100644
--- a/drivers/net/tulip/tulip_core.c
+++ b/drivers/net/tulip/tulip_core.c
@@ -923,8 +923,6 @@ static int private_ioctl (struct net_device *dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable (CAP_NET_ADMIN))
-			return -EPERM;
 		if (regnum & ~0x1f)
 			return -EINVAL;
 		if (data->phy_id == phy) {
diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c
index 3e59397..b38d3b7 100644
--- a/drivers/net/tulip/winbond-840.c
+++ b/drivers/net/tulip/winbond-840.c
@@ -1470,8 +1470,6 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		spin_lock_irq(&np->lock);
 		mdio_write(dev, data->phy_id & 0x1f, data->reg_num & 0x1f, data->val_in);
 		spin_unlock_irq(&np->lock);
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index ced1446..e04e5be 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -2328,14 +2328,10 @@ static int velocity_mii_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd
 		miidata->phy_id = readb(&regs->MIIADR) & 0x1f;
 		break;
 	case SIOCGMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (velocity_mii_read(vptr->mac_regs, miidata->reg_num & 0x1f, &(miidata->val_out)) < 0)
 			return -ETIMEDOUT;
 		break;
 	case SIOCSMIIREG:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		spin_lock_irqsave(&vptr->lock, flags);
 		err = velocity_mii_write(vptr->mac_regs, miidata->reg_num & 0x1f, miidata->val_in);
 		spin_unlock_irqrestore(&vptr->lock, flags);
diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c
index 4987040..40ad0de 100644
--- a/drivers/net/yellowfin.c
+++ b/drivers/net/yellowfin.c
@@ -1365,8 +1365,6 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
 		if (data->phy_id == np->phys[0]) {
 			u16 value = data->val_in;
 			switch (data->reg_num) {

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH 1/3] netdev: Remove SIOCDEVPRIVATE aliases for MDIO ioctls
From: Ben Hutchings @ 2009-09-03 20:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

The standard MDIO ioctl numbers are well-established and these should
no longer be needed.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ibm_newemac/core.c |    3 ---
 drivers/net/natsemi.c          |    3 ---
 2 files changed, 0 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index d7579e4..24ae671 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -2218,16 +2218,13 @@ static int emac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 
 	switch (cmd) {
 	case SIOCGMIIPHY:
-	case SIOCDEVPRIVATE:
 		data[0] = dev->phy.address;
 		/* Fall through */
 	case SIOCGMIIREG:
-	case SIOCDEVPRIVATE + 1:
 		data[3] = emac_mdio_read(ndev, dev->phy.address, data[1]);
 		return 0;
 
 	case SIOCSMIIREG:
-	case SIOCDEVPRIVATE + 2:
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 		emac_mdio_write(ndev, dev->phy.address, data[1], data[2]);
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index bd41351..b52e5d9 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -3053,12 +3053,10 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
 	switch(cmd) {
 	case SIOCGMIIPHY:		/* Get address of MII PHY in use. */
-	case SIOCDEVPRIVATE:		/* for binary compat, remove in 2.5 */
 		data->phy_id = np->phy_addr_external;
 		/* Fall Through */
 
 	case SIOCGMIIREG:		/* Read MII PHY register. */
-	case SIOCDEVPRIVATE+1:		/* for binary compat, remove in 2.5 */
 		/* The phy_id is not enough to uniquely identify
 		 * the intended target. Therefore the command is sent to
 		 * the given mii on the current port.
@@ -3077,7 +3075,6 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		return 0;
 
 	case SIOCSMIIREG:		/* Write MII PHY register. */
-	case SIOCDEVPRIVATE+2:		/* for binary compat, remove in 2.5 */
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 		if (dev->if_port == PORT_TP) {

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [PATCH 1/9] task_struct: add PF_NONOTIFY for fanotify to use
From: Eric Paris @ 2009-09-03 20:25 UTC (permalink / raw)
  Cc: linux-kernel, linux-fsdevel, netdev, davem, viro, alan, hch
In-Reply-To: <20090828223626.GA8283@ioremap.net>

> On Fri, Aug 28, 2009 at 02:55:42PM -0400, Eric Paris (eparis@redhat.com) wrote:
> > Since fanotify opens file descriptors inside the kernel for it's listeners
> > it needs a way to make sure that 2 fanotify listeners, both which listen to
> > open events do not continuously see each others open events (and get into a
> > livelock reporting on each other's activity).  This fix is to create a new
> > tast_struct flags called PF_NONOTIFY.  If this flag is set in a task no
> > fanotify events will be generated for that task.   fanotify will set the
> > flag before and open call and will clear it immediately after.

So this patch isn't actually needed, I thought the fsnotify_open() call
was deeper than it is.  I'm going to drop this particular patch.

Can I take silence on the list as a lack of disagreement?  I'd like to
start putting these into linux-next starting tomorrow.  Obviously I'd
like to see the networking definitions approved and taken by davem, I'd
like to see the FMODE_ change approved and taken by viro, but if you two
would like me to push it directly I'd be happy to.

Davem, maybe you'd rather my af_fanotify was somewhere inside net/
instead of in fs/notify/fanotify?  I'd love to get some review here, but
noone onlist is excited enough to step up.  (I have numerous people who
have privately claimed they intend to use this stuff, so i promise it
won't be dead code.)

I'm ready to start committing and adding features, anyone telling me no?

-Eric




^ permalink raw reply

* Re: [PATCH net-next-2.6] bonding: introduce primary_passive option
From: Nicolas de Pesloüan @ 2009-09-03 19:56 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, fubar, bonding-devel
In-Reply-To: <20090901174234.GA3209@psychotron.redhat.com>

Jiri Pirko wrote:
> (updated)
> 
> In some cases there is not desirable to switch back to primary interface when
> it's link recovers and rather stay with currently active one. We need to avoid
> packetloss as much as we can in some cases. This is solved by introducing
> primary_passive option. Note that enslaved primary slave is set as current
> active no matter what.
> 
> This patch depends on the following one:
> [net-next-2.6] bonding: make ab_arp select active slaves as other modes
> http://patchwork.ozlabs.org/patch/32684/
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> 
> diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
> index d5181ce..e70fa8e 100644
> --- a/Documentation/networking/bonding.txt
> +++ b/Documentation/networking/bonding.txt
> @@ -614,6 +614,17 @@ primary
>  
>  	The primary option is only valid for active-backup mode.
>  
> +primary_passive
> +
> +	Specifies the behaviour of the primary slave in case of
> +	it's link recovery has been detected. By default (value 0) the
> +	primary slave is set as active slave immediately after the link
> +	recovery. If the value is 1 or 2 then current active slave doesn't
> +	change as long as it's link status doesn't change. This prevents
> +	the bonding device from flip-flopping. Plus if the value is 1 this
> +	behaviour happens only if the speed and duplex of primary slave is
> +	higher. It the value is 2 then it happens everytime.
> +

May I suggest the following option name and description instead ?

-----

primary_return

	Specifies the behavior of the current active slave when the primary was
	down and comes back up. This option is designed to prevent flip-flopping
	between the primary slave and other slaves. The possible values and
	their respective effects are:

	alway or 0 (default)

		The primary slave becomes the active slave whenever it comes
		back up.

	better or 1

		The primary slave becomes the active slave when it comes back
		up, if the speed and duplex of the primary slave is better
		than the speed and duplex of the current active slave.

	failure_only or 2

		The primary slave becomes the active slave only if the current
		active slave fails and the primary slave is up.

	When no slave are active, if the primary comes back up, it becomes the
	active slave, regardless of the value of primary_return.

-----

Then, to allow those logical names, I suggest you use the bond_parse_parm() 
function and the following constants and struct, to parse module params and 
sysfs configuration.

enum {
         BOND_PRI_RETURN_ALWAYS = 0,
         BOND_PRI_RETURN_BETTER = 1,
         BOND_PRI_RETURN_FAILURE_ONLY = 2,
};

const struct bond_parm_tbl bond_pri_return_tbl[] = {
{       "always",         BOND_PRI_RETURN_ALWAYS},
{       "better",         BOND_PRI_RETURN_BETTER},
{       "failure_only",   BOND_PRI_RETURN_FAILURE_ONLY},
{       NULL,           -1},
};

	Nicolas.

>  updelay
>  
>  	Specifies the time, in milliseconds, to wait before enabling a
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 699bfdd..65066c1 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -94,6 +94,7 @@ static int downdelay;
>  static int use_carrier	= 1;
>  static char *mode;
>  static char *primary;
> +static int primary_passive;
>  static char *lacp_rate;
>  static char *ad_select;
>  static char *xmit_hash_policy;
> @@ -126,6 +127,9 @@ MODULE_PARM_DESC(mode, "Mode of operation : 0 for balance-rr, "
>  		       "6 for balance-alb");
>  module_param(primary, charp, 0);
>  MODULE_PARM_DESC(primary, "Primary network device to use");
> +module_param(primary_passive, int, 0);
> +MODULE_PARM_DESC(primary_passive, "Do not set primary slave active once it comes up; "
> +			       "0 for off (default), 1 for on only if speed of primary is not higher, 2 for on");
>  module_param(lacp_rate, charp, 0);
>  MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner "
>  			    "(slow/fast)");
> @@ -1070,6 +1074,25 @@ out:
>  
>  }
>  
> +static bool bond_should_loose_active(struct bonding *bond)
> +{
> +	struct slave *prim = bond->primary_slave;
> +	struct slave *curr = bond->curr_active_slave;
> +
> +	if (!prim || !curr || curr->link != BOND_LINK_UP)
> +		return true;
> +	if (bond->force_primary) {
> +		bond->force_primary = false;
> +		return true;
> +	}
> +	if (bond->params.primary_passive == 1 &&
> +	    (prim->speed < curr->speed ||
> +	     (prim->speed == curr->speed && prim->duplex <= curr->duplex)))
> +		return false;
> +	if (bond->params.primary_passive == 2)
> +		return false;
> +	return true;
> +}
>  
>  /**
>   * find_best_interface - select the best available slave to be the active one
> @@ -1094,7 +1117,8 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
>  	}
>  
>  	if ((bond->primary_slave) &&
> -	    bond->primary_slave->link == BOND_LINK_UP) {
> +	    bond->primary_slave->link == BOND_LINK_UP &&
> +	    bond_should_loose_active(bond)) {
>  		new_active = bond->primary_slave;
>  	}
>  
> @@ -1675,8 +1699,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>  
>  	if (USES_PRIMARY(bond->params.mode) && bond->params.primary[0]) {
>  		/* if there is a primary slave, remember it */
> -		if (strcmp(bond->params.primary, new_slave->dev->name) == 0)
> +		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
>  			bond->primary_slave = new_slave;
> +			bond->force_primary = true;
> +		}
>  	}
>  
>  	write_lock_bh(&bond->curr_slave_lock);
> @@ -4942,6 +4968,18 @@ static int bond_check_params(struct bond_params *params)
>  		primary = NULL;
>  	}
>  
> +	if (primary) {
> +		if ((primary_passive != 0) && (primary_passive != 1) &&
> +		    (primary_passive != 2)) {
> +			pr_warning(DRV_NAME
> +				   ": Warning: primary_passive module parameter "
> +				   "(%d), not of valid value (0/1/2), so it was "
> +				   "set to 0\n",
> +				   primary_passive);
> +			primary_passive = 0;
> +		}
> +	}
> +
>  	if (fail_over_mac) {
>  		fail_over_mac_value = bond_parse_parm(fail_over_mac,
>  						      fail_over_mac_tbl);
> @@ -4973,6 +5011,7 @@ static int bond_check_params(struct bond_params *params)
>  	params->use_carrier = use_carrier;
>  	params->lacp_fast = lacp_fast;
>  	params->primary[0] = 0;
> +	params->primary_passive = primary_passive;
>  	params->fail_over_mac = fail_over_mac_value;
>  
>  	if (primary) {
> diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
> index 6044e12..e813d48 100644
> --- a/drivers/net/bonding/bond_sysfs.c
> +++ b/drivers/net/bonding/bond_sysfs.c
> @@ -1212,6 +1212,59 @@ static DEVICE_ATTR(primary, S_IRUGO | S_IWUSR,
>  		   bonding_show_primary, bonding_store_primary);
>  
>  /*
> + * Show and set the primary_passive flag.
> + */
> +static ssize_t bonding_show_primary_passive(struct device *d,
> +					    struct device_attribute *attr,
> +					    char *buf)
> +{
> +	struct bonding *bond = to_bond(d);
> +
> +	return sprintf(buf, "%d\n", bond->params.primary_passive);
> +}
> +
> +static ssize_t bonding_store_primary_passive(struct device *d,
> +					     struct device_attribute *attr,
> +					     const char *buf, size_t count)
> +{
> +	int new_value, ret = count;
> +	struct bonding *bond = to_bond(d);
> +
> +	if (!rtnl_trylock())
> +		return restart_syscall();
> +
> +	if (sscanf(buf, "%d", &new_value) != 1) {
> +		pr_err(DRV_NAME
> +		       ": %s: no primary_passive value specified.\n",
> +		       bond->dev->name);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	if (new_value == 0 || new_value == 1 || new_value == 2) {
> +		bond->params.primary_passive = new_value;
> +		pr_info(DRV_NAME ": %s: Setting primary_passive to %d.\n",
> +		       bond->dev->name, new_value);
> +		if (new_value == 0 || new_value == 1) {
> +			bond->force_primary = true;
> +			read_lock(&bond->lock);
> +			write_lock_bh(&bond->curr_slave_lock);
> +			bond_select_active_slave(bond);
> +			write_unlock_bh(&bond->curr_slave_lock);
> +			read_unlock(&bond->lock);
> +		}
> +	} else {
> +		pr_info(DRV_NAME
> +		       ": %s: Ignoring invalid primary_passive value %d.\n",
> +		       bond->dev->name, new_value);
> +	}
> +out:
> +	rtnl_unlock();
> +	return count;
> +}
> +static DEVICE_ATTR(primary_passive, S_IRUGO | S_IWUSR,
> +		   bonding_show_primary_passive, bonding_store_primary_passive);
> +
> +/*
>   * Show and set the use_carrier flag.
>   */
>  static ssize_t bonding_show_carrier(struct device *d,
> @@ -1500,6 +1553,7 @@ static struct attribute *per_bond_attrs[] = {
>  	&dev_attr_num_unsol_na.attr,
>  	&dev_attr_miimon.attr,
>  	&dev_attr_primary.attr,
> +	&dev_attr_primary_passive.attr,
>  	&dev_attr_use_carrier.attr,
>  	&dev_attr_active_slave.attr,
>  	&dev_attr_mii_status.attr,
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index 6290a50..b6287e0 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -131,6 +131,7 @@ struct bond_params {
>  	int lacp_fast;
>  	int ad_select;
>  	char primary[IFNAMSIZ];
> +	int primary_passive;
>  	__be32 arp_targets[BOND_MAX_ARP_TARGETS];
>  };
>  
> @@ -190,6 +191,7 @@ struct bonding {
>  	struct   slave *curr_active_slave;
>  	struct   slave *current_arp_slave;
>  	struct   slave *primary_slave;
> +	bool     force_primary;
>  	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
>  	rwlock_t lock;
>  	rwlock_t curr_slave_lock;
> 


^ permalink raw reply

* Re: Network hangs with 2.6.30.5
From: Holger Hoffstaette @ 2009-09-03 19:55 UTC (permalink / raw)
  To: netdev
In-Reply-To: <4AA0188C.20107@gmail.com>

On Thu, 03 Sep 2009 21:27:08 +0200, Eric Dumazet wrote:

> Holger Hoffstaette a écrit :
>> Problem found! At least for me..
>> 
>>> On 01-09-2009 17:32, Holger Hoffstaette wrote:
>>>> On Tue, 01 Sep 2009 16:17:08 +0200, Holger Hoffstaette wrote:
>>>>
>>>> [network regressions in .30]
>> 
>> I got the git .30.y stable tree and reverted various e1000 commits that
>> seemed to coincide with the various .30-rc releases but nothing helped.
>> Also no relation to offloads etc.
>> 
>> However I did notice that the "stuck squid" problem seemed to magically
>> fix itself after a few seconds - then hang again, fix itself after
>> timeouts etc. So I suspected something TCP related and BINGO!
>> 
>> Turns out I had both tcp_tw_recycle and tcp_tw_reuse set to 1 for
>> reasons I don't want to explain. :)
>> 
>> I can now arbitrarily fix the hanging behaviour by setting
>> tcp_tw_recycle to 0, and cause hangs by setting it to 1 again. For
>> obvious reasons this seems to affect squid more than other tasks with
>> more long-lived connections. What is the right behaviour? beats me.
>> 
>> tcp_tw_reuse does not appear to play a role, so the real culprit at
>> least in my case seems to be tcp_tw_recycle. In previous releases this
>> (and tw_reuse) was necessary for various server tasks.
>> 
>> Nevertheless, something has changed between .29 and .30 that "broke" the
>> previous behaviour. Whether this is progress or an regression I cannot
>> say. Maybe someone else has an idea?
>> 
>> 
> Well... not yet :)
> 
> We probably can reproduce this problem with any NIC...
> 
> Could you send from the 'buggy' setup
> 
> $ grep . /proc/sys/net/ipv4/*

Sure:

root>grep . /proc/sys/net/ipv4/*
grep: /proc/sys/net/ipv4/conf: Invalid argument
/proc/sys/net/ipv4/icmp_echo_ignore_all:0
/proc/sys/net/ipv4/icmp_echo_ignore_broadcasts:1
/proc/sys/net/ipv4/icmp_errors_use_inbound_ifaddr:0
/proc/sys/net/ipv4/icmp_ignore_bogus_error_responses:1
/proc/sys/net/ipv4/icmp_ratelimit:1000
/proc/sys/net/ipv4/icmp_ratemask:6168
/proc/sys/net/ipv4/igmp_max_memberships:20
/proc/sys/net/ipv4/igmp_max_msf:10
/proc/sys/net/ipv4/inet_peer_gc_maxtime:120
/proc/sys/net/ipv4/inet_peer_gc_mintime:10
/proc/sys/net/ipv4/inet_peer_maxttl:600
/proc/sys/net/ipv4/inet_peer_minttl:120
/proc/sys/net/ipv4/inet_peer_threshold:65664
/proc/sys/net/ipv4/ip_default_ttl:64
/proc/sys/net/ipv4/ip_dynaddr:0
/proc/sys/net/ipv4/ip_forward:0
/proc/sys/net/ipv4/ip_local_port_range:32768    61000
/proc/sys/net/ipv4/ip_no_pmtu_disc:0
/proc/sys/net/ipv4/ip_nonlocal_bind:0
/proc/sys/net/ipv4/ipfrag_high_thresh:8388608
/proc/sys/net/ipv4/ipfrag_low_thresh:1048575
/proc/sys/net/ipv4/ipfrag_max_dist:64
/proc/sys/net/ipv4/ipfrag_secret_interval:600
/proc/sys/net/ipv4/ipfrag_time:30
grep: /proc/sys/net/ipv4/neigh: Invalid argument
grep: /proc/sys/net/ipv4/route: Invalid argument
/proc/sys/net/ipv4/rt_cache_rebuild_count:4
/proc/sys/net/ipv4/tcp_abc:0
/proc/sys/net/ipv4/tcp_abort_on_overflow:0
/proc/sys/net/ipv4/tcp_adv_win_scale:2
/proc/sys/net/ipv4/tcp_allowed_congestion_control:cubic reno
/proc/sys/net/ipv4/tcp_app_win:31
/proc/sys/net/ipv4/tcp_available_congestion_control:cubic reno
/proc/sys/net/ipv4/tcp_base_mss:512
/proc/sys/net/ipv4/tcp_congestion_control:cubic
/proc/sys/net/ipv4/tcp_dsack:1
/proc/sys/net/ipv4/tcp_ecn:0
/proc/sys/net/ipv4/tcp_fack:1
/proc/sys/net/ipv4/tcp_fin_timeout:60
/proc/sys/net/ipv4/tcp_frto:2
/proc/sys/net/ipv4/tcp_frto_response:0
/proc/sys/net/ipv4/tcp_keepalive_intvl:75
/proc/sys/net/ipv4/tcp_keepalive_probes:9
/proc/sys/net/ipv4/tcp_keepalive_time:7200
/proc/sys/net/ipv4/tcp_low_latency:0
/proc/sys/net/ipv4/tcp_max_orphans:16384
/proc/sys/net/ipv4/tcp_max_ssthresh:0
/proc/sys/net/ipv4/tcp_max_syn_backlog:1024
/proc/sys/net/ipv4/tcp_max_tw_buckets:180000
/proc/sys/net/ipv4/tcp_mem:82944        110592  165888
/proc/sys/net/ipv4/tcp_moderate_rcvbuf:1
/proc/sys/net/ipv4/tcp_mtu_probing:0
/proc/sys/net/ipv4/tcp_no_metrics_save:0
/proc/sys/net/ipv4/tcp_orphan_retries:0
/proc/sys/net/ipv4/tcp_reordering:3
/proc/sys/net/ipv4/tcp_retrans_collapse:1
/proc/sys/net/ipv4/tcp_retries1:3
/proc/sys/net/ipv4/tcp_retries2:15
/proc/sys/net/ipv4/tcp_rfc1337:0
/proc/sys/net/ipv4/tcp_rmem:4096        262144  8388608
/proc/sys/net/ipv4/tcp_sack:1
/proc/sys/net/ipv4/tcp_slow_start_after_idle:1
/proc/sys/net/ipv4/tcp_stdurg:0
/proc/sys/net/ipv4/tcp_syn_retries:5
/proc/sys/net/ipv4/tcp_synack_retries:5
/proc/sys/net/ipv4/tcp_timestamps:1
/proc/sys/net/ipv4/tcp_tso_win_divisor:3
/proc/sys/net/ipv4/tcp_tw_recycle:1
/proc/sys/net/ipv4/tcp_tw_reuse:1
/proc/sys/net/ipv4/tcp_window_scaling:1
/proc/sys/net/ipv4/tcp_wmem:4096        262144  8388608
/proc/sys/net/ipv4/tcp_workaround_signed_windows:0
/proc/sys/net/ipv4/udp_mem:82944        110592  165888
/proc/sys/net/ipv4/udp_rmem_min:4096
/proc/sys/net/ipv4/udp_wmem_min:4096
root> 

> When you say squid is stuck, does it mean it doesnt accept new connections
> ?

Yes, that seems to be the behaviour. To verify - I just browse any random
website and after a few requests the browser does not get any replies any
longer, timeouts etc. After setting tw_recycle to 0 it immediately starts
working again (and then continues to work).

Took a bit to find out in which direction things were hanging/getting
stuck, since it was not immediately clear from simply observing the client.

> Could help to strace it and check what it is doing ? --

After a quite frantic start (endless amount of 54 byte writes??) it sits
in its epoll loop and waits:

..
epoll_wait(4, {}, 8192, 1000)           = 0
gettimeofday({1252007105, 637264}, NULL) = 0
gettimeofday({1252007105, 637309}, NULL) = 0
epoll_wait(4, {}, 8192, 1000)           = 0
gettimeofday({1252007106, 637262}, NULL) = 0
gettimeofday({1252007106, 637308}, NULL) = 0
epoll_wait(4, {}, 8192, 0)              = 0
gettimeofday({1252007106, 637389}, NULL) = 0
gettimeofday({1252007106, 637421}, NULL) = 0
epoll_wait(4, {}, 8192, 1000)           = 0
gettimeofday({1252007107, 637266}, NULL) = 0
gettimeofday({1252007107, 637311}, NULL) = 0
..etc..

and occasionally diddles with its cache directory:

..
epoll_wait(4, {}, 8192, 997)            = 0
gettimeofday({1252007148, 554039}, NULL) = 0
gettimeofday({1252007148, 554097}, NULL) = 0
open("/var/cache/squid/02/09", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_CLOEXEC) = 9
getdents64(9, /* 113 entries */, 32768) = 3600
getdents64(9, /* 0 entries */, 32768)   = 0
close(9)                                = 0
epoll_wait(4, {}, 8192, 0)              = 0
gettimeofday({1252007148, 555923}, NULL) = 0
..

When requests come in, it does whatever squid does, but when it hangs, the
following popped up:

gettimeofday({1252007207, 116223}, NULL) = 0
epoll_ctl(4, EPOLL_CTL_MOD, 23, {EPOLLIN|EPOLLERR|EPOLLHUP, {u32=23, u64=582647025628086295}}) = 0
gettimeofday({1252007207, 116380}, NULL) = 0
epoll_wait(4, {{EPOLLIN, {u32=15, u64=582647025628086287}}}, 8192, 359) = 1
gettimeofday({1252007207, 117309}, NULL) = 0
read(15, "48,33,48,31,40,31\" href=\"/2/hi/af"..., 2046) = 581
gettimeofday({1252007207, 117476}, NULL) = 0
epoll_ctl(4, EPOLL_CTL_MOD, 23, {EPOLLIN|EPOLLOUT|EPOLLERR|EPOLLHUP, {u32=23, u64=23}}) = 0
epoll_wait(4, {{EPOLLIN, {u32=15, u64=582647025628086287}}, {EPOLLOUT, {u32=23, u64=23}}}, 8192, 358) = 2
gettimeofday({1252007207, 117790}, NULL) = 0
read(15, ""..., 2046)                   = 0
write(23, "48,33,48,31,40,31\" href=\"/2/hi/af"..., 581) = 581
gettimeofday({1252007207, 118032}, NULL) = 0
read(15, ""..., 65535)                  = 0
read(23, 0xbf925401, 65535)             = -1 EAGAIN (Resource temporarily unavailable)
epoll_ctl(4, EPOLL_CTL_DEL, 15, {0, {u32=15, u64=15}}) = 0
close(15)                               = 0
epoll_ctl(4, EPOLL_CTL_MOD, 23, {EPOLLOUT|EPOLLERR|EPOLLHUP, {u32=23, u64=581683939341500439}}) = 0
epoll_ctl(4, EPOLL_CTL_DEL, 23, {0, {u32=23, u64=581683939341500439}}) = 0
close(23)                               = 0
epoll_wait(4, {}, 8192, 357)            = 0
gettimeofday({1252007207, 477363}, NULL) = 0
gettimeofday({1252007207, 477408}, NULL) = 0
epoll_wait(4, {}, 8192, 0)              = 0
gettimeofday({1252007207, 477561}, NULL) = 0
gettimeofday({1252007207, 477629}, NULL) = 0
epoll_wait(4, {}, 8192, 449)            = 0

Then it again spins in the epoll loop for a while until it recovers and
starts to handle the outstanding requests (as far as I can read - it
scrolls by too quickly).

Was that somewhat helpful? I can certainly create a full trace but that's
going to be big.

regards
Holger



^ permalink raw reply

* Re: [PATCH] slub: Fix kmem_cache_destroy() with SLAB_DESTROY_BY_RCU
From: Eric Dumazet @ 2009-09-03 19:56 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, Zdenek Kabelac, Patrick McHardy, Robin Holt,
	Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers, paulmck,
	stable@kernel.org
In-Reply-To: <84144f020909031248s56c16205j7992930c413b1bbe@mail.gmail.com>

Pekka Enberg a écrit :
> On Thu, Sep 3, 2009 at 5:18 PM, Eric Dumazet<eric.dumazet@gmail.com> wrote:
>> Here is the second patch (RCU thing). Stable candidate
>>
>> [PATCH] slub: Fix kmem_cache_destroy() with SLAB_DESTROY_BY_RCU
>>
>> kmem_cache_destroy() should call rcu_barrier() *after* kmem_cache_close()
>> and *before* sysfs_slab_remove() or risk rcu_free_slab()
>> being called after kmem_cache is deleted (kfreed).
>>
>> rmmod nf_conntrack can crash the machine because it has to
>> kmem_cache_destroy() a SLAB_DESTROY_BY_RCU enabled cache.
> 
> Do we have a bugzilla URL for this?

Well, I can crash my 2.6.30.5 machine just doing rmmod nf_conntrack

(You'll need CONFIG_SLUB_DEBUG_ON or equivalent)

Original Zdenek report : http://thread.gmane.org/gmane.linux.kernel/876016/focus=876086

> 
>> Reported-by: Zdenek Kabelac <zdenek.kabelac@gmail.com>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> OK, this is in for-next now and queued for 2.6.31. If you guys want to
> fix this in a different way, lets do that in 2.6.32.

Seems the right thing IMHO
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox