Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next-2.6] bonding: introduce primary_passive option
From: Jiri Pirko @ 2009-09-01 17:42 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel, nicolas.2p.debian

(updated)

In some cases there is not desirable to switch back to primary interface when
it's link recovers and rather stay with currently active one. We need to avoid
packetloss as much as we can in some cases. This is solved by introducing
primary_passive option. Note that enslaved primary slave is set as current
active no matter what.

This patch depends on the following one:
[net-next-2.6] bonding: make ab_arp select active slaves as other modes
http://patchwork.ozlabs.org/patch/32684/

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index d5181ce..e70fa8e 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -614,6 +614,17 @@ primary
 
 	The primary option is only valid for active-backup mode.
 
+primary_passive
+
+	Specifies the behaviour of the primary slave in case of
+	it's link recovery has been detected. By default (value 0) the
+	primary slave is set as active slave immediately after the link
+	recovery. If the value is 1 or 2 then current active slave doesn't
+	change as long as it's link status doesn't change. This prevents
+	the bonding device from flip-flopping. Plus if the value is 1 this
+	behaviour happens only if the speed and duplex of primary slave is
+	higher. It the value is 2 then it happens everytime.
+
 updelay
 
 	Specifies the time, in milliseconds, to wait before enabling a
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 699bfdd..65066c1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -94,6 +94,7 @@ static int downdelay;
 static int use_carrier	= 1;
 static char *mode;
 static char *primary;
+static int primary_passive;
 static char *lacp_rate;
 static char *ad_select;
 static char *xmit_hash_policy;
@@ -126,6 +127,9 @@ MODULE_PARM_DESC(mode, "Mode of operation : 0 for balance-rr, "
 		       "6 for balance-alb");
 module_param(primary, charp, 0);
 MODULE_PARM_DESC(primary, "Primary network device to use");
+module_param(primary_passive, int, 0);
+MODULE_PARM_DESC(primary_passive, "Do not set primary slave active once it comes up; "
+			       "0 for off (default), 1 for on only if speed of primary is not higher, 2 for on");
 module_param(lacp_rate, charp, 0);
 MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner "
 			    "(slow/fast)");
@@ -1070,6 +1074,25 @@ out:
 
 }
 
+static bool bond_should_loose_active(struct bonding *bond)
+{
+	struct slave *prim = bond->primary_slave;
+	struct slave *curr = bond->curr_active_slave;
+
+	if (!prim || !curr || curr->link != BOND_LINK_UP)
+		return true;
+	if (bond->force_primary) {
+		bond->force_primary = false;
+		return true;
+	}
+	if (bond->params.primary_passive == 1 &&
+	    (prim->speed < curr->speed ||
+	     (prim->speed == curr->speed && prim->duplex <= curr->duplex)))
+		return false;
+	if (bond->params.primary_passive == 2)
+		return false;
+	return true;
+}
 
 /**
  * find_best_interface - select the best available slave to be the active one
@@ -1094,7 +1117,8 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
 	}
 
 	if ((bond->primary_slave) &&
-	    bond->primary_slave->link == BOND_LINK_UP) {
+	    bond->primary_slave->link == BOND_LINK_UP &&
+	    bond_should_loose_active(bond)) {
 		new_active = bond->primary_slave;
 	}
 
@@ -1675,8 +1699,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 	if (USES_PRIMARY(bond->params.mode) && bond->params.primary[0]) {
 		/* if there is a primary slave, remember it */
-		if (strcmp(bond->params.primary, new_slave->dev->name) == 0)
+		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
 			bond->primary_slave = new_slave;
+			bond->force_primary = true;
+		}
 	}
 
 	write_lock_bh(&bond->curr_slave_lock);
@@ -4942,6 +4968,18 @@ static int bond_check_params(struct bond_params *params)
 		primary = NULL;
 	}
 
+	if (primary) {
+		if ((primary_passive != 0) && (primary_passive != 1) &&
+		    (primary_passive != 2)) {
+			pr_warning(DRV_NAME
+				   ": Warning: primary_passive module parameter "
+				   "(%d), not of valid value (0/1/2), so it was "
+				   "set to 0\n",
+				   primary_passive);
+			primary_passive = 0;
+		}
+	}
+
 	if (fail_over_mac) {
 		fail_over_mac_value = bond_parse_parm(fail_over_mac,
 						      fail_over_mac_tbl);
@@ -4973,6 +5011,7 @@ static int bond_check_params(struct bond_params *params)
 	params->use_carrier = use_carrier;
 	params->lacp_fast = lacp_fast;
 	params->primary[0] = 0;
+	params->primary_passive = primary_passive;
 	params->fail_over_mac = fail_over_mac_value;
 
 	if (primary) {
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 6044e12..e813d48 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -1212,6 +1212,59 @@ static DEVICE_ATTR(primary, S_IRUGO | S_IWUSR,
 		   bonding_show_primary, bonding_store_primary);
 
 /*
+ * Show and set the primary_passive flag.
+ */
+static ssize_t bonding_show_primary_passive(struct device *d,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	return sprintf(buf, "%d\n", bond->params.primary_passive);
+}
+
+static ssize_t bonding_store_primary_passive(struct device *d,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	int new_value, ret = count;
+	struct bonding *bond = to_bond(d);
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (sscanf(buf, "%d", &new_value) != 1) {
+		pr_err(DRV_NAME
+		       ": %s: no primary_passive value specified.\n",
+		       bond->dev->name);
+		ret = -EINVAL;
+		goto out;
+	}
+	if (new_value == 0 || new_value == 1 || new_value == 2) {
+		bond->params.primary_passive = new_value;
+		pr_info(DRV_NAME ": %s: Setting primary_passive to %d.\n",
+		       bond->dev->name, new_value);
+		if (new_value == 0 || new_value == 1) {
+			bond->force_primary = true;
+			read_lock(&bond->lock);
+			write_lock_bh(&bond->curr_slave_lock);
+			bond_select_active_slave(bond);
+			write_unlock_bh(&bond->curr_slave_lock);
+			read_unlock(&bond->lock);
+		}
+	} else {
+		pr_info(DRV_NAME
+		       ": %s: Ignoring invalid primary_passive value %d.\n",
+		       bond->dev->name, new_value);
+	}
+out:
+	rtnl_unlock();
+	return count;
+}
+static DEVICE_ATTR(primary_passive, S_IRUGO | S_IWUSR,
+		   bonding_show_primary_passive, bonding_store_primary_passive);
+
+/*
  * Show and set the use_carrier flag.
  */
 static ssize_t bonding_show_carrier(struct device *d,
@@ -1500,6 +1553,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_num_unsol_na.attr,
 	&dev_attr_miimon.attr,
 	&dev_attr_primary.attr,
+	&dev_attr_primary_passive.attr,
 	&dev_attr_use_carrier.attr,
 	&dev_attr_active_slave.attr,
 	&dev_attr_mii_status.attr,
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6290a50..b6287e0 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -131,6 +131,7 @@ struct bond_params {
 	int lacp_fast;
 	int ad_select;
 	char primary[IFNAMSIZ];
+	int primary_passive;
 	__be32 arp_targets[BOND_MAX_ARP_TARGETS];
 };
 
@@ -190,6 +191,7 @@ struct bonding {
 	struct   slave *curr_active_slave;
 	struct   slave *current_arp_slave;
 	struct   slave *primary_slave;
+	bool     force_primary;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;

^ permalink raw reply related

* Trouble with mdio- gpio driver.
From: Александр Дориф @ 2009-09-01 15:43 UTC (permalink / raw)
  To: netdev

Hello! I compiled Linux kernel 2.6.31-rc8 an get this warning:
WARNING: drivers/net/phy/mdio-gpio.o(.devexit.text+0x1c): Section mismatch
in reference from the function mdio_gpio_remove() to the function
.devinit.text:mdio_gpio_bus_deinit()
The function __devexit mdio_gpio_remove() references
a function __devinit mdio_gpio_bus_deinit().
This is often seen when error handling in the exit function

I hope it will be corrected soon and won't appear in release kernel.


^ permalink raw reply

* Re: netconsole as normal console for userspace messages?
From: Matt Mackall @ 2009-09-01 17:34 UTC (permalink / raw)
  To: Arkadiusz Miskiewicz; +Cc: Jarek Poplawski, netdev
In-Reply-To: <200909011800.40243.a.miskiewicz@gmail.com>

On Tue, 2009-09-01 at 18:00 +0200, Arkadiusz Miskiewicz wrote:
> On Tuesday 01 of September 2009, Matt Mackall wrote:
> > On Tue, 2009-09-01 at 10:16 +0000, Jarek Poplawski wrote:
> > > On 30-08-2009 02:12, Arkadiusz Miskiewicz wrote:
> > > > Hi,
> > >
> > > Hi,
> > >
> > > > Can netconsole be somehow used to see userspace messages, too?
> > > > Something similar to console=ttySX... just =netconsole.
> > >
> > > AFAIK it can't, but AFAICR some time ago the netconsole maintainer
> > > gave some hope to some Debian guy with a similar question.
> >
> > It is, as they say, just a simple matter of programming to get a full
> > bidirectional console out of it. As cool as that would be, I just
> > haven't had time for it.
> 
> Unidirectional console would be enough for me but it seems that netconsole 
> isn't capable even for that (userspace messages).

Yes. What's needed is to add a low-level tty interface and register it
with the tty core. Most of that is attaching the tty_ops->write() method
to netpoll's send_msg in the obvious way. Just about all the other
methods are optional or trivial. 

Once you've got unidirectional working, bidirectional is approximately
one line of additional code to attach the read side: call
tty_insert_flip_string(my_tty, buf, len) in netpoll's receive function.

-- 
http://selenic.com : development and support for Mercurial and Linux



^ permalink raw reply

* Re: neighbour table RCU
From: Octavian Purdila @ 2009-09-01 16:56 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, Lucian Adrian Grijincu, netdev
In-Reply-To: <4A9D486D.4020408@gmail.com>

On Tuesday 01 September 2009 19:14:37 Eric Dumazet wrote:
> Octavian Purdila a écrit :
> > On Tuesday 01 September 2009 09:50:17 Eric Dumazet wrote:
> >> Stephen Hemminger a écrit :
> >>> Looking at the neighbour table, it should be possible to get
> >>> rid of the two reader/writer locks.  The hash table lock is pretty
> >>> amenable to RCU, but the dynamic resizing makes it non-trivial.
> >>> Thinking of using a combination of RCU and sequence counts so that the
> >>> reader would just rescan if resize was in progress.
> >>
> >> I am not sure neigh_tbl_lock rwlock should be changed, I did not
> >> see any contention on it.
> >
> > Speaking about neighbour optimizations, here is a RFC patch which makes
> > the tables double linked, for constant time deletion. It has given us a
> > significant performance improvement - in less then usual setups though,
> > with lots of neighbours.
>
> How many "struct neigh_parms" do you have in your setups, and
> what is the frequency of neigh_parms_release() calls ???
>

Each L3 interface part (at least for IPv4/IPv6 and I see DecNet as well) has a 
neigh_parms associated. And we use up to 128K interfaces in certain tests 
scenarios ;)

In our case the interfaces are created/destroyed dynamically, so when using a 
large number of interfaces the test cleanup takes forever without this (and 
other) patch(es).

Thanks,
tavi

^ permalink raw reply

* [PATCH net-next-2.6] bonding: use compare_ether_addr_64bits() in ALB
From: Eric Dumazet @ 2009-09-01 16:31 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List, Jay Vosburgh

We can speedup ether addresses compares using compare_ether_addr_64bits()
instead of memcmp(). We make sure all operands are at least 8 bytes long and
16bits aligned (or better, long word aligned if possible)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/bonding/bond_alb.c |   71 ++++++++++++++++---------------
 drivers/net/bonding/bonding.h  |    2
 2 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 2108706..9b5936f 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -79,8 +79,15 @@
  */
 #define RLB_PROMISC_TIMEOUT	10*ALB_TIMER_TICKS_PER_SEC
 
-static const u8 mac_bcast[ETH_ALEN] = {0xff,0xff,0xff,0xff,0xff,0xff};
-static const u8 mac_v6_allmcast[ETH_ALEN] = {0x33,0x33,0x00,0x00,0x00,0x01};
+#ifndef __long_aligned
+#define __long_aligned __attribute__((aligned((sizeof(long)))))
+#endif
+static const u8 mac_bcast[ETH_ALEN] __long_aligned = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+static const u8 mac_v6_allmcast[ETH_ALEN] __long_aligned = {
+	0x33, 0x33, 0x00, 0x00, 0x00, 0x01
+};
 static const int alb_delta_in_ticks = HZ / ALB_TIMER_TICKS_PER_SEC;
 
 #pragma pack(1)
@@ -460,8 +467,8 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 
 			if (assigned_slave) {
 				rx_hash_table[index].slave = assigned_slave;
-				if (memcmp(rx_hash_table[index].mac_dst,
-					   mac_bcast, ETH_ALEN)) {
+				if (compare_ether_addr_64bits(rx_hash_table[index].mac_dst,
+							      mac_bcast)) {
 					bond_info->rx_hashtbl[index].ntt = 1;
 					bond_info->rx_ntt = 1;
 					/* A slave has been removed from the
@@ -575,7 +582,7 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
-		    memcmp(client_info->mac_dst, mac_bcast, ETH_ALEN)) {
+		    compare_ether_addr_64bits(client_info->mac_dst, mac_bcast)) {
 			client_info->ntt = 1;
 			ntt = 1;
 		}
@@ -616,9 +623,9 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 		 * unicast mac address.
 		 */
 		if ((client_info->ip_src == src_ip) &&
-		    memcmp(client_info->slave->dev->dev_addr,
-			   bond->dev->dev_addr, ETH_ALEN) &&
-		    memcmp(client_info->mac_dst, mac_bcast, ETH_ALEN)) {
+		    compare_ether_addr_64bits(client_info->slave->dev->dev_addr,
+			   bond->dev->dev_addr) &&
+		    compare_ether_addr_64bits(client_info->mac_dst, mac_bcast)) {
 			client_info->ntt = 1;
 			bond_info->rx_ntt = 1;
 		}
@@ -645,7 +652,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		if ((client_info->ip_src == arp->ip_src) &&
 		    (client_info->ip_dst == arp->ip_dst)) {
 			/* the entry is already assigned to this client */
-			if (memcmp(arp->mac_dst, mac_bcast, ETH_ALEN)) {
+			if (compare_ether_addr_64bits(arp->mac_dst, mac_bcast)) {
 				/* update mac address from arp */
 				memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN);
 			}
@@ -680,7 +687,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN);
 		client_info->slave = assigned_slave;
 
-		if (memcmp(client_info->mac_dst, mac_bcast, ETH_ALEN)) {
+		if (compare_ether_addr_64bits(client_info->mac_dst, mac_bcast)) {
 			client_info->ntt = 1;
 			bond->alb_info.rx_ntt = 1;
 		} else {
@@ -1046,21 +1053,18 @@ static void alb_change_hw_addr_on_detach(struct bonding *bond, struct slave *sla
 	int perm_curr_diff;
 	int perm_bond_diff;
 
-	perm_curr_diff = memcmp(slave->perm_hwaddr,
-				slave->dev->dev_addr,
-				ETH_ALEN);
-	perm_bond_diff = memcmp(slave->perm_hwaddr,
-				bond->dev->dev_addr,
-				ETH_ALEN);
+	perm_curr_diff = compare_ether_addr_64bits(slave->perm_hwaddr,
+						   slave->dev->dev_addr);
+	perm_bond_diff = compare_ether_addr_64bits(slave->perm_hwaddr,
+						   bond->dev->dev_addr);
 
 	if (perm_curr_diff && perm_bond_diff) {
 		struct slave *tmp_slave;
 		int i, found = 0;
 
 		bond_for_each_slave(bond, tmp_slave, i) {
-			if (!memcmp(slave->perm_hwaddr,
-				    tmp_slave->dev->dev_addr,
-				    ETH_ALEN)) {
+			if (!compare_ether_addr_64bits(slave->perm_hwaddr,
+						       tmp_slave->dev->dev_addr)) {
 				found = 1;
 				break;
 			}
@@ -1114,10 +1118,10 @@ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slav
 	 * check uniqueness of slave's mac address against the other
 	 * slaves in the bond.
 	 */
-	if (memcmp(slave->perm_hwaddr, bond->dev->dev_addr, ETH_ALEN)) {
+	if (compare_ether_addr_64bits(slave->perm_hwaddr, bond->dev->dev_addr)) {
 		bond_for_each_slave(bond, tmp_slave1, i) {
-			if (!memcmp(tmp_slave1->dev->dev_addr, slave->dev->dev_addr,
-				    ETH_ALEN)) {
+			if (!compare_ether_addr_64bits(tmp_slave1->dev->dev_addr,
+						       slave->dev->dev_addr)) {
 				found = 1;
 				break;
 			}
@@ -1140,9 +1144,8 @@ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slav
 	bond_for_each_slave(bond, tmp_slave1, i) {
 		found = 0;
 		bond_for_each_slave(bond, tmp_slave2, j) {
-			if (!memcmp(tmp_slave1->perm_hwaddr,
-				    tmp_slave2->dev->dev_addr,
-				    ETH_ALEN)) {
+			if (!compare_ether_addr_64bits(tmp_slave1->perm_hwaddr,
+						       tmp_slave2->dev->dev_addr)) {
 				found = 1;
 				break;
 			}
@@ -1157,9 +1160,8 @@ static int alb_handle_addr_collision_on_attach(struct bonding *bond, struct slav
 		}
 
 		if (!has_bond_addr) {
-			if (!memcmp(tmp_slave1->dev->dev_addr,
-				    bond->dev->dev_addr,
-				    ETH_ALEN)) {
+			if (!compare_ether_addr_64bits(tmp_slave1->dev->dev_addr,
+						       bond->dev->dev_addr)) {
 
 				has_bond_addr = tmp_slave1;
 			}
@@ -1313,7 +1315,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	case ETH_P_IP: {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if ((memcmp(eth_data->h_dest, mac_bcast, ETH_ALEN) == 0) ||
+		if (!compare_ether_addr_64bits(eth_data->h_dest, mac_bcast) ||
 		    (iph->daddr == ip_bcast) ||
 		    (iph->protocol == IPPROTO_IGMP)) {
 			do_tx_balance = 0;
@@ -1327,7 +1329,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		/* IPv6 doesn't really use broadcast mac address, but leave
 		 * that here just in case.
 		 */
-		if (memcmp(eth_data->h_dest, mac_bcast, ETH_ALEN) == 0) {
+		if (!compare_ether_addr_64bits(eth_data->h_dest, mac_bcast)) {
 			do_tx_balance = 0;
 			break;
 		}
@@ -1335,7 +1337,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		/* IPv6 uses all-nodes multicast as an equivalent to
 		 * broadcasts in IPv4.
 		 */
-		if (memcmp(eth_data->h_dest, mac_v6_allmcast, ETH_ALEN) == 0) {
+		if (!compare_ether_addr_64bits(eth_data->h_dest, mac_v6_allmcast)) {
 			do_tx_balance = 0;
 			break;
 		}
@@ -1660,8 +1662,8 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave
 		struct slave *tmp_slave;
 		/* find slave that is holding the bond's mac address */
 		bond_for_each_slave(bond, tmp_slave, i) {
-			if (!memcmp(tmp_slave->dev->dev_addr,
-				    bond->dev->dev_addr, ETH_ALEN)) {
+			if (!compare_ether_addr_64bits(tmp_slave->dev->dev_addr,
+						       bond->dev->dev_addr)) {
 				swap_slave = tmp_slave;
 				break;
 			}
@@ -1739,7 +1741,8 @@ int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr)
 	swap_slave = NULL;
 
 	bond_for_each_slave(bond, slave, i) {
-		if (!memcmp(slave->dev->dev_addr, bond_dev->dev_addr, ETH_ALEN)) {
+		if (!compare_ether_addr_64bits(slave->dev->dev_addr,
+					       bond_dev->dev_addr)) {
 			swap_slave = slave;
 			break;
 		}
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6290a50..6824771 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -163,9 +163,9 @@ struct slave {
 	u32    original_flags;
 	u32    original_mtu;
 	u32    link_failure_count;
+	u8     perm_hwaddr[ETH_ALEN];
 	u16    speed;
 	u8     duplex;
-	u8     perm_hwaddr[ETH_ALEN];
 	struct ad_slave_info ad_info; /* HUGE - better to dynamically alloc */
 	struct tlb_slave_info tlb_info;
 };

^ permalink raw reply related

* Re: neighbour table RCU
From: Stephen Hemminger @ 2009-09-01 16:23 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Eric Dumazet, Lucian Adrian Grijincu, netdev
In-Reply-To: <200909011855.34175.opurdila@ixiacom.com>

On Tue, 1 Sep 2009 18:55:34 +0300
Octavian Purdila <opurdila@ixiacom.com> wrote:

> On Tuesday 01 September 2009 09:50:17 Eric Dumazet wrote:
> > Stephen Hemminger a écrit :
> > > Looking at the neighbour table, it should be possible to get
> > > rid of the two reader/writer locks.  The hash table lock is pretty
> > > amenable to RCU, but the dynamic resizing makes it non-trivial.
> > > Thinking of using a combination of RCU and sequence counts so that the
> > > reader would just rescan if resize was in progress.
> >
> > I am not sure neigh_tbl_lock rwlock should be changed, I did not
> > see any contention on it.
> >
> 
> Speaking about neighbour optimizations, here is a RFC patch which makes the 
> tables double linked, for constant time deletion. It has given us a significant 
> performance improvement - in less then usual setups though, with lots of 
> neighbours.
> 
> Would something like this be acceptable for upstream? (pardon the p4 diff dump 
> :) - but I think it will give a rough idea, if acceptable will clean it up and 
> properly submit it)
> 
> BTW, would switching to list_head be better?

Use hlist for the neighbour table. It has the right properties
and makes future RCU easier.

^ permalink raw reply

* Re: [PATCH 19/19] netdev: convert bulk of drivers to netdev_tx_t
From: David Dillow @ 2009-09-01 16:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20090901055130.375691503@vyatta.com>

On Mon, 2009-08-31 at 22:50 -0700, Stephen Hemminger wrote:
>  drivers/net/typhoon.c                |    2 +-

For what it's worth (ie, not much), typhoon bits
Acked-by: David Dillow <dave@thedillows.gov>

^ permalink raw reply

* [PATCH] WARNING: some request_irq() failures ignored in el2_open()
From: Roel Kluin @ 2009-09-01 16:24 UTC (permalink / raw)
  To: netdev, Andrew Morton, David S. Miller

Request_irq() may fail in different ways, handle accordingly.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
---
Don't we have to do something like this? 

diff --git a/drivers/net/3c503.c b/drivers/net/3c503.c
index 134638a..81c148a 100644
--- a/drivers/net/3c503.c
+++ b/drivers/net/3c503.c
@@ -383,7 +383,7 @@ out:
 static int
 el2_open(struct net_device *dev)
 {
-    int retval = -EAGAIN;
+    int retval;
 
     if (dev->irq < 2) {
 	int irqlist[] = {5, 9, 3, 4, 0};
@@ -391,7 +391,8 @@ el2_open(struct net_device *dev)
 
 	outb(EGACFR_NORM, E33G_GACFR);	/* Enable RAM and interrupts. */
 	do {
-	    if (request_irq (*irqp, NULL, 0, "bogus", dev) != -EBUSY) {
+	    retval = request_irq(*irqp, NULL, 0, "bogus", dev);
+	    if (retval >= 0) {
 		/* Twinkle the interrupt, and check if it's seen. */
 		unsigned long cookie = probe_irq_on();
 		outb_p(0x04 << ((*irqp == 9) ? 2 : *irqp), E33G_IDCFR);
@@ -400,11 +401,14 @@ el2_open(struct net_device *dev)
 		    && ((retval = request_irq(dev->irq = *irqp,
 		    eip_interrupt, 0, dev->name, dev)) == 0))
 		    break;
+	    } else {
+		    if (retval != -EBUSY)
+			    return reval;
 	    }
 	} while (*++irqp);
 	if (*irqp == 0) {
 	    outb(EGACFR_IRQOFF, E33G_GACFR);	/* disable interrupts. */
-	    return retval;
+	    return -EAGAIN;
 	}
     } else {
 	if ((retval = request_irq(dev->irq, eip_interrupt, 0, dev->name, dev))) {

^ permalink raw reply related

* Re: UDP is bypassing qdisc statistics ....
From: Christoph Lameter @ 2009-09-01 20:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Patrick McHardy, Mark Smith, Jarek Poplawski, netdev
In-Reply-To: <4A9D448D.6050309@gmail.com>

On Tue, 1 Sep 2009, Eric Dumazet wrote:

> For your multicast test anyway, only one queue should be used (one flow)

Right.

I finally get some numbers with the qdisc_stats patch:

clameter@rd-strategy3-deb64:~$ cat /proc/net/qdisc_stats
Type     Device St  Bytes    Packts Qlen Bklg Drops  Requeu Overlimits
TXS root <NULL>   0        0      0    0    0      0      0      0
TX root <NULL>   0        0      0    0    0      0      0      0
RXS root <NULL>   0        0      0    0    0      0      0      0
RX root <NULL>   0        0      0    0    0      0      0      0
TXS root   eth0   0     2778     35    0    0      0      0      0
TX root   eth0   0     2778     35    0    0      0      0      0
TXS root   eth0   0     1622     18    0    0      0      0      0
TX root   eth0   0     1622     18    0    0      0      0      0
TXS root   eth0   0     2618     22    0    0      0      0      0
TX root   eth0   0     2618     22    0    0      0      0      0
TXS root   eth0   0     9792     65    0    0      0      0      0
TX root   eth0   0     9792     65    0    0      0      0      0
TXS root   eth0   0     9758     68    0    0      0      0      0
TX root   eth0   0     9758     68    0    0      0      0      0
TXS root   eth0   0     1413     18    0    0      0      0      0
TX root   eth0   0     1413     18    0    0      0      0      0
TXS root   eth0   0      557      8    0    0      0      0      0
TX root   eth0   0      557      8    0    0      0      0      0
TXS root   eth0   0     1220     11    0    0      0      0      0
TX root   eth0   0     1220     11    0    0      0      0      0
RXS root <NULL>   0        0      0    0    0      0      0      0


^ permalink raw reply

* Re: neighbour table RCU
From: Eric Dumazet @ 2009-09-01 16:14 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: Stephen Hemminger, Lucian Adrian Grijincu, netdev
In-Reply-To: <200909011855.34175.opurdila@ixiacom.com>

Octavian Purdila a écrit :
> On Tuesday 01 September 2009 09:50:17 Eric Dumazet wrote:
>> Stephen Hemminger a écrit :
>>> Looking at the neighbour table, it should be possible to get
>>> rid of the two reader/writer locks.  The hash table lock is pretty
>>> amenable to RCU, but the dynamic resizing makes it non-trivial.
>>> Thinking of using a combination of RCU and sequence counts so that the
>>> reader would just rescan if resize was in progress.
>> I am not sure neigh_tbl_lock rwlock should be changed, I did not
>> see any contention on it.
>>
> 
> Speaking about neighbour optimizations, here is a RFC patch which makes the 
> tables double linked, for constant time deletion. It has given us a significant 
> performance improvement - in less then usual setups though, with lots of 
> neighbours.

How many "struct neigh_parms" do you have in your setups, and
what is the frequency of neigh_parms_release() calls ???

> 
> Would something like this be acceptable for upstream? (pardon the p4 diff dump 
> :) - but I think it will give a rough idea, if acceptable will clean it up and 
> properly submit it)

Seems straightforward

> 
> BTW, would switching to list_head be better?

Yes, definitly :)

> 
> Thanks,
> tavi
> 
> ==== //packages/linux-2.6.7/main/src/include/net/neighbour.h#2 (text) ====
> 
> @@ -56,6 +56,7 @@
>  struct neigh_parms
>  {
>         struct neigh_parms *next;
> +       struct neigh_parms **pprev;
>         int     (*neigh_setup)(struct neighbour *);
>         struct neigh_table *tbl;
>         int     entries;
> 
> ==== //packages/linux-2.6.7/main/src/net/core/neighbour.c#3 (text) ====
> 
> @@ -1127,8 +1127,10 @@
>                 }
>                 p->sysctl_table = NULL;
>                 write_lock_bh(&tbl->lock);
> -               p->next         = tbl->parms.next;
> +               if ((p->next = tbl->parms.next))
> +                       p->next->pprev = &p->next;
>                 tbl->parms.next = p;
> +               p->pprev = &tbl->parms.next;
>                 write_unlock_bh(&tbl->lock);
>         }
>         return p;
> @@ -1136,21 +1138,14 @@
>  
>  void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
>  {
> -       struct neigh_parms **p;
> -
>         if (!parms || parms == &tbl->parms)
>                 return;
>         write_lock_bh(&tbl->lock);
> -       for (p = &tbl->parms.next; *p; p = &(*p)->next) {
> -               if (*p == parms) {
> -                       *p = parms->next;
> -                       write_unlock_bh(&tbl->lock);
> -                       kfree(parms);
> -                       return;
> -               }
> -       }
> +       if ((*parms->pprev = parms->next))
> +               parms->next->pprev = parms->pprev;
>         write_unlock_bh(&tbl->lock);
> -       NEIGH_PRINTK1("neigh_parms_release: not found\n");
> +       kfree(parms);
> +       return;
>  }
> --

^ permalink raw reply

* Re: neighbour table RCU
From: Eric Dumazet @ 2009-09-01 16:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20090901085921.2c836dac@nehalam>

Stephen Hemminger a écrit :
> On Tue, 01 Sep 2009 08:50:17 +0200
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
>> Stephen Hemminger a écrit :
>>> Looking at the neighbour table, it should be possible to get
>>> rid of the two reader/writer locks.  The hash table lock is pretty
>>> amenable to RCU, but the dynamic resizing makes it non-trivial.
>>> Thinking of using a combination of RCU and sequence counts so that the
>>> reader would just rescan if resize was in progress.
>> I am not sure neigh_tbl_lock rwlock should be changed, I did not
>> see any contention on it.
>>
>>> The reader/writer lock on the neighbour entry is more of a problem.
>>> Probably would be simpler/faster to change it into a spinlock and
>>> be done with it.
>>>
>>> The reader/writer lock is also used for the proxy list hash table,
>>> but that can just be a simple spinlock.
>>>
>> This is probably is the only thing we want to do at this moment,
>> halving atomic ops on neigh_resolve_output()
>>
>> But why neigh_resolve_output() was called so much in the bench
>> is the question...
>>
> 
> Every packet has to have an ARP resolution.
> 

Sure, but I thought we had a cache ?

static inline int ip_finish_output2(struct sk_buff *skb)
{
...
	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);  << should fill cache first time >>
...
}

in my pktgen benches, I always hit same dst so should take the hh cache ?

^ permalink raw reply

* Re: [RFC] Virtual Machine Device Queues(VMDq) support on KVM
From: Stephen Hemminger @ 2009-09-01 16:05 UTC (permalink / raw)
  To: Xin, Xiaohui
  Cc: mst@redhat.com, netdev@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org,
	akpm@linux-foundation.org, hpa@zytor.com,
	gregory.haskins@gmail.com
In-Reply-To: <C85CEDA13AB1CF4D9D597824A86D2B9006AEB94861@PDSMSX501.ccr.corp.intel.com>

On Tue, 1 Sep 2009 14:58:19 +0800
"Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:

>               [RFC] Virtual Machine Device Queues (VMDq) support on KVM
> 
> Network adapter with VMDq technology presents multiple pairs of tx/rx queues,
> and renders network L2 sorting mechanism based on MAC addresses and VLAN tags
> for each tx/rx queue pair. Here we present a generic framework, in which network
> traffic to/from a tx/rx queue pair can be directed from/to a KVM guest without
> any software copy.
> 
> Actually this framework can apply to traditional network adapters which have
> just one tx/rx queue pair. And applications using the same user/kernel interface
> can utilize this framework to send/receive network traffic directly thru a tx/rx
> queue pair in a network adapter.
> 
> We use virtio-net architecture to illustrate the framework.
> 
> 
> |--------------------|     pop               add_buf    |----------------|
> |    Qemu process    |  <---------    TX   <----------  | Guest Kernel   |
> |                    |  --------->         ---------->  |                |
> |    Virtio-net      |     push              get_buf    |                |
> |  (Backend service) |  --------->    RX   ---------->  |  Virtio-net    |
> |                    |  <---------         <----------  |    driver      |
> |                    |     push              get_buf    |                |
> |--------------------|                                  |----------------|
>                    |
>                    |
>                    | AIO (read & write) combined with Direct I/O
>                    |   (which substitute synced file operations)
> |-----------------------------------------------------------------------|
> |     Host kernel  | read: copy-less with directly mapped user          |
> |                  |       space to kernel, payload directly DMAed      |
> |                  |       into user space                              |
> |                  | write: copy-less with directly mapped user         |
> |                  |       space to kernel, payload directly hooked     |
> |                  |       to a skb                                     |
> |                  |                                                    |
> |  (a likely       |                                                    |
> |   queue pair     |                                                    |
> |   instance)      |                                                    |
> |      |           |                                                    |
> | NIC driver <-->  TUN/TAP driver                                       |
> |-----------------------------------------------------------------------|
>        |
>        |
>    traditional adapter or a tx/rx queue pair
> 
> The basic idea is to utilize the kernel Asynchronous I/O combined with Direct
> I/O to implements copy-less TUN/TAP device. AIO and Direct I/O is not new to
> kernel, we still can see it in SCSI tape driver.
> 
> With traditional file operations, a copying of payload contents from/to the
> kernel DMA address to/from a user buffer is needed. That's what the copying we
> want to save.
> 
> The proposed framework is like this:
> A TUN/TAP device is bound to a traditional NIC adapter or a tx/rx queue pair in
> host side. KVM virto-net Backend service, the user space program submits
> asynchronous read/write I/O requests to the host kernel through TUN/TAP device.
> The requests are corresponding to the vqueue elements include both transmission
> & receive. They can be queued in one AIO request and later, the completion will
> be notified through the underlying packets tx/rx processing of the rx/tx queue
> pair.
> 
> Detailed path:
> 
> To guest Virtio-net driver, packets receive corresponding to asynchronous read
> I/O requests of Backend service.
> 
> 1) Guest Virtio-net driver provides header and payload address through the
> receive vqueue to Virtio-net backend service.
> 
> 2) Virtio-net backend service encapsulates multiple vqueue elements into
> multiple AIO control blocks and composes them into one AIO read request.
> 
> 3) Virtio-net backend service uses io_submit() syscall to pass the request to
> the TUN/TAP device.
> 
> 4) Virtio-net backend service uses io_getevents() syscall to check the
> completion of the request.
> 
> 5) The TUN/TAP driver receives packets from the queue pair of NIC, and prepares
> for Direct I/O.
>    A modified NIC driver may render a skb which header is allocated in host
> kernel, but the payload buffer is directly mapped from user space buffer which
> are rendered through the AIO request by the Backend service. get_user_pages()
> may do this. For one AIO read request, the TUN/TAP driver maintains a list for
> the directly mapped buffers, and a NIC driver tries to get the buffers as
> payload buffer to compose the new skbs. Of course, if getting the buffers
> fails, then kernel allocated buffers are used.
> 
> 6) Modern NIC cards now mostly have the header split feature. The NIC queue
> pair then may directly DMA the payload into the user spaces mapped payload
> buffers.
> Thus a zero-copy for payload is implemented in packet receiving.
> 
> 7) The TUN/TAP driver manually copy the host header to space user mapped.
> 
> 8) aio_complete() to notify the Virtio-net backend service for io_getevents().
> 
> 
> To guest Virtio-net driver, packets send corresponding to asynchronous write
> I/O requests of backend. The path is similar to packet receive.
> 
> 1) Guest Virtio-net driver provides header and payload address filled with
> contents through the transmit vqueue to Virtio-net backed service.
> 
> 2) Virtio-net backend service encapsulates the vqueue elements into multiple
> AIO control blocks and composes them into one AIO write request.
> 
> 3) Virtio-net backend service uses the io_submit() syscall to pass the
> requests to the TUN/TAP device.
> 
> 4) Virtio-net backend service uses io_getevents() syscall to check the request
> completion.
> 
> 5) The TUN/TAP driver gets the write requests and allocates skbs for it. The
> header contents are copied into the skb header. The directly mapped user space
> buffer is easily hooked into skb. Thus a zero copy for payload is implemented
> in packet sending.
> 
> 6) aio_complete() to notify the Virtio-net backend service for io_getevents().
> 
> The proposed framework is described as above.
> 
> Consider the modifications to the kernel and qemu:
> 
> To kernel:
> 1) The TUN/TAP driver may be modified a lot to implement AIO device operations
> and to implement directly user space mapping into kernel. Code to maintain the
> directly mapped user buffers should be in. It's just a modification for driver.
> 
> 2) The NIC driver may be modified to compose skb differently and slightly data
> structure change to add user directly mapped buffer pointer.
> Here, maybe it's better for a NIC driver to present an interface for an rx/tx
> queue pair instance which will also apply to traditional hardware, the kernel
> interface should not be changed to make the other components happy.
> The abstraction is useful, though it is not needed immediately here.
> 
> 3) The skb shared info structure may be modified a little to contain the user
> directly mapped info.
> 
> To Qemu:
> 1) The Virtio-net backend service may be modified to handle AIO read/write
> requests from the vqueues.
> 2) Maybe a separate pthread to handle the AIO request triggering is needed.
> 
> Any comments are appreciated here.

* Code is easier to review than bullet points.

* Direct I/O has to be safe when page is shared by multiple threads,
  and has to be non-blocking since network I/O can take indeterminately
  long (think big queue's, tunneling, ...)

* In the past attempts at Direct I/O on network have always had SMP
  TLB issues. The page has to be flipped or marked as COW on all CPU's
  and the cost of the Inter Processor Interrupt to steal the page has
  been slower than copying



-- 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: netconsole as normal console for userspace messages?
From: Arkadiusz Miskiewicz @ 2009-09-01 16:00 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Jarek Poplawski, netdev
In-Reply-To: <1251818111.3960.425.camel@calx>

On Tuesday 01 of September 2009, Matt Mackall wrote:
> On Tue, 2009-09-01 at 10:16 +0000, Jarek Poplawski wrote:
> > On 30-08-2009 02:12, Arkadiusz Miskiewicz wrote:
> > > Hi,
> >
> > Hi,
> >
> > > Can netconsole be somehow used to see userspace messages, too?
> > > Something similar to console=ttySX... just =netconsole.
> >
> > AFAIK it can't, but AFAICR some time ago the netconsole maintainer
> > gave some hope to some Debian guy with a similar question.
>
> It is, as they say, just a simple matter of programming to get a full
> bidirectional console out of it. As cool as that would be, I just
> haven't had time for it.

Unidirectional console would be enough for me but it seems that netconsole 
isn't capable even for that (userspace messages).

-- 
Arkadiusz Miśkiewicz        PLD/Linux Team
arekm / maven.pl            http://ftp.pld-linux.org/

^ permalink raw reply

* Re: neighbour table RCU
From: Stephen Hemminger @ 2009-09-01 15:59 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <4A9CC429.5020803@gmail.com>

On Tue, 01 Sep 2009 08:50:17 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Stephen Hemminger a écrit :
> > Looking at the neighbour table, it should be possible to get
> > rid of the two reader/writer locks.  The hash table lock is pretty
> > amenable to RCU, but the dynamic resizing makes it non-trivial.
> > Thinking of using a combination of RCU and sequence counts so that the
> > reader would just rescan if resize was in progress.
> 
> I am not sure neigh_tbl_lock rwlock should be changed, I did not
> see any contention on it.
> 
> > 
> > The reader/writer lock on the neighbour entry is more of a problem.
> > Probably would be simpler/faster to change it into a spinlock and
> > be done with it.
> > 
> > The reader/writer lock is also used for the proxy list hash table,
> > but that can just be a simple spinlock.
> > 
> 
> This is probably is the only thing we want to do at this moment,
> halving atomic ops on neigh_resolve_output()
> 
> But why neigh_resolve_output() was called so much in the bench
> is the question...
> 

Every packet has to have an ARP resolution.

-- 

^ permalink raw reply

* Re: UDP is bypassing qdisc statistics ....
From: Eric Dumazet @ 2009-09-01 15:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Patrick McHardy, Mark Smith, Jarek Poplawski, netdev
In-Reply-To: <4A9D410D.5000507@gmail.com>

Eric Dumazet a écrit :
> Christoph Lameter a écrit :
>> On Tue, 1 Sep 2009, Eric Dumazet wrote:
>>
>>> You should see that in /proc/interrupts, if I correctly understand bnx2.c
>> Hmmm I have 8 interrupts:
>>
>>   62:        158          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-0
>>   63:         84          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-1
>>   64:        412          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-2
>>   65:         25          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-3
>>   66:      49718          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-4
>>   67:         65          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-5
>>   68:        686          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-6
>>   69:       2582          0          0          0          0          0
>> 0          0          0          0          0          0          0
>> 0          0          0  IR-PCI-MSI-edge      eth0-7
> 
> Yes, this confirm you have 8 queues on this NIC
> 
> Strange thing is they seem to be all serviced by CPU-0, which is not good...
> 
> 
> 

Given that bnx2.c uses num_online_cpus() at init time, you could
as a workaround do the insmod/modprobe bnx2 with only one online cpu,
and you'll revert to a mono-queue NIC :)

int msix_vecs = min(cpus + 1, RX_MAX_RINGS); 
...
if ((bp->flags & BNX2_FLAG_MSIX_CAP) && !dis_msi && cpus > 1)
	bnx2_enable_msix(bp, msix_vecs);


For your multicast test anyway, only one queue should be used (one flow)



^ permalink raw reply

* Re: [PATCH] sky2: Use 32bit read to read Y2_VAUX_AVAIL
From: Stephen Hemminger @ 2009-09-01 15:57 UTC (permalink / raw)
  To: Mike McCormack; +Cc: netdev
In-Reply-To: <4A9D2793.1020502@ring3k.org>

On Tue, 01 Sep 2009 22:54:27 +0900
Mike McCormack <mikem@ring3k.org> wrote:

> B0_CTST is a 24bit register according to the vendor driver (sk98lin).
> A 16bit read on B0_CTST will always return 0 for Y2_VAUX_AVAIL (1<<16),
>  so use a 32bit read when testing Y2_VAUX_AVAIL
> 
> Signed-off-by: Mike McCormack <mikem@ring3k.org>
> ---
>  drivers/net/sky2.c |    2 +-
>  drivers/net/sky2.h |    2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
> index 7108b12..d1f3b46 100644
> --- a/drivers/net/sky2.c
> +++ b/drivers/net/sky2.c
> @@ -273,7 +273,7 @@ static void sky2_power_aux(struct sky2_hw *hw)
>  			    Y2_COR_CLK_LNK2_DIS | Y2_CLK_GAT_LNK2_DIS);
>  
>  	/* switch power to VAUX */
> -	if (sky2_read16(hw, B0_CTST) & Y2_VAUX_AVAIL)
> +	if (sky2_read32(hw, B0_CTST) & Y2_VAUX_AVAIL)
>  		sky2_write8(hw, B0_POWER_CTRL,
>  			    (PC_VAUX_ENA | PC_VCC_ENA |
>  			     PC_VAUX_ON | PC_VCC_OFF));
> diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
> index 34042ff..e0f23a1 100644
> --- a/drivers/net/sky2.h
> +++ b/drivers/net/sky2.h
> @@ -260,7 +260,7 @@ enum csr_regs {
>  	Y2_CFG_AER      = 0x1d00,	/* PCI Advanced Error Report region */
>  };
>  
> -/*	B0_CTST			16 bit	Control/Status register */
> +/*	B0_CTST			24 bit	Control/Status register */
>  enum {
>  	Y2_VMAIN_AVAIL	= 1<<17,/* VMAIN available (YUKON-2 only) */
>  	Y2_VAUX_AVAIL	= 1<<16,/* VAUX available (YUKON-2 only) */


Acked-by: Stephen Hemminger <shemminger@vyatta.com>



^ permalink raw reply

* Re: neighbour table RCU
From: Octavian Purdila @ 2009-09-01 15:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, Lucian Adrian Grijincu, netdev
In-Reply-To: <4A9CC429.5020803@gmail.com>

On Tuesday 01 September 2009 09:50:17 Eric Dumazet wrote:
> Stephen Hemminger a écrit :
> > Looking at the neighbour table, it should be possible to get
> > rid of the two reader/writer locks.  The hash table lock is pretty
> > amenable to RCU, but the dynamic resizing makes it non-trivial.
> > Thinking of using a combination of RCU and sequence counts so that the
> > reader would just rescan if resize was in progress.
>
> I am not sure neigh_tbl_lock rwlock should be changed, I did not
> see any contention on it.
>

Speaking about neighbour optimizations, here is a RFC patch which makes the 
tables double linked, for constant time deletion. It has given us a significant 
performance improvement - in less then usual setups though, with lots of 
neighbours.

Would something like this be acceptable for upstream? (pardon the p4 diff dump 
:) - but I think it will give a rough idea, if acceptable will clean it up and 
properly submit it)

BTW, would switching to list_head be better?

Thanks,
tavi

==== //packages/linux-2.6.7/main/src/include/net/neighbour.h#2 (text) ====

@@ -56,6 +56,7 @@
 struct neigh_parms
 {
        struct neigh_parms *next;
+       struct neigh_parms **pprev;
        int     (*neigh_setup)(struct neighbour *);
        struct neigh_table *tbl;
        int     entries;

==== //packages/linux-2.6.7/main/src/net/core/neighbour.c#3 (text) ====

@@ -1127,8 +1127,10 @@
                }
                p->sysctl_table = NULL;
                write_lock_bh(&tbl->lock);
-               p->next         = tbl->parms.next;
+               if ((p->next = tbl->parms.next))
+                       p->next->pprev = &p->next;
                tbl->parms.next = p;
+               p->pprev = &tbl->parms.next;
                write_unlock_bh(&tbl->lock);
        }
        return p;
@@ -1136,21 +1138,14 @@
 
 void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 {
-       struct neigh_parms **p;
-
        if (!parms || parms == &tbl->parms)
                return;
        write_lock_bh(&tbl->lock);
-       for (p = &tbl->parms.next; *p; p = &(*p)->next) {
-               if (*p == parms) {
-                       *p = parms->next;
-                       write_unlock_bh(&tbl->lock);
-                       kfree(parms);
-                       return;
-               }
-       }
+       if ((*parms->pprev = parms->next))
+               parms->next->pprev = parms->pprev;
        write_unlock_bh(&tbl->lock);
-       NEIGH_PRINTK1("neigh_parms_release: not found\n");
+       kfree(parms);
+       return;
 }

^ permalink raw reply

* [PATCH net-next-2.6] macvlan: Use compare_ether_addr_64bits()
From: Eric Dumazet @ 2009-09-01 15:46 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, Linux Netdev List

To speedup ether addresses compares, we can use compare_ether_addr_64bits()
(all operands are guaranteed to be at least 8 bytes long)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 99eed9f..1978452 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -54,7 +54,7 @@ static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
 	struct hlist_node *n;
 
 	hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[addr[5]], hlist) {
-		if (!compare_ether_addr(vlan->dev->dev_addr, addr))
+		if (!compare_ether_addr_64bits(vlan->dev->dev_addr, addr))
 			return vlan;
 	}
 	return NULL;
@@ -92,7 +92,7 @@ static int macvlan_addr_busy(const struct macvlan_port *port,
 	 * currently in use by the underlying device or
 	 * another macvlan.
 	 */
-	if (memcmp(port->dev->dev_addr, addr, ETH_ALEN) == 0)
+	if (!compare_ether_addr_64bits(port->dev->dev_addr, addr))
 		return 1;
 
 	if (macvlan_hash_lookup(port, addr))
@@ -130,7 +130,7 @@ static void macvlan_broadcast(struct sk_buff *skb,
 			dev->stats.multicast++;
 
 			nskb->dev = dev;
-			if (!compare_ether_addr(eth->h_dest, dev->broadcast))
+			if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
 				nskb->pkt_type = PACKET_BROADCAST;
 			else
 				nskb->pkt_type = PACKET_MULTICAST;

^ permalink raw reply related

* Re: UDP is bypassing qdisc statistics ....
From: Eric Dumazet @ 2009-09-01 15:43 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Patrick McHardy, Mark Smith, Jarek Poplawski, netdev
In-Reply-To: <alpine.DEB.1.10.0909011534350.12112@V090114053VZO-1>

Christoph Lameter a écrit :
> On Tue, 1 Sep 2009, Eric Dumazet wrote:
> 
>> You should see that in /proc/interrupts, if I correctly understand bnx2.c
> 
> Hmmm I have 8 interrupts:
> 
>   62:        158          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-0
>   63:         84          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-1
>   64:        412          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-2
>   65:         25          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-3
>   66:      49718          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-4
>   67:         65          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-5
>   68:        686          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-6
>   69:       2582          0          0          0          0          0
> 0          0          0          0          0          0          0
> 0          0          0  IR-PCI-MSI-edge      eth0-7

Yes, this confirm you have 8 queues on this NIC

Strange thing is they seem to be all serviced by CPU-0, which is not good...



^ permalink raw reply

* Re: Crypto oops in async_chainiv_do_postponed
From: Brad Bosch @ 2009-09-01 15:42 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, netdev, offbase0
In-Reply-To: <20090831220459.GA15713@gondor.apana.org.au>

Herbert Xu writes:
 > On Mon, Aug 31, 2009 at 11:11:42AM -0500, Brad Bosch wrote:
 > >
 > > OK.  I was looking for something subtle because the crash takes a long
 > > time to happen.  But do you agree that the race I described above also
 > > a real bug?
 > 
 > No I don't think it is.  CHAINV_STATE_INUSE guarantees that only
 > one entity can use ctx->err at any time.

I don't see how you are protecting ctx->err with the INUSE flag.  For
example:

If two threads enter async_chainiv_givencrypt at the same time, one
thread will call async_chainiv_postpone_request (INUSE will be clear
until set by async_chainiv_postpone_request) and the other thread will
call async_chainiv_givencrypt_tail (INUSE may or may not be set yet).

Now, ctx-err may be used by both async_chainiv_postpone_request to
store the return value from skcipher_enqueue_givcrypt and by
async_chainiv_givencrypt_tail to store the return value from
crypto_ablkcipher_encrypt at the same time.  This can cause the
calling function to think async_chainiv_givencrypt has completed it's
work, when in fact, the work was defered.

The patch I proposed earlier (included again below) avoids this and
also makes the error handling simpler and more direct without
requiring ctx->err at all.  I still don't understand why ctx->err was
required in the first place.

Did I miss something with regard to the use of ctx->err?

Now, as to the other bug...

 > 
 > Where we subtract the offset the pointer can never be NULL.  Please
 > try my patch.

OK.  I see now that your offset patch should indeed solve that
problem.  But why did you choose to fix it in a complex way?  My
suggestion just adds a single test while yours adds new parameters, a
new function and an extra function call.

Thanks for your help.

--Brad

Index: chainiv.c
===================================================================
RCS file: /share/cvs/sdg/kernels/kernel.wms/kernel_2_6_27/src/crypto/chainiv.c,v
retrieving revision 1.1.1.1.4.2
diff -u -r1.1.1.1.4.2 chainiv.c
--- chainiv.c	10 Mar 2009 05:16:24 -0000	1.1.1.1.4.2
+++ chainiv.c	27 Aug 2009 19:40:27 -0000
@@ -36,7 +36,6 @@
 	unsigned long state;
 
 	spinlock_t lock;
-	int err;
 
 	struct crypto_queue queue;
 	struct work_struct postponed;
@@ -114,10 +113,9 @@
 	return chainiv_init_common(tfm);
 }
 
-static int async_chainiv_schedule_work(struct async_chainiv_ctx *ctx)
+static void async_chainiv_schedule_work(struct async_chainiv_ctx *ctx)
 {
 	int queued;
-	int err = ctx->err;
 
 	if (!ctx->queue.qlen) {
 		smp_mb__before_clear_bit();
@@ -125,14 +123,11 @@
 
 		if (!ctx->queue.qlen ||
 		    test_and_set_bit(CHAINIV_STATE_INUSE, &ctx->state))
-			goto out;
+			return;
 	}
 
 	queued = schedule_work(&ctx->postponed);
 	BUG_ON(!queued);
-
-out:
-	return err;
 }
 
 static int async_chainiv_postpone_request(struct skcipher_givcrypt_request *req)
@@ -148,8 +143,8 @@
 	if (test_and_set_bit(CHAINIV_STATE_INUSE, &ctx->state))
 		return err;
 
-	ctx->err = err;
-	return async_chainiv_schedule_work(ctx);
+	async_chainiv_schedule_work(ctx);
+	return err;
 }
 
 static int async_chainiv_givencrypt_tail(struct skcipher_givcrypt_request *req)
@@ -158,18 +153,20 @@
 	struct async_chainiv_ctx *ctx = crypto_ablkcipher_ctx(geniv);
 	struct ablkcipher_request *subreq = skcipher_givcrypt_reqctx(req);
 	unsigned int ivsize = crypto_ablkcipher_ivsize(geniv);
+	int err;
 
 	memcpy(req->giv, ctx->iv, ivsize);
 	memcpy(subreq->info, ctx->iv, ivsize);
 
-	ctx->err = crypto_ablkcipher_encrypt(subreq);
-	if (ctx->err)
+	err = crypto_ablkcipher_encrypt(subreq);
+	if (err)
 		goto out;
 
 	memcpy(ctx->iv, subreq->info, ivsize);
 
 out:
-	return async_chainiv_schedule_work(ctx);
+	async_chainiv_schedule_work(ctx);
+	return err;
 }
 
 static int async_chainiv_givencrypt(struct skcipher_givcrypt_request *req)
@@ -236,7 +233,7 @@
 	spin_unlock_bh(&ctx->lock);
 
 	if (!req) {
-		async_chainiv_schedule_work(ctx);
+	    async_chainiv_schedule_work(ctx);
 		return;
 	}
 

^ permalink raw reply

* [PATCH net-next] can: sja1000: legacy SJA1000 ISA bus driver
From: Wolfgang Grandegger @ 2009-09-01 15:37 UTC (permalink / raw)
  To: Linux Netdev List; +Cc: SocketCAN Core Mailing List, Oliver Hartkopp

This patch adds support for legacy SJA1000 CAN controllers on the ISA
or PC-104 bus. The I/O port or memory address and the IRQ number must
be specified via module parameters:

  insmod sja1000_isa.ko port=0x310,0x380 irq=7,11

for ISA devices using I/O ports or:

  insmod sja1000_isa.ko mem=0xd1000,0xd1000 irq=7,11

for memory mapped ISA devices.

Indirect access via address and data port is supported as well:

  insmod sja1000_isa.ko port=0x310,0x380 indirect=1 irq=7,11

Here is a full list of the supported module parameters:

  port:I/O port number (array of ulong)
  mem:I/O memory address (array of ulong)
  indirect:Indirect access via address and data port (array of byte)
  irq:IRQ number (array of int)
  clk:External oscillator clock frequency (default=16000000 [16 MHz])
      (array of int)
  cdr:Clock divider register (default=0x48 [CDR_CBP | CDR_CLK_OFF])
      (array of byte)
  ocr:Output clock register (default=0x18 [OCR_TX0_PUSHPULL])
      (array of byte)

Note: for clk, cdr, ocr, the first argument re-defines the default
for all other devices, e.g.:

 insmod sja1000_isa.ko mem=0xd1000,0xd1000 irq=7,11 clk=24000000

is equivalent to

 insmod sja1000_isa.ko mem=0xd1000,0xd1000 irq=7,11 \
                       clk=24000000,24000000

Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>
Tested-by: Oliver Hartkopp <oliver@hartkopp.net>
---
 drivers/net/can/Kconfig               |    7 
 drivers/net/can/sja1000/Makefile      |    1 
 drivers/net/can/sja1000/sja1000_isa.c |  281 ++++++++++++++++++++++++++++++++++
 3 files changed, 289 insertions(+)

Index: net-next-2.6/drivers/net/can/sja1000/sja1000_isa.c
===================================================================
--- /dev/null
+++ net-next-2.6/drivers/net/can/sja1000/sja1000_isa.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2009 Wolfgang Grandegger <wg@grandegger.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/isa.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/can.h>
+#include <linux/can/dev.h>
+#include <linux/can/platform/sja1000.h>
+
+#include "sja1000.h"
+
+#define DRV_NAME "sja1000_isa"
+
+#define MAXDEV 8
+
+MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>");
+MODULE_DESCRIPTION("Socket-CAN driver for SJA1000 on the ISA bus");
+MODULE_LICENSE("GPL v2");
+
+#define CLK_DEFAULT	16000000	/* 16 MHz */
+#define CDR_DEFAULT	(CDR_CBP | CDR_CLK_OFF)
+#define OCR_DEFAULT	OCR_TX0_PUSHPULL
+
+static unsigned long port[MAXDEV];
+static unsigned long mem[MAXDEV];
+static int __devinitdata irq[MAXDEV];
+static int __devinitdata clk[MAXDEV];
+static char __devinitdata cdr[MAXDEV] = {[0 ... (MAXDEV - 1)] = -1};
+static char __devinitdata ocr[MAXDEV] = {[0 ... (MAXDEV - 1)] = -1};
+static char __devinitdata indirect[MAXDEV] = {[0 ... (MAXDEV - 1)] = -1};
+
+module_param_array(port, ulong, NULL, S_IRUGO);
+MODULE_PARM_DESC(port, "I/O port number");
+
+module_param_array(mem, ulong, NULL, S_IRUGO);
+MODULE_PARM_DESC(mem, "I/O memory address");
+
+module_param_array(indirect, byte, NULL, S_IRUGO);
+MODULE_PARM_DESC(indirect, "Indirect access via address and data port");
+
+module_param_array(irq, int, NULL, S_IRUGO);
+MODULE_PARM_DESC(irq, "IRQ number");
+
+module_param_array(clk, int, NULL, S_IRUGO);
+MODULE_PARM_DESC(clk, "External oscillator clock frequency "
+		 "(default=16000000 [16 MHz])");
+
+module_param_array(cdr, byte, NULL, S_IRUGO);
+MODULE_PARM_DESC(cdr, "Clock divider register "
+		 "(default=0x48 [CDR_CBP | CDR_CLK_OFF])");
+
+module_param_array(ocr, byte, NULL, S_IRUGO);
+MODULE_PARM_DESC(ocr, "Output control register "
+		 "(default=0x18 [OCR_TX0_PUSHPULL])");
+
+#define SJA1000_IOSIZE          0x20
+#define SJA1000_IOSIZE_INDIRECT 0x02
+
+static u8 sja1000_isa_mem_read_reg(const struct sja1000_priv *priv, int reg)
+{
+	return readb(priv->reg_base + reg);
+}
+
+static void sja1000_isa_mem_write_reg(const struct sja1000_priv *priv,
+				      int reg, u8 val)
+{
+	writeb(val, priv->reg_base + reg);
+}
+
+static u8 sja1000_isa_port_read_reg(const struct sja1000_priv *priv, int reg)
+{
+	return inb((unsigned long)priv->reg_base + reg);
+}
+
+static void sja1000_isa_port_write_reg(const struct sja1000_priv *priv,
+				       int reg, u8 val)
+{
+	outb(val, (unsigned long)priv->reg_base + reg);
+}
+
+static u8 sja1000_isa_port_read_reg_indirect(const struct sja1000_priv *priv,
+					     int reg)
+{
+	unsigned long base = (unsigned long)priv->reg_base;
+
+	outb(reg, base);
+	return inb(base + 1);
+}
+
+static void sja1000_isa_port_write_reg_indirect(const struct sja1000_priv *priv,
+						int reg, u8 val)
+{
+	unsigned long base = (unsigned long)priv->reg_base;
+
+	outb(reg, base);
+	outb(val, base + 1);
+}
+
+static int __devinit sja1000_isa_match(struct device *pdev, unsigned int idx)
+{
+	if (port[idx] || mem[idx]) {
+		if (irq[idx])
+			return 1;
+	} else if (idx)
+		return 0;
+
+	dev_err(pdev, "insufficient parameters supplied\n");
+	return 0;
+}
+
+static int __devinit sja1000_isa_probe(struct device *pdev, unsigned int idx)
+{
+	struct net_device *dev;
+	struct sja1000_priv *priv;
+	void __iomem *base = NULL;
+	int iosize = SJA1000_IOSIZE;
+	int err;
+
+	if (mem[idx]) {
+		if (!request_mem_region(mem[idx], iosize, DRV_NAME)) {
+			err = -EBUSY;
+			goto exit;
+		}
+		base = ioremap_nocache(mem[idx], iosize);
+		if (!base) {
+			err = -ENOMEM;
+			goto exit_release;
+		}
+	} else {
+		if (indirect[idx] > 0 ||
+		    (indirect[idx] == -1 && indirect[0] > 0))
+			iosize = SJA1000_IOSIZE_INDIRECT;
+		if (!request_region(port[idx], iosize, DRV_NAME)) {
+			err = -EBUSY;
+			goto exit;
+		}
+	}
+
+	dev = alloc_sja1000dev(0);
+	if (!dev) {
+		err = -ENOMEM;
+		goto exit_unmap;
+	}
+	priv = netdev_priv(dev);
+
+	dev->irq = irq[idx];
+	priv->irq_flags = IRQF_SHARED;
+	if (mem[idx]) {
+		priv->reg_base = base;
+		dev->base_addr = mem[idx];
+		priv->read_reg = sja1000_isa_mem_read_reg;
+		priv->write_reg = sja1000_isa_mem_write_reg;
+	} else {
+		priv->reg_base = (void __iomem *)port[idx];
+		dev->base_addr = port[idx];
+
+		if (iosize == SJA1000_IOSIZE_INDIRECT) {
+			priv->read_reg = sja1000_isa_port_read_reg_indirect;
+			priv->write_reg = sja1000_isa_port_write_reg_indirect;
+		} else {
+			priv->read_reg = sja1000_isa_port_read_reg;
+			priv->write_reg = sja1000_isa_port_write_reg;
+		}
+	}
+
+	if (clk[idx])
+		priv->can.clock.freq = clk[idx] / 2;
+	else if (clk[0])
+		priv->can.clock.freq = clk[0] / 2;
+	else
+		priv->can.clock.freq = CLK_DEFAULT / 2;
+
+	if (ocr[idx] != -1)
+		priv->ocr = ocr[idx] & 0xff;
+	else if (ocr[0] != -1)
+		priv->ocr = ocr[0] & 0xff;
+	else
+		priv->ocr = OCR_DEFAULT;
+
+	if (cdr[idx] != -1)
+		priv->cdr = cdr[idx] & 0xff;
+	else if (cdr[0] != -1)
+		priv->cdr = cdr[0] & 0xff;
+	else
+		priv->cdr = CDR_DEFAULT;
+
+	dev_set_drvdata(pdev, dev);
+	SET_NETDEV_DEV(dev, pdev);
+
+	err = register_sja1000dev(dev);
+	if (err) {
+		dev_err(pdev, "registering %s failed (err=%d)\n",
+			DRV_NAME, err);
+		goto exit_unmap;
+	}
+
+	dev_info(pdev, "%s device registered (reg_base=0x%p, irq=%d)\n",
+		 DRV_NAME, priv->reg_base, dev->irq);
+	return 0;
+
+ exit_unmap:
+	if (mem[idx])
+		iounmap(base);
+ exit_release:
+	if (mem[idx])
+		release_mem_region(mem[idx], iosize);
+	else
+		release_region(port[idx], iosize);
+ exit:
+	return err;
+}
+
+static int __devexit sja1000_isa_remove(struct device *pdev, unsigned int idx)
+{
+	struct net_device *dev = dev_get_drvdata(pdev);
+	struct sja1000_priv *priv = netdev_priv(dev);
+
+	unregister_sja1000dev(dev);
+	dev_set_drvdata(pdev, NULL);
+
+	if (mem[idx]) {
+		iounmap(priv->reg_base);
+		release_mem_region(mem[idx], SJA1000_IOSIZE);
+	} else {
+		if (priv->read_reg == sja1000_isa_port_read_reg_indirect)
+			release_region(port[idx], SJA1000_IOSIZE_INDIRECT);
+		else
+			release_region(port[idx], SJA1000_IOSIZE);
+	}
+	free_sja1000dev(dev);
+
+	return 0;
+}
+
+static struct isa_driver sja1000_isa_driver = {
+	.match = sja1000_isa_match,
+	.probe = sja1000_isa_probe,
+	.remove = __devexit_p(sja1000_isa_remove),
+	.driver = {
+		.name = DRV_NAME,
+	},
+};
+
+static int __init sja1000_isa_init(void)
+{
+	int err = isa_register_driver(&sja1000_isa_driver, MAXDEV);
+
+	if (!err)
+		printk(KERN_INFO
+		       "Legacy %s driver for max. %d devices registered\n",
+		       DRV_NAME, MAXDEV);
+	return err;
+}
+
+static void __exit sja1000_isa_exit(void)
+{
+	isa_unregister_driver(&sja1000_isa_driver);
+}
+
+module_init(sja1000_isa_init);
+module_exit(sja1000_isa_exit);
Index: net-next-2.6/drivers/net/can/Kconfig
===================================================================
--- net-next-2.6.orig/drivers/net/can/Kconfig
+++ net-next-2.6/drivers/net/can/Kconfig
@@ -41,6 +41,13 @@ config CAN_SJA1000
 	---help---
 	  Driver for the SJA1000 CAN controllers from Philips or NXP
 
+config CAN_SJA1000_ISA
+	depends on CAN_SJA1000 && ISA
+	tristate "ISA Bus based legacy SJA1000 driver"
+	---help---
+	  This driver adds legacy support for SJA1000 chips connected to
+	  the ISA bus using I/O port, memory mapped or indirect access.
+
 config CAN_SJA1000_PLATFORM
 	depends on CAN_SJA1000
 	tristate "Generic Platform Bus based SJA1000 driver"
Index: net-next-2.6/drivers/net/can/sja1000/Makefile
===================================================================
--- net-next-2.6.orig/drivers/net/can/sja1000/Makefile
+++ net-next-2.6/drivers/net/can/sja1000/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_CAN_SJA1000) += sja1000.o
+obj-$(CONFIG_CAN_SJA1000_ISA) += sja1000_isa.o
 obj-$(CONFIG_CAN_SJA1000_PLATFORM) += sja1000_platform.o
 obj-$(CONFIG_CAN_SJA1000_OF_PLATFORM) += sja1000_of_platform.o
 obj-$(CONFIG_CAN_EMS_PCI) += ems_pci.o

^ permalink raw reply

* Re: UDP is bypassing qdisc statistics ....
From: Christoph Lameter @ 2009-09-01 19:35 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Patrick McHardy, Mark Smith, Jarek Poplawski, netdev
In-Reply-To: <4A9D3D9A.1030500@gmail.com>

On Tue, 1 Sep 2009, Eric Dumazet wrote:

> You should see that in /proc/interrupts, if I correctly understand bnx2.c

Hmmm I have 8 interrupts:

  62:        158          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-0
  63:         84          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-1
  64:        412          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-2
  65:         25          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-3
  66:      49718          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-4
  67:         65          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-5
  68:        686          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-6
  69:       2582          0          0          0          0          0
0          0          0          0          0          0          0
0          0          0  IR-PCI-MSI-edge      eth0-7

^ permalink raw reply

* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Xin, Xiaohui @ 2009-09-01 15:37 UTC (permalink / raw)
  To: Anthony Liguori, Avi Kivity
  Cc: mst@redhat.com, netdev@vger.kernel.org,
	virtualization@lists.linux-foundation.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org,
	akpm@linux-foundation.org, hpa@zytor.com,
	gregory.haskins@gmail.com
In-Reply-To: <4A9C4723.5080309@codemonkey.ws>


>It may be possible to make vmdq appear like an sr-iov capable device 
>from userspace.  sr-iov provides the userspace interfaces to allocate 
>interfaces and assign mac addresses.  To make it useful, you would have 
>to handle tx multiplexing in the driver but that would be much easier to 
>consume for kvm

What we have thought is to support multiple net_dev structures 
according to multiple queue pairs of one vmdq adapter and presents
multiple mac address in user space and each one mac can be used 
by a guest. 
What does the tx multiplexing in the driver exactly mean?

Thanks
Xiaohui

-----Original Message-----
From: Anthony Liguori [mailto:anthony@codemonkey.ws] 
Sent: Tuesday, September 01, 2009 5:57 AM
To: Avi Kivity
Cc: Xin, Xiaohui; mst@redhat.com; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com
Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server

Avi Kivity wrote:
> On 08/31/2009 02:42 PM, Xin, Xiaohui wrote:
>> Hi, Michael
>> That's a great job. We are now working on support VMDq on KVM, and 
>> since the VMDq hardware presents L2 sorting based on MAC addresses 
>> and VLAN tags, our target is to implement a zero copy solution using 
>> VMDq. We stared from the virtio-net architecture. What we want to 
>> proposal is to use AIO combined with direct I/O:
>> 1) Modify virtio-net Backend service in Qemu to submit aio requests 
>> composed from virtqueue.
>> 2) Modify TUN/TAP device to support aio operations and the user space 
>> buffer directly mapping into the host kernel.
>> 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC.
>> 4) Modify the net_dev and skb structure to permit allocated skb to 
>> use user space directly mapped payload buffer address rather then 
>> kernel allocated.
>>
>> As zero copy is also your goal, we are interested in what's in your 
>> mind, and would like to collaborate with you if possible.
>>    
>
> One way to share the effort is to make vmdq queues available as normal 
> kernel interfaces.

It may be possible to make vmdq appear like an sr-iov capable device 
from userspace.  sr-iov provides the userspace interfaces to allocate 
interfaces and assign mac addresses.  To make it useful, you would have 
to handle tx multiplexing in the driver but that would be much easier to 
consume for kvm.

Regards,

Anthony Liguori

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: UDP is bypassing qdisc statistics ....
From: Eric Dumazet @ 2009-09-01 15:34 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Patrick McHardy, Mark Smith, Jarek Poplawski, netdev, davem
In-Reply-To: <alpine.DEB.1.10.0909011526480.12112@V090114053VZO-1>

Christoph Lameter a écrit :
> On Tue, 1 Sep 2009, Patrick McHardy wrote:
> 
>> That explains it. The bnx2 driver uses multiple TX queues, but
>> tc_dump_qdisc() only dumps the statistics from queue number 0.
> 
> There are no stats in other queues either... This is the result after
> sending 10000 or so packets. Maybe I am not catching all the qdiscs?
> 
> (this is the result of a pretty raw patch to dump all qdiscs)
> 
> #cat /proc/net/qdisc_stats
> Type Device State Bytes Packets Qlen Backlog Drops Requeues Overlimits
> TX root <NULL> 0 0 0 0 0 0 0 0
> RX root <NULL> 0 0 0 0 0 0 0 0
> TX root  eth0 0 24830 155 0 0 0 0 0
> RX root <NULL> 0 0 0 0 0 0 0 0
> TX root <NULL> 0 0 0 0 0 0 0 0
> RX root <NULL> 0 0 0 0 0 0 0 0
> TX root <NULL> 0 0 0 0 0 0 0 0
> RX root <NULL> 0 0 0 0 0 0 0 0
> TX root <NULL> 0 0 0 0 0 0 0 0
> RX root <NULL> 0 0 0 0 0 0 0 0
> 
> ---
>  net/sched/sch_api.c |   64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 64 insertions(+)
> 
> Index: linux-2.6/net/sched/sch_api.c
> ===================================================================
> --- linux-2.6.orig/net/sched/sch_api.c	2009-08-31 21:19:04.000000000 +0000
> +++ linux-2.6/net/sched/sch_api.c	2009-09-01 14:30:40.000000000 +0000
> @@ -1699,6 +1699,69 @@ static const struct file_operations psch
>  	.llseek = seq_lseek,
>  	.release = single_release,
>  };
> +
> +static void dump_qdisc(struct seq_file *seq, struct Qdisc *q, char *inout, char *text)
> +{
> +	seq_printf(seq, "%2s %2s %5s %lx %lld %d %d %d %d %d %d\n",
> +		inout, text, q->dev_queue->dev->name, q->state,
> +		q->bstats.bytes, q->bstats.packets,
> +		q->qstats.qlen, q->qstats.backlog, q->qstats.drops,
> +		q->qstats.requeues, q->qstats.overlimits);
> +}
> +
> +static void dump_qdisc_root(struct seq_file *seq, struct Qdisc *root, char *inout)
> +{
> +	struct Qdisc *q;
> +	int n = 0;
> +
> +	if (!root)
> +		return;
> +
> +	dump_qdisc(seq, root, inout, "root");
> +
> +	list_for_each_entry(q, &root->list, list) {
> +		char buffer[10];
> +
> +		sprintf(buffer,"%d", ++n);
> +		dump_qdisc(seq, q, inout, buffer);
> +	}
> +}
> +
> +
> +static int qdisc_show(struct seq_file *seq, void *v)
> +{
> +	struct net_device *dev;
> +
> +	seq_printf(seq, "Type Device State Bytes Packets "
> +			"Qlen Backlog Drops Requeues Overlimits\n");
> +
> +	read_lock(&dev_base_lock);
> +
> +	for_each_netdev(&init_net, dev) {
> +		struct netdev_queue *dev_queue;
> +
> +		dev_queue = netdev_get_tx_queue(dev, 0);
> +		dump_qdisc_root(seq, dev_queue->qdisc_sleeping, "TX");

you should iterate here 

	for (i = 0 ; i < dev->real_num_tx_queues; i++) {
		dev_queue = netdev_get_tx_queue(dev, i);
		dump_qdisc_root(seq, dev_queue->qdisc_sleeping, "TX");
	}


> +		dev_queue = &dev->rx_queue;
> +		dump_qdisc_root(seq, dev_queue->qdisc_sleeping, "RX");
> +	}
> +
> +	read_unlock(&dev_base_lock);
> +	return 0;
> +}
> +
> +static int qdisc_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, qdisc_show, PDE(inode)->data);
> +}
> +
> +static const struct file_operations qdisc_fops = {
> +	.owner = THIS_MODULE,
> +	.open = qdisc_open,
> +	.read = seq_read,
> +	.llseek = seq_lseek,
> +	.release = single_release,
> +};
>  #endif
> 
>  static int __init pktsched_init(void)
> @@ -1706,6 +1769,7 @@ static int __init pktsched_init(void)
>  	register_qdisc(&pfifo_qdisc_ops);
>  	register_qdisc(&bfifo_qdisc_ops);
>  	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
> +	proc_net_fops_create(&init_net, "qdisc_stats", 0, &qdisc_fops);
> 
>  	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
>  	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);


^ permalink raw reply

* Re: Network hangs with 2.6.30.5
From: Holger Hoffstaette @ 2009-09-01 15:32 UTC (permalink / raw)
  To: netdev
In-Reply-To: <pan.2009.09.01.14.17.06.671250@googlemail.com>

On Tue, 01 Sep 2009 16:17:08 +0200, Holger Hoffstaette wrote:

[network regressions in .30]

> I do have an older Intel Gbit card identified thusly: 00:0b.0 Ethernet
> controller: Intel Corporation 82545GM Gigabit Ethernet Controller (rev 04)
> 
> and enabled all sorts of offloading:
> 
> $ethtool -k eth0
> Offload parameters for eth0:
> rx-checksumming: on
> tx-checksumming: on
> scatter-gather: on
> tcp segmentation offload: on
> udp fragmentation offload: off
> generic segmentation offload: on
> 
> Maybe that is the culprit, as Eric Dumazet suspected in his mail..I will
> try the latest .30 stable again without that, but in any case something is
> indeed very broken in there.

So I just tried .30.5 again. Indeed the offloading seems to play a role:
with everything enabled I cannot even reliably ssh into the machine (only
"sometimes"?); however without any offloading things get "a bit better"
and squid even serves up some pages..for a while. Then it seems to hang,
swallow requests or not finish them. The tested sites reliably work for
the Windows client when it bypasses squid, as does DNS (also served from
the box). It *seems* to affect incoming traffic more than outgoing - e.g.
mail or news polling seemed to kick off and finish just fine.
Rebooting back into .29 fixes everything. Last time I tried
.31rc-something (4 IIRC) it exhibited the same problems.

I'm open to suggestions and willing to help fix this but need this machine
for actual work. :/

-h



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox