Netdev List
 help / color / mirror / Atom feed
* Re: sch_sfb
From: Jarek Poplawski @ 2011-01-14 21:06 UTC (permalink / raw)
  To: Juliusz Chroboczek; +Cc: Patrick McHardy, netdev, David Miller
In-Reply-To: <7ivd1rsj6n.fsf@lanthane.pps.jussieu.fr>

Juliusz Chroboczek wrote:
>> I just looked at it out of interest after already having started my
>> own version.
> 
>>>   http://thread.gmane.org/gmane.linux.network/90225
>>>   http://thread.gmane.org/gmane.linux.network/90375
> 
>>> It was reviewed in particular by one Patrick McHardy.
> 
>> There's no reason to be pissed
> 
> Yes, there is.
> 
> First you object to my patch by making a bunch of unreasonable requests
> (notably that I use the in-kernel classifiers, which are not usable with
> Bloom filters).  Then it turns out you're implementing your own version
> "from scratch".  And then you claim that you never saw my version in the
> first place?
> 
> Patrick, what you're doing is not merely rude, it's actually unethical.

Or Linux Classics ;-) Vide: Molnar vs Kolivas.

Jarek P.

^ permalink raw reply

* Re: [PATCH] ethtool : Add option -L | --set-common to set common flags.
From: Ben Hutchings @ 2011-01-14 21:19 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: David Miller, Tom Herbert, Laurent Chavey, netdev
In-Reply-To: <1294963892-11997-1-git-send-email-maheshb@google.com>

On Thu, 2011-01-13 at 16:11 -0800, Mahesh Bandewar wrote:
> This patch adds -L | --set-common option to add / remove common flags which
> includes loopback flag. The -l | --show-common displays the current values
> for these common flags.
> 
> Signed-off-by: Mahesh Bandewar <maheshb@google.com>
> ---
>  ethtool-copy.h |    1 +
>  ethtool.8      |   16 ++++++++++
>  ethtool.c      |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 105 insertions(+), 2 deletions(-)
> 
> diff --git a/ethtool-copy.h b/ethtool-copy.h
> index 75c3ae7..5fd18c7 100644
> --- a/ethtool-copy.h
> +++ b/ethtool-copy.h
> @@ -309,6 +309,7 @@ struct ethtool_perm_addr {
>   * flag differs from the read-only value.
>   */
>  enum ethtool_flags {
> +	ETH_FLAG_LOOPBACK	= (1 << 2),	/* Loopback enable / disable */
>  	ETH_FLAG_TXVLAN		= (1 << 7),	/* TX VLAN offload enabled */
>  	ETH_FLAG_RXVLAN		= (1 << 8),	/* RX VLAN offload enabled */
>  	ETH_FLAG_LRO		= (1 << 15),	/* LRO is enabled */
> diff --git a/ethtool.8 b/ethtool.8
> index 1760924..cf7128f 100644
> --- a/ethtool.8
> +++ b/ethtool.8
> @@ -174,6 +174,13 @@ ethtool \- Display or change ethernet card settings
>  .B2 txvlan on off
>  .B2 rxhash on off
>  
> +.B ethtool \-l|\-\-show\-common
> +.I ethX
> +
> +.B ethtool \-L|\-\-set\-common
> +.I ethX
> +.B2 loopback on off
> +
>  .B ethtool \-p|\-\-identify
>  .I ethX
>  .RI [ N ]
> @@ -406,6 +413,15 @@ Specifies whether TX VLAN acceleration should be enabled
>  .A2 rxhash on off
>  Specifies whether receive hashing offload should be enabled
>  .TP
> +.B \-l \-\-show\-common
> +Queries the specified ethernet device for common flag settings.
> +.TP
> +.B \-L \-\-set\-common
> +Changes the common parameters of the specified ethernet device.
> +.TP
> +.A2 loopback on off
> +Specifies whether loopback should be enabled.
> +.TP

I've just gone through the manual page and changed 'ethernet device' to
'network device' for all generic operations; please follow that.  The
source for the manual page was also renamed to ethtool.8.in as it now
goes through autoconf substitution.

>  .B \-p \-\-identify
>  Initiates adapter-specific action intended to enable an operator to
>  easily identify the adapter by sight.  Typically this involves
> diff --git a/ethtool.c b/ethtool.c
> index 63e0ead..1a0c10c 100644
> --- a/ethtool.c
> +++ b/ethtool.c
[...]
> @@ -1905,6 +1932,13 @@ static int dump_offload(int rx, int tx, int sg, int tso, int ufo, int gso,
>  	return 0;
>  }
>  
> +static int dump_common_flags(int loopback)
> +{
> +	fprintf(stdout, "loopback: %s\n", loopback ? "on" : "off");
> +
> +	return 0;
> +}
> +
>  static int dump_rxfhash(int fhash, u64 val)
>  {
>  	switch (fhash) {
[...]
> @@ -2219,6 +2257,53 @@ static int do_scoalesce(int fd, struct ifreq *ifr)
>  	return 0;
>  }
>  
> +static int do_gcommon(int fd, struct ifreq *ifr)
> +{
> +	struct ethtool_value eval;
> +	int loopback = 0;
> +
> +	fprintf(stdout, "Common flags for %s:\n", devname);
> +
> +	eval.cmd = ETHTOOL_GFLAGS;
> +	ifr->ifr_data = (caddr_t)&eval;
> +	if (ioctl(fd, SIOCETHTOOL, ifr)) {
> +		perror("Cannot get device flags");
> +	} else {
> +		loopback = (eval.data & ETH_FLAG_LOOPBACK) != 0;
> +	}
> +
> +	return dump_common_flags(loopback);

Breaking up a bitmask into a list of flag parameters is fairly
pointless.  I realise do_goffload() and dump_offload() do that but I am
just waiting for Michał Mirosław's changes to offload flags to be
settled before I fix them.

> +}
> +
> +static int do_scommon(int fd, struct ifreq *ifr)
> +{
> +	struct ethtool_value eval;
> +
> +	if (common_flags_mask) {
> +		eval.cmd = ETHTOOL_GFLAGS;
> +		eval.data = 0;
> +		ifr->ifr_data = (caddr_t)&eval;
> +		if (ioctl(fd, SIOCETHTOOL, ifr)) {
> +			perror("Cannot get device common flags");
> +			return 1;
> +		}
> +
> +		eval.cmd = ETHTOOL_SFLAGS;
> +		eval.data =
> +		    ((eval.data & ~(common_flags_mask | off_flags_mask)) |
> +		     (common_flags_wanted | off_flags_wanted));

Why should this use off_flags_mask and off_flags_wanted?  They should
both be 0 if this function is called.

> +		if (ioctl(fd, SIOCETHTOOL, ifr)) {
> +			perror("Cannot set device common flags");
> +			return 1;
> +		}
> +	} else {
> +		fprintf(stdout, "No common settings changed\n");
> +	}
> +
> +	return 0;
> +}
> +
>  static int do_goffload(int fd, struct ifreq *ifr)
>  {
>  	struct ethtool_value eval;
> @@ -2407,8 +2492,9 @@ static int do_soffload(int fd, struct ifreq *ifr)
>  		}
>  
>  		eval.cmd = ETHTOOL_SFLAGS;
> -		eval.data = ((eval.data & ~off_flags_mask) |
> -			     off_flags_wanted);
> +		eval.data =
> +		    ((eval.data & ~(off_flags_mask | common_flags_mask)) |
> +		     (off_flags_wanted | common_flags_wanted));

Similarly, why should this use common_flags_mask and
common_flags_wanted?

Ben.

>  
>  		err = ioctl(fd, SIOCETHTOOL, ifr);
>  		if (err) {

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] r8169: keep firmware in memory.
From: Rafael J. Wysocki @ 2011-01-14 21:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ben Hutchings, Michael Tokarev, David Woodhouse, Johannes Berg,
	Greg Kroah-Hartman, Francois Romieu, David Miller, netdev,
	Jarek Kamiński, Hayes
In-Reply-To: <AANLkTinumTQN3F=O_hC=jFTFLKVYyWmJNNvHx2A1jL2p@mail.gmail.com>

On Friday, January 14, 2011, Linus Torvalds wrote:
> On Fri, Jan 14, 2011 at 8:30 AM, Ben Hutchings <benh@debian.org> wrote:
> >
> > This is something I started to implement, but never got finished.  I
> > don't think it can be done without an API change, though, as we need
> > to know when to drop firmware from the cache.  But perhaps this could
> > be done with a hook in the device-driver binding code.
> 
> Or just associate the firmware with a module?
> 
> So if the firmware gets loaded, it stays in memory until the module is unloaded?
> 
> And this all would only be the case if CONFIG_PM is set, so you'd not
> waste memory unnecessarily.

Alternatively, a suspend/hibernate notifier can be used for that I think.

They are called before the freezing and after the thawing of user space, so the
the PM_POST_SUSPEND or PM_POST_RESTORE notification can easily cause the
firmare(s) to be dropped from memory.

Thanks,
Rafael

^ permalink raw reply

* [PATCH] Make INET_LHTABLE_SIZE a compile-time tunable
From: Bill Sommerfeld @ 2011-01-14 21:48 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Tom Herbert, Bill Sommerfeld

INET_LHTABLE_SIZE has been fixed at 32 for a long time.  It should be
tunable as larger systems may be running many more than 32 listeners.
Since the exising code depends on the hash table size
being a power of two, use a shift value as the tunable and
compute the hash table size from the shift value.

Signed-off-by: Bill Sommerfeld <wsommerfeld@google.com>
---
Background: We've observed that many of our machines are now running
with well in excess of 32 TCP listener sockets open.  As the number of
cpu cores per system increases we expect this to grow further.

In general hash tables should be sized to keep hash chains short, and
32 is not enough for this on some of our machines.

The help text was inspired by the help text for LOG_BUF_SHIFT in
init/Kconfig

 include/net/inet_hashtables.h |    2 +-
 net/ipv4/Kconfig              |   14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index e9c2ed8..7253fce 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -113,7 +113,7 @@ struct inet_listen_hashbucket {
 };
 
 /* This is for listening sockets, thus all sockets which possess wildcards. */
-#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
+#define INET_LHTABLE_SIZE	(1U << (CONFIG_INET_LHTABLE_SHIFT))
 
 struct inet_hashinfo {
 	/* This is for sockets with full identity only.  Sockets here will
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9e95d7f..a92954e 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -440,6 +440,20 @@ config INET_TCP_DIAG
 	depends on INET_DIAG
 	def_tristate INET_DIAG
 
+config INET_LHTABLE_SHIFT
+	int "Number of TCP port listener table buckets (5 => 32, 8 => 256)"
+	default 5
+	range 0 12
+	help
+	  Select number of buckets in TCP listener hash as a power of 2
+	  32 is probably enough unless you run a lot of different servers
+	  Examples:
+		     4 => 16
+		     5 => 32 (default)
+		     6 => 64
+		     7 => 128
+		     8 => 256
+
 menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
-- 
1.7.3.1


^ permalink raw reply related

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: Oleg V. Ukhno @ 2011-01-14 22:51 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, David S. Miller
In-Reply-To: <17405.1295036019@death>



Jay Vosburgh wrote:

> 	This is a violation of the 802.3ad (now 802.1ax) standard, 5.2.1
> (f), which requires that all frames of a given "conversation" are passed
> to a single port.
> 
> 	The existing layer3+4 hash has a similar problem (that it may
> send packets from a conversation to multiple ports), but for that case
> it's an unlikely exception (only in the case of IP fragmentation), but
> here it's the norm.  At a minimum, this must be clearly documented.
> 
> 	Also, what does a round robin in 802.3ad provide that the
> existing round robin does not?  My presumption is that you're looking to
> get the aggregator autoconfiguration that 802.3ad provides, but you
> don't say.
> 
> 	I don't necessarily think this is a bad cheat (round robining on
> 802.3ad as an explicit non-standard extension), since everybody wants to
> stripe their traffic across multiple slaves.  I've given some thought to
> making round robin into just another hash mode, but this also does some
> magic to the MAC addresses of the outgoing frames (more on that below).
Yes, I am resetting MAC addresses when transmitting packets to have 
switch to put packets into different ports of the receiving etherchannel.
I am using this patch to provide full-mesh ISCSI connectivity between at 
least 4 hosts (all hosts of course are in same ethernet segment) and 
every host is connected with aggregate link with 4 slaves(usually).
Using round-robin I provide near-equal load striping when transmitting, 
using MAC address magic I force switch to stripe packets over all slave 
links in destination port-channel(when number of rx-ing slaves is equal 
to number ot tx-ing slaves and is even). So I am able to utilize all 
slaves for tx and for rx up to maximum capacity; besides I am getting L2 
link failure detection (and load rebalancing), which is (in my opinion) 
much faster and robust than L3 or than dm-multipath provides.
It's my idea with the patch
> 

> 
> 	This is the code that resets the MAC header as described above.
> It doesn't quite match the documentation, since it only resets the MAC
> for ETH_P_IP packets.
Yes, I really meant that my patch applies to ETH_P_IP packets and I've 
missed that from documentation I wrote.
> 

> 
> ---
> 	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
> 

-- 
Best regards,

Oleg Ukhno


^ permalink raw reply

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: Oleg V. Ukhno @ 2011-01-14 23:12 UTC (permalink / raw)
  To: John Fastabend; +Cc: netdev@vger.kernel.org, Jay Vosburgh, David S. Miller
In-Reply-To: <4D30ADB3.7010509@intel.com>



John Fastabend wrote:

> 
> I think you want this patch against net-next not 2.6.37.
This patch is against  2.6.37-git11 and I've tried to apply it to 
net-next - it applied ok
> 
> Oleg, sorry but I don't follow. If this is simply sending every next packet
> via "next" slave interface how are packets not going to get out of order? If
> the links have different RTT this would seem problematic.
> 
> Have you considered using multipath at the block layer? This is how I generally
> handle load balancing over iSCSI/FCoE and it works reasonably well.
> 
> see ./drivers/md/dm-mpath.c

John, the first solution I was using a long time for ISCSI load 
balancing was multipath. But there are some problems with dm-multipath:
- it is slow(I am using ISCSI for Oracle , so I need to minimize latency)
- it handles any link failures bad, because of it's command queue 
limitation(all queued commands above 32 are discarded in case of path 
failure, as I remember)
- it performs very bad when there are many devices and maтy paths(I was 
unable to utilize more that 2Gbps of 4 even with 100 disks with 4 paths 
per each disk)

My patch won't work correct when slave links have different RTT, this is 
true - it is usable only within one ethernet segment with equal/near 
equal RTT. This is it's limitation.


>> +static int bond_xmit_hash_policy_rr(struct sk_buff *skb,
>> +				   struct net_device *bond_dev, int count)
> 
> Here's one reason why this won't work on net-next-2.6.
> 
> int      (*xmit_hash_policy)(struct sk_buff *, int);

Thank you, I've missed that change.

> 
> 
> Thanks,
> John
> 

-- 
Best reagrds,
Oleg Ukhno

^ permalink raw reply

* [PATCH] CHOKe flow scheduler (0.9)
From: Stephen Hemminger @ 2011-01-14 23:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Patrick McHardy, David Miller, netdev
In-Reply-To: <1295015043.3937.20.camel@edumazet-laptop>

CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
packet scheduler based on the Random Exponential Drop (RED) algorithm.

The core idea is:
  For every packet arrival:
  	Calculate Qave
	if (Qave < minth) 
	     Queue the new packet
	else 
	     Select randomly a packet from the queue 
	     if (both packets from same flow)
	     then Drop both the packets
	     else if (Qave > maxth)
	          Drop packet
	     else
	       	  Admit packet with proability p (same as RED)

See also:
  Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
   queue management scheme for approximating fair bandwidth allocation", 
  Proceeding of INFOCOM'2000, March 2000.

Help from:
     Eric Dumazet <eric.dumazet@gmail.com>
     Patrick McHardy <kaber@trash.net>

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
This version is based on net-next, and assumes Eric's patch for
corrected bstats is already applied.

0.9 incorporate patches from Patrick/Eric
    rework the peek_random and drop code to simplify and fix bug where
    random_N needs to called with full length (including holes).

 include/linux/pkt_sched.h |   29 ++
 net/sched/Kconfig         |   11 
 net/sched/Makefile        |    1 
 net/sched/sch_choke.c     |  579 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 620 insertions(+)

--- a/net/sched/Kconfig	2011-01-14 10:43:19.062537393 -0800
+++ b/net/sched/Kconfig	2011-01-14 10:43:50.915195785 -0800
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which trys to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
--- a/net/sched/Makefile	2011-01-14 10:43:19.072538228 -0800
+++ b/net/sched/Makefile	2011-01-14 10:43:50.915195785 -0800
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c	2011-01-14 11:30:33.826240296 -0800
@@ -0,0 +1,579 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/*	CHOKe stateless AQM for fair bandwidth allocation
+        =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table */
+#define CHOKE_MAX_QUEUE	(128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+
+/* Variables */
+	struct tcf_proto *filter_list;
+	struct {
+		u32	prob_drop;	/* Early probability drops */
+		u32	prob_mark;	/* Early probability marks */
+		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
+		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
+		u32	pdrop;          /* Drops due to queue limits */
+		u32	other;          /* Drops due to drop() calls */
+		u32	matched;	/* Drops to flow match */
+	} stats;
+
+	unsigned int	 head;
+	unsigned int	 tail;
+
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;
+};
+
+/* deliver a random number between 0 and N - 1 */
+static u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	do {
+		q->head = (q->head + 1) & q->tab_mask;
+		if (q->head == q->tail)
+			break;
+	} while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	do {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		if (q->head == q->tail)
+			break;
+	} while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = q->tab[idx];
+
+	q->tab[idx] = NULL;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_drop(skb, sch);
+	qdisc_tree_decrease_qlen(sch, 1);
+	--sch->q.qlen;
+}
+
+/* Classify flow using either:
+   1. pre-existing classification result in skb
+   2. fast internal classification
+   3. use TC filter based classification
+*/
+static unsigned int choke_classify(struct sk_buff *skb,
+				   struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0)
+		return TC_H_MIN(skb->priority);
+
+	if (!q->filter_list)
+		return skb_get_rxhash(skb);
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return 0;
+		}
+#endif
+		return TC_H_MIN(res.classid);
+	}
+
+	return 0;
+}
+
+/* Select a packet at random from the queue and return flow classification
+   returns 0 if queue empty
+ */
+static unsigned int choke_peek_random(struct Qdisc *sch,
+				      unsigned int *pidx)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	const struct sk_buff *skb;
+	int retrys = 3;
+
+	do {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		skb = q->tab[*pidx];
+		if (skb)
+			break;
+	} while (--retrys > 0);
+
+	/* queue is has lots of holes use the head which is known to exist
+	 * Note : result can still be NULL if q->head == q->tail
+	 */
+	if (!skb) {
+		*pidx = q->head;
+		skb = q->tab[*pidx];
+	}
+
+	/* retrieve flow classification is saved during enqueue */
+	return skb ? *(unsigned int *)(qdisc_skb_cb(skb)->data) : 0;
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	unsigned int hash;
+	int uninitialized_var(ret);
+
+	hash = choke_classify(skb, sch, &ret);
+	if (!hash) {
+		/* Packet was eaten by filter */
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+	/* Maybe add hash as field in struct qdisc_skb_cb? */
+	*(unsigned int *)(qdisc_skb_cb(skb)->data) = hash;
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, sch->q.qlen);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		unsigned int idx;
+
+		/* Draw a packet at random from queue and compare flow */
+		if (choke_peek_random(sch, &idx) == hash) {
+			q->stats.matched++;
+			choke_drop_by_idx(sch, idx);
+			goto congestion_drop;
+		}
+
+		/* Queue is large, always mark/drop */
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		} else if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (sch->q.qlen < q->limit) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+		++sch->q.qlen;
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		qdisc_bstats_update(sch, skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL;
+	choke_zap_head_holes(q);
+	--sch->q.qlen;
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CHOKE_MAX + 1];
+	const struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CHOKE_PARMS] == NULL ||
+	    tb[TCA_CHOKE_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+	if (ctl->limit > CHOKE_MAX_QUEUE)
+		return -EINVAL;
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab;
+
+		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int oqlen = sch->q.qlen, tail = 0;
+
+			while (q->head != q->tail) {
+				struct sk_buff *skb = q->tab[q->head];
+
+				q->head = (q->head + 1) & q->tab_mask;
+				if (!skb)
+					continue;
+				if (tail < mask) {
+					ntab[tail++] = skb;
+					continue;
+				}
+				sch->qstats.backlog -= qdisc_pkt_len(skb);
+				--sch->q.qlen;
+				qdisc_drop(skb, sch);
+			}
+			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+			q->head = 0;
+			q->tail = tail;
+		}
+
+		q->tab_mask = mask;
+		q->tab = ntab;
+	} else
+		sch_tree_lock(sch);
+
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_CHOKE_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_choke_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.matched = q->stats.matched,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (!arg->stop) {
+		if (arg->fn(sch, 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	choke_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
--- a/include/linux/pkt_sched.h	2011-01-14 10:43:19.092539898 -0800
+++ b/include/linux/pkt_sched.h	2011-01-14 10:43:50.915195785 -0800
@@ -247,6 +247,35 @@ struct tc_gred_sopt {
 	__u16		pad1;
 };
 
+/* CHOKe section */
+
+enum {
+	TCA_CHOKE_UNSPEC,
+	TCA_CHOKE_PARMS,
+	TCA_CHOKE_STAB,
+	__TCA_CHOKE_MAX,
+};
+
+#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
+
+struct tc_choke_qopt {
+	__u32		limit;		/* HARD maximal queue length (packets)	*/
+	__u32		qth_min;	/* Min average length threshold (packets) */
+	__u32		qth_max;	/* Max average length threshold (packets) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;		/* see RED flags */
+};
+
+struct tc_choke_xstats {
+	__u32		early;          /* Early drops */
+	__u32		pdrop;          /* Drops due to queue limits */
+	__u32		other;          /* Drops due to drop() calls */
+	__u32		marked;         /* Marked packets */
+	__u32		matched;	/* Drops due to flow match */
+};
+
 /* HTB section */
 #define TC_HTB_NUMPRIO		8
 #define TC_HTB_MAXDEPTH		8


-- 

^ permalink raw reply

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: Jay Vosburgh @ 2011-01-15  0:05 UTC (permalink / raw)
  To: Oleg V. Ukhno; +Cc: netdev, David S. Miller, John Fastabend
In-Reply-To: <4D30D37B.6090908@yandex-team.ru>

Oleg V. Ukhno <olegu@yandex-team.ru> wrote:
>Jay Vosburgh wrote:
>
>> 	This is a violation of the 802.3ad (now 802.1ax) standard, 5.2.1
>> (f), which requires that all frames of a given "conversation" are passed
>> to a single port.
>>
>> 	The existing layer3+4 hash has a similar problem (that it may
>> send packets from a conversation to multiple ports), but for that case
>> it's an unlikely exception (only in the case of IP fragmentation), but
>> here it's the norm.  At a minimum, this must be clearly documented.
>>
>> 	Also, what does a round robin in 802.3ad provide that the
>> existing round robin does not?  My presumption is that you're looking to
>> get the aggregator autoconfiguration that 802.3ad provides, but you
>> don't say.

	I'm still curious about this question.  Given the rather
intricate setup of your particular network (described below), I'm not
sure why 802.3ad is of benefit over traditional etherchannel
(balance-rr / balance-xor).

>> 	I don't necessarily think this is a bad cheat (round robining on
>> 802.3ad as an explicit non-standard extension), since everybody wants to
>> stripe their traffic across multiple slaves.  I've given some thought to
>> making round robin into just another hash mode, but this also does some
>> magic to the MAC addresses of the outgoing frames (more on that below).
>Yes, I am resetting MAC addresses when transmitting packets to have switch
>to put packets into different ports of the receiving etherchannel.

	By "etherchannel" do you really mean "Cisco switch with a
port-channel group using LACP"?

>I am using this patch to provide full-mesh ISCSI connectivity between at
>least 4 hosts (all hosts of course are in same ethernet segment) and every
>host is connected with aggregate link with 4 slaves(usually).
>Using round-robin I provide near-equal load striping when transmitting,
>using MAC address magic I force switch to stripe packets over all slave
>links in destination port-channel(when number of rx-ing slaves is equal to
>number ot tx-ing slaves and is even).

	By "MAC address magic" do you mean that you're assigning
specifically chosen MAC addresses to the slaves so that the switch's
hash is essentially "assigning" the bonding slaves to particular ports
on the outgoing port-channel group?

	Assuming that this is the case, it's an interesting idea, but
I'm unconvinced that it's better on 802.3ad vs. balance-rr.  Unless I'm
missing something, you can get everything you need from an option to
have balance-rr / balance-xor utilize the slave's permanent address as
the source address for outgoing traffic.

>[...] So I am able to utilize all slaves
>for tx and for rx up to maximum capacity; besides I am getting L2 link
>failure detection (and load rebalancing), which is (in my opinion) much
>faster and robust than L3 or than dm-multipath provides.
>It's my idea with the patch

	Can somebody (John?) more knowledgable than I about dm-multipath
comment on the above?

>> 	This is the code that resets the MAC header as described above.
>> It doesn't quite match the documentation, since it only resets the MAC
>> for ETH_P_IP packets.
>Yes, I really meant that my patch applies to ETH_P_IP packets and I've
>missed that from documentation I wrote.

	Is limiting this to just ETH_P_IP really a means to exclude ARP,
or is there some advantage to (effectively) only balancing IP traffic,
and leaving other traffic (IPv6, for one) essentially unbalanced (when
exiting the switch through the destination port-channel group, which
you've set to use a src-mac hash)?

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* [PATCH net-next 0/8] vmxnet3 fixes and enhancements
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers

The following series fixes bugs and enhances functionality in the
vmxnet3 driver.

---

Shreyas N Bhatewara (8):
      vmxnet3: fix ring size update
      vmxnet3: Preserve the MAC address configured by ifconfig
      vmxnet3: Enable HW Rx VLAN stripping by default
      vmxnet3: Provide required number of bytes in first SG buffer
      vmxnet3: Make ethtool handlers multiqueue aware
      vmxnet3: Disable napi in suspend, reenable in resume.
      vmxnet3: Add locking for access to command register
      vmxnet3: Dont allocate extra MSI-x vectors


 drivers/net/vmxnet3/vmxnet3_drv.c     |   93 +++++++----
 drivers/net/vmxnet3/vmxnet3_ethtool.c |  274 +++++++++++++++++++--------------
 drivers/net/vmxnet3/vmxnet3_int.h     |    7 -
 3 files changed, 226 insertions(+), 148 deletions(-)

-- 
Thanking you.
Shreyas

^ permalink raw reply

* [PATCH net-next 1/8] vmxnet3: fix ring size update
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers, Dmitry Torokhov
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

Fix a bug while changing ring size when MTU is changed.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
Acked-by: Dmitry Torokhov <dtor@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index d143e8b..562bdbb 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2426,7 +2426,7 @@ vmxnet3_adjust_rx_ring_size(struct vmxnet3_adapter *adapter)
 	sz = adapter->rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN;
 	ring0_size = adapter->rx_queue[0].rx_ring[0].size;
 	ring0_size = (ring0_size + sz - 1) / sz * sz;
-	ring0_size = min_t(u32, rq->rx_ring[0].size, VMXNET3_RX_RING_MAX_SIZE /
+	ring0_size = min_t(u32, ring0_size, VMXNET3_RX_RING_MAX_SIZE /
 			   sz * sz);
 	ring1_size = adapter->rx_queue[0].rx_ring[1].size;
 	comp_size = ring0_size + ring1_size;

^ permalink raw reply related

* [PATCH net-next 2/8] vmxnet3: Preserve the MAC address configured by ifconfig
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

While activating the device get it's MAC address from netdev. This will allow
the MAC address iconfigured using ifconfig to persist through the reset.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 562bdbb..89bcee8 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -48,6 +48,9 @@ static atomic_t devices_found;
 static int enable_mq = 1;
 static int irq_share_mode;
 
+static void
+vmxnet3_write_mac_addr(struct vmxnet3_adapter *adapter, u8 *mac);
+
 /*
  *    Enable/Disable the given intr
  */
@@ -2168,6 +2171,8 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter)
 	/* rx filter settings */
 	devRead->rxFilterConf.rxMode = 0;
 	vmxnet3_restore_vlan(adapter);
+	vmxnet3_write_mac_addr(adapter, adapter->netdev->dev_addr);
+
 	/* the rest are already zeroed */
 }
 


^ permalink raw reply related

* [PATCH net-next 3/8] vmxnet3: Enable HW Rx VLAN stripping by default
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers, Guolin Yang
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

Make hw vlan tag stripping as enabled by default. Thereby remove
the code to conditionally enable it later.

Signed-off-by: Guolin Yang <gyang@vmware.com>
Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |   14 +-------------
 1 files changed, 1 insertions(+), 13 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 89bcee8..f47db1c 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1867,13 +1867,8 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 		/* add vlan rx stripping. */
 		if (adapter->netdev->features & NETIF_F_HW_VLAN_RX) {
 			int i;
-			struct Vmxnet3_DSDevRead *devRead = &shared->devRead;
 			adapter->vlan_grp = grp;
 
-			/* update FEATURES to device */
-			devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
-			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
-					       VMXNET3_CMD_UPDATE_FEATURE);
 			/*
 			 *  Clear entire vfTable; then enable untagged pkts.
 			 *  Note: setting one entry in vfTable to non-zero turns
@@ -1905,11 +1900,6 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 			}
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
-
-			/* update FEATURES to device */
-			devRead->misc.uptFeatures &= ~UPT1_F_RXVLAN;
-			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
-					       VMXNET3_CMD_UPDATE_FEATURE);
 		}
 	}
 }
@@ -2083,10 +2073,8 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter)
 		devRead->misc.uptFeatures |= UPT1_F_LRO;
 		devRead->misc.maxNumRxSG = cpu_to_le16(1 + MAX_SKB_FRAGS);
 	}
-	if ((adapter->netdev->features & NETIF_F_HW_VLAN_RX) &&
-	    adapter->vlan_grp) {
+	if (adapter->netdev->features & NETIF_F_HW_VLAN_RX)
 		devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
-	}
 
 	devRead->misc.mtu = cpu_to_le32(adapter->netdev->mtu);
 	devRead->misc.queueDescPA = cpu_to_le64(adapter->queue_desc_pa);

^ permalink raw reply related

* [PATCH net-next 4/8] vmxnet3: Provide required number of bytes in first SG buffer
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

This is a performance enhancement fix. vmxnet3 device performs better when
provided with at least 54 bytes (ethernet 14 + IP 20+ TCP 20) in the first SG
buffer. For UDP packets driver provides lesser than that in first sg. This
change fixes the same. Also avoid the redundant pskb_may_pull() call.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |   23 +++++++++--------------
 1 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index f47db1c..a1632a9 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -807,30 +807,25 @@ vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
 				   skb_transport_header(skb))->doff * 4;
 		ctx->copy_size = ctx->eth_ip_hdr_size + ctx->l4_hdr_size;
 	} else {
-		unsigned int pull_size;
-
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			ctx->eth_ip_hdr_size = skb_checksum_start_offset(skb);
 
 			if (ctx->ipv4) {
 				struct iphdr *iph = (struct iphdr *)
 						    skb_network_header(skb);
-				if (iph->protocol == IPPROTO_TCP) {
-					pull_size = ctx->eth_ip_hdr_size +
-						    sizeof(struct tcphdr);
-
-					if (unlikely(!pskb_may_pull(skb,
-								pull_size))) {
-						goto err;
-					}
+				if (iph->protocol == IPPROTO_TCP)
 					ctx->l4_hdr_size = ((struct tcphdr *)
 					   skb_transport_header(skb))->doff * 4;
-				} else if (iph->protocol == IPPROTO_UDP) {
+				else if (iph->protocol == IPPROTO_UDP)
+					/*
+					 * Use tcp header size so that bytes to
+					 * be copied are more than required by
+					 * the device.
+					 */
 					ctx->l4_hdr_size =
-							sizeof(struct udphdr);
-				} else {
+							sizeof(struct tcphdr);
+				else
 					ctx->l4_hdr_size = 0;
-				}
 			} else {
 				/* for simplicity, don't copy L4 headers */
 				ctx->l4_hdr_size = 0;

^ permalink raw reply related

* [PATCH net-next 5/8] vmxnet3: Make ethtool handlers multiqueue aware
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

Show per-queue stats in ethtool -S output for vmxnet3 interface. Register dump
of ethtool should dump registers for all tx and rx queues.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_ethtool.c |  259 ++++++++++++++++++---------------
 1 files changed, 145 insertions(+), 114 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index 8e17fc8..d70cee1 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -68,76 +68,78 @@ vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
 static const struct vmxnet3_stat_desc
 vmxnet3_tq_dev_stats[] = {
 	/* description,         offset */
-	{ "TSO pkts tx",        offsetof(struct UPT1_TxStats, TSOPktsTxOK) },
-	{ "TSO bytes tx",       offsetof(struct UPT1_TxStats, TSOBytesTxOK) },
-	{ "ucast pkts tx",      offsetof(struct UPT1_TxStats, ucastPktsTxOK) },
-	{ "ucast bytes tx",     offsetof(struct UPT1_TxStats, ucastBytesTxOK) },
-	{ "mcast pkts tx",      offsetof(struct UPT1_TxStats, mcastPktsTxOK) },
-	{ "mcast bytes tx",     offsetof(struct UPT1_TxStats, mcastBytesTxOK) },
-	{ "bcast pkts tx",      offsetof(struct UPT1_TxStats, bcastPktsTxOK) },
-	{ "bcast bytes tx",     offsetof(struct UPT1_TxStats, bcastBytesTxOK) },
-	{ "pkts tx err",        offsetof(struct UPT1_TxStats, pktsTxError) },
-	{ "pkts tx discard",    offsetof(struct UPT1_TxStats, pktsTxDiscard) },
+	{ "Tx Queue#",        0 },
+	{ "  TSO pkts tx",	offsetof(struct UPT1_TxStats, TSOPktsTxOK) },
+	{ "  TSO bytes tx",	offsetof(struct UPT1_TxStats, TSOBytesTxOK) },
+	{ "  ucast pkts tx",	offsetof(struct UPT1_TxStats, ucastPktsTxOK) },
+	{ "  ucast bytes tx",	offsetof(struct UPT1_TxStats, ucastBytesTxOK) },
+	{ "  mcast pkts tx",	offsetof(struct UPT1_TxStats, mcastPktsTxOK) },
+	{ "  mcast bytes tx",	offsetof(struct UPT1_TxStats, mcastBytesTxOK) },
+	{ "  bcast pkts tx",	offsetof(struct UPT1_TxStats, bcastPktsTxOK) },
+	{ "  bcast bytes tx",	offsetof(struct UPT1_TxStats, bcastBytesTxOK) },
+	{ "  pkts tx err",	offsetof(struct UPT1_TxStats, pktsTxError) },
+	{ "  pkts tx discard",	offsetof(struct UPT1_TxStats, pktsTxDiscard) },
 };
 
 /* per tq stats maintained by the driver */
 static const struct vmxnet3_stat_desc
 vmxnet3_tq_driver_stats[] = {
 	/* description,         offset */
-	{"drv dropped tx total", offsetof(struct vmxnet3_tq_driver_stats,
-					drop_total) },
-	{ "   too many frags",  offsetof(struct vmxnet3_tq_driver_stats,
-					drop_too_many_frags) },
-	{ "   giant hdr",       offsetof(struct vmxnet3_tq_driver_stats,
-					drop_oversized_hdr) },
-	{ "   hdr err",         offsetof(struct vmxnet3_tq_driver_stats,
-					drop_hdr_inspect_err) },
-	{ "   tso",             offsetof(struct vmxnet3_tq_driver_stats,
-					drop_tso) },
-	{ "ring full",          offsetof(struct vmxnet3_tq_driver_stats,
-					tx_ring_full) },
-	{ "pkts linearized",    offsetof(struct vmxnet3_tq_driver_stats,
-					linearized) },
-	{ "hdr cloned",         offsetof(struct vmxnet3_tq_driver_stats,
-					copy_skb_header) },
-	{ "giant hdr",          offsetof(struct vmxnet3_tq_driver_stats,
-					oversized_hdr) },
+	{"  drv dropped tx total",	offsetof(struct vmxnet3_tq_driver_stats,
+						 drop_total) },
+	{ "     too many frags", offsetof(struct vmxnet3_tq_driver_stats,
+					  drop_too_many_frags) },
+	{ "     giant hdr",	offsetof(struct vmxnet3_tq_driver_stats,
+					 drop_oversized_hdr) },
+	{ "     hdr err",	offsetof(struct vmxnet3_tq_driver_stats,
+					 drop_hdr_inspect_err) },
+	{ "     tso",		offsetof(struct vmxnet3_tq_driver_stats,
+					 drop_tso) },
+	{ "  ring full",	offsetof(struct vmxnet3_tq_driver_stats,
+					 tx_ring_full) },
+	{ "  pkts linearized",	offsetof(struct vmxnet3_tq_driver_stats,
+					 linearized) },
+	{ "  hdr cloned",	offsetof(struct vmxnet3_tq_driver_stats,
+					 copy_skb_header) },
+	{ "  giant hdr",	offsetof(struct vmxnet3_tq_driver_stats,
+					 oversized_hdr) },
 };
 
 /* per rq stats maintained by the device */
 static const struct vmxnet3_stat_desc
 vmxnet3_rq_dev_stats[] = {
-	{ "LRO pkts rx",        offsetof(struct UPT1_RxStats, LROPktsRxOK) },
-	{ "LRO byte rx",        offsetof(struct UPT1_RxStats, LROBytesRxOK) },
-	{ "ucast pkts rx",      offsetof(struct UPT1_RxStats, ucastPktsRxOK) },
-	{ "ucast bytes rx",     offsetof(struct UPT1_RxStats, ucastBytesRxOK) },
-	{ "mcast pkts rx",      offsetof(struct UPT1_RxStats, mcastPktsRxOK) },
-	{ "mcast bytes rx",     offsetof(struct UPT1_RxStats, mcastBytesRxOK) },
-	{ "bcast pkts rx",      offsetof(struct UPT1_RxStats, bcastPktsRxOK) },
-	{ "bcast bytes rx",     offsetof(struct UPT1_RxStats, bcastBytesRxOK) },
-	{ "pkts rx out of buf", offsetof(struct UPT1_RxStats, pktsRxOutOfBuf) },
-	{ "pkts rx err",        offsetof(struct UPT1_RxStats, pktsRxError) },
+	{ "Rx Queue#",        0 },
+	{ "  LRO pkts rx",	offsetof(struct UPT1_RxStats, LROPktsRxOK) },
+	{ "  LRO byte rx",	offsetof(struct UPT1_RxStats, LROBytesRxOK) },
+	{ "  ucast pkts rx",	offsetof(struct UPT1_RxStats, ucastPktsRxOK) },
+	{ "  ucast bytes rx",	offsetof(struct UPT1_RxStats, ucastBytesRxOK) },
+	{ "  mcast pkts rx",	offsetof(struct UPT1_RxStats, mcastPktsRxOK) },
+	{ "  mcast bytes rx",	offsetof(struct UPT1_RxStats, mcastBytesRxOK) },
+	{ "  bcast pkts rx",	offsetof(struct UPT1_RxStats, bcastPktsRxOK) },
+	{ "  bcast bytes rx",	offsetof(struct UPT1_RxStats, bcastBytesRxOK) },
+	{ "  pkts rx OOB",	offsetof(struct UPT1_RxStats, pktsRxOutOfBuf) },
+	{ "  pkts rx err",	offsetof(struct UPT1_RxStats, pktsRxError) },
 };
 
 /* per rq stats maintained by the driver */
 static const struct vmxnet3_stat_desc
 vmxnet3_rq_driver_stats[] = {
 	/* description,         offset */
-	{ "drv dropped rx total", offsetof(struct vmxnet3_rq_driver_stats,
-					   drop_total) },
-	{ "   err",            offsetof(struct vmxnet3_rq_driver_stats,
-					drop_err) },
-	{ "   fcs",            offsetof(struct vmxnet3_rq_driver_stats,
-					drop_fcs) },
-	{ "rx buf alloc fail", offsetof(struct vmxnet3_rq_driver_stats,
-					rx_buf_alloc_failure) },
+	{ "  drv dropped rx total", offsetof(struct vmxnet3_rq_driver_stats,
+					     drop_total) },
+	{ "     err",		offsetof(struct vmxnet3_rq_driver_stats,
+					 drop_err) },
+	{ "     fcs",		offsetof(struct vmxnet3_rq_driver_stats,
+					 drop_fcs) },
+	{ "  rx buf alloc fail", offsetof(struct vmxnet3_rq_driver_stats,
+					  rx_buf_alloc_failure) },
 };
 
 /* gloabl stats maintained by the driver */
 static const struct vmxnet3_stat_desc
 vmxnet3_global_stats[] = {
 	/* description,         offset */
-	{ "tx timeout count",   offsetof(struct vmxnet3_adapter,
+	{ "tx timeout count",	offsetof(struct vmxnet3_adapter,
 					 tx_timeout_count) }
 };
 
@@ -193,12 +195,15 @@ vmxnet3_get_stats(struct net_device *netdev)
 static int
 vmxnet3_get_sset_count(struct net_device *netdev, int sset)
 {
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	switch (sset) {
 	case ETH_SS_STATS:
-		return ARRAY_SIZE(vmxnet3_tq_dev_stats) +
-			ARRAY_SIZE(vmxnet3_tq_driver_stats) +
-			ARRAY_SIZE(vmxnet3_rq_dev_stats) +
-			ARRAY_SIZE(vmxnet3_rq_driver_stats) +
+		return (ARRAY_SIZE(vmxnet3_tq_dev_stats) +
+			ARRAY_SIZE(vmxnet3_tq_driver_stats)) *
+		       adapter->num_tx_queues +
+		       (ARRAY_SIZE(vmxnet3_rq_dev_stats) +
+			ARRAY_SIZE(vmxnet3_rq_driver_stats)) *
+		       adapter->num_rx_queues +
 			ARRAY_SIZE(vmxnet3_global_stats);
 	default:
 		return -EOPNOTSUPP;
@@ -206,10 +211,16 @@ vmxnet3_get_sset_count(struct net_device *netdev, int sset)
 }
 
 
+/* Should be multiple of 4 */
+#define NUM_TX_REGS	8
+#define NUM_RX_REGS	12
+
 static int
 vmxnet3_get_regs_len(struct net_device *netdev)
 {
-	return 20 * sizeof(u32);
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	return (adapter->num_tx_queues * NUM_TX_REGS * sizeof(u32) +
+		adapter->num_rx_queues * NUM_RX_REGS * sizeof(u32));
 }
 
 
@@ -240,29 +251,37 @@ vmxnet3_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
 static void
 vmxnet3_get_strings(struct net_device *netdev, u32 stringset, u8 *buf)
 {
+	 struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	if (stringset == ETH_SS_STATS) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++) {
-			memcpy(buf, vmxnet3_tq_dev_stats[i].desc,
-			       ETH_GSTRING_LEN);
-			buf += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats); i++) {
-			memcpy(buf, vmxnet3_tq_driver_stats[i].desc,
-			       ETH_GSTRING_LEN);
-			buf += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++) {
-			memcpy(buf, vmxnet3_rq_dev_stats[i].desc,
-			       ETH_GSTRING_LEN);
-			buf += ETH_GSTRING_LEN;
+		int i, j;
+		for (j = 0; j < adapter->num_tx_queues; j++) {
+			for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++) {
+				memcpy(buf, vmxnet3_tq_dev_stats[i].desc,
+				       ETH_GSTRING_LEN);
+				buf += ETH_GSTRING_LEN;
+			}
+			for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats);
+			     i++) {
+				memcpy(buf, vmxnet3_tq_driver_stats[i].desc,
+				       ETH_GSTRING_LEN);
+				buf += ETH_GSTRING_LEN;
+			}
 		}
-		for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats); i++) {
-			memcpy(buf, vmxnet3_rq_driver_stats[i].desc,
-			       ETH_GSTRING_LEN);
-			buf += ETH_GSTRING_LEN;
+
+		for (j = 0; j < adapter->num_rx_queues; j++) {
+			for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++) {
+				memcpy(buf, vmxnet3_rq_dev_stats[i].desc,
+				       ETH_GSTRING_LEN);
+				buf += ETH_GSTRING_LEN;
+			}
+			for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats);
+			     i++) {
+				memcpy(buf, vmxnet3_rq_driver_stats[i].desc,
+				       ETH_GSTRING_LEN);
+				buf += ETH_GSTRING_LEN;
+			}
 		}
+
 		for (i = 0; i < ARRAY_SIZE(vmxnet3_global_stats); i++) {
 			memcpy(buf, vmxnet3_global_stats[i].desc,
 				ETH_GSTRING_LEN);
@@ -310,23 +329,31 @@ vmxnet3_get_ethtool_stats(struct net_device *netdev,
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_STATS);
 
 	/* this does assume each counter is 64-bit wide */
-/* TODO change this for multiple queues */
-
-	base = (u8 *)&adapter->tqd_start[j].stats;
-	for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++)
-		*buf++ = *(u64 *)(base + vmxnet3_tq_dev_stats[i].offset);
-
-	base = (u8 *)&adapter->tx_queue[j].stats;
-	for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats); i++)
-		*buf++ = *(u64 *)(base + vmxnet3_tq_driver_stats[i].offset);
-
-	base = (u8 *)&adapter->rqd_start[j].stats;
-	for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++)
-		*buf++ = *(u64 *)(base + vmxnet3_rq_dev_stats[i].offset);
+	for (j = 0; j < adapter->num_tx_queues; j++) {
+		base = (u8 *)&adapter->tqd_start[j].stats;
+		*buf++ = (u64)j;
+		for (i = 1; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++)
+			*buf++ = *(u64 *)(base +
+					  vmxnet3_tq_dev_stats[i].offset);
+
+		base = (u8 *)&adapter->tx_queue[j].stats;
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats); i++)
+			*buf++ = *(u64 *)(base +
+					  vmxnet3_tq_driver_stats[i].offset);
+	}
 
-	base = (u8 *)&adapter->rx_queue[j].stats;
-	for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats); i++)
-		*buf++ = *(u64 *)(base + vmxnet3_rq_driver_stats[i].offset);
+	for (j = 0; j < adapter->num_tx_queues; j++) {
+		base = (u8 *)&adapter->rqd_start[j].stats;
+		*buf++ = (u64) j;
+		for (i = 1; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++)
+			*buf++ = *(u64 *)(base +
+					  vmxnet3_rq_dev_stats[i].offset);
+
+		base = (u8 *)&adapter->rx_queue[j].stats;
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats); i++)
+			*buf++ = *(u64 *)(base +
+					  vmxnet3_rq_driver_stats[i].offset);
+	}
 
 	base = (u8 *)adapter;
 	for (i = 0; i < ARRAY_SIZE(vmxnet3_global_stats); i++)
@@ -339,7 +366,7 @@ vmxnet3_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	u32 *buf = p;
-	int i = 0;
+	int i = 0, j = 0;
 
 	memset(p, 0, vmxnet3_get_regs_len(netdev));
 
@@ -348,31 +375,35 @@ vmxnet3_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p)
 	/* Update vmxnet3_get_regs_len if we want to dump more registers */
 
 	/* make each ring use multiple of 16 bytes */
-/* TODO change this for multiple queues */
-	buf[0] = adapter->tx_queue[i].tx_ring.next2fill;
-	buf[1] = adapter->tx_queue[i].tx_ring.next2comp;
-	buf[2] = adapter->tx_queue[i].tx_ring.gen;
-	buf[3] = 0;
-
-	buf[4] = adapter->tx_queue[i].comp_ring.next2proc;
-	buf[5] = adapter->tx_queue[i].comp_ring.gen;
-	buf[6] = adapter->tx_queue[i].stopped;
-	buf[7] = 0;
-
-	buf[8] = adapter->rx_queue[i].rx_ring[0].next2fill;
-	buf[9] = adapter->rx_queue[i].rx_ring[0].next2comp;
-	buf[10] = adapter->rx_queue[i].rx_ring[0].gen;
-	buf[11] = 0;
-
-	buf[12] = adapter->rx_queue[i].rx_ring[1].next2fill;
-	buf[13] = adapter->rx_queue[i].rx_ring[1].next2comp;
-	buf[14] = adapter->rx_queue[i].rx_ring[1].gen;
-	buf[15] = 0;
-
-	buf[16] = adapter->rx_queue[i].comp_ring.next2proc;
-	buf[17] = adapter->rx_queue[i].comp_ring.gen;
-	buf[18] = 0;
-	buf[19] = 0;
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		buf[j++] = adapter->tx_queue[i].tx_ring.next2fill;
+		buf[j++] = adapter->tx_queue[i].tx_ring.next2comp;
+		buf[j++] = adapter->tx_queue[i].tx_ring.gen;
+		buf[j++] = 0;
+
+		buf[j++] = adapter->tx_queue[i].comp_ring.next2proc;
+		buf[j++] = adapter->tx_queue[i].comp_ring.gen;
+		buf[j++] = adapter->tx_queue[i].stopped;
+		buf[j++] = 0;
+	}
+
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		buf[j++] = adapter->rx_queue[i].rx_ring[0].next2fill;
+		buf[j++] = adapter->rx_queue[i].rx_ring[0].next2comp;
+		buf[j++] = adapter->rx_queue[i].rx_ring[0].gen;
+		buf[j++] = 0;
+
+		buf[j++] = adapter->rx_queue[i].rx_ring[1].next2fill;
+		buf[j++] = adapter->rx_queue[i].rx_ring[1].next2comp;
+		buf[j++] = adapter->rx_queue[i].rx_ring[1].gen;
+		buf[j++] = 0;
+
+		buf[j++] = adapter->rx_queue[i].comp_ring.next2proc;
+		buf[j++] = adapter->rx_queue[i].comp_ring.gen;
+		buf[j++] = 0;
+		buf[j++] = 0;
+	}
+
 }
 
 

^ permalink raw reply related

* [PATCH net-next 6/8] vmxnet3: Disable napi in suspend, reenable in resume.
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers, Dmitry Torokhov
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

There is a small possibility of a race where the suspend routine gets
called, while a napi callback is still pending and when that comes up,
it enables interrupts which just got disabled in the suspend routine.
This change adds napi disable call in suspend and enable in resume to
avoid race.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
Acked-by: Dmitry Torokhov <dtor@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index a1632a9..20ef4f3 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -3101,6 +3101,9 @@ vmxnet3_suspend(struct device *device)
 	if (!netif_running(netdev))
 		return 0;
 
+	for (i = 0; i < adapter->num_rx_queues; i++)
+		napi_disable(&adapter->rx_queue[i].napi);
+
 	vmxnet3_disable_all_intrs(adapter);
 	vmxnet3_free_irqs(adapter);
 	vmxnet3_free_intr_resources(adapter);
@@ -3192,7 +3195,7 @@ skip_arp:
 static int
 vmxnet3_resume(struct device *device)
 {
-	int err;
+	int err, i = 0;
 	struct pci_dev *pdev = to_pci_dev(device);
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
@@ -3224,6 +3227,8 @@ vmxnet3_resume(struct device *device)
 			       VMXNET3_CMD_UPDATE_PMCFG);
 	vmxnet3_alloc_intr_resources(adapter);
 	vmxnet3_request_irqs(adapter);
+	for (i = 0; i < adapter->num_rx_queues; i++)
+		napi_enable(&adapter->rx_queue[i].napi);
 	vmxnet3_enable_all_intrs(adapter);
 
 	return 0;

^ permalink raw reply related

* [PATCH net-next 7/8] vmxnet3: Add locking for access to command register
From: Shreyas N Bhatewara @ 2011-01-15  0:59 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers, Matthieu Bucchianeri
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

Access to cmd register is racey, especially in smp environments. Protect
it using a spinlock.

Signed-off-by: Matthieu Bucchianeri <matthieu@vmware.com>
Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c     |   38 +++++++++++++++++++++++++++++++++
 drivers/net/vmxnet3/vmxnet3_ethtool.c |   15 +++++++++++++
 drivers/net/vmxnet3/vmxnet3_int.h     |    1 +
 3 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 20ef4f3..3b5b134 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -142,9 +142,13 @@ vmxnet3_check_link(struct vmxnet3_adapter *adapter, bool affectTxQueue)
 {
 	u32 ret;
 	int i;
+	unsigned long flags;
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_LINK);
 	ret = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
+
 	adapter->link_speed = ret >> 16;
 	if (ret & 1) { /* Link is up. */
 		printk(KERN_INFO "%s: NIC Link is Up %d Mbps\n",
@@ -186,8 +190,10 @@ vmxnet3_process_events(struct vmxnet3_adapter *adapter)
 
 	/* Check if there is an error on xmit/recv queues */
 	if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) {
+		spin_lock(&adapter->cmd_lock);
 		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 				       VMXNET3_CMD_GET_QUEUE_STATUS);
+		spin_unlock(&adapter->cmd_lock);
 
 		for (i = 0; i < adapter->num_tx_queues; i++)
 			if (adapter->tqd_start[i].status.stopped)
@@ -1857,6 +1863,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct Vmxnet3_DriverShared *shared = adapter->shared;
 	u32 *vfTable = adapter->shared->devRead.rxFilterConf.vfTable;
+	unsigned long flags;
 
 	if (grp) {
 		/* add vlan rx stripping. */
@@ -1873,8 +1880,10 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 				vfTable[i] = 0;
 
 			VMXNET3_SET_VFTABLE_ENTRY(vfTable, 0);
+			spin_lock_irqsave(&adapter->cmd_lock, flags);
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+			spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 		} else {
 			printk(KERN_ERR "%s: vlan_rx_register when device has "
 			       "no NETIF_F_HW_VLAN_RX\n", netdev->name);
@@ -1893,8 +1902,10 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 				 */
 				vfTable[i] = 0;
 			}
+			spin_lock_irqsave(&adapter->cmd_lock, flags);
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+			spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 		}
 	}
 }
@@ -1927,10 +1938,13 @@ vmxnet3_vlan_rx_add_vid(struct net_device *netdev, u16 vid)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	u32 *vfTable = adapter->shared->devRead.rxFilterConf.vfTable;
+	unsigned long flags;
 
 	VMXNET3_SET_VFTABLE_ENTRY(vfTable, vid);
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 }
 
 
@@ -1939,10 +1953,13 @@ vmxnet3_vlan_rx_kill_vid(struct net_device *netdev, u16 vid)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	u32 *vfTable = adapter->shared->devRead.rxFilterConf.vfTable;
+	unsigned long flags;
 
 	VMXNET3_CLEAR_VFTABLE_ENTRY(vfTable, vid);
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 }
 
 
@@ -1973,6 +1990,7 @@ static void
 vmxnet3_set_mc(struct net_device *netdev)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	unsigned long flags;
 	struct Vmxnet3_RxFilterConf *rxConf =
 					&adapter->shared->devRead.rxFilterConf;
 	u8 *new_table = NULL;
@@ -2008,6 +2026,7 @@ vmxnet3_set_mc(struct net_device *netdev)
 		rxConf->mfTablePA = 0;
 	}
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	if (new_mode != rxConf->rxMode) {
 		rxConf->rxMode = cpu_to_le32(new_mode);
 		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
@@ -2016,6 +2035,7 @@ vmxnet3_set_mc(struct net_device *netdev)
 
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_MAC_FILTERS);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	kfree(new_table);
 }
@@ -2165,6 +2185,7 @@ vmxnet3_activate_dev(struct vmxnet3_adapter *adapter)
 {
 	int err, i;
 	u32 ret;
+	unsigned long flags;
 
 	dev_dbg(&adapter->netdev->dev, "%s: skb_buf_size %d, rx_buf_per_pkt %d,"
 		" ring sizes %u %u %u\n", adapter->netdev->name,
@@ -2194,9 +2215,11 @@ vmxnet3_activate_dev(struct vmxnet3_adapter *adapter)
 			       adapter->shared_pa));
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_DSAH, VMXNET3_GET_ADDR_HI(
 			       adapter->shared_pa));
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_ACTIVATE_DEV);
 	ret = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	if (ret != 0) {
 		printk(KERN_ERR "Failed to activate dev %s: error %u\n",
@@ -2243,7 +2266,10 @@ rq_err:
 void
 vmxnet3_reset_dev(struct vmxnet3_adapter *adapter)
 {
+	unsigned long flags;
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_RESET_DEV);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 }
 
 
@@ -2251,12 +2277,15 @@ int
 vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter)
 {
 	int i;
+	unsigned long flags;
 	if (test_and_set_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state))
 		return 0;
 
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_QUIESCE_DEV);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 	vmxnet3_disable_all_intrs(adapter);
 
 	for (i = 0; i < adapter->num_rx_queues; i++)
@@ -2706,9 +2735,11 @@ vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter)
 	u32 cfg;
 
 	/* intr settings */
+	spin_lock(&adapter->cmd_lock);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_GET_CONF_INTR);
 	cfg = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+	spin_unlock(&adapter->cmd_lock);
 	adapter->intr.type = cfg & 0x3;
 	adapter->intr.mask_mode = (cfg >> 2) & 0x3;
 
@@ -2893,6 +2924,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
 	adapter->netdev = netdev;
 	adapter->pdev = pdev;
 
+	spin_lock_init(&adapter->cmd_lock);
 	adapter->shared = pci_alloc_consistent(adapter->pdev,
 			  sizeof(struct Vmxnet3_DriverShared),
 			  &adapter->shared_pa);
@@ -3096,6 +3128,7 @@ vmxnet3_suspend(struct device *device)
 	u8 *arpreq;
 	struct in_device *in_dev;
 	struct in_ifaddr *ifa;
+	unsigned long flags;
 	int i = 0;
 
 	if (!netif_running(netdev))
@@ -3179,8 +3212,10 @@ skip_arp:
 	adapter->shared->devRead.pmConfDesc.confPA = cpu_to_le64(virt_to_phys(
 								 pmConf));
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_PMCFG);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	pci_save_state(pdev);
 	pci_enable_wake(pdev, pci_choose_state(pdev, PMSG_SUSPEND),
@@ -3196,6 +3231,7 @@ static int
 vmxnet3_resume(struct device *device)
 {
 	int err, i = 0;
+	unsigned long flags;
 	struct pci_dev *pdev = to_pci_dev(device);
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
@@ -3223,8 +3259,10 @@ vmxnet3_resume(struct device *device)
 
 	pci_enable_wake(pdev, PCI_D0, 0);
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_PMCFG);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 	vmxnet3_alloc_intr_resources(adapter);
 	vmxnet3_request_irqs(adapter);
 	for (i = 0; i < adapter->num_rx_queues; i++)
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index d70cee1..81254be 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -45,6 +45,7 @@ static int
 vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	unsigned long flags;
 
 	if (adapter->rxcsum != val) {
 		adapter->rxcsum = val;
@@ -56,8 +57,10 @@ vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
 				adapter->shared->devRead.misc.uptFeatures &=
 				~UPT1_F_RXCSUM;
 
+			spin_lock_irqsave(&adapter->cmd_lock, flags);
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
+			spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 		}
 	}
 	return 0;
@@ -153,12 +156,15 @@ vmxnet3_get_stats(struct net_device *netdev)
 	struct UPT1_TxStats *devTxStats;
 	struct UPT1_RxStats *devRxStats;
 	struct net_device_stats *net_stats = &netdev->stats;
+	unsigned long flags;
 	int i;
 
 	adapter = netdev_priv(netdev);
 
 	/* Collect the dev stats into the shared area */
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_STATS);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	memset(net_stats, 0, sizeof(*net_stats));
 	for (i = 0; i < adapter->num_tx_queues; i++) {
@@ -296,6 +302,7 @@ vmxnet3_set_flags(struct net_device *netdev, u32 data)
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	u8 lro_requested = (data & ETH_FLAG_LRO) == 0 ? 0 : 1;
 	u8 lro_present = (netdev->features & NETIF_F_LRO) == 0 ? 0 : 1;
+	unsigned long flags;
 
 	if (data & ~ETH_FLAG_LRO)
 		return -EOPNOTSUPP;
@@ -311,8 +318,10 @@ vmxnet3_set_flags(struct net_device *netdev, u32 data)
 		else
 			adapter->shared->devRead.misc.uptFeatures &=
 							~UPT1_F_LRO;
+		spin_lock_irqsave(&adapter->cmd_lock, flags);
 		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 				       VMXNET3_CMD_UPDATE_FEATURE);
+		spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 	}
 	return 0;
 }
@@ -322,11 +331,14 @@ vmxnet3_get_ethtool_stats(struct net_device *netdev,
 			  struct ethtool_stats *stats, u64  *buf)
 {
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	unsigned long flags;
 	u8 *base;
 	int i;
 	int j = 0;
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_STATS);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	/* this does assume each counter is 64-bit wide */
 	for (j = 0; j < adapter->num_tx_queues; j++) {
@@ -605,6 +617,7 @@ vmxnet3_set_rss_indir(struct net_device *netdev,
 		      const struct ethtool_rxfh_indir *p)
 {
 	unsigned int i;
+	unsigned long flags;
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
 	struct UPT1_RSSConf *rssConf = adapter->rss_conf;
 
@@ -623,8 +636,10 @@ vmxnet3_set_rss_indir(struct net_device *netdev,
 	for (i = 0; i < rssConf->indTableSize; i++)
 		rssConf->indTable[i] = p->ring_index[i];
 
+	spin_lock_irqsave(&adapter->cmd_lock, flags);
 	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 			       VMXNET3_CMD_UPDATE_RSSIDT);
+	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
 	return 0;
 
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 7fadeed..474f5df 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -317,6 +317,7 @@ struct vmxnet3_adapter {
 	struct vmxnet3_rx_queue		rx_queue[VMXNET3_DEVICE_MAX_RX_QUEUES];
 	struct vlan_group		*vlan_grp;
 	struct vmxnet3_intr		intr;
+	spinlock_t			cmd_lock;
 	struct Vmxnet3_DriverShared	*shared;
 	struct Vmxnet3_PMConf		*pm_conf;
 	struct Vmxnet3_TxQueueDesc	*tqd_start;     /* all tx queue desc */

^ permalink raw reply related

* [PATCH net-next 8/8] vmxnet3: Dont allocate extra MSI-x vectors
From: Shreyas N Bhatewara @ 2011-01-15  1:00 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: pv-drivers
In-Reply-To: <20110115005701.1064.67435.stgit@sbhatewara-dev1.eng.vmware.com>

In case of single tx and rx queues, three MSI-x vectors are allocated instead
of two. This patch fixes that.

Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |    4 ++--
 drivers/net/vmxnet3/vmxnet3_int.h |    6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 3b5b134..cc14b4a 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2712,7 +2712,7 @@ vmxnet3_acquire_msix_vectors(struct vmxnet3_adapter *adapter,
 			break;
 		} else {
 			/* If fails to enable required number of MSI-x vectors
-			 * try enabling 3 of them. One each for rx, tx and event
+			 * try enabling minimum number of vectors required.
 			 */
 			vectors = vector_threshold;
 			printk(KERN_ERR "Failed to enable %d MSI-X for %s, try"
@@ -2774,7 +2774,7 @@ vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter)
 		 */
 		if (err == VMXNET3_LINUX_MIN_MSIX_VECT) {
 			if (adapter->share_intr != VMXNET3_INTR_BUDDYSHARE
-			    || adapter->num_rx_queues != 2) {
+			    || adapter->num_rx_queues != 1) {
 				adapter->share_intr = VMXNET3_INTR_TXSHARE;
 				printk(KERN_ERR "Number of rx queues : 1\n");
 				adapter->num_rx_queues = 1;
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 474f5df..fb5d245 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -68,10 +68,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.0.16.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.0.25.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01001000
+#define VMXNET3_DRIVER_VERSION_NUM      0x01001900
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
@@ -289,7 +289,7 @@ struct vmxnet3_rx_queue {
 
 #define VMXNET3_LINUX_MAX_MSIX_VECT     (VMXNET3_DEVICE_MAX_TX_QUEUES + \
 					 VMXNET3_DEVICE_MAX_RX_QUEUES + 1)
-#define VMXNET3_LINUX_MIN_MSIX_VECT     3    /* 1 for each : tx, rx and event */
+#define VMXNET3_LINUX_MIN_MSIX_VECT     2 /* 1 for tx-rx pair and 1 for event */
 
 
 struct vmxnet3_intr {

^ permalink raw reply related

* [PATCH] RFC: ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
From: Lucian Adrian Grijincu @ 2011-01-15  2:46 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Nick Piggin,
	Al Viro, Christoph Hellwig, Lucian Adrian Grijincu, Dave Chinner,
	Neil Horman, Eric Dumazet, Alexey Dobriyan, Octavian Purdila,
	Vlad Dogaru

[-- Attachment #1: Type: text/plain, Size: 1802 bytes --]

For each network device DEVNAME that supports ipv4 a new table was
registered in net/ipv4/conf/DEVNAME/. The sysctl table was identical
for all network devices, except for the ->data, ->extra1 and ->extra2
members.

However, given the 'struct net_device*' corresponding to the device
DEVNAME we can compute data and extra1.

extra2 cannot be computed, it is the 'struct net*' of the device and
it is used to find the correct 'struct net_device*' with this name
(e.g. we will have 'lo' in every network namespace).

We cannot use the current process' network namespace. For example,
after running this code:

   int fd = open("/proc/sys/net/ipv4/conf/default/tag", O_RDONLY);
   unshare(CLONE_NEWNET);

'fd' points to a sysctl for a network namespace different from the
current process' network namespace.

To gain access to the name of the directory above a file, sysctl
handlers are passed an extra argument: the 'struct file*'
corresponding to the file. From the file we walk up one level to find
the name of the device. None of the other handlers were changed to
receive this extra parameter, but due to C's calling convention they
shouldn't care.

Each table has 26 entries (25 files + 1 sentinel). For each net device
this patch should save:
* 32bit: 26 * 36 = 936 bytes
* 64bit: 26 * 64 = 1664 bytes

**This patch was not thoroughly tested, I'm just looking for feedback
whether the approach is sound and could be applied. Similar changes
can be implemented for ipv6 too.**

Signed-off-by: Lucian Adrian Grijincu <lucian.grijincu@gmail.com>
---
 fs/proc/proc_sysctl.c      |   11 ++-
 include/linux/inetdevice.h |   14 +++-
 include/net/netns/ipv4.h   |   14 +++
 net/ipv4/devinet.c         |  236 +++++++++++++++++++++++++++++---------------
 4 files changed, 194 insertions(+), 81 deletions(-)

[-- Attachment #2: 0001-RFC-ipv4-share-sysctl-net-ipv4-conf-DEVNAME-tables.patch --]
[-- Type: text/x-patch, Size: 15497 bytes --]

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92..901bfe3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -129,6 +129,11 @@ out:
 	return err;
 }
 
+
+typedef int proc_handler_extended(struct ctl_table *ctl, int write,
+				  void __user *buffer, size_t *lenp, loff_t *ppos,
+				  struct file *filp);
+
 static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 		size_t count, loff_t *ppos, int write)
 {
@@ -137,6 +142,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	ssize_t error;
 	size_t res;
+	proc_handler_extended *phx = (proc_handler_extended *) table->proc_handler;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -156,7 +162,10 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 
 	/* careful: calling conventions are nasty here */
 	res = count;
-	error = table->proc_handler(table, write, buf, &res, ppos);
+	/* most handlers only use the first 5 arguments (without @filp).
+	 * Changing all is too much of work, as, at the time of writting only
+	 * one proc_handler knows about and uses the @filp. */
+	error = phx(table, write, buf, &res, ppos, filp);
 	if (!error)
 		error = res;
 out:
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ae8fdc5..abe2e25 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -43,8 +43,20 @@ enum
 
 #define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1)
 
+
+struct devinet_sysctl {
+	/*
+	 * dev_name holds a copy of dev_name, because '.procname' is regarded as const
+	 * by sysctl and we wouldn't want anyone to change it under our feet
+	 * (see SIOCSIFNAME).
+	 */
+	char *dev_name;
+	struct ctl_table_header *sysctl_header;
+};
+
+
 struct ipv4_devconf {
-	void	*sysctl;
+	struct devinet_sysctl devinet_sysctl;
 	int	data[IPV4_DEVCONF_MAX];
 	DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
 };
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d68c3f1..5210215 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -7,6 +7,7 @@
 
 #include <net/inet_frag.h>
 
+struct ctl_table;
 struct ctl_table_header;
 struct ipv4_devconf;
 struct fib_rules_ops;
@@ -19,6 +20,19 @@ struct netns_ipv4 {
 	struct ctl_table_header	*frags_hdr;
 	struct ctl_table_header	*ipv4_hdr;
 	struct ctl_table_header *route_hdr;
+	/* This holds the contents of /proc/sys/net/ipv4/conf/$DEV_NAME/
+	 *
+	 * The devinet_sysctl_table is shared by all network devices
+	 * in the same network namespace, but each network namespace
+	 * needs it's own copy of the table (the ->extra2 member of
+	 * each ctl_table must point to the corresponding 'struct net*').
+	 *
+	 * The ->data member is not used as defined in this table. The
+	 * proc_handler determines the apropiate data location based
+	 * on the 'struct net_device*' having the same name as
+	 * $DEV_NAME above.
+	 */
+	struct ctl_table	*devinet_sysctl_table;
 #endif
 	struct ipv4_devconf	*devconf_all;
 	struct ipv4_devconf	*devconf_dflt;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b..d03dfdc 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -158,7 +158,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
 		goto out;
 	memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
 			sizeof(in_dev->cnf));
-	in_dev->cnf.sysctl = NULL;
+	in_dev->cnf.devinet_sysctl.dev_name = NULL;
+	in_dev->cnf.devinet_sysctl.sysctl_header = NULL;
 	in_dev->dev = dev;
 	in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
 	if (!in_dev->arp_parms)
@@ -1375,9 +1376,56 @@ static void inet_forward_change(struct net *net)
 	}
 }
 
-static int devinet_conf_proc(ctl_table *ctl, int write,
-			     void __user *buffer,
-			     size_t *lenp, loff_t *ppos)
+
+
+static int devinet_conf_handler(ctl_table *ctl, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos,
+				struct file *filp,
+				proc_handler *proc_handler)
+{
+	/* The path to this file is of the form /proc/sys/net/ipv4/conf/$DEVNAME/$CTL
+	 *
+	 * To save space, ctl_table is shared between all network
+	 * devices in the same network namespace, but we need to
+	 * change the data corresponding to the $DEVNAME network
+	 * device, not any other's.
+	 *
+	 * Use $DEVNAME to obtain the coresponding ipv4_devconf.
+	 */
+	struct net *net = ctl->extra2;
+	const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;
+	struct ctl_table tmp_ctl = *ctl;
+	struct net_device *dev = NULL;
+	struct ipv4_devconf *cnf;
+	int ret;
+
+	if (strcmp(dev_name, "all") == 0) {
+		cnf = net->ipv4.devconf_all;
+	} else if (strcmp(dev_name, "default") == 0) {
+		cnf = net->ipv4.devconf_dflt;
+	} else {
+		/* the device could have been renamed (SIOCSIFADDR) or
+		 * deleted since we started accessing it's proc sysctl */
+		dev = dev_get_by_name(net, dev_name);
+		if (dev == NULL)
+			return -ENOENT;
+		cnf = &in_dev_get(dev)->cnf;
+	}
+
+	tmp_ctl.data += (char *)cnf - (char *)&ipv4_devconf;
+	tmp_ctl.extra1 = cnf;
+
+	ret = proc_handler(&tmp_ctl, write, buffer, lenp, ppos);
+
+	if (dev)
+		dev_put(dev);
+	return ret;
+}
+
+static int __devinet_conf_proc(ctl_table *ctl, int write,
+			       void __user *buffer,
+			       size_t *lenp, loff_t *ppos)
 {
 	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
@@ -1395,9 +1443,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 	return ret;
 }
 
-static int devinet_sysctl_forward(ctl_table *ctl, int write,
-				  void __user *buffer,
-				  size_t *lenp, loff_t *ppos)
+static int __devinet_sysctl_forward(ctl_table *ctl, int write,
+				    void __user *buffer,
+				    size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
@@ -1430,9 +1478,10 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 	return ret;
 }
 
-static int ipv4_doint_and_flush(ctl_table *ctl, int write,
-				void __user *buffer,
-				size_t *lenp, loff_t *ppos)
+
+static int __ipv4_doint_and_flush(ctl_table *ctl, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
 {
 	int *valp = ctl->data;
 	int val = *valp;
@@ -1445,6 +1494,33 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	return ret;
 }
 
+static int devinet_conf_proc(ctl_table *ctl, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos,
+			     struct file *filp)
+{
+	return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+				    __devinet_conf_proc);
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos,
+				  struct file *filp)
+{
+	return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+				    __devinet_sysctl_forward);
+}
+
+static int ipv4_doint_and_flush(ctl_table *ctl, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos,
+				struct file *filp)
+{
+	return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+				    __ipv4_doint_and_flush);
+}
+
 #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
 	{ \
 		.procname	= name, \
@@ -1454,6 +1530,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 		.mode		= mval, \
 		.proc_handler	= proc, \
 		.extra1		= &ipv4_devconf, \
+		.extra2		= &init_net, \
 	}
 
 #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
@@ -1468,51 +1545,45 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
 #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
 	DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
 
-static struct devinet_sysctl_table {
-	struct ctl_table_header *sysctl_header;
-	struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
-	char *dev_name;
-} devinet_sysctl = {
-	.devinet_vars = {
-		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
-					     devinet_sysctl_forward),
-		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
-
-		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
-		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
-		DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
-		DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
-		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
-		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
-					"accept_source_route"),
-		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
-		DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
-		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
-		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
-		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
-		DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
-		DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
-		DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
-		DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
-		DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
-		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
-		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
-		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
-
-		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
-		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-		DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
-					      "force_igmp_version"),
-		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
-					      "promote_secondaries"),
-	},
+struct ctl_table ipv4_devinet_sysctl_table[__IPV4_DEVCONF_MAX] = {
+	DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+				     devinet_sysctl_forward),
+	DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+
+	DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+	DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+	DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+	DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+	DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+	DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+				"accept_source_route"),
+	DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+	DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+	DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+	DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+	DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+	DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+	DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+	DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+	DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+	DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+	DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+	DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+	DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+
+	DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+	DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+	DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
+				      "force_igmp_version"),
+	DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+				      "promote_secondaries"),
+	{ }
 };
 
 static int __devinet_sysctl_register(struct net *net, char *dev_name,
-					struct ipv4_devconf *p)
+				     struct ipv4_devconf *cnf)
 {
-	int i;
-	struct devinet_sysctl_table *t;
+	struct devinet_sysctl *dsys = &cnf->devinet_sysctl;
 
 #define DEVINET_CTL_PATH_DEV	3
 
@@ -1524,54 +1595,42 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
 		{ },
 	};
 
-	t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
-	if (!t)
-		goto out;
-
-	for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
-		t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
-		t->devinet_vars[i].extra1 = p;
-		t->devinet_vars[i].extra2 = net;
-	}
-
 	/*
 	 * Make a copy of dev_name, because '.procname' is regarded as const
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */
-	t->dev_name = kstrdup(dev_name, GFP_KERNEL);
-	if (!t->dev_name)
-		goto free;
+	dsys->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	if (!dsys->dev_name)
+		goto out;
 
-	devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+	devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = dsys->dev_name;
 
-	t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
-			t->devinet_vars);
-	if (!t->sysctl_header)
+	dsys->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
+						net->ipv4.devinet_sysctl_table);
+	if (!dsys->sysctl_header)
 		goto free_procname;
 
-	p->sysctl = t;
 	return 0;
 
 free_procname:
-	kfree(t->dev_name);
-free:
-	kfree(t);
+	kfree(dsys->dev_name);
 out:
 	return -ENOBUFS;
 }
 
 static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 {
-	struct devinet_sysctl_table *t = cnf->sysctl;
+	struct devinet_sysctl *dsys = &cnf->devinet_sysctl;
 
-	if (t == NULL)
+	if (dsys == NULL)
 		return;
 
-	cnf->sysctl = NULL;
-	unregister_sysctl_table(t->sysctl_header);
-	kfree(t->dev_name);
-	kfree(t);
+	unregister_sysctl_table(dsys->sysctl_header);
+	kfree(dsys->dev_name);
+
+	dsys->dev_name = NULL;
+	dsys->sysctl_header = NULL;
 }
 
 static void devinet_sysctl_register(struct in_device *idev)
@@ -1610,9 +1669,10 @@ static __net_initdata struct ctl_path net_ipv4_path[] = {
 
 static __net_init int devinet_init_net(struct net *net)
 {
-	int err;
+	int i, err;
 	struct ipv4_devconf *all, *dflt;
 #ifdef CONFIG_SYSCTL
+	struct ctl_table *devinet_sysctl_table;
 	struct ctl_table *tbl = ctl_forward_entry;
 	struct ctl_table_header *forw_hdr;
 #endif
@@ -1620,6 +1680,7 @@ static __net_init int devinet_init_net(struct net *net)
 	err = -ENOMEM;
 	all = &ipv4_devconf;
 	dflt = &ipv4_devconf_dflt;
+	devinet_sysctl_table = &ipv4_devinet_sysctl_table[0];
 
 	if (!net_eq(net, &init_net)) {
 		all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
@@ -1638,10 +1699,23 @@ static __net_init int devinet_init_net(struct net *net)
 		tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
 		tbl[0].extra1 = all;
 		tbl[0].extra2 = net;
+
+		devinet_sysctl_table = kmemdup(ipv4_devinet_sysctl_table,
+					       sizeof(ipv4_devinet_sysctl_table),
+					       GFP_KERNEL);
+		if (devinet_sysctl_table == NULL)
+			goto err_alloc_devinet_sysctl_table;
+
+		/* the last element in the table is { } and must remain so */
+		for (i = 0; i < ARRAY_SIZE(ipv4_devinet_sysctl_table) - 1; i++) {
+			devinet_sysctl_table->extra2 = net;
+		}
 #endif
 	}
 
 #ifdef CONFIG_SYSCTL
+	net->ipv4.devinet_sysctl_table = devinet_sysctl_table;
+
 	err = __devinet_sysctl_register(net, "all", all);
 	if (err < 0)
 		goto err_reg_all;
@@ -1667,6 +1741,9 @@ err_reg_ctl:
 err_reg_dflt:
 	__devinet_sysctl_unregister(all);
 err_reg_all:
+	if (devinet_sysctl_table != &ipv4_devinet_sysctl_table[0])
+		kfree(devinet_sysctl_table);
+err_alloc_devinet_sysctl_table:
 	if (tbl != ctl_forward_entry)
 		kfree(tbl);
 err_alloc_ctl:
@@ -1689,6 +1766,7 @@ static __net_exit void devinet_exit_net(struct net *net)
 	unregister_net_sysctl_table(net->ipv4.forw_hdr);
 	__devinet_sysctl_unregister(net->ipv4.devconf_dflt);
 	__devinet_sysctl_unregister(net->ipv4.devconf_all);
+	kfree(net->ipv4.devinet_sysctl_table);
 	kfree(tbl);
 #endif
 	kfree(net->ipv4.devconf_dflt);

^ permalink raw reply related

* Re: [PATCH] CHOKe flow scheduler (0.9)
From: Eric Dumazet @ 2011-01-15  7:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Patrick McHardy, David Miller, netdev
In-Reply-To: <20110114154521.54cc8ef5@nehalam>

Le vendredi 14 janvier 2011 à 15:45 -0800, Stephen Hemminger a écrit :
> CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
> packet scheduler based on the Random Exponential Drop (RED) algorithm.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) 
> 	     Queue the new packet
> 	else 
> 	     Select randomly a packet from the queue 
> 	     if (both packets from same flow)
> 	     then Drop both the packets
> 	     else if (Qave > maxth)
> 	          Drop packet
> 	     else
> 	       	  Admit packet with proability p (same as RED)
> 
> See also:
>   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>    queue management scheme for approximating fair bandwidth allocation", 
>   Proceeding of INFOCOM'2000, March 2000.
> 
> Help from:
>      Eric Dumazet <eric.dumazet@gmail.com>
>      Patrick McHardy <kaber@trash.net>
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
> This version is based on net-next, and assumes Eric's patch for
> corrected bstats is already applied.
> 
> 0.9 incorporate patches from Patrick/Eric
>     rework the peek_random and drop code to simplify and fix bug where
>     random_N needs to called with full length (including holes).

Nice catch, I now have more "matched" counts after my test :

qdisc choke 11: parent 1:11 limit 130000b min 10833b max 32500b ewma 13 Plog 21 Scell_log 30
 Sent 93944198 bytes 170889 pkt (dropped 829140, overlimits 436686 requeues 0) 
 rate 48bit 0pps backlog 0b 0p requeues 0 
  marked 0 early 436686 pdrop 0 other 0 matched 196227

You missed the qdisc_bstats_update() move from enqueue() to dequeue()

And some minor CodingStyle / checkpatch.pl changes, here is my
latest diff on top of 0.9

I believe you can release v1 :)

Thanks !

 net/sched/sch_choke.c |   35 ++++++++++++++++++-----------------
 1 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 55eeb7d..a7605a0 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -20,7 +20,7 @@
 #include <net/red.h>
 
 /*	CHOKe stateless AQM for fair bandwidth allocation
-        =================================================
+	=================================================
 
    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
@@ -137,10 +137,10 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
 }
 
 /* Classify flow using either:
-   1. pre-existing classification result in skb
-   2. fast internal classification
-   3. use TC filter based classification
-*/
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification (rxhash)
+ * 3. use TC filter based classification
+ */
 static unsigned int choke_classify(struct sk_buff *skb,
 				   struct Qdisc *sch, int *qerr)
 
@@ -176,10 +176,9 @@ static unsigned int choke_classify(struct sk_buff *skb,
 }
 
 /* Select a packet at random from the queue and return flow classification
-   returns 0 if queue empty
+ * returns 0 if queue empty
  */
-static unsigned int choke_peek_random(struct Qdisc *sch,
-				      unsigned int *pidx)
+static unsigned int choke_peek_random(struct Qdisc *sch, unsigned int *pidx)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 	const struct sk_buff *skb;
@@ -229,9 +228,9 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		red_end_of_idle_period(p);
 
 	/* Is queue small? */
-	if (p->qavg <= p->qth_min)
+	if (p->qavg <= p->qth_min) {
 		p->qcount = -1;
-	else {
+	} else {
 		unsigned int idx;
 
 		/* Draw a packet at random from queue and compare flow */
@@ -266,8 +265,9 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 				q->stats.prob_mark++;
 			}
-		} else
+		} else {
 			p->qR = red_random(p);
+		}
 	}
 
 	/* Admit new packet */
@@ -276,7 +276,6 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->tail = (q->tail + 1) & q->tab_mask;
 		++sch->q.qlen;
 		sch->qstats.backlog += qdisc_pkt_len(skb);
-		qdisc_bstats_update(sch, skb);
 		return NET_XMIT_SUCCESS;
 	}
 
@@ -306,6 +305,7 @@ static struct sk_buff *choke_dequeue(struct Qdisc *sch)
 	choke_zap_head_holes(q);
 	--sch->q.qlen;
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
 
 	return skb;
 }
@@ -316,9 +316,9 @@ static unsigned int choke_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	len = qdisc_queue_drop(sch);
-	if (len > 0)
+	if (len > 0) {
 		q->stats.other++;
-	else {
+	} else {
 		if (!red_is_idling(&q->parms))
 			red_start_of_idle_period(&q->parms);
 	}
@@ -326,7 +326,7 @@ static unsigned int choke_drop(struct Qdisc *sch)
 	return len;
 }
 
-static void choke_reset(struct Qdisc* sch)
+static void choke_reset(struct Qdisc *sch)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 
@@ -410,8 +410,9 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 
 		q->tab_mask = mask;
 		q->tab = ntab;
-	} else
+	} else {
 		sch_tree_lock(sch);
+	}
 
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
@@ -428,7 +429,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
 	return 0;
 }
 
-static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	return choke_change(sch, opt);
 }



^ permalink raw reply related

* Re: [PATCH] RFC: ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
From: Alexey Dobriyan @ 2011-01-15 10:41 UTC (permalink / raw)
  To: Lucian Adrian Grijincu
  Cc: netdev, David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Nick Piggin,
	Al Viro, Christoph Hellwig, Dave Chinner, Neil Horman,
	Eric Dumazet, Octavian Purdila, Vlad Dogaru
In-Reply-To: <AANLkTikVDn_0c+Ob_XnUauEMBZhsYCh+42QF7FwM=Bey@mail.gmail.com>

On Sat, Jan 15, 2011 at 04:46:11AM +0200, Lucian Adrian Grijincu wrote:
> To gain access to the name of the directory above a file, sysctl
> handlers are passed an extra argument: the 'struct file*'
> corresponding to the file. From the file we walk up one level to find
> the name of the device. None of the other handlers were changed to
> receive this extra parameter, but due to C's calling convention they
> shouldn't care.

We don't do creepy stuff like that.

I wonder where interactions with device renaming are handled.

> +static int devinet_conf_handler(ctl_table *ctl, int write,
> +				void __user *buffer,
> +				size_t *lenp, loff_t *ppos,
> +				struct file *filp,
> +				proc_handler *proc_handler)
> +{
> +	/* The path to this file is of the form /proc/sys/net/ipv4/conf/$DEVNAME/$CTL
> +	 *
> +	 * To save space, ctl_table is shared between all network
> +	 * devices in the same network namespace, but we need to
> +	 * change the data corresponding to the $DEVNAME network
> +	 * device, not any other's.
> +	 *
> +	 * Use $DEVNAME to obtain the coresponding ipv4_devconf.
> +	 */
> +	struct net *net = ctl->extra2;
> +	const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;

^ permalink raw reply

* Re: [PATCH] RFC: ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
From: Lucian Adrian Grijincu @ 2011-01-15 11:09 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: netdev, David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Nick Piggin,
	Al Viro, Christoph Hellwig, Dave Chinner, Neil Horman,
	Eric Dumazet, Octavian Purdila, Vlad Dogaru
In-Reply-To: <20110115104139.GA4816@p183.telecom.by>

On Sat, Jan 15, 2011 at 12:41 PM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
> On Sat, Jan 15, 2011 at 04:46:11AM +0200, Lucian Adrian Grijincu wrote:
>> To gain access to the name of the directory above a file, sysctl
>> handlers are passed an extra argument: the 'struct file*'
>> corresponding to the file. From the file we walk up one level to find
>> the name of the device. None of the other handlers were changed to
>> receive this extra parameter, but due to C's calling convention they
>> shouldn't care.
>
> We don't do creepy stuff like that.


I did this this way to not waste time changing all the handlers in the
tree and then get this patch struck down as uninterested, ugly-hack or
being suggested another more sensible, yet completely different
approach.


> I wonder where interactions with device renaming are handled.


I took two things taken into consideration:
* the .procname of the device directory has it's own copy of the device name
  This was inherited from the previous version

* when looking for a "struct net_device*" we might not find any:

+		/* the device could have been renamed (SIOCSIFADDR) or
+		 * deleted since we started accessing it's proc sysctl */
+		dev = dev_get_by_name(net, dev_name);
+		if (dev == NULL)
+			return -ENOENT;


I'm not sure (will look into it later) whether
   filp->f_path.dentry->d_parent->d_name.name;
is still valid if a rename is running concurrently.


On device rename we run this

net/ipv4/devinet.c:
static int inetdev_event(struct notifier_block *this, unsigned long event,
			 void *ptr)
....
	case NETDEV_CHANGENAME:
		/* Do not notify about label change, this event is
		 * not interesting to applications using netlink.
		 */
		inetdev_changename(dev, in_dev);

		devinet_sysctl_unregister(in_dev);
		devinet_sysctl_register(in_dev);


I'm not sure whether this code can run in parallel with the
proc_handlers and if it can if it will invalidate
   filp->f_path.dentry->d_parent->d_name.name;


but I'll look into it.

Is there anything else that I should check regarding net device renaming?

-- 
 .
..: Lucian

^ permalink raw reply

* [PATCH resend] netfilter: change IPS_*_BIT to IPS_*
From: Changli Gao @ 2011-01-15 11:20 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, netfilter-devel, netdev, Changli Gao,
	Tim Gardner, Eric Dumazet

My previous patch made a mistake when converting atomic_set to a normal
bit 'or'. IPS_*_BIT should be replaced with IPS_*.

  commit 76a2d3bcfcc86e2a8044258515b86492a37631a3
  Author: Changli Gao <xiaosuo@gmail.com>
  Date:   Mon Nov 15 11:59:03 2010 +0100

  netfilter: nf_nat: don't use atomic bit operation

  As we own the conntrack and the others can't see it until we confirm it,
  we don't need to use atomic bit operation on ct->status.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Cc: Tim Gardner <tim.gardner@canonical.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/netfilter/nf_nat_core.h |    4 ++--
 net/ipv4/netfilter/nf_nat_core.c    |    4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h
index 5aec85c..3dc7b98 100644
--- a/include/net/netfilter/nf_nat_core.h
+++ b/include/net/netfilter/nf_nat_core.h
@@ -21,9 +21,9 @@ static inline int nf_nat_initialized(struct nf_conn *ct,
 				     enum nf_nat_manip_type manip)
 {
 	if (manip == IP_NAT_MANIP_SRC)
-		return ct->status & IPS_SRC_NAT_DONE_BIT;
+		return ct->status & IPS_SRC_NAT_DONE;
 	else
-		return ct->status & IPS_DST_NAT_DONE_BIT;
+		return ct->status & IPS_DST_NAT_DONE;
 }
 
 struct nlattr;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index eb55835..21e6e92 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -323,9 +323,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 
 	/* It's done. */
 	if (maniptype == IP_NAT_MANIP_DST)
-		ct->status |= IPS_DST_NAT_DONE_BIT;
+		ct->status |= IPS_DST_NAT_DONE;
 	else
-		ct->status |= IPS_SRC_NAT_DONE_BIT;
+		ct->status |= IPS_SRC_NAT_DONE;
 
 	return NF_ACCEPT;
 }

^ permalink raw reply related

* [PATCH v2 resend] netfilter: remove an atomic bit operation
From: Changli Gao @ 2011-01-15 11:23 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David S. Miller, netfilter-devel, netdev, Changli Gao,
	Tim Gardner, Eric Dumazet

As this ct won't be seen by the others, we don't need to set the
IPS_CONFIRMED_BIT in atomic way.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Cc: Tim Gardner <tim.gardner@canonical.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
v2: IPS_CONFIRMED_BIT is changed to IPS_CONFIRMED.
 net/netfilter/nf_conntrack_core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 27a5ea6..c708248 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -486,7 +486,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	ct->timeout.expires += jiffies;
 	add_timer(&ct->timeout);
 	atomic_inc(&ct->ct_general.use);
-	set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	ct->status |= IPS_CONFIRMED;
 
 	/* Since the lookup is lockless, hash insertion must be done after
 	 * starting the timer and setting the CONFIRMED bit. The RCU barriers

^ permalink raw reply related

* [PATCH resend] netfilter: place in source hash after SNAT is done
From: Changli Gao @ 2011-01-15 11:28 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netfilter-devel, netdev, Changli Gao

If SNAT isn't done, the wrong info maybe got by the other cts.

As the filter table is after DNAT table, the packets dropped in filter
table also bother bysource hash table.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/ipv4/netfilter/nf_nat_core.c |   18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787c..51ce55a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	   manips not an issue.  */
 	if (maniptype == IP_NAT_MANIP_SRC &&
 	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
-		if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
+		/* try the original tuple first */
+		if (in_range(orig_tuple, range)) {
+			if (!nf_nat_used_tuple(orig_tuple, ct)) {
+				*tuple = *orig_tuple;
+				return;
+			}
+		} else if (find_appropriate_src(net, zone, orig_tuple, tuple,
+			   range)) {
 			pr_debug("get_unique_tuple: Found current src map\n");
 			if (!nf_nat_used_tuple(tuple, ct))
 				return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
 	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
-	int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
 
 	/* nat helper or nfctnetlink also setup binding */
 	nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 			ct->status |= IPS_DST_NAT;
 	}
 
-	/* Place in source hash if this is the first time. */
-	if (have_to_hash) {
+	if (maniptype == IP_NAT_MANIP_SRC) {
 		unsigned int srchash;
 
 		srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -532,7 +537,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 	if (nat == NULL || nat->ct == NULL)
 		return;
 
-	NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
+	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
 
 	spin_lock_bh(&nf_nat_lock);
 	hlist_del_rcu(&nat->bysource);
@@ -545,11 +550,10 @@ static void nf_nat_move_storage(void *new, void *old)
 	struct nf_conn_nat *old_nat = old;
 	struct nf_conn *ct = old_nat->ct;
 
-	if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
+	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
 		return;
 
 	spin_lock_bh(&nf_nat_lock);
-	new_nat->ct = ct;
 	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
 	spin_unlock_bh(&nf_nat_lock);
 }

^ permalink raw reply related

* [PATCH resend] netfilter: make rcu read section smaller
From: Changli Gao @ 2011-01-15 11:28 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netfilter-devel, netdev, Changli Gao
In-Reply-To: <1295090930-16671-1-git-send-email-xiaosuo@gmail.com>

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/ipv4/netfilter/nf_nat_core.c |   38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787c..7300611 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -85,7 +85,7 @@ in_range(const struct nf_conntrack_tuple *tuple,
 	 const struct nf_nat_range *range)
 {
 	const struct nf_nat_protocol *proto;
-	int ret = 0;
+	int ret = 1;
 
 	/* If we are supposed to map IPs, then we must be in the
 	   range specified, otherwise let this drag us onto a new src IP. */
@@ -95,13 +95,14 @@ in_range(const struct nf_conntrack_tuple *tuple,
 			return 0;
 	}
 
-	rcu_read_lock();
-	proto = __nf_nat_proto_find(tuple->dst.protonum);
-	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
-	    proto->in_range(tuple, IP_NAT_MANIP_SRC,
-			    &range->min, &range->max))
-		ret = 1;
-	rcu_read_unlock();
+	if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+		rcu_read_lock();
+		proto = __nf_nat_proto_find(tuple->dst.protonum);
+		if (!proto->in_range(tuple, IP_NAT_MANIP_SRC, &range->min,
+				     &range->max))
+			ret = 0;
+		rcu_read_unlock();
+	}
 
 	return ret;
 }
@@ -235,22 +236,21 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 	/* 3) The per-protocol part of the manip is made to map into
 	   the range to make a unique tuple. */
+	if (!(range->flags & (IP_NAT_RANGE_PROTO_RANDOM |
+			      IP_NAT_RANGE_PROTO_SPECIFIED)) &&
+	    !nf_nat_used_tuple(tuple, ct))
+		return;
 
 	rcu_read_lock();
 	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 
 	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
-		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
-			if (proto->in_range(tuple, maniptype, &range->min,
-					    &range->max) &&
-			    (range->min.all == range->max.all ||
-			     !nf_nat_used_tuple(tuple, ct)))
-				goto out;
-		} else if (!nf_nat_used_tuple(tuple, ct)) {
-			goto out;
-		}
-	}
+	if ((range->flags & (IP_NAT_RANGE_PROTO_RANDOM |
+			     IP_NAT_RANGE_PROTO_SPECIFIED)) ==
+	    IP_NAT_RANGE_PROTO_SPECIFIED &&
+	    proto->in_range(tuple, maniptype, &range->min, &range->max) &&
+	    (range->min.all == range->max.all || !nf_nat_used_tuple(tuple, ct)))
+		goto out;
 
 	/* Last change: get protocol to try to obtain unique tuple. */
 	proto->unique_tuple(tuple, range, maniptype, ct);

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox