Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Stephen Hemminger @ 2009-10-26  6:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Octavian Purdila, netdev
In-Reply-To: <4AE4C1FA.7000002@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1536 bytes --]

I overkilled this with more functions and compared filenames as well.


genarated names (dummyNNNN)
Algorithm             Time (us)     Ratio       Max   StdDev
kr_hash                  277925   152408.6   468448 543.19
string_hash31            329356     5859.4    16042  44.18
SuperFastHash            324570     4885.9    10502   2.29
djb2                     327908     5608.5    15210  38.08
string_hash17            326769     4883.6     9896   0.76
full_name_hash           343196    63921.0   140628 343.62
jhash_string             463801     4883.8    10085   1.02
sdbm                     398587     9801.7    29634  99.18

filesystem names
Algorithm             Time (us)     Ratio       Max   StdDev
kr_hash                  278840   152314.9   468717 543.01
string_hash31            331206     5802.1    16004  42.87
SuperFastHash            325938     4887.5    13528   2.88
djb2                     330621     5607.1    15333  38.05
string_hash17            331181     4884.9    13274   1.78
full_name_hash           347312    63972.2   141336 343.77
jhash_string             466799     4885.2    13275   1.92
sdbm                     403691     9771.7    29629  98.88

Ratio is the average number of buckets examined when scanning
the whole set of names.


1) Increased hash buckets to 1024 which seems reasonable if we are
   going to test that many names.
2) Increased name size to 256 so that longer filenames could be
   checked and name blocks were not in same cache line

* SuperFastHash is too big to put inline


[-- Attachment #2: hashtest.c --]
[-- Type: text/x-c++src, Size: 6872 bytes --]

#include <stdio.h>
#include <stdint.h>
#include <malloc.h>
#include <sys/types.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>

static unsigned int
kr_hash(const unsigned char *str, unsigned int len)
{
	unsigned int hash = 0;

        while (len--)
	    hash += *str++;

	return hash;
}

static unsigned int
sdbm(const unsigned char *name, unsigned int len)
{
        unsigned long hash = 0;

        while (len--)
            hash = *name++ + (hash << 6) + (hash << 16) - hash;

        return hash;
}

#define init_name_hash()                0

/* partial hash update function. Assume roughly 4 bits per character */
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
        return (prevhash + (c << 4) + (c >> 4)) * 11;
}

/* Compute the hash for a name string. */
static unsigned int
full_name_hash(const unsigned char *name, unsigned int len)
{
        unsigned long hash = 0;
        while (len--)
                hash = partial_name_hash(*name++, hash);
        return hash;
}

static unsigned int
djb2(const unsigned char *str, unsigned int len)
{
	unsigned long hash = 5381;

        while (len--)
		hash = ((hash << 5) + hash) + *str++; /* hash * 33 + c */

        return hash;
}

static unsigned int
string_hash31(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 31 * hash + name[i];
        return hash;
}

static unsigned int
string_hash17(const unsigned char *name, unsigned int len)
{
        unsigned hash = 0;
        int i;

        for (i = 0; i < len; ++i)
                hash = 17 * hash + name[i];
        return hash;
}

/* will need to change to unaligned_access() */
static inline uint16_t get16bits(const unsigned char *data)
{
	return *(uint16_t *) data;
}

static inline unsigned int
SuperFastHash (const unsigned char * data, unsigned int len)
{
	uint32_t hash = len, tmp;
	int rem;

	rem = len & 3;
	len >>= 2;

	/* Main loop */
	for (;len > 0; len--) {
		hash  += get16bits (data);
		tmp    = (get16bits (data+2) << 11) ^ hash;
		hash   = (hash << 16) ^ tmp;
		data  += 2*sizeof (uint16_t);
		hash  += hash >> 11;
	}

	/* Handle end cases */
	switch (rem) {
        case 3: hash += get16bits (data);
                hash ^= hash << 16;
                hash ^= data[sizeof (uint16_t)] << 18;
                hash += hash >> 11;
                break;
        case 2: hash += get16bits (data);
                hash ^= hash << 11;
                hash += hash >> 17;
                break;
        case 1: hash += *data;
                hash ^= hash << 10;
                hash += hash >> 1;
	}

	/* Force "avalanching" of final 127 bits */
	hash ^= hash << 3;
	hash += hash >> 5;
	hash ^= hash << 4;
	hash += hash >> 17;
	hash ^= hash << 25;
	hash += hash >> 6;

	return hash;
}


/* NOTE: Arguments are modified. */
#define __jhash_mix(a, b, c) \
{ \
  a -= b; a -= c; a ^= (c>>13); \
  b -= c; b -= a; b ^= (a<<8); \
  c -= a; c -= b; c ^= (b>>13); \
  a -= b; a -= c; a ^= (c>>12);  \
  b -= c; b -= a; b ^= (a<<16); \
  c -= a; c -= b; c ^= (b>>5); \
  a -= b; a -= c; a ^= (c>>3);  \
  b -= c; b -= a; b ^= (a<<10); \
  c -= a; c -= b; c ^= (b>>15); \
}

/* The golden ration: an arbitrary value */
#define JHASH_GOLDEN_RATIO	0x9e3779b9

/* The most generic version, hashes an arbitrary sequence
 * of bytes.  No alignment or length assumptions are made about
 * the input key.
 */
static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval)
{
	uint32_t a, b, c, len;
	const uint8_t *k = key;

	len = length;
	a = b = JHASH_GOLDEN_RATIO;
	c = initval;

	while (len >= 12) {
		a += (k[0] +((uint32_t)k[1]<<8) +((uint32_t)k[2]<<16) +((uint32_t)k[3]<<24));
		b += (k[4] +((uint32_t)k[5]<<8) +((uint32_t)k[6]<<16) +((uint32_t)k[7]<<24));
		c += (k[8] +((uint32_t)k[9]<<8) +((uint32_t)k[10]<<16)+((uint32_t)k[11]<<24));

		__jhash_mix(a,b,c);

		k += 12;
		len -= 12;
	}

	c += length;
	switch (len) {
	case 11: c += ((uint32_t)k[10]<<24);
	case 10: c += ((uint32_t)k[9]<<16);
	case 9 : c += ((uint32_t)k[8]<<8);
	case 8 : b += ((uint32_t)k[7]<<24);
	case 7 : b += ((uint32_t)k[6]<<16);
	case 6 : b += ((uint32_t)k[5]<<8);
	case 5 : b += k[4];
	case 4 : a += ((uint32_t)k[3]<<24);
	case 3 : a += ((uint32_t)k[2]<<16);
	case 2 : a += ((uint32_t)k[1]<<8);
	case 1 : a += k[0];
	};

	__jhash_mix(a,b,c);

	return c;
}

static unsigned int
jhash_string(const unsigned char *name, unsigned int len)
{
	return jhash(name, len, 0);
}

#define TESTSIZE	10000000ul
#define HASHBITS	10
#define HASHSZ		(1<<HASHBITS)
#define HASHMSK		(HASHSZ-1)
#define IFNAMSIZ	256

static char *names[TESTSIZE];

static void generate_names(void)
{
	unsigned int i;
	char *buf = malloc(TESTSIZE * IFNAMSIZ);

	printf("\ngenarated names (dummyNNNN)\n");
	for (i = 0; i < TESTSIZE; i++) {
		snprintf(buf, IFNAMSIZ, "dummy%d", i);
		names[i] = buf;
		buf += IFNAMSIZ;
	}
}

static inline void measure(const char *name,  
			   unsigned int (*hash)(const unsigned char *, unsigned int))
{
	unsigned int i;
	struct timeval t0, t1;
	unsigned int dist[HASHSZ] = { 0 };
	unsigned long long ratio = 0;
	long us;
	unsigned long m = 0;
	const double avg = TESTSIZE / HASHSZ;
	double std = 0;

	gettimeofday(&t0, NULL);
	
	for (i = 0; i < TESTSIZE; i++) {
		const unsigned char *str = (const unsigned char *) names[i];
		unsigned int h = hash(str, strlen(names[i]));
		++dist[h & HASHMSK];
	}

	gettimeofday(&t1, NULL);
	us = (t1.tv_sec - t0.tv_sec) * 1000000;
	us += (t1.tv_usec - t0.tv_usec);

	for (i = 0; i < HASHSZ; i++) {
		unsigned int n = dist[i];
		unsigned long long s;
		double delta = (n - avg);

		s = n;
		s *= (1 + n);
		ratio += s / 2;
		if (n > m) m = n;
		
		std += delta * delta;
	}

	printf("%-20s %10ld %10.1f %8ld %6.2f\n", name, us, 
	       (double) ratio / (double)TESTSIZE, m, sqrt(std / TESTSIZE));

}
#define MEASURE(func)	measure(#func, &func)


static void filesystem_names(void)
{
	FILE *f = fopen("/tmp/names", "r");
	if (!f) { perror("/tmp/names"); exit(1); }
	unsigned int i = 0;
	char line[IFNAMSIZ];

	printf("\nfilesystem names\n");
	while (fgets(line, sizeof line, f) != NULL) {
		strncpy(names[i], line, IFNAMSIZ);
		if (++i == TESTSIZE)
			break;
	}
	fclose(f);

}


int main()
{

	generate_names();

	printf("Algorithm             Time (us)     Ratio       Max   StdDev\n");
	MEASURE(kr_hash);
	MEASURE(string_hash31);
	MEASURE(SuperFastHash);
	MEASURE(djb2);
	MEASURE(string_hash17);
	MEASURE(full_name_hash);
	MEASURE(jhash_string);
	MEASURE(sdbm);

	filesystem_names();

	printf("Algorithm             Time (us)     Ratio       Max   StdDev\n");
	MEASURE(kr_hash);
	MEASURE(string_hash31);
	MEASURE(SuperFastHash);
	MEASURE(djb2);
	MEASURE(string_hash17);
	MEASURE(full_name_hash);
	MEASURE(jhash_string);
	MEASURE(sdbm);

        return 0;
}

^ permalink raw reply

* Re: [PATCH 1/4] Adds random ect generation to tfrc-sp sender side
From: Gerrit Renker @ 2009-10-26  6:55 UTC (permalink / raw)
  To: Ivo Calado; +Cc: dccp, netdev
In-Reply-To: <cb00fa210910210615v409d09c6p34cd6a48c48196f5@mail.gmail.com>

| >    That is, at the moment both the sender and receiver side of the ECN Nonce
| >    sum verification are placeholders which currently have no effect.
| >
| 
| Okay, then the implementation would be useless now.
I was not suggesting to throw away the patches, we can keep them for
later use.

They are a good starting basis once it makes sense to work with ECN.
Or how can we test ECN if we are not sure that the other parts work
as they are supposed to?

| >  3) Starting an implementation throws up further questions that need to
| >    be addressed, both the basis and the extension need to be verified.
| >
| > I would like to suggest to implement the basis, that is CCID-4 with ECN
| > (using plain ECT(0)), test with that until it works satisfactorily, and
| > then continue adding measures such as the ECN Nonce verification.
| >
| 
| Okay. But, when would be good to at least include random ECT
| generation? When DCCP ECN code will get fixed? Is there any work on
| this?
| 
I asked this on netdev earlier, there was not much enthusiasm.

The issue is that ECN belongs both to the network and the transport layer.
This network layer is in inet_ecn.h, outside of DCCP.

I believe that the changes would not be too hard to do, by changing the
macros. But it requires working with the people on netdev, in particular
to ensure it does not break something in the TCP/SCTP subsystems (both
also use ECN, and then there are also raw sockets).

I also have an interest in resolving this, due to the ugliness at the
moment for enabling ECN on IPv6.

RFC 2884, written by one of the Linux ECN developers, describes early
IPv4 ECN evaluation. It seems that initially it was planned to only 
support ECN over Ipv4, parts of the code may be still from that time.

We can pursue this in parallel to the other issues. Ideally this would be
resolved at the time the other parts of CCID-4 are ready for testing.

| > In summary, I would like to suggest to remove the ECN verification for
| > the moment and focus on the "basic" issues first.
| >
| > Would you be ok with that?
| >
| 
| Yes, we'll keep the ECN verification code here at our git until the
| scenario is ready.
| 
I was going to suggest to put them onto a webpage, such as yours, or on 
www.erg.abdn.ac.uk/users/gerrit/dccp there is also still some space.

Can you please elaborate how to keep your git tree and the one on
eden-feed synchronized? At the moment I have not made any changes other
than the ones I emailed you about. Is there a way of keeping both trees
in synch without running into versioning difficulties?

(The simplest way I can think of is to keep the patches in a separate
 set, or to spawn a subtree which contains the ECN patches on top of the
 CCID-4 tree.)


| > (Also for later) I wonder how to do the sums, with RFC 3168
| > ECT(0) = 0x2 => !0x2 = 0
| > ECT(1) = 0x1 => !0x1 = 0
| >
| 
| I don't understand. Can you try to explain it? Or cite RFC section
| that address it?
 
The values are from figure 1, page 7, the expressions evaluate both as 0:

void main(void)
{
	printf("!0x2 = %d, !0x1 = %d\n", !0x2, !0x1);
}

^ permalink raw reply

* Re: [PATCH 1/5] page allocator: Always wake kswapd when restarting an allocation attempt after direct reclaim failed
From: David Rientjes @ 2009-10-26  7:10 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Mel Gorman, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, Mohamed Abbas,
	Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev,
	linux-kernel
In-Reply-To: <20091026100019.2F4A.A69D9226@jp.fujitsu.com>

On Mon, 26 Oct 2009, KOSAKI Motohiro wrote:

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index bf72055..5a27896 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1899,6 +1899,12 @@ rebalance:
>  	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
>  		/* Wait for some write requests to complete then retry */
>  		congestion_wait(BLK_RW_ASYNC, HZ/50);
> +
> +		/*
> +		 * While we wait congestion wait, Amount of free memory can
> +		 * be changed dramatically. Thus, we kick kswapd again.
> +		 */
> +		wake_all_kswapd(order, zonelist, high_zoneidx);
>  		goto rebalance;
>  	}
>  

We're blocking to finish writeback of the directly reclaimed memory, why 
do we need to wake kswapd afterwards?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Jarek Poplawski @ 2009-10-26  7:41 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256309191.12174.51.camel@johannes.local>

On Fri, Oct 23, 2009 at 04:46:31PM +0200, Johannes Berg wrote:
> On Fri, 2009-10-23 at 16:39 +0200, Tilman Schmidt wrote:
> 
> > Strange. Then what are the two separate functions netif_rx() and
> > netif_rx_ni() for?
> 
> netif_rx_ni() disables preemption.

You wrote earlier:

> [...] the networking layer needs to have
> packets handed to it with softirqs disabled.

How disabling preemption can fix something which needs softirqs
disabled? Could you be more precise?

> > > This really should be obvious. You're fixing the warning at the source
> > > of the warning, rather than the source of the problem.
> > 
> > Good idea. So please do tell us where the source of the problem is.
> 
> You use netif_rx_ni() instead of netif_rx() at whatever place that
> causes this problem.

This isn't a very precise description either.

Jarek P.

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Johannes Berg @ 2009-10-26  7:44 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <20091026074126.GA6187@ff.dom.local>

[-- Attachment #1: Type: text/plain, Size: 474 bytes --]

On Mon, 2009-10-26 at 07:41 +0000, Jarek Poplawski wrote:

> > netif_rx_ni() disables preemption.
> 
> You wrote earlier:
> 
> > [...] the networking layer needs to have
> > packets handed to it with softirqs disabled.
> 
> How disabling preemption can fix something which needs softirqs
> disabled? Could you be more precise?

No, I wrote that I didn't know. I suppose now that I looked at it I do
know, and only disabling preemption is required.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Eric Dumazet @ 2009-10-26  7:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Octavian Purdila, netdev
In-Reply-To: <20091025233016.6860d9c7@nehalam>

Stephen Hemminger a écrit :
> I overkilled this with more functions and compared filenames as well.
> 
> 
> genarated names (dummyNNNN)
> Algorithm             Time (us)     Ratio       Max   StdDev
> kr_hash                  277925   152408.6   468448 543.19
> string_hash31            329356     5859.4    16042  44.18
> SuperFastHash            324570     4885.9    10502   2.29
> djb2                     327908     5608.5    15210  38.08
> string_hash17            326769     4883.6     9896   0.76
> full_name_hash           343196    63921.0   140628 343.62
> jhash_string             463801     4883.8    10085   1.02
> sdbm                     398587     9801.7    29634  99.18
> 
> filesystem names
> Algorithm             Time (us)     Ratio       Max   StdDev
> kr_hash                  278840   152314.9   468717 543.01
> string_hash31            331206     5802.1    16004  42.87
> SuperFastHash            325938     4887.5    13528   2.88
> djb2                     330621     5607.1    15333  38.05
> string_hash17            331181     4884.9    13274   1.78
> full_name_hash           347312    63972.2   141336 343.77
> jhash_string             466799     4885.2    13275   1.92
> sdbm                     403691     9771.7    29629  98.88
> 
> Ratio is the average number of buckets examined when scanning
> the whole set of names.
> 
> 
> 1) Increased hash buckets to 1024 which seems reasonable if we are
>    going to test that many names.
> 2) Increased name size to 256 so that longer filenames could be
>    checked and name blocks were not in same cache line
> 
> * SuperFastHash is too big to put inline
> 
> 

Thanks Stephen

1) dcache hash is very big on average machines.

2) dcache : We hash last component, against its parent, acting as a base.
   Your hashtest program considers the name as a single entity, giving different
   hash distribution.

3) netdev names are special, since we have only one parent, and
 smaller hash table.

4) jhash is not that expensive, but it might be because of huge working set of your
test program : strings are not in cpu caches and speed is mostly driven by ram bandwidth.

But current full_name_hash() seems a pretty bad choice !

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Jarek Poplawski @ 2009-10-26  7:54 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256543054.28230.6.camel@johannes.local>

On Mon, Oct 26, 2009 at 08:44:14AM +0100, Johannes Berg wrote:
> On Mon, 2009-10-26 at 07:41 +0000, Jarek Poplawski wrote:
> 
> > > netif_rx_ni() disables preemption.
> > 
> > You wrote earlier:
> > 
> > > [...] the networking layer needs to have
> > > packets handed to it with softirqs disabled.
> > 
> > How disabling preemption can fix something which needs softirqs
> > disabled? Could you be more precise?
> 
> No, I wrote that I didn't know. I suppose now that I looked at it I do
> know, and only disabling preemption is required.

But netif_rx has preemption disabled most of the time (by hardirqs
disabling). So maybe disabling preemption isn't the main reason here
either?

Jarek P.

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Johannes Berg @ 2009-10-26  7:58 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Tilman Schmidt, David Miller, hidave.darkstar, linux-kernel, tglx,
	linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <20091026075438.GB6187@ff.dom.local>

[-- Attachment #1: Type: text/plain, Size: 765 bytes --]

On Mon, 2009-10-26 at 07:54 +0000, Jarek Poplawski wrote:

> > No, I wrote that I didn't know. I suppose now that I looked at it I do
> > know, and only disabling preemption is required.
> 
> But netif_rx has preemption disabled most of the time (by hardirqs
> disabling). So maybe disabling preemption isn't the main reason here
> either?

Not for netpoll though, which may or may not be relevant (if I were to
venture a guess I'd say it isn't and it disables preemption to be able
to do the softirq thing)

However, I lost track now of why we're discussing this.

Basically it boils down to using netif_rx() when in (soft)irq, and
netif_rx_ni() when in process context. That could just be an
optimisation, but it's a very valid one.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH v2 8/8] Document future removal of sysctl_tcp_* options
From: Gilad Ben-Yossef @ 2009-10-26  8:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Bill Fink, William Allen Simpson, netdev, Ilpo Järvinen
In-Reply-To: <4AE52DBD.3030805@gmail.com>

Eric Dumazet wrote:

> Bill Fink a écrit :
>   
>> On Sun, 25 Oct 2009, Gilad Ben-Yossef wrote:
>>
>>     
>>> Eric Dumazet wrote:
>>>
>>>
>>> I still think having a global kill switch and per route options better 
>>> (basically use the exiting patch but not retire the global kill 
>>> switch|), but if you must Hgow about we leave the global sysctl as they 
>>> are and just have a two bit route option:
>>>
>>> 0 Use global default
>>> 1 Off
>>> 2 On
>>>
>>> It's kind of funny, because this is what the original patch from 
>>> Comsleep does and I thought it needlessly complicates things.
>>>
>>> So, what do you say - which will it be?
>>>       
>> I personally feel the 2-bit settings are overkill.  What i think
>> makes the most sense is for the global options to act as they always
>> have in the absence of any route specific settings, and for any
>> route specific settings to override the related global settings.
>> This is both simple and maintains backward compatibility.
>>     
>
> Backward compatibility is important, very important, if not the most
> important thing. Then usability comes.
>   
I tend to agree.
> I know some busy servers where adding/changing a single route makes them
> go crazy (because of ip route flush cache)
>
> So if a route is overriding a global conf, and the admin wants to make an
> emergency change during peak hours, he should do it by a global setting,
> or he wont use at all this new stuff, and stay conservative.
>
> Alternative would be to not trigger the flush of cache when changing
> features flags.
>
>   
OK. It really sounds like we should go with my first suggestion: global 
sysctl based kill switches, just as we have now and in addition, the 
ability to kill TCP options per route. The TCP option will be used if 
and only if both kill switches (global and per route) are not set.

What we achieve is:

1. Global kill switches work exactly as they do now, whether you use the 
new per route options or not, so backwards compatible.

2. In addition, if the global kill switch is not in effect, you can also 
kill the options on a per route basis.

I'm going to send third version of the patch to this effect, minus the 
new remote DoS possibility that Ilpo pointed out and leaving the global 
sysctl kill switches be.

If you like it, please ACK ;-)

Thanks,
Gilad



-- 
Gilad Ben-Yossef
Chief Coffee Drinker & CTO
Codefidence Ltd.

Web:   http://codefidence.com
Cell:  +972-52-8260388
Skype: gilad_codefidence
Tel:   +972-8-9316883 ext. 201
Fax:   +972-8-9316884
Email: gilad@codefidence.com

Check out our Open Source technology and training blog - http://tuxology.net

	"Linux is Ir. Ir, of course, is a form of hypereviscerated Reiyk."
		-- Marc Volovic, linux-il, 14 Dec 2000


^ permalink raw reply

* [PATCH v3 0/7] Per route TCP options
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef

Third iteration of patch to allow disablng of TCP SACK, DSCAK, 
time stamp and window scale TCP options on a per route basis, now
with 100% less remote DoS opportunities (thank you Ilpo for 
spotting it ;-)

You usualy want to disable SACK, DSACK, time stamp or window
scale if you've got a piece of broken networking equipment somewhere 
as a stop gap until you can bring a big enough hammer to deal with
the broken network equipment. It doesn't make sense to "punish" the
entire connections going through the machine to destinations not 
related to the broken equipment.

This is doubly true when you're dealing with network containers
used to isolate several virtual domains.

Per route options implemented in free bits in the features route
entry property, which in some cases were reserved by name for these
options, so this does not inflate any structure.

Global sysctl based kill switches for these options are still
preserved, as some people seems to want them, so behaviour
is default to on, unless switched off either globaly or on
per route basis.

Tested on x86 using Qemu/KVM.  

Working but crude matching patch to iproute2 sent earlier to the list.

Patchset based on original work by Ori Finkelman and Yony Amit 
from ComSleep Ltd.

Gilad Ben-Yossef (7):
  Only parse time stamp TCP option in time wait sock
  Allow tcp_parse_options to consult dst entry
  Infrastructure for querying route entry features
  Add the no SACK route option feature
  Allow disabling TCP timestamp options per route
  Allow to turn off TCP window scale opt per route
  Allow disabling of DSACK TCP option per route

 include/linux/rtnetlink.h |    6 ++++--
 include/net/dst.h         |    8 +++++++-
 include/net/tcp.h         |    3 ++-
 net/ipv4/syncookies.c     |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c      |   26 ++++++++++++++++++--------
 net/ipv4/tcp_ipv4.c       |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c  |    8 +++++---
 net/ipv4/tcp_output.c     |   18 +++++++++++++-----
 net/ipv6/syncookies.c     |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c       |    3 ++-
 10 files changed, 92 insertions(+), 56 deletions(-)


^ permalink raw reply

* [PATCH v3 6/7] Allow to turn off TCP window scale opt per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Add and use no window scale bit in the features field.

Note that this is not the same as setting a window scale of 0 
as would happen with window limit on route.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>
---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2ab8c75..6784b34 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -380,6 +380,7 @@ enum
 #define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
+#define RTAX_FEATURE_NO_WSCALE	0x00000010
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d2f9742..4f5e914 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3739,7 +3739,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_WINDOW:
 				if (opsize == TCPOLEN_WINDOW && th->syn &&
-				    !estab && sysctl_tcp_window_scaling) {
+				    !estab && sysctl_tcp_window_scaling &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)) {
 					__u8 snd_wscale = *(__u8 *)ptr;
 					opt_rx->wscale_ok = 1;
 					if (snd_wscale > 14) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f30c18..ff60a21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -496,7 +496,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
-	if (likely(sysctl_tcp_window_scaling)) {
+	if (likely(sysctl_tcp_window_scaling &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE))) {
 		opts->ws = tp->rx_opt.rcv_wscale;
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
@@ -2347,7 +2348,8 @@ static void tcp_connect_init(struct sock *sk)
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
-				  sysctl_tcp_window_scaling,
+				  (sysctl_tcp_window_scaling &&
+				   !dst_feature(dst, RTAX_FEATURE_NO_WSCALE)),
 				  &rcv_wscale);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 4/7] Add the no SACK route option feature
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no sack bit in the features
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index adf2068..9c802a6 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -377,7 +377,7 @@ enum
 #define RTAX_MAX (__RTAX_MAX - 1)
 
 #define RTAX_FEATURE_ECN	0x00000001
-#define RTAX_FEATURE_SACK	0x00000002
+#define RTAX_FEATURE_NO_SACK	0x00000002
 #define RTAX_FEATURE_TIMESTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d502f49..b14f780 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3763,7 +3763,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				break;
 			case TCPOPT_SACK_PERM:
 				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
-				    !estab && sysctl_tcp_sack) {
+				    !estab && sysctl_tcp_sack &&
+				    !dst_feature(dst, RTAX_FEATURE_NO_SACK)) {
 					opt_rx->sack_ok = 1;
 					tcp_sack_reset(opt_rx);
 				}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..64db8dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -464,6 +464,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned size = 0;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -498,7 +499,8 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		opts->options |= OPTION_WSCALE;
 		size += TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(sysctl_tcp_sack)) {
+	if (likely(sysctl_tcp_sack &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_SACK))) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
 			size += TCPOLEN_SACKPERM_ALIGNED;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 7/7] Allow disabling of DSACK TCP option per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Add and use no DSCAK bit in the features field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    1 +
 net/ipv4/tcp_input.c      |    8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6784b34..e78b60c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -381,6 +381,7 @@ enum
 #define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 #define RTAX_FEATURE_NO_WSCALE	0x00000010
+#define RTAX_FEATURE_NO_DSACK	0x00000020
 
 struct rta_session
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f5e914..4262da5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4080,8 +4080,10 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
 static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+	    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 		int mib_idx;
 
 		if (before(seq, tp->rcv_nxt))
@@ -4110,13 +4112,15 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
 static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
 		tcp_enter_quickack_mode(sk);
 
-		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack &&
+		    !dst_feature(dst, RTAX_FEATURE_NO_DSACK)) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 1/7] Only parse time stamp TCP option in time wait sock
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef, Yony Amit
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Since we only use tcp_parse_options here to check for the exietence
of TCP timestamp option in the header, it is better to call with
the "established" flag on.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Signed-off-by: Ori Finkelman <ori@comsleep.com>
Signed-off-by: Yony Amit <yony@comsleep.com>

---
 net/ipv4/tcp_minisocks.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 624c3c9..c49a550 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -100,9 +100,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
-	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 1);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 5/7] Allow disabling TCP timestamp options per route
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Implement querying and acting upon the no timestamp bit in the feature 
field.

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/linux/rtnetlink.h |    2 +-
 net/ipv4/tcp_input.c      |    3 ++-
 net/ipv4/tcp_output.c     |    8 ++++++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c802a6..2ab8c75 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -378,7 +378,7 @@ enum
 
 #define RTAX_FEATURE_ECN	0x00000001
 #define RTAX_FEATURE_NO_SACK	0x00000002
-#define RTAX_FEATURE_TIMESTAMP	0x00000004
+#define RTAX_FEATURE_NO_TSTAMP	0x00000004
 #define RTAX_FEATURE_ALLFRAG	0x00000008
 
 struct rta_session
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b14f780..d2f9742 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3755,7 +3755,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			case TCPOPT_TIMESTAMP:
 				if ((opsize == TCPOLEN_TIMESTAMP) &&
 				    ((estab && opt_rx->tstamp_ok) ||
-				     (!estab && sysctl_tcp_timestamps))) {
+				     (!estab && sysctl_tcp_timestamps &&
+				      !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP)))) {
 					opt_rx->saw_tstamp = 1;
 					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
 					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 64db8dd..8f30c18 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -488,7 +488,9 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	size += TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+	if (likely(sysctl_tcp_timestamps &&
+		   !dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) &&
+		   *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -2317,7 +2319,9 @@ static void tcp_connect_init(struct sock *sk)
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
 	 */
 	tp->tcp_header_len = sizeof(struct tcphdr) +
-		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+		(sysctl_tcp_timestamps &&
+		(!dst_feature(dst, RTAX_FEATURE_NO_TSTAMP) ?
+		  TCPOLEN_TSTAMP_ALIGNED : 0));
 
 #ifdef CONFIG_TCP_MD5SIG
 	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 3/7] Add dst_feature to query route entry features
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

Adding an accessor to existing  dst_entry feautres field and
refactor the only supported feature (allfrag) to use it.


Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/dst.h |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 5a900dd..b562be3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -111,6 +111,12 @@ dst_metric(const struct dst_entry *dst, int metric)
 	return dst->metrics[metric-1];
 }
 
+static inline u32
+dst_feature(const struct dst_entry *dst, u32 feature)
+{
+	return dst_metric(dst, RTAX_FEATURES) & feature;
+}
+
 static inline u32 dst_mtu(const struct dst_entry *dst)
 {
 	u32 mtu = dst_metric(dst, RTAX_MTU);
@@ -136,7 +142,7 @@ static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric,
 static inline u32
 dst_allfrag(const struct dst_entry *dst)
 {
-	int ret = dst_metric(dst, RTAX_FEATURES) & RTAX_FEATURE_ALLFRAG;
+	int ret = dst_feature(dst,  RTAX_FEATURE_ALLFRAG);
 	/* Yes, _exactly_. This is paranoia. */
 	barrier();
 	return ret;
-- 
1.5.6.3


^ permalink raw reply related

* [PATCH v3 2/7] Allow tcp_parse_options to consult dst entry
From: Gilad Ben-Yossef @ 2009-10-26  8:06 UTC (permalink / raw)
  To: netdev; +Cc: ori, Gilad Ben-Yossef, Gilad Ben-Yossef
In-Reply-To: <1256544393-12450-1-git-send-email-gilad@codefidence.com>

From: Gilad Ben-Yossef <gby@watson.codefidence.com>

We need tcp_parse_options to be aware of dst_entry to
take into account per dst_entry TCP options settings

Signed-off-by: Gilad Ben-Yossef <gilad@codefidence.com>
Sigend-off-by: Ori Finkelman <ori@comsleep.com>
Sigend-off-by: Yony Amit <yony@comsleep.com>

---
 include/net/tcp.h        |    3 ++-
 net/ipv4/syncookies.c    |   27 ++++++++++++++-------------
 net/ipv4/tcp_input.c     |    9 ++++++---
 net/ipv4/tcp_ipv4.c      |   21 ++++++++++++---------
 net/ipv4/tcp_minisocks.c |    7 +++++--
 net/ipv6/syncookies.c    |   28 +++++++++++++++-------------
 net/ipv6/tcp_ipv6.c      |    3 ++-
 7 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 03a49c7..740d09b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -409,7 +409,8 @@ extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
-						  int estab);
+						  int estab,
+						  struct dst_entry *dst);
 
 extern u8			*tcp_parse_md5sig_option(struct tcphdr *th);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e07..4990dd4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -276,13 +276,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
@@ -298,12 +291,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -351,6 +338,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, &rt->u.dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
 	/* Try to redo what tcp_v4_send_synack did. */
 	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784b..d502f49 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3698,12 +3698,14 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       int estab)
+		       int estab,  struct dst_entry *dst)
 {
 	unsigned char *ptr;
 	struct tcphdr *th = tcp_hdr(skb);
 	int length = (th->doff * 4) - sizeof(struct tcphdr);
 
+	BUG_ON(!estab && !dst);
+
 	ptr = (unsigned char *)(th + 1);
 	opt_rx->saw_tstamp = 0;
 
@@ -3820,7 +3822,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return 1;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, 1);
+	tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
 	return 1;
 }
 
@@ -5364,8 +5366,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int saved_clamp = tp->rx_opt.mss_clamp;
+	struct dst_entry *dst = __sk_dst_get(sk);
 
-	tcp_parse_options(skb, &tp->rx_opt, 0);
+	tcp_parse_options(skb, &tp->rx_opt, 0, dst);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..1d611e3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1256,11 +1256,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
+	ireq->opt = tcp_v4_save_options(sk, skb);
+
+	dst = inet_csk_route_req(sk, req);
+	if(!dst)
+		goto drop_and_free;
+
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = 536;
 	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
@@ -1269,14 +1279,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	ireq = inet_rsk(req);
-	ireq->loc_addr = daddr;
-	ireq->rmt_addr = saddr;
-	ireq->no_srccheck = inet_sk(sk)->transparent;
-	ireq->opt = tcp_v4_save_options(sk, skb);
-
 	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
+		goto drop_and_release;
 
 	if (!want_cookie)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
@@ -1301,7 +1305,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 */
 		if (tmp_opt.saw_tstamp &&
 		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
 			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c49a550..70ff955 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -101,7 +101,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 	int paws_reject = 0;
 
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, 1);
+		tcp_parse_options(skb, &tmp_opt, 1, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -499,10 +499,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
 	struct sock *child;
+	struct dst_entry *dst = inet_csk_route_req(sk, req);
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, 0);
+		tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
@@ -515,6 +516,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	dst_release(dst);
+
 	/* Check for pure retransmitted SYN. */
 	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 	    flg == TCP_FLAG_SYN &&
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 6b6ae91..6ece408 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -184,13 +184,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
 
-	/* check for timestamp cookie support */
-	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, 0);
-
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
-
 	ret = NULL;
 	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
 	if (!req)
@@ -224,12 +217,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	req->expires = 0UL;
 	req->retrans = 0;
 	ireq->ecn_ok		= 0;
-	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
-	ireq->sack_ok		= tcp_opt.sack_ok;
-	ireq->wscale_ok		= tcp_opt.wscale_ok;
-	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
-	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
 
@@ -264,6 +251,21 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 			goto out_free;
 	}
 
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, 0, dst);
+
+	if (tcp_opt.saw_tstamp)
+		cookie_check_timestamp(&tcp_opt);
+
+	req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
+	ireq->snd_wscale        = tcp_opt.snd_wscale;
+	ireq->rcv_wscale        = tcp_opt.rcv_wscale;
+	ireq->sack_ok           = tcp_opt.sack_ok;
+	ireq->wscale_ok         = tcp_opt.wscale_ok;
+	ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+
 	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 21d100b..2eebab5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1165,6 +1165,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct request_sock *req = NULL;
 	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = __sk_dst_get(sk);
 #ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
 #else
@@ -1203,7 +1204,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
 
-	tcp_parse_options(skb, &tmp_opt, 0);
+	tcp_parse_options(skb, &tmp_opt, 0, dst);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
-- 
1.5.6.3


^ permalink raw reply related

* Re: VLAN and ARP failure on tg3 drivers
From: Benny Amorsen @ 2009-10-26  8:20 UTC (permalink / raw)
  To: Gertjan Hofman; +Cc: Matt Carlson, netdev@vger.kernel.org, Eric Dumazet
In-Reply-To: <477963.52849.qm@web32605.mail.mud.yahoo.com>

Gertjan Hofman <gertjan_hofman@yahoo.com> writes:

> Dear Matt, Eric, Benny,
>
> Sorry about the slow response to your fast replies. I think Benny is
> correct, the 'problem' lies in the fact that we were using a VLAN ID
> of 0, without knowing its special significance. User error.
>
> I tested it with other VLAN id's (>0) and it appears to work fine. We
> are not entirely sure we understand  why it used to work with VLAN ID
> 0 on the Broadcom chips and still does with a number of different
> cards (with >2.6.27 kernels).  What is the 'correct' behaviour for
> this incorrect usage ?

VLAN 0 isn't incorrect, it's just surprising. When you send a packet
tagged with VLAN 0, it means that the packet should be interpreted as
being the same VLAN as a completely untagged packet.

So in theory, if both ends are using VLAN 0 and you aren't using eth0
for anything, traffic should flow, at least if both ends are on the same
kernel version. Feel free to debug why that isn't the case for you, of
course...


/Benny


^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Tilman Schmidt @ 2009-10-26  8:47 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Jarek Poplawski, David Miller, hidave.darkstar, linux-kernel,
	tglx, linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256543932.28230.9.camel@johannes.local>

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Am 26.10.2009 09:58 schrieb Johannes Berg:
> On Mon, 2009-10-26 at 07:54 +0000, Jarek Poplawski wrote:
> 
>>> No, I wrote that I didn't know. I suppose now that I looked at it I do
>>> know, and only disabling preemption is required.
>> But netif_rx has preemption disabled most of the time (by hardirqs
>> disabling). So maybe disabling preemption isn't the main reason here
>> either?
> 
> Not for netpoll though, which may or may not be relevant (if I were to
> venture a guess I'd say it isn't and it disables preemption to be able
> to do the softirq thing)
> 
> However, I lost track now of why we're discussing this.

The starting point were several reports of the kernel message:

NOHZ: local_softirq_pending 08

Originally most if not all of them came from wireless networking,
but I muddied the waters by adding to the mix a case involving ISDN.
You stated that all the solutions proposed so far were wrong, so
we're naturally turning to you for guidance on what the right
solution might be.

> Basically it boils down to using netif_rx() when in (soft)irq, and
> netif_rx_ni() when in process context. That could just be an
> optimisation, but it's a very valid one.

Hmmm. That seems to contradict your earlier statement to me that
simply replacing a call to netif_rx() by one to netif_rx_ni()
when not in interrupt context isn't a valid solution either.
What am I missing?

Thanks,
Tilman

- --
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.4 (MingW32)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFK5WIXQ3+did9BuFsRAsj1AJ0e4VJ7Nsp69ROXCiT4oM/Q6lIpSwCfZvXS
4nV4tWTIzgk4IRlCt0CCF3Y=
=r15I
-----END PGP SIGNATURE-----

^ permalink raw reply

* Re: VLAN and ARP failure on tg3 drivers
From: Eric Dumazet @ 2009-10-26  8:54 UTC (permalink / raw)
  To: Benny Amorsen; +Cc: Gertjan Hofman, Matt Carlson, netdev@vger.kernel.org
In-Reply-To: <m34opmr2fz.fsf@ursa.amorsen.dk>

Benny Amorsen a écrit :
> Gertjan Hofman <gertjan_hofman@yahoo.com> writes:
> 
>> Dear Matt, Eric, Benny,
>>
>> Sorry about the slow response to your fast replies. I think Benny is
>> correct, the 'problem' lies in the fact that we were using a VLAN ID
>> of 0, without knowing its special significance. User error.
>>
>> I tested it with other VLAN id's (>0) and it appears to work fine. We
>> are not entirely sure we understand  why it used to work with VLAN ID
>> 0 on the Broadcom chips and still does with a number of different
>> cards (with >2.6.27 kernels).  What is the 'correct' behaviour for
>> this incorrect usage ?
> 
> VLAN 0 isn't incorrect, it's just surprising. When you send a packet
> tagged with VLAN 0, it means that the packet should be interpreted as
> being the same VLAN as a completely untagged packet.
> 
> So in theory, if both ends are using VLAN 0 and you aren't using eth0
> for anything, traffic should flow, at least if both ends are on the same
> kernel version. Feel free to debug why that isn't the case for you, of
> course...
> 

VLAN id 0 is not usable on current kernel because we use 16 bits in skb to
 store vlan_tci, and vlan_tci = 0 means there is no VLAN tagging.


We could use high order bit (0x8000) to tell if vlan tagging is set or not.

^ permalink raw reply

* Re: [PATCH] virtio-net: fix data corruption with OOM
From: Michael S. Tsirkin @ 2009-10-26  8:54 UTC (permalink / raw)
  To: Rusty Russell; +Cc: virtualization, kvm, netdev
In-Reply-To: <200910261211.52148.rusty@rustcorp.com.au>

On Mon, Oct 26, 2009 at 12:11:51PM +1030, Rusty Russell wrote:
> On Mon, 26 Oct 2009 03:33:40 am Michael S. Tsirkin wrote:
> > virtio net used to unlink skbs from send queues on error,
> > but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
> > we do not do this. This causes guest data corruption and crashes
> > with vhost since net core can requeue the skb or free it without
> > it being taken off the list.
> > 
> > This patch fixes this by queueing the skb after successfull
> > transmit.
> 
> I originally thought that this was racy: as soon as we do add_buf, we need to
> make sure we're ready for the callback (for virtio_pci, it's ->kick, but we
> shouldn't rely on that).
> 
> So a comment would be nice.  How's this?

Acked-by: Michael S. Tsirkin <mst@redhat.com>

> Subject: virtio-net: fix data corruption with OOM
> Date: Sun, 25 Oct 2009 19:03:40 +0200
> From: "Michael S. Tsirkin" <mst@redhat.com>
> 
> virtio net used to unlink skbs from send queues on error,
> but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
> we do not do this. This causes guest data corruption and crashes
> with vhost since net core can requeue the skb or free it without
> it being taken off the list.
> 
> This patch fixes this by queueing the skb after successful
> transmit.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ comment)
> ---
> 
> Rusty, here's a fix for another data corrupter I saw.
> This fixes a regression from 2.6.31, so definitely
> 2.6.32 I think. Comments?
> 
>  drivers/net/virtio_net.c |    8 +++++---
>  1 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -516,8 +516,7 @@ again:
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(vi);
>  
> -	/* Put new one in send queue and do transmit */
> -	__skb_queue_head(&vi->send, skb);
> +	/* Try to transmit */
>  	capacity = xmit_skb(vi, skb);
>  
>  	/* This can happen with OOM and indirect buffers. */
> @@ -531,8 +530,17 @@ again:
>  		}
>  		return NETDEV_TX_BUSY;
>  	}
> +	vi->svq->vq_ops->kick(vi->svq);
>  
> -	vi->svq->vq_ops->kick(vi->svq);
> +	/*
> +	 * Put new one in send queue.  You'd expect we'd need this before
> +	 * xmit_skb calls add_buf(), since the callback can be triggered
> +	 * immediately after that.  But since the callback just triggers
> +	 * another call back here, normal network xmit locking prevents the
> +	 * race.
> +	 */
> +	__skb_queue_head(&vi->send, skb);
> +
>  	/* Don't wait up for transmitted skbs to be freed. */
>  	skb_orphan(skb);
>  	nf_reset(skb);

^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Johannes Berg @ 2009-10-26  8:56 UTC (permalink / raw)
  To: Tilman Schmidt
  Cc: Jarek Poplawski, David Miller, hidave.darkstar, linux-kernel,
	tglx, linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <4AE56217.3040708@imap.cc>

[-- Attachment #1: Type: text/plain, Size: 1582 bytes --]

On Mon, 2009-10-26 at 10:47 +0200, Tilman Schmidt wrote:

> > However, I lost track now of why we're discussing this.
> 
> The starting point were several reports of the kernel message:
> 
> NOHZ: local_softirq_pending 08
> 
> Originally most if not all of them came from wireless networking,
> but I muddied the waters by adding to the mix a case involving ISDN.
> You stated that all the solutions proposed so far were wrong, so
> we're naturally turning to you for guidance on what the right
> solution might be.

Right. Sorry about getting lost here.

> > Basically it boils down to using netif_rx() when in (soft)irq, and
> > netif_rx_ni() when in process context. That could just be an
> > optimisation, but it's a very valid one.
> 
> Hmmm. That seems to contradict your earlier statement to me that
> simply replacing a call to netif_rx() by one to netif_rx_ni()
> when not in interrupt context isn't a valid solution either.
> What am I missing?

Well, I think you misunderstood me. It would be correct to do this, if
and only if the code that calls it doesn't need the extra guarantee.

Any code (say ISDN code) that calls netif_rx() is clearly assuming to
always be running in (soft)irq context, otherwise it couldn't call
netif_rx() unconditionally. Agree so far?

So now if you change the ISDN code to call netif_rx_ni(), you've changed
the assumption that the ISDN code makes -- that it is running in
(soft)irq context. Therefore, you need to verify that this is actually a
correct change, which is what I tried to say.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH 00/13] TProxy IPv6 support 2nd round
From: Balazs Scheidler @ 2009-10-26  9:00 UTC (permalink / raw)
  To: Harald Welte; +Cc: netfilter-devel, netdev
In-Reply-To: <20091025101612.GS29852@prithivi.gnumonks.org>

On Sun, 2009-10-25 at 11:16 +0100, Harald Welte wrote:
> Dear Balazs,
> 
> as you might have read from other mails (and by the long period of silence),
> Patrick McHardy is currently unavailable to perform his usual maintainer
> role.
> 
> I personally am too much out of touch with recent developments in
> netfitler-land to be able to confidently review your patches...  
> 
> So unless somebody else from the team (Jozsef?, Pablo?) feels confident in
> ACKing your patchset, I will have to ask for your patience until Patrick is
> back and can do it by himself.

Thanks for letting me know, hopefully Patrick gets better soon. I've
planned another round of the TProxy patches as I got some comments at
the previous round I'm yet to address.

So no need to hurry.

-- 
Bazsi


^ permalink raw reply

* Re: [PATCH] virtio-net: fix data corruption with OOM
From: Michael S. Tsirkin @ 2009-10-26  9:00 UTC (permalink / raw)
  To: Rusty Russell; +Cc: virtualization, kvm, netdev
In-Reply-To: <200910261211.52148.rusty@rustcorp.com.au>

On Mon, Oct 26, 2009 at 12:11:51PM +1030, Rusty Russell wrote:
> On Mon, 26 Oct 2009 03:33:40 am Michael S. Tsirkin wrote:
> > virtio net used to unlink skbs from send queues on error,
> > but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
> > we do not do this. This causes guest data corruption and crashes
> > with vhost since net core can requeue the skb or free it without
> > it being taken off the list.
> > 
> > This patch fixes this by queueing the skb after successfull
> > transmit.
> 
> I originally thought that this was racy: as soon as we do add_buf, we need to
> make sure we're ready for the callback (for virtio_pci, it's ->kick, but we
> shouldn't rely on that).

BTW, wanted to note that unlink on error would *also* be racy if we did any
processing in the callback.

> So a comment would be nice.  How's this?
> 
> Subject: virtio-net: fix data corruption with OOM
> Date: Sun, 25 Oct 2009 19:03:40 +0200
> From: "Michael S. Tsirkin" <mst@redhat.com>
> 
> virtio net used to unlink skbs from send queues on error,
> but ever since 48925e372f04f5e35fec6269127c62b2c71ab794
> we do not do this. This causes guest data corruption and crashes
> with vhost since net core can requeue the skb or free it without
> it being taken off the list.
> 
> This patch fixes this by queueing the skb after successful
> transmit.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ comment)
> ---
> 
> Rusty, here's a fix for another data corrupter I saw.
> This fixes a regression from 2.6.31, so definitely
> 2.6.32 I think. Comments?
> 
>  drivers/net/virtio_net.c |    8 +++++---
>  1 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -516,8 +516,7 @@ again:
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(vi);
>  
> -	/* Put new one in send queue and do transmit */
> -	__skb_queue_head(&vi->send, skb);
> +	/* Try to transmit */
>  	capacity = xmit_skb(vi, skb);
>  
>  	/* This can happen with OOM and indirect buffers. */
> @@ -531,8 +530,17 @@ again:
>  		}
>  		return NETDEV_TX_BUSY;
>  	}
> +	vi->svq->vq_ops->kick(vi->svq);
>  
> -	vi->svq->vq_ops->kick(vi->svq);
> +	/*
> +	 * Put new one in send queue.  You'd expect we'd need this before
> +	 * xmit_skb calls add_buf(), since the callback can be triggered
> +	 * immediately after that.  But since the callback just triggers
> +	 * another call back here, normal network xmit locking prevents the
> +	 * race.
> +	 */
> +	__skb_queue_head(&vi->send, skb);
> +
>  	/* Don't wait up for transmitted skbs to be freed. */
>  	skb_orphan(skb);
>  	nf_reset(skb);

^ permalink raw reply

* Re: [RFC] make per interface sysctl entries configurable
From: Cosmin Ratiu @ 2009-10-26  9:01 UTC (permalink / raw)
  To: Octavian Purdila
  Cc: Eric Dumazet, Benjamin LaHaise, Stephen Hemminger, netdev
In-Reply-To: <200910260021.48785.opurdila@ixiacom.com>

On Monday 26 October 2009 00:21:48 Octavian Purdila wrote:
> On Sunday 25 October 2009 23:37:19 you wrote:
> > Octavian Purdila a écrit :
> > > RFC patches are attached.
> > >
> > > Another possible approach: add an interface flag and use it to decide
> > > whether we want per interface sysctl entries or not.
> >
> > Hmm, could we speedup sysctl instead, adding rbtree or something ?
> 
> Very good point, I think this is the best solution for people using a
> moderately high number of interfaces (a few thousand).
> 
> But for really large setups there is another issue: memory consumption. In
> fact, in order to be able to scale to 128K interfaces and still have a
> significant amount of memory available to applications we also had to
>  disable sysfs and #ifdef CONFIG_SYSFS struct device from net_device.
> 
> I would also argue that when you have such a large number of interfaces you
> don't need to change setting on a per interface basis. Or at least this is
>  our case :)  and I suspect that the case with a large number of PPP
>  interfaces is similar.
> 

Another possible approach: shared settings for an interface group. If you have 
a large number of interfaces of the same type it would be nice if you could 
change some setting for the whole group instead of globally or individually. 

Is this approach feasible anyway? Or I'm talking rubbish.

Cosmin.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox