* Re: [PATCH v2] qlge: call ql_core_dump() only if dump memory was allocated.
From: David Miller @ 2013-09-24 15:20 UTC (permalink / raw)
To: malahal; +Cc: netdev
In-Reply-To: <1379712077-31750-1-git-send-email-malahal@us.ibm.com>
From: Malahal Naineni <malahal@us.ibm.com>
Date: Fri, 20 Sep 2013 16:21:17 -0500
> Also changed a log message to indicate that memory was not allocated
> instead of memory not available!
>
> Signed-off-by: Malahal Naineni <malahal@us.ibm.com>
Applied.
^ permalink raw reply
* [PATCH v2 net-next] net: introduce SO_MAX_PACING_RATE
From: Eric Dumazet @ 2013-09-24 15:20 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Steinar H. Gunderson, Michael Kerrisk
In-Reply-To: <1379949014.3165.24.camel@edumazet-glaptop>
From: Eric Dumazet <edumazet@google.com>
As mentioned in commit afe4fd062416b ("pkt_sched: fq: Fair Queue packet
scheduler"), this patch adds a new socket option.
SO_MAX_PACING_RATE offers the application the ability to cap the
rate computed by transport layer. Value is in bytes per second.
u32 val = 1000000;
setsockopt(sockfd, SOL_SOCKET, SO_MAX_PACING_RATE, &val, sizeof(val));
To be effectively paced, a flow must use FQ packet scheduler.
Note that a packet scheduler takes into account the headers for its
computations. The effective payload rate depends on MSS and retransmits
if any.
I chose to make this pacing rate a SOL_SOCKET option instead of a
TCP one because this can be used by other protocols.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steinar H. Gunderson <sesse@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
---
v2: cap sk->sk_pacing_rate in sock_setsockopt()
arch/alpha/include/uapi/asm/socket.h | 4 +++-
arch/avr32/include/uapi/asm/socket.h | 2 ++
arch/cris/include/uapi/asm/socket.h | 2 ++
arch/frv/include/uapi/asm/socket.h | 2 ++
arch/h8300/include/uapi/asm/socket.h | 2 ++
arch/ia64/include/uapi/asm/socket.h | 2 ++
arch/m32r/include/uapi/asm/socket.h | 2 ++
arch/mips/include/uapi/asm/socket.h | 2 ++
arch/mn10300/include/uapi/asm/socket.h | 2 ++
arch/parisc/include/uapi/asm/socket.h | 2 ++
arch/powerpc/include/uapi/asm/socket.h | 2 ++
arch/s390/include/uapi/asm/socket.h | 2 ++
arch/sparc/include/uapi/asm/socket.h | 2 ++
arch/xtensa/include/uapi/asm/socket.h | 2 ++
include/net/sock.h | 1 +
include/uapi/asm-generic/socket.h | 2 ++
net/core/sock.c | 12 ++++++++++++
net/ipv4/tcp_input.c | 2 +-
18 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 467de01..e3a1491 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -81,6 +81,8 @@
#define SO_SELECT_ERR_QUEUE 45
-#define SO_BUSY_POLL 46
+#define SO_BUSY_POLL 46
+
+#define SO_MAX_PACING_RATE 47
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
index 11c4259..4399364 100644
--- a/arch/avr32/include/uapi/asm/socket.h
+++ b/arch/avr32/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* __ASM_AVR32_SOCKET_H */
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
index eb723e5..13829aa 100644
--- a/arch/cris/include/uapi/asm/socket.h
+++ b/arch/cris/include/uapi/asm/socket.h
@@ -78,6 +78,8 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index f0cb1c3..5d42997 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -76,5 +76,7 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/h8300/include/uapi/asm/socket.h b/arch/h8300/include/uapi/asm/socket.h
index 9490758..214ccaf 100644
--- a/arch/h8300/include/uapi/asm/socket.h
+++ b/arch/h8300/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 556d070..c25302f 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -85,4 +85,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 24be7c8..5296665 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 61c01f0..0df9787 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -94,4 +94,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index e2a2b203..71dedca 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -76,4 +76,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 71700e6..7c614d0 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -75,6 +75,8 @@
#define SO_BUSY_POLL 0x4027
+#define SO_MAX_PACING_RATE 0x4048
+
/* O_NONBLOCK clashes with the bits used for socket types. Therefore we
* have to define SOCK_NONBLOCK to a different value here.
*/
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index a6d7446..fa69832 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -83,4 +83,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 9249449..c286c2e 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -82,4 +82,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 4e1d66c..0f21e9a 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -72,6 +72,8 @@
#define SO_BUSY_POLL 0x0030
+#define SO_MAX_PACING_RATE 0x0031
+
/* Security levels - as per NRL IPv6 - don't actually do anything */
#define SO_SECURITY_AUTHENTICATION 0x5001
#define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index c114483..7db5c22 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -87,4 +87,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* _XTENSA_SOCKET_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4625d2e..240aa3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -363,6 +363,7 @@ struct sock {
int sk_wmem_queued;
gfp_t sk_allocation;
u32 sk_pacing_rate; /* bytes per second */
+ u32 sk_max_pacing_rate;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
int sk_gso_type;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f04b69b..38f14d0 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -78,4 +78,6 @@
#define SO_BUSY_POLL 46
+#define SO_MAX_PACING_RATE 47
+
#endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 5b6beba..2bd9b3f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -914,6 +914,13 @@ set_rcvbuf:
}
break;
#endif
+
+ case SO_MAX_PACING_RATE:
+ sk->sk_max_pacing_rate = val;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+ sk->sk_max_pacing_rate);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1177,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
#endif
+ case SO_MAX_PACING_RATE:
+ v.val = sk->sk_max_pacing_rate;
+ break;
+
default:
return -ENOPROTOOPT;
}
@@ -2319,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_ll_usec = sysctl_net_busy_read;
#endif
+ sk->sk_max_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 25a89ea..75372c0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -713,7 +713,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
if (tp->srtt > 8 + 2)
do_div(rate, tp->srtt);
- sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+ sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
^ permalink raw reply related
* Re: [PATCH] net: net_secret should not depend on TCP
From: Eric Dumazet @ 2013-09-24 15:22 UTC (permalink / raw)
To: Hannes Frederic Sowa; +Cc: Tom Herbert, davem, netdev, jesse.brandeburg
In-Reply-To: <20130924151333.GA1527@order.stressinduktion.org>
On Tue, 2013-09-24 at 17:13 +0200, Hannes Frederic Sowa wrote:
> On Tue, Sep 24, 2013 at 06:19:57AM -0700, Eric Dumazet wrote:
> > -void net_secret_init(void)
> > +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
> > +
> > +static void net_secret_init(void)
> > {
> > - get_random_bytes(net_secret, sizeof(net_secret));
> > + u32 tmp;
> > + int i;
> > +
> > + if (likely(net_secret[0]))
> > + return;
> > +
> > + for (i = NET_SECRET_SIZE; i > 0;) {
> > + do {
> > + get_random_bytes(&tmp, sizeof(tmp));
> > + } while (!tmp);
>
> I am afraid we can block here on embedded systems in an atomic section? Is
> this actually an issue? It does get called in a spin_lock_h.
I do not see issues : get_random_bytes() is irq safe.
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Tom Herbert @ 2013-09-24 15:22 UTC (permalink / raw)
To: David Laight; +Cc: David Miller, Linux Netdev List, Brandeburg, Jesse
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B7355@saturn3.aculab.com>
On Tue, Sep 24, 2013 at 1:32 AM, David Laight <David.Laight@aculab.com> wrote:
>> +static inline unsigned int
>> +toeplitz_hash(const unsigned char *bytes,
>> + struct toeplitz *toeplitz, int n)
>> +{
>> + int i;
>> + unsigned int result = 0;
>> +
>> + for (i = 0; i < n; i++)
>> + result ^= toeplitz->key_cache[i][bytes[i]];
>> +
>> + return result;
>> +};
>
> That is a horrid hash function to be calculating in software.
>
> The code looks very much like a simple 32bit CRC.
> It isn't entirely clears exactly where the 'key' gets included,
> but I suspect it is just xored with the data bytes.
>
Please google Toeplitz hash function to learn about the algorithm.
http://msdn.microsoft.com/en-us/library/windows/hardware/ff570725(v=vs.85).aspx
> Using in it hardware is probably fine - the hardware can do
> it cheaply (in dedicated logic) as the frame arrives.
> The CRC polynomial probably collapses to a few XOR operations
> when done byte by byte (the hdlc crc16 collapses to 3 levels
> of xor).
>
> IIRC jhash() works on 32bit quantities - so has far fewer
> maths operations and well as not having all the random data
> accesses (cache misses and displacing other parts of the
> working set from the cache).
>
Yes, but pretty much every NIC vendor implements Toeplitz hash and
provides it in their receive descriptor. We use this value for
steering, and could use it for other uses like connection lookup.
> I also thought the hash was arranged so that tx and rx packets
> for a single connection hash to the same value?
>
ehashfn hashes consistently based on local and remote sides.
skb_get_rxhash orders the addresses and ports to make a consistent
hash in both directions. Presumably, this allows skb_get_rxhash to be
called from TX side to get same value of RX side, but when we get
rxhash from device (e.g. Toeplitz) this property is broken anyway.
Instead of jumping through this hoop, it might be better to have a
separate function from calculating RX hash for reverse path on TX.
Assuming skb_rx_hash does symmetric calculation is currently
incorrect. For instance, looks like tun.c is trying to implement a
sort of 'flow director' logic to pair TX queues and RX queues using
skb_get_rxhash an expecting that the value is calculated
symmetrically. If HW is providing RX hash, this is broken and we'll
never match the flows. We could either recompute the hash in SW or
try to match HW hash.
Tom
> David
>
>
>
^ permalink raw reply
* Re: [PATCH net v3 1/1] xen-netback: Handle backend state transitions in a more robust way
From: Wei Liu @ 2013-09-24 15:26 UTC (permalink / raw)
To: Paul Durrant; +Cc: xen-devel, netdev, Ian Campbell, Wei Liu, David Vrabel
In-Reply-To: <1380034282-11210-2-git-send-email-paul.durrant@citrix.com>
On Tue, Sep 24, 2013 at 03:51:22PM +0100, Paul Durrant wrote:
> When the frontend state changes metback now specifies its desired state to
netback
> a new function, set_backend_state(), which transitions through any
[...]
> +/* Handle backend state transitions:
> + *
> + * The backend state starts in InitWait and the following transtions are
transitions
> + * allowed.
>
[...]
> @@ -363,7 +448,9 @@ static void hotplug_status_changed(struct xenbus_watch *watch,
> if (IS_ERR(str))
> return;
> if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
> - xenbus_switch_state(be->dev, XenbusStateConnected);
> + /* Complete any pending state change */
> + xenbus_switch_state(be->dev, be->state);
> +
The state transition takes place iff hotplug status is "connected", is
this desirable? What if hotplug fails?
If it cycles through connect again it looks like it will trigger that
BUG_ON in connect()?
Wei.
^ permalink raw reply
* Re: [PATCH] net: net_secret should not depend on TCP
From: Hannes Frederic Sowa @ 2013-09-24 15:28 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Tom Herbert, davem, netdev, jesse.brandeburg
In-Reply-To: <1380036147.3165.72.camel@edumazet-glaptop>
On Tue, Sep 24, 2013 at 08:22:27AM -0700, Eric Dumazet wrote:
> On Tue, 2013-09-24 at 17:13 +0200, Hannes Frederic Sowa wrote:
> > On Tue, Sep 24, 2013 at 06:19:57AM -0700, Eric Dumazet wrote:
> > > -void net_secret_init(void)
> > > +static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
> > > +
> > > +static void net_secret_init(void)
> > > {
> > > - get_random_bytes(net_secret, sizeof(net_secret));
> > > + u32 tmp;
> > > + int i;
> > > +
> > > + if (likely(net_secret[0]))
> > > + return;
> > > +
> > > + for (i = NET_SECRET_SIZE; i > 0;) {
> > > + do {
> > > + get_random_bytes(&tmp, sizeof(tmp));
> > > + } while (!tmp);
> >
> > I am afraid we can block here on embedded systems in an atomic section? Is
> > this actually an issue? It does get called in a spin_lock_h.
>
> I do not see issues : get_random_bytes() is irq safe.
But couldn't it be that get_random_bytes always returns 0 and we won't make
any progress here. Does the reseed happen from irq context or from softirqs? I
always thought it would be from a softirq (which could be blocked).
Thanks,
Hannes
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Eric Dumazet @ 2013-09-24 15:29 UTC (permalink / raw)
To: Tom Herbert
Cc: David Laight, David Miller, Linux Netdev List, Brandeburg, Jesse
In-Reply-To: <CA+mtBx_y0hK69jcdY5e0MUzAa8hEPwExBDuK9Dn4Ceo5Hkn_iw@mail.gmail.com>
On Tue, 2013-09-24 at 08:22 -0700, Tom Herbert wrote:
> Assuming skb_rx_hash does symmetric calculation is currently
> incorrect. For instance, looks like tun.c is trying to implement a
> sort of 'flow director' logic to pair TX queues and RX queues using
> skb_get_rxhash an expecting that the value is calculated
> symmetrically. If HW is providing RX hash, this is broken and we'll
> never match the flows. We could either recompute the hash in SW or
> try to match HW hash.
Its not incorrect, its an implementation choice.
Its software in linux, we do not have to care of how its done in
hardware.
This is done to reduce conntracking cost, in case RPS is used on a
router : Same cpu will process frames in both ways.
But conntracking does not 'rely' on rxhash being symmetric, thats an
optimization to have better data locality.
^ permalink raw reply
* Re: [PATCH net-next] ipv6: do not allow ipv6 module to be removed
From: David Miller @ 2013-09-24 15:32 UTC (permalink / raw)
To: amwang; +Cc: netdev, yoshfuji, stephen
In-Reply-To: <1379733141-10643-1-git-send-email-amwang@redhat.com>
From: Cong Wang <amwang@redhat.com>
Date: Sat, 21 Sep 2013 11:12:21 +0800
> From: Cong Wang <amwang@redhat.com>
>
> There was some bug report on ipv6 module removal path before.
> Also, as Stephen pointed out, after vxlan module gets ipv6 support,
> the ipv6 stub it used is not safe against this module removal either.
> So, let's just remove inet6_exit() so that ipv6 module will not be
> able to be unloaded.
>
> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Cong Wang <amwang@redhat.com>
Fair enough, applied. We can always revert this later if we ever
make ipv6 properly unloadable.
^ permalink raw reply
* Re: question tehuti.c
From: Andy Gospodarek @ 2013-09-24 15:34 UTC (permalink / raw)
To: Julia Lawall; +Cc: netdev, linux-kernel, Alexander Indenbaum
In-Reply-To: <alpine.DEB.2.02.1309231752040.2242@hadrien>
On Mon, Sep 23, 2013 at 05:54:00PM +0200, Julia Lawall wrote:
> The function bdx_setmulti in the file drivers/net/ethernet/tehuti/tehuti.c
> contains:
>
> u32 rxf_val =
> GMAC_RX_FILTER_AM | GMAC_RX_FILTER_AB | GMAC_RX_FILTER_OSEN;
>
> and then later:
>
> } else {
> DBG("only own mac %d\n", netdev_mc_count(ndev));
> rxf_val |= GMAC_RX_FILTER_AB;
> }
>
> The last assignment doesn't look very useful, because GMAC_RX_FILTER_ABis
> already included in rxf_val. Was something else intended?
You are correct that the setting of GMAC_RX_FILTER_AB is a bit redundant
since it was set earlier in the function.
I have not worked on this driver in several years and if there is no
response from Alexander or someone else at Tehuti Networks then you may
not get an answer to your question. :-/
^ permalink raw reply
* RE: [PATCH net v3 1/1] xen-netback: Handle backend state transitions in a more robust way
From: Paul Durrant @ 2013-09-24 15:30 UTC (permalink / raw)
To: Wei Liu
Cc: xen-devel@lists.xen.org, netdev@vger.kernel.org, Ian Campbell,
Wei Liu, David Vrabel
In-Reply-To: <20130924152611.GA14211@zion.uk.xensource.com>
> -----Original Message-----
> From: Wei Liu [mailto:wei.liu2@citrix.com]
> Sent: 24 September 2013 16:26
> To: Paul Durrant
> Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Ian Campbell; Wei Liu;
> David Vrabel
> Subject: Re: [PATCH net v3 1/1] xen-netback: Handle backend state
> transitions in a more robust way
>
> On Tue, Sep 24, 2013 at 03:51:22PM +0100, Paul Durrant wrote:
> > When the frontend state changes metback now specifies its desired state
> to
> netback
> > a new function, set_backend_state(), which transitions through any
> [...]
> > +/* Handle backend state transitions:
> > + *
> > + * The backend state starts in InitWait and the following transtions are
> transitions
> > + * allowed.
> >
> [...]
> > @@ -363,7 +448,9 @@ static void hotplug_status_changed(struct
> xenbus_watch *watch,
> > if (IS_ERR(str))
> > return;
> > if (len == sizeof("connected")-1 && !memcmp(str, "connected", len))
> {
> > - xenbus_switch_state(be->dev, XenbusStateConnected);
> > + /* Complete any pending state change */
> > + xenbus_switch_state(be->dev, be->state);
> > +
>
> The state transition takes place iff hotplug status is "connected", is
> this desirable? What if hotplug fails?
>
The xenbus state will remain in InitWait but the backend state will be Connected.
> If it cycles through connect again it looks like it will trigger that
> BUG_ON in connect()?
>
No, I don't think so because be->state is not the same as dev->state. The frontend can go to Closing/Closed (which will take dev->state to Closing/Closed) and this should be fine.
Paul
^ permalink raw reply
* RE: [PATCH net v3 1/1] xen-netback: Handle backend state transitions in a more robust way
From: Paul Durrant @ 2013-09-24 15:33 UTC (permalink / raw)
To: Wei Liu
Cc: xen-devel@lists.xen.org, netdev@vger.kernel.org, Ian Campbell,
Wei Liu, David Vrabel
In-Reply-To: <20130924152611.GA14211@zion.uk.xensource.com>
> -----Original Message-----
> From: Paul Durrant
> Sent: 24 September 2013 16:31
> To: Wei Liu
> Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Ian Campbell; Wei Liu;
> David Vrabel
> Subject: RE: [PATCH net v3 1/1] xen-netback: Handle backend state
> transitions in a more robust way
>
> > -----Original Message-----
> > From: Wei Liu [mailto:wei.liu2@citrix.com]
> > Sent: 24 September 2013 16:26
> > To: Paul Durrant
> > Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Ian Campbell; Wei
> Liu;
> > David Vrabel
> > Subject: Re: [PATCH net v3 1/1] xen-netback: Handle backend state
> > transitions in a more robust way
> >
> > On Tue, Sep 24, 2013 at 03:51:22PM +0100, Paul Durrant wrote:
> > > When the frontend state changes metback now specifies its desired state
> > to
> > netback
> > > a new function, set_backend_state(), which transitions through any
> > [...]
> > > +/* Handle backend state transitions:
> > > + *
> > > + * The backend state starts in InitWait and the following transtions are
> > transitions
> > > + * allowed.
> > >
> > [...]
> > > @@ -363,7 +448,9 @@ static void hotplug_status_changed(struct
> > xenbus_watch *watch,
> > > if (IS_ERR(str))
> > > return;
> > > if (len == sizeof("connected")-1 && !memcmp(str, "connected", len))
> > {
> > > - xenbus_switch_state(be->dev, XenbusStateConnected);
> > > + /* Complete any pending state change */
> > > + xenbus_switch_state(be->dev, be->state);
> > > +
> >
> > The state transition takes place iff hotplug status is "connected", is
> > this desirable? What if hotplug fails?
> >
>
> The xenbus state will remain in InitWait but the backend state will be
> Connected.
>
> > If it cycles through connect again it looks like it will trigger that
> > BUG_ON in connect()?
> >
>
> No, I don't think so because be->state is not the same as dev->state. The
> frontend can go to Closing/Closed (which will take dev->state to
> Closing/Closed) and this should be fine.
>
Sorry, I misread. You're right. I'll fix.
Paul
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: David Miller @ 2013-09-24 15:39 UTC (permalink / raw)
To: therbert; +Cc: David.Laight, netdev, jesse.brandeburg
In-Reply-To: <CA+mtBx_y0hK69jcdY5e0MUzAa8hEPwExBDuK9Dn4Ceo5Hkn_iw@mail.gmail.com>
From: Tom Herbert <therbert@google.com>
Date: Tue, 24 Sep 2013 08:22:55 -0700
> We use this value for steering, and could use it for other uses like
> connection lookup.
For security reasons we absolutely cannot use it for that purpose,
please stop claiming this.
Any hash function which an attacker can reproduce is attackable.
^ permalink raw reply
* Re: [PATCH] ipv6: udp packets following an UFO enqueued packet need also be handled by UFO
From: David Miller @ 2013-09-24 15:43 UTC (permalink / raw)
To: hannes; +Cc: netdev, yoshfuji
In-Reply-To: <20130921042700.GB8070@order.stressinduktion.org>
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 21 Sep 2013 06:27:00 +0200
> In the following scenario the socket is corked:
> If the first UDP packet is larger then the mtu we try to append it to the
> write queue via ip6_ufo_append_data. A following packet, which is smaller
> than the mtu would be appended to the already queued up gso-skb via
> plain ip6_append_data. This causes random memory corruptions.
>
> In ip6_ufo_append_data we also have to be careful to not queue up the
> same skb multiple times. So setup the gso frame only when no first skb
> is available.
>
> This also fixes a shortcoming where we add the current packet's length to
> cork->length but return early because of a packet > mtu with dontfrag set
> (instead of sutracting it again).
>
> Found with trinity.
>
> Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Applied and queued up for -stable, thanks.
^ permalink raw reply
* [PATCH net v4 0/1] xen-netback: windows frontend compatibility fixes
From: Paul Durrant @ 2013-09-24 15:44 UTC (permalink / raw)
To: xen-devel, netdev
The following patches fix a couple more issues found when testing with
Windows frontends.
v4:
- Fix problem with hotplug failure noticed by Wei Liu
v3:
- Collapse both v2 patches into a single patch that introduces a new
function to handle backend state transtions. By doing this we ensure that
we always transition through intermediate states and that we don't attempt
repeated connects or disconnects.
v2:
- Add comment in 2/2 to note that state transitions from Connected to Closed
are incorrect.
^ permalink raw reply
* [PATCH net v4 1/1] xen-netback: Handle backend state transitions in a more robust way
From: Paul Durrant @ 2013-09-24 15:45 UTC (permalink / raw)
To: xen-devel, netdev; +Cc: Paul Durrant, Ian Campbell, Wei Liu, David Vrabel
In-Reply-To: <1380037500-12174-1-git-send-email-paul.durrant@citrix.com>
When the frontend state changes metback now specifies its desired state to
a new function, set_backend_state(), which transitions through any
necessary intermediate states.
This fixes an issue observed with some old Windows frontend drivers where
they failed to transition through the Closing state and netback would not
behave correctly.
Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: David Vrabel <david.vrabel@citrix.com>
---
drivers/net/xen-netback/xenbus.c | 143 ++++++++++++++++++++++++++++++--------
1 file changed, 113 insertions(+), 30 deletions(-)
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index a53782e..f311c38 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -24,6 +24,7 @@
struct backend_info {
struct xenbus_device *dev;
struct xenvif *vif;
+ enum xenbus_state state;
enum xenbus_state frontend_state;
struct xenbus_watch hotplug_status_watch;
u8 have_hotplug_status_watch:1;
@@ -136,6 +137,8 @@ static int netback_probe(struct xenbus_device *dev,
if (err)
goto fail;
+ be->state = XenbusStateInitWait;
+
/* This kicks hotplug scripts, so do it immediately. */
backend_create_xenvif(be);
@@ -208,24 +211,113 @@ static void backend_create_xenvif(struct backend_info *be)
kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
}
-
-static void disconnect_backend(struct xenbus_device *dev)
+static void backend_disconnect(struct backend_info *be)
{
- struct backend_info *be = dev_get_drvdata(&dev->dev);
-
if (be->vif)
xenvif_disconnect(be->vif);
}
-static void destroy_backend(struct xenbus_device *dev)
+static void backend_connect(struct backend_info *be)
{
- struct backend_info *be = dev_get_drvdata(&dev->dev);
+ if (be->vif)
+ connect(be);
+}
- if (be->vif) {
- kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
- xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
- xenvif_free(be->vif);
- be->vif = NULL;
+static inline void backend_switch_state(struct backend_info *be,
+ enum xenbus_state state)
+{
+ struct xenbus_device *dev = be->dev;
+
+ pr_debug("%s -> %s\n", dev->nodename, xenbus_strstate(state));
+ be->state = state;
+
+ /* If we are waiting for a hotplug script then defer the
+ * actual xenbus state change.
+ */
+ if (!be->have_hotplug_status_watch)
+ xenbus_switch_state(dev, state);
+}
+
+/* Handle backend state transitions:
+ *
+ * The backend state starts in InitWait and the following transtions are
+ * allowed.
+ *
+ * InitWait -> Connected
+ *
+ * ^ \ |
+ * | \ |
+ * | \ |
+ * | \ |
+ * | \ |
+ * | \ |
+ * | V V
+ *
+ * Closed <-> Closing
+ *
+ * The state argument specifies the eventual state of the backend and the
+ * function transitions to that state via the shortest path.
+ */
+static void set_backend_state(struct backend_info *be,
+ enum xenbus_state state)
+{
+ while (be->state != state) {
+ switch (be->state) {
+ case XenbusStateClosed:
+ switch (state) {
+ case XenbusStateInitWait:
+ case XenbusStateConnected:
+ pr_info("%s: prepare for reconnect\n",
+ be->dev->nodename);
+ backend_switch_state(be, XenbusStateInitWait);
+ break;
+ case XenbusStateClosing:
+ backend_switch_state(be, XenbusStateClosing);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case XenbusStateInitWait:
+ switch (state) {
+ case XenbusStateConnected:
+ backend_connect(be);
+ backend_switch_state(be, XenbusStateConnected);
+ break;
+ case XenbusStateClosing:
+ case XenbusStateClosed:
+ backend_switch_state(be, XenbusStateClosing);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case XenbusStateConnected:
+ switch (state) {
+ case XenbusStateInitWait:
+ case XenbusStateClosing:
+ case XenbusStateClosed:
+ backend_disconnect(be);
+ backend_switch_state(be, XenbusStateClosing);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ case XenbusStateClosing:
+ switch (state) {
+ case XenbusStateInitWait:
+ case XenbusStateConnected:
+ case XenbusStateClosed:
+ backend_switch_state(be, XenbusStateClosed);
+ break;
+ default:
+ BUG();
+ }
+ break;
+ default:
+ BUG();
+ }
}
}
@@ -237,40 +329,33 @@ static void frontend_changed(struct xenbus_device *dev,
{
struct backend_info *be = dev_get_drvdata(&dev->dev);
- pr_debug("frontend state %s\n", xenbus_strstate(frontend_state));
+ pr_debug("%s -> %s\n", dev->otherend, xenbus_strstate(frontend_state));
be->frontend_state = frontend_state;
switch (frontend_state) {
case XenbusStateInitialising:
- if (dev->state == XenbusStateClosed) {
- pr_info("%s: prepare for reconnect\n", dev->nodename);
- xenbus_switch_state(dev, XenbusStateInitWait);
- }
+ set_backend_state(be, XenbusStateInitWait);
break;
case XenbusStateInitialised:
break;
case XenbusStateConnected:
- if (dev->state == XenbusStateConnected)
- break;
- if (be->vif)
- connect(be);
+ set_backend_state(be, XenbusStateConnected);
break;
case XenbusStateClosing:
- disconnect_backend(dev);
- xenbus_switch_state(dev, XenbusStateClosing);
+ set_backend_state(be, XenbusStateClosing);
break;
case XenbusStateClosed:
- xenbus_switch_state(dev, XenbusStateClosed);
+ set_backend_state(be, XenbusStateClosed);
if (xenbus_dev_is_online(dev))
break;
- destroy_backend(dev);
/* fall through if not online */
case XenbusStateUnknown:
+ set_backend_state(be, XenbusStateClosed);
device_unregister(&dev->dev);
break;
@@ -363,7 +448,9 @@ static void hotplug_status_changed(struct xenbus_watch *watch,
if (IS_ERR(str))
return;
if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
- xenbus_switch_state(be->dev, XenbusStateConnected);
+ /* Complete any pending state change */
+ xenbus_switch_state(be->dev, be->state);
+
/* Not interested in this watch anymore. */
unregister_hotplug_status_watch(be);
}
@@ -393,12 +480,8 @@ static void connect(struct backend_info *be)
err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
hotplug_status_changed,
"%s/%s", dev->nodename, "hotplug-status");
- if (err) {
- /* Switch now, since we can't do a watch. */
- xenbus_switch_state(dev, XenbusStateConnected);
- } else {
+ if (!err)
be->have_hotplug_status_watch = 1;
- }
netif_wake_queue(be->vif->dev);
}
--
1.7.10.4
^ permalink raw reply related
* Re: [PATCH] net: net_secret should not depend on TCP
From: Eric Dumazet @ 2013-09-24 15:46 UTC (permalink / raw)
To: Hannes Frederic Sowa; +Cc: Tom Herbert, davem, netdev, jesse.brandeburg
In-Reply-To: <20130924152858.GB1527@order.stressinduktion.org>
On Tue, 2013-09-24 at 17:28 +0200, Hannes Frederic Sowa wrote:
> But couldn't it be that get_random_bytes always returns 0 and we won't make
> any progress here. Does the reseed happen from irq context or from softirqs? I
> always thought it would be from a softirq (which could be blocked).
Really if get_random_bytes() could return 0 forever I think we would
have a big problem with its implementation and documentation :
/*
* This function is the exported kernel interface. It returns some
* number of good random numbers, suitable for key generation, seeding
* TCP sequence numbers, etc. It does not use the hw random number
* generator, if available; use get_random_bytes_arch() for that.
*/
void get_random_bytes(void *buf, int nbytes)
{
extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
}
EXPORT_SYMBOL(get_random_bytes);
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Tom Herbert @ 2013-09-24 15:54 UTC (permalink / raw)
To: David Miller; +Cc: David Laight, Linux Netdev List, Brandeburg, Jesse
In-Reply-To: <20130924.113953.1275344954032811572.davem@redhat.com>
On Tue, Sep 24, 2013 at 8:39 AM, David Miller <davem@redhat.com> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Tue, 24 Sep 2013 08:22:55 -0700
>
>> We use this value for steering, and could use it for other uses like
>> connection lookup.
>
> For security reasons we absolutely cannot use it for that purpose,
> please stop claiming this.
>
> Any hash function which an attacker can reproduce is attackable.
The Toeplitz function uses a secret key whose length is based on the
input length. 96 bits in IPv4, 320 bits in IPv6. I don't see how an
attacker can reproduce this if the key is random. If the problem is
that devices are not being configured with a sufficiently random key
(some actually are using a fixed key :-( ), that's a separate issue
that should be addressed. It is possible to DoS attack through the
steering mechanism.
Tom
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Hannes Frederic Sowa @ 2013-09-24 16:00 UTC (permalink / raw)
To: Tom Herbert
Cc: David Miller, David Laight, Linux Netdev List, Brandeburg, Jesse
In-Reply-To: <CA+mtBx_AcmcNBDhRKVB8YDkSNdOwgJ3CXxBqpgxDw23X3m2rvQ@mail.gmail.com>
On Tue, Sep 24, 2013 at 08:54:24AM -0700, Tom Herbert wrote:
> On Tue, Sep 24, 2013 at 8:39 AM, David Miller <davem@redhat.com> wrote:
> > From: Tom Herbert <therbert@google.com>
> > Date: Tue, 24 Sep 2013 08:22:55 -0700
> >
> >> We use this value for steering, and could use it for other uses like
> >> connection lookup.
> >
> > For security reasons we absolutely cannot use it for that purpose,
> > please stop claiming this.
> >
> > Any hash function which an attacker can reproduce is attackable.
>
> The Toeplitz function uses a secret key whose length is based on the
> input length. 96 bits in IPv4, 320 bits in IPv6. I don't see how an
> attacker can reproduce this if the key is random. If the problem is
> that devices are not being configured with a sufficiently random key
> (some actually are using a fixed key :-( ), that's a separate issue
> that should be addressed. It is possible to DoS attack through the
> steering mechanism.
I agree, my first comment on the second patch was wrong. I did not assume that
the hashing function does seed itself. We also do not rehash the connection
tables. So if Eric's comments would be addressed its use could be fine.
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Hannes Frederic Sowa @ 2013-09-24 16:01 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Tom Herbert, davem, netdev, jesse.brandeburg
In-Reply-To: <1380001118.3165.41.camel@edumazet-glaptop>
On Mon, Sep 23, 2013 at 10:38:38PM -0700, Eric Dumazet wrote:
> On Tue, 2013-09-24 at 05:35 +0200, Hannes Frederic Sowa wrote:
>
> > > build_ehash_secret builds up the data which seeds fragmentation ids, ephermal
> > > port randomization etc. Could we drop the check of sock->type? I guess the
> > > idea was that in-kernel sockets of type raw/udp do not seed the keys when no
> > > entropy is available?
> >
> > Would this be better (I checked inet_ehash_secret, ipv6_hash_secret
> > and net_secret to actual get initialized)?
> >
>
> inet_ehash_secret is used only to make jhash() for tcp ehash, not for
> fragmentation ids or other uses (port randomization).
inet(6)_ehashfn does get used by udp btw. But I do not understand the code,
yet. ;)
^ permalink raw reply
* [PATCH net v2] vxlan: Use RCU apis to access sk_user_data.
From: Pravin B Shelar @ 2013-09-24 16:09 UTC (permalink / raw)
To: netdev; +Cc: eric.dumazet, Pravin B Shelar, Jesse Gross
Use of RCU api makes vxlan code easier to understand. It also
fixes bug due to missing ACCESS_ONCE() on sk_user_data dereference.
In rare case without ACCESS_ONCE() compiler might omit vs on
sk_user_data dereference. Compiler can use vs as alias for
sk->sk_user_data, resulting in multiple sk_user_data dereference
in rcu read context which could change.
CC: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
v2:
Use RCU_INIT_POINTER
---
drivers/net/vxlan.c | 11 ++++-------
include/net/sock.h | 2 ++
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d1292fe..1fae0d6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1,4 +1,4 @@
-/*
+ /*
* VXLAN: Virtual eXtensible Local Area Network
*
* Copyright (c) 2012-2013 Vyatta Inc.
@@ -952,8 +952,7 @@ void vxlan_sock_release(struct vxlan_sock *vs)
spin_lock(&vn->sock_lock);
hlist_del_rcu(&vs->hlist);
- smp_wmb();
- vs->sock->sk->sk_user_data = NULL;
+ RCU_INIT_POINTER(sk_user_data_rcu(vs->sock->sk), NULL);
vxlan_notify_del_rx_port(sk);
spin_unlock(&vn->sock_lock);
@@ -1048,8 +1047,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
port = inet_sk(sk)->inet_sport;
- smp_read_barrier_depends();
- vs = (struct vxlan_sock *)sk->sk_user_data;
+ vs = rcu_dereference(sk_user_data_rcu(sk));
if (!vs)
goto drop;
@@ -2302,8 +2300,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
atomic_set(&vs->refcnt, 1);
vs->rcv = rcv;
vs->data = data;
- smp_wmb();
- vs->sock->sk->sk_user_data = vs;
+ rcu_assign_pointer(sk_user_data_rcu(vs->sock->sk), vs);
spin_lock(&vn->sock_lock);
hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
diff --git a/include/net/sock.h b/include/net/sock.h
index 6ba2e7b..ffd1356 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -409,6 +409,8 @@ struct sock {
void (*sk_destruct)(struct sock *sk);
};
+#define sk_user_data_rcu(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
+
/*
* SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
* or not whether his port will be reused by someone else. SK_FORCE_REUSE
--
1.7.1
^ permalink raw reply related
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Eric Dumazet @ 2013-09-24 16:10 UTC (permalink / raw)
To: Tom Herbert
Cc: David Miller, David Laight, Linux Netdev List, Brandeburg, Jesse
In-Reply-To: <CA+mtBx_AcmcNBDhRKVB8YDkSNdOwgJ3CXxBqpgxDw23X3m2rvQ@mail.gmail.com>
On Tue, 2013-09-24 at 08:54 -0700, Tom Herbert wrote:
> The Toeplitz function uses a secret key whose length is based on the
> input length. 96 bits in IPv4, 320 bits in IPv6. I don't see how an
> attacker can reproduce this if the key is random. If the problem is
> that devices are not being configured with a sufficiently random key
> (some actually are using a fixed key :-( ), that's a separate issue
> that should be addressed. It is possible to DoS attack through the
> steering mechanism.
Well, your patch would make sense [1] only if we could use hardware
assistance, but right now we have no idea of how safe are the existing
assistances.
[1] Computing Toeplitz in software is way more expensive than jhash.
Dos attack is quite simple right now, even without knowing if the target
uses or not steering.
^ permalink raw reply
* Re: [PATCH net v4 1/1] xen-netback: Handle backend state transitions in a more robust way
From: Wei Liu @ 2013-09-24 16:12 UTC (permalink / raw)
To: Paul Durrant; +Cc: xen-devel, netdev, Ian Campbell, Wei Liu, David Vrabel
In-Reply-To: <1380037500-12174-2-git-send-email-paul.durrant@citrix.com>
You forgot to fix those typos -- metback, transtions. :-)
On Tue, Sep 24, 2013 at 04:45:00PM +0100, Paul Durrant wrote:
> When the frontend state changes metback now specifies its desired state to
metback
[...]
> + * The backend state starts in InitWait and the following transtions are
transtions
Wei.
^ permalink raw reply
* Re: [PATCH 1/2] net: Toeplitz library functions
From: Eric Dumazet @ 2013-09-24 16:14 UTC (permalink / raw)
To: Hannes Frederic Sowa; +Cc: Tom Herbert, davem, netdev, jesse.brandeburg
In-Reply-To: <20130924160145.GB26769@order.stressinduktion.org>
On Tue, 2013-09-24 at 18:01 +0200, Hannes Frederic Sowa wrote:
> inet(6)_ehashfn does get used by udp btw. But I do not understand the code,
> yet. ;)
Oh well, thats the SO_REUSEPORT thing. We do not really care, thats only
used to try to steer udp flows on a given socket, but its a hint.
Presumably it should use a separate secret key.
^ permalink raw reply
* Re: [PATCH net v2] vxlan: Use RCU apis to access sk_user_data.
From: Eric Dumazet @ 2013-09-24 16:16 UTC (permalink / raw)
To: Pravin B Shelar; +Cc: netdev, Jesse Gross
In-Reply-To: <1380038979-24035-1-git-send-email-pshelar@nicira.com>
On Tue, 2013-09-24 at 09:09 -0700, Pravin B Shelar wrote:
> Use of RCU api makes vxlan code easier to understand. It also
> fixes bug due to missing ACCESS_ONCE() on sk_user_data dereference.
> In rare case without ACCESS_ONCE() compiler might omit vs on
> sk_user_data dereference. Compiler can use vs as alias for
> sk->sk_user_data, resulting in multiple sk_user_data dereference
> in rcu read context which could change.
>
> CC: Jesse Gross <jesse@nicira.com>
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
> ---
> v2:
> Use RCU_INIT_POINTER
> ---
> drivers/net/vxlan.c | 11 ++++-------
> include/net/sock.h | 2 ++
> 2 files changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index d1292fe..1fae0d6 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -1,4 +1,4 @@
> -/*
> + /*
> * VXLAN: Virtual eXtensible Local Area Network
> *
extra space ?
^ permalink raw reply
* [PATCH 1/2] net: Add function to get SW rxhash
From: Tom Herbert @ 2013-09-24 16:20 UTC (permalink / raw)
To: davem; +Cc: netdev
Some uses of skb_get_rxhash expect that the function will return
a consistent value whether it is called on RX or TX paths. On RX
path, we will use the rxhash if provided by the NIC, so this
would not normally be the same result computed in TX path which is
a software calculation.
This patch adds skb_get_sw_rxhash to explicitly request a hash
calculated by the stack, disregarding the hash provided by NIC.
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/linux/skbuff.h | 11 ++++++++++-
net/core/flow_dissector.c | 1 +
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ddb48d..917a590 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -488,6 +488,7 @@ struct sk_buff {
__u8 pfmemalloc:1;
__u8 ooo_okay:1;
__u8 l4_rxhash:1;
+ __u8 sw_rxhash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
@@ -498,7 +499,7 @@ struct sk_buff {
* headers if needed
*/
__u8 encapsulation:1;
- /* 7/9 bit hole (depending on ndisc_nodetype presence) */
+ /* 6/8 bit hole (depending on ndisc_nodetype presence) */
kmemcheck_bitfield_end(flags2);
#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL
@@ -726,6 +727,14 @@ static inline __u32 skb_get_rxhash(struct sk_buff *skb)
return skb->rxhash;
}
+static inline __u32 skb_get_sw_rxhash(struct sk_buff *skb)
+{
+ if (!skb->l4_rxhash && !skb->sw_rxhash)
+ __skb_get_rxhash(skb);
+
+ return skb->rxhash;
+}
+
#ifdef NET_SKBUFF_DATA_USES_OFFSET
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1929af8..8979121 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -200,6 +200,7 @@ void __skb_get_rxhash(struct sk_buff *skb)
hash = 1;
skb->rxhash = hash;
+ skb->sw_rxhash = 1;
}
EXPORT_SYMBOL(__skb_get_rxhash);
--
1.8.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox