Netdev List
 help / color / mirror / Atom feed
* RE: [PATCH RFC] c_can_pci: generic module for c_can on PCI
From: AnilKumar, Chimata @ 2012-06-05 15:12 UTC (permalink / raw)
  To: Alessandro Rubini, mkl@pengutronix.de
  Cc: bhupesh.sharma@st.com, federico.vaga@gmail.com,
	alan@lxorguk.ukuu.org.uk, wg@grandegger.com,
	giancarlo.asnaghi@st.com, alan@linux.intel.com,
	linux-can@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20120605133013.GA16108@mail.gnudd.com>

Hi Alessandro/Federico,

On Tue, Jun 05, 2012 at 19:00:13, Alessandro Rubini wrote:
> > I personally like the "pci device sets up a platform device" idea.
> 
> Good. Than me or federico will submit a proposal. 
>  

I am late to the discussion, is there any specific reason to maintain a
separate platform file (c_can_pci.c). I think 90% of the code is copied
from c_can_paltform.c, code changes will be less if you merge to existing
c_can platform driver. You can look at D_CAN integration to C_CAN driver.

[1] https://gitorious.org/linux-can/linux-can-next

Regards
AnilKumar

^ permalink raw reply

* Re: Reoccuring kern.log events after running xl2tp with ethernet adapter Realtek 8111E
From: Francois Romieu @ 2012-06-05 15:37 UTC (permalink / raw)
  To: Dustin Schumm; +Cc: netdev
In-Reply-To: <CANcVnzivnQrhSderZ+b5yoy+F1OiO7Ajv7WeijqzQKyAkYRqug@mail.gmail.com>

Dustin Schumm <shodid@gmail.com> :
[...]
> Forgive me for sending this in plain test as I got a reject from
> netdev for html.

Please:
- no full quote
- no top answer
- no html

Thanks.

[...]
> I've also tested with Fedora 17 using kernel 3.3.7
> http://codepad.org/lxkcRLqp
> This paste is from Fedora. Like Ubuntu, it starts with "general
> protection fault: 0000 [#1] SMP" but does not give the other looping
> messages after as in Ubuntu.

This is with an usual 1500 bytes MTU and without TSO (see 'ethtool -k ethX'),
right ?

-- 
Ueimor

^ permalink raw reply

* [PATCH] Allow receiving packets on the fallback tunnel if they pass sanity checks
From: Phil Dibowitz @ 2012-06-05 15:40 UTC (permalink / raw)
  To: davem; +Cc: netdev, phild

Ville Nuorvala wrote this patch to solve a problem we were having at Facebook.
He hasn't had time to send it upstream yet, but gave me permission to do it,
and I want to make sure it makes it upstream by World v6 Launch Day.

>From b413062771afbba064ae9bc49b5daed7abb1243d Mon Sep 17 00:00:00 2001
From: Ville Nuorvala <ville.nuorvala@gmail.com>
Subject: [PATCH] Allow receiving packets on the fallback tunnel if they pass sanity checks

The same IPv6 address checks are performed as with any normal tunnel,
but as the fallback tunnel endpoint addresses are unspecified, the checks
must be performed on a per-packet basis, rather than at tunnel
configuration time.

Signed-off-by: Ville Nuorvala <ville.nuorvala@gmail.com>
Tested-by: Phil Dibowitz <phil@ipom.com>

---
 include/net/ip6_tunnel.h |    2 +
 net/ipv6/ip6_tunnel.c    |   65 +++++++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index fc73e66..358fb86 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -9,6 +9,8 @@
 #define IP6_TNL_F_CAP_XMIT 0x10000
 /* capable of receiving packets */
 #define IP6_TNL_F_CAP_RCV 0x20000
+/* determine capability on a per-packet basis */
+#define IP6_TNL_F_CAP_PER_PACKET 0x40000
 
 /* IPv6 tunnel */
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index c9015fa..04a3cba 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -684,24 +684,50 @@ static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
 		IP6_ECN_set_ce(ipv6_hdr(skb));
 }
 
+static __u32 ip6_tnl_get_cap(struct ip6_tnl *t,
+			     const struct in6_addr *laddr,
+			     const struct in6_addr *raddr)
+{
+	struct ip6_tnl_parm *p = &t->parms;
+	int ltype = ipv6_addr_type(laddr);
+	int rtype = ipv6_addr_type(raddr);
+	__u32 flags = 0;
+
+	if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) {
+		flags = IP6_TNL_F_CAP_PER_PACKET;
+	} else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+		   rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
+		   !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
+		   (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
+		if (ltype&IPV6_ADDR_UNICAST)
+			flags |= IP6_TNL_F_CAP_XMIT;
+		if (rtype&IPV6_ADDR_UNICAST)
+			flags |= IP6_TNL_F_CAP_RCV;
+	}
+	return flags;
+}
+
 /* called with rcu_read_lock() */
-static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t)
+static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
+				  const struct in6_addr *laddr,
+				  const struct in6_addr *raddr)
 {
 	struct ip6_tnl_parm *p = &t->parms;
 	int ret = 0;
 	struct net *net = dev_net(t->dev);
 
-	if (p->flags & IP6_TNL_F_CAP_RCV) {
+	if ((p->flags & IP6_TNL_F_CAP_RCV) ||
+	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
+	     (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) {
 		struct net_device *ldev = NULL;
 
 		if (p->link)
 			ldev = dev_get_by_index_rcu(net, p->link);
 
-		if ((ipv6_addr_is_multicast(&p->laddr) ||
-		     likely(ipv6_chk_addr(net, &p->laddr, ldev, 0))) &&
-		    likely(!ipv6_chk_addr(net, &p->raddr, NULL, 0)))
+		if ((ipv6_addr_is_multicast(laddr) ||
+		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
+		    likely(!ipv6_chk_addr(net, raddr, NULL, 0)))
 			ret = 1;
-
 	}
 	return ret;
 }
@@ -740,7 +766,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 			goto discard;
 		}
 
-		if (!ip6_tnl_rcv_ctl(t)) {
+		if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
 			t->dev->stats.rx_dropped++;
 			rcu_read_unlock();
 			goto discard;
@@ -1114,25 +1140,6 @@ tx_err:
 	return NETDEV_TX_OK;
 }
 
-static void ip6_tnl_set_cap(struct ip6_tnl *t)
-{
-	struct ip6_tnl_parm *p = &t->parms;
-	int ltype = ipv6_addr_type(&p->laddr);
-	int rtype = ipv6_addr_type(&p->raddr);
-
-	p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV);
-
-	if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
-	    rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
-	    !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
-	    (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
-		if (ltype&IPV6_ADDR_UNICAST)
-			p->flags |= IP6_TNL_F_CAP_XMIT;
-		if (rtype&IPV6_ADDR_UNICAST)
-			p->flags |= IP6_TNL_F_CAP_RCV;
-	}
-}
-
 static void ip6_tnl_link_config(struct ip6_tnl *t)
 {
 	struct net_device *dev = t->dev;
@@ -1153,7 +1160,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 	if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
 		fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
 
-	ip6_tnl_set_cap(t);
+	p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
+	p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
 
 	if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
 		dev->flags |= IFF_POINTOPOINT;
@@ -1438,6 +1446,9 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 
 	t->parms.proto = IPPROTO_IPV6;
 	dev_hold(dev);
+
+	ip6_tnl_link_config(t);
+
 	rcu_assign_pointer(ip6n->tnls_wc[0], t);
 	return 0;
 }

-- 
Phil Dibowitz                             phil@ipom.com
Open Source software and tech docs        Insanity Palace of Metallica
http://www.phildev.net/                   http://www.ipom.com/

"Be who you are and say what you feel, because those who mind don't matter
 and those who matter don't mind."
 - Dr. Seuss

^ permalink raw reply related

* Re: Reoccuring kern.log events after running xl2tp with ethernet adapter Realtek 8111E
From: Dustin Schumm @ 2012-06-05 15:59 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev
In-Reply-To: <20120605153753.GA12514@electric-eye.fr.zoreil.com>

On Tue, Jun 5, 2012 at 11:37 AM, Francois Romieu <romieu@fr.zoreil.com> wrote:
>
> This is with an usual 1500 bytes MTU and without TSO (see 'ethtool -k ethX'),
> right ?
>
> --
> Ueimor

Yes,

MTU:1500
tcp-segmentation-offload: off

^ permalink raw reply

* Re: [PATCH RFC] c_can_pci: generic module for c_can on PCI
From: Alessandro Rubini @ 2012-06-05 16:50 UTC (permalink / raw)
  To: anilkumar
  Cc: mkl, bhupesh.sharma, federico.vaga, alan, wg, giancarlo.asnaghi,
	alan, linux-can, netdev, linux-kernel
In-Reply-To: <331ABD5ECB02734CA317220B2BBEABC13E9CB18A@DBDE01.ent.ti.com>

> I am late to the discussion, is there any specific reason to maintain a
> separate platform file (c_can_pci.c).

Because it depends on pci and ifdef is bad.

> I think 90% of the code is copied from c_can_paltform.c, code
> changes will be less if you merge to existing c_can platform driver.

Yes, but then we need to ifdef around, which merges two bad files
into a single but worse file.

But since the only current user of c_can is the platform device, why
not merging the platform with the core and having pci just register a
platform device?  The only problem I see is that we need cooperation,
because neither me nor federico have a c_can equipped board besides
the pci one.

thanks
/alessandro

^ permalink raw reply

* [PATCH IPROUTE2] tc: Update manpage
From: Vijay Subramanian @ 2012-06-05 17:13 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Eric Dumazet, Vijay Subramanian

Add a reference to fq_codel to the SEE ALSO section.

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 man/man8/tc.8 |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 14a1cd8..5d73c98 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -369,6 +369,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
 .BR tc-cbq (8),
 .BR tc-choke (8),
 .BR tc-codel (8),
+.BR tc-fq_codel (8),
 .BR tc-drr (8),
 .BR tc-htb (8),
 .BR tc-hfsc (8),
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close threshold as a time value.
From: Jerry Chu @ 2012-06-05 17:42 UTC (permalink / raw)
  To: Damian Lukowski; +Cc: Netdev, David Miller, Ilpo Järvinen
In-Reply-To: <CAPshTChwnEaMTOsofFDP0ZY8U3VCP2ZrtXjdhKG03rRsx7kryA@mail.gmail.com>

On Mon, Jun 4, 2012 at 4:50 PM, Jerry Chu <hkchu@google.com> wrote:
> Hi Damian,
>
> On Mon, Jun 4, 2012 at 10:50 AM, Damian Lukowski
> <damian@tvk.rwth-aachen.de> wrote:
>> Hi Jerry,
>>
>> please verify, I understood you correctly.
>>
>> You have set TCP_RTO_MIN to a lower value, e.g. 0.002 seconds to improve
>> your internal low-latency traffic. Because of the improvement, R1
>> timeouts are triggered too fast for external high-RTT traffic. Is that
>> correct?
>
> Correct.
>
>> If so, may I suggest to set tcp_retries1 to a higher value? For
>> TCP_RTO_MIN == 0.002 and tcp_retries1 ==  10, R1 will be calculated to
>> approximately 4 seconds.
>
> I think hacking tcp_retries1 is the wrong solution. E.g., 10 retries may be too
> generous for those short RTT flows.
>
> I think the fundamental problem is - the ideal fix for your original RTO revert
> problem should've used the per-flow RTO to compute R1 & R2. But that
> computation may be too expensive so you used TCP_RTO_MIN as an
> approximation - not a good idea IMHO!

Just realized the correct fix of using the original, non-backoff per flow RTO is
not any more expensive than the current code through ilog2(). What's needed
is a new field "base_rto" to record the original RTO before backoff. I'm leaning
toward this more accurate fix now without any fudge because fudging almost
always causes bugs.

Any comment is welcome. I'm not sure in the existing code if it makes sense
to apply the exponential backoff based computation to thin stream but it's a
separate question so I won't touch it.

Jerry

>
> The easiest solution I can see so far is to replace the check
>
> if (!inet_csk(sk)->icsk_retransmits)
>                return false;
>
> at the beginning of retransmits_timed_out() with
>
> if (inet_csk(sk)->icsk_retransmits < boundary)
>                return false;
>
> Best,
>
> Jerry
>
>>
>> Is that ok?
>>
>> Best regards
>>  Damian
>>
>> Am Freitag, den 01.06.2012, 15:58 -0700 schrieb Jerry Chu:
>>> > From: Damian Lukowski <damian@tvk.rwth-aachen.de>
>>> > Date: Wed, Aug 26, 2009 at 3:16 AM
>>> > Subject: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close
>>> > threshold as a time value.
>>> > To: Netdev <netdev@vger.kernel.org>
>>> >
>>> >
>>> > RFC 1122 specifies two threshold values R1 and R2 for connection timeouts,
>>> > which may represent a number of allowed retransmissions or a timeout value.
>>> > Currently linux uses sysctl_tcp_retries{1,2} to specify the thresholds
>>> > in number of allowed retransmissions.
>>> >
>>> > For any desired threshold R2 (by means of time) one can specify tcp_retries2
>>> > (by means of number of retransmissions) such that TCP will not time out
>>> > earlier than R2. This is the case, because the RTO schedule follows a fixed
>>> > pattern, namely exponential backoff.
>>> >
>>> > However, the RTO behaviour is not predictable any more if RTO backoffs can
>>> > be
>>> > reverted, as it is the case in the draft
>>> > "Make TCP more Robust to Long Connectivity Disruptions"
>>> > (http://tools.ietf.org/html/draft-zimmermann-tcp-lcd).
>>> >
>>> > In the worst case TCP would time out a connection after 3.2 seconds, if the
>>> > initial RTO equaled MIN_RTO and each backoff has been reverted.
>>> >
>>> > This patch introduces a function retransmits_timed_out(N),
>>> > which calculates the timeout of a TCP connection, assuming an initial
>>> > RTO of MIN_RTO and N unsuccessful, exponentially backed-off retransmissions.
>>> >
>>> > Whenever timeout decisions are made by comparing the retransmission counter
>>> > to some value N, this function can be used, instead.
>>> >
>>> > The meaning of tcp_retries2 will be changed, as many more RTO
>>> > retransmissions
>>> > can occur than the value indicates. However, it yields a timeout which is
>>> > similar to the one of an unpatched, exponentially backing off TCP in the
>>> > same
>>> > scenario. As no application could rely on an RTO greater than MIN_RTO, there
>>> > should be no risk of a regression.
>>>
>>> This looks like a typical "fix one problem, introducing a few more" patch :(.
>>> What do you mean by "no application could rely on an RTO greater than
>>> MIN_RTO..."
>>> above? How can you make the assumption that RTO is not too far off
>>> from TCP_RTO_MIN?
>>>
>>> While you tried to address a problem where the retransmission count
>>> was high but the actual
>>> timeout duration was too short, have you considered the other case
>>> around, i.e., the timeout
>>> duration is long but the retransmission count is too short? This is
>>> exactly what's happening
>>> to us with your patch. We've much reduced TCP_RTO_MIN for our internal
>>> traffic, but not
>>> noticing your change has severely shortened the R1 & R2 recommended by
>>> RFC1122 for our
>>> long haul traffic until now. In many cases R1 threshold was met upon
>>> the first retrans timeout.
>>>
>>> I think retransmits_timed_out() should check against both time
>>> duration and retrans count
>>> (icsk_retransmits).
>>>
>>> Thought?
>>>
>>> Jerry
>>>
>>> >
>>> > Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
>>> > ---
>>> >  include/net/tcp.h    |   18 ++++++++++++++++++
>>> >  net/ipv4/tcp_timer.c |   11 +++++++----
>>> >  2 files changed, 25 insertions(+), 4 deletions(-)
>>> >
>>> > diff --git a/include/net/tcp.h b/include/net/tcp.h
>>> > index c35b329..17d1a88 100644
>>> > --- a/include/net/tcp.h
>>> > +++ b/include/net/tcp.h
>>> > @@ -1247,6 +1247,24 @@ static inline struct sk_buff
>>> > *tcp_write_queue_prev(struct sock *sk, struct sk_bu
>>> >  #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
>>> >        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
>>> >
>>> > +static inline bool retransmits_timed_out(const struct sock *sk,
>>> > +                                        unsigned int boundary)
>>> > +{
>>> > +       int limit, K;
>>> > +       if (!inet_csk(sk)->icsk_retransmits)
>>> > +               return false;
>>> > +
>>> > +       K = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
>>> > +
>>> > +       if (boundary <= K)
>>> > +               limit = ((2 << boundary) - 1) * TCP_RTO_MIN;
>>> > +       else
>>> > +               limit = ((2 << K) - 1) * TCP_RTO_MIN +
>>> > +                       (boundary - K) * TCP_RTO_MAX;
>>> > +
>>> > +       return (tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >= limit;
>>> > +}
>>> > +
>>> >  static inline struct sk_buff *tcp_send_head(struct sock *sk)
>>> >  {
>>> >        return sk->sk_send_head;
>>> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>> > index a3ba494..2972d7b 100644
>>> > --- a/net/ipv4/tcp_timer.c
>>> > +++ b/net/ipv4/tcp_timer.c
>>> > @@ -137,13 +137,14 @@ static int tcp_write_timeout(struct sock *sk)
>>> >  {
>>> >        struct inet_connection_sock *icsk = inet_csk(sk);
>>> >        int retry_until;
>>> > +       bool do_reset;
>>> >
>>> >        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
>>> >                if (icsk->icsk_retransmits)
>>> >                        dst_negative_advice(&sk->sk_dst_cache);
>>> >                retry_until = icsk->icsk_syn_retries ? :
>>> > sysctl_tcp_syn_retries;
>>> >        } else {
>>> > -               if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
>>> > +               if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
>>> >                        /* Black hole detection */
>>> >                        tcp_mtu_probing(icsk, sk);
>>> >
>>> > @@ -155,13 +156,15 @@ static int tcp_write_timeout(struct sock *sk)
>>> >                        const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
>>> >
>>> >                        retry_until = tcp_orphan_retries(sk, alive);
>>> > +                       do_reset = alive ||
>>> > +                                  !retransmits_timed_out(sk, retry_until);
>>> >
>>> > -                       if (tcp_out_of_resources(sk, alive ||
>>> > icsk->icsk_retransmits < retry_until))
>>> > +                       if (tcp_out_of_resources(sk, do_reset))
>>> >                                return 1;
>>> >                }
>>> >        }
>>> >
>>> > -       if (icsk->icsk_retransmits >= retry_until) {
>>> > +       if (retransmits_timed_out(sk, retry_until)) {
>>> >                /* Has it gone just too far? */
>>> >                tcp_write_err(sk);
>>> >                return 1;
>>> > @@ -385,7 +388,7 @@ void tcp_retransmit_timer(struct sock *sk)
>>> >  out_reset_timer:
>>> >        icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>>> >        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
>>> > TCP_RTO_MAX);
>>> > -       if (icsk->icsk_retransmits > sysctl_tcp_retries1)
>>> > +       if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
>>> >                __sk_dst_reset(sk);
>>> >
>>> >  out:;
>>> > --
>>> > 1.6.3.3
>>> >
>>> > --
>>> > To unsubscribe from this list: send the line "unsubscribe netdev" in
>>> > the body of a message to majordomo@vger.kernel.org
>>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> >
>>
>>

^ permalink raw reply

* Re: [PATCH IPROUTE2] tc: Update manpage
From: Jan Ceuleers @ 2012-06-05 17:46 UTC (permalink / raw)
  To: Vijay Subramanian; +Cc: netdev, Stephen Hemminger, Eric Dumazet
In-Reply-To: <1338916400-9190-1-git-send-email-subramanian.vijay@gmail.com>

On 06/05/2012 07:13 PM, Vijay Subramanian wrote:
> @@ -369,6 +369,7 @@ was written by Alexey N. Kuznetsov and added in Linux 2.2.
>  .BR tc-cbq (8),
>  .BR tc-choke (8),
>  .BR tc-codel (8),
> +.BR tc-fq_codel (8),
>  .BR tc-drr (8),
>  .BR tc-htb (8),
>  .BR tc-hfsc (8),

Could I suggest maintaining alphabetic order?

^ permalink raw reply

* [PATCH IPROUTE2 V2] tc: Update manpage
From: Vijay Subramanian @ 2012-06-05 18:41 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Eric Dumazet, jan.ceuleers, Vijay Subramanian

This makes 2 changes:
1: Add fq_codel to SEE ALSO section in tc manpage.
2: Reorder the SEE ALSO section to make the order alphabetical
(suggested by Jan Ceuleers ).

Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 V2: Reorder the SEE ALSO section to make it alphabetical.

 man/man8/tc.8 |   15 ++++++++-------
 1 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 14a1cd8..958ab98 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -366,21 +366,22 @@ print rates in IEC units (ie. 1K = 1024).
 .B tc
 was written by Alexey N. Kuznetsov and added in Linux 2.2.
 .SH SEE ALSO
+.BR tc-bfifo (8),
 .BR tc-cbq (8),
 .BR tc-choke (8),
 .BR tc-codel (8),
 .BR tc-drr (8),
-.BR tc-htb (8),
-.BR tc-hfsc (8),
+.BR tc-fq_codel (8),
 .BR tc-hfsc (7),
-.BR tc-sfb (8),
-.BR tc-sfq (8),
-.BR tc-red (8),
-.BR tc-tbf (8),
+.BR tc-hfsc (8),
+.BR tc-htb (8),
 .BR tc-pfifo (8),
-.BR tc-bfifo (8),
 .BR tc-pfifo_fast (8),
+.BR tc-red (8),
+.BR tc-sfb (8),
+.BR tc-sfq (8),
 .BR tc-stab (8),
+.BR tc-tbf (8),
 .br
 .RB "User documentation at " http://lartc.org/ ", but please direct bugreports and patches to: " <netdev@vger.kernel.org>
 
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close threshold as a time value.
From: Damian Lukowski @ 2012-06-05 18:39 UTC (permalink / raw)
  To: Jerry Chu; +Cc: Netdev, David Miller, Ilpo Järvinen
In-Reply-To: <CAPshTCg8W0QxubvO_dWBsSOqvCEdBSWLKv3cNOvn=VbyqHA1zg@mail.gmail.com>

Am Dienstag, den 05.06.2012, 10:42 -0700 schrieb Jerry Chu:
> On Mon, Jun 4, 2012 at 4:50 PM, Jerry Chu <hkchu@google.com> wrote:
> > Hi Damian,
> >
> > On Mon, Jun 4, 2012 at 10:50 AM, Damian Lukowski
> > <damian@tvk.rwth-aachen.de> wrote:
> >> Hi Jerry,
> >>
> >> please verify, I understood you correctly.
> >>
> >> You have set TCP_RTO_MIN to a lower value, e.g. 0.002 seconds to improve
> >> your internal low-latency traffic. Because of the improvement, R1
> >> timeouts are triggered too fast for external high-RTT traffic. Is that
> >> correct?
> >
> > Correct.
> >
> >> If so, may I suggest to set tcp_retries1 to a higher value? For
> >> TCP_RTO_MIN == 0.002 and tcp_retries1 ==  10, R1 will be calculated to
> >> approximately 4 seconds.
> >
> > I think hacking tcp_retries1 is the wrong solution. E.g., 10 retries may be too
> > generous for those short RTT flows.
> >
> > I think the fundamental problem is - the ideal fix for your original RTO revert
> > problem should've used the per-flow RTO to compute R1 & R2. But that
> > computation may be too expensive so you used TCP_RTO_MIN as an
> > approximation - not a good idea IMHO!
> 
> Just realized the correct fix of using the original, non-backoff per flow RTO is
> not any more expensive than the current code through ilog2(). What's needed
> is a new field "base_rto" to record the original RTO before backoff. I'm leaning
> toward this more accurate fix now without any fudge because fudging almost
> always causes bugs.


The current version of retransmits_timed_out() uses such a field
already. I suppose, we can do a combination like the following?

-       unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+       unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : __tcp_set_rto(tcp_sk(sk));
+       rto_base = rto_base ? : TCP_RTO_MIN;
 
-       if (!inet_csk(sk)->icsk_retransmits)
+       if (inet_csk(sk)->icsk_retransmits < boundary)
 

Regards
 Damian

> 
> Any comment is welcome. I'm not sure in the existing code if it makes sense
> to apply the exponential backoff based computation to thin stream but it's a
> separate question so I won't touch it.
> 
> Jerry
> 
> >
> > The easiest solution I can see so far is to replace the check
> >
> > if (!inet_csk(sk)->icsk_retransmits)
> >                return false;
> >
> > at the beginning of retransmits_timed_out() with
> >
> > if (inet_csk(sk)->icsk_retransmits < boundary)
> >                return false;
> >
> > Best,
> >
> > Jerry
> >
> >>
> >> Is that ok?
> >>
> >> Best regards
> >>  Damian
> >>
> >> Am Freitag, den 01.06.2012, 15:58 -0700 schrieb Jerry Chu:
> >>> > From: Damian Lukowski <damian@tvk.rwth-aachen.de>
> >>> > Date: Wed, Aug 26, 2009 at 3:16 AM
> >>> > Subject: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close
> >>> > threshold as a time value.
> >>> > To: Netdev <netdev@vger.kernel.org>
> >>> >
> >>> >
> >>> > RFC 1122 specifies two threshold values R1 and R2 for connection timeouts,
> >>> > which may represent a number of allowed retransmissions or a timeout value.
> >>> > Currently linux uses sysctl_tcp_retries{1,2} to specify the thresholds
> >>> > in number of allowed retransmissions.
> >>> >
> >>> > For any desired threshold R2 (by means of time) one can specify tcp_retries2
> >>> > (by means of number of retransmissions) such that TCP will not time out
> >>> > earlier than R2. This is the case, because the RTO schedule follows a fixed
> >>> > pattern, namely exponential backoff.
> >>> >
> >>> > However, the RTO behaviour is not predictable any more if RTO backoffs can
> >>> > be
> >>> > reverted, as it is the case in the draft
> >>> > "Make TCP more Robust to Long Connectivity Disruptions"
> >>> > (http://tools.ietf.org/html/draft-zimmermann-tcp-lcd).
> >>> >
> >>> > In the worst case TCP would time out a connection after 3.2 seconds, if the
> >>> > initial RTO equaled MIN_RTO and each backoff has been reverted.
> >>> >
> >>> > This patch introduces a function retransmits_timed_out(N),
> >>> > which calculates the timeout of a TCP connection, assuming an initial
> >>> > RTO of MIN_RTO and N unsuccessful, exponentially backed-off retransmissions.
> >>> >
> >>> > Whenever timeout decisions are made by comparing the retransmission counter
> >>> > to some value N, this function can be used, instead.
> >>> >
> >>> > The meaning of tcp_retries2 will be changed, as many more RTO
> >>> > retransmissions
> >>> > can occur than the value indicates. However, it yields a timeout which is
> >>> > similar to the one of an unpatched, exponentially backing off TCP in the
> >>> > same
> >>> > scenario. As no application could rely on an RTO greater than MIN_RTO, there
> >>> > should be no risk of a regression.
> >>>
> >>> This looks like a typical "fix one problem, introducing a few more" patch :(.
> >>> What do you mean by "no application could rely on an RTO greater than
> >>> MIN_RTO..."
> >>> above? How can you make the assumption that RTO is not too far off
> >>> from TCP_RTO_MIN?
> >>>
> >>> While you tried to address a problem where the retransmission count
> >>> was high but the actual
> >>> timeout duration was too short, have you considered the other case
> >>> around, i.e., the timeout
> >>> duration is long but the retransmission count is too short? This is
> >>> exactly what's happening
> >>> to us with your patch. We've much reduced TCP_RTO_MIN for our internal
> >>> traffic, but not
> >>> noticing your change has severely shortened the R1 & R2 recommended by
> >>> RFC1122 for our
> >>> long haul traffic until now. In many cases R1 threshold was met upon
> >>> the first retrans timeout.
> >>>
> >>> I think retransmits_timed_out() should check against both time
> >>> duration and retrans count
> >>> (icsk_retransmits).
> >>>
> >>> Thought?
> >>>
> >>> Jerry
> >>>
> >>> >
> >>> > Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
> >>> > ---
> >>> >  include/net/tcp.h    |   18 ++++++++++++++++++
> >>> >  net/ipv4/tcp_timer.c |   11 +++++++----
> >>> >  2 files changed, 25 insertions(+), 4 deletions(-)
> >>> >
> >>> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> >>> > index c35b329..17d1a88 100644
> >>> > --- a/include/net/tcp.h
> >>> > +++ b/include/net/tcp.h
> >>> > @@ -1247,6 +1247,24 @@ static inline struct sk_buff
> >>> > *tcp_write_queue_prev(struct sock *sk, struct sk_bu
> >>> >  #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
> >>> >        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
> >>> >
> >>> > +static inline bool retransmits_timed_out(const struct sock *sk,
> >>> > +                                        unsigned int boundary)
> >>> > +{
> >>> > +       int limit, K;
> >>> > +       if (!inet_csk(sk)->icsk_retransmits)
> >>> > +               return false;
> >>> > +
> >>> > +       K = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
> >>> > +
> >>> > +       if (boundary <= K)
> >>> > +               limit = ((2 << boundary) - 1) * TCP_RTO_MIN;
> >>> > +       else
> >>> > +               limit = ((2 << K) - 1) * TCP_RTO_MIN +
> >>> > +                       (boundary - K) * TCP_RTO_MAX;
> >>> > +
> >>> > +       return (tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >= limit;
> >>> > +}
> >>> > +
> >>> >  static inline struct sk_buff *tcp_send_head(struct sock *sk)
> >>> >  {
> >>> >        return sk->sk_send_head;
> >>> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> >>> > index a3ba494..2972d7b 100644
> >>> > --- a/net/ipv4/tcp_timer.c
> >>> > +++ b/net/ipv4/tcp_timer.c
> >>> > @@ -137,13 +137,14 @@ static int tcp_write_timeout(struct sock *sk)
> >>> >  {
> >>> >        struct inet_connection_sock *icsk = inet_csk(sk);
> >>> >        int retry_until;
> >>> > +       bool do_reset;
> >>> >
> >>> >        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
> >>> >                if (icsk->icsk_retransmits)
> >>> >                        dst_negative_advice(&sk->sk_dst_cache);
> >>> >                retry_until = icsk->icsk_syn_retries ? :
> >>> > sysctl_tcp_syn_retries;
> >>> >        } else {
> >>> > -               if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
> >>> > +               if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
> >>> >                        /* Black hole detection */
> >>> >                        tcp_mtu_probing(icsk, sk);
> >>> >
> >>> > @@ -155,13 +156,15 @@ static int tcp_write_timeout(struct sock *sk)
> >>> >                        const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
> >>> >
> >>> >                        retry_until = tcp_orphan_retries(sk, alive);
> >>> > +                       do_reset = alive ||
> >>> > +                                  !retransmits_timed_out(sk, retry_until);
> >>> >
> >>> > -                       if (tcp_out_of_resources(sk, alive ||
> >>> > icsk->icsk_retransmits < retry_until))
> >>> > +                       if (tcp_out_of_resources(sk, do_reset))
> >>> >                                return 1;
> >>> >                }
> >>> >        }
> >>> >
> >>> > -       if (icsk->icsk_retransmits >= retry_until) {
> >>> > +       if (retransmits_timed_out(sk, retry_until)) {
> >>> >                /* Has it gone just too far? */
> >>> >                tcp_write_err(sk);
> >>> >                return 1;
> >>> > @@ -385,7 +388,7 @@ void tcp_retransmit_timer(struct sock *sk)
> >>> >  out_reset_timer:
> >>> >        icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
> >>> >        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
> >>> > TCP_RTO_MAX);
> >>> > -       if (icsk->icsk_retransmits > sysctl_tcp_retries1)
> >>> > +       if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
> >>> >                __sk_dst_reset(sk);
> >>> >
> >>> >  out:;
> >>> > --
> >>> > 1.6.3.3
> >>> >
> >>> > --
> >>> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> >>> > the body of a message to majordomo@vger.kernel.org
> >>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>> >
> >>
> >>

^ permalink raw reply

* Re: [net-next PATCH 1/3] Added kernel support in EEE Ethtool commands
From: Ben Hutchings @ 2012-06-05 19:01 UTC (permalink / raw)
  To: Yuval Mintz; +Cc: davem, netdev, eilong, peppe.cavallaro
In-Reply-To: <1338878342-24586-2-git-send-email-yuvalmin@broadcom.com>

On Tue, 2012-06-05 at 09:39 +0300, Yuval Mintz wrote:
> This patch extends the kernel's ethtool interface by adding support
> for 2 new EEE commands - get_eee and set_eee.
> 
> Thanks goes to Giuseppe Cavallaro for his original patch adding this support.
> 
> Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
> ---
>  include/linux/ethtool.h |   19 +++++++++++++++++++
>  net/core/ethtool.c      |   34 ++++++++++++++++++++++++++++++++++
>  2 files changed, 53 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
> index e17fa71..527de4c 100644
> --- a/include/linux/ethtool.h
> +++ b/include/linux/ethtool.h
> @@ -136,6 +136,19 @@ struct ethtool_eeprom {
>  	__u8	data[0];
>  };
>  
> +/* EEE settings */
> +struct ethtool_eee {
> +	__u32	cmd;
> +	__u32	supported;
> +	__u32	advertised;
> +	__u32	lp_advertised;
> +	__u32	eee_active;
> +	__u32	eee_enabled;
> +	__u32	tx_lpi_enabled;
> +	__u32	tx_lpi_timer;
> +	__u32	reserved[2];
> +};

This needs a kernel-doc comment explaining exactly what each of the
fields means.

[...]
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -729,6 +729,34 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
>  	return dev->ethtool_ops->set_wol(dev, &wol);
>  }
>  
> +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
> +{
> +	struct ethtool_eee edata;
> +
> +	if (!dev->ethtool_ops->get_eee)
> +		return -EOPNOTSUPP;

We *must* initialise all of edata and we should not leave it to the
driver to do.  Otherwise we will be copying back uninitialised data to
userland which (1) might reveal sensitive information stored on the
stack by previous function calls (2) sets the cmd field to the wrong
value (3) sets the reserved fields to random values, making them
unusable for expansion.

> +	dev->ethtool_ops->get_eee(dev, &edata);

Missing error check.

> +	if (copy_to_user(useraddr, &edata, sizeof(edata)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
[...]

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH] netdev: mv643xx_eth: Prevent build on PPC32
From: Josh Boyer @ 2012-06-05 19:28 UTC (permalink / raw)
  To: Lennert Buytenhek; +Cc: Andrew Lunn, Olof Johansson, netdev

Commit 452503ebc (ARM: Orion: Eth: Add clk/clkdev support.) added use of
the clk driver API which results in compile errors on architectures that
don't implement the clk API.

ERROR: "clk_enable" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
ERROR: "clk_disable" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
ERROR: "clk_put" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
ERROR: "clk_get_rate" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
ERROR: "clk_get" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!

Selecting CLKDEV_LOOKUP doesn't fix this either, as the build then fails with:

In file included from drivers/clk/clkdev.c:21:0:
include/linux/clkdev.h:15:24: fatal error: asm/clkdev.h: No such file or directory

So we just prevent this from building at all on PPC32.

Signed-off-by: Josh Boyer <jwboyer@redhat.com>
---
 drivers/net/ethernet/marvell/Kconfig |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig
index 0029934..628f5b1 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -20,7 +20,7 @@ if NET_VENDOR_MARVELL
 
 config MV643XX_ETH
 	tristate "Marvell Discovery (643XX) and Orion ethernet support"
-	depends on (MV64X60 || PPC32 || PLAT_ORION) && INET
+	depends on (MV64X60 || PLAT_ORION) && INET
 	select INET_LRO
 	select PHYLIB
 	---help---
-- 
1.7.10.2

^ permalink raw reply related

* Re: linux-next: Tree for Apr 12
From: Andrew Morton @ 2012-06-05 19:50 UTC (permalink / raw)
  To: Eric Paris
  Cc: Stephen Rothwell, linux-next, LKML, netdev, James Morris,
	Stephen Smalley
In-Reply-To: <1338864128.17933.2.camel@localhost>

On Mon, 04 Jun 2012 22:42:08 -0400
Eric Paris <eparis@redhat.com> wrote:

> > I really do not want to revert this and feel that the only right fix is
> > going to be to update your selinux policy to allow this new check.  I'd
> > rather not allow (whatever program) to truncate() files willy-nilly (in
> > violation of the intentions of selinux policy)
> > 
> > I'm sorry I never saw it sooner.  We've had it in RHEL for even longer
> > than the 3 months it's been in -next.  I think the 'right' fix is going
> > to have to be an update to SELinux policy (for your long dead system, if
> > you give me the denial I can build you a new policy) rather than leaving
> > the potential security hole in mainline...
> 
> Andrew sent me his audit log and it didn't show anything.  But it got me
> thinking.  Now I think this actually is a code bug.  Andrew, can you
> test this?
> 
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 2e7bd67..20a4315 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -2758,7 +2758,7 @@ static int selinux_inode_setattr(struct dentry *dentry, struct iattr *iattr)
>  			ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
>  		return dentry_has_perm(cred, dentry, FILE__SETATTR);
>  
> -	if (ia_valid & ATTR_SIZE)
> +	if ((ia_valid & ATTR_SIZE) && selinux_policycap_openperm)
>  		av |= FILE__OPEN;
>  
>  	return dentry_has_perm(cred, dentry, av);

That fixed it.

^ permalink raw reply

* Re: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close threshold as a time value.
From: Jerry Chu @ 2012-06-05 21:22 UTC (permalink / raw)
  To: Damian Lukowski; +Cc: Netdev, David Miller, Ilpo Järvinen
In-Reply-To: <1338921588.17263.6.camel@nexus>

On Tue, Jun 5, 2012 at 11:39 AM, Damian Lukowski
<damian@tvk.rwth-aachen.de> wrote:
> Am Dienstag, den 05.06.2012, 10:42 -0700 schrieb Jerry Chu:
>> On Mon, Jun 4, 2012 at 4:50 PM, Jerry Chu <hkchu@google.com> wrote:
>> > Hi Damian,
>> >
>> > On Mon, Jun 4, 2012 at 10:50 AM, Damian Lukowski
>> > <damian@tvk.rwth-aachen.de> wrote:
>> >> Hi Jerry,
>> >>
>> >> please verify, I understood you correctly.
>> >>
>> >> You have set TCP_RTO_MIN to a lower value, e.g. 0.002 seconds to improve
>> >> your internal low-latency traffic. Because of the improvement, R1
>> >> timeouts are triggered too fast for external high-RTT traffic. Is that
>> >> correct?
>> >
>> > Correct.
>> >
>> >> If so, may I suggest to set tcp_retries1 to a higher value? For
>> >> TCP_RTO_MIN == 0.002 and tcp_retries1 ==  10, R1 will be calculated to
>> >> approximately 4 seconds.
>> >
>> > I think hacking tcp_retries1 is the wrong solution. E.g., 10 retries may be too
>> > generous for those short RTT flows.
>> >
>> > I think the fundamental problem is - the ideal fix for your original RTO revert
>> > problem should've used the per-flow RTO to compute R1 & R2. But that
>> > computation may be too expensive so you used TCP_RTO_MIN as an
>> > approximation - not a good idea IMHO!
>>
>> Just realized the correct fix of using the original, non-backoff per flow RTO is
>> not any more expensive than the current code through ilog2(). What's needed
>> is a new field "base_rto" to record the original RTO before backoff. I'm leaning
>> toward this more accurate fix now without any fudge because fudging almost
>> always causes bugs.
>
>
> The current version of retransmits_timed_out() uses such a field
> already. I suppose, we can do a combination like the following?
>
> -       unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
> +       unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : __tcp_set_rto(tcp_sk(sk));

Yes that could work and we probably don't need a new field for the original RTO.

But I started wondering what the problem you tried to solve initially. The old
counter (icsk_retransmits) based code was really easy to understand, debug, and
matched well with the API (sysctl_tcp_retries1, sysctl_tcp_retries2,
TCP_SYNCNT,...), which are all counter based. Moreover, my simple brain has
a strong prejudice against complex code unless the complexity is justified.

Could you point out where backoff revert might happen? (tcp_v4_err() when
handing ICMP errors?) And for those cases is it possible to either not increment
icsk_retransmits (as long as it won't get us into infinite
retransmissions), or invent
a separate field for the sole purpose of timeout check? Won't that be
much simpler
than your current fix?

Best,

Jerry

> +       rto_base = rto_base ? : TCP_RTO_MIN;
>
> -       if (!inet_csk(sk)->icsk_retransmits)
> +       if (inet_csk(sk)->icsk_retransmits < boundary)
>
>
> Regards
>  Damian
>
>>
>> Any comment is welcome. I'm not sure in the existing code if it makes sense
>> to apply the exponential backoff based computation to thin stream but it's a
>> separate question so I won't touch it.
>>
>> Jerry
>>
>> >
>> > The easiest solution I can see so far is to replace the check
>> >
>> > if (!inet_csk(sk)->icsk_retransmits)
>> >                return false;
>> >
>> > at the beginning of retransmits_timed_out() with
>> >
>> > if (inet_csk(sk)->icsk_retransmits < boundary)
>> >                return false;
>> >
>> > Best,
>> >
>> > Jerry
>> >
>> >>
>> >> Is that ok?
>> >>
>> >> Best regards
>> >>  Damian
>> >>
>> >> Am Freitag, den 01.06.2012, 15:58 -0700 schrieb Jerry Chu:
>> >>> > From: Damian Lukowski <damian@tvk.rwth-aachen.de>
>> >>> > Date: Wed, Aug 26, 2009 at 3:16 AM
>> >>> > Subject: [PATCH 3/3] Revert Backoff [v3]: Calculate TCP's connection close
>> >>> > threshold as a time value.
>> >>> > To: Netdev <netdev@vger.kernel.org>
>> >>> >
>> >>> >
>> >>> > RFC 1122 specifies two threshold values R1 and R2 for connection timeouts,
>> >>> > which may represent a number of allowed retransmissions or a timeout value.
>> >>> > Currently linux uses sysctl_tcp_retries{1,2} to specify the thresholds
>> >>> > in number of allowed retransmissions.
>> >>> >
>> >>> > For any desired threshold R2 (by means of time) one can specify tcp_retries2
>> >>> > (by means of number of retransmissions) such that TCP will not time out
>> >>> > earlier than R2. This is the case, because the RTO schedule follows a fixed
>> >>> > pattern, namely exponential backoff.
>> >>> >
>> >>> > However, the RTO behaviour is not predictable any more if RTO backoffs can
>> >>> > be
>> >>> > reverted, as it is the case in the draft
>> >>> > "Make TCP more Robust to Long Connectivity Disruptions"
>> >>> > (http://tools.ietf.org/html/draft-zimmermann-tcp-lcd).
>> >>> >
>> >>> > In the worst case TCP would time out a connection after 3.2 seconds, if the
>> >>> > initial RTO equaled MIN_RTO and each backoff has been reverted.
>> >>> >
>> >>> > This patch introduces a function retransmits_timed_out(N),
>> >>> > which calculates the timeout of a TCP connection, assuming an initial
>> >>> > RTO of MIN_RTO and N unsuccessful, exponentially backed-off retransmissions.
>> >>> >
>> >>> > Whenever timeout decisions are made by comparing the retransmission counter
>> >>> > to some value N, this function can be used, instead.
>> >>> >
>> >>> > The meaning of tcp_retries2 will be changed, as many more RTO
>> >>> > retransmissions
>> >>> > can occur than the value indicates. However, it yields a timeout which is
>> >>> > similar to the one of an unpatched, exponentially backing off TCP in the
>> >>> > same
>> >>> > scenario. As no application could rely on an RTO greater than MIN_RTO, there
>> >>> > should be no risk of a regression.
>> >>>
>> >>> This looks like a typical "fix one problem, introducing a few more" patch :(.
>> >>> What do you mean by "no application could rely on an RTO greater than
>> >>> MIN_RTO..."
>> >>> above? How can you make the assumption that RTO is not too far off
>> >>> from TCP_RTO_MIN?
>> >>>
>> >>> While you tried to address a problem where the retransmission count
>> >>> was high but the actual
>> >>> timeout duration was too short, have you considered the other case
>> >>> around, i.e., the timeout
>> >>> duration is long but the retransmission count is too short? This is
>> >>> exactly what's happening
>> >>> to us with your patch. We've much reduced TCP_RTO_MIN for our internal
>> >>> traffic, but not
>> >>> noticing your change has severely shortened the R1 & R2 recommended by
>> >>> RFC1122 for our
>> >>> long haul traffic until now. In many cases R1 threshold was met upon
>> >>> the first retrans timeout.
>> >>>
>> >>> I think retransmits_timed_out() should check against both time
>> >>> duration and retrans count
>> >>> (icsk_retransmits).
>> >>>
>> >>> Thought?
>> >>>
>> >>> Jerry
>> >>>
>> >>> >
>> >>> > Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
>> >>> > ---
>> >>> >  include/net/tcp.h    |   18 ++++++++++++++++++
>> >>> >  net/ipv4/tcp_timer.c |   11 +++++++----
>> >>> >  2 files changed, 25 insertions(+), 4 deletions(-)
>> >>> >
>> >>> > diff --git a/include/net/tcp.h b/include/net/tcp.h
>> >>> > index c35b329..17d1a88 100644
>> >>> > --- a/include/net/tcp.h
>> >>> > +++ b/include/net/tcp.h
>> >>> > @@ -1247,6 +1247,24 @@ static inline struct sk_buff
>> >>> > *tcp_write_queue_prev(struct sock *sk, struct sk_bu
>> >>> >  #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
>> >>> >        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
>> >>> >
>> >>> > +static inline bool retransmits_timed_out(const struct sock *sk,
>> >>> > +                                        unsigned int boundary)
>> >>> > +{
>> >>> > +       int limit, K;
>> >>> > +       if (!inet_csk(sk)->icsk_retransmits)
>> >>> > +               return false;
>> >>> > +
>> >>> > +       K = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
>> >>> > +
>> >>> > +       if (boundary <= K)
>> >>> > +               limit = ((2 << boundary) - 1) * TCP_RTO_MIN;
>> >>> > +       else
>> >>> > +               limit = ((2 << K) - 1) * TCP_RTO_MIN +
>> >>> > +                       (boundary - K) * TCP_RTO_MAX;
>> >>> > +
>> >>> > +       return (tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >= limit;
>> >>> > +}
>> >>> > +
>> >>> >  static inline struct sk_buff *tcp_send_head(struct sock *sk)
>> >>> >  {
>> >>> >        return sk->sk_send_head;
>> >>> > diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> >>> > index a3ba494..2972d7b 100644
>> >>> > --- a/net/ipv4/tcp_timer.c
>> >>> > +++ b/net/ipv4/tcp_timer.c
>> >>> > @@ -137,13 +137,14 @@ static int tcp_write_timeout(struct sock *sk)
>> >>> >  {
>> >>> >        struct inet_connection_sock *icsk = inet_csk(sk);
>> >>> >        int retry_until;
>> >>> > +       bool do_reset;
>> >>> >
>> >>> >        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
>> >>> >                if (icsk->icsk_retransmits)
>> >>> >                        dst_negative_advice(&sk->sk_dst_cache);
>> >>> >                retry_until = icsk->icsk_syn_retries ? :
>> >>> > sysctl_tcp_syn_retries;
>> >>> >        } else {
>> >>> > -               if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
>> >>> > +               if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
>> >>> >                        /* Black hole detection */
>> >>> >                        tcp_mtu_probing(icsk, sk);
>> >>> >
>> >>> > @@ -155,13 +156,15 @@ static int tcp_write_timeout(struct sock *sk)
>> >>> >                        const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
>> >>> >
>> >>> >                        retry_until = tcp_orphan_retries(sk, alive);
>> >>> > +                       do_reset = alive ||
>> >>> > +                                  !retransmits_timed_out(sk, retry_until);
>> >>> >
>> >>> > -                       if (tcp_out_of_resources(sk, alive ||
>> >>> > icsk->icsk_retransmits < retry_until))
>> >>> > +                       if (tcp_out_of_resources(sk, do_reset))
>> >>> >                                return 1;
>> >>> >                }
>> >>> >        }
>> >>> >
>> >>> > -       if (icsk->icsk_retransmits >= retry_until) {
>> >>> > +       if (retransmits_timed_out(sk, retry_until)) {
>> >>> >                /* Has it gone just too far? */
>> >>> >                tcp_write_err(sk);
>> >>> >                return 1;
>> >>> > @@ -385,7 +388,7 @@ void tcp_retransmit_timer(struct sock *sk)
>> >>> >  out_reset_timer:
>> >>> >        icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>> >>> >        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
>> >>> > TCP_RTO_MAX);
>> >>> > -       if (icsk->icsk_retransmits > sysctl_tcp_retries1)
>> >>> > +       if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
>> >>> >                __sk_dst_reset(sk);
>> >>> >
>> >>> >  out:;
>> >>> > --
>> >>> > 1.6.3.3
>> >>> >
>> >>> > --
>> >>> > To unsubscribe from this list: send the line "unsubscribe netdev" in
>> >>> > the body of a message to majordomo@vger.kernel.org
>> >>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> >>> >
>> >>
>> >>
>
>

^ permalink raw reply

* Re: [PATCH net-next 5/5] gianfar_ethtool: coding style and whitespace cleanups
From: David Miller @ 2012-06-05 21:37 UTC (permalink / raw)
  To: jan.ceuleers; +Cc: b06378, joe, netdev
In-Reply-To: <4FCDE447.3030103@computer.org>

From: Jan Ceuleers <jan.ceuleers@computer.org>
Date: Tue, 05 Jun 2012 12:49:43 +0200

> On 06/05/2012 11:14 AM, David Miller wrote:
>> From: Jan Ceuleers <jan.ceuleers@computer.org>
>> Date: Tue, 05 Jun 2012 07:54:29 +0200
>> 
>>> So your build environment happens to be one of powerpc, alpha or mips,
>>> does it?
>> 
>> No, I get those errors you posted too.
>> 
>> But like I said, you simply IGNORE THEM, and look for newly introduced
>> errors and warnings.
> 
> David,
> 
> The error I quoted was a fatal error, meaning that compilation did not
> proceed beyond the point to which the error pertained (inclusion of a

I know it's a fatal error.

I'm saying IGNORE it.  I get it TOO!

You look at what it outputs before your changes, and then you run it
again after your changes and see if anything NEW shows up.

That's how I found your typo.

^ permalink raw reply

* Re: [PATCH net-next 5/5] gianfar_ethtool: coding style and whitespace cleanups
From: David Miller @ 2012-06-05 21:38 UTC (permalink / raw)
  To: florian; +Cc: jan.ceuleers, b06378, joe, netdev
In-Reply-To: <4550590.BhQA539plh@flexo>

From: Florian Fainelli <florian@openwrt.org>
Date: Tue, 05 Jun 2012 12:54:34 +0200

> Hi,
> 
> On Tuesday 05 June 2012 12:49:43 Jan Ceuleers wrote:
>> On 06/05/2012 11:14 AM, David Miller wrote:
>> > From: Jan Ceuleers <jan.ceuleers@computer.org>
>> > Date: Tue, 05 Jun 2012 07:54:29 +0200
>> > 
>> >> So your build environment happens to be one of powerpc, alpha or mips,
>> >> does it?
>> > 
>> > No, I get those errors you posted too.
>> > 
>> > But like I said, you simply IGNORE THEM, and look for newly introduced
>> > errors and warnings.
>> 
>> David,
>> 
>> The error I quoted was a fatal error, meaning that compilation did not
>> proceed beyond the point to which the error pertained (inclusion of a
>> header that does not exist in my arch). So I cannot test-compile the
>> driver on my arch and draw any conclusions from that beyond line 91.
> 
> What about you setup a cross-compiler and build for one of these architectures 
> where the driver is enabled?

Dammit, people read what I'm saying.

This isn't even necessary for quick validation of patches.

You ignore the compile errors when you force build the driver, and
then you simply make sure you don't introduce any new errors of
warnings into the build.

For quick sanity checking a cross build is absolutely overkill and
simply not necessary.

^ permalink raw reply

* Re: [PATCH 1/1] net: add dev_loopback_xmit() to avoid duplicate code
From: David Miller @ 2012-06-05 21:41 UTC (permalink / raw)
  To: michel; +Cc: netdev, linux-kernel
In-Reply-To: <1338902613.2792.16.camel@Thor>

From: Michel Machado <michel@digirati.com.br>
Date: Tue, 05 Jun 2012 09:23:33 -0400

> Just to avoid bothering you further, should I resend this patch or not?

Always resend patches.

^ permalink raw reply

* Re: [PATCH v3 net-next 0/5] gianfar: coding style cleanups
From: David Miller @ 2012-06-05 22:38 UTC (permalink / raw)
  To: jan.ceuleers; +Cc: b06378, joe, netdev
In-Reply-To: <1338903735-24527-1-git-send-email-jan.ceuleers@computer.org>

From: Jan Ceuleers <jan.ceuleers@computer.org>
Date: Tue,  5 Jun 2012 15:42:10 +0200

> Various coding style cleanups, mostly whitespace and comment reformatting.
> 
> I have left a number of lines untouched where I felt that the changes I
> was considering were not improving readability. But de gustibus etc.
> 
> Patch 4/5 also removes some superfluous local variable initialisations.
> These are obviously correct because the variable is initialised again
> right after. But you might consider this to be unnecessary code churn
> because the compiler is unlikely to generate better code due to this patch.
> 
> WARNING: Compile-tested only. I do not have the hardware.

All applied, thanks Jan.

^ permalink raw reply

* Re: [Bug 43327] New: IP routing: cached route is applied to wrong network interface
From: David Miller @ 2012-06-05 22:40 UTC (permalink / raw)
  To: shemminger; +Cc: netdev
In-Reply-To: <20120531174641.2809b5c4@nehalam.linuxnetplumber.net>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 31 May 2012 17:46:41 -0700

> Subject: [Bug 43327] New: IP routing: cached route is applied to wrong network interface

I suspect we'll need to root the inetpeer trees in the fib rules
to fix this.

^ permalink raw reply

* Re: [PATCH] netdev: mv643xx_eth: Prevent build on PPC32
From: Ben Hutchings @ 2012-06-05 23:30 UTC (permalink / raw)
  To: Josh Boyer; +Cc: Lennert Buytenhek, Andrew Lunn, Olof Johansson, netdev
In-Reply-To: <20120605192820.GC7683@zod.bos.redhat.com>

[-- Attachment #1: Type: text/plain, Size: 1567 bytes --]

On Tue, 2012-06-05 at 15:28 -0400, Josh Boyer wrote:
> Commit 452503ebc (ARM: Orion: Eth: Add clk/clkdev support.) added use of
> the clk driver API which results in compile errors on architectures that
> don't implement the clk API.
> 
> ERROR: "clk_enable" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
> ERROR: "clk_disable" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
> ERROR: "clk_put" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
> ERROR: "clk_get_rate" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
> ERROR: "clk_get" [drivers/net/ethernet/marvell/mv643xx_eth.ko] undefined!
> 
> Selecting CLKDEV_LOOKUP doesn't fix this either, as the build then fails with:
> 
> In file included from drivers/clk/clkdev.c:21:0:
> include/linux/clkdev.h:15:24: fatal error: asm/clkdev.h: No such file or directory
> 
> So we just prevent this from building at all on PPC32.
[...]

This dependency was introduced by:

commit 16b817579fb61050f1abcc0e81089974328a9c27
Author: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date:   Sat Apr 16 15:24:30 2005 -0700

    [PATCH] ppc32: MV643XX ethernet is an option for Pegasos

commit 06ede91017d015a03cf8c1c87b3ff668f9a846e0
Author: Dale Farnsworth <dale@farnsworth.org>
Date:   Wed Sep 20 12:24:34 2006 -0700

    [PATCH] mv643xx_eth: restrict to 32-bit PPC_MULTIPLATFORM

If Pegasos is still supposed to be supported then this needs to be fixed
properly.

Ben.

-- 
Ben Hutchings
I haven't lost my mind; it's backed up on tape somewhere.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: [v2 PATCH] netfilter: xt_HMARK: fix endian bugs and warnings
From: Pablo Neira Ayuso @ 2012-06-05 23:41 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: kaber, jengelh, netfilter-devel, netdev, dan.carpenter,
	hans.schillstrom
In-Reply-To: <1337330146-26305-1-git-send-email-hans@schillstrom.com>

On Fri, May 18, 2012 at 10:35:46AM +0200, Hans Schillstrom wrote:
> A mix of u32 and __be32 causes endian warning.
> The hash value produced is now the same for big and little endian machines.
> i.e. a mix of Big and Little endian in a cluster is now possible.

Applied with minor glitches, Thanks Hans.

I have rewritten the description.

> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
> ---
>  include/linux/netfilter/xt_HMARK.h |    5 +++
>  net/netfilter/xt_HMARK.c           |   69 ++++++++++++++++++++----------------
>  2 files changed, 43 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h
> index abb1650..826fc58 100644
> --- a/include/linux/netfilter/xt_HMARK.h
> +++ b/include/linux/netfilter/xt_HMARK.h
> @@ -27,7 +27,12 @@ union hmark_ports {
>  		__u16	src;
>  		__u16	dst;
>  	} p16;
> +	struct {
> +		__be16	src;
> +		__be16	dst;
> +	} b16;
>  	__u32	v32;
> +	__be32	b32;
>  };
>  
>  struct xt_hmark_info {
> diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
> index 0a96a43..5119666 100644
> --- a/net/netfilter/xt_HMARK.c
> +++ b/net/netfilter/xt_HMARK.c
> @@ -32,13 +32,13 @@ MODULE_ALIAS("ipt_HMARK");
>  MODULE_ALIAS("ip6t_HMARK");
>  
>  struct hmark_tuple {
> -	u32			src;
> -	u32			dst;
> +	__be32			src;
> +	__be32			dst;
>  	union hmark_ports	uports;
>  	uint8_t			proto;

I have converted this uint8_t to u8. I think I'm responsible for that
slipped through.

>  };
>  
> -static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask)
> +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask)
>  {
>  	return (addr32[0] & mask[0]) ^
>  	       (addr32[1] & mask[1]) ^
> @@ -46,8 +46,8 @@ static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask)
>  	       (addr32[3] & mask[3]);
>  }
>  
> -static inline u32
> -hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask)
> +static inline __be32
> +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask)
>  {
>  	switch (l3num) {
>  	case AF_INET:
> @@ -58,6 +58,22 @@ hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask)
>  	return 0;
>  }
>  
> +static inline void hmark_swap_ports(union hmark_ports *uports,
> +				    const struct xt_hmark_info *info)
> +{
> +	union hmark_ports hp;
> +	u16 src,dst;
> +
> +	hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32;
> +	src = ntohs(hp.b16.src);
> +	dst = ntohs(hp.b16.dst);
> +
> +	if (dst > src)
> +		uports->v32 = (dst << 16) | src;
> +	else
> +		uports->v32 = (src << 16) | dst;
> +}
> +
>  static int
>  hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t,
>  		    const struct xt_hmark_info *info)
> @@ -74,22 +90,19 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t,
>  	otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
>  	rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
>  
> -	t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.all,
> -				 info->src_mask.all);
> -	t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.all,
> -				 info->dst_mask.all);
> +	t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6,
> +				 info->src_mask.ip6);
> +	t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6,
> +				 info->dst_mask.ip6);
>  
>  	if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))
>  		return 0;
>  
>  	t->proto = nf_ct_protonum(ct);
>  	if (t->proto != IPPROTO_ICMP) {
> -		t->uports.p16.src = otuple->src.u.all;
> -		t->uports.p16.dst = rtuple->src.u.all;
> -		t->uports.v32 = (t->uports.v32 & info->port_mask.v32) |
> -				info->port_set.v32;
> -		if (t->uports.p16.dst < t->uports.p16.src)
> -			swap(t->uports.p16.dst, t->uports.p16.src);
> +		t->uports.b16.src = otuple->src.u.all;
> +		t->uports.b16.dst = rtuple->src.u.all;
> +		hmark_swap_ports(&t->uports, info);
>  	}
>  
>  	return 0;
> @@ -102,11 +115,13 @@ static inline u32

I've added this comment on top of hmark_hash:

/* This hash function is endian independent, to ensure consistent hashing if
 * the cluster is composed of big and little endian systems. */

In case that someone is curious about all those conversions.

>  hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info)
[...]

^ permalink raw reply

* Re: [Resend PATCH 1/2] netfilter: remove include/linux/netfilter_ipv4/ipt_addrtype.h
From: Pablo Neira Ayuso @ 2012-06-05 23:41 UTC (permalink / raw)
  To: Cong Wang; +Cc: netfilter-devel, netdev, Cong Wang
In-Reply-To: <1337438341-29732-1-git-send-email-amwang@redhat.com>

On Sat, May 19, 2012 at 10:39:00PM +0800, Cong Wang wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> 
> It was scheduled to be removed.

Applied, thanks.

^ permalink raw reply

* Re: [Resend PATCH 2/2] netfilter: remove rev 0 of xt_connlimit
From: Pablo Neira Ayuso @ 2012-06-05 23:41 UTC (permalink / raw)
  To: Cong Wang; +Cc: netfilter-devel, netdev, Cong Wang, Jan Engelhardt
In-Reply-To: <1337438341-29732-2-git-send-email-amwang@redhat.com>

On Sat, May 19, 2012 at 10:39:01PM +0800, Cong Wang wrote:
> From: Cong Wang <xiyou.wangcong@gmail.com>
> 
> It was scheduled to be removed.

Applied, thanks.

^ permalink raw reply

* Re: [PATCH 01/15] netfilter: add namespace support for l4proto
From: Pablo Neira Ayuso @ 2012-06-05 23:56 UTC (permalink / raw)
  To: Gao feng
  Cc: netfilter-devel, netdev, serge.hallyn, ebiederm, dlezcano,
	Gao feng
In-Reply-To: <1338275063-11711-2-git-send-email-gaofeng@cn.fujitsu.com>

On Tue, May 29, 2012 at 03:04:09PM +0800, Gao feng wrote:
> From: Gao feng <gaofeng@cn.fujitus.com>
> 
> struct nf_proto_net stroes proto's ctl_table_header and ctl_table,
> nf_ct_l4proto_(un)register_sysctl use it to register sysctl.
> because AF_INET6's protocols need not do compat, so register or
> unregister sysctl when l4proto.l3proto != AF_INET6.
> 
> - the net_id field is used to store the pernet_operations id
>   that belones to l4proto.
> 
> - init_net will be used to initial the proto's pernet data
> 
> - nf_ct_(un)register_sysctl are changed to support net namespace,
>   use (un)register_net_sysctl_table replaces (un)register_sysctl_paths.
>   and in nf_ct_unregister_sysctl,kfree table only when users is 0.
> 
> - Add the struct net as param of nf_conntrack_l4proto_(un)register.
>   register or unregister the l4proto only when the net is init_net.
> 
> - nf_conntrack_l4proto_register call init_net to initial the pernet
>   data of l4proto.
> 
> - nf_ct_l4proto_net is used to get the pernet data of l4proto.
> 
> - use init_net as a param of nf_conntrack_l4proto_(un)register.

I have applied this patchset, but I had to rewrite the patch
descriptions.

I don't blame your English neither your writing abilities (I'm not
native speaker and not that good at writing either) but I think you
can make it better next time.

Basically, you don't need to comment every single change that the
patch does. That's easy to see by looking at the patchset.

Instead, just provide brief explanation on your intentions with the
patch, clarify things that may look not obvious to the reviewer and
what we'll get with this.

^ permalink raw reply

* Re: [PATCH] netfilter: xt_recent: Add optional mask option for xt_recent
From: Pablo Neira Ayuso @ 2012-06-06  0:01 UTC (permalink / raw)
  To: Denys Fedoryshchenko; +Cc: Linux netdev
In-Reply-To: <1337285238-13589-1-git-send-email-denys@visp.net.lb>

On Thu, May 17, 2012 at 11:07:18PM +0300, Denys Fedoryshchenko wrote:
> Use case for this feature:
> 1)In some occasions if you need to allow,block,match specific subnet.
> 2)I can use recent as a trigger when netfilter rule matches, with mask 0.0.0.0
> 
> Tested for backward compatibility:
> )old (userspace) iptables, new kernel
> )old kernel, new iptables
> )new kernel, new iptables
> 
> For v2:
>  As Pablo Neira Ayuso suggested, moved nf_inet_addr_mask to xt_recent.h
>  and made info_v1 as a stack variable.

Applied with some minor glitches (see below).

Thanks Denys.

> Signed-off-by: Denys Fedoryshchenko <denys@visp.net.lb>
> CC: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>  include/linux/netfilter/xt_recent.h |   20 +++++++++++
>  net/netfilter/xt_recent.c           |   62 ++++++++++++++++++++++++++++++----
>  2 files changed, 74 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/netfilter/xt_recent.h b/include/linux/netfilter/xt_recent.h
> index 83318e0..5f69ebc 100644
> --- a/include/linux/netfilter/xt_recent.h
> +++ b/include/linux/netfilter/xt_recent.h
> @@ -32,4 +32,24 @@ struct xt_recent_mtinfo {
>  	__u8 side;
>  };
>  
> +struct xt_recent_mtinfo_v1 {
> +	__u32 seconds;
> +	__u32 hit_count;
> +	__u8 check_set;
> +	__u8 invert;
> +	char name[XT_RECENT_NAME_LEN];
> +	__u8 side;
> +	union nf_inet_addr mask;
> +};
> +
> +static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
> +				    union nf_inet_addr *result,
> +				    const union nf_inet_addr *mask)
> +{
> +	result->all[0] = a1->all[0] & mask->all[0];
> +	result->all[1] = a1->all[1] & mask->all[1];
> +	result->all[2] = a1->all[2] & mask->all[2];
> +	result->all[3] = a1->all[3] & mask->all[3];

Yes, I told you to move this to xt_recent. But then I noticed that
nf_inet_addr_cmp in linux/netfilter.h. and well, this is static inline
and other may get some benefit with it.

Yes, I'm changing my mind :-).

> +}
> +
>  #endif /* _LINUX_NETFILTER_XT_RECENT_H */
> diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
> index fc0d6db..ca4375c 100644
> --- a/net/netfilter/xt_recent.c
> +++ b/net/netfilter/xt_recent.c
> @@ -75,6 +75,7 @@ struct recent_entry {
>  struct recent_table {
>  	struct list_head	list;
>  	char			name[XT_RECENT_NAME_LEN];
> +	union nf_inet_addr	mask;
>  	unsigned int		refcnt;
>  	unsigned int		entries;
>  	struct list_head	lru_list;
> @@ -228,10 +229,11 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
>  {
>  	struct net *net = dev_net(par->in ? par->in : par->out);
>  	struct recent_net *recent_net = recent_pernet(net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
> +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
>  	struct recent_table *t;
>  	struct recent_entry *e;
>  	union nf_inet_addr addr = {};
> +	union nf_inet_addr addr_masked;

I've put addr_masked with addr (same line).

>  	u_int8_t ttl;
>  	bool ret = info->invert;
>  
> @@ -261,12 +263,15 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
>  
>  	spin_lock_bh(&recent_lock);
>  	t = recent_table_lookup(recent_net, info->name);
> -	e = recent_entry_lookup(t, &addr, par->family,
> +
> +	nf_inet_addr_mask(&addr, &addr_masked, &t->mask);
> +
> +	e = recent_entry_lookup(t, &addr_masked, par->family,
>  				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
>  	if (e == NULL) {
>  		if (!(info->check_set & XT_RECENT_SET))
>  			goto out;
> -		e = recent_entry_init(t, &addr, par->family, ttl);
> +		e = recent_entry_init(t, &addr_masked, par->family, ttl);
>  		if (e == NULL)
>  			par->hotdrop = true;
>  		ret = !ret;
> @@ -306,10 +311,10 @@ out:
>  	return ret;
>  }
>  
> -static int recent_mt_check(const struct xt_mtchk_param *par)
> +static int recent_mt_check(const struct xt_mtchk_param *par,
> +	const struct xt_recent_mtinfo_v1 *info)
>  {
>  	struct recent_net *recent_net = recent_pernet(par->net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
>  	struct recent_table *t;
>  #ifdef CONFIG_PROC_FS
>  	struct proc_dir_entry *pde;
> @@ -361,6 +366,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par)
>  		goto out;
>  	}
>  	t->refcnt = 1;
> +
> +	memcpy(&t->mask, &info->mask, sizeof(t->mask));
>  	strcpy(t->name, info->name);
>  	INIT_LIST_HEAD(&t->lru_list);
>  	for (i = 0; i < ip_list_hash_size; i++)
> @@ -385,10 +392,29 @@ out:
>  	return ret;
>  }
>  
> +static int recent_mt_check_v0(const struct xt_mtchk_param *par)
> +{
> +	const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo;
> +	struct xt_recent_mtinfo_v1 info_v1;
> +	int ret;
> +
> +	/* Copy old data */
> +	memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo));
> +	/* Default mask will make same behavior as old recent */
> +	memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all));
> +	ret = recent_mt_check(par, &info_v1);
> +	return ret;

return recent_mt_check(...)

I removed that ret variable.

> +}
> +
> +static int recent_mt_check_v1(const struct xt_mtchk_param *par)
> +{
> +	return recent_mt_check(par, par->matchinfo);
> +}
> +
>  static void recent_mt_destroy(const struct xt_mtdtor_param *par)
>  {
>  	struct recent_net *recent_net = recent_pernet(par->net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
> +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
>  	struct recent_table *t;
>  
>  	mutex_lock(&recent_mutex);
> @@ -625,7 +651,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
>  		.family     = NFPROTO_IPV4,
>  		.match      = recent_mt,
>  		.matchsize  = sizeof(struct xt_recent_mtinfo),
> -		.checkentry = recent_mt_check,
> +		.checkentry = recent_mt_check_v0,
>  		.destroy    = recent_mt_destroy,
>  		.me         = THIS_MODULE,
>  	},
> @@ -635,10 +661,30 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
>  		.family     = NFPROTO_IPV6,
>  		.match      = recent_mt,
>  		.matchsize  = sizeof(struct xt_recent_mtinfo),
> -		.checkentry = recent_mt_check,
> +		.checkentry = recent_mt_check_v0,
> +		.destroy    = recent_mt_destroy,
> +		.me         = THIS_MODULE,
> +	},
> +	{
> +		.name       = "recent",
> +		.revision   = 1,
> +		.family     = NFPROTO_IPV4,
> +		.match      = recent_mt,
> +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1),
> +		.checkentry = recent_mt_check_v1,
>  		.destroy    = recent_mt_destroy,
>  		.me         = THIS_MODULE,
>  	},
> +	{
> +		.name       = "recent",
> +		.revision   = 1,
> +		.family     = NFPROTO_IPV6,
> +		.match      = recent_mt,
> +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1),
> +		.checkentry = recent_mt_check_v1,
> +		.destroy    = recent_mt_destroy,
> +		.me         = THIS_MODULE,
> +	}
>  };
>  
>  static int __init recent_mt_init(void)
> -- 
> 1.7.3.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox