Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/6] ipvs: Avoid null-pointer deref in debug code
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

From: Alex Gartrell <agartrell@fb.com>

Use daddr instead of reaching into dest.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alex Gartrell <agartrell@fb.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 91f17c1..437a366 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -316,7 +316,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
 						  local))) {
 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
-			     " daddr=%pI4\n", &dest->addr.ip);
+			     " daddr=%pI4\n", &daddr);
 		goto err_put;
 	}
 
@@ -458,7 +458,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
 						  local))) {
 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
-			     " daddr=%pI6\n", &dest->addr.in6);
+			     " daddr=%pI6\n", daddr);
 		goto err_put;
 	}
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 6/6] netfilter: nft_reject_bridge: restrict reject to prerouting and input
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

Restrict the reject expression to the prerouting and input bridge
hooks. If we allow this to be used from forward or any other later
bridge hook, if the frame is flooded to several ports, we'll end up
sending several reject packets, one per cloned packet.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nft_reject_bridge.c |   33 +++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 31b27e1..654c901 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -18,6 +18,7 @@
 #include <net/netfilter/ipv6/nf_reject.h>
 #include <linux/ip.h>
 #include <net/ip.h>
+#include <linux/netfilter_bridge.h>
 #include "../br_private.h"
 
 static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
@@ -305,12 +306,34 @@ out:
 	data[NFT_REG_VERDICT].verdict = NF_DROP;
 }
 
+static int nft_reject_bridge_validate_hooks(const struct nft_chain *chain)
+{
+	struct nft_base_chain *basechain;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		basechain = nft_base_chain(chain);
+
+		switch (basechain->ops[0].hooknum) {
+		case NF_BR_PRE_ROUTING:
+		case NF_BR_LOCAL_IN:
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+	}
+	return 0;
+}
+
 static int nft_reject_bridge_init(const struct nft_ctx *ctx,
 				  const struct nft_expr *expr,
 				  const struct nlattr * const tb[])
 {
 	struct nft_reject *priv = nft_expr_priv(expr);
-	int icmp_code;
+	int icmp_code, err;
+
+	err = nft_reject_bridge_validate_hooks(ctx->chain);
+	if (err < 0)
+		return err;
 
 	if (tb[NFTA_REJECT_TYPE] == NULL)
 		return -EINVAL;
@@ -359,6 +382,13 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
+				      const struct nft_expr *expr,
+				      const struct nft_data **data)
+{
+	return nft_reject_bridge_validate_hooks(ctx->chain);
+}
+
 static struct nft_expr_type nft_reject_bridge_type;
 static const struct nft_expr_ops nft_reject_bridge_ops = {
 	.type		= &nft_reject_bridge_type,
@@ -366,6 +396,7 @@ static const struct nft_expr_ops nft_reject_bridge_ops = {
 	.eval		= nft_reject_bridge_eval,
 	.init		= nft_reject_bridge_init,
 	.dump		= nft_reject_bridge_dump,
+	.validate	= nft_reject_bridge_validate,
 };
 
 static struct nft_expr_type nft_reject_bridge_type __read_mostly = {
-- 
1.7.10.4


^ permalink raw reply related

* Re: [PATCH] virtio_net: fix use after free
From: Eric Dumazet @ 2014-10-31 12:24 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, netdev, linux-kernel, virtualization,
	David S. Miller
In-Reply-To: <5453270F.3090300@redhat.com>

On Fri, 2014-10-31 at 14:07 +0800, Jason Wang wrote:

> Since they are called before the possible free_old_xmit_skbs(), skb
> won't get freed at this time.

Oh right, I forgot there is no completion handler yet, timer based or
whatever.

Thanks.

^ permalink raw reply

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Eric Dumazet @ 2014-10-31 13:32 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netdev
In-Reply-To: <1414757602-27637-2-git-send-email-fw@strlen.de>

On Fri, 2014-10-31 at 13:13 +0100, Florian Westphal wrote:
> Won't work anymore when tcp_ecn=0 and RTAX_FEATURES route attribute did
> allow ecn.
> 
> Just turn on ecn if the client ts says so.
> 
> This means that while syn cookies are in use clients can turn on ecn
> even if it is off on the server.
> 
> However, there seems to be no harm in permitting this.
> 
> Alternatively one can extend the test to also perform route lookup and
> check RTAX_FEATURES, but it simply doesn't appear to be worth the effort.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  Changes since v1:
>   - reword commit message

Sorry.

Google chose to disable ecn on their GFE, so we set sysctl_tcp_ecn to 0

If I understand your patch, if a synflood is going on, some innocent
connections could get ECN enabled, while we do not want this to ever
happen. ECN really hurts our customers, this is a known fact.

You cannot change this like that, it would force us (and maybe others)
to either revert this patch, or add a knob.

If sysctl_tcp_ecn = 0, there is no way a connection should have ECN
enabled. This was documented years ago.

For the record :

vi +247 Documentation/networking/ip-sysctl.txt

tcp_ecn - INTEGER
        Control use of Explicit Congestion Notification (ECN) by TCP.
        ECN is used only when both ends of the TCP connection indicate
        support for it.  This feature is useful in avoiding losses due
        to congestion by allowing supporting routers to signal
        congestion before having to drop packets.
        Possible values are:
                0 Disable ECN.  Neither initiate nor accept ECN.
                1 Enable ECN when requested by incoming connections and
                  also request ECN on outgoing connection attempts.
                2 Enable ECN when requested by incoming connections
                  but do not request ECN on outgoing connections.
        Default: 2

^ permalink raw reply

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Florian Westphal @ 2014-10-31 13:39 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netdev
In-Reply-To: <1414762333.499.16.camel@edumazet-glaptop2.roam.corp.google.com>

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-10-31 at 13:13 +0100, Florian Westphal wrote:
> > Won't work anymore when tcp_ecn=0 and RTAX_FEATURES route attribute did
> > allow ecn.
> > 
> > Just turn on ecn if the client ts says so.
> > 
> > This means that while syn cookies are in use clients can turn on ecn
> > even if it is off on the server.
> > 
> > However, there seems to be no harm in permitting this.
> > 
> > Alternatively one can extend the test to also perform route lookup and
> > check RTAX_FEATURES, but it simply doesn't appear to be worth the effort.
> > 
> > Signed-off-by: Florian Westphal <fw@strlen.de>
> > ---
> >  Changes since v1:
> >   - reword commit message
> 
> Sorry.
> 
> Google chose to disable ecn on their GFE, so we set sysctl_tcp_ecn to 0
> 
> If I understand your patch, if a synflood is going on, some innocent
> connections could get ECN enabled, while we do not want this to ever
> happen. ECN really hurts our customers, this is a known fact.
>
> You cannot change this like that, it would force us (and maybe others)
> to either revert this patch, or add a knob.

Not needed; if you think it's wrong to remove the check, I will respin
with proper validation.

> If sysctl_tcp_ecn = 0, there is no way a connection should have ECN
> enabled. This was documented years ago.

It would only get enabled if the echoed timestamp (ie the timestamp we
sent in the synack) indicates that ecn was enabled, i.e. the client or
a middlebox would have to munge/modify it to set the 'ecn on' bit in the
timestamp.

If that is too fragile in your opinion I will respin the patch to include
the additional validation via dst.  We already need to fetch the dst
object anyway to fetch certain route attributes not in the timestamp or
cookie, so it's only a matter of reorganizing code first to avoid two lookups.

Let me know what you prefer.

Thanks,
Florian

^ permalink raw reply

* Re: [PATCH net-next 8/8] net/mlx4_en: Extend checksum offloading by CHECKSUM COMPLETE
From: Or Gerlitz @ 2014-10-31 13:52 UTC (permalink / raw)
  To: Yann Ylavic
  Cc: Or Gerlitz, David S. Miller, Linux Netdev List, Matan Barak,
	Amir Vadai, Saeed Mahameed, Shani Michaeli, Jerry Chu
In-Reply-To: <CAKQ1sVNu9AN=gaRQfgqWq635YBZjEHegz0Ljj3xvvcDSoFaU=A@mail.gmail.com>

On Fri, Oct 31, 2014 at 2:38 AM, Yann Ylavic <ylavic.dev@gmail.com> wrote:
> Hi,
>
> On Thu, Oct 30, 2014 at 5:06 PM, Or Gerlitz <ogerlitz@mellanox.com> wrote:
> [...]
>> +static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, int hwtstamp_rx_filter)
>> +{
>> +       __wsum hw_checksum = 0;
>> +
>> +       void *hdr = (u8 *)skb->data + sizeof(struct ethhdr);
>> +
>> +       hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
>> +
>> +       if (((struct ethhdr *)skb->data)->h_proto == htons(ETH_P_8021Q) &&
>> +           hwtstamp_rx_filter != HWTSTAMP_FILTER_NONE) {
>> +               /* next protocol non IPv4 or IPv6 */
>> +               if (((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
>> +                   != htons(ETH_P_IP) ||
>
> Shouldn't this be a AND (&&)?

Oh, yes of course, good catch (this protects against the case of QinQ,
which isn't supported, so it somehow passed the testing). Shani, please
fix it up.

Or.

>
>> +                   ((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
>> +                   != htons(ETH_P_IPV6))
>> +                       return -1;

^ permalink raw reply

* Re: [PATCH net-next 8/8] net/mlx4_en: Extend checksum offloading by CHECKSUM COMPLETE
From: Or Gerlitz @ 2014-10-31 13:57 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Or Gerlitz, David S. Miller, Linux Netdev List, Matan Barak,
	Amir Vadai, Saeed Mahameed, Shani Michaeli, Jerry Chu
In-Reply-To: <CA+mtBx-AvGkUAxq69pTFWoyiM1pdktofEz+o+hCDw=7SWjWtpQ@mail.gmail.com>

On Fri, Oct 31, 2014 at 1:59 AM, Tom Herbert <therbert@google.com> wrote:
> On Thu, Oct 30, 2014 at 9:06 AM, Or Gerlitz <ogerlitz@mellanox.com> wrote:
>> From: Shani Michaeli <shanim@mellanox.com>
>>
>> When processing received traffic, pass CHECKSUM_COMPLETE status to the
>> stack, with calculated checksum for non TCP/UDP packets (such
>> as GRE or ICMP).

> This is very exciting work!

thanks, good to know...

> One question though, what would mlx4
> return in the case of a zero UDP checksum? (I assume this patch won't
> affect this case but would like to make sure).

This patch doesn't change any functionality w.r.t UDP or TCP packets,
only for IP protocols which are different from those two.

When VXLAN packets arrive with zero UDP checksum and the HW verified
the internal checksum we return CHECKSUM_UNNECESSARY, for other cases
(e.g UDP but not VXLAN) I'd like to have 2nd look.

Or.

^ permalink raw reply

* Re: [PATCH net-next 5/8] net/mlx4_en: Remove redundant code from RX/GRO path
From: Or Gerlitz @ 2014-10-31 14:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Or Gerlitz, David S. Miller, Linux Netdev List, Matan Barak,
	Amir Vadai, Saeed Mahameed, Shani Michaeli, Ido Shamay
In-Reply-To: <1414725541.499.3.camel@edumazet-glaptop2.roam.corp.google.com>

On Fri, Oct 31, 2014 at 5:19 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-10-31 at 01:25 +0200, Or Gerlitz wrote:
>> On Thu, Oct 30, 2014 at 9:00 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Thu, 2014-10-30 at 18:06 +0200, Or Gerlitz wrote:
>> >> Remove the code which goes through napi_gro_frags() on the RX path,
>> >> use only napi_gro_receive().
>>
>> > Hmpff... napi_gro_frags() should be faster.
>> > Have you benchmarked this ?
>>
>>
>> yep we did, napi_gro_frags() was somehow better for single stream. Do
>> you think we need to do it the other way around, e.g converge to use
>> napi_gro_frags()?

> napi_gro_frags() is faster because the napi->skb is reused fast (not
> going through kfree_skb()/alloc_skb() for every fragment)

I see. Is this a strong vote to convert the code to use napi_gro_frags
on its usual track?

Or.

^ permalink raw reply

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Eric Dumazet @ 2014-10-31 14:04 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netdev
In-Reply-To: <20141031133948.GJ10069@breakpoint.cc>

On Fri, 2014-10-31 at 14:39 +0100, Florian Westphal wrote:

> It would only get enabled if the echoed timestamp (ie the timestamp we
> sent in the synack) indicates that ecn was enabled, i.e. the client or
> a middlebox would have to munge/modify it to set the 'ecn on' bit in the
> timestamp.
> 
> If that is too fragile in your opinion I will respin the patch to include
> the additional validation via dst.  We already need to fetch the dst
> object anyway to fetch certain route attributes not in the timestamp or
> cookie, so its only a matter of reorganizing code first to avoid two lookups.

Well, your changelog is so confusing, I have no idea what is your
intent.

I do not really understand why you need to change something.

Maybe this is because I have not yet had my coffee ;)

Thanks

^ permalink raw reply

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Florian Westphal @ 2014-10-31 14:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netdev
In-Reply-To: <1414764287.27538.1.camel@edumazet-glaptop2.roam.corp.google.com>

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-10-31 at 14:39 +0100, Florian Westphal wrote:
> 
> > It would only get enabled if the echoed timestamp (ie the timestamp we
> > sent in the synack) indicates that ecn was enabled, i.e. the client or
> > a middlebox would have to munge/modify it to set the 'ecn on' bit in the
> > timestamp.
> > 
> > If that is too fragile in your opinion I will respin the patch to include
> > the additional validation via dst.  We already need to fetch the dst
> > object anyway to fetch certain route attributes not in the timestamp or
> > cookie, so its only a matter of reorganizing code first to avoid two lookups.
> 
> Well, your changelog is so confusing, I have no idea what is your
> intent.

Sorry :-/

So if you have a per route ecn setting, and syncookies are used,
and tcp_ecn sysctl is 0:

1. we receive syn with ecn on and timestamps
2. we send cookie synack, with timestamp and ecn (route allowed it),
the lower bits of the timestamp have a "magic" bit set that allows
us to infer that ecn was negotiated successfully.
3. we drop the ack from the client, since timestamp decoding sees
"ecn is on according to timestamp, but the tcp_ecn sysctl is off".

So to fix this, step 3 either has to check the dst setting
in addition to the global sysctl, or to rely on the timestamp alone
that ecn was requested by the original client and allowed by our host
at the time synack timestamp was generated/sent.

I hope that explains the reason behind patch #1 up.

> I do not really understand why you need to change something.

Yes, unfortunately you're not the first person saying that my
changelogs are not precise enough sometimes, I hope to do
a better job next time around.

> Maybe this is because I have not yet took my coffee ;)

Oh, well, that could also explain it 8-)

^ permalink raw reply

* Re: [PATCH net-next 5/8] net/mlx4_en: Remove redundant code from RX/GRO path
From: Eric Dumazet @ 2014-10-31 15:46 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Or Gerlitz, David S. Miller, Linux Netdev List, Matan Barak,
	Amir Vadai, Saeed Mahameed, Shani Michaeli, Ido Shamay
In-Reply-To: <CAJ3xEMiDnv9=nvvJ1m7_taoSncdmgv4GJVR8DiD5t5GCsFig1A@mail.gmail.com>

On Fri, 2014-10-31 at 16:00 +0200, Or Gerlitz wrote:
> On Fri, Oct 31, 2014 at 5:19 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Fri, 2014-10-31 at 01:25 +0200, Or Gerlitz wrote:
> >> On Thu, Oct 30, 2014 at 9:00 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> > On Thu, 2014-10-30 at 18:06 +0200, Or Gerlitz wrote:
> >> >> Remove the code which goes through napi_gro_frags() on the RX path,
> >> >> use only napi_gro_receive().
> >>
> >> > Hmpff... napi_gro_frags() should be faster.
> >> > Have you benchmarked this ?
> >>
> >>
> >> yep we did, napi_gro_frags() was somehow better for single stream. Do
> >> you think we need to do it the other way around, e.g converge to use
> >> napi_gro_frags()?
> 
> > napi_gro_frags() is faster because the napi->skb is reused fast (not
> > going through kfree_skb()/alloc_skb() for every fragment)
> 
> I see. Is this a strong vote to convert the code to use napi_gro_frags
> on it's usual track?

I don't know yet. In some cases, actually slowing down the rx path can
help by building bigger GRO packets. But instead of inserting delays,
we can simply force napi to be run another time, with a nanosec based
timer.

I've tested this kind of heuristic :

       /* If some packets are waiting in GRO engine and timeout is not expired,
        * reschedule a NAPI poll. We allow servicing other softirqs
        * before repoll, we do not rearm CQ.
        */
       if (rx_nsecs && napi->gro_list && !need_resched()) {
               u64 now = local_clock();
               unsigned long flags;

               /* If we got packets in this round, restart timeout */
               if (done)
                       cq->tstart = now;
               else if (now - cq->tstart >= (u64)rx_nsecs)
                       goto complete;

               /* Since we might need one skb very soon, build it now */
               napi_get_frags(napi);

               local_irq_save(flags);
               list_del(&napi->poll_list);
               __napi_schedule_irqoff(napi);
               local_irq_restore(flags);

        } else {
complete:
                napi_complete(napi);
                mlx4_en_arm_cq(priv, cq);
        }
	return done;

^ permalink raw reply

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Eric Dumazet @ 2014-10-31 15:47 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netdev
In-Reply-To: <20141031141503.GL10069@breakpoint.cc>

On Fri, 2014-10-31 at 15:15 +0100, Florian Westphal wrote:

> So if you have a per route ecn setting, and syncookies are used,
> and tcp_ecn sysctl is 0:

This part I do not understand.

Why should tcp_ecn be 0 here, and not 2 (default value) ?

> 
> 1. we receive syn with ecn on and timestamps
> 2. we send cookie synack, with timestamp and ecn (route allowed it),
> the lower bits of the timestamp have a "magic" bit set that allows
> us to infer that ecn was negotiated successfully.
> 3. we drop the ack from the client, since timestamp decoding sees
> "ecn is on according to timestamp, but the tcp_ecn sysctl is off".
> 
> So to fix this, step 3 either has to check the dst setting
> in addition to the global sysctl, or to rely on the timestamp alone
> that ecn was requested by the original client and allowed by our host
> at the time synack timestamp was generated/sent.
> 
> I hope that explains the reason behind patch #1 up.

^ permalink raw reply

* [PATCH net-next] ethernet: mvneta: Use PHY status standard message
From: Ezequiel Garcia @ 2014-10-31 15:57 UTC (permalink / raw)
  To: Thomas Petazzoni, Gregory Clement, David Miller
  Cc: Nadav Haklai, Tawfik Bayouk, Lior Amsalem, netdev,
	Ezequiel Garcia

Use phy_print_status() to report a change in the PHY status.
The current message is not verbose enough, so this commit improves
it by using the generic status message.

After this change, the kernel reports PHY status down and up events as:

mvneta f1070000.ethernet eth0: Link is Down
mvneta f1070000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx

Signed-off-by: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvneta.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index ade067d..ccc3ce2 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2558,11 +2558,10 @@ static void mvneta_adjust_link(struct net_device *ndev)
 				MVNETA_GMAC_FORCE_LINK_DOWN);
 			mvreg_write(pp, MVNETA_GMAC_AUTONEG_CONFIG, val);
 			mvneta_port_up(pp);
-			netdev_info(pp->dev, "link up\n");
 		} else {
 			mvneta_port_down(pp);
-			netdev_info(pp->dev, "link down\n");
 		}
+		phy_print_status(phydev);
 	}
 }
 
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Florian Westphal @ 2014-10-31 16:00 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netdev
In-Reply-To: <1414770460.27538.9.camel@edumazet-glaptop2.roam.corp.google.com>

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2014-10-31 at 15:15 +0100, Florian Westphal wrote:
> 
> > So if you have a per route ecn setting, and syncookies are used,
> > and tcp_ecn sysctl is 0:
> 
> This part I do not understand.
> 
> Why should tcp_ecn be 0 here, and not 2 (default value) ?

Because admin might have changed it.
There is no problem if tcp_ecn sysctl is nonzero (1 or 2).

This problem will only manifest itself iff tcp_ecn sysctl was set to 0,
and the remote peer requests ecn and a route specific setting enabled
ecn for the source network and syncookies are used.

Current timestamp cookie validation will think "client is lying about
ecn in the timestamp as sysctl is off", since it does not consider a
per-route ecn knob.

^ permalink raw reply

* Re: [PATCH] VNIC: Adding support for Cavium ThunderX network controller
From: Robert Richter @ 2014-10-31 16:17 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Robert Richter, netdev, linux-kernel, Stefan Assmann,
	Sunil Goutham, David S. Miller, linux-arm-kernel
In-Reply-To: <20141030194513.089d27ec@urahara>

On 30.10.14 19:45:13, Stephen Hemminger wrote:
> On Thu, 30 Oct 2014 17:54:34 +0100
> Robert Richter <rric@kernel.org> wrote:
> 
> > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
> > index 1fa99a301817..80bd3336691e 100644
> > --- a/include/linux/pci_ids.h
> > +++ b/include/linux/pci_ids.h
> > @@ -2324,6 +2324,8 @@
> >  #define PCI_DEVICE_ID_ALTIMA_AC9100	0x03ea
> >  #define PCI_DEVICE_ID_ALTIMA_AC1003	0x03eb
> >  
> > +#define PCI_VENDOR_ID_CAVIUM		0x177d
> 
> I don't think PCI folks want this updated with every id anymore.

This is just the vendor id, the device id is part of the driver.

Since there will be multiple drivers I put the vendor id here.

-Robert

^ permalink raw reply

* Re: [PATCH net-next] bridge: make proxy arp configurable
From: David Miller @ 2014-10-31 16:21 UTC (permalink / raw)
  To: shemming; +Cc: kyeyoonp, netdev
In-Reply-To: <20141030200942.5a531e34@urahara>

From: Stephen Hemminger <shemming@brocade.com>
Date: Thu, 30 Oct 2014 20:09:42 -0700

> @@ -60,3 +60,19 @@ config BRIDGE_VLAN_FILTERING
>  	  Say N to exclude this support and reduce the binary size.
>  
>  	  If unsure, say Y.
> +
> +config BRIDGE_ARP_PROXY
> +	bool "ARP proxying"
> +	depends on BRIDGE
> +	depends on INET
> +	default y
> +	---help---
> +	  If you say Y here, then the Ethernet bridge to keep track of
> +	  the hardware address to IP address mapping.
> +
> +	  It is most useful when used as a wireless AP.
> +
> +	  Say N to exclude this support and reduce the binary size.
> +
> +	  If unsure, say Y.
> +

Please do not ever add empty lines at the end of files, GIT warns
about this when I try to apply your patch.

^ permalink raw reply

* Re: [PATCH] ipv4: avoid divide 0 error in tcp_incr_quickack
From: Alexei Starovoitov @ 2014-10-31 16:24 UTC (permalink / raw)
  To: Chen Weilong, Eric Dumazet, netdev@vger.kernel.org
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, linux-kernel@vger.kernel.org
In-Reply-To: <1414767047-8972-1-git-send-email-chenweilong@huawei.com>

cc-ing netdev

On Fri, Oct 31, 2014 at 7:50 AM, Chen Weilong <chenweilong@huawei.com> wrote:
> From: Weilong Chen <chenweilong@huawei.com>
>
> We got a problem like this:
>  [ffff8801c1a05570] machine_kexec at ffffffff81025039
>  [ffff8801c1a055d0] crash_kexec at ffffffff8109b253
>  [ffff8801c1a056a0] oops_end at ffffffff81442aed
>  [ffff8801c1a056d0] die at ffffffff81005603
>  [ffff8801c1a05700] do_trap at ffffffff81442448
>  [ffff8801c1a05760] do_divide_error at ffffffff81002c10
>  [ffff8801c1a05888] tcp_send_dupack at ffffffff81385e44
>  [ffff8801c1a058c8] tcp_validate_incoming at ffffffff813886b5
>  [ffff8801c1a05908] tcp_rcv_state_process at ffffffff8138d0b7
>  [ffff8801c1a05958] tcp_child_process at ffffffff81397255
>  [ffff8801c1a05988] tcp_v4_do_rcv at ffffffff81395a70
>  [ffff8801c1a059d8] tcp_v4_rcv at ffffffff81396fc8
>  [ffff8801c1a05a48] ip_local_deliver_finish at ffffffff813746e9
>  [ffff8801c1a05a78] ip_local_deliver at ffffffff81374a20
>  [ffff8801c1a05aa8] ip_rcv_finish at ffffffff81374389
>  [ffff8801c1a05ad8] ip_rcv at ffffffff81374c78
> There was a wrong ack packet coming during TCP handshake. The socket's state
> was TCP_SYN_RECV, its rcv_mss was not initialized yet. So
> tcp_send_dupack -> tcp_enter_quickack_mode got a divide 0 error.
> This patch adds a state check before tcp_enter_quickack_mode.

ouch. Is it remote exploitable?

> Signed-off-by: Weilong Chen <chenweilong@huawei.com>
> ---
>  net/ipv4/tcp_input.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 4e4617e..9eb56dc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3986,7 +3986,8 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
>         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
>             before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
>                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
> -               tcp_enter_quickack_mode(sk);
> +               if (sk->sk_state != TCP_SYN_RECV)
> +                       tcp_enter_quickack_mode(sk);
>
>                 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
>                         u32 end_seq = TCP_SKB_CB(skb)->end_seq;
> --
> 1.7.12
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* [PATCH v2] stmmac: pci: set default of the filter bins
From: Andy Shevchenko @ 2014-10-31 16:28 UTC (permalink / raw)
  To: Giuseppe Cavallaro, netdev, Kweh Hock Leong, David S . Miller,
	Vince Bridgers
  Cc: Andy Shevchenko, stable

The commit 3b57de958e2a brought the support for a different amount of the
filter bins, but didn't update the PCI driver accordingly. This patch appends
the default values when the device is enumerated via PCI bus.

Fixes: 3b57de958e2a (net: stmmac: Support devicetree configs for mcast and ucast filter entries)
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: stable@vger.kernel.org
---
Since v1:
- fix ugly style
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 655a23b..e17a970 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -33,6 +33,7 @@ static struct stmmac_dma_cfg dma_cfg;
 static void stmmac_default_data(void)
 {
 	memset(&plat_dat, 0, sizeof(struct plat_stmmacenet_data));
+
 	plat_dat.bus_id = 1;
 	plat_dat.phy_addr = 0;
 	plat_dat.interface = PHY_INTERFACE_MODE_GMII;
@@ -47,6 +48,12 @@ static void stmmac_default_data(void)
 	dma_cfg.pbl = 32;
 	dma_cfg.burst_len = DMA_AXI_BLEN_256;
 	plat_dat.dma_cfg = &dma_cfg;
+
+	/* Set default value for multicast hash bins */
+	plat_dat.multicast_filter_bins = HASH_TABLE_SIZE;
+
+	/* Set default value for unicast filter entries */
+	plat_dat.unicast_filter_entries = 1;
 }
 
 /**
-- 
2.1.1

^ permalink raw reply related

* Re: [PATCH 0/6] netfilter/ipvs fixes for net
From: David Miller @ 2014-10-31 16:30 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 31 Oct 2014 13:18:26 +0100

> The following patchset contains fixes for netfilter/ipvs. This round of
> fixes is larger than usual at this stage, specifically because of the
> nf_tables bridge reject fixes that I would like to see in 3.18. The
> patches are:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks Pablo.

^ permalink raw reply

* Re: Mistake in commit 0d961b3b52f566f823070ce2366511a7f64b928c breaks cpsw non dual_emac mode.
From: Lennart Sorensen @ 2014-10-31 17:10 UTC (permalink / raw)
  To: Heiko Schocher; +Cc: David Miller, linux-kernel, mugunthanvnm, netdev
In-Reply-To: <545330EA.60303@denx.de>

On Fri, Oct 31, 2014 at 07:49:14AM +0100, Heiko Schocher wrote:
> Seems I missed your original patch ... looked in it here:
> 
> https://lkml.org/lkml/2014/10/28/837
> 
> and I think you are correct, thanks for this fix. You can add my
> Acked-by: Heiko Schocher <hs@denx.de>
> if you post a corrected v2, as David suggested.

Will do.  I noticed a wrong word in one of the messages too so good
thing it gets fixed before getting committed.

-- 
Len Sorensen

^ permalink raw reply

* Re: [PATCH] VNIC: Adding support for Cavium ThunderX network controller
From: Sunil Kovvuri @ 2014-10-31 17:14 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Robert Richter, David S. Miller, Sunil Goutham, Robert Richter,
	Stefan Assmann, LKML, LAKML, netdev
In-Reply-To: <20141030195458.2958d88a@urahara>

On Fri, Oct 31, 2014 at 8:24 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Thu, 30 Oct 2014 17:54:34 +0100
> Robert Richter <rric@kernel.org> wrote:
>
>> +#ifdef       VNIC_RSS_SUPPORT
>> +static int rss_config = RSS_IP_HASH_ENA | RSS_TCP_HASH_ENA | RSS_UDP_HASH_ENA;
>> +module_param(rss_config, int, S_IRUGO);
>> +MODULE_PARM_DESC(rss_config,
>> +              "RSS hash config [bits 8:0] (Bit0:L2 extended, 1:IP, 2:TCP, 3:TCP SYN, 4:UDP, 5:L4 extended, 6:ROCE 7:L3 bi-directional, 8:L4 bi-directional)");
>> +#endif
>
> This should be managed via ethtool ETHTOOL_GRXFH rather than a module parameter.
Thanks, I will add setting hash options via ETHTOOL_SRXFH as well.
The idea here is to have a choice of hash at module load time (through
module params), and if it needs to be changed at runtime, then
via ethtool.

Sunil.

^ permalink raw reply

* drivers: net: cpsw: Fix broken loop condition in switch mode
From: Lennart Sorensen @ 2014-10-31 17:28 UTC (permalink / raw)
  To: linux-kernel
  Cc: Heiko Schocher, Len Sorensen, Mugunthan V N, David S. Miller,
	netdev

0d961b3b52f566f823070ce2366511a7f64b928c (drivers: net: cpsw: fix buggy
loop condition) accidentally fixed a loop comparison in too many places
while fixing a real bug.

It was correct to fix the dual_emac mode section since there 'i' is used
as an index into priv->slaves which is a 0 based array.

However the other two changes (which are only used in switch mode)
are wrong since there 'i' is actually the ALE port number, and port 0
is the host port, while port 1 and up are the slave ports.

Putting the loop condition back in the switch mode section fixes it.

A comment has been added to point out the intent clearly to avoid future
confusion.  Also a comment is fixed that said the opposite of what was
actually happening.

Signed-off-by: Len Sorensen <lsorense@csclub.uwaterloo.ca>
Acked-by: Heiko Schocher <hs@denx.de>

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 952e1e4..4683196 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -591,8 +591,8 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 		if (enable) {
 			unsigned long timeout = jiffies + HZ;
 
-			/* Disable Learn for all ports */
-			for (i = 0; i < priv->data.slaves; i++) {
+			/* Disable Learn for all ports (host is port 0 and slaves are port 1 and up */
+			for (i = 0; i <= priv->data.slaves; i++) {
 				cpsw_ale_control_set(ale, i,
 						     ALE_PORT_NOLEARN, 1);
 				cpsw_ale_control_set(ale, i,
@@ -616,11 +616,11 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 			cpsw_ale_control_set(ale, 0, ALE_P0_UNI_FLOOD, 1);
 			dev_dbg(&ndev->dev, "promiscuity enabled\n");
 		} else {
-			/* Flood All Unicast Packets to Host port */
+			/* Don't Flood All Unicast Packets to Host port */
 			cpsw_ale_control_set(ale, 0, ALE_P0_UNI_FLOOD, 0);
 
-			/* Enable Learn for all ports */
-			for (i = 0; i < priv->data.slaves; i++) {
+			/* Enable Learn for all ports (host is port 0 and slaves are port 1 and up */
+			for (i = 0; i <= priv->data.slaves; i++) {
 				cpsw_ale_control_set(ale, i,
 						     ALE_PORT_NOLEARN, 0);
 				cpsw_ale_control_set(ale, i,

-- 
Len Sorensen

^ permalink raw reply related

* drivers: net: cpsw: Support ALLMULTI and fix IFF_PROMISC in switch mode
From: Lennart Sorensen @ 2014-10-31 17:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: Len Sorensen, Mugunthan V N, David S. Miller, netdev

The cpsw driver did not support the IFF_ALLMULTI flag which makes dynamic
multicast routing not work.  Related to this, when enabling IFF_PROMISC
in switch mode, all registered multicast addresses are flushed, resulting
in only broadcast and unicast traffic being received.

A new cpsw_ale_set_allmulti function now scans through the ALE entry
table and adds/removes the host port from the unregistered multicast
port mask of each vlan entry depending on the state of IFF_ALLMULTI.
In promiscuous mode, cpsw_ale_set_allmulti is used to force reception
of all multicast traffic in addition to the unicast and broadcast traffic.

With this change dynamic multicast and promiscuous mode both work in
switch mode.

Signed-off-by: Len Sorensen <lsorense@csclub.uwaterloo.ca>

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 952e1e4..96a61d1 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -638,12 +638,16 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 	if (ndev->flags & IFF_PROMISC) {
 		/* Enable promiscuous mode */
 		cpsw_set_promiscious(ndev, true);
+		cpsw_ale_set_allmulti(priv->ale, IFF_ALLMULTI);
 		return;
 	} else {
 		/* Disable promiscuous mode */
 		cpsw_set_promiscious(ndev, false);
 	}
 
+	/* Restore allmulti on vlans if necessary */
+	cpsw_ale_set_allmulti(priv->ale, priv->ndev->flags & IFF_ALLMULTI);
+
 	/* Clear all mcast from ALE */
 	cpsw_ale_flush_multicast(priv->ale, ALE_ALL_PORTS << priv->host_port);
 
@@ -1149,6 +1153,7 @@ static inline void cpsw_add_default_vlan(struct cpsw_priv *priv)
 	const int port = priv->host_port;
 	u32 reg;
 	int i;
+	int unreg_mcast_mask;
 
 	reg = (priv->version == CPSW_VERSION_1) ? CPSW1_PORT_VLAN :
 	       CPSW2_PORT_VLAN;
@@ -1158,9 +1163,14 @@ static inline void cpsw_add_default_vlan(struct cpsw_priv *priv)
 	for (i = 0; i < priv->data.slaves; i++)
 		slave_write(priv->slaves + i, vlan, reg);
 
+	if (priv->ndev->flags & IFF_ALLMULTI)
+		unreg_mcast_mask = ALE_ALL_PORTS;
+	else
+		unreg_mcast_mask = ALE_PORT_1 | ALE_PORT_2;
+
 	cpsw_ale_add_vlan(priv->ale, vlan, ALE_ALL_PORTS << port,
 			  ALE_ALL_PORTS << port, ALE_ALL_PORTS << port,
-			  (ALE_PORT_1 | ALE_PORT_2) << port);
+			  unreg_mcast_mask << port);
 }
 
 static void cpsw_init_host_port(struct cpsw_priv *priv)
@@ -1620,11 +1630,17 @@ static inline int cpsw_add_vlan_ale_entry(struct cpsw_priv *priv,
 				unsigned short vid)
 {
 	int ret;
+	int unreg_mcast_mask;
+
+	if (priv->ndev->flags & IFF_ALLMULTI)
+		unreg_mcast_mask = ALE_ALL_PORTS;
+	else
+		unreg_mcast_mask = ALE_PORT_1 | ALE_PORT_2;
 
 	ret = cpsw_ale_add_vlan(priv->ale, vid,
 				ALE_ALL_PORTS << priv->host_port,
 				0, ALE_ALL_PORTS << priv->host_port,
-				(ALE_PORT_1 | ALE_PORT_2) << priv->host_port);
+				unreg_mcast_mask << priv->host_port);
 	if (ret != 0)
 		return ret;
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index 0579b22..3ae8387 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -443,6 +443,35 @@ int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port_mask)
 	return 0;
 }
 
+void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti)
+{
+	u32 ale_entry[ALE_ENTRY_WORDS];
+	int type, idx;
+	int unreg_mcast = 0;
+
+	/* Only bother doing the work if the setting is actually changing */
+	if (ale->allmulti == allmulti)
+		return;
+
+	/* Remember the new setting to check against next time */
+	ale->allmulti = allmulti;
+
+	for (idx = 0; idx < ale->params.ale_entries; idx++) {
+		cpsw_ale_read(ale, idx, ale_entry);
+		type = cpsw_ale_get_entry_type(ale_entry);
+		if (type != ALE_TYPE_VLAN)
+			continue;
+
+		unreg_mcast = cpsw_ale_get_vlan_unreg_mcast(ale_entry);
+		if (allmulti)
+			unreg_mcast |= 1;
+		else
+			unreg_mcast &= ~1;
+		cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast);
+		cpsw_ale_write(ale, idx, ale_entry);
+	}
+}
+
 struct ale_control_info {
 	const char	*name;
 	int		offset, port_offset;
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h
index 31cf43c..c0d4127 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -27,6 +27,7 @@ struct cpsw_ale {
 	struct cpsw_ale_params	params;
 	struct timer_list	timer;
 	unsigned long		ageout;
+	int			allmulti;
 };
 
 enum cpsw_ale_control {
@@ -103,6 +104,7 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask,
 int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag,
 			int reg_mcast, int unreg_mcast);
 int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port);
+void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti);
 
 int cpsw_ale_control_get(struct cpsw_ale *ale, int port, int control);
 int cpsw_ale_control_set(struct cpsw_ale *ale, int port,

-- 
Len Sorensen

^ permalink raw reply related

* Re: [PATCH] ipv4: avoid divide 0 error in tcp_incr_quickack
From: Eric Dumazet @ 2014-10-31 17:40 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Chen Weilong, netdev@vger.kernel.org, David S. Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, linux-kernel@vger.kernel.org
In-Reply-To: <CAADnVQLe+Xs22sbdy7=8KW2+LscFOiFWdKxSFFuPTWSPTHbSow@mail.gmail.com>

On Fri, 2014-10-31 at 09:24 -0700, Alexei Starovoitov wrote:
> cc-ing netdev
> 
> On Fri, Oct 31, 2014 at 7:50 AM, Chen Weilong <chenweilong@huawei.com> wrote:
> > From: Weilong Chen <chenweilong@huawei.com>
> >
> > We got a problem like this:
> >  [ffff8801c1a05570] machine_kexec at ffffffff81025039
> >  [ffff8801c1a055d0] crash_kexec at ffffffff8109b253
> >  [ffff8801c1a056a0] oops_end at ffffffff81442aed
> >  [ffff8801c1a056d0] die at ffffffff81005603
> >  [ffff8801c1a05700] do_trap at ffffffff81442448
> >  [ffff8801c1a05760] do_divide_error at ffffffff81002c10
> >  [ffff8801c1a05888] tcp_send_dupack at ffffffff81385e44
> >  [ffff8801c1a058c8] tcp_validate_incoming at ffffffff813886b5
> >  [ffff8801c1a05908] tcp_rcv_state_process at ffffffff8138d0b7
> >  [ffff8801c1a05958] tcp_child_process at ffffffff81397255
> >  [ffff8801c1a05988] tcp_v4_do_rcv at ffffffff81395a70
> >  [ffff8801c1a059d8] tcp_v4_rcv at ffffffff81396fc8
> >  [ffff8801c1a05a48] ip_local_deliver_finish at ffffffff813746e9
> >  [ffff8801c1a05a78] ip_local_deliver at ffffffff81374a20
> >  [ffff8801c1a05aa8] ip_rcv_finish at ffffffff81374389
> >  [ffff8801c1a05ad8] ip_rcv at ffffffff81374c78
> > There was a wrong ack packet coming during TCP handshake. The socket's state
> > was TCP_SYN_RECV, its rcv_mss was not initialized yet. So
> > tcp_send_dupack -> tcp_enter_quickack_mode got a divide 0 error.
> > This patch adds a state check before tcp_enter_quickack_mode.
> 
> ouch. Is it remote exploitable?

Seems to be SYN crossing. Quite hard, but possible.

^ permalink raw reply

* Re: DMA-API warning from sunhme - unchecked dma_map_single error
From: David Miller @ 2014-10-31 17:43 UTC (permalink / raw)
  To: mroos; +Cc: netdev, sparclinux
In-Reply-To: <alpine.SOC.1.00.1311291036050.3807@math.ut.ee>

From: Meelis Roos <mroos@linux.ee>
Date: Fri, 29 Nov 2013 10:40:40 +0200 (EET)

> It seems to be correct warning - dma_map_single is used unchecked in 
> sunhme.c. I can try fixing it - the error handling will be the only 
> problem. Is it considered worthwhile?

Can you test this patch?

====================
sunhme: Add DMA mapping error checks.

Reported-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/sunhme.c | 62 +++++++++++++++++++++++++++++++++++----
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index 72c8525..9c01480 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -1262,6 +1262,7 @@ static void happy_meal_init_rings(struct happy_meal *hp)
 	HMD(("init rxring, "));
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		struct sk_buff *skb;
+		u32 mapping;
 
 		skb = happy_meal_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
 		if (!skb) {
@@ -1272,10 +1273,16 @@ static void happy_meal_init_rings(struct happy_meal *hp)
 
 		/* Because we reserve afterwards. */
 		skb_put(skb, (ETH_FRAME_LEN + RX_OFFSET + 4));
+		mapping = dma_map_single(hp->dma_dev, skb->data, RX_BUF_ALLOC_SIZE,
+					 DMA_FROM_DEVICE);
+		if (dma_mapping_error(hp->dma_dev, mapping)) {
+			dev_kfree_skb_any(skb);
+			hme_write_rxd(hp, &hb->happy_meal_rxd[i], 0, 0);
+			continue;
+		}
 		hme_write_rxd(hp, &hb->happy_meal_rxd[i],
 			      (RXFLAG_OWN | ((RX_BUF_ALLOC_SIZE - RX_OFFSET) << 16)),
-			      dma_map_single(hp->dma_dev, skb->data, RX_BUF_ALLOC_SIZE,
-					     DMA_FROM_DEVICE));
+			      mapping);
 		skb_reserve(skb, RX_OFFSET);
 	}
 
@@ -2020,6 +2027,7 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev)
 		skb = hp->rx_skbs[elem];
 		if (len > RX_COPY_THRESHOLD) {
 			struct sk_buff *new_skb;
+			u32 mapping;
 
 			/* Now refill the entry, if we can. */
 			new_skb = happy_meal_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
@@ -2027,13 +2035,21 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev)
 				drops++;
 				goto drop_it;
 			}
+			skb_put(new_skb, (ETH_FRAME_LEN + RX_OFFSET + 4));
+			mapping = dma_map_single(hp->dma_dev, new_skb->data,
+						 RX_BUF_ALLOC_SIZE,
+						 DMA_FROM_DEVICE);
+			if (unlikely(dma_mapping_error(hp->dma_dev, mapping))) {
+				dev_kfree_skb_any(new_skb);
+				drops++;
+				goto drop_it;
+			}
+
 			dma_unmap_single(hp->dma_dev, dma_addr, RX_BUF_ALLOC_SIZE, DMA_FROM_DEVICE);
 			hp->rx_skbs[elem] = new_skb;
-			skb_put(new_skb, (ETH_FRAME_LEN + RX_OFFSET + 4));
 			hme_write_rxd(hp, this,
 				      (RXFLAG_OWN|((RX_BUF_ALLOC_SIZE-RX_OFFSET)<<16)),
-				      dma_map_single(hp->dma_dev, new_skb->data, RX_BUF_ALLOC_SIZE,
-						     DMA_FROM_DEVICE));
+				      mapping);
 			skb_reserve(new_skb, RX_OFFSET);
 
 			/* Trim the original skb for the netif. */
@@ -2248,6 +2264,25 @@ static void happy_meal_tx_timeout(struct net_device *dev)
 	netif_wake_queue(dev);
 }
 
+static void unmap_partial_tx_skb(struct happy_meal *hp, u32 first_mapping,
+				 u32 first_len, u32 first_entry, u32 entry)
+{
+	struct happy_meal_txd *txbase = &hp->happy_block->happy_meal_txd[0];
+
+	dma_unmap_single(hp->dma_dev, first_mapping, first_len, DMA_TO_DEVICE);
+
+	first_entry = NEXT_TX(first_entry);
+	while (first_entry != entry) {
+		struct happy_meal_txd *this = &txbase[first_entry];
+		u32 addr, len;
+
+		addr = hme_read_desc32(hp, &this->tx_addr);
+		len = hme_read_desc32(hp, &this->tx_flags);
+		len &= TXFLAG_SIZE;
+		dma_unmap_page(hp->dma_dev, addr, len, DMA_TO_DEVICE);
+	}
+}
+
 static netdev_tx_t happy_meal_start_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
@@ -2284,6 +2319,8 @@ static netdev_tx_t happy_meal_start_xmit(struct sk_buff *skb,
 
 		len = skb->len;
 		mapping = dma_map_single(hp->dma_dev, skb->data, len, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(hp->dma_dev, mapping)))
+			goto out_dma_error;
 		tx_flags |= (TXFLAG_SOP | TXFLAG_EOP);
 		hme_write_txd(hp, &hp->happy_block->happy_meal_txd[entry],
 			      (tx_flags | (len & TXFLAG_SIZE)),
@@ -2299,6 +2336,8 @@ static netdev_tx_t happy_meal_start_xmit(struct sk_buff *skb,
 		first_len = skb_headlen(skb);
 		first_mapping = dma_map_single(hp->dma_dev, skb->data, first_len,
 					       DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(hp->dma_dev, first_mapping)))
+			goto out_dma_error;
 		entry = NEXT_TX(entry);
 
 		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
@@ -2308,6 +2347,11 @@ static netdev_tx_t happy_meal_start_xmit(struct sk_buff *skb,
 			len = skb_frag_size(this_frag);
 			mapping = skb_frag_dma_map(hp->dma_dev, this_frag,
 						   0, len, DMA_TO_DEVICE);
+			if (unlikely(dma_mapping_error(hp->dma_dev, mapping))) {
+				unmap_partial_tx_skb(hp, first_mapping, first_len,
+						     first_entry, entry);
+				goto out_dma_error;
+			}
 			this_txflags = tx_flags;
 			if (frag == skb_shinfo(skb)->nr_frags - 1)
 				this_txflags |= TXFLAG_EOP;
@@ -2333,6 +2377,14 @@ static netdev_tx_t happy_meal_start_xmit(struct sk_buff *skb,
 
 	tx_add_log(hp, TXLOG_ACTION_TXMIT, 0);
 	return NETDEV_TX_OK;
+
+out_dma_error:
+	hp->tx_skbs[hp->tx_new] = NULL;
+	spin_unlock_irq(&hp->happy_lock);
+
+	dev_kfree_skb_any(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
 }
 
 static struct net_device_stats *happy_meal_get_stats(struct net_device *dev)
-- 
1.9.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox