Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 2/8] net/sched: cls_flower: Try to offload only if skip_hw flag isn't set
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
	Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>

Check skip_hw flag isn't set before calling
fl_hw_{replace/destroy}_filter and fl_hw_update_stats functions.

Replace the call to tc_should_offload with tc_can_offload.
tc_can_offload only checks if the device supports offloading, the check for
skip_hw flag is done earlier in the flow.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/cls_flower.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e8dd09a..5e70f65 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -207,7 +207,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 	struct tc_cls_flower_offload offload = {0};
 	struct tc_to_netdev tc;
 
-	if (!tc_should_offload(dev, tp, 0))
+	if (!tc_can_offload(dev, tp))
 		return;
 
 	offload.command = TC_CLSFLOWER_DESTROY;
@@ -231,7 +231,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	struct tc_to_netdev tc;
 	int err;
 
-	if (!tc_should_offload(dev, tp, flags))
+	if (!tc_can_offload(dev, tp))
 		return tc_skip_sw(flags) ? -EINVAL : 0;
 
 	offload.command = TC_CLSFLOWER_REPLACE;
@@ -259,7 +259,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	struct tc_cls_flower_offload offload = {0};
 	struct tc_to_netdev tc;
 
-	if (!tc_should_offload(dev, tp, 0))
+	if (!tc_can_offload(dev, tp))
 		return;
 
 	offload.command = TC_CLSFLOWER_STATS;
@@ -275,7 +275,8 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
 	list_del_rcu(&f->list);
-	fl_hw_destroy_filter(tp, (unsigned long)f);
+	if (!tc_skip_hw(f->flags))
+		fl_hw_destroy_filter(tp, (unsigned long)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 }
@@ -743,20 +744,23 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 			goto errout;
 	}
 
-	err = fl_hw_replace_filter(tp,
-				   &head->dissector,
-				   &mask.key,
-				   &fnew->key,
-				   &fnew->exts,
-				   (unsigned long)fnew,
-				   fnew->flags);
-	if (err)
-		goto errout;
+	if (!tc_skip_hw(fnew->flags)) {
+		err = fl_hw_replace_filter(tp,
+					   &head->dissector,
+					   &mask.key,
+					   &fnew->key,
+					   &fnew->exts,
+					   (unsigned long)fnew,
+					   fnew->flags);
+		if (err)
+			goto errout;
+	}
 
 	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
-		fl_hw_destroy_filter(tp, (unsigned long)fold);
+		if (!tc_skip_hw(fold->flags))
+			fl_hw_destroy_filter(tp, (unsigned long)fold);
 	}
 
 	*arg = (unsigned long) fnew;
@@ -879,7 +883,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 			goto nla_put_failure;
 	}
 
-	fl_hw_update_stats(tp, f);
+	if (!tc_skip_hw(f->flags))
+		fl_hw_update_stats(tp, f);
 
 	if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
 			    mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 4/8] net/sched: act_mirred: Add new tc_action_ops get_dev()
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
	Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>

Adding support to a new tc_action_ops.
get_dev is a general option which allows to get the underline
device when trying to offload a tc rule.

In case of mirred action the returned device is the mirred (egress)
device.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/act_api.h  |  2 ++
 net/sched/act_mirred.c | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index d8eae87..9dddf77 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -119,6 +119,8 @@ struct tc_action_ops {
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int, const struct tc_action_ops *);
 	void	(*stats_update)(struct tc_action *, u64, u32, u64);
+	int	(*get_dev)(const struct tc_action *a, struct net *net,
+			   struct net_device **mirred_dev);
 };
 
 struct tc_action_net {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1af7baa..bb09ba3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -315,6 +315,17 @@ static int mirred_device_event(struct notifier_block *unused,
 	.notifier_call = mirred_device_event,
 };
 
+static int tcf_mirred_device(const struct tc_action *a, struct net *net,
+			     struct net_device **mirred_dev)
+{
+	int ifindex = tcf_mirred_ifindex(a);
+
+	*mirred_dev = __dev_get_by_index(net, ifindex);
+	if (!mirred_dev)
+		return -EINVAL;
+	return 0;
+}
+
 static struct tc_action_ops act_mirred_ops = {
 	.kind		=	"mirred",
 	.type		=	TCA_ACT_MIRRED,
@@ -327,6 +338,7 @@ static int mirred_device_event(struct notifier_block *unused,
 	.walk		=	tcf_mirred_walker,
 	.lookup		=	tcf_mirred_search,
 	.size		=	sizeof(struct tcf_mirred),
+	.get_dev	=	tcf_mirred_device,
 };
 
 static __net_init int mirred_init_net(struct net *net)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 3/8] net/sched: cls_flower: Provide a filter to replace/destroy hardware filter functions
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
	Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>

Instead of providing many arguments to fl_hw_{replace/destroy}_filter
functions, just provide cls_fl_filter struct that includes all the relevant
args.

This patches doesn't add any new functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/cls_flower.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5e70f65..13b349f 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -201,7 +201,7 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
-static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -211,7 +211,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 		return;
 
 	offload.command = TC_CLSFLOWER_DESTROY;
-	offload.cookie = cookie;
+	offload.cookie = (unsigned long)f;
 
 	tc.type = TC_SETUP_CLSFLOWER;
 	tc.cls_flower = &offload;
@@ -222,9 +222,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
 static int fl_hw_replace_filter(struct tcf_proto *tp,
 				struct flow_dissector *dissector,
 				struct fl_flow_key *mask,
-				struct fl_flow_key *key,
-				struct tcf_exts *actions,
-				unsigned long cookie, u32 flags)
+				struct cls_fl_filter *f)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
@@ -232,14 +230,14 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	int err;
 
 	if (!tc_can_offload(dev, tp))
-		return tc_skip_sw(flags) ? -EINVAL : 0;
+		return tc_skip_sw(f->flags) ? -EINVAL : 0;
 
 	offload.command = TC_CLSFLOWER_REPLACE;
-	offload.cookie = cookie;
+	offload.cookie = (unsigned long)f;
 	offload.dissector = dissector;
 	offload.mask = mask;
-	offload.key = key;
-	offload.exts = actions;
+	offload.key = &f->key;
+	offload.exts = &f->exts;
 
 	tc.type = TC_SETUP_CLSFLOWER;
 	tc.cls_flower = &offload;
@@ -247,7 +245,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
 					    &tc);
 
-	if (tc_skip_sw(flags))
+	if (tc_skip_sw(f->flags))
 		return err;
 
 	return 0;
@@ -276,7 +274,7 @@ static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
 	list_del_rcu(&f->list);
 	if (!tc_skip_hw(f->flags))
-		fl_hw_destroy_filter(tp, (unsigned long)f);
+		fl_hw_destroy_filter(tp, f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 }
@@ -748,10 +746,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		err = fl_hw_replace_filter(tp,
 					   &head->dissector,
 					   &mask.key,
-					   &fnew->key,
-					   &fnew->exts,
-					   (unsigned long)fnew,
-					   fnew->flags);
+					   fnew);
 		if (err)
 			goto errout;
 	}
@@ -760,7 +755,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
 		if (!tc_skip_hw(fold->flags))
-			fl_hw_destroy_filter(tp, (unsigned long)fold);
+			fl_hw_destroy_filter(tp, fold);
 	}
 
 	*arg = (unsigned long) fnew;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 1/8] net/sched: Add separate check for skip_hw flag
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
	Roi Dayan, Hadar Hen Zion
In-Reply-To: <1480516895-29545-1-git-send-email-hadarh@mellanox.com>

Creating a difference between two possible cases:
1. Not offloading tc rule since the user sets 'skip_hw' flag.
2. Not offloading tc rule since the device doesn't support offloading.

This patch doesn't add any new functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/pkt_cls.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 767b03a..45ad9aa 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -425,16 +425,14 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(const struct net_device *dev,
-				     const struct tcf_proto *tp, u32 flags)
+static inline bool tc_can_offload(const struct net_device *dev,
+				  const struct tcf_proto *tp)
 {
 	const struct Qdisc *sch = tp->q;
 	const struct Qdisc_class_ops *cops = sch->ops->cl_ops;
 
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
-	if (flags & TCA_CLS_FLAGS_SKIP_HW)
-		return false;
 	if (!dev->netdev_ops->ndo_setup_tc)
 		return false;
 	if (cops && cops->tcf_cl_offload)
@@ -443,6 +441,19 @@ static inline bool tc_should_offload(const struct net_device *dev,
 	return true;
 }
 
+static inline bool tc_skip_hw(u32 flags)
+{
+	return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false;
+}
+
+static inline bool tc_should_offload(const struct net_device *dev,
+				     const struct tcf_proto *tp, u32 flags)
+{
+	if (tc_skip_hw(flags))
+		return false;
+	return tc_can_offload(dev, tp);
+}
+
 static inline bool tc_skip_sw(u32 flags)
 {
 	return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 0/8] Offloading tc rules using underline Hardware device
From: Hadar Hen Zion @ 2016-11-30 14:41 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Jiri Pirko, Amir Vadai, Or Gerlitz,
	Roi Dayan, Hadar Hen Zion

This series adds flower classifier support in offloading tc rules when the
Software ingress device is different from the Hardware ingress device, 
such as when dealing with IP tunnels  

The first two patches are a small fixes to flower, checking the skip_hw flag
wasn't set before calling the Hardware offloading functions which will try to
offload the rule.

The next two patches are infrastructure patches, a preparation for the fourth
patch which is adding support in flower to offload rules when the ingress
device is not a Hardware device and therefore can't offload.
In this case ndo_setup_tc is called with the mirred (egress) device.

The last three patchs are adding mlx5e support to offload rules using the new
"egress_device" flag.

Thanks,
Hadar

Hadar Hen Zion (8):
  net/sched: Add separate check for skip_hw flag
  net/sched: cls_flower: Try to offload only if skip_hw flag isn't set
  net/sched: cls_flower: Provide a filter to replace/destroy hardware
    filter functions
  net/sched: act_mirred: Add new tc_action_ops get_dev()
  net/sched: cls_flower: Add offload support using egress Hardware
    device
  net/mlx5e: Bring back representor's ndos that were accidentally
    removed
  net/mlx5e: Save the represntor netdevice as part of the representor
  net/mlx5e: Support adding ingress tc rule when egress device flag is
    set

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 25 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  3 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 12 ++-
 include/linux/netdevice.h                          |  1 +
 include/net/act_api.h                              |  2 +
 include/net/pkt_cls.h                              | 21 +++++-
 net/sched/act_mirred.c                             | 12 +++
 net/sched/cls_api.c                                | 22 ++++++
 net/sched/cls_flower.c                             | 87 ++++++++++++----------
 10 files changed, 133 insertions(+), 54 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [net-next 1/1] samples: bpf: Refactor test_cgrp2_attach -- use getopt, and add mode
From: David Miller @ 2016-11-30 15:29 UTC (permalink / raw)
  To: sargun; +Cc: netdev, daniel, ast
In-Reply-To: <20161128225240.GA8761@ircssh.c.rugged-nimbus-611.internal>

From: Sargun Dhillon <sargun@sargun.me>
Date: Mon, 28 Nov 2016 14:52:42 -0800

> This patch modifies test_cgrp2_attach to use getopt so we can use standard
> command line parsing.
> 
> It also adds an option to run the program in detach only mode. This does
> not attach a new filter at the cgroup, but only runs the detach command.
> 
> Lastly, it changes the attach code to not detach and then attach. It relies
> on the 'hotswap' behaviour of CGroup BPF programs to be able to change
> in-place. If detach-then-attach behaviour needs to be tested, the example
> can be run in detach only mode prior to attachment.
> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] net: brocade: bna: use new api ethtool_{get|set}_link_ksettings
From: David Miller @ 2016-11-30 15:29 UTC (permalink / raw)
  To: tremyfr
  Cc: rasesh.mody, sudarsana.kalluru, Dept-GELinuxNICDev, netdev,
	linux-kernel
In-Reply-To: <1480373539-3257-1-git-send-email-tremyfr@gmail.com>

From: Philippe Reynes <tremyfr@gmail.com>
Date: Mon, 28 Nov 2016 23:52:19 +0100

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes <tremyfr@gmail.com>

Applied.

^ permalink raw reply

* RE: Aw: Re: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: David Laight @ 2016-11-30 15:28 UTC (permalink / raw)
  To: 'Eric Dumazet', Lino Sanfilippo
  Cc: David Miller, netdev, Tariq Toukan
In-Reply-To: <1480378759.18162.105.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet
> Sent: 29 November 2016 00:19
> On Mon, 2016-11-28 at 23:02 +0100, Lino Sanfilippo wrote:
> > Hi Eric,
> >
> > On 25.11.2016 20:19, Eric Dumazet wrote:
> > > On Fri, 2016-11-25 at 17:30 +0100, Lino Sanfilippo wrote:
> > >> Hi,
> > >>
> > >>
> > >> >
> > >> > The READ_ONCE() are documenting the fact that no lock is taken to fetch
> > >> > the stats, while another cpus might being changing them.
> > >> >
> > >> > I had no answer yet from https://patchwork.ozlabs.org/patch/698449/
> > >> >
> > >> > So I thought it was not needed to explain this in the changelog, given
> > >> > that it apparently is one of the few things that can block someone to
> > >> > understand one of my changes :/
> > >> >
> > >> > Apparently nobody really understands READ_ONCE() purpose, it is really a
> > >> > pity we have to explain this over and over.
> > >> >
> > >>
> > >> Even at the risk of showing once more a lack of understanding for
> > >> READ_ONCE():
> > >> Does not a READ_ONCE() have to e paired with some kind of
> > >> WRITE_ONCE()?
> > >
> > > You are right.
> > >
> > > Although in this case, the producers are using a lock, and do
> > >
> > > ring->packets++;
> > >
> > > We hopefully have compilers/cpus that do not put intermediate garbage in
> > > ring->packets while doing the increment.
> > >
> > > One problem with :
> > >
> > > WRITE_ONCE(ring->packets, ring->packets + 1);
> > >
> > > is that gcc no longer uses an INC instruction.
> >
> > I see. So we would have to do something like
> >
> > tmp = ring->packets;
> > tmp++;
> > WRITE_ONCE(ring->packets, tmp);
> 
> 
> Well, gcc will generate a code with more instructions than a mere
> 
> "inc  offset(%register)"

Are you sure??
Last I looked gcc seemed to convert 'foo++' to 'foo = foo + 1' before
generating any code.
It might then optimise that back to a memory increment, but that would
also happen if you'd coded the latter form.

> Which is kind of unfortunate, given it is the fast path.
> 
> Better add a comment, like :
> 
> /* We should use WRITE_ONCE() to pair with the READ_ONCE() found in xxxx()
>  * But gcc would generate non optimal code.
>  */

Actually while READ_ONCE() is generally useful - to get a snapshot of a changing value.

WRITE_ONCE() isn't a pairing - the compiler is highly unlikely to write a
location twice.
You might want an annotation to ensure is doesn't assume it can read the value
back (write through volatile pointer). But that has nothing to do with how readers behave.

	David


^ permalink raw reply

* Re: [PATCH net-next] bpf, xdp: allow to pass flags to dev_change_xdp_fd
From: David Miller @ 2016-11-30 15:27 UTC (permalink / raw)
  To: daniel; +Cc: alexei.starovoitov, netdev
In-Reply-To: <cd0871f11c476f8d7fee1cb356940a13cf7d4807.1480370617.git.daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 28 Nov 2016 23:16:54 +0100

> Add an IFLA_XDP_FLAGS attribute that can be passed for setting up
> XDP along with IFLA_XDP_FD, which eventually allows user space to
> implement typical add/replace/delete logic for programs. Right now,
> calling into dev_change_xdp_fd() will always replace previous programs.
> 
> When passed XDP_FLAGS_UPDATE_IF_NOEXIST, we can handle this more
> graceful when requested by returning -EBUSY in case we try to
> attach a new program, but we find that another one is already
> attached. This will be used by upcoming front-end for iproute2 as
> well.
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Applied, thanks Daniel.

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Andrew Lunn @ 2016-11-30 15:25 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: netdev@vger.kernel.org
In-Reply-To: <1480516241.3563.142.camel@infinera.com>

On Wed, Nov 30, 2016 at 02:30:43PM +0000, Joakim Tjernlund wrote:
> On Wed, 2016-11-30 at 14:52 +0100, Andrew Lunn wrote:
> > On Wed, Nov 30, 2016 at 08:50:34AM +0000, Joakim Tjernlund wrote:
> > > I am trying to wrap my head around these two "devices" and have a hard time telling them apart.
> > > We are looking att adding a faily large switch(over PCIe) to our board and from what I can tell
> > > switchdev is the new way to do it but DSA is still there. Is it possible to just list
> > > how they differ?
> > 
> > Hi Joakim
> 
> Hi Andrew, thanks for answering
> 
> > 
> > If the interface you use to send frames from the host to the switch is
> > PCIe, you probably want to use switchdev directly.
> 
> OK, we will have a few ethernet I/F's connected too but I these should be used
> as normal interfaces just interfacing a switch.

That does not make much sense.

Maybe time to backtrack a bit. The Linux concept for switch/router
chips is that they are just hardware accelerators for what Linux can
already do in software. Each port of the switch is just a normal Linux
interface.  ip link show will list each port. ip addr add can be used
to add an IP address to the interface.  You want to switch frames
between two ports: Create a linux bridge and put the interfaces into
it. Via switchdev you get a call into the hardware to accelerate
this. If the hardware cannot accelerate it, it is done in software as
normal.  Want to combine two ports into a trunk: Add a team interface
and make the port interfaces slaves of the team interface. Via
switchdev, you ask the hardware to accelerate this. If it cannot, it
is done in software.

So back your connecting a few host interfaces to the switch. This is
logically putting a cable between two interfaces on the same host. You
are making a loopback. Why do that? Sure it is possible, but it is an
odd architecture.

> And switchdev can do all this over PCIe instead? Can you have a
> switch tree in switchdev too?

Mellonex says so, but i don't think they have actually implemented it.

	 Andrew

^ permalink raw reply

* Re: [PATCH net-next v3 0/2] net: phy: broadcom: Support for PHY counters
From: David Miller @ 2016-11-30 15:22 UTC (permalink / raw)
  To: f.fainelli
  Cc: netdev, andrew, bcm-kernel-feedback-list, allan.nielsen,
	raju.lakkaraju
In-Reply-To: <20161129175718.20213-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 29 Nov 2016 09:57:16 -0800

> This patch series adds support for reading the Broadcom PHYs internal counters.
> 
> Changes in v3:
> 
> - fixed the allocation of priv->stats in bcm7xxx
> 
> Changes in v2:
> 
> - fixed modular build reported by kbuild
> 
> - constify array of stats

Series applied, thanks Florian.

^ permalink raw reply

* Re: [PATCH v2 net-next] sfc: separate out SFC4000 ("Falcon") support into new sfc-falcon driver
From: David Miller @ 2016-11-30 15:18 UTC (permalink / raw)
  To: ecree; +Cc: linux-net-drivers, bkenward, netdev
In-Reply-To: <624b8ce9-12dd-1b89-ae9f-8a092751bf12@solarflare.com>


Applied, but yes you should really rethink the Kconfig stuff.

I would strongly suggest you simply make them independent Kconfig
options with no attachment to eachother.  No depends, no implies,
nothing like that.

^ permalink raw reply

* Re: [PATCH] cpsw: ethtool: add support for nway reset
From: David Miller @ 2016-11-30 15:13 UTC (permalink / raw)
  To: yegorslists; +Cc: netdev, linux-omap, grygorii.strashko, mugunthanvnm
In-Reply-To: <CAGm1_ktxyxVxYdF0Aq-Nv2rVGjEccPsGRZSXHBOE9egybiV1Lg@mail.gmail.com>

From: Yegor Yefremov <yegorslists@googlemail.com>
Date: Wed, 30 Nov 2016 10:31:30 +0100

> Hi David,
> 
> On Wed, Nov 30, 2016 at 1:41 AM, David Miller <davem@davemloft.net> wrote:
>> From: yegorslists@googlemail.com
>> Date: Mon, 28 Nov 2016 10:47:52 +0100
>>
>>> From: Yegor Yefremov <yegorslists@googlemail.com>
>>>
>>> This patch adds support for ethtool's '-r' command. Restarting
>>> N-WAY negotiation can be useful to activate newly changed EEE
>>> settings etc.
>>>
>>> Signed-off-by: Yegor Yefremov <yegorslists@googlemail.com>
>>
>> This doesn't apply cleanly to net-next.
> 
> My previous patch [1] doesn't show up in net-next. This could explain,
> why nway patch doesn't apply.
> Should I resend them both as series?
> 
> [1] http://marc.info/?l=linux-omap&m=148036822211869&w=2

My bad, I sorted this out and applied the nway-reset patch too.

Sorry about that.

^ permalink raw reply

* Re: Receive offloads, small RCVBUF and zero TCP window
From: Alex Sidorenko @ 2016-11-30 15:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Marcelo Ricardo Leitner
In-Reply-To: <15427752.RvQkb5CQdb@zbook>

On Monday, November 28, 2016 4:14:04 PM EST Alex Sidorenko wrote:
> On Monday, November 28, 2016 3:54:59 PM EST David Miller wrote:
> > From: Alex Sidorenko <alexandre.sidorenko@hpe.com>
> > Date: Mon, 28 Nov 2016 15:49:26 -0500
> > 
> > > Now the question is whether is is OK to have icsk->icsk_ack.rcv_mss
> > > larger than MTU.
> > 
> > It absolutely is not OK.
> > 
> > If VMWare wants to receive large frames for batching purposes it must
> > use GRO or similar to achieve that, not just send vanilla frames into
> > the stack which are larger than the device MTU.
> > 
> 
> As VMWare's vmxnet3 driver is open-sourced and part of generic kernel, do you think the problem is in that driver or elsewhere? I looked at vmxnet3 sources and see that it uses LRO/GRO subroutines. Unfortunately, I don't understand its logic enough to see whether they are doing anything incorrectly.

I think this has been already fixed in recent versions of vmxnet3 driver (but not in RHEL6). VMWare/ESX can pass us aggregated large SKBs indeed (> MTU) if LRO is enabled, but the driver takes care of that in vmxnet3_rq_rx_complete():

			} else if (segCnt != 0 || skb->len > mtu) {
				u32 hlen;

				hlen = vmxnet3_get_hdr_len(adapter, skb,
					(union Vmxnet3_GenericDesc *)rcd);
				if (hlen == 0)
					goto not_lro;

				skb_shinfo(skb)->gso_type =
					rcd->v4 ? SKB_GSO_TCPV4 : SKB_GSO_TCPV6;
				if (segCnt != 0) {
					skb_shinfo(skb)->gso_segs = segCnt;
					skb_shinfo(skb)->gso_size =
						DIV_ROUND_UP(skb->len -
							hlen, segCnt);
				} else {
					skb_shinfo(skb)->gso_size = mtu - hlen;
				}
			}


So if packets have been aggregated, 

	u8		segCnt;       /* Number of aggregated packets */


we compute gso_size by dividing large skb->len by the number.

I still like Marcelo's idea of printing a warning when icsk->icsk_ack.rcv_mss looks unreasonable, should really help with detecting buggy drivers.

^ permalink raw reply

* Re: DSA vs envelope frames
From: Andrew Lunn @ 2016-11-30 15:10 UTC (permalink / raw)
  To: Nikita Yushchenko
  Cc: Toshiaki Makita, Andy Duan, David S. Miller, Troy Kisky,
	Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org, Chris Healy, Fabio Estevam,
	linux-kernel@vger.kernel.org, Vivien Didelot, florian Fainelli
In-Reply-To: <d8a6df03-d9bd-eca5-0b9f-73406efe6509@cogentembedded.com>

> What is not really clear - what if several tagging protocols are used
> together. AFAIU, things may be more complex that simple appending of
> tags, e.g. EDSA tag can carry VLAN id inside.

Hi Nikita

At least for all current tagging protocols, the size of the tag is
constant. And you cannot run different tagging protocols on the same
master interface at the same time.

However, i think Florian tried something funky with the SF2 and B53
driver. He has a b53 hanging off a sf2. So i think he used nested
tagging protocols!

	Andrew

^ permalink raw reply

* DSA vs envelope frames
From: Nikita Yushchenko @ 2016-11-30 14:58 UTC (permalink / raw)
  To: Toshiaki Makita, Andy Duan, David S. Miller, Troy Kisky,
	Andrew Lunn, Eric Nelson, Philippe Reynes, Johannes Berg,
	netdev@vger.kernel.org
  Cc: Chris Healy, Fabio Estevam, linux-kernel@vger.kernel.org,
	Vivien Didelot, lorian Fainelli
In-Reply-To: <f632a0e8-746b-2db5-09af-3a5f6fc78e13@lab.ntt.co.jp>

>> (1) When DSA is in use, frames processed by FEC chip contain DSA tag and
>> thus can be larger than hardcoded limit of 1522. This issue is not
>> FEC-specific, any driver that hardcodes maximum frame size to 1522 (many
>> do) will have this issue if used with DSA.
> 
> BTW I'm trying to introduce envelope frames to solve this kind of problems.
> http://marc.info/?t=147496691500005&r=1&w=2
> http://marc.info/?t=147496691500003&r=1&w=2
> http://marc.info/?t=147496691500002&r=1&w=2
> http://marc.info/?t=147496691500004&r=1&w=2
> http://marc.info/?t=147496691500001&r=1&w=2
> 
> It needs jumbo frame support of NICs though.

Thanks for pointing to this.

Indeed frame with DSA tag conceptually is an envelope frame.

ndev->env_hdr_len introduced by your patches, actually is explicitly
handled difference between (MTU + 18) and frame that HW should allow.
If this is known, hardware can be configured to work with DSA. At least
FEC hardware that can send and receive "slightly larger" frames after
simple register configuration.

Furthermore, since DSA configuration is known statically (it comes from
device tree), ndo_set_env_hdr_len method could be automatically called
at init, making setup working by default if driver supports that. And if
not, perhaps can automatically lower MTU.

Looks like a solution :)

What's current status of this work?

What is not really clear - what if several tagging protocols are used
together. AFAIU, things may be more complex that simple appending of
tags, e.g. EDSA tag can carry VLAN id inside.

Nikita

^ permalink raw reply

* Re: [net PATCH 0/2] Don't use lco_csum to compute IPv4 checksum
From: David Miller @ 2016-11-30 14:47 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev, intel-wired-lan, sfr, jeffrey.t.kirsher
In-Reply-To: <20161128153927.15466.99193.stgit@ahduyck-blue-test.jf.intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 28 Nov 2016 10:42:18 -0500

> When I implemented the GSO partial support in the Intel drivers I was using
> lco_csum to compute the checksum that we needed to plug into the IPv4
> checksum field in order to cancel out the data that was not a part of the
> IPv4 header.  However this didn't take into account that the transport
> offset might be pointing to the inner transport header.
> 
> Instead of using lco_csum I have just coded around it so that we can use
> the outer IP header plus the IP header length to determine where we need to
> start our checksum and then just call csum_partial ourselves.
> 
> This should fix the SIT issue reported on igb interfaces as well as simliar
> issues that would pop up on other Intel NICs.

Jeff, are you going to send me a pull request with this stuff or would
you be OK with my applying these directly to 'net'?

Thanks.

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Joakim Tjernlund @ 2016-11-30 14:30 UTC (permalink / raw)
  To: andrew@lunn.ch; +Cc: netdev@vger.kernel.org
In-Reply-To: <20161130135257.GC18716@lunn.ch>

On Wed, 2016-11-30 at 14:52 +0100, Andrew Lunn wrote:
> On Wed, Nov 30, 2016 at 08:50:34AM +0000, Joakim Tjernlund wrote:
> > I am trying to wrap my head around these two "devices" and have a hard time telling them apart.
> > We are looking att adding a faily large switch(over PCIe) to our board and from what I can tell
> > switchdev is the new way to do it but DSA is still there. Is it possible to just list
> > how they differ?
> 
> Hi Joakim

Hi Andrew, thanks for answering

> 
> If the interface you use to send frames from the host to the switch is
> PCIe, you probably want to use switchdev directly.

OK, we will have a few ethernet I/F's connected too but I these should be used
as normal interfaces just interfacing a switch.

> 
> DSA devices all use a host Ethernet interface to send frames to the
> switch. DSA sits under switchdev, and effectively provides a lot of
> the common stuff needed for implementing switch drivers of this
> sort. It creates the slave interfaces, links the MAC to the PHY, has
> one uniform device tree binding which all DSA switches have, deals
> with encapsulation/decapsulating frames sent over the master device,
> etc.

And switchdev can do all this over PCIe instead? Can you have a switch tree in switchdev too?

We probably need to push a custom tag for final step (after the switch) and I think that
should be doable with a custom VLAN driver over one/several of switchdev's virtual
interfaces? 

 Joakim

^ permalink raw reply

* Re: [PATCH iproute2 V3 3/3] tc/act_tunnel: Introduce ip tunnel action
From: Jiri Benc @ 2016-11-30 14:44 UTC (permalink / raw)
  To: Amir Vadai
  Cc: Stephen Hemminger, netdev, David S. Miller, Or Gerlitz,
	Hadar Har-Zion, Jiri Pirko
In-Reply-To: <20161130073840.5269-4-amir@vadai.me>

On Wed, 30 Nov 2016 09:38:40 +0200, Amir Vadai wrote:
> +The
> +.I UNSET
> +mode is optional - even without using it, the metadata information will be
> +released automatically when packet processing will be finished.
> +.IR UNSET
> +function could be used in cases when traffic is forwarded between two tunnels,
> +where the metadata from the first tunnel will be used for encapsulation done by
> +the second tunnel.

This looks good.

> +It must be used for offloaded filters, such that hardware drivers can
> +realize they need to program the HW to do decapsulation.

However, this is wrong. The hardware offloading must be transparent.
The same configuration that works when processed in software must work
in hardware if the hardware has the necessary capabilities. Requiring
the user to alter the configuration to accommodate hardware
peculiarities is not acceptable.

Or maybe I'm misunderstanding what you mean here. In which case it's
not documented properly :-)

> +.IR SET
> +mode requires the source and destination ip
> +.I ADDRESS
> +and the tunnel key id
> +.I KEY_ID
> +which will be used by the ip tunnel shared device to create the tunnel header. The
> +.B tunnel_key
> +action is useful only in combination with a
> +.B mirred redirect
> +action to a shared IP tunnel device which will use the metadata (for
> +.I SET
> +) and unset the metadata created by it (for
> +.I UNSET
> +).
> +
> +.SH OPTIONS
> +.TP
> +.B unset
> +Decapsulation mode, no further arguments allowed. This function is not
> +mandatory and might be used only in some specific use cases.

This is NOT decapsulation. The packet is decapsulated at this point in
any case, whether or not set/unset or whatever is used. These actions
are only and solely about metadata associated with the packet. The
actual encapsulation and decapsulation happens at the target netdevice.

Calling this "decapsulation" is wrong. And if it's implemented as such
in your hardware offloading, then it's doubly wrong as it doesn't match
software processing and hence you must not do that and you must change
that.

> +.TP
> +.B set
> +Encapsulation mode. Requires

Likewise, this is not encapsulation. It just sets metadata.

 Jiri

^ permalink raw reply

* Re: [PATCH net 7/7] net: ethernet: stmmac: fix of-node and fixed-link-phydev leaks
From: Maxime Ripard @ 2016-11-30 14:39 UTC (permalink / raw)
  To: Johan Hovold
  Cc: David S. Miller, Giuseppe Cavallaro, Alexandre Torgue,
	Joachim Eastwood, Carlo Caione, Kevin Hilman, Maxime Coquelin,
	Chen-Yu Tsai, netdev, linux-kernel
In-Reply-To: <1480516195-27696-8-git-send-email-johan@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 848 bytes --]

On Wed, Nov 30, 2016 at 03:29:55PM +0100, Johan Hovold wrote:
> Make sure to deregister and free any fixed-link phy registered during
> probe on probe errors and on driver unbind by adding a new glue helper
> function.
> 
> Drop the of-node reference taken in the same path also on late probe
> errors (and not just on driver unbind) by moving the put from
> stmmac_dvr_remove() to the new helper.
> 
> Fixes: 277323814e49 ("stmmac: add fixed-link device-tree support")
> Fixes: 4613b279bee7 ("ethernet: stmicro: stmmac: add missing of_node_put
> after calling of_parse_phandle")
> Signed-off-by: Johan Hovold <johan@kernel.org>

For the sunxi part,
Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com>

Thanks!
Maxime

-- 
Maxime Ripard, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH net 6/7] net: ethernet: stmmac: platform: fix outdated function header
From: Johan Hovold @ 2016-11-30 14:29 UTC (permalink / raw)
  To: David S. Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Joachim Eastwood,
	Carlo Caione, Kevin Hilman, Maxime Coquelin, Maxime Ripard,
	Chen-Yu Tsai, netdev, linux-kernel, Johan Hovold
In-Reply-To: <1480516195-27696-1-git-send-email-johan@kernel.org>

Fix the OF-helper function header to reflect that the function no longer
has a platform-data parameter.

Fixes: b0003ead75f3 ("stmmac: make stmmac_probe_config_dt return the
platform data struct")
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 0a0d6a86f397..bcbf123d5ba2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -200,7 +200,6 @@ static int stmmac_dt_phy(struct plat_stmmacenet_data *plat,
 /**
  * stmmac_probe_config_dt - parse device-tree driver parameters
  * @pdev: platform_device structure
- * @plat: driver data platform structure
  * @mac: MAC address to use
  * Description:
  * this function is to read the driver parameters from device-tree and
-- 
2.7.3

^ permalink raw reply related

* [PATCH net 3/7] net: ethernet: stmmac: dwmac-rk: fix probe error path
From: Johan Hovold @ 2016-11-30 14:29 UTC (permalink / raw)
  To: David S. Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Joachim Eastwood,
	Carlo Caione, Kevin Hilman, Maxime Coquelin, Maxime Ripard,
	Chen-Yu Tsai, netdev, linux-kernel, Johan Hovold
In-Reply-To: <1480516195-27696-1-git-send-email-johan@kernel.org>

Make sure to disable runtime PM, power down the PHY, and disable clocks
before returning on late probe errors.

Fixes: 27ffefd2d109 ("stmmac: dwmac-rk: create a new probe function")
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 3740a4417fa0..e7aabe56c15a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -988,7 +988,16 @@ static int rk_gmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+	if (ret)
+		goto err_gmac_exit;
+
+	return 0;
+
+err_gmac_exit:
+	rk_gmac_exit(pdev, plat_dat->bsp_priv);
+
+	return ret;
 }
 
 static const struct of_device_id rk_gmac_dwmac_match[] = {
-- 
2.7.3

^ permalink raw reply related

* [PATCH net 2/7] net: ethernet: stmmac: dwmac-sti: fix probe error path
From: Johan Hovold @ 2016-11-30 14:29 UTC (permalink / raw)
  To: David S. Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Joachim Eastwood,
	Carlo Caione, Kevin Hilman, Maxime Coquelin, Maxime Ripard,
	Chen-Yu Tsai, netdev, linux-kernel, Johan Hovold
In-Reply-To: <1480516195-27696-1-git-send-email-johan@kernel.org>

Make sure to disable clocks before returning on late probe errors.

Fixes: 8387ee21f972 ("stmmac: dwmac-sti: turn setup callback into a
probe function")
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index 58c05acc2aab..a1ce018bf844 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -365,7 +365,16 @@ static int sti_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+	if (ret)
+		goto err_dwmac_exit;
+
+	return 0;
+
+err_dwmac_exit:
+	sti_dwmac_exit(pdev, plat_dat->bsp_priv);
+
+	return ret;
 }
 
 static const struct sti_dwmac_of_data stih4xx_dwmac_data = {
-- 
2.7.3

^ permalink raw reply related

* [PATCH net 0/7] net: stmmac: fix probe error handling and phydev leaks
From: Johan Hovold @ 2016-11-30 14:29 UTC (permalink / raw)
  To: David S. Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Joachim Eastwood,
	Carlo Caione, Kevin Hilman, Maxime Coquelin, Maxime Ripard,
	Chen-Yu Tsai, netdev, linux-kernel, Johan Hovold

This series fixes a number of issues with the stmmac-driver probe error
handling, which for example left clocks enabled after probe failures.

The final patch fixes a failure to deregister and free any fixed-link
PHYs that were registered during probe on probe errors and on driver
unbind. It also fixes a related of-node leak on late probe errors.

This series depends on the of_phy_deregister_fixed_link() helper that
was just merged to net.

As mentioned earlier, one staging driver also suffers from a similar
leak and can be fixed up once the above mentioned helper hits mainline.

Note that these patches have only been compile tested.

Johan


Johan Hovold (7):
  net: ethernet: stmmac: dwmac-socfpga: fix use-after-free on probe
    errors
  net: ethernet: stmmac: dwmac-sti: fix probe error path
  net: ethernet: stmmac: dwmac-rk: fix probe error path
  net: ethernet: stmmac: dwmac-generic: fix probe error path
  net: ethernet: stmmac: dwmac-meson8b: fix probe error path
  net: ethernet: stmmac: platform: fix outdated function header
  net: ethernet: stmmac: fix of-node and fixed-link-phydev leaks

 .../net/ethernet/stmicro/stmmac/dwmac-generic.c    | 17 ++++++++--
 .../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c    | 25 ++++++++++----
 .../net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c    | 17 ++++++++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c  | 23 ++++++++++---
 .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c    | 32 +++++++++++++-----
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c     | 21 +++++++++---
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    | 39 ++++++++++++++--------
 drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c    | 23 ++++++++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c  | 19 ++++++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c  | 26 +++++++++++----
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  1 -
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 33 +++++++++++++++---
 .../net/ethernet/stmicro/stmmac/stmmac_platform.h  |  2 ++
 13 files changed, 215 insertions(+), 63 deletions(-)

-- 
2.7.3

^ permalink raw reply

* Re: [PATCH v3 net-next 3/3] openvswitch: Fix skb->protocol for vlan frames.
From: Jiri Benc @ 2016-11-30 14:30 UTC (permalink / raw)
  To: Jarno Rajahalme; +Cc: netdev, pshelar, e
In-Reply-To: <1480462253-114713-3-git-send-email-jarno@ovn.org>

On Tue, 29 Nov 2016 15:30:53 -0800, Jarno Rajahalme wrote:
> Do not always set skb->protocol to be the ethertype of the L3 header.
> For a packet with non-accelerated VLAN tags skb->protocol needs to be
> the ethertype of the outermost non-accelerated VLAN ethertype.

Well, the current handling of skb->protocol matches what used to be the
handling of the kernel net stack before Jiri Pirko cleaned up the vlan
code.

I'm not opposed to changing this but I'm afraid it needs much deeper
review. Because with this in place, no core kernel functions that
depend on skb->protocol may be called from within openvswitch.

> @@ -361,6 +362,11 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
>  	if (res <= 0)
>  		return res;
>  
> +	/* If the outer vlan tag was accelerated, skb->protocol should
> +	 * refelect the inner vlan type. */
> +	if (!eth_type_vlan(skb->protocol))
> +		skb->protocol = key->eth.cvlan.tpid;

This should not depend on the current value in skb->protocol which
could be arbitrary at this point (from the point of view of how this
patch understands the skb->protocol values). It's easy to fix, though -
just add a local bool variable tracking whether the skb->protocol has
been set.

> @@ -531,15 +544,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
>  		if (unlikely(parse_vlan(skb, key)))
>  			return -ENOMEM;
>  
> -		skb->protocol = parse_ethertype(skb);
> -		if (unlikely(skb->protocol == htons(0)))
> +		key->eth.type = parse_ethertype(skb);
> +		if (unlikely(key->eth.type == htons(0)))
>  			return -ENOMEM;
>  
>  		skb_reset_network_header(skb);
>  		__skb_push(skb, skb->data - skb_mac_header(skb));
>  	}
>  	skb_reset_mac_len(skb);
> -	key->eth.type = skb->protocol;
> +	if (!eth_type_vlan(skb->protocol))
> +		skb->protocol = key->eth.type;

This leaves key->eth.type undefined for key->mac_proto ==
MAC_PROTO_NONE.

Plus the same problem as above with unknown value of skb->protocol. But
this is more complicated here, as skb->protocol may be either
uninitialized at this point or already initialized by parse_vlan.

>  
>  	/* Network layer. */
>  	if (key->eth.type == htons(ETH_P_IP)) {
> @@ -800,29 +814,15 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
>  	if (err)
>  		return err;
>  
> -	if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
> -		/* key_extract assumes that skb->protocol is set-up for
> -		 * layer 3 packets which is the case for other callers,
> -		 * in particular packets recieved from the network stack.
> -		 * Here the correct value can be set from the metadata
> -		 * extracted above.
> -		 */
> -		skb->protocol = key->eth.type;
> -	} else {
> -		struct ethhdr *eth;
> -
> -		skb_reset_mac_header(skb);
> -		eth = eth_hdr(skb);
> -
> -		/* Normally, setting the skb 'protocol' field would be
> -		 * handled by a call to eth_type_trans(), but it assumes
> -		 * there's a sending device, which we may not have.
> -		 */
> -		if (eth_proto_is_802_3(eth->h_proto))
> -			skb->protocol = eth->h_proto;
> -		else
> -			skb->protocol = htons(ETH_P_802_2);
> -	}
> +	/* key_extract assumes that skb->protocol is set-up for
> +	 * layer 3 packets which is the case for other callers,
> +	 * in particular packets recieved from the network stack.
> +	 * Here the correct value can be set from the metadata
> +	 * extracted above.  For layer 2 packets we initialize
> +         * skb->protocol to zero and set it in key_extract() while
> +         * parsing the L2 headers.
> +	 */
> +	skb->protocol = key->eth.type;
>  
>  	return key_extract(skb, key);
>  }

Interesting. This hunk looks safe even without the rest of the patch.
You should fix the comment indentation, though.

 Jiri

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox