Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2 net-next] iptnl: add support for collect_md flag in IPv4 and IPv6 tunnels
From: Alexei Starovoitov @ 2016-09-20  0:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Daniel Borkmann, William Tu, netdev

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
Stephen,
please sync include/uapi/linux/if_tunnel.h from the kernel tree
before applying.
Thanks
---
 ip/link_ip6tnl.c | 10 ++++++++++
 ip/link_iptnl.c  | 11 +++++++++++
 2 files changed, 21 insertions(+)

diff --git a/ip/link_ip6tnl.c b/ip/link_ip6tnl.c
index 59162a33608f..051c89f4fe57 100644
--- a/ip/link_ip6tnl.c
+++ b/ip/link_ip6tnl.c
@@ -40,6 +40,7 @@ static void print_usage(FILE *f)
 	fprintf(f, "          [ noencap ] [ encap { fou | gue | none } ]\n");
 	fprintf(f, "          [ encap-sport PORT ] [ encap-dport PORT ]\n");
 	fprintf(f, "          [ [no]encap-csum ] [ [no]encap-csum6 ] [ [no]encap-remcsum ]\n");
+	fprintf(f, "          [ external ]\n");
 	fprintf(f, "\n");
 	fprintf(f, "Where: NAME      := STRING\n");
 	fprintf(f, "       ADDR      := IPV6_ADDRESS\n");
@@ -89,6 +90,7 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u16 encapflags = TUNNEL_ENCAP_FLAG_CSUM6;
 	__u16 encapsport = 0;
 	__u16 encapdport = 0;
+	__u8 metadata = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
 		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
@@ -141,6 +143,8 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 
 		if (iptuninfo[IFLA_IPTUN_PROTO])
 			proto = rta_getattr_u8(iptuninfo[IFLA_IPTUN_PROTO]);
+		if (iptuninfo[IFLA_IPTUN_COLLECT_METADATA])
+			metadata = 1;
 	}
 
 	while (argc > 0) {
@@ -277,12 +281,18 @@ static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 			encapflags |= TUNNEL_ENCAP_FLAG_REMCSUM;
 		} else if (strcmp(*argv, "noencap-remcsum") == 0) {
 			encapflags |= ~TUNNEL_ENCAP_FLAG_REMCSUM;
+		} else if (strcmp(*argv, "external") == 0) {
+			metadata = 1;
 		} else
 			usage();
 		argc--, argv++;
 	}
 
 	addattr8(n, 1024, IFLA_IPTUN_PROTO, proto);
+	if (metadata) {
+		addattr_l(n, 1024, IFLA_IPTUN_COLLECT_METADATA, NULL, 0);
+		return 0;
+	}
 	addattr_l(n, 1024, IFLA_IPTUN_LOCAL, &laddr, sizeof(laddr));
 	addattr_l(n, 1024, IFLA_IPTUN_REMOTE, &raddr, sizeof(raddr));
 	addattr8(n, 1024, IFLA_IPTUN_TTL, hop_limit);
diff --git a/ip/link_iptnl.c b/ip/link_iptnl.c
index 7ec377791d88..1875348a268a 100644
--- a/ip/link_iptnl.c
+++ b/ip/link_iptnl.c
@@ -36,6 +36,7 @@ static void print_usage(FILE *f, int sit)
 		fprintf(f, "          [ mode { ip6ip | ipip | any } ]\n");
 		fprintf(f, "          [ isatap ]\n");
 	}
+	fprintf(f, "          [ external ]\n");
 	fprintf(f, "\n");
 	fprintf(f, "Where: NAME := STRING\n");
 	fprintf(f, "       ADDR := { IP_ADDRESS | any }\n");
@@ -85,6 +86,7 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u16 encapflags = 0;
 	__u16 encapsport = 0;
 	__u16 encapdport = 0;
+	__u8 metadata = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
 		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
@@ -161,6 +163,8 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 		if (iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIXLEN])
 			ip6rdrelayprefixlen =
 				rta_getattr_u16(iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
+		if (iptuninfo[IFLA_IPTUN_COLLECT_METADATA])
+			metadata = 1;
 	}
 
 	while (argc > 0) {
@@ -257,6 +261,8 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 			encapflags |= TUNNEL_ENCAP_FLAG_REMCSUM;
 		} else if (strcmp(*argv, "noencap-remcsum") == 0) {
 			encapflags &= ~TUNNEL_ENCAP_FLAG_REMCSUM;
+		} else if (strcmp(*argv, "external") == 0) {
+			metadata = 1;
 		} else if (strcmp(*argv, "6rd-prefix") == 0) {
 			inet_prefix prefix;
 
@@ -291,6 +297,11 @@ static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
 		exit(-1);
 	}
 
+	if (metadata) {
+		addattr_l(n, 1024, IFLA_IPTUN_COLLECT_METADATA, NULL, 0);
+		return 0;
+	}
+
 	addattr32(n, 1024, IFLA_IPTUN_LINK, link);
 	addattr32(n, 1024, IFLA_IPTUN_LOCAL, laddr);
 	addattr32(n, 1024, IFLA_IPTUN_REMOTE, raddr);
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next] net: dsa: mv88e6xxx: handle multiple ports in ATU
From: Vivien Didelot @ 2016-09-19 23:56 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

An address can be loaded in the ATU with multiple ports, for instance
when adding multiple ports to a Multicast group with "bridge mdb".

The current code doesn't allow that. Add an helper to get a single entry
from the ATU, then set or clear the requested port, before loading the
entry back in the ATU.

Note that the required _mv88e6xxx_atu_getnext function is defined below
mv88e6xxx_port_db_load_purge, so forward-declare it for the moment. The
ATU code will be isolated in future patches.

Fixes: 83dabd1fa84c ("net: dsa: mv88e6xxx: make switchdev DB ops generic")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 56 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 70a812d..1d71802 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2091,12 +2091,48 @@ static int _mv88e6xxx_atu_load(struct mv88e6xxx_chip *chip,
 	return _mv88e6xxx_atu_cmd(chip, entry->fid, GLOBAL_ATU_OP_LOAD_DB);
 }
 
+static int _mv88e6xxx_atu_getnext(struct mv88e6xxx_chip *chip, u16 fid,
+				  struct mv88e6xxx_atu_entry *entry);
+
+static int mv88e6xxx_atu_get(struct mv88e6xxx_chip *chip, int fid,
+			     const u8 *addr, struct mv88e6xxx_atu_entry *entry)
+{
+	struct mv88e6xxx_atu_entry next;
+	int err;
+
+	eth_broadcast_addr(next.mac);
+
+	err = _mv88e6xxx_atu_mac_write(chip, next.mac);
+	if (err)
+		return err;
+
+	do {
+		err = _mv88e6xxx_atu_getnext(chip, fid, &next);
+		if (err)
+			return err;
+
+		if (next.state == GLOBAL_ATU_DATA_STATE_UNUSED)
+			break;
+
+		if (ether_addr_equal(next.mac, addr)) {
+			*entry = next;
+			return 0;
+		}
+	} while (!is_broadcast_ether_addr(next.mac));
+
+	memset(entry, 0, sizeof(*entry));
+	entry->fid = fid;
+	ether_addr_copy(entry->mac, addr);
+
+	return 0;
+}
+
 static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 					const unsigned char *addr, u16 vid,
 					u8 state)
 {
-	struct mv88e6xxx_atu_entry entry = { 0 };
 	struct mv88e6xxx_vtu_stu_entry vlan;
+	struct mv88e6xxx_atu_entry entry;
 	int err;
 
 	/* Null VLAN ID corresponds to the port private database */
@@ -2107,12 +2143,18 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 	if (err)
 		return err;
 
-	entry.fid = vlan.fid;
-	entry.state = state;
-	ether_addr_copy(entry.mac, addr);
-	if (state != GLOBAL_ATU_DATA_STATE_UNUSED) {
-		entry.trunk = false;
-		entry.portv_trunkid = BIT(port);
+	err = mv88e6xxx_atu_get(chip, vlan.fid, addr, &entry);
+	if (err)
+		return err;
+
+	/* Purge the ATU entry only if no port is using it anymore */
+	if (state == GLOBAL_ATU_DATA_STATE_UNUSED) {
+		entry.portv_trunkid &= ~BIT(port);
+		if (!entry.portv_trunkid)
+			entry.state = GLOBAL_ATU_DATA_STATE_UNUSED;
+	} else {
+		entry.portv_trunkid |= BIT(port);
+		entry.state = state;
 	}
 
 	return _mv88e6xxx_atu_load(chip, &entry);
-- 
2.10.0

^ permalink raw reply related

* Re: [PATCH v3 net-next 16/16] tcp_bbr: add BBR congestion control
From: Eric Dumazet @ 2016-09-19 23:33 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Neal Cardwell, David Miller, netdev, Van Jacobson, Yuchung Cheng,
	Nandita Dukkipati, Soheil Hassas Yeganeh
In-Reply-To: <20160919162848.2b916955@xeon-e3>

>
> It generates some slightly smaller code.
>         if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
> - 3e7:  0f b6 c0                movzbl %al,%eax
> - 3ea:  83 f8 03                cmp    $0x3,%eax
> - 3ed:  0f 86 d4 00 00 00       jbe    4c7 <bbr_lt_bw_sampling.isra.6+0x157>
> + 3e7:  3c 03                   cmp    $0x3,%al
> + 3e9:  0f 86 d1 00 00 00       jbe    4c0 <bbr_lt_bw_sampling.isra.6+0x150>
>
> And different code for abs
>                 /* Is new bw close to the lt_bw from the previous interval? */
>                 diff = abs(bw - bbr->lt_bw);
> - 47a:  44 89 e2                mov    %r12d,%edx
> - 47d:  29 c2                   sub    %eax,%edx
> - 47f:  89 d1                   mov    %edx,%ecx
> - 481:  89 d3                   mov    %edx,%ebx
> + 475:  44 89 e3                mov    %r12d,%ebx
> + 478:  29 c3                   sub    %eax,%ebx
> + 47a:  89 da                   mov    %ebx,%edx
> + 47c:  c1 fa 1f                sar    $0x1f,%edx
> + 47f:  31 d3                   xor    %edx,%ebx
> + 481:  29 d3                   sub    %edx,%ebx
>
> The biggest change is getting rid of the always true conditional.
>
...
>
> Overall, it really makes little difference. Actual file sizes come out the same.
> The idea is more to document what is variable
> and what is immutable in the algorithm.

SGTM, thanks Stephen !

^ permalink raw reply

* Re: [PATCH v3 net-next 16/16] tcp_bbr: add BBR congestion control
From: Stephen Hemminger @ 2016-09-19 23:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Neal Cardwell, David Miller, netdev, Van Jacobson, Yuchung Cheng,
	Nandita Dukkipati, Soheil Hassas Yeganeh
In-Reply-To: <CANn89i+MpNPYn=ewi_LioNctNePCuity_UXw5ieU7HFfoZTsGA@mail.gmail.com>

On Mon, 19 Sep 2016 14:10:39 -0700
Eric Dumazet <edumazet@google.com> wrote:

> On Mon, Sep 19, 2016 at 1:57 PM, Stephen Hemminger
> <stephen@networkplumber.org> wrote:
> 
> > Looks good, but could I suggest a simple optimization.
> > All these parameters are immutable in the version of BBR you are submitting.
> > Why not make the values const? And eliminate the always true long-term bw estimate
> > variable?
> >  
> 
> We could do that.
> 
> We used to have variables (aka module params) while BBR was cooking in
> our kernels ;)
> 
> Are you sure generated code is indeed 'optimized' ?


It generates some slightly smaller code.
 	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
- 3e7:	0f b6 c0             	movzbl %al,%eax
- 3ea:	83 f8 03             	cmp    $0x3,%eax
- 3ed:	0f 86 d4 00 00 00    	jbe    4c7 <bbr_lt_bw_sampling.isra.6+0x157>
+ 3e7:	3c 03                	cmp    $0x3,%al
+ 3e9:	0f 86 d1 00 00 00    	jbe    4c0 <bbr_lt_bw_sampling.isra.6+0x150>

And different code for abs
 		/* Is new bw close to the lt_bw from the previous interval? */
 		diff = abs(bw - bbr->lt_bw);
- 47a:	44 89 e2             	mov    %r12d,%edx
- 47d:	29 c2                	sub    %eax,%edx
- 47f:	89 d1                	mov    %edx,%ecx
- 481:	89 d3                	mov    %edx,%ebx
+ 475:	44 89 e3             	mov    %r12d,%ebx
+ 478:	29 c3                	sub    %eax,%ebx
+ 47a:	89 da                	mov    %ebx,%edx
+ 47c:	c1 fa 1f             	sar    $0x1f,%edx
+ 47f:	31 d3                	xor    %edx,%ebx
+ 481:	29 d3                	sub    %edx,%ebx

The biggest change is getting rid of the always true conditional.

-	u32 diff;
-
-	if (bbr->lt_bw &&  /* do we have bw from a previous interval? */
-	    bbr_lt_bw_estimator) {  /* using long-term bw estimator enabled? */
-		/* Is new bw close to the lt_bw from the previous interval? */
-		diff = abs(bw - bbr->lt_bw);
- 485:	c1 f9 1f             	sar    $0x1f,%ecx
-		if ((diff * BBR_UNIT <= bbr_lt_conv_thresh * bbr->lt_bw) ||
- 488:	c1 e2 05             	shl    $0x5,%edx
-	u32 diff;
-
-	if (bbr->lt_bw &&  /* do we have bw from a previous interval? */
-	    bbr_lt_bw_estimator) {  /* using long-term bw estimator enabled? */
-		/* Is new bw close to the lt_bw from the previous interval? */
-		diff = abs(bw - bbr->lt_bw);
- 48b:	31 cb                	xor    %ecx,%ebx
- 48d:	29 cb                	sub    %ecx,%ebx
-		if ((diff * BBR_UNIT <= bbr_lt_conv_thresh * bbr->lt_bw) ||
- 48f:	89 d9                	mov    %ebx,%ecx
- 491:	c1 e1 08             	shl    $0x8,%ecx
- 494:	39 d1                	cmp    %edx,%ecx
- 496:	0f 87 6e 01 00 00    	ja     60a <bbr_lt_bw_sampling.isra.6+0x29a>
+ 485:	89 d9                	mov    %ebx,%ecx
+ 487:	c1 e2 05             	shl    $0x5,%edx
+ 48a:	c1 e1 08             	shl    $0x8,%ecx
+ 48d:	39 d1                	cmp    %edx,%ecx
+ 48f:	0f 87 6e 01 00 00    	ja     603 <bbr_lt_bw_sampling.isra.6+0x293>


Overall, it really makes little difference. Actual file sizes come out the same.
The idea is more to document what is variable
and what is immutable in the algorithm.

^ permalink raw reply

* [PATCH v5 net-next 1/1] net sched actions: fix GETing actions
From: Jamal Hadi Salim @ 2016-09-19 23:02 UTC (permalink / raw)
  To: davem; +Cc: netdev, xiyou.wangcong, Jamal Hadi Salim

From: Jamal Hadi Salim <jhs@mojatatu.com>

With the batch changes that translated transient actions into
a temporary list lost in the translation was the fact that
tcf_action_destroy() will eventually delete the action from
the permanent location if the refcount is zero.

Example of what broke:
...add a gact action to drop
sudo $TC actions add action drop index 10
...now retrieve it, looks good
sudo $TC actions get action gact index 10
...retrieve it again and find it is gone!
sudo $TC actions get action gact index 10

Fixes: 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array"),
Fixes: 824a7e8863b3 ("net_sched: remove an unnecessary list_del()")
Fixes: f07fed82ad79 ("net_sched: remove the leftover cleanup_a()")

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 net/sched/act_api.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index d09d068..411191b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -592,6 +592,17 @@ err_out:
 	return ERR_PTR(err);
 }
 
+static void cleanup_a(struct list_head *actions, int ovr)
+{
+	struct tc_action *a;
+
+	if (!ovr)
+		return;
+
+	list_for_each_entry(a, actions, list)
+		a->tcfa_refcnt--;
+}
+
 int tcf_action_init(struct net *net, struct nlattr *nla,
 				  struct nlattr *est, char *name, int ovr,
 				  int bind, struct list_head *actions)
@@ -612,8 +623,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
 			goto err;
 		}
 		act->order = i;
+		if (ovr)
+			act->tcfa_refcnt++;
 		list_add_tail(&act->list, actions);
 	}
+
+	/* Remove the temp refcnt which was necessary to protect against
+	 * destroying an existing action which was being replaced
+	 */
+	cleanup_a(actions, ovr);
 	return 0;
 
 err:
@@ -883,6 +901,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 			goto err;
 		}
 		act->order = i;
+		if (event == RTM_GETACTION)
+			act->tcfa_refcnt++;
 		list_add_tail(&act->list, &actions);
 	}
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net-next 2/2] net: dsa: mv88e6xxx: Convert flag bits to unsigned long long
From: Vivien Didelot @ 2016-09-19 22:54 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1474323521-5088-3-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> We are soon going to run out of flag bits on 32bit systems. Convert to
> unsigned long long.
>
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>

Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Thanks,

        Vivien

^ permalink raw reply

* Re: [PATCH net-next 1/2] net: dsa: mv88e6xxx: Add helper for accessing port registers
From: Vivien Didelot @ 2016-09-19 22:53 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1474323521-5088-2-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> @@ -41,6 +41,11 @@ static void assert_reg_lock(struct mv88e6xxx_chip *chip)
>  	}
>  }
>  
> +static int mv88e6xxx_reg_port(struct mv88e6xxx_chip *chip, int port)
> +{
> +	return chip->info->port_base_addr + port;
> +}
> +

If we really want such helper, can you call it mv88e6xxx_port_addr()
instead, so that we respect an implicit mv88e6xxx_port_ namespace, and
use the correct "addr" term instead of erroneous "reg" one.

>  /* The switch ADDR[4:1] configuration pins define the chip SMI device address
>   * (ADDR[0] is always zero, thus only even SMI addresses can be strapped).
>   *
> @@ -216,6 +221,42 @@ int mv88e6xxx_write(struct mv88e6xxx_chip *chip, int addr, int reg, u16 val)
>  	return 0;
>  }
>  
> +int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
> +			u16 *val)
> +{
> +	int addr = mv88e6xxx_reg_port(chip, port);
> +	int err;
> +
> +	assert_reg_lock(chip);
> +
> +	err = mv88e6xxx_smi_read(chip, addr, reg, val);
> +	if (err)
> +		return err;
> +
> +	dev_dbg(chip->dev, "<- port: 0x%.2x reg: 0x%.2x val: 0x%.4x\n",
> +		port, reg, *val);
> +
> +	return 0;
> +}
> +
> +int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
> +			 u16 val)
> +{
> +	int addr = mv88e6xxx_reg_port(chip, port);
> +	int err;
> +
> +	assert_reg_lock(chip);
> +
> +	err = mv88e6xxx_smi_write(chip, addr, reg, val);
> +	if (err)
> +		return err;
> +
> +	dev_dbg(chip->dev, "-> port: 0x%.2x reg: 0x%.2x val: 0x%.4x\n",
> +		port, reg, val);
> +
> +	return 0;
> +}
> +

mv88e6xxx_{read,write} are already doing this (wrapping the assert, smi
op and debug message). Plus, we could access the port registers through
a different interface, like remote management frames.

So please don't duplicate and use the following, as the previous
mv88e6xxx_port_read() function was doing:

    static int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip,
                                   int port, int reg, u16 *val)
    {
            int addr = chip->info->port_base_addr + port;

            return mv88e6xxx_read(chip, addr, reg, val);
    }

    static int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip,
                                    int port, int reg, u16 val)
    {
            int addr = chip->info->port_base_addr + port;

            return mv88e6xxx_write(chip, addr, reg, val);
    }

Note: I don't really see a need for the mv88e6xxx_port_addr helper in
fact, the above code is quite clear. I'd suggest to drop it unless we
need a port address somewhere else than in mv88e6xxx_port_{read,write}.

>  static int mv88e6xxx_phy_read(struct mv88e6xxx_chip *chip, int phy,
>  			      int reg, u16 *val)
>  {
> @@ -585,19 +626,19 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, int port,
>  				  struct phy_device *phydev)
>  {
>  	struct mv88e6xxx_chip *chip = ds->priv;
> -	u32 reg;
> -	int ret;
> +	u16 reg;
> +	int err;
>  
>  	if (!phy_is_pseudo_fixed_link(phydev))
>  		return;
>  
>  	mutex_lock(&chip->reg_lock);
>  
> -	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_PCS_CTRL);
> -	if (ret < 0)
> +	err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, &reg);
> +	if (err < 0)
>  		goto out;

Can you please get rid of the < 0 condition at the same time, if (err)
is enough.

(same for the rest of this patch).

Thanks,

        Vivien

^ permalink raw reply

* [PATCH net-next 2/3] bpf: direct packet write and access for helpers for clsact progs
From: Daniel Borkmann @ 2016-09-19 22:26 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, tgraf, jakub.kicinski, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1474323281.git.daniel@iogearbox.net>

This work implements direct packet access for helpers and direct packet
write in a similar fashion as already available for XDP types via commits
4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and
6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a
complementary feature to the already available direct packet read for tc
(cls/act) programs.

For enabling this, we need to introduce two helpers, bpf_skb_pull_data()
and bpf_csum_update(). The first is generally needed for both, read and
write, because they would otherwise only be limited to the current linear
skb head. Usually, when the data_end test fails, programs just bail out,
or, in the direct read case, use bpf_skb_load_bytes() as an alternative
to overcome this limitation. If such data sits in non-linear parts, we
can just pull them in once with the new helper, retest and eventually
access them.

At the same time, this also makes sure the skb is uncloned, which is, of
course, a necessary condition for direct write. As this needs to be an
invariant for the write part only, the verifier detects writes and adds
a prologue that is calling bpf_skb_pull_data() to effectively unclone the
skb from the very beginning in case it is indeed cloned. The heuristic
makes use of a similar trick that was done in 233577a22089 ("net: filter:
constify detection of pkt_type_offset"). This comes at zero cost for other
programs that do not use the direct write feature. Should a program use
this feature only sparsely and has read access for the most parts with,
for example, drop return codes, then such write action can be delegated
to a tail called program for mitigating this cost of potential uncloning
to a late point in time where it would have been paid similarly with the
bpf_skb_store_bytes() as well. Advantage of direct write is that the
writes are inlined whereas the helper cannot make any length assumptions
and thus needs to generate a call to memcpy() also for small sizes, as well
as cost of helper call itself with sanity checks are avoided. Plus, when
direct read is already used, we don't need to cache or perform rechecks
on the data boundaries (due to verifier invalidating previous checks for
helpers that change skb->data), so more complex programs using rewrites
can benefit from switching to direct read plus write.

For direct packet access to helpers, we save the otherwise needed copy into
a temp struct sitting on stack memory when use-case allows. Both facilities
are enabled via may_access_direct_pkt_data() in verifier. For now, we limit
this to map helpers and csum_diff, and can successively enable other helpers
where we find it makes sense. Helpers that definitely cannot be allowed for
this are those part of bpf_helper_changes_skb_data() since they can change
underlying data, and those that write into memory as this could happen for
packet typed args when still cloned. bpf_csum_update() helper accommodates
for the fact that we need to fixup checksum_complete when using direct write
instead of bpf_skb_store_bytes(), meaning the programs can use available
helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(),
csum_block_add(), csum_block_sub() equivalents in eBPF together with the
new helper. A usage example will be provided for iproute2's examples/bpf/
directory.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |   4 +-
 include/linux/skbuff.h   |  14 ++++-
 include/uapi/linux/bpf.h |  21 ++++++++
 kernel/bpf/helpers.c     |   3 ++
 kernel/bpf/verifier.c    |  54 ++++++++++++++-----
 net/core/filter.c        | 134 +++++++++++++++++++++++++++++++++++++++++------
 6 files changed, 196 insertions(+), 34 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a904f6..5691fdc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_return_type {
 struct bpf_func_proto {
 	u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 	bool gpl_only;
+	bool pkt_access;
 	enum bpf_return_type ret_type;
 	enum bpf_arg_type arg1_type;
 	enum bpf_arg_type arg2_type;
@@ -151,7 +152,8 @@ struct bpf_verifier_ops {
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
 				enum bpf_reg_type *reg_type);
-
+	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
+			    const struct bpf_prog *prog);
 	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
 				  int src_reg, int ctx_off,
 				  struct bpf_insn *insn, struct bpf_prog *prog);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c5662f..c6dab3f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -676,13 +676,23 @@ struct sk_buff {
 	 */
 	kmemcheck_bitfield_begin(flags1);
 	__u16			queue_mapping;
+
+/* if you move cloned around you also must adapt those constants */
+#ifdef __BIG_ENDIAN_BITFIELD
+#define CLONED_MASK	(1 << 7)
+#else
+#define CLONED_MASK	1
+#endif
+#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)
+
+	__u8			__cloned_offset[0];
 	__u8			cloned:1,
 				nohdr:1,
 				fclone:2,
 				peeked:1,
 				head_frag:1,
-				xmit_more:1;
-	/* one bit hole */
+				xmit_more:1,
+				__unused:1; /* one bit hole */
 	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f896dfa..e07432b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -398,6 +398,27 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_change_tail,
 
+	/**
+	 * bpf_skb_pull_data(skb, len)
+	 * The helper will pull in non-linear data in case the
+	 * skb is non-linear and not all of len are part of the
+	 * linear section. Only needed for read/write with direct
+	 * packet access.
+	 * @skb: pointer to skb
+	 * @len: len to make read/writeable
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_pull_data,
+
+	/**
+	 * bpf_csum_update(skb, csum)
+	 * Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
+	 * @skb: pointer to skb
+	 * @csum: csum to add
+	 * Return: csum on success or negative error
+	 */
+	BPF_FUNC_csum_update,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a5b8bf8..3991840 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -36,6 +36,7 @@ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
 const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 	.func		= bpf_map_lookup_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
@@ -51,6 +52,7 @@ BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 const struct bpf_func_proto bpf_map_update_elem_proto = {
 	.func		= bpf_map_update_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
@@ -67,6 +69,7 @@ BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
 const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.func		= bpf_map_delete_elem,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc138f3..3a75ee3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -196,6 +196,7 @@ struct verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
+	bool seen_direct_write;
 };
 
 #define BPF_COMPLEXITY_LIMIT_INSNS	65536
@@ -204,6 +205,7 @@ struct verifier_env {
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
+	bool pkt_access;
 	int regno;
 	int access_size;
 };
@@ -654,10 +656,17 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 
 #define MAX_PACKET_OFF 0xffff
 
-static bool may_write_pkt_data(enum bpf_prog_type type)
+static bool may_access_direct_pkt_data(struct verifier_env *env,
+				       const struct bpf_call_arg_meta *meta)
 {
-	switch (type) {
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+		if (meta)
+			return meta->pkt_access;
+
+		env->seen_direct_write = true;
 		return true;
 	default:
 		return false;
@@ -817,7 +826,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -950,8 +959,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_write_pkt_data(env->prog->type)) {
-		verbose("helper access to the packet is not allowed for clsact\n");
+	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
 
@@ -1191,6 +1200,7 @@ static int check_call(struct verifier_env *env, int func_id)
 	changes_data = bpf_helper_changes_skb_data(fn->func);
 
 	memset(&meta, 0, sizeof(meta));
+	meta.pkt_access = fn->pkt_access;
 
 	/* We only support one arg being in raw mode at the moment, which
 	 * is sufficient for the helper functions we have right now.
@@ -2675,18 +2685,35 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
  */
 static int convert_ctx_accesses(struct verifier_env *env)
 {
-	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
-	struct bpf_insn insn_buf[16];
+	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i;
+	int i, insn_cnt, cnt;
 
-	if (!env->prog->aux->ops->convert_ctx_access)
+	if (ops->gen_prologue) {
+		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+					env->prog);
+		if (cnt >= ARRAY_SIZE(insn_buf)) {
+			verbose("bpf verifier is misconfigured\n");
+			return -EINVAL;
+		} else if (cnt) {
+			new_prog = bpf_patch_insn_single(env->prog, 0,
+							 insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+			env->prog = new_prog;
+		}
+	}
+
+	if (!ops->convert_ctx_access)
 		return 0;
 
+	insn_cnt = env->prog->len;
+	insn = env->prog->insnsi;
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		u32 insn_delta, cnt;
+		u32 insn_delta;
 
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
@@ -2703,9 +2730,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			continue;
 		}
 
-		cnt = env->prog->aux->ops->
-			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					   insn->off, insn_buf, env->prog);
+		cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+					      insn->off, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index 298b146..0920c2a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1362,6 +1362,11 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 	return err;
 }
 
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+	return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
 static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
 {
 	if (skb_at_tc_ingress(skb))
@@ -1441,6 +1446,28 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_STACK_SIZE,
 };
 
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+	/* Idea is the following: should the needed direct read/write
+	 * test fail during runtime, we can pull in more data and redo
+	 * again, since implicitly, we invalidate previous checks here.
+	 *
+	 * Or, since we know how much we need to make read/writeable,
+	 * this can be done once at the program beginning for direct
+	 * access case. By this we overcome limitations of only current
+	 * headroom being accessible.
+	 */
+	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+	.func		= bpf_skb_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
 	   u64, from, u64, to, u64, flags)
 {
@@ -1567,6 +1594,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
 static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.func		= bpf_csum_diff,
 	.gpl_only	= false,
+	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_STACK,
 	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1575,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+	/* The interface is to be used in combination with bpf_csum_diff()
+	 * for direct packet writes. csum rotation for alignment as well
+	 * as emulating csum_sub() can be done from the eBPF program.
+	 */
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		return (skb->csum = csum_add(skb->csum, csum));
+
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+	.func		= bpf_csum_update,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
 {
 	return dev_forward_skb(dev, skb);
@@ -1602,6 +1650,8 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 {
 	struct net_device *dev;
+	struct sk_buff *clone;
+	int ret;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return -EINVAL;
@@ -1610,14 +1660,25 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
 	if (unlikely(!dev))
 		return -EINVAL;
 
-	skb = skb_clone(skb, GFP_ATOMIC);
-	if (unlikely(!skb))
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (unlikely(!clone))
 		return -ENOMEM;
 
-	bpf_push_mac_rcsum(skb);
+	/* For direct write, we need to keep the invariant that the skbs
+	 * we're dealing with need to be uncloned. Should uncloning fail
+	 * here, we need to free the just generated clone to unclone once
+	 * again.
+	 */
+	ret = bpf_try_make_head_writable(skb);
+	if (unlikely(ret)) {
+		kfree_skb(clone);
+		return -ENOMEM;
+	}
+
+	bpf_push_mac_rcsum(clone);
 
 	return flags & BPF_F_INGRESS ?
-	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+	       __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone);
 }
 
 static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -2063,19 +2124,14 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 
 bool bpf_helper_changes_skb_data(void *func)
 {
-	if (func == bpf_skb_vlan_push)
-		return true;
-	if (func == bpf_skb_vlan_pop)
-		return true;
-	if (func == bpf_skb_store_bytes)
-		return true;
-	if (func == bpf_skb_change_proto)
-		return true;
-	if (func == bpf_skb_change_tail)
-		return true;
-	if (func == bpf_l3_csum_replace)
-		return true;
-	if (func == bpf_l4_csum_replace)
+	if (func == bpf_skb_vlan_push ||
+	    func == bpf_skb_vlan_pop ||
+	    func == bpf_skb_store_bytes ||
+	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_tail ||
+	    func == bpf_skb_pull_data ||
+	    func == bpf_l3_csum_replace ||
+	    func == bpf_l4_csum_replace)
 		return true;
 
 	return false;
@@ -2440,8 +2496,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
 	case BPF_FUNC_csum_diff:
 		return &bpf_csum_diff_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
@@ -2533,6 +2593,45 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+			       const struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	if (!direct_write)
+		return 0;
+
+	/* if (!skb->cloned)
+	 *       goto start;
+	 *
+	 * (Fast-path, otherwise approximation that we might be
+	 *  a clone, do the rest in helper.)
+	 */
+	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+	/* ret = bpf_skb_pull_data(skb, 0); */
+	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			       BPF_FUNC_skb_pull_data);
+	/* if (!ret)
+	 *      goto restore;
+	 * return TC_ACT_SHOT;
+	 */
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+	*insn++ = BPF_EXIT_INSN();
+
+	/* restore: */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+	/* start: */
+	*insn++ = prog->insnsi[0];
+
+	return insn - insn_buf;
+}
+
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
 				       enum bpf_reg_type *reg_type)
@@ -2810,6 +2909,7 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
 };
 
 static const struct bpf_verifier_ops xdp_ops = {
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 3/3] bpf: add test cases for direct packet access
From: Daniel Borkmann @ 2016-09-19 22:26 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, tgraf, jakub.kicinski, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1474323281.git.daniel@iogearbox.net>

Add couple of test cases for direct write and the negative size issue, and
also adjust the direct packet access test4 since it asserts that writes are
not possible, but since we've just added support for writes, we need to
invert the verdict to ACCEPT, of course. Summary: 133 PASSED, 0 FAILED.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/test_verifier.c | 433 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 430 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 1f6cc9b..ac590d4 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -291,6 +291,29 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
+		"invalid argument register",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R1 !read_ok",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"non-invalid argument register",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
 		"check valid spill/fill",
 		.insns = {
 			/* spill R1(ctx) into stack */
@@ -1210,6 +1233,54 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
+		"raw_stack: skb_load_bytes, negative len",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid stack type R3",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"raw_stack: skb_load_bytes, negative len 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_4, ~0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid stack type R3",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"raw_stack: skb_load_bytes, zero len",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid stack type R3",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
 		"raw_stack: skb_load_bytes, no init",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_2, 4),
@@ -1511,7 +1582,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
 	},
 	{
-		"direct packet access: test4",
+		"direct packet access: test4 (write)",
 		.insns = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
 				    offsetof(struct __sk_buff, data)),
@@ -1524,8 +1595,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "cannot write",
-		.result = REJECT,
+		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
@@ -1631,6 +1701,26 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
+		"direct packet access: test10 (write invalid)",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid access to packet",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
 		"helper access to packet: test1, valid packet_ptr range",
 		.insns = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
@@ -1736,6 +1826,343 @@ static struct bpf_test tests[] = {
 		.errstr = "invalid access to packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"helper access to packet: test6, cls valid packet_ptr range",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 5),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {5},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test7, cls unchecked packet_ptr",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {1},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test8, cls variable add",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+					offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+					offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 10),
+			BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_2, 0),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_5),
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_4),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_4),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {11},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test9, cls packet_ptr with bad range",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {7},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test10, cls packet_ptr with too short range",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {6},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test11, cls unsuitable helper 1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_7, 4),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 42),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_store_bytes),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "helper access to the packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test12, cls unsuitable helper 2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 3),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 4),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "helper access to the packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test13, cls helper ok",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test14, cls helper fail sub",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 4),
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "type=inv expected=fp",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test15, cls helper fail range 1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, 8),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test16, cls helper fail range 2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, -9),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test17, cls helper fail range 3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, ~0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test18, cls helper fail range zero",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test19, pkt end as input",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "R1 type=pkt_end expected=fp",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"helper access to packet: test20, wrong reg",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+			BPF_MOV64_IMM(BPF_REG_2, 4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid access to packet",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
 };
 
 static int probe_filter_length(struct bpf_insn *fp)
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 1/3] bpf, verifier: enforce larger zero range for pkt on overloading stack buffs
From: Daniel Borkmann @ 2016-09-19 22:26 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, tgraf, jakub.kicinski, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1474323281.git.daniel@iogearbox.net>

Current contract for the following two helper argument types is:

  * ARG_CONST_STACK_SIZE: passed argument pair must be (ptr, >0).
  * ARG_CONST_STACK_SIZE_OR_ZERO: passed argument pair can be either
    (NULL, 0) or (ptr, >0).

With 6841de8b0d03 ("bpf: allow helpers access the packet directly"), we can
pass also raw packet data to helpers, so depending on the argument type
being PTR_TO_PACKET, we now either assert memory via check_packet_access()
or check_stack_boundary(). As a result, the tests in check_packet_access()
currently allow more than intended with regards to reg->imm.

Back in 969bf05eb3ce ("bpf: direct packet access"), check_packet_access()
was fine to ignore size argument since in check_mem_access() size was
bpf_size_to_bytes() derived and prior to the call to check_packet_access()
guaranteed to be larger than zero.

However, for the above two argument types, it currently means, we can have
a <= 0 size and thus breaking current guarantees for helpers. Enforce a
check for size <= 0 and bail out if so.

check_stack_boundary() doesn't have such an issue since it already tests
for access_size <= 0 and bails out, resp. access_size == 0 in case of NULL
pointer passed when allowed.

Fixes: 6841de8b0d03 ("bpf: allow helpers access the packet directly")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 Affected commit sits in net-next only.

 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 90493a6..bc138f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -671,7 +671,7 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 	struct reg_state *reg = &regs[regno];
 
 	off += reg->off;
-	if (off < 0 || off + size > reg->range) {
+	if (off < 0 || size <= 0 || off + size > reg->range) {
 		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 0/3] BPF direct packet access improvements
From: Daniel Borkmann @ 2016-09-19 22:26 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, tgraf, jakub.kicinski, netdev,
	Daniel Borkmann

This set adds write support to the currently available read support
for {cls,act}_bpf programs. First one is a fix for affected commit
sitting in net-next and prerequisite for the second one, last patch
adds a number of test cases against the verifier. For details, please
see individual patches.

Thanks!

Daniel Borkmann (3):
  bpf, verifier: enforce larger zero range for pkt on overloading stack buffs
  bpf: direct packet write and access for helpers for clsact progs
  bpf: add test cases for direct packet access

 include/linux/bpf.h         |   4 +-
 include/linux/skbuff.h      |  14 +-
 include/uapi/linux/bpf.h    |  21 +++
 kernel/bpf/helpers.c        |   3 +
 kernel/bpf/verifier.c       |  56 ++++--
 net/core/filter.c           | 134 ++++++++++++--
 samples/bpf/test_verifier.c | 433 +++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 627 insertions(+), 38 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next 0/2] Preparation for mv88e6390
From: Andrew Lunn @ 2016-09-19 22:18 UTC (permalink / raw)
  To: David Miller; +Cc: Vivien Didelot, netdev, Andrew Lunn

These two patches are a couple of preparation steps for supporting the
the MV88E6390 family of chips. This is a new generation from Marvell,
and will need more feature flags than are currently available in an
unsigned long. Expand to an unsigned long long. The MV88E6390 also
places its port registers somewhere else, so add a wrapper around port
register access.

Andrew Lunn (2):
  net: dsa: mv88e6xxx: Convert flag bits to unsigned long long
  WIP: net: dsa: mv88e6xxx: Implement interrupt support.

 drivers/net/dsa/mv88e6xxx/chip.c      | 174 ++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.c   | 128 ++++++++++++++++++++++++-
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  90 +++++++++++-------
 3 files changed, 359 insertions(+), 33 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [PATCH net-next 1/2] net: dsa: mv88e6xxx: Add helper for accessing port registers
From: Andrew Lunn @ 2016-09-19 22:18 UTC (permalink / raw)
  To: David Miller; +Cc: Vivien Didelot, netdev, Andrew Lunn
In-Reply-To: <1474323521-5088-1-git-send-email-andrew@lunn.ch>

There is a device coming soon which places its port registers
somewhere different to all other Marvell switches supported so far.
Add helper functions for reading/writing port registers, making it
easier to handle this new device.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
---
 drivers/net/dsa/mv88e6xxx/chip.c      | 394 ++++++++++++++++++----------------
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |   1 -
 2 files changed, 206 insertions(+), 189 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 70a812d159c9..58b8b94d278e 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -41,6 +41,11 @@ static void assert_reg_lock(struct mv88e6xxx_chip *chip)
 	}
 }
 
+static int mv88e6xxx_reg_port(struct mv88e6xxx_chip *chip, int port)
+{
+	return chip->info->port_base_addr + port;
+}
+
 /* The switch ADDR[4:1] configuration pins define the chip SMI device address
  * (ADDR[0] is always zero, thus only even SMI addresses can be strapped).
  *
@@ -216,6 +221,42 @@ int mv88e6xxx_write(struct mv88e6xxx_chip *chip, int addr, int reg, u16 val)
 	return 0;
 }
 
+int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
+			u16 *val)
+{
+	int addr = mv88e6xxx_reg_port(chip, port);
+	int err;
+
+	assert_reg_lock(chip);
+
+	err = mv88e6xxx_smi_read(chip, addr, reg, val);
+	if (err)
+		return err;
+
+	dev_dbg(chip->dev, "<- port: 0x%.2x reg: 0x%.2x val: 0x%.4x\n",
+		port, reg, *val);
+
+	return 0;
+}
+
+int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
+			 u16 val)
+{
+	int addr = mv88e6xxx_reg_port(chip, port);
+	int err;
+
+	assert_reg_lock(chip);
+
+	err = mv88e6xxx_smi_write(chip, addr, reg, val);
+	if (err)
+		return err;
+
+	dev_dbg(chip->dev, "-> port: 0x%.2x reg: 0x%.2x val: 0x%.4x\n",
+		port, reg, val);
+
+	return 0;
+}
+
 static int mv88e6xxx_phy_read(struct mv88e6xxx_chip *chip, int phy,
 			      int reg, u16 *val)
 {
@@ -585,19 +626,19 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, int port,
 				  struct phy_device *phydev)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
-	u32 reg;
-	int ret;
+	u16 reg;
+	int err;
 
 	if (!phy_is_pseudo_fixed_link(phydev))
 		return;
 
 	mutex_lock(&chip->reg_lock);
 
-	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_PCS_CTRL);
-	if (ret < 0)
+	err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, &reg);
+	if (err < 0)
 		goto out;
 
-	reg = ret & ~(PORT_PCS_CTRL_LINK_UP |
+	reg = reg & ~(PORT_PCS_CTRL_LINK_UP |
 		      PORT_PCS_CTRL_FORCE_LINK |
 		      PORT_PCS_CTRL_DUPLEX_FULL |
 		      PORT_PCS_CTRL_FORCE_DUPLEX |
@@ -639,7 +680,7 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, int port,
 			reg |= (PORT_PCS_CTRL_RGMII_DELAY_RXCLK |
 				PORT_PCS_CTRL_RGMII_DELAY_TXCLK);
 	}
-	_mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_PCS_CTRL, reg);
+	mv88e6xxx_port_write(chip, port, PORT_PCS_CTRL, reg);
 
 out:
 	mutex_unlock(&chip->reg_lock);
@@ -799,22 +840,22 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
 {
 	u32 low;
 	u32 high = 0;
-	int ret;
+	int err;
+	u16 reg;
 	u64 value;
 
 	switch (s->type) {
 	case PORT:
-		ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), s->reg);
-		if (ret < 0)
+		err = mv88e6xxx_port_read(chip, port, s->reg, &reg);
+		if (err < 0)
 			return UINT64_MAX;
 
-		low = ret;
+		low = reg;
 		if (s->sizeof_stat == 4) {
-			ret = _mv88e6xxx_reg_read(chip, REG_PORT(port),
-						  s->reg + 1);
-			if (ret < 0)
+			err = mv88e6xxx_port_read(chip, port, s->reg + 1, &reg);
+			if (err < 0)
 				return UINT64_MAX;
-			high = ret;
+			high = reg;
 		}
 		break;
 	case BANK0:
@@ -893,6 +934,8 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port,
 			       struct ethtool_regs *regs, void *_p)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
+	int err;
+	u16 reg;
 	u16 *p = _p;
 	int i;
 
@@ -903,11 +946,10 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port,
 	mutex_lock(&chip->reg_lock);
 
 	for (i = 0; i < 32; i++) {
-		int ret;
 
-		ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), i);
-		if (ret >= 0)
-			p[i] = ret;
+		err = mv88e6xxx_port_read(chip, port, i, &reg);
+		if (!err)
+			p[i] = reg;
 	}
 
 	mutex_unlock(&chip->reg_lock);
@@ -938,7 +980,7 @@ static int mv88e6xxx_get_eee(struct dsa_switch *ds, int port,
 	e->eee_enabled = !!(reg & 0x0200);
 	e->tx_lpi_enabled = !!(reg & 0x0100);
 
-	err = mv88e6xxx_read(chip, REG_PORT(port), PORT_STATUS, &reg);
+	err = mv88e6xxx_port_read(chip, port, PORT_STATUS, &reg);
 	if (err)
 		goto out;
 
@@ -1106,12 +1148,13 @@ static int _mv88e6xxx_port_state(struct mv88e6xxx_chip *chip, int port,
 				 u8 state)
 {
 	struct dsa_switch *ds = chip->ds;
-	int reg, ret = 0;
+	u16 reg;
+	int err;
 	u8 oldstate;
 
-	reg = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_CONTROL);
-	if (reg < 0)
-		return reg;
+	err = mv88e6xxx_port_read(chip, port, PORT_CONTROL, &reg);
+	if (err < 0)
+		return err;
 
 	oldstate = reg & PORT_CONTROL_STATE_MASK;
 
@@ -1124,23 +1167,22 @@ static int _mv88e6xxx_port_state(struct mv88e6xxx_chip *chip, int port,
 		     oldstate == PORT_CONTROL_STATE_FORWARDING) &&
 		    (state == PORT_CONTROL_STATE_DISABLED ||
 		     state == PORT_CONTROL_STATE_BLOCKING)) {
-			ret = _mv88e6xxx_atu_remove(chip, 0, port, false);
-			if (ret)
-				return ret;
+			err = _mv88e6xxx_atu_remove(chip, 0, port, false);
+			if (err)
+				return err;
 		}
 
 		reg = (reg & ~PORT_CONTROL_STATE_MASK) | state;
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_CONTROL,
-					   reg);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg);
+		if (err)
+			return err;
 
 		netdev_dbg(ds->ports[port].netdev, "PortState %s (was %s)\n",
 			   mv88e6xxx_port_state_names[state],
 			   mv88e6xxx_port_state_names[oldstate]);
 	}
 
-	return ret;
+	return err;
 }
 
 static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int port)
@@ -1149,7 +1191,8 @@ static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int port)
 	const u16 mask = (1 << chip->info->num_ports) - 1;
 	struct dsa_switch *ds = chip->ds;
 	u16 output_ports = 0;
-	int reg;
+	u16 reg;
+	int err;
 	int i;
 
 	/* allow CPU port or DSA link(s) to send frames to every port */
@@ -1170,14 +1213,14 @@ static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int port)
 	/* prevent frames from going back out of the port they came in on */
 	output_ports &= ~BIT(port);
 
-	reg = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_BASE_VLAN);
-	if (reg < 0)
-		return reg;
+	err = mv88e6xxx_port_read(chip, port, PORT_BASE_VLAN, &reg);
+	if (err < 0)
+		return err;
 
 	reg &= ~mask;
 	reg |= output_ports & mask;
 
-	return _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_BASE_VLAN, reg);
+	return mv88e6xxx_port_write(chip, port, PORT_BASE_VLAN, reg);
 }
 
 static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
@@ -1218,23 +1261,22 @@ static int _mv88e6xxx_port_pvid(struct mv88e6xxx_chip *chip, int port,
 				u16 *new, u16 *old)
 {
 	struct dsa_switch *ds = chip->ds;
-	u16 pvid;
-	int ret;
+	u16 pvid, reg;
+	int err;
 
-	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_DEFAULT_VLAN);
-	if (ret < 0)
-		return ret;
+	err = mv88e6xxx_port_read(chip, port, PORT_DEFAULT_VLAN, &reg);
+	if (err < 0)
+		return err;
 
-	pvid = ret & PORT_DEFAULT_VLAN_MASK;
+	pvid = reg & PORT_DEFAULT_VLAN_MASK;
 
 	if (new) {
-		ret &= ~PORT_DEFAULT_VLAN_MASK;
-		ret |= *new & PORT_DEFAULT_VLAN_MASK;
+		reg &= ~PORT_DEFAULT_VLAN_MASK;
+		reg |= *new & PORT_DEFAULT_VLAN_MASK;
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_DEFAULT_VLAN, ret);
-		if (ret < 0)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, reg);
+		if (err < 0)
+			return err;
 
 		netdev_dbg(ds->ports[port].netdev,
 			   "DefaultVID %d (was %d)\n", *new, pvid);
@@ -1613,7 +1655,8 @@ static int _mv88e6xxx_port_fid(struct mv88e6xxx_chip *chip, int port,
 	struct dsa_switch *ds = chip->ds;
 	u16 upper_mask;
 	u16 fid;
-	int ret;
+	u16 reg;
+	int err;
 
 	if (mv88e6xxx_num_databases(chip) == 4096)
 		upper_mask = 0xff;
@@ -1623,37 +1666,35 @@ static int _mv88e6xxx_port_fid(struct mv88e6xxx_chip *chip, int port,
 		return -EOPNOTSUPP;
 
 	/* Port's default FID bits 3:0 are located in reg 0x06, offset 12 */
-	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_BASE_VLAN);
-	if (ret < 0)
-		return ret;
+	err = mv88e6xxx_port_read(chip, port, PORT_BASE_VLAN, &reg);
+	if (err < 0)
+		return err;
 
-	fid = (ret & PORT_BASE_VLAN_FID_3_0_MASK) >> 12;
+	fid = (reg & PORT_BASE_VLAN_FID_3_0_MASK) >> 12;
 
 	if (new) {
-		ret &= ~PORT_BASE_VLAN_FID_3_0_MASK;
-		ret |= (*new << 12) & PORT_BASE_VLAN_FID_3_0_MASK;
+		reg &= ~PORT_BASE_VLAN_FID_3_0_MASK;
+		reg |= (*new << 12) & PORT_BASE_VLAN_FID_3_0_MASK;
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_BASE_VLAN,
-					   ret);
-		if (ret < 0)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_BASE_VLAN, reg);
+		if (err < 0)
+			return err;
 	}
 
 	/* Port's default FID bits 11:4 are located in reg 0x05, offset 0 */
-	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_CONTROL_1);
-	if (ret < 0)
-		return ret;
+	err = mv88e6xxx_port_read(chip, port, PORT_CONTROL_1, &reg);
+	if (err < 0)
+		return err;
 
-	fid |= (ret & upper_mask) << 4;
+	fid |= (reg & upper_mask) << 4;
 
 	if (new) {
-		ret &= ~upper_mask;
-		ret |= (*new >> 4) & upper_mask;
+		reg &= ~upper_mask;
+		reg |= (*new >> 4) & upper_mask;
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_CONTROL_1,
-					   ret);
-		if (ret < 0)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_1, reg);
+		if (err < 0)
+			return err;
 
 		netdev_dbg(ds->ports[port].netdev,
 			   "FID %d (was %d)\n", *new, fid);
@@ -1865,26 +1906,26 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 	struct mv88e6xxx_chip *chip = ds->priv;
 	u16 old, new = vlan_filtering ? PORT_CONTROL_2_8021Q_SECURE :
 		PORT_CONTROL_2_8021Q_DISABLED;
-	int ret;
+	u16 reg;
+	int err;
 
 	if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_VTU))
 		return -EOPNOTSUPP;
 
 	mutex_lock(&chip->reg_lock);
 
-	ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_CONTROL_2);
-	if (ret < 0)
+	err = mv88e6xxx_port_read(chip, port, PORT_CONTROL_2, &reg);
+	if (err < 0)
 		goto unlock;
 
-	old = ret & PORT_CONTROL_2_8021Q_MASK;
+	old = reg & PORT_CONTROL_2_8021Q_MASK;
 
 	if (new != old) {
-		ret &= ~PORT_CONTROL_2_8021Q_MASK;
-		ret |= new & PORT_CONTROL_2_8021Q_MASK;
+		reg &= ~PORT_CONTROL_2_8021Q_MASK;
+		reg |= new & PORT_CONTROL_2_8021Q_MASK;
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_CONTROL_2,
-					   ret);
-		if (ret < 0)
+		err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_2, reg);
+		if (err < 0)
 			goto unlock;
 
 		netdev_dbg(ds->ports[port].netdev, "802.1Q Mode %s (was %s)\n",
@@ -1892,11 +1933,11 @@ static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 			   mv88e6xxx_port_8021q_mode_names[old]);
 	}
 
-	ret = 0;
+	err = 0;
 unlock:
 	mutex_unlock(&chip->reg_lock);
 
-	return ret;
+	return err;
 }
 
 static int
@@ -2364,19 +2405,19 @@ static int mv88e6xxx_switch_reset(struct mv88e6xxx_chip *chip)
 	u16 is_reset = (ppu_active ? 0x8800 : 0xc800);
 	struct gpio_desc *gpiod = chip->reset;
 	unsigned long timeout;
-	int ret;
+	u16 reg;
+	int err;
 	int i;
 
 	/* Set all ports to the disabled state. */
 	for (i = 0; i < chip->info->num_ports; i++) {
-		ret = _mv88e6xxx_reg_read(chip, REG_PORT(i), PORT_CONTROL);
-		if (ret < 0)
-			return ret;
+		err = mv88e6xxx_port_read(chip, i, PORT_CONTROL, &reg);
+		if (err < 0)
+			return err;
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(i), PORT_CONTROL,
-					   ret & 0xfffc);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, i, PORT_CONTROL, reg & 0xfffc);
+		if (err)
+			return err;
 	}
 
 	/* Wait for transmit queues to drain. */
@@ -2395,29 +2436,29 @@ static int mv88e6xxx_switch_reset(struct mv88e6xxx_chip *chip)
 	 * through global registers 0x18 and 0x19.
 	 */
 	if (ppu_active)
-		ret = _mv88e6xxx_reg_write(chip, REG_GLOBAL, 0x04, 0xc000);
+		err = _mv88e6xxx_reg_write(chip, REG_GLOBAL, 0x04, 0xc000);
 	else
-		ret = _mv88e6xxx_reg_write(chip, REG_GLOBAL, 0x04, 0xc400);
-	if (ret)
-		return ret;
+		err = _mv88e6xxx_reg_write(chip, REG_GLOBAL, 0x04, 0xc400);
+	if (err)
+		return err;
 
 	/* Wait up to one second for reset to complete. */
 	timeout = jiffies + 1 * HZ;
 	while (time_before(jiffies, timeout)) {
-		ret = _mv88e6xxx_reg_read(chip, REG_GLOBAL, 0x00);
-		if (ret < 0)
-			return ret;
+		err = _mv88e6xxx_reg_read(chip, REG_GLOBAL, 0x00);
+		if (err < 0)
+			return err;
 
-		if ((ret & is_reset) == is_reset)
+		if ((err & is_reset) == is_reset)
 			break;
 		usleep_range(1000, 2000);
 	}
 	if (time_after(jiffies, timeout))
-		ret = -ETIMEDOUT;
+		err = -ETIMEDOUT;
 	else
-		ret = 0;
+		err = 0;
 
-	return ret;
+	return err;
 }
 
 static int mv88e6xxx_serdes_power_on(struct mv88e6xxx_chip *chip)
@@ -2438,21 +2479,10 @@ static int mv88e6xxx_serdes_power_on(struct mv88e6xxx_chip *chip)
 	return err;
 }
 
-static int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port,
-			       int reg, u16 *val)
-{
-	int addr = chip->info->port_base_addr + port;
-
-	if (port >= chip->info->num_ports)
-		return -EINVAL;
-
-	return mv88e6xxx_read(chip, addr, reg, val);
-}
-
 static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 {
 	struct dsa_switch *ds = chip->ds;
-	int ret;
+	int err;
 	u16 reg;
 
 	if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
@@ -2465,7 +2495,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 		 * and all DSA ports to their maximum bandwidth and
 		 * full duplex.
 		 */
-		reg = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_PCS_CTRL);
+		err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, &reg);
 		if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
 			reg &= ~PORT_PCS_CTRL_UNFORCED;
 			reg |= PORT_PCS_CTRL_FORCE_LINK |
@@ -2480,10 +2510,9 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 			reg |= PORT_PCS_CTRL_UNFORCED;
 		}
 
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_PCS_CTRL, reg);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_PCS_CTRL, reg);
+		if (err)
+			return err;
 	}
 
 	/* Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
@@ -2534,26 +2563,25 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 				PORT_CONTROL_FORWARD_UNKNOWN_MC;
 	}
 	if (reg) {
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_CONTROL, reg);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg);
+		if (err)
+			return err;
 	}
 
 	/* If this port is connected to a SerDes, make sure the SerDes is not
 	 * powered down.
 	 */
 	if (mv88e6xxx_has(chip, MV88E6XXX_FLAGS_SERDES)) {
-		ret = _mv88e6xxx_reg_read(chip, REG_PORT(port), PORT_STATUS);
-		if (ret < 0)
-			return ret;
-		ret &= PORT_STATUS_CMODE_MASK;
-		if ((ret == PORT_STATUS_CMODE_100BASE_X) ||
-		    (ret == PORT_STATUS_CMODE_1000BASE_X) ||
-		    (ret == PORT_STATUS_CMODE_SGMII)) {
-			ret = mv88e6xxx_serdes_power_on(chip);
-			if (ret < 0)
-				return ret;
+		err = mv88e6xxx_port_read(chip, port, PORT_STATUS, &reg);
+		if (err < 0)
+			return err;
+		reg &= PORT_STATUS_CMODE_MASK;
+		if ((reg == PORT_STATUS_CMODE_100BASE_X) ||
+		    (reg == PORT_STATUS_CMODE_1000BASE_X) ||
+		    (reg == PORT_STATUS_CMODE_SGMII)) {
+			err = mv88e6xxx_serdes_power_on(chip);
+			if (err < 0)
+				return err;
 		}
 	}
 
@@ -2587,10 +2615,9 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 	reg |= PORT_CONTROL_2_8021Q_DISABLED;
 
 	if (reg) {
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_CONTROL_2, reg);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_2, reg);
+		if (err)
+			return err;
 	}
 
 	/* Port Association Vector: when learning source addresses
@@ -2603,16 +2630,14 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 	if (dsa_is_cpu_port(ds, port))
 		reg = 0;
 
-	ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_ASSOC_VECTOR,
-				   reg);
-	if (ret)
-		return ret;
+	err = mv88e6xxx_port_write(chip, port, PORT_ASSOC_VECTOR, reg);
+	if (err)
+		return err;
 
 	/* Egress rate control 2: disable egress rate control. */
-	ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_RATE_CONTROL_2,
-				   0x0000);
-	if (ret)
-		return ret;
+	err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL_2, 0x0000);
+	if (err)
+		return err;
 
 	if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
 	    mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
@@ -2621,96 +2646,89 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port)
 		 * be paused for by the remote end or the period of
 		 * time that this port can pause the remote end.
 		 */
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_PAUSE_CTRL, 0x0000);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_PAUSE_CTRL, 0x0000);
+		if (err)
+			return err;
 
 		/* Port ATU control: disable limiting the number of
 		 * address database entries that this port is allowed
 		 * to use.
 		 */
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_ATU_CONTROL, 0x0000);
+		err = mv88e6xxx_port_write(chip, port, PORT_ATU_CONTROL,
+					   0x0000);
 		/* Priority Override: disable DA, SA and VTU priority
 		 * override.
 		 */
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_PRI_OVERRIDE, 0x0000);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_PRI_OVERRIDE,
+					   0x0000);
+		if (err)
+			return err;
 
 		/* Port Ethertype: use the Ethertype DSA Ethertype
 		 * value.
 		 */
 		if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_EDSA)) {
-			ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-						   PORT_ETH_TYPE, ETH_P_EDSA);
-			if (ret)
-				return ret;
+			err = mv88e6xxx_port_write(chip, port, PORT_ETH_TYPE,
+						   ETH_P_EDSA);
+			if (err)
+				return err;
 		}
 
 		/* Tag Remap: use an identity 802.1p prio -> switch
 		 * prio mapping.
 		 */
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_TAG_REGMAP_0123, 0x3210);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_0123,
+					   0x3210);
+		if (err)
+			return err;
 
 		/* Tag Remap 2: use an identity 802.1p prio -> switch
 		 * prio mapping.
 		 */
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_TAG_REGMAP_4567, 0x7654);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_TAG_REGMAP_4567,
+					   0x7654);
+		if (err)
+			return err;
 	}
 
 	/* Rate Control: disable ingress rate limiting. */
 	if (mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip) ||
 	    mv88e6xxx_6165_family(chip) || mv88e6xxx_6097_family(chip) ||
 	    mv88e6xxx_6320_family(chip)) {
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_RATE_CONTROL, 0x0001);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL,
+					   0x0001);
+		if (err)
+			return err;
 	} else if (mv88e6xxx_6185_family(chip) || mv88e6xxx_6095_family(chip)) {
-		ret = _mv88e6xxx_reg_write(chip, REG_PORT(port),
-					   PORT_RATE_CONTROL, 0x0000);
-		if (ret)
-			return ret;
+		err = mv88e6xxx_port_write(chip, port, PORT_RATE_CONTROL,
+					   0x0000);
+		if (err)
+			return err;
 	}
 
 	/* Port Control 1: disable trunking, disable sending
 	 * learning messages to this port.
 	 */
-	ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_CONTROL_1,
-				   0x0000);
-	if (ret)
-		return ret;
+	err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_1, 0x0000);
+	if (err)
+		return err;
 
 	/* Port based VLAN map: give each port the same default address
 	 * database, and allow bidirectional communication between the
 	 * CPU and DSA port(s), and the other ports.
 	 */
-	ret = _mv88e6xxx_port_fid_set(chip, port, 0);
-	if (ret)
-		return ret;
+	err = _mv88e6xxx_port_fid_set(chip, port, 0);
+	if (err)
+		return err;
 
-	ret = _mv88e6xxx_port_based_vlan_map(chip, port);
-	if (ret)
-		return ret;
+	err = _mv88e6xxx_port_based_vlan_map(chip, port);
+	if (err)
+		return err;
 
 	/* Default VLAN ID and priority: don't set a default VLAN
 	 * ID, and set the default packet priority to zero.
 	 */
-	ret = _mv88e6xxx_reg_write(chip, REG_PORT(port), PORT_DEFAULT_VLAN,
-				   0x0000);
-	if (ret)
-		return ret;
-
-	return 0;
+	return mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, 0x0000);
 }
 
 static int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr)
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index 52f3f5231f51..e349d0d64645 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -37,7 +37,6 @@
 #define ADDR_SERDES		0x0f
 #define SERDES_PAGE_FIBER	0x01
 
-#define REG_PORT(p)		(0x10 + (p))
 #define PORT_STATUS		0x00
 #define PORT_STATUS_PAUSE_EN	BIT(15)
 #define PORT_STATUS_MY_PAUSE	BIT(14)
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 2/2] net: dsa: mv88e6xxx: Convert flag bits to unsigned long long
From: Andrew Lunn @ 2016-09-19 22:18 UTC (permalink / raw)
  To: David Miller; +Cc: Vivien Didelot, netdev, Andrew Lunn
In-Reply-To: <1474323521-5088-1-git-send-email-andrew@lunn.ch>

We are soon going to run out of flag bits on 32bit systems. Convert to
unsigned long long.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
---
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 62 +++++++++++++++++------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index e349d0d64645..827988397fd8 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -452,36 +452,36 @@ enum mv88e6xxx_cap {
 };
 
 /* Bitmask of capabilities */
-#define MV88E6XXX_FLAG_EDSA		BIT(MV88E6XXX_CAP_EDSA)
-#define MV88E6XXX_FLAG_EEE		BIT(MV88E6XXX_CAP_EEE)
-
-#define MV88E6XXX_FLAG_SMI_CMD		BIT(MV88E6XXX_CAP_SMI_CMD)
-#define MV88E6XXX_FLAG_SMI_DATA		BIT(MV88E6XXX_CAP_SMI_DATA)
-
-#define MV88E6XXX_FLAG_PHY_PAGE		BIT(MV88E6XXX_CAP_PHY_PAGE)
-
-#define MV88E6XXX_FLAG_SERDES		BIT(MV88E6XXX_CAP_SERDES)
-
-#define MV88E6XXX_FLAG_GLOBAL2		BIT(MV88E6XXX_CAP_GLOBAL2)
-#define MV88E6XXX_FLAG_G2_MGMT_EN_2X	BIT(MV88E6XXX_CAP_G2_MGMT_EN_2X)
-#define MV88E6XXX_FLAG_G2_MGMT_EN_0X	BIT(MV88E6XXX_CAP_G2_MGMT_EN_0X)
-#define MV88E6XXX_FLAG_G2_IRL_CMD	BIT(MV88E6XXX_CAP_G2_IRL_CMD)
-#define MV88E6XXX_FLAG_G2_IRL_DATA	BIT(MV88E6XXX_CAP_G2_IRL_DATA)
-#define MV88E6XXX_FLAG_G2_PVT_ADDR	BIT(MV88E6XXX_CAP_G2_PVT_ADDR)
-#define MV88E6XXX_FLAG_G2_PVT_DATA	BIT(MV88E6XXX_CAP_G2_PVT_DATA)
-#define MV88E6XXX_FLAG_G2_SWITCH_MAC	BIT(MV88E6XXX_CAP_G2_SWITCH_MAC)
-#define MV88E6XXX_FLAG_G2_POT		BIT(MV88E6XXX_CAP_G2_POT)
-#define MV88E6XXX_FLAG_G2_EEPROM_CMD	BIT(MV88E6XXX_CAP_G2_EEPROM_CMD)
-#define MV88E6XXX_FLAG_G2_EEPROM_DATA	BIT(MV88E6XXX_CAP_G2_EEPROM_DATA)
-#define MV88E6XXX_FLAG_G2_SMI_PHY_CMD	BIT(MV88E6XXX_CAP_G2_SMI_PHY_CMD)
-#define MV88E6XXX_FLAG_G2_SMI_PHY_DATA	BIT(MV88E6XXX_CAP_G2_SMI_PHY_DATA)
-
-#define MV88E6XXX_FLAG_PPU		BIT(MV88E6XXX_CAP_PPU)
-#define MV88E6XXX_FLAG_PPU_ACTIVE	BIT(MV88E6XXX_CAP_PPU_ACTIVE)
-#define MV88E6XXX_FLAG_STU		BIT(MV88E6XXX_CAP_STU)
-#define MV88E6XXX_FLAG_TEMP		BIT(MV88E6XXX_CAP_TEMP)
-#define MV88E6XXX_FLAG_TEMP_LIMIT	BIT(MV88E6XXX_CAP_TEMP_LIMIT)
-#define MV88E6XXX_FLAG_VTU		BIT(MV88E6XXX_CAP_VTU)
+#define MV88E6XXX_FLAG_EDSA		BIT_ULL(MV88E6XXX_CAP_EDSA)
+#define MV88E6XXX_FLAG_EEE		BIT_ULL(MV88E6XXX_CAP_EEE)
+
+#define MV88E6XXX_FLAG_SMI_CMD		BIT_ULL(MV88E6XXX_CAP_SMI_CMD)
+#define MV88E6XXX_FLAG_SMI_DATA		BIT_ULL(MV88E6XXX_CAP_SMI_DATA)
+
+#define MV88E6XXX_FLAG_PHY_PAGE		BIT_ULL(MV88E6XXX_CAP_PHY_PAGE)
+
+#define MV88E6XXX_FLAG_SERDES		BIT_ULL(MV88E6XXX_CAP_SERDES)
+
+#define MV88E6XXX_FLAG_GLOBAL2		BIT_ULL(MV88E6XXX_CAP_GLOBAL2)
+#define MV88E6XXX_FLAG_G2_MGMT_EN_2X	BIT_ULL(MV88E6XXX_CAP_G2_MGMT_EN_2X)
+#define MV88E6XXX_FLAG_G2_MGMT_EN_0X	BIT_ULL(MV88E6XXX_CAP_G2_MGMT_EN_0X)
+#define MV88E6XXX_FLAG_G2_IRL_CMD	BIT_ULL(MV88E6XXX_CAP_G2_IRL_CMD)
+#define MV88E6XXX_FLAG_G2_IRL_DATA	BIT_ULL(MV88E6XXX_CAP_G2_IRL_DATA)
+#define MV88E6XXX_FLAG_G2_PVT_ADDR	BIT_ULL(MV88E6XXX_CAP_G2_PVT_ADDR)
+#define MV88E6XXX_FLAG_G2_PVT_DATA	BIT_ULL(MV88E6XXX_CAP_G2_PVT_DATA)
+#define MV88E6XXX_FLAG_G2_SWITCH_MAC	BIT_ULL(MV88E6XXX_CAP_G2_SWITCH_MAC)
+#define MV88E6XXX_FLAG_G2_POT		BIT_ULL(MV88E6XXX_CAP_G2_POT)
+#define MV88E6XXX_FLAG_G2_EEPROM_CMD	BIT_ULL(MV88E6XXX_CAP_G2_EEPROM_CMD)
+#define MV88E6XXX_FLAG_G2_EEPROM_DATA	BIT_ULL(MV88E6XXX_CAP_G2_EEPROM_DATA)
+#define MV88E6XXX_FLAG_G2_SMI_PHY_CMD	BIT_ULL(MV88E6XXX_CAP_G2_SMI_PHY_CMD)
+#define MV88E6XXX_FLAG_G2_SMI_PHY_DATA	BIT_ULL(MV88E6XXX_CAP_G2_SMI_PHY_DATA)
+
+#define MV88E6XXX_FLAG_PPU		BIT_ULL(MV88E6XXX_CAP_PPU)
+#define MV88E6XXX_FLAG_PPU_ACTIVE	BIT_ULL(MV88E6XXX_CAP_PPU_ACTIVE)
+#define MV88E6XXX_FLAG_STU		BIT_ULL(MV88E6XXX_CAP_STU)
+#define MV88E6XXX_FLAG_TEMP		BIT_ULL(MV88E6XXX_CAP_TEMP)
+#define MV88E6XXX_FLAG_TEMP_LIMIT	BIT_ULL(MV88E6XXX_CAP_TEMP_LIMIT)
+#define MV88E6XXX_FLAG_VTU		BIT_ULL(MV88E6XXX_CAP_VTU)
 
 /* EEPROM Programming via Global2 with 16-bit data */
 #define MV88E6XXX_FLAGS_EEPROM16	\
@@ -614,7 +614,7 @@ struct mv88e6xxx_info {
 	unsigned int num_ports;
 	unsigned int port_base_addr;
 	unsigned int age_time_coeff;
-	unsigned long flags;
+	unsigned long long flags;
 };
 
 struct mv88e6xxx_atu_entry {
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH] igb: mark igb_rxnfc_write_vlan_prio_filter() static
From: Jeff Kirsher @ 2016-09-19 22:16 UTC (permalink / raw)
  To: Baoyou Xie, intel-wired-lan; +Cc: netdev, linux-kernel, arnd, xie.baoyou
In-Reply-To: <1474188628-420-1-git-send-email-baoyou.xie@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 684 bytes --]

On Sun, 2016-09-18 at 16:50 +0800, Baoyou Xie wrote:
> We get 1 warning when building kernel with W=1:
> drivers/net/ethernet/intel/igb/igb_ethtool.c:2707:5: warning: no previous
> prototype for 'igb_rxnfc_write_vlan_prio_filter' [-Wmissing-prototypes]
> 
> In fact, this function is only used in the file in which it is
> declared and don't need a declaration, but can be made static.
> so this patch marks this function with 'static'.
> 
> Signed-off-by: Baoyou Xie <baoyou.xie@linaro.org>
> ---
>  drivers/net/ethernet/intel/igb/igb_ethtool.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Already fixed, see http://patchwork.ozlabs.org/patch/661904/

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCHv6 net-next 04/15] bpf: don't (ab)use instructions to store state
From: Daniel Borkmann @ 2016-09-19 21:55 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, ast, kubakici
In-Reply-To: <20160919224812.7a4b90b8@jkicinski-Precision-T1700>

On 09/19/2016 11:48 PM, Jakub Kicinski wrote:
> On Mon, 19 Sep 2016 23:44:50 +0200, Daniel Borkmann wrote:
>> On 09/19/2016 11:36 PM, Jakub Kicinski wrote:
>>> On Mon, 19 Sep 2016 23:03:17 +0200, Daniel Borkmann wrote:
>>>> On 09/18/2016 05:09 PM, Jakub Kicinski wrote:
>>>>> Storing state in reserved fields of instructions makes
>>>>> it impossible to run verifier on programs already
>>>>> marked as read-only. Allocate and use an array of
>>>>> per-instruction state instead.
>>>>>
>>>>> While touching the error path rename and move existing
>>>>> jump target.
>>>>>
>>>>> Suggested-by: Alexei Starovoitov <ast@kernel.org>
>>>>> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>>>> Acked-by: Alexei Starovoitov <ast@kernel.org>
>>>>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>>>>
>>>> I believe there's still an issue here. Could you please double check
>>>> and confirm?
>>>>
>>>> I rebased my locally pending stuff on top of your set and suddenly my
>>>> test case breaks. So I did a bisect and it pointed me to this commit
>>>> eventually.
>>>>
>>>> [...]
>>>>> @@ -2697,11 +2706,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
>>>>>     		else
>>>>>     			continue;
>>>>>
>>>>> -		if (insn->imm != PTR_TO_CTX) {
>>>>> -			/* clear internal mark */
>>>>> -			insn->imm = 0;
>>>>> +		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
>>>>>     			continue;
>>>>> -		}
>>>>>
>>>>>     		cnt = env->prog->aux->ops->
>>>>>     			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
>>>>
>>>> Looking at the code, I believe the issue is in above snippet. In the
>>>> convert_ctx_accesses() rewrite loop, each time we bpf_patch_insn_single()
>>>> a program, the program can grow in size (due to __sk_buff access rewrite,
>>>> for example). After rewrite, we do 'i += insn_delta' for adjustment to
>>>> process next insn.
>>>>
>>>> However, env->insn_aux_data is alloced under the assumption that the
>>>> very initial, pre-verification prog->len doesn't change, right? So in
>>>> the above conversion access to env->insn_aux_data[i].ptr_type is off,
>>>> since after rewrites, corresponding mappings to ptr_type might not be
>>>> related anymore.
>>>>
>>>> I noticed this with direct packet access where suddenly the data vs
>>>> data_end test failed and contained some "semi-random" value always
>>>> bailing out for me.
>>>
>>> You are correct.  Should I respin or would you like to post your set? :)
>>
>> Heh, if you don't mind I would go ahead tonight, the conflict at two spots
>> when exposing verifier is really minor turns out. Are you okay with this?
>
> Yes, please go ahead :)

Ok, thanks!

>> What's the plan wrt env->insn_aux_data? Realloc plus rewrite of the array,
>> or do you see a more straight forward solution?
>
> I was thinking about something like this: (untested)

Yep, much better. :)

^ permalink raw reply

* Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
From: Sargun Dhillon @ 2016-09-19 21:53 UTC (permalink / raw)
  To: Daniel Mack
  Cc: Pablo Neira Ayuso, htejun, daniel, ast, davem, kafai, fw, harald,
	netdev, cgroups
In-Reply-To: <1918ef04-f22f-dce3-4bb4-1275936111d0@zonque.org>

On Mon, Sep 19, 2016 at 06:34:28PM +0200, Daniel Mack wrote:
> Hi,
> 
> On 09/16/2016 09:57 PM, Sargun Dhillon wrote:
> > On Wed, Sep 14, 2016 at 01:13:16PM +0200, Daniel Mack wrote:
> 
> >> I have no idea what makes you think this is limited to systemd. As I
> >> said, I provided an example for userspace that works from the command
> >> line. The same limitation apply as for all other users of cgroups.
> >>
> > So, at least in my work, we have Mesos, but on nearly every machine that Mesos 
> > runs, people also have systemd. Now, there's recently become a bit of a battle 
> > of ownership of things like cgroups on these machines. We can usually solve it 
> > by nesting under systemd cgroups, and thus so far we've avoided making too many 
> > systemd-specific concessions.
> > 
> > The reason this works (mostly), is because everything we touch has a sense of 
> > nesting, where we can apply policy at a place lower in the hierarchy, and yet 
> > systemd's monitoring and policy still stays in place. 
> > 
> > Now, with this patch, we don't have that, but I think we can reasonably add some 
> > flag like "no override" when applying policies, or alternatively something like 
> > "no new privileges", to prevent children from applying policies that override 
> > top-level policy.
> 
> Yes, but the API is already guarded by CAP_NET_ADMIN. Take that
> capability away from your children, and they can't tamper with the
> policy. Does that work for you?
> 
No. This can be addressed in a follow-on patch, but the use-case is that I have 
a container orchestrator (Docker, or Mesos), and systemd. The sysadmin controls 
systemd, and Docker is controlled by devs. Typically, the system owner wants 
some system level statistics, and filtering, and then we want to do 
per-container filtering.

We really want to be able to do nesting with userspace tools that are oblivious, 
and we want to delegate a level of the cgroup hierarchy to the tool that created 
it. I do not see Docker integrating with systemd any time soon, and that's 
really the only other alternative.

> > I realize there is a speed concern as well, but I think for 
> > people who want nested policy, we're willing to make the tradeoff. The cost
> > of traversing a few extra pointers still outweighs the overhead of network
> > namespaces, iptables, etc.. for many of us. 
> 
> Not sure. Have you tried it?
> 
Tried nested policies? Yes. I tried nested policy execution with syscalls, and I 
tested with bind and connect. The performance overhead was pretty minimal, but 
latency increased by 100 microseconds+ once the number of BPF hooks increased 
beyond 30. The BPF programs were trivial, and essentially did a map lookup, and 
returned 0.

I don't think that it's just raw cycles / execution time, but I didn't spend 
enough time digging into it to determine the performance hit. I'm waiting
for your patchset to land, and then I plan to work off of it.

> > What do you think Daniel?
> 
> I think we should look at an implementation once we really need it, and
> then revisit the performance impact. In any case, this can be changed
> under the hood, without touching the userspace API (except for adding
> flags if we need them).
> 
+1
> >> Not necessarily. You can as well do it the inetd way, and pass the
> >> socket to a process that is launched on demand, but do SO_ATTACH_FILTER
> >> + SO_LOCK_FILTER  in the middle. What happens with payload on the socket
> >> is not transparent to the launched binary at all. The proposed cgroup
> >> eBPF solution implements a very similar behavior in that regard.
> >
> > It would be nice to be able to see whether or not a filter is attached to a 
> > cgroup, but given this is going through syscalls, at least introspection
> > is possible as opposed to something like netlink.
> 
> Sure, there are many ways. I implemented the bpf cgroup logic using an
> own cgroup controller once, which made it possible to read out the
> status. But as we agreed on attaching programs through the bpf(2) system
> call, I moved back to the implementation that directly stores the
> pointers in the cgroup.
> 
> First enabling the controller through the fs-backed cgroup interface,
> then come back through the bpf(2) syscall and then go back to the fs
> interface to read out status values is a bit weird.
> 
Hrm, that makes sense. with the BPF syscall, would there be a way to get
file descriptor of the currently attached BPF program?

> >> And FWIW, I agree with Thomas - there is nothing wrong with having
> >> multiple options to use for such use-cases.
> >
> > Right now, for containers, we have netfilter and network namespaces.
> > There's a lot of performance overhead that comes with this.
> 
> Out of curiosity: Could you express that in numbers? And how exactly are
> you testing?
> 
Sure. Our workload that we use as a baseline is Redis with redis-benchmark. We 
reconnect after every connection, and we're running "isolation" between two 
containers on the same machine to try to rule out any physical infrastructure 
overhead.

So, we ran two tests with network namespaces. The first one was putting Redis 
into its own network namespace, and using tc to do some basic shaping:
Client--Veth---Host Namespace---Veth---Redis

The second was:
Client--Veth--Host Namespace+Iptables filtering--Veth--Redis. 

The second test required us to use conntrack, as we wanted stateful filtering.

Ops/sec:
Original: 4275
Situation 1: 3823
Situation 2: 1489

Latency (milliseconds):
Original: 0.69
Situation 1: 0.82
Situation 2: 2.11

This was on a (KVM) machine with 16GB of RAM, and 8 Cores where the machine was 
supposed to be dedicated to me. Given that it's not bare metal, take these 
numbers with a grain of salt.

> > Not only
> > that, but iptables doesn't really have a simple way of usage by
> > automated infrastructure. We (firewalld, systemd, dockerd, mesos)
> > end up fighting with one another for ownership over firewall rules.
> 
> Yes, that's a common problem.
> 
> > Although, I have problems with this approach, I think that it's
> > a good baseline where we can have top level owned by systemd,
> > docker underneath that, and Mesos underneath that. We can add
> > additional hooks for things like Checmate and Landlock, and
> > with a little more work, we can do compositition, solving
> > all of our problems.
> 
> It is supposed to be just a baseline, yes.
> 
> 
> Thanks for your feedback,
> Daniel
> 

^ permalink raw reply

* Re: [PATCHv6 net-next 04/15] bpf: don't (ab)use instructions to store state
From: Jakub Kicinski @ 2016-09-19 21:48 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: netdev, ast, kubakici
In-Reply-To: <57E05C52.4060208@iogearbox.net>

On Mon, 19 Sep 2016 23:44:50 +0200, Daniel Borkmann wrote:
> On 09/19/2016 11:36 PM, Jakub Kicinski wrote:
> > On Mon, 19 Sep 2016 23:03:17 +0200, Daniel Borkmann wrote:  
> >> On 09/18/2016 05:09 PM, Jakub Kicinski wrote:  
> >>> Storing state in reserved fields of instructions makes
> >>> it impossible to run verifier on programs already
> >>> marked as read-only. Allocate and use an array of
> >>> per-instruction state instead.
> >>>
> >>> While touching the error path rename and move existing
> >>> jump target.
> >>>
> >>> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> >>> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> >>> Acked-by: Alexei Starovoitov <ast@kernel.org>
> >>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>  
> >>
> >> I believe there's still an issue here. Could you please double check
> >> and confirm?
> >>
> >> I rebased my locally pending stuff on top of your set and suddenly my
> >> test case breaks. So I did a bisect and it pointed me to this commit
> >> eventually.
> >>
> >> [...]  
> >>> @@ -2697,11 +2706,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
> >>>    		else
> >>>    			continue;
> >>>
> >>> -		if (insn->imm != PTR_TO_CTX) {
> >>> -			/* clear internal mark */
> >>> -			insn->imm = 0;
> >>> +		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
> >>>    			continue;
> >>> -		}
> >>>
> >>>    		cnt = env->prog->aux->ops->
> >>>    			convert_ctx_access(type, insn->dst_reg, insn->src_reg,  
> >>
> >> Looking at the code, I believe the issue is in above snippet. In the
> >> convert_ctx_accesses() rewrite loop, each time we bpf_patch_insn_single()
> >> a program, the program can grow in size (due to __sk_buff access rewrite,
> >> for example). After rewrite, we do 'i += insn_delta' for adjustment to
> >> process next insn.
> >>
> >> However, env->insn_aux_data is alloced under the assumption that the
> >> very initial, pre-verification prog->len doesn't change, right? So in
> >> the above conversion access to env->insn_aux_data[i].ptr_type is off,
> >> since after rewrites, corresponding mappings to ptr_type might not be
> >> related anymore.
> >>
> >> I noticed this with direct packet access where suddenly the data vs
> >> data_end test failed and contained some "semi-random" value always
> >> bailing out for me.  
> >
> > You are correct.  Should I respin or would you like to post your set? :)  
> 
> Heh, if you don't mind I would go ahead tonight, the conflict at two spots
> when exposing verifier is really minor turns out. Are you okay with this?

Yes, please go ahead :)
 
> What's the plan wrt env->insn_aux_data? Realloc plus rewrite of the array,
> or do you see a more straight forward solution?

I was thinking about something like this: (untested)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1612f7364c42..5c4cae046251 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2657,13 +2657,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i;
+	int i, delta = 0;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		u32 insn_delta, cnt;
+		u32 cnt;
 
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
@@ -2685,18 +2685,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 
-		new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
+		new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
+						 cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
-		insn_delta = cnt - 1;
+		delta += cnt - 1;
 
 		/* keep walking new program and skip insns we just inserted */
 		env->prog = new_prog;
-		insn      = new_prog->insnsi + i + insn_delta;
-
-		insn_cnt += insn_delta;
-		i        += insn_delta;
+		insn      = new_prog->insnsi + i + delta;
 	}
 
 	return 0;

^ permalink raw reply related

* Re: [PATCHv6 net-next 04/15] bpf: don't (ab)use instructions to store state
From: Daniel Borkmann @ 2016-09-19 21:44 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, ast, kubakici
In-Reply-To: <20160919223621.363c8ac5@jkicinski-Precision-T1700>

On 09/19/2016 11:36 PM, Jakub Kicinski wrote:
> On Mon, 19 Sep 2016 23:03:17 +0200, Daniel Borkmann wrote:
>> On 09/18/2016 05:09 PM, Jakub Kicinski wrote:
>>> Storing state in reserved fields of instructions makes
>>> it impossible to run verifier on programs already
>>> marked as read-only. Allocate and use an array of
>>> per-instruction state instead.
>>>
>>> While touching the error path rename and move existing
>>> jump target.
>>>
>>> Suggested-by: Alexei Starovoitov <ast@kernel.org>
>>> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>>> Acked-by: Alexei Starovoitov <ast@kernel.org>
>>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>>
>> I believe there's still an issue here. Could you please double check
>> and confirm?
>>
>> I rebased my locally pending stuff on top of your set and suddenly my
>> test case breaks. So I did a bisect and it pointed me to this commit
>> eventually.
>>
>> [...]
>>> @@ -2697,11 +2706,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
>>>    		else
>>>    			continue;
>>>
>>> -		if (insn->imm != PTR_TO_CTX) {
>>> -			/* clear internal mark */
>>> -			insn->imm = 0;
>>> +		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
>>>    			continue;
>>> -		}
>>>
>>>    		cnt = env->prog->aux->ops->
>>>    			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
>>
>> Looking at the code, I believe the issue is in above snippet. In the
>> convert_ctx_accesses() rewrite loop, each time we bpf_patch_insn_single()
>> a program, the program can grow in size (due to __sk_buff access rewrite,
>> for example). After rewrite, we do 'i += insn_delta' for adjustment to
>> process next insn.
>>
>> However, env->insn_aux_data is alloced under the assumption that the
>> very initial, pre-verification prog->len doesn't change, right? So in
>> the above conversion access to env->insn_aux_data[i].ptr_type is off,
>> since after rewrites, corresponding mappings to ptr_type might not be
>> related anymore.
>>
>> I noticed this with direct packet access where suddenly the data vs
>> data_end test failed and contained some "semi-random" value always
>> bailing out for me.
>
> You are correct.  Should I respin or would you like to post your set? :)

Heh, if you don't mind I would go ahead tonight, the conflict at two spots
when exposing verifier is really minor turns out. Are you okay with this?

What's the plan wrt env->insn_aux_data? Realloc plus rewrite of the array,
or do you see a more straight forward solution?

^ permalink raw reply

* [PATCH] ip: (ipvlan) introduce L3s mode
From: Mahesh Bandewar @ 2016-09-19 21:39 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

The new mode 'l3s' can be set like -

  ip link add link <master> dev <IPvlan-slave> type ipvlan mode l3s

  e.g. ip link add link eth0 dev ipvl0 type ipvlan mode l3s

Also did some trivial code restructuring.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 include/linux/if_link.h |  1 +
 ip/iplink_ipvlan.c      | 32 +++++++++++++-------------------
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1feb708902ac..78ef2c6ae04e 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -461,6 +461,7 @@ enum {
 enum ipvlan_mode {
 	IPVLAN_MODE_L2 = 0,
 	IPVLAN_MODE_L3,
+	IPVLAN_MODE_L3S,
 	IPVLAN_MODE_MAX
 };
 
diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c
index a6273be88a2a..f7735f3a13ef 100644
--- a/ip/iplink_ipvlan.c
+++ b/ip/iplink_ipvlan.c
@@ -20,18 +20,7 @@
 
 static void ipvlan_explain(FILE *f)
 {
-	fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3 } ]\n");
-}
-
-static void explain(void)
-{
-	ipvlan_explain(stderr);
-}
-
-static int mode_arg(void)
-{
-	fprintf(stderr, "Error: argument of \"mode\" must be either \"l2\", or \"l3\"\n");
-	return -1;
+	fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3  | l3s } ]\n");
 }
 
 static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
@@ -47,20 +36,24 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
 				mode = IPVLAN_MODE_L2;
 			else if (strcmp(*argv, "l3") == 0)
 				mode = IPVLAN_MODE_L3;
-			else
-				return mode_arg();
-
+			else if (strcmp(*argv, "l3s") == 0)
+				mode = IPVLAN_MODE_L3S;
+			else {
+				fprintf(stderr, "Error: argument of \"mode\" must be either \"l2\", \"l3\" or \"l3s\"\n");
+				return -1;
+			}
 			addattr16(n, 1024, IFLA_IPVLAN_MODE, mode);
 		} else if (matches(*argv, "help") == 0) {
-			explain();
+			ipvlan_explain(stderr);
 			return -1;
 		} else {
 			fprintf(stderr, "ipvlan: unknown option \"%s\"?\n",
 				*argv);
-			explain();
+			ipvlan_explain(stderr);
 			return -1;
 		}
-		argc--, argv++;
+		argc--;
+		argv++;
 	}
 
 	return 0;
@@ -78,7 +71,8 @@ static void ipvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 
 			fprintf(f, " mode %s ",
 				mode == IPVLAN_MODE_L2 ? "l2" :
-				mode == IPVLAN_MODE_L3 ? "l3" : "unknown");
+				mode == IPVLAN_MODE_L3 ? "l3" :
+				mode == IPVLAN_MODE_L3S ? "l3s" : "unknown");
 		}
 	}
 }
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCHv6 net-next 04/15] bpf: don't (ab)use instructions to store state
From: Jakub Kicinski @ 2016-09-19 21:36 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: netdev, ast, kubakici
In-Reply-To: <57E05295.4010904@iogearbox.net>

On Mon, 19 Sep 2016 23:03:17 +0200, Daniel Borkmann wrote:
> Hi Jakub,
> 
> On 09/18/2016 05:09 PM, Jakub Kicinski wrote:
> > Storing state in reserved fields of instructions makes
> > it impossible to run verifier on programs already
> > marked as read-only. Allocate and use an array of
> > per-instruction state instead.
> >
> > While touching the error path rename and move existing
> > jump target.
> >
> > Suggested-by: Alexei Starovoitov <ast@kernel.org>
> > Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Acked-by: Alexei Starovoitov <ast@kernel.org>
> > Acked-by: Daniel Borkmann <daniel@iogearbox.net>  
> 
> I believe there's still an issue here. Could you please double check
> and confirm?
> 
> I rebased my locally pending stuff on top of your set and suddenly my
> test case breaks. So I did a bisect and it pointed me to this commit
> eventually.
> 
> [...]
> > @@ -2697,11 +2706,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
> >   		else
> >   			continue;
> >
> > -		if (insn->imm != PTR_TO_CTX) {
> > -			/* clear internal mark */
> > -			insn->imm = 0;
> > +		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
> >   			continue;
> > -		}
> >
> >   		cnt = env->prog->aux->ops->
> >   			convert_ctx_access(type, insn->dst_reg, insn->src_reg,  
> 
> Looking at the code, I believe the issue is in above snippet. In the
> convert_ctx_accesses() rewrite loop, each time we bpf_patch_insn_single()
> a program, the program can grow in size (due to __sk_buff access rewrite,
> for example). After rewrite, we do 'i += insn_delta' for adjustment to
> process next insn.
> 
> However, env->insn_aux_data is alloced under the assumption that the
> very initial, pre-verification prog->len doesn't change, right? So in
> the above conversion access to env->insn_aux_data[i].ptr_type is off,
> since after rewrites, corresponding mappings to ptr_type might not be
> related anymore.
> 
> I noticed this with direct packet access where suddenly the data vs
> data_end test failed and contained some "semi-random" value always
> bailing out for me.

You are correct.  Should I respin or would you like to post your set? :)

^ permalink raw reply

* Re: [PATCH v6 5/6] net: ipv4, ipv6: run cgroup eBPF egress programs
From: Thomas Graf @ 2016-09-19 21:28 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pablo Neira Ayuso, Daniel Mack, htejun-b10kYP2dOMg,
	daniel-FeC+5ew28dpmcu3hnIyYJQ, ast-b10kYP2dOMg,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, kafai-b10kYP2dOMg,
	fw-HFFVJYpyMKqzQB+pC5nmwQ, harald-H+wXaHxf7aLQT0dZR+AlfA,
	netdev-u79uwXL29TY76Z2rM5mHXA, sargun-GaZTRHToo+CzQB+pC5nmwQ,
	cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20160919201322.GA84770-+o4/htvd0TDFYCXBM6kdu7fOX0fSgVTm@public.gmane.org>

On 09/19/16 at 01:13pm, Alexei Starovoitov wrote:
> as far as bpf debuggability/visibility there are various efforts on the way:
> for kernel side:
> - ksym for jit-ed programs
> - hash sum for prog code
> - compact type information for maps and various pretty printers
> - data flow analysis of the programs

We made a generic map tool available here as well:
https://github.com/cilium/cilium/blob/master/bpf/go/map_ctrl.go

^ permalink raw reply

* [PATCH] tcp: fix wrong checksum calculation on MTU probing
From: Douglas Caetano dos Santos @ 2016-09-19 21:16 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev

With TCP MTU probing enabled and offload TX checksumming disabled,
tcp_mtu_probe() calculated the wrong checksum when a fragment being copied
into the probe's SKB had an odd length. This was caused by the direct use
of skb_copy_and_csum_bits() to calculate the checksum, as it pads the
fragment being copied, if needed. When this fragment was not the last, a
subsequent call used the previous checksum without considering this
padding.

The effect was a stale connection in one way, as even retransmissions
wouldn't solve the problem, because the checksum was never recalculated for
the full SKB length.

Signed-off-by: Douglas Caetano dos Santos <douglascs@taghos.com.br>
---
  net/ipv4/tcp_output.c | 10 ++++++----
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f53d0cc..767135e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1968,10 +1968,12 @@ static int tcp_mtu_probe(struct sock *sk)
          copy = min_t(int, skb->len, probe_size - len);
          if (nskb->ip_summed)
              skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
-        else
-            nskb->csum = skb_copy_and_csum_bits(skb, 0,
-                                skb_put(nskb, copy),
-                                copy, nskb->csum);
+        else {
+            __wsum csum = skb_copy_and_csum_bits(skb, 0,
+                                 skb_put(nskb, copy),
+                                 copy, 0);
+            nskb->csum = csum_block_add(nskb->csum, csum, len);
+        }

          if (skb->len <= copy) {
              /* We've eaten all the data from this skb.
-- 
2.5.0

^ permalink raw reply related

* Re: [PATCH v3 net-next 16/16] tcp_bbr: add BBR congestion control
From: Eric Dumazet @ 2016-09-19 21:23 UTC (permalink / raw)
  To: Rick Jones
  Cc: Stephen Hemminger, Neal Cardwell, David Miller, netdev,
	Van Jacobson, Yuchung Cheng, Nandita Dukkipati,
	Soheil Hassas Yeganeh
In-Reply-To: <9d2f75e6-0161-aa25-e362-ad7267ee2251@hpe.com>

On Mon, Sep 19, 2016 at 2:17 PM, Rick Jones <rick.jones2@hpe.com> wrote:

>
> Are there better than epsilon odds of someone perhaps wanting to poke those
> values as it gets exposure beyond Google?
>

This does not matter.

A change would require patching net/ipv4/tcp_bbr.c , and the 'const'
attribute being there or not does not prevent the change.

I was simply asking to Stephen if the compiler would actually emit a
very different code, worth doing a last minute change.

The main BBR costs are divides and some multiplies , and they are
using per socket fields.

^ permalink raw reply

* Re: [PATCH v3 net-next 16/16] tcp_bbr: add BBR congestion control
From: Rick Jones @ 2016-09-19 21:17 UTC (permalink / raw)
  To: Eric Dumazet, Stephen Hemminger
  Cc: Neal Cardwell, David Miller, netdev, Van Jacobson, Yuchung Cheng,
	Nandita Dukkipati, Soheil Hassas Yeganeh
In-Reply-To: <CANn89i+MpNPYn=ewi_LioNctNePCuity_UXw5ieU7HFfoZTsGA@mail.gmail.com>

On 09/19/2016 02:10 PM, Eric Dumazet wrote:
> On Mon, Sep 19, 2016 at 1:57 PM, Stephen Hemminger
> <stephen@networkplumber.org> wrote:
>
>> Looks good, but could I suggest a simple optimization.
>> All these parameters are immutable in the version of BBR you are submitting.
>> Why not make the values const? And eliminate the always true long-term bw estimate
>> variable?
>>
>
> We could do that.
>
> We used to have variables (aka module params) while BBR was cooking in
> our kernels ;)

Are there better than epsilon odds of someone perhaps wanting to poke 
those values as it gets exposure beyond Google?

happy benchmarking,

rick jones

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox