Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] net: calxeda: xgmac: use new api ethtool_{get|set}_link_ksettings
From: Philippe Reynes @ 2016-12-04 22:37 UTC (permalink / raw)
  To: davem, jarod; +Cc: netdev, linux-kernel, Philippe Reynes

The ethtool api {get|set}_settings is deprecated.
We move this driver to new api {get|set}_link_ksettings.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
---
 drivers/net/ethernet/calxeda/xgmac.c |   17 ++++++++---------
 1 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c
index 6e72366..ce7de6f 100644
--- a/drivers/net/ethernet/calxeda/xgmac.c
+++ b/drivers/net/ethernet/calxeda/xgmac.c
@@ -1530,15 +1530,14 @@ static int xgmac_set_features(struct net_device *dev, netdev_features_t features
 	.ndo_set_features = xgmac_set_features,
 };
 
-static int xgmac_ethtool_getsettings(struct net_device *dev,
-					  struct ethtool_cmd *cmd)
+static int xgmac_ethtool_get_link_ksettings(struct net_device *dev,
+					    struct ethtool_link_ksettings *cmd)
 {
-	cmd->autoneg = 0;
-	cmd->duplex = DUPLEX_FULL;
-	ethtool_cmd_speed_set(cmd, 10000);
-	cmd->supported = 0;
-	cmd->advertising = 0;
-	cmd->transceiver = XCVR_INTERNAL;
+	cmd->base.autoneg = 0;
+	cmd->base.duplex = DUPLEX_FULL;
+	cmd->base.speed = 10000;
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, 0);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, 0);
 	return 0;
 }
 
@@ -1681,7 +1680,6 @@ static int xgmac_set_wol(struct net_device *dev,
 }
 
 static const struct ethtool_ops xgmac_ethtool_ops = {
-	.get_settings = xgmac_ethtool_getsettings,
 	.get_link = ethtool_op_get_link,
 	.get_pauseparam = xgmac_get_pauseparam,
 	.set_pauseparam = xgmac_set_pauseparam,
@@ -1690,6 +1688,7 @@ static int xgmac_set_wol(struct net_device *dev,
 	.get_wol = xgmac_get_wol,
 	.set_wol = xgmac_set_wol,
 	.get_sset_count = xgmac_get_sset_count,
+	.get_link_ksettings = xgmac_ethtool_get_link_ksettings,
 };
 
 /**
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH v2 main-v4.9-rc7] net/ipv6: allow sysctl to change link-local address generation mode
From: Felix Jia @ 2016-12-04 22:31 UTC (permalink / raw)
  To: netdev; +Cc: Felix Jia, Carl Smith

Removed the rtnl lock and switch to use RCU lock to iterate through
the netdev list.

The address generation mode for IPv6 link-local can only be configured
by netlink messages. This patch adds the ability to change the address
generation mode via sysctl.

An possible improvement is to remove the addrgenmode variable from the
idev structure and use the systcl storage for the flag.

The patch is based from v4.9-rc7 in mainline.

Signed-off-by: Felix Jia <felix.jia@alliedtelesis.co.nz>
Cc: Carl Smith <carl.smith@alliedtelesis.co.nz>
---
 include/linux/ipv6.h      |  1 +
 include/uapi/linux/ipv6.h |  1 +
 net/ipv6/addrconf.c       | 73 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index a064997..0d9e5d4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -64,6 +64,7 @@ struct ipv6_devconf {
 	} stable_secret;
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
+	__s32		addrgenmode;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 8c27723..0524e2c 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -178,6 +178,7 @@ enum {
 	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+	DEVCONF_ADDRGENMODE,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4bc5ba3..2b83cc7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -238,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
+	.addrgenmode = IN6_ADDR_GEN_MODE_EUI64,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -284,6 +285,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
+	.addrgenmode = IN6_ADDR_GEN_MODE_EUI64,
 };
 
 /* Check if a valid qdisc is available */
@@ -378,7 +380,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	if (ndev->cnf.stable_secret.initialized)
 		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
 	else
-		ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+		ndev->addr_gen_mode = ipv6_devconf_dflt.addrgenmode;
 
 	ndev->cnf.mtu6 = dev->mtu;
 	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -4950,6 +4952,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
+	array[DEVCONF_ADDRGENMODE] = cnf->addrgenmode;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5496,6 +5499,67 @@ int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
 	return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
 }
 
+static void addrconf_addrgenmode_change(struct net *net)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		idev = __in6_dev_get(dev);
+		if (idev) {
+			idev->cnf.addrgenmode = ipv6_devconf_dflt.addrgenmode;
+			idev->addr_gen_mode = ipv6_devconf_dflt.addrgenmode;
+			addrconf_dev_config(idev->dev);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static int addrconf_sysctl_addrgenmode(struct ctl_table *ctl, int write,
+								void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int new_val;
+	struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+	struct net *net = (struct net *)ctl->extra2;
+
+	if (write) { /* sysctl write request */
+		ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+		new_val = *((int *)ctl->data);
+
+		/* request for the all */
+		if (&net->ipv6.devconf_all->addrgenmode == ctl->data) {
+			ipv6_devconf_dflt.addrgenmode = new_val;
+			addrconf_addrgenmode_change(net);
+
+		/* request for default */
+		} else if (&net->ipv6.devconf_dflt->addrgenmode == ctl->data) {
+			ipv6_devconf_dflt.addrgenmode = new_val;
+
+		/* request for individual inet device */
+		} else {
+			if (!idev) {
+				return ret;
+			}
+			if (idev->addr_gen_mode != new_val) {
+				idev->addr_gen_mode = new_val;
+				rtnl_lock();
+				addrconf_dev_config(idev->dev);
+				rtnl_unlock();
+			}
+		}
+
+	} else { /* sysctl read request */
+		if (idev) {
+			idev->cnf.addrgenmode = idev->addr_gen_mode;
+		}
+		ret = proc_dointvec(ctl, 0, buffer, lenp, ppos);
+	}
+
+	return ret;
+}
+
 static void dev_disable_change(struct inet6_dev *idev)
 {
 	struct netdev_notifier_info info;
@@ -6042,6 +6106,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 
 	},
 	{
+		.procname	= "addrgenmode",
+		.data		= &ipv6_devconf.addrgenmode,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= addrconf_sysctl_addrgenmode,
+	},
+	{
 		/* sentinel */
 	}
 };
-- 
2.10.2

^ permalink raw reply related

* [PATCH net-next 3/3] bpf: add prog_digest and expose it via fdinfo/netlink
From: Daniel Borkmann @ 2016-12-04 22:19 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, netdev, Daniel Borkmann
In-Reply-To: <cover.1480889473.git.daniel@iogearbox.net>

When loading a BPF program via bpf(2), calculate the digest over
the program's instruction stream and store it in struct bpf_prog's
digest member. This is done at a point in time before any instructions
are rewritten by the verifier. Any unstable map file descriptor
number part of the imm field will be zeroed for the hash.

fdinfo example output for progs:

  # cat /proc/1590/fdinfo/5
  pos:          0
  flags:        02000002
  mnt_id:       11
  prog_type:    1
  prog_jited:   1
  prog_digest:  b27e8b06da22707513aa97363dfb11c7c3675d28
  memlock:      4096

When programs are pinned and retrieved by an ELF loader, the loader
can check the program's digest through fdinfo and compare it against
one that was generated over the ELF file's program section to see
if the program needs to be reloaded. Furthermore, this can also be
exposed through other means such as netlink in case of a tc cls/act
dump (or xdp in future), but also through tracepoints or other
facilities to identify the program. Other than that, the digest can
also serve as a base name for the work in progress kallsyms support
of programs. The digest doesn't depend/select the crypto layer, since
we need to keep dependencies to a minimum. iproute2 will get support
for this facility.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                |  1 +
 include/linux/filter.h             |  7 +++-
 include/uapi/linux/pkt_cls.h       |  1 +
 include/uapi/linux/tc_act/tc_bpf.h |  1 +
 kernel/bpf/core.c                  | 65 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c               | 24 +++++++++++++-
 kernel/bpf/verifier.c              |  2 ++
 net/sched/act_bpf.c                |  9 ++++++
 net/sched/cls_bpf.c                |  8 +++++
 9 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 69d0a7f..8796ff0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,6 +216,7 @@ struct bpf_event_entry {
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+void bpf_prog_calc_digest(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9733813..f078d2b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/capability.h>
+#include <linux/cryptohash.h>
 
 #include <net/sch_generic.h>
 
@@ -56,6 +57,9 @@
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
+/* Maximum BPF program size in bytes. */
+#define MAX_BPF_SIZE	(BPF_MAXINSNS * sizeof(struct bpf_insn))
+
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -404,8 +408,9 @@ struct bpf_prog {
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
-	u32			len;		/* Number of filter blocks */
 	enum bpf_prog_type	type;		/* Type of BPF program */
+	u32			len;		/* Number of filter blocks */
+	u32			digest[SHA_DIGEST_WORDS]; /* Program digest */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	unsigned int		(*bpf_func)(const void *ctx,
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 86786d4..1adc0b6 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -397,6 +397,7 @@ enum {
 	TCA_BPF_NAME,
 	TCA_BPF_FLAGS,
 	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_DIGEST,
 	__TCA_BPF_MAX,
 };
 
diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h
index 063d9d4..a6b88a6 100644
--- a/include/uapi/linux/tc_act/tc_bpf.h
+++ b/include/uapi/linux/tc_act/tc_bpf.h
@@ -27,6 +27,7 @@ enum {
 	TCA_ACT_BPF_FD,
 	TCA_ACT_BPF_NAME,
 	TCA_ACT_BPF_PAD,
+	TCA_ACT_BPF_DIGEST,
 	__TCA_ACT_BPF_MAX,
 };
 #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 82a0414..bdcc9f4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+#define SHA_BPF_RAW_SIZE						\
+	round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES)
+
+/* Called under verifier mutex. */
+void bpf_prog_calc_digest(struct bpf_prog *fp)
+{
+	const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
+	static u32 ws[SHA_WORKSPACE_WORDS];
+	static u8 raw[SHA_BPF_RAW_SIZE];
+	struct bpf_insn *dst = (void *)raw;
+	u32 i, bsize, psize, blocks;
+	bool was_ld_map;
+	u8 *todo = raw;
+	__be32 *result;
+	__be64 *bits;
+
+	sha_init(fp->digest);
+	memset(ws, 0, sizeof(ws));
+
+	/* We need to take out the map fd for the digest calculation
+	 * since they are unstable from user space side.
+	 */
+	for (i = 0, was_ld_map = false; i < fp->len; i++) {
+		dst[i] = fp->insnsi[i];
+		if (!was_ld_map &&
+		    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+			was_ld_map = true;
+			dst[i].imm = 0;
+		} else if (was_ld_map &&
+			   dst[i].code == 0 &&
+			   dst[i].dst_reg == 0 &&
+			   dst[i].src_reg == 0 &&
+			   dst[i].off == 0) {
+			was_ld_map = false;
+			dst[i].imm = 0;
+		} else {
+			was_ld_map = false;
+		}
+	}
+
+	psize = fp->len * sizeof(struct bpf_insn);
+	memset(&raw[psize], 0, sizeof(raw) - psize);
+	raw[psize++] = 0x80;
+
+	bsize  = round_up(psize, SHA_MESSAGE_BYTES);
+	blocks = bsize / SHA_MESSAGE_BYTES;
+	if (bsize - psize >= sizeof(__be64)) {
+		bits = (__be64 *)(todo + bsize - sizeof(__be64));
+	} else {
+		bits = (__be64 *)(todo + bsize + bits_offset);
+		blocks++;
+	}
+	*bits = cpu_to_be64((psize - 1) << 3);
+
+	while (blocks--) {
+		sha_transform(fp->digest, todo, ws);
+		todo += SHA_MESSAGE_BYTES;
+	}
+
+	result = (__force __be32 *)fp->digest;
+	for (i = 0; i < SHA_DIGEST_WORDS; i++)
+		result[i] = cpu_to_be32(fp->digest[i]);
+}
+
 static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
 {
 	return BPF_CLASS(insn->code) == BPF_JMP  &&
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85af86c..c0d2b42 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_FS
+static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	const struct bpf_prog *prog = filp->private_data;
+	char prog_digest[sizeof(prog->digest) * 2 + 1] = { };
+
+	bin2hex(prog_digest, prog->digest, sizeof(prog->digest));
+	seq_printf(m,
+		   "prog_type:\t%u\n"
+		   "prog_jited:\t%u\n"
+		   "prog_digest:\t%s\n"
+		   "memlock:\t%llu\n",
+		   prog->type,
+		   prog->jited,
+		   prog_digest,
+		   prog->pages * 1ULL << PAGE_SHIFT);
+}
+#endif
+
 static const struct file_operations bpf_prog_fops = {
-        .release = bpf_prog_release,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= bpf_prog_show_fdinfo,
+#endif
+	.release	= bpf_prog_release,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e74221..16ad38b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3171,6 +3171,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log_level = 0;
 	}
 
+	bpf_prog_calc_digest(env->prog);
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 84c1d2d..1c60317 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
 static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
 				  struct sk_buff *skb)
 {
+	struct nlattr *nla;
+
 	if (prog->bpf_name &&
 	    nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
 		return -EMSGSIZE;
 
+	nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST,
+			  sizeof(prog->filter->digest));
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
 	return 0;
 }
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index f70e03d..adc7760 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
 static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
 				  struct sk_buff *skb)
 {
+	struct nlattr *nla;
+
 	if (prog->bpf_name &&
 	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
 		return -EMSGSIZE;
 
+	nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest));
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
 	return 0;
 }
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 0/3] Minor BPF cleanups and digest
From: Daniel Borkmann @ 2016-12-04 22:19 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, netdev, Daniel Borkmann

First two patches are minor cleanups, and the third one adds
a prog digest. For details, please see individual patches.
After this one, I have a set with tracepoint support that makes
use of this facility as well.

Thanks!

Daniel Borkmann (3):
  bpf: remove type arg from __is_valid_{,xdp_}access
  bpf, cls: consolidate prog deletion path
  bpf: add prog_digest and expose it via fdinfo/netlink

 include/linux/bpf.h                |  1 +
 include/linux/filter.h             |  7 +++-
 include/uapi/linux/pkt_cls.h       |  1 +
 include/uapi/linux/tc_act/tc_bpf.h |  1 +
 kernel/bpf/core.c                  | 65 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c               | 24 +++++++++++++-
 kernel/bpf/verifier.c              |  2 ++
 net/core/filter.c                  | 15 ++++-----
 net/sched/act_bpf.c                |  9 ++++++
 net/sched/cls_bpf.c                | 38 ++++++++++++----------
 10 files changed, 135 insertions(+), 28 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next 2/3] bpf, cls: consolidate prog deletion path
From: Daniel Borkmann @ 2016-12-04 22:19 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, netdev, Daniel Borkmann
In-Reply-To: <cover.1480889473.git.daniel@iogearbox.net>

Commit 18cdb37ebf4c ("net: sched: do not use tcf_proto 'tp' argument from
call_rcu") removed the last usage of tp from cls_bpf_delete_prog(), so also
remove it from the function as argument to not give a wrong impression. tp
is illegal to access from this callback, since it could already have been
freed.

Refactor the deletion code a bit, so that cls_bpf_destroy() can call into
the same code for prog deletion as cls_bpf_delete() op, instead of having
it unnecessarily duplicated.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 net/sched/cls_bpf.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c37aa8b..f70e03d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -241,7 +241,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
 {
 	tcf_exts_destroy(&prog->exts);
 
@@ -255,22 +255,22 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 	kfree(prog);
 }
 
-static void __cls_bpf_delete_prog(struct rcu_head *rcu)
+static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
 {
-	struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
-	cls_bpf_delete_prog(prog->tp, prog);
+	__cls_bpf_delete_prog(container_of(rcu, struct cls_bpf_prog, rcu));
 }
 
-static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 {
-	struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg;
-
 	cls_bpf_stop_offload(tp, prog);
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
-	call_rcu(&prog->rcu, __cls_bpf_delete_prog);
+	call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+}
 
+static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	__cls_bpf_delete(tp, (struct cls_bpf_prog *) arg);
 	return 0;
 }
 
@@ -282,12 +282,8 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force)
 	if (!force && !list_empty(&head->plist))
 		return false;
 
-	list_for_each_entry_safe(prog, tmp, &head->plist, link) {
-		cls_bpf_stop_offload(tp, prog);
-		list_del_rcu(&prog->link);
-		tcf_unbind_filter(tp, &prog->res);
-		call_rcu(&prog->rcu, __cls_bpf_delete_prog);
-	}
+	list_for_each_entry_safe(prog, tmp, &head->plist, link)
+		__cls_bpf_delete(tp, prog);
 
 	kfree_rcu(head, rcu);
 	return true;
@@ -511,14 +507,14 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 
 	ret = cls_bpf_offload(tp, prog, oldprog);
 	if (ret) {
-		cls_bpf_delete_prog(tp, prog);
+		__cls_bpf_delete_prog(prog);
 		return ret;
 	}
 
 	if (oldprog) {
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
-		call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
 	} else {
 		list_add_rcu(&prog->link, &head->plist);
 	}
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next 1/3] bpf: remove type arg from __is_valid_{,xdp_}access
From: Daniel Borkmann @ 2016-12-04 22:19 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, netdev, Daniel Borkmann
In-Reply-To: <cover.1480889473.git.daniel@iogearbox.net>

Commit d691f9e8d440 ("bpf: allow programs to write to certain skb
fields") pushed access type check outside of __is_valid_access()
to have different restrictions for socket filters and tc programs.
type is thus not used anymore within __is_valid_access() and should
be removed as a function argument. Same for __is_valid_xdp_access()
introduced by 6a773a15a1e8 ("bpf: add XDP prog type for early driver
filter").

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 56b4358..b751202 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2748,7 +2748,7 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	}
 }
 
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static bool __is_valid_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
@@ -2782,7 +2782,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 		}
 	}
 
-	return __is_valid_access(off, size, type);
+	return __is_valid_access(off, size);
 }
 
 static bool lwt_is_valid_access(int off, int size,
@@ -2815,7 +2815,7 @@ static bool lwt_is_valid_access(int off, int size,
 		break;
 	}
 
-	return __is_valid_access(off, size, type);
+	return __is_valid_access(off, size);
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
@@ -2833,11 +2833,9 @@ static bool sock_filter_is_valid_access(int off, int size,
 
 	if (off < 0 || off + size > sizeof(struct bpf_sock))
 		return false;
-
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-
 	if (size != sizeof(__u32))
 		return false;
 
@@ -2910,11 +2908,10 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		break;
 	}
 
-	return __is_valid_access(off, size, type);
+	return __is_valid_access(off, size);
 }
 
-static bool __is_valid_xdp_access(int off, int size,
-				  enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
 		return false;
@@ -2942,7 +2939,7 @@ static bool xdp_is_valid_access(int off, int size,
 		break;
 	}
 
-	return __is_valid_xdp_access(off, size, type);
+	return __is_valid_xdp_access(off, size);
 }
 
 void bpf_warn_invalid_xdp_action(u32 act)
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU
From: Vivien Didelot @ 2016-12-04 22:12 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: David Miller, netdev
In-Reply-To: <20161204202234.GA20743@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

>> The mv88e6xxx_ops actually implements the *features*. They can be
>> prefixed for clarity (e.g. .ppu_*, port_*, .atu_*, etc.). They don't
>> describe the register layout.
>> 
>> But we can discuss two ways of seeing this structure implementation:
>
> or
>
> 3) We have a prefix for us humans to help us find the code. Now we
> have ops, i cannot simply do M-. and emacs will take me to the
> implementation. I have to search for it a bit. Having the hint g1_
> tells me to go look in global1.c. Having the hint g2_ tells me to go
> look in global2.c. Having the port_ tells me to go look in port.c.
> Having no prefix tells me the code is scattered around and grep is my
> friend.

Just to be clear:

I totally agree for an implementation (e.g. mv88e6095_g1_set_cpu_port),
that's why I've been doing it since I started splitting the code around
in device-specific files. But I disagree for an mv88e6xxx_ops member.

You can have several implementations in the same file (e.g. global1.c),
so again the only value is the function name, not the struct member.


Thanks,

     Vivien

^ permalink raw reply

* Re: [PATCH net-next 2/3] net/act_pedit: Support using offset relative to the conventional network headers
From: Or Gerlitz @ 2016-12-04 21:55 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Netdev List, Jamal Hadi Salim, Hadar Hen Zion, Jiri Pirko
In-Reply-To: <20161202104029.GA7729@office.localdomain>

On Fri, Dec 2, 2016 at 12:40 PM, Amir Vadai <amir@vadai.me> wrote:
> On Thu, Dec 01, 2016 at 02:41:14PM -0500, David Miller wrote:
>> From: Amir Vadai <amir@vadai.me>
>> Date: Wed, 30 Nov 2016 11:09:27 +0200

>> > +static int pedit_skb_hdr_offset(struct sk_buff *skb,
>> > +                           enum pedit_header_type htype, int *hoffset)
>> > +{
>> > +   int ret = -1;
>> > +
>> > +   switch (htype) {
>> > +   case PEDIT_HDR_TYPE_ETH:
>> > +           if (skb_mac_header_was_set(skb)) {
>> > +                   *hoffset = skb_mac_offset(skb);
>> > +                   ret = 0;
>> > +           }
>> > +           break;
>> > +   case PEDIT_HDR_TYPE_RAW:
>> > +   case PEDIT_HDR_TYPE_IP4:
>> > +   case PEDIT_HDR_TYPE_IP6:
>> > +           *hoffset = skb_network_offset(skb);
>> > +           ret = 0;
>> > +           break;
>> > +   case PEDIT_HDR_TYPE_TCP:
>> > +   case PEDIT_HDR_TYPE_UDP:
>> > +           if (skb_transport_header_was_set(skb)) {
>> > +                   *hoffset = skb_transport_offset(skb);
>> > +                   ret = 0;
>> > +           }
>> > +           break;
>> > +   };
>> > +
>> > +   return ret;
>> > +}
>> > +

>> The only distinction between the cases is "L2", "L3", and "L4".

>> Therefore I don't see any reason to break it down into IP4 vs. IP6 vs.
>> RAW, for example.  They all map to the same thing.

>> So why not just have PEDIT_HDR_TYPE_L2, PEDIT_HDR_TYPE_L3, and
>> PEDIT_HDR_TYPE_L4?  It definitely seems more straightforward
>> and cleaner that way.

> Yeh, is isn't by mistake. The next step will be to implement hardware
> offloading of the action, and for that we would like to keep the
> information about the specific header type.

Hi Dave,

I see that this patch is marked as "Changes Requested" @ your patchworks.

Just wanted to make a note as Amir explained here and as mentioned on
the change log, this was done in purpose, as heads up for HW offloads.
Typically HW APIs would let you do things also based on header type
they have parsed, etc, so that's why we added this small redundancy
e.g of IPv4/IPv6 header ID instead of network header ID - while SW
wise both IPv4/IPv6 are using the same code path, for HW offloads, the
HW driver could choose to use the IPv4/IPv6 header ID info.

Or.

^ permalink raw reply

* Re: [PATCH] mlx4: Use kernel sizeof and alloc styles
From: Eric Dumazet @ 2016-12-04 20:58 UTC (permalink / raw)
  To: Joe Perches; +Cc: Yishai Hadas, Tariq Toukan, netdev, linux-rdma, linux-kernel
In-Reply-To: <8b20668cb08cce8b4af872863440e149bd25fa94.1480882180.git.joe@perches.com>

On Sun, 2016-12-04 at 12:11 -0800, Joe Perches wrote:
> Convert sizeof foo to sizeof(foo) and allocations with multiplications
> to the appropriate kcalloc/kmalloc_array styles.
> 
> Signed-off-by: Joe Perches <joe@perches.com>
> ---

Gah.

This is one of the hotest NIC driver on linux at this moment, 
with XDP and other efforts going on.

Some kmalloc() are becoming kmalloc_node() in some dev branches, and
there is no kmalloc_array_node() yet.

This kind of patch is making rebases/backports very painful.

Could we wait ~6 months before doing such cleanup/changes please ?

If you believe a bug needs a fix, please send a patch to address it.

Thanks.

^ permalink raw reply

* "af_unix: conditionally use freezable blocking calls in read" is wrong
From: Al Viro @ 2016-12-04 21:04 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang

	Could we please kill that kludge?  "af_unix: use freezable blocking
calls in read" had been wrong to start with; having a method make assumptions
of that sort ("nobody will call me while holding locks I hadn't thought of")
is asking for serious trouble.  splice is just a place where lockdep has
caught that - we *can't* assume that nobody will ever call kernel_recvmsg()
while holding some locks.

	I've run into that converting AF_UNIX to generic_file_splice_read();
I can kludge around that ("freezable unless ->msg_iter is ITER_PIPE"), but
that only delays trouble.

	Note that the only other user of freezable_schedule_timeout() is
a very different story - it's a kernel thread, which *does* have a guaranteed
locking environment.  Making such assumptions in unix_stream_recvmsg(),
OTOH, is insane...

^ permalink raw reply

* [PATCH net-next 3/3] net: ethoc: Demote packet dropped error message to debug
From: Florian Fainelli @ 2016-12-04 20:40 UTC (permalink / raw)
  To: netdev; +Cc: tremyfr, tklauser, davem, thierry.reding, andrew,
	Florian Fainelli
In-Reply-To: <20161204204030.9853-1-f.fainelli@gmail.com>

Spamming the console with: net eth1: packet dropped can happen
fairly frequently if the adapter is busy transmitting, demote the
message to a debug print.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/ethoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 8d0cb5ce87ee..45abc81f6f55 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -576,7 +576,7 @@ static irqreturn_t ethoc_interrupt(int irq, void *dev_id)
 
 	/* We always handle the dropped packet interrupt */
 	if (pending & INT_MASK_BUSY) {
-		dev_err(&dev->dev, "packet dropped\n");
+		dev_dbg(&dev->dev, "packet dropped\n");
 		dev->stats.rx_dropped++;
 	}
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 2/3] net: ethoc: Utilize of_get_mac_address()
From: Florian Fainelli @ 2016-12-04 20:40 UTC (permalink / raw)
  To: netdev; +Cc: tremyfr, tklauser, davem, thierry.reding, andrew,
	Florian Fainelli
In-Reply-To: <20161204204030.9853-1-f.fainelli@gmail.com>

Do not open code getting the MAC address exclusively from the
"local-mac-address" property, but instead use of_get_mac_address() which
looks up the MAC address using the 3 typical property names.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/ethoc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 877c02a36c85..8d0cb5ce87ee 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include <linux/of_net.h>
 #include <linux/module.h>
 #include <net/ethoc.h>
 
@@ -1158,11 +1159,9 @@ static int ethoc_probe(struct platform_device *pdev)
 		memcpy(netdev->dev_addr, pdata->hwaddr, IFHWADDRLEN);
 		priv->phy_id = pdata->phy_id;
 	} else {
-		const uint8_t *mac;
+		const void *mac;
 
-		mac = of_get_property(pdev->dev.of_node,
-				      "local-mac-address",
-				      NULL);
+		mac = of_get_mac_address(pdev->dev.of_node);
 		if (mac)
 			memcpy(netdev->dev_addr, mac, IFHWADDRLEN);
 		priv->phy_id = -1;
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 1/3] net: ethoc: Account for duplex changes
From: Florian Fainelli @ 2016-12-04 20:40 UTC (permalink / raw)
  To: netdev; +Cc: tremyfr, tklauser, davem, thierry.reding, andrew,
	Florian Fainelli
In-Reply-To: <20161204204030.9853-1-f.fainelli@gmail.com>

ethoc_mdio_poll() which is our PHYLIB adjust_link callback does nothing,
we should at least react to duplex changes and change MODER accordingly.
Speed changes is not a problem, since the OpenCores Ethernet core seems
to be reacting okay without us telling it.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/ethoc.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 6456c180114b..877c02a36c85 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -221,6 +221,9 @@ struct ethoc {
 	struct mii_bus *mdio;
 	struct clk *clk;
 	s8 phy_id;
+
+	int old_link;
+	int old_duplex;
 };
 
 /**
@@ -667,6 +670,32 @@ static int ethoc_mdio_write(struct mii_bus *bus, int phy, int reg, u16 val)
 
 static void ethoc_mdio_poll(struct net_device *dev)
 {
+	struct ethoc *priv = netdev_priv(dev);
+	struct phy_device *phydev = dev->phydev;
+	bool changed = false;
+	u32 mode;
+
+	if (priv->old_link != phydev->link) {
+		changed = true;
+		priv->old_link = phydev->link;
+	}
+
+	if (priv->old_duplex != phydev->duplex) {
+		changed = true;
+		priv->old_duplex = phydev->duplex;
+	}
+
+	if (!changed)
+		return;
+
+	mode = ethoc_read(priv, MODER);
+	if (phydev->duplex == DUPLEX_FULL)
+		mode |= MODER_FULLD;
+	else
+		mode &= ~MODER_FULLD;
+	ethoc_write(priv, MODER, mode);
+
+	phy_print_status(phydev);
 }
 
 static int ethoc_mdio_probe(struct net_device *dev)
@@ -685,6 +714,9 @@ static int ethoc_mdio_probe(struct net_device *dev)
 		return -ENXIO;
 	}
 
+	priv->old_duplex = -1;
+	priv->old_link = -1;
+
 	err = phy_connect_direct(dev, phy, ethoc_mdio_poll,
 				 PHY_INTERFACE_MODE_GMII);
 	if (err) {
@@ -721,6 +753,9 @@ static int ethoc_open(struct net_device *dev)
 		netif_start_queue(dev);
 	}
 
+	priv->old_link = -1;
+	priv->old_duplex = -1;
+
 	phy_start(dev->phydev);
 	napi_enable(&priv->napi);
 
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 0/3] net: ethoc: Misc improvements
From: Florian Fainelli @ 2016-12-04 20:40 UTC (permalink / raw)
  To: netdev; +Cc: tremyfr, tklauser, davem, thierry.reding, andrew,
	Florian Fainelli

Hi all,

This patch series fixes/improves a few things:

- implement a proper PHYLIB adjust_link callback to set the duplex mode
  accordingly
- do not open code the fetching of a MAC address in OF/DT environments
- demote an error message that occurs more frequently than expected in low
  CPU/memory/bandwidth environments

Tested on a Cirrus Logic EP93xx / TS7300 board.

Florian Fainelli (3):
  net: ethoc: Account for duplex changes
  net: ethoc: Utilize of_get_mac_address()
  net: ethoc: Demote packet dropped error message to debug

 drivers/net/ethernet/ethoc.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

-- 
2.9.3

^ permalink raw reply

* Re: [PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU
From: Andrew Lunn @ 2016-12-04 20:22 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: David Miller, netdev
In-Reply-To: <87y3zvcvhm.fsf@ketchup.i-did-not-set--mail-host-address--so-tickle-me>

> > +int mv88e6095_g2_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip)
> > +{
> > +	int err;
> > +
> > +	/* Consider the frames with reserved multicast destination
> > +	 * addresses matching 01:80:c2:00:00:2x as MGMT.
> > +	 */
> > +	if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_G2_MGMT_EN_2X)) {
> 
> Please don't just move the old code. You shouldn't need flags anymore.

Hi Vivien

My aim is to get the 6390 supported. Refactoring is secondary. If it
helps implementing the 6390 code, i will refactor stuff. If it does
not help, i will leave it alone. It does not help here, so i left it
alone. Please feel free to submit a follow up patch refactoring this
further.

> The mv88e6xxx_ops actually implements the *features*. They can be
> prefixed for clarity (e.g. .ppu_*, port_*, .atu_*, etc.). They don't
> describe the register layout.
> 
> But we can discuss two ways of seeing this structure implementation:

or

3) We have a prefix for us humans to help us find the code. Now we
have ops, i cannot simply do M-. and emacs will take me to the
implementation. I have to search for it a bit. Having the hint g1_
tells me to go look in global1.c. Having the hint g2_ tells me to go
look in global2.c. Having the port_ tells me to go look in port.c.
Having no prefix tells me the code is scattered around and grep is my
friend.

The prefix is just a hint where the function is in the source
code. Nothing more.

      Andrew

^ permalink raw reply

* [PATCH] mlx4: Use kernel sizeof and alloc styles
From: Joe Perches @ 2016-12-04 20:11 UTC (permalink / raw)
  To: Yishai Hadas, Tariq Toukan; +Cc: netdev, linux-rdma, linux-kernel

Convert sizeof foo to sizeof(foo) and allocations with multiplications
to the appropriate kcalloc/kmalloc_array styles.

Signed-off-by: Joe Perches <joe@perches.com>
---
 drivers/net/ethernet/mellanox/mlx4/alloc.c         |  6 +++---
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |  8 ++++----
 drivers/net/ethernet/mellanox/mlx4/en_resources.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         |  2 +-
 drivers/net/ethernet/mellanox/mlx4/eq.c            | 20 +++++++++---------
 drivers/net/ethernet/mellanox/mlx4/fw.c            |  2 +-
 drivers/net/ethernet/mellanox/mlx4/icm.c           |  2 +-
 drivers/net/ethernet/mellanox/mlx4/icm.h           |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/intf.c          |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c          | 12 +++++------
 drivers/net/ethernet/mellanox/mlx4/mcg.c           | 12 +++++------
 drivers/net/ethernet/mellanox/mlx4/mr.c            | 18 ++++++++--------
 drivers/net/ethernet/mellanox/mlx4/qp.c            | 12 +++++------
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 24 +++++++++++-----------
 15 files changed, 63 insertions(+), 65 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
index 249a4584401a..c25de2740c78 100644
--- a/drivers/net/ethernet/mellanox/mlx4/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -185,8 +185,8 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
 	bitmap->avail = num - reserved_top - reserved_bot;
 	bitmap->effective_len = bitmap->avail;
 	spin_lock_init(&bitmap->lock);
-	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
-				sizeof (long), GFP_KERNEL);
+	bitmap->table = kcalloc(BITS_TO_LONGS(bitmap->max), sizeof(long),
+				GFP_KERNEL);
 	if (!bitmap->table)
 		return -ENOMEM;
 
@@ -668,7 +668,7 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device,
 {
 	struct mlx4_db_pgdir *pgdir;
 
-	pgdir = kzalloc(sizeof *pgdir, gfp);
+	pgdir = kzalloc(sizeof(*pgdir), gfp);
 	if (!pgdir)
 		return NULL;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index e36bebcab3f2..86e03e47ca47 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -2616,9 +2616,9 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev)
 	int i;
 	int err = 0;
 
-	priv->cmd.context = kmalloc(priv->cmd.max_cmds *
-				   sizeof (struct mlx4_cmd_context),
-				   GFP_KERNEL);
+	priv->cmd.context = kmalloc_array(priv->cmd.max_cmds,
+					  sizeof(struct mlx4_cmd_context),
+					  GFP_KERNEL);
 	if (!priv->cmd.context)
 		return -ENOMEM;
 
@@ -2675,7 +2675,7 @@ struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
 {
 	struct mlx4_cmd_mailbox *mailbox;
 
-	mailbox = kmalloc(sizeof *mailbox, GFP_KERNEL);
+	mailbox = kmalloc(sizeof(*mailbox), GFP_KERNEL);
 	if (!mailbox)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
index a6b0db0e0383..10966dc5792c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -44,7 +44,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct net_device *dev = priv->dev;
 
-	memset(context, 0, sizeof *context);
+	memset(context, 0, sizeof(*context));
 	context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET);
 	context->pd = cpu_to_be32(mdev->priv_pdn);
 	context->mtu_msgmax = 0xff;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..616d3febe7ce 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1235,7 +1235,7 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
 	}
 	qp->event = mlx4_en_sqp_event;
 
-	memset(context, 0, sizeof *context);
+	memset(context, 0, sizeof(*context));
 	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
 				qpn, ring->cqn, -1, context);
 	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 5de3cbe24f2b..233317e5fe72 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -660,7 +660,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
 			     void *fragptr)
 {
 	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
-	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof(*inl);
 	unsigned int hlen = skb_headlen(skb);
 
 	if (skb->len <= spc) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index cd3638e6fe25..01210cb044ed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -259,7 +259,7 @@ int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
 	if (!s_slave->active)
 		return 0;
 
-	memset(&eqe, 0, sizeof eqe);
+	memset(&eqe, 0, sizeof(eqe));
 
 	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
 	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
@@ -276,7 +276,7 @@ int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
 	/*don't send if we don't have the that slave */
 	if (dev->persist->num_vfs < slave)
 		return 0;
-	memset(&eqe, 0, sizeof eqe);
+	memset(&eqe, 0, sizeof(eqe));
 
 	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
 	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
@@ -295,7 +295,7 @@ int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
 	/*don't send if we don't have the that slave */
 	if (dev->persist->num_vfs < slave)
 		return 0;
-	memset(&eqe, 0, sizeof eqe);
+	memset(&eqe, 0, sizeof(eqe));
 
 	eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
 	eqe.subtype = port_subtype_change;
@@ -432,7 +432,7 @@ int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr)
 {
 	struct mlx4_eqe eqe;
 
-	memset(&eqe, 0, sizeof eqe);
+	memset(&eqe, 0, sizeof(eqe));
 
 	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
 	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO;
@@ -721,7 +721,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
 			}
 			memcpy(&priv->mfunc.master.comm_arm_bit_vector,
 			       eqe->event.comm_channel_arm.bit_vec,
-			       sizeof eqe->event.comm_channel_arm.bit_vec);
+			       sizeof(eqe->event.comm_channel_arm.bit_vec));
 			queue_work(priv->mfunc.master.comm_wq,
 				   &priv->mfunc.master.comm_work);
 			break;
@@ -986,15 +986,15 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
 	 */
 	npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE;
 
-	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
-				GFP_KERNEL);
+	eq->page_list = kmalloc_array(npages, sizeof(*eq->page_list),
+				      GFP_KERNEL);
 	if (!eq->page_list)
 		goto err_out;
 
 	for (i = 0; i < npages; ++i)
 		eq->page_list[i].buf = NULL;
 
-	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	dma_list = kmalloc_array(npages, sizeof(*dma_list), GFP_KERNEL);
 	if (!dma_list)
 		goto err_out_free;
 
@@ -1163,7 +1163,7 @@ int mlx4_alloc_eq_table(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
 	priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs,
-				    sizeof *priv->eq_table.eq, GFP_KERNEL);
+				    sizeof(*priv->eq_table.eq), GFP_KERNEL);
 	if (!priv->eq_table.eq)
 		return -ENOMEM;
 
@@ -1182,7 +1182,7 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	int i;
 
 	priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev),
-					 sizeof *priv->eq_table.uar_map,
+					 sizeof(*priv->eq_table.uar_map),
 					 GFP_KERNEL);
 	if (!priv->eq_table.uar_map) {
 		err = -ENOMEM;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 84bab9f0732e..2be6f0518baf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -57,7 +57,7 @@ MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)");
 	do {							      \
 		void *__p = (char *) (source) + (offset);	      \
 		u64 val;                                              \
-		switch (sizeof (dest)) {			      \
+		switch (sizeof(dest)) {				      \
 		case 1: (dest) = *(u8 *) __p;	    break;	      \
 		case 2: (dest) = be16_to_cpup(__p); break;	      \
 		case 4: (dest) = be32_to_cpup(__p); break;	      \
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 2a9dd460a95f..84fe587d5dfe 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -396,7 +396,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
 	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
 
-	table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
 	if (!table->icm)
 		return -ENOMEM;
 	table->virt     = virt;
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h
index 0c7364550150..304f2f1d26a5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.h
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.h
@@ -39,8 +39,8 @@
 #include <linux/mutex.h>
 
 #define MLX4_ICM_CHUNK_LEN						\
-	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
-	 (sizeof (struct scatterlist)))
+	((256 - sizeof(struct list_head) - 2 * sizeof(int)) /		\
+	 (sizeof(struct scatterlist)))
 
 enum {
 	MLX4_ICM_PAGE_SHIFT	= 12,
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 0e8b7c44931f..968b4b4ee5ca 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -53,7 +53,7 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
 {
 	struct mlx4_device_context *dev_ctx;
 
-	dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL);
+	dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL);
 	if (!dev_ctx)
 		return;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6f4e67bc3538..932cd312d131 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -907,10 +907,10 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 	mlx4_replace_zero_macs(dev);
 
 	dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
-	dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-	dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-	dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-	dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+	dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+	dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+	dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 
 	if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
 	    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy ||
@@ -2373,7 +2373,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 		dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2;
 	}
 	priv->eq_table.inta_pin = adapter.inta_pin;
-	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+	memcpy(dev->board_id, adapter.board_id, sizeof(dev->board_id));
 
 	return 0;
 
@@ -2845,7 +2845,7 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 		if (nreq > MAX_MSIX)
 			nreq = MAX_MSIX;
 
-		entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+		entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
 		if (!entries)
 			goto no_msi;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index 94b891c118c1..53841aa994b7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -161,7 +161,7 @@ static int new_steering_entry(struct mlx4_dev *dev, u8 port,
 		return -EINVAL;
 
 	s_steer = &mlx4_priv(dev)->steer[port - 1];
-	new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL);
+	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 	if (!new_entry)
 		return -ENOMEM;
 
@@ -174,7 +174,7 @@ static int new_steering_entry(struct mlx4_dev *dev, u8 port,
 	 */
 	pqp = get_promisc_qp(dev, port, steer, qpn);
 	if (pqp) {
-		dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+		dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
 		if (!dqp) {
 			err = -ENOMEM;
 			goto out_alloc;
@@ -273,7 +273,7 @@ static int existing_steering_entry(struct mlx4_dev *dev, u8 port,
 	}
 
 	/* add the qp as a duplicate on this index */
-	dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+	dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
 	if (!dqp)
 		return -ENOMEM;
 	dqp->qpn = qpn;
@@ -442,7 +442,7 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 port,
 		goto out_mutex;
 	}
 
-	pqp = kmalloc(sizeof *pqp, GFP_KERNEL);
+	pqp = kmalloc(sizeof(*pqp), GFP_KERNEL);
 	if (!pqp) {
 		err = -ENOMEM;
 		goto out_mutex;
@@ -513,7 +513,7 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 port,
 	/* add the new qpn to list of promisc qps */
 	list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
 	/* now need to add all the promisc qps to default entry */
-	memset(mgm, 0, sizeof *mgm);
+	memset(mgm, 0, sizeof(*mgm));
 	members_count = 0;
 	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) {
 		if (members_count == dev->caps.num_qp_per_mgm) {
@@ -1137,7 +1137,7 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
 		index += dev->caps.num_mgms;
 
 		new_entry = 1;
-		memset(mgm, 0, sizeof *mgm);
+		memset(mgm, 0, sizeof(*mgm));
 		memcpy(mgm->gid, gid, 16);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
index 395b5463cfd9..f490a29b6203 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
@@ -106,16 +106,15 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
 	buddy->max_order = max_order;
 	spin_lock_init(&buddy->lock);
 
-	buddy->bits = kcalloc(buddy->max_order + 1, sizeof (long *),
-			      GFP_KERNEL);
-	buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
-				  GFP_KERNEL);
+	buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), GFP_KERNEL);
+	buddy->num_free = kcalloc(buddy->max_order + 1,
+				  sizeof(*buddy->num_free), GFP_KERNEL);
 	if (!buddy->bits || !buddy->num_free)
 		goto err_out;
 
 	for (i = 0; i <= buddy->max_order; ++i) {
 		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
-		buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN);
+		buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL | __GFP_NOWARN);
 		if (!buddy->bits[i]) {
 			buddy->bits[i] = vzalloc(s * sizeof(long));
 			if (!buddy->bits[i])
@@ -706,13 +705,13 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 		return -ENOMEM;
 
 	dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle,
-				npages * sizeof (u64), DMA_TO_DEVICE);
+				npages * sizeof(u64), DMA_TO_DEVICE);
 
 	for (i = 0; i < npages; ++i)
 		mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
 
 	dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle,
-				   npages * sizeof (u64), DMA_TO_DEVICE);
+				   npages * sizeof(u64), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -796,8 +795,7 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 	int err;
 	int i;
 
-	page_list = kmalloc(buf->npages * sizeof *page_list,
-			    gfp);
+	page_list = kmalloc_array(buf->npages, sizeof(*page_list), gfp);
 	if (!page_list)
 		return -ENOMEM;
 
@@ -1056,7 +1054,7 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
 		return -EINVAL;
 
 	/* All MTTs must fit in the same page */
-	if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+	if (max_pages * sizeof(*fmr->mtts) > PAGE_SIZE)
 		return -EINVAL;
 
 	fmr->page_shift = page_shift;
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index d1cd9c32a9ae..ded4109865a0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -174,7 +174,7 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 			cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn));
 
 	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
-	memcpy(mailbox->buf + 8, context, sizeof *context);
+	memcpy(mailbox->buf + 8, context, sizeof(*context));
 
 	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
 		cpu_to_be32(qp->qpn);
@@ -825,10 +825,10 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 
 		/* In mfunc, calculate proxy and tunnel qp offsets for the PF here,
 		 * since the PF does not call mlx4_slave_caps */
-		dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-		dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-		dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
-		dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+		dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+		dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+		dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 
 		if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
 		    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) {
@@ -888,7 +888,7 @@ int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
 			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A,
 			   MLX4_CMD_WRAPPED);
 	if (!err)
-		memcpy(context, mailbox->buf + 8, sizeof *context);
+		memcpy(context, mailbox->buf + 8, sizeof(*context));
 
 	mlx4_free_cmd_mailbox(dev, mailbox);
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index c548beaaf910..ff11dc01c2d5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -999,7 +999,7 @@ static struct res_common *alloc_qp_tr(int id)
 {
 	struct res_qp *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1017,7 +1017,7 @@ static struct res_common *alloc_mtt_tr(int id, int order)
 {
 	struct res_mtt *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1033,7 +1033,7 @@ static struct res_common *alloc_mpt_tr(int id, int key)
 {
 	struct res_mpt *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1048,7 +1048,7 @@ static struct res_common *alloc_eq_tr(int id)
 {
 	struct res_eq *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1062,7 +1062,7 @@ static struct res_common *alloc_cq_tr(int id)
 {
 	struct res_cq *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1077,7 +1077,7 @@ static struct res_common *alloc_srq_tr(int id)
 {
 	struct res_srq *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1092,7 +1092,7 @@ static struct res_common *alloc_counter_tr(int id, int port)
 {
 	struct res_counter *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1107,7 +1107,7 @@ static struct res_common *alloc_xrcdn_tr(int id)
 {
 	struct res_xrcdn *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1121,7 +1121,7 @@ static struct res_common *alloc_fs_rule_tr(u64 id, int qpn)
 {
 	struct res_fs_rule *ret;
 
-	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 	if (!ret)
 		return NULL;
 
@@ -1233,7 +1233,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
 	struct rb_root *root = &tracker->res_tree[type];
 
-	res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL);
+	res_arr = kcalloc(count, sizeof(*res_arr), GFP_KERNEL);
 	if (!res_arr)
 		return -ENOMEM;
 
@@ -1986,7 +1986,7 @@ static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port,
 
 	if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port))
 		return -EINVAL;
-	res = kzalloc(sizeof *res, GFP_KERNEL);
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
 	if (!res) {
 		mlx4_release_resource(dev, slave, RES_MAC, 1, port);
 		return -ENOMEM;
@@ -3978,7 +3978,7 @@ static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
 	struct res_gid *res;
 	int err;
 
-	res = kzalloc(sizeof *res, GFP_KERNEL);
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
 	if (!res)
 		return -ENOMEM;
 
-- 
2.10.0.rc2.1.g053435c

^ permalink raw reply related

* Re: [PATCH v1 net-next 1/5] net: dsa: mv88e6xxx: Reserved Management frames to CPU
From: Vivien Didelot @ 2016-12-04 20:03 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1480736720-12608-2-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> +	/* Some generations have the configuration of sending reserved
> +	 * management frames to the CPU in global2, others in
> +	 * global1. Hence it does not fit the two setup functions
> +	 * above.
> +	 */
> +	if (chip->info->ops->mgmt_rsvd2cpu) {
> +		err = chip->info->ops->mgmt_rsvd2cpu(chip);
> +		if (err)
> +			goto unlock;
> +	}

Correct. The old mv88e6xxx_g*_setup() are bad. Ideally we'll get rid of
them once we have the complete set of features implemented in the API.

> +/* Offset 0x02: Management Enable 2x */
> +/* Offset 0x03: Management Enable 0x */
> +
> +int mv88e6095_g2_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip)
> +{
> +	int err;
> +
> +	/* Consider the frames with reserved multicast destination
> +	 * addresses matching 01:80:c2:00:00:2x as MGMT.
> +	 */
> +	if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_G2_MGMT_EN_2X)) {

Please don't just move the old code. You shouldn't need flags anymore.

> +		err = mv88e6xxx_g2_write(chip, GLOBAL2_MGMT_EN_2X, 0xffff);
> +		if (err)
> +			return err;
> +	}
> +
> +	/* Consider the frames with reserved multicast destination
> +	 * addresses matching 01:80:c2:00:00:0x as MGMT.
> +	 */
> +	if (mv88e6xxx_has(chip, MV88E6XXX_FLAG_G2_MGMT_EN_0X))
> +		return mv88e6xxx_g2_write(chip, GLOBAL2_MGMT_EN_0X, 0xffff);
> +
> +	return 0;
> +}

[...]

>  	int (*g1_set_cpu_port)(struct mv88e6xxx_chip *chip, int port);
>  	int (*g1_set_egress_port)(struct mv88e6xxx_chip *chip, int port);
> +
> +	/* Can be either in g1 or g2, so don't use a prefix */

We have to discuss this and find an agreement.

The mv88e6xxx_ops actually implements the *features*. They can be
prefixed for clarity (e.g. .ppu_*, port_*, .atu_*, etc.). They don't
describe the register layout.

But we can discuss two ways of seeing this structure implementation:

1) Either we describe the exact register layout, and provide
.g1_mgmt_rsv2cpu and .g2_mgmt_rsvd2cpu. One or the other being NULL.

This will have the impact of expliciting the features location. One can
say it'll ease adding support for new chips, but that's not true when
the feature at the same location is implemented differently (e.g. port's
speed, switch reset). This will make the structure unnecessarily big as
well as cluttering the wrapping code.

2) We describe the feature. No *location* prefix.

To explain this point, please understand what Marvell does with their
chips, using the example of Rsvd2CPU feature and (old-to-new) models:

  - 6060 doesn't have the feature
  - 6185 introduced one G2 register for 0x MAC addresses
  - 6352 added a second G2 register for 2x MAC addresses
  - 6390 packed the feature in a single-register indirect table in G1

That's all. They are just relocating features to free a few registers.

So .g1_set_cpu_port, .g1_reset, or whatever location-prefixed operation
does not make sense, unless you want to describe the register layout.
But we should not mix 1) and 2).

Thanks,

        Vivien

^ permalink raw reply

* Re: [GIT PULL nf-next 0/2] IPVS Updates for v4.10
From: Pablo Neira Ayuso @ 2016-12-04 19:46 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov
In-Reply-To: <1479200503-25842-1-git-send-email-horms@verge.net.au>

On Tue, Nov 15, 2016 at 10:01:41AM +0100, Simon Horman wrote:
> Hi Pablo,
> 
> please consider these enhancements to the IPVS for v4.10.
> 
> * Decrement the IP ttl in all the modes in order to prevent infinite
>   route loops. Thanks to Dwip Banerjee.
> * Use IS_ERR_OR_NULL macro. Clean-up from Gao Feng.
> 
> 
> The following changes since commit 7d384846b9987f7b611357adf3cdfecfdcf0c402:
> 
>   Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next (2016-11-13 22:41:25 -0500)
> 
> are available in the git repository at:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git tags/ipvs-for-v4.10

Pulled, thanks Simon.

^ permalink raw reply

* Re: [net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO
From: John Fastabend @ 2016-12-04 18:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161203010423-mutt-send-email-mst@kernel.org>

On 16-12-03 09:36 PM, Michael S. Tsirkin wrote:
> On Fri, Dec 02, 2016 at 12:49:45PM -0800, John Fastabend wrote:
>> This adds support for dynamically setting the LRO feature flag. The
>> message to control guest features in the backend uses the
>> CTRL_GUEST_OFFLOADS msg type.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  drivers/net/virtio_net.c |   45 ++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 44 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index a21d93a..d814e7cb 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
>>  	.set_settings = virtnet_set_settings,
>>  };
>>  
>> +static int virtnet_set_features(struct net_device *netdev,
>> +				netdev_features_t features)
>> +{
>> +	struct virtnet_info *vi = netdev_priv(netdev);
>> +	struct virtio_device *vdev = vi->vdev;
>> +	struct scatterlist sg;
>> +	u64 offloads = 0;
>> +
>> +	if (features & NETIF_F_LRO)
>> +		offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
>> +			    (1 << VIRTIO_NET_F_GUEST_TSO6);
>> +
>> +	if (features & NETIF_F_RXCSUM)
>> +		offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
>> +
>> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
>> +		sg_init_one(&sg, &offloads, sizeof(uint64_t));
>> +		if (!virtnet_send_command(vi,
>> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS,
>> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
>> +					  &sg)) {
>> +			dev_warn(&netdev->dev,
>> +				 "Failed to set guest offloads by virtnet command.\n");
>> +			return -EINVAL;
>> +		}
>> +	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
>> +		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> 
> No need for VIRTIO_F_VERSION_1 here.
> 

Yep Dave pointed it out as well. Also I found a bug on driver unload
path where XDP tx queues are not cleaned up correctly so will need a v5
for that.

Thanks,
John

^ permalink raw reply

* [PATCH v2 net-next] bnx2x: ethtool -x full support
From: Eric Dumazet @ 2016-12-04 18:15 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ariel Elior
In-Reply-To: <1476810487.5650.68.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

Implement ethtool -x full support, so that rss key can be fetched
instead of assuming it matches /proc/sys/net/core/netdev_rss_key
content.

We might add "ethtool --rxfh" support later to set a different rss key.

Tested:

lpk51:~# ethtool --show-rxfh eth0 | grep -A1 'hash key'
RSS hash key:
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac
lpk51:~# cat /proc/sys/net/core/netdev_rss_key
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac:22:1f:50:76:d4:c8:a5:20:7b:61:3c:0c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ariel Elior <ariel.elior@qlogic.com>
---
v2: support CONFIG_BNX2X_SRIOV=y

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c     |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c |   47 ++++++----
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c      |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h      |    5 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c    |    5 -
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index ed42c1009685..28af24ae0092 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2106,7 +2106,7 @@ int bnx2x_rss(struct bnx2x *bp, struct bnx2x_rss_config_obj *rss_obj,
 
 	if (config_hash) {
 		/* RSS keys */
-		netdev_rss_key_fill(params.rss_key, T_ETH_RSS_KEY * 4);
+		netdev_rss_key_fill(&rss_obj->rss_key, T_ETH_RSS_KEY * 4);
 		__set_bit(BNX2X_RSS_SET_SRCH, &params.rss_flags);
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 85a7800bfc12..28bc9479fc74 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3421,6 +3421,13 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev)
 	return T_ETH_INDIRECTION_TABLE_SIZE;
 }
 
+static u32 bnx2x_get_rxfh_key_size(struct net_device *dev)
+{
+	struct bnx2x *bp = netdev_priv(dev);
+
+	return (bp->port.pmf || !CHIP_IS_E1x(bp)) ? T_ETH_RSS_KEY * 4 : 0;
+}
+
 static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 			  u8 *hfunc)
 {
@@ -3430,23 +3437,30 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 
 	if (hfunc)
 		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
-		return 0;
 
-	/* Get the current configuration of the RSS indirection table */
-	bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
-
-	/*
-	 * We can't use a memcpy() as an internal storage of an
-	 * indirection table is a u8 array while indir->ring_index
-	 * points to an array of u32.
-	 *
-	 * Indirection table contains the FW Client IDs, so we need to
-	 * align the returned table to the Client ID of the leading RSS
-	 * queue.
-	 */
-	for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
-		indir[i] = ind_table[i] - bp->fp->cl_id;
+	if (key) {
+		if (bp->port.pmf || !CHIP_IS_E1x(bp))
+			memcpy(key, &bp->rss_conf_obj.rss_key, T_ETH_RSS_KEY * 4);
+		else
+			memset(key, 0, T_ETH_RSS_KEY * 4);
+	}
+
+	if (indir) {
+		/* Get the current configuration of the RSS indirection table */
+		bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
+
+		/*
+		 * We can't use a memcpy() as an internal storage of an
+		 * indirection table is a u8 array while indir->ring_index
+		 * points to an array of u32.
+		 *
+		 * Indirection table contains the FW Client IDs, so we need to
+		 * align the returned table to the Client ID of the leading RSS
+		 * queue.
+		 */
+		for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
+			indir[i] = ind_table[i] - bp->fp->cl_id;
+	}
 
 	return 0;
 }
@@ -3628,6 +3642,7 @@ static const struct ethtool_ops bnx2x_ethtool_ops = {
 	.get_ethtool_stats	= bnx2x_get_ethtool_stats,
 	.get_rxnfc		= bnx2x_get_rxnfc,
 	.set_rxnfc		= bnx2x_set_rxnfc,
+	.get_rxfh_key_size	= bnx2x_get_rxfh_key_size,
 	.get_rxfh_indir_size	= bnx2x_get_rxfh_indir_size,
 	.get_rxfh		= bnx2x_get_rxfh,
 	.set_rxfh		= bnx2x_set_rxfh,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index cea6bdcde33f..3b1becc23160 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -4538,7 +4538,7 @@ static int bnx2x_setup_rss(struct bnx2x *bp,
 	/* RSS keys */
 	if (test_bit(BNX2X_RSS_SET_SRCH, &p->rss_flags)) {
 		u8 *dst = (u8 *)(data->rss_key) + sizeof(data->rss_key);
-		const u8 *src = (const u8 *)p->rss_key;
+		const u8 *src = (const u8 *)o->rss_key;
 		int i;
 
 		/* Apparently, bnx2x reads this array in reverse order
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
index 0bf2fd470819..0092991796d3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
@@ -744,9 +744,6 @@ struct bnx2x_config_rss_params {
 	/* Indirection table */
 	u8		ind_table[T_ETH_INDIRECTION_TABLE_SIZE];
 
-	/* RSS hash values */
-	u32		rss_key[10];
-
 	/* valid only iff BNX2X_RSS_UPDATE_TOE is set */
 	u16		toe_rss_bitmap;
 };
@@ -760,6 +757,8 @@ struct bnx2x_rss_config_obj {
 	/* Last configured indirection table */
 	u8			ind_table[T_ETH_INDIRECTION_TABLE_SIZE];
 
+	u32     rss_key[T_ETH_RSS_KEY];
+
 	/* flags for enabling 4-tupple hash on UDP */
 	u8			udp_rss_v4;
 	u8			udp_rss_v6;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index bfae300cf25f..9a8f36f12b07 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -811,7 +811,8 @@ int bnx2x_vfpf_config_rss(struct bnx2x *bp,
 		      sizeof(struct channel_list_end_tlv));
 
 	memcpy(req->ind_table, params->ind_table, T_ETH_INDIRECTION_TABLE_SIZE);
-	memcpy(req->rss_key, params->rss_key, sizeof(params->rss_key));
+	memcpy(req->rss_key, params->rss_obj->rss_key,
+	       sizeof(params->rss_obj->rss_key));
 	req->ind_table_size = T_ETH_INDIRECTION_TABLE_SIZE;
 	req->rss_key_size = T_ETH_RSS_KEY;
 	req->rss_result_mask = params->rss_result_mask;
@@ -1985,7 +1986,7 @@ static void bnx2x_vf_mbx_update_rss(struct bnx2x *bp, struct bnx2x_virtf *vf,
 	/* set vfop params according to rss tlv */
 	memcpy(rss.ind_table, rss_tlv->ind_table,
 	       T_ETH_INDIRECTION_TABLE_SIZE);
-	memcpy(rss.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
+	memcpy(&vf->rss_conf_obj.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
 	rss.rss_obj = &vf->rss_conf_obj;
 	rss.rss_result_mask = rss_tlv->rss_result_mask;
 

^ permalink raw reply related

* Re: Trigger EHOSTUNREACH
From: Neal Cardwell @ 2016-12-04 17:51 UTC (permalink / raw)
  To: Marco Zunino; +Cc: Netdev
In-Reply-To: <CAP__gDkhM82EP6pP0NBHAYME0iPkEf49aeOp9C3dYQcCpdetrA@mail.gmail.com>

On Sun, Dec 4, 2016 at 7:04 AM, Marco Zunino <eng.marco.zunino@gmail.com> wrote:
> Hallo everyone, hope you are having a good day
> we are building a networking testing tool to simulate network error
> condition, and we are having difficulties triggering the EHOSTUNREACH
> socket error.
>
> We are trying to trigger this error by sending an ICMP packet type=3
> code=3 on an open STREAM socket, but it has no effect.
>
> Based on RFC1122 and the code here
>
> https://github.com/torvalds/linux/blob/e76d21c40bd6c67fd4e2c1540d77e113df962b4d/net/ipv4/tcp_ipv4.c#L353
>
> I would expect the this ICMP packet to abort the socket connection
> with a EHOSTUNREACH error on the client side, but this does not
> happen.

In my quick tests with packetdrill, it looks like Linux will not
immediately pass EHOSTUNREACH to the application unless the
application has requested this with setsockopt(SOL_IP, IP_RECVERR).

Specifically, the following packetdrill test passes for me:
---
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+.020 < . 1:1(0) ack 1 win 257
   +0 accept(3, ..., ...) = 4
   +0 setsockopt(4, SOL_IP, IP_RECVERR, [1], 4) = 0
   +0 write(4, ..., 1000) = 1000
   +0 > P. 1:1001(1000) ack 1

+.010 < icmp unreachable host_unreachable [1:1461(1460)]

   +0 write(4, ..., 1) = -1 EHOSTUNREACH (No route to host)
---

But without the setsockopt(SOL_IP, IP_RECVERR) there is no error upon
the second write().

My reading of RFC 1122 is that this is consistent with the RFC.

RFC 1122 section 3.2.2.1 says:

            A Destination Unreachable message that is received with code
            0 (Net), 1 (Host), or 5 (Bad Source Route) may result from a
            routing transient and MUST therefore be interpreted as only
            a hint, not proof, that the specified destination is
            unreachable [IP:11].

So it seems that the RFC is suggesting that by default an ICMP host
unreachable should not cause an immediate error for the connection.
Instead, it should be used as a hint as to the cause of the problem if
TCP's normal reliable delivery mechanisms ultimately timeout and fail.

neal

^ permalink raw reply

* [PATCH v3 net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 17:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835882.18162.462.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)

In this rewrite :

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory need when estimators are not used, since
  we store a pointer, instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
v3: Renamed some parameters to please make htmldocs
v2: Removed unwanted changes to tcp_output.c

 include/net/act_api.h              |    2 
 include/net/gen_stats.h            |   17 -
 include/net/netfilter/xt_rateest.h |   10 
 include/net/sch_generic.h          |    2 
 net/core/gen_estimator.c           |  299 +++++++++------------------
 net/core/gen_stats.c               |   20 -
 net/netfilter/xt_RATEEST.c         |    4 
 net/netfilter/xt_rateest.c         |   28 +-
 net/sched/act_api.c                |    9 
 net/sched/act_police.c             |   21 +
 net/sched/sch_api.c                |    2 
 net/sched/sch_cbq.c                |    6 
 net/sched/sch_drr.c                |    6 
 net/sched/sch_generic.c            |    2 
 net/sched/sch_hfsc.c               |    6 
 net/sched/sch_htb.c                |    6 
 net/sched/sch_qfq.c                |    8 
 17 files changed, 182 insertions(+), 266 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69ccbcb003cfa66bcc0de337f78f3dae..1d716449209e4753a297c61a287077a1eb96e6d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
 	struct gnet_stats_queue		tcfa_qstats;
-	struct gnet_stats_rate_est64	tcfa_rate_est;
+	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9c72075e7e6dde3655d631f64a1c4..8b7aa370e7a4af61fcb71ed751dba72ebead6143 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct net_rate_estimator;
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
-			     const struct gnet_stats_basic_packed *b,
-			     struct gnet_stats_rate_est64 *r);
+			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+			struct gnet_stats_rate_est64 *sample);
 #endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31eaa54deb8437ef4b883bec78def84..130e58361f998ecdf361910b196e69778224d955 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
 #ifndef _XT_RATEEST_H
 #define _XT_RATEEST_H
 
+#include <net/gen_stats.h>
+
 struct xt_rateest {
 	/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
-	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est64	rstats;
+
 
 	/* following fields not accessed in hot path */
+	unsigned int			refcnt;
 	struct hlist_node		list;
 	char				name[IFNAMSIZ];
-	unsigned int			refcnt;
 	struct gnet_estimator		params;
 	struct rcu_head			rcu;
+
+	/* keep this field far away to speedup xt_rateest_mt() */
+	struct net_rate_estimator __rcu *rate_est;
 };
 
 struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672a802dce583166f4f808154b77a96..498f81b229a4565e140f1a054ce931e80361e8b2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
 
 	struct netdev_queue	*dev_queue;
 
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0993844faeeaefb221ea619fd9d796600b82fbb9..101b5d0e2142e6a813f6b524a59945379f02bcb3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Eric Dumazet <edumazet@google.com>
  *
  * Changes:
  *              Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,171 +31,79 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <net/gen_stats.h>
 
-/*
-   This code is NOT intended to be used for statistics collection,
-   its purpose is to provide a base for statistical multiplexing
-   for controlled load service.
-   If you need only statistics, run a user level daemon which
-   periodically reads byte counters.
-
-   Unfortunately, rate estimation is not a very easy task.
-   F.e. I did not find a simple way to estimate the current peak rate
-   and even failed to formulate the problem 8)8)
-
-   So I preferred not to built an estimator into the scheduler,
-   but run this task separately.
-   Ideally, it should be kernel thread(s), but for now it runs
-   from timers, which puts apparent top bounds on the number of rated
-   flows, has minimal overhead on small, but is enough
-   to handle controlled load service, sets of aggregates.
-
-   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
-   avrate = avrate*(1-W) + rate*W
-
-   where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
-   The resulting time constant is:
-
-   T = A/(-ln(1-W))
-
-
-   NOTES.
-
-   * avbps and avpps are scaled by 2^5.
-   * both values are reported as 32 bit unsigned values. bps can
-     overflow for fast links : max speed being 34360Mbit/sec
-   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
-     for HZ=100 and HZ=1024 8)), maximal interval
-     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
-     are too expensive, longer ones can be implemented
-     at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
  */
 
-#define EST_MAX_INTERVAL	5
-
-struct gen_estimator {
-	struct list_head	list;
+struct net_rate_estimator {
 	struct gnet_stats_basic_packed	*bstats;
-	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
 	seqcount_t		*running;
-	int			ewma_log;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	u8			ewma_log;
+	u8			intvl_log; /* period : (250ms << intvl_log) */
+
+	seqcount_t		seq;
 	u32			last_packets;
-	unsigned long		avpps;
 	u64			last_bytes;
+
+	u64			avpps;
 	u64			avbps;
-	struct rcu_head		e_rcu;
-	struct rb_node		node;
-	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
-	struct rcu_head		head;
-};
 
-struct gen_estimator_head {
-	unsigned long		next_jiffies;
-	struct timer_list	timer;
-	struct list_head	list;
+	unsigned long           next_jiffies;
+	struct timer_list       timer;
+	struct rcu_head		rcu;
 };
 
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+			       struct gnet_stats_basic_packed *b)
 {
-	int idx = (int)arg;
-	struct gen_estimator *e;
+	if (e->stats_lock)
+		spin_lock(e->stats_lock);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		struct gnet_stats_basic_packed b = {0};
-		unsigned long rate;
-		u64 brate;
-
-		if (e->stats_lock)
-			spin_lock(e->stats_lock);
-		read_lock(&est_lock);
-		if (e->bstats == NULL)
-			goto skip;
-
-		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
-		brate = (b.bytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = b.bytes;
-		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
-		rate = b.packets - e->last_packets;
-		rate <<= (7 - idx);
-		e->last_packets = b.packets;
-		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
-		read_unlock(&est_lock);
-		if (e->stats_lock)
-			spin_unlock(e->stats_lock);
-	}
+	__gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
 
-	if (!list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies += ((HZ/4) << idx);
+	if (e->stats_lock)
+		spin_unlock(e->stats_lock);
 
-		if (unlikely(time_after_eq(jiffies, elist[idx].next_jiffies))) {
-			/* Ouch... timer was delayed. */
-			elist[idx].next_jiffies = jiffies + 1;
-		}
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	rcu_read_unlock();
 }
 
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
 {
-	struct rb_node **p = &est_root.rb_node, *parent = NULL;
+	struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+	struct gnet_stats_basic_packed b;
+	u64 rate, brate;
 
-	while (*p) {
-		struct gen_estimator *e;
+	est_fetch_counters(est, &b);
+	brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+	brate -= (est->avbps >> est->ewma_log);
 
-		parent = *p;
-		e = rb_entry(parent, struct gen_estimator, node);
+	rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+	rate -= (est->avpps >> est->ewma_log);
 
-		if (est->bstats > e->bstats)
-			p = &parent->rb_right;
-		else
-			p = &parent->rb_left;
-	}
-	rb_link_node(&est->node, parent, p);
-	rb_insert_color(&est->node, &est_root);
-}
-
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
-				    const struct gnet_stats_rate_est64 *rate_est)
-{
-	struct rb_node *p = est_root.rb_node;
+	write_seqcount_begin(&est->seq);
+	est->avbps += brate;
+	est->avpps += rate;
+	write_seqcount_end(&est->seq);
 
-	while (p) {
-		struct gen_estimator *e;
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
 
-		e = rb_entry(p, struct gen_estimator, node);
+	est->next_jiffies += ((HZ/4) << est->intvl_log);
 
-		if (bstats > e->bstats)
-			p = p->rb_right;
-		else if (bstats < e->bstats || rate_est != e->rate_est)
-			p = p->rb_left;
-		else
-			return e;
+	if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+		/* Ouch... timer was delayed. */
+		est->next_jiffies = jiffies + 1;
 	}
-	return NULL;
+	mod_timer(&est->timer, est->next_jiffies);
 }
 
 /**
@@ -217,84 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running,
 		      struct nlattr *opt)
 {
-	struct gen_estimator *est;
 	struct gnet_estimator *parm = nla_data(opt);
-	struct gnet_stats_basic_packed b = {0};
-	int idx;
+	struct net_rate_estimator *old, *est;
+	struct gnet_stats_basic_packed b;
+	int intvl_log;
 
 	if (nla_len(opt) < sizeof(*parm))
 		return -EINVAL;
 
+	/* allowed timer periods are :
+	 * -2 : 250ms,   -1 : 500ms,    0 : 1 sec
+	 *  1 : 2 sec,    2 : 4 sec,    3 : 8 sec
+	 */
 	if (parm->interval < -2 || parm->interval > 3)
 		return -EINVAL;
 
 	est = kzalloc(sizeof(*est), GFP_KERNEL);
-	if (est == NULL)
+	if (!est)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
-	idx = parm->interval + 2;
+	seqcount_init(&est->seq);
+	intvl_log = parm->interval + 2;
 	est->bstats = bstats;
-	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
 	est->running  = running;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = b.bytes;
-	est->avbps = rate_est->bps<<5;
-	est->last_packets = b.packets;
-	est->avpps = rate_est->pps<<10;
+	est->intvl_log = intvl_log;
 	est->cpu_bstats = cpu_bstats;
 
-	spin_lock_bh(&est_tree_lock);
-	if (!elist[idx].timer.function) {
-		INIT_LIST_HEAD(&elist[idx].list);
-		setup_timer(&elist[idx].timer, est_timer, idx);
+	est_fetch_counters(est, &b);
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
+	old = rcu_dereference_protected(*rate_est, 1);
+	if (old) {
+		del_timer_sync(&old->timer);
+		est->avbps = old->avbps;
+		est->avpps = old->avpps;
 	}
 
-	if (list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies = jiffies + ((HZ/4) << idx);
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	list_add_rcu(&est->list, &elist[idx].list);
-	gen_add_node(est);
-	spin_unlock_bh(&est_tree_lock);
+	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+	setup_timer(&est->timer, est_timer, (unsigned long)est);
+	mod_timer(&est->timer, est->next_jiffies);
 
+	rcu_assign_pointer(*rate_est, est);
+	if (old)
+		kfree_rcu(old, rcu);
 	return 0;
 }
 EXPORT_SYMBOL(gen_new_estimator);
 
 /**
  * gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
  *
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
  *
- * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
 {
-	struct gen_estimator *e;
-
-	spin_lock_bh(&est_tree_lock);
-	while ((e = gen_find_node(bstats, rate_est))) {
-		rb_erase(&e->node, &est_root);
+	struct net_rate_estimator *est;
 
-		write_lock(&est_lock);
-		e->bstats = NULL;
-		write_unlock(&est_lock);
-
-		list_del_rcu(&e->list);
-		kfree_rcu(e, e_rcu);
+	est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+	if (est) {
+		del_timer_sync(&est->timer);
+		kfree_rcu(est, rcu);
 	}
-	spin_unlock_bh(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 
@@ -314,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **rate_est,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt)
 {
-	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est,
+				 stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
 /**
  * gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
  *
  * Returns true if estimator is active, and false if not.
  */
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
 {
-	bool res;
+	return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
 
-	ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+			struct gnet_stats_rate_est64 *sample)
+{
+	struct net_rate_estimator *est;
+	unsigned seq;
+
+	rcu_read_lock();
+	est = rcu_dereference(*rate_est);
+	if (!est) {
+		rcu_read_unlock();
+		return false;
+	}
 
-	spin_lock_bh(&est_tree_lock);
-	res = gen_find_node(bstats, rate_est) != NULL;
-	spin_unlock_bh(&est_tree_lock);
+	do {
+		seq = read_seqcount_begin(&est->seq);
+		sample->bps = est->avbps >> 8;
+		sample->pps = est->avpps >> 8;
+	} while (read_seqcount_retry(&est->seq, seq));
 
-	return res;
+	rcu_read_unlock();
+	return true;
 }
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb62627e61b5065b2325edd1b84f2e..87f28557b3298f30b1cf79dccf3bc6b761cd6623 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
  *
  * Appends the rate estimator statistics to the top level TLV created by
  * gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
  */
 int
 gnet_stats_copy_rate_est(struct gnet_dump *d,
-			 const struct gnet_stats_basic_packed *b,
-			 struct gnet_stats_rate_est64 *r)
+			 struct net_rate_estimator __rcu **rate_est)
 {
+	struct gnet_stats_rate_est64 sample;
 	struct gnet_stats_rate_est est;
 	int res;
 
-	if (b && !gen_estimator_active(b, r))
+	if (!gen_estimator_read(rate_est, &sample))
 		return 0;
-
-	est.bps = min_t(u64, UINT_MAX, r->bps);
+	est.bps = min_t(u64, UINT_MAX, sample.bps);
 	/* we have some time before reaching 2^32 packets per second */
-	est.pps = r->pps;
+	est.pps = sample.pps;
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
 	if (d->tail) {
 		res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
 				      TCA_STATS_PAD);
-		if (res < 0 || est.bps == r->bps)
+		if (res < 0 || est.bps == sample.bps)
 			return res;
 		/* emit 64bit stats only if needed */
-		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
-				       TCA_STATS_PAD);
+		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+				       sizeof(sample), TCA_STATS_PAD);
 	}
 
 	return 0;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97435d2937c92fd79a035fd5828965..91a373a3f534de8d8641341c33c08d8fc49cbd29 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
-		gen_kill_estimator(&est->bstats, &est->rstats);
+		gen_kill_estimator(&est->rate_est);
 		/*
 		 * gen_estimator est_timer() might access est->lock or bstats,
 		 * wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	cfg.est.interval	= info->interval;
 	cfg.est.ewma_log	= info->ewma_log;
 
-	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+	ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
 				&est->lock, NULL, &cfg.opt);
 	if (ret < 0)
 		goto err2;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a84c949080c8c32827b604c80da64..1db02f6fca54d7eb5d60c2d8c5cb8ab11656fbcb 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
 xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_rateest_match_info *info = par->matchinfo;
-	struct gnet_stats_rate_est64 *r;
+	struct gnet_stats_rate_est64 sample = {0};
 	u_int32_t bps1, bps2, pps1, pps2;
 	bool ret = true;
 
-	spin_lock_bh(&info->est1->lock);
-	r = &info->est1->rstats;
+	gen_estimator_read(&info->est1->rate_est, &sample);
+
 	if (info->flags & XT_RATEEST_MATCH_DELTA) {
-		bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
-		pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+		bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+		pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
 	} else {
-		bps1 = r->bps;
-		pps1 = r->pps;
+		bps1 = sample.bps;
+		pps1 = sample.pps;
 	}
-	spin_unlock_bh(&info->est1->lock);
 
 	if (info->flags & XT_RATEEST_MATCH_ABS) {
 		bps2 = info->bps2;
 		pps2 = info->pps2;
 	} else {
-		spin_lock_bh(&info->est2->lock);
-		r = &info->est2->rstats;
+		gen_estimator_read(&info->est2->rate_est, &sample);
+
 		if (info->flags & XT_RATEEST_MATCH_DELTA) {
-			bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
-			pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+			bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+			pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
 		} else {
-			bps2 = r->bps;
-			pps2 = r->pps;
+			bps2 = sample.bps;
+			pps2 = sample.pps;
 		}
-		spin_unlock_bh(&info->est2->lock);
 	}
 
 	switch (info->mode) {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1caa3b6dd1cc8773920beb1885f9b0..2095c83ce7730d49550103a8efad943d28815617 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfa_head);
 	spin_unlock_bh(&hinfo->lock);
-	gen_kill_estimator(&p->tcfa_bstats,
-			   &p->tcfa_rate_est);
+	gen_kill_estimator(&p->tcfa_rate_est);
 	/*
 	 * gen_estimator est_timer() might access p->tcfa_lock
 	 * or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
 	if (est)
-		gen_kill_estimator(&a->tcfa_bstats,
-				   &a->tcfa_rate_est);
+		gen_kill_estimator(&a->tcfa_rate_est);
 	call_rcu(&a->tcfa_rcu, free_tcf);
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 		goto errout;
 
 	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
-				     &p->tcfa_rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
 				  &p->tcfa_qstats,
 				  p->tcfa_qstats.qlen) < 0)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c990b73a6c85151862c30971bd3787d20dbcecd1..0ba91d1ce99480c4817684dbe0b4fde61811e03e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 			goto failure_unlock;
 	} else if (tb[TCA_POLICE_AVRATE] &&
 		   (ret == ACT_P_CREATED ||
-		    !gen_estimator_active(&police->tcf_bstats,
-					  &police->tcf_rate_est))) {
+		    !gen_estimator_active(&police->tcf_rate_est))) {
 		err = -EINVAL;
 		goto failure_unlock;
 	}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 	bstats_update(&police->tcf_bstats, skb);
 	tcf_lastuse_update(&police->tcf_tm);
 
-	if (police->tcfp_ewma_rate &&
-	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
-		police->tcf_qstats.overlimits++;
-		if (police->tcf_action == TC_ACT_SHOT)
-			police->tcf_qstats.drops++;
-		spin_unlock(&police->tcf_lock);
-		return police->tcf_action;
+	if (police->tcfp_ewma_rate) {
+		struct gnet_stats_rate_est64 sample;
+
+		if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+		    sample.bps >= police->tcfp_ewma_rate) {
+			police->tcf_qstats.overlimits++;
+			if (police->tcf_action == TC_ACT_SHOT)
+				police->tcf_qstats.drops++;
+			spin_unlock(&police->tcf_lock);
+			return police->tcf_action;
+		}
 	}
 
 	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1bdd1d4a4cac738f9fe1fbd42e5984174fe..d7b93429f0cca61ff489536b4247f31f3690fdd8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 				  &d, cpu_bstats, &q->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 		goto nla_put_failure;
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfbb4acb5d89511b5e0030b4c9cf26e..9ffe1c220b02848032474fa7b9b2c2f09d40b110 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
 	psched_time_t		penalized;
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct tc_cbq_xstats	xstats;
 
 	struct tcf_proto __rcu	*filter_list;
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
 		return -1;
 
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->link)
 		kfree(cl);
 }
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef848db47cc33a878296693dabb44955..bb4cbdf7500482b170eef6e7923cf2f2259e52b5 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
 
 	struct gnet_stats_basic_packed		bstats;
 	struct gnet_stats_queue		qstats;
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct list_head		alist;
 	struct Qdisc			*qdisc;
 
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
 {
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c28187cc888cebf004e61270f00871..6eb9c8e88519a36fc3ef82be7c7f1dbe0ef1e13c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	gen_kill_estimator(&qdisc->rate_est);
 	if (ops->reset)
 		ops->reset(qdisc);
 	if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e1abfc0657bce779a7df852f66384..3ffaa6fb0990f0aa31487a2f1829b1f1accf8b21 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct tcf_proto __rcu *filter_list; /* filter list */
 	unsigned int	filter_cnt;	/* filter count */
 	unsigned int	level;		/* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->qdisc);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->root)
 		kfree(cl);
 }
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.rtwork  = cl->cl_cumul;
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9926fe4f3b6f7db9aeba22fbbfae3d47c4e2af60..760f39e7caeeb91b6c34d561f64f1ed97c884a32 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
 	unsigned int		children;
 	struct htb_class	*parent;	/* parent class */
 
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 
 	/*
 	 * Written often fields
@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
 		return -1;
 
@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		WARN_ON(!cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f74356f47dffcc2262f233ee50f0c47c..f9e712ce2d15ce9280c31d2f75d62b84034ae51d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct Qdisc *qdisc;
 	struct list_head alist;		/* Link for active-classes list. */
 	struct qfq_aggregate *agg;	/* Parent aggregate. */
@@ -508,7 +508,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
 		if (new_agg == NULL) {
 			err = -ENOBUFS;
-			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			gen_kill_estimator(&cl->rate_est);
 			goto destroy_class;
 		}
 		sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 	struct qfq_sched *q = qdisc_priv(sch);
 
 	qfq_rm_from_agg(q, cl);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL,
 				  &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
 		return -1;



^ permalink raw reply related

* Re: [PATCN v2 net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 17:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835882.18162.462.camel@edumazet-glaptop3.roam.corp.google.com>

On Sat, 2016-12-03 at 23:18 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> 1) Old code was hard to maintain, due to complex lock chains.
>    (We probably will be able to remove some kfree_rcu() in callers)
> 
> 2) Using a single timer to update all estimators does not scale.
> 
> 3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
>    is not supposed to work well)

A v3 is under way, fixing a last "make htmldocs" warning.

^ permalink raw reply

* Re: [flamebait] xdp Was: Re: bpf bounded loops. Was: [flamebait] xdp
From: Hannes Frederic Sowa @ 2016-12-04 16:05 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tom Herbert, Thomas Graf, Linux Kernel Network Developers,
	Daniel Borkmann, David S. Miller
In-Reply-To: <20161202233430.GA58522@ast-mbp>

Hello,

On 03.12.2016 00:34, Alexei Starovoitov wrote:
> On Fri, Dec 02, 2016 at 08:42:41PM +0100, Hannes Frederic Sowa wrote:
>> On Fri, Dec 2, 2016, at 20:25, Hannes Frederic Sowa wrote:
>>> On 02.12.2016 19:39, Alexei Starovoitov wrote:
>>>> On Thu, Dec 01, 2016 at 10:27:12PM +0100, Hannes Frederic Sowa wrote:
>>>>> like") and the problematic of parsing DNS packets in XDP due to string
>>>>> processing and looping inside eBPF.
>>>>
>>>> Hannes,
>>>> Not too long ago you proposed a very interesting idea to add
>>>> support for bounded loops without adding any new bpf instructions and
>>>> changing llvm (which was way better than my 'rep' like instructions
>>>> I was experimenting with). I thought systemtap guys also wanted bounded
>>>> loops and you were cooperating on the design, so I gave up on my work and
>>>> was expecting an imminent patch from you. I guess it sounds like you know
>>>> believe that bounded loops are impossible or I misunderstand your statement ?
>>>
>>> Your argument was that it would need a new verifier as the current first
>>> pass checks that we indeed can lay out the basic blocks as a DAG which
>>> the second pass depends on. This would be violated.
> 
> yes. today the main part of verifier depends on cfg check that confirms DAG
> property of the program. This was done as a simplification for the algorithm,
> so any programmer that understands C can understand the verifier code.
> It certainly was the case, since most of the people who hacked
> verifier had zero compiler background.
> Now I'm thinking to introduce proper compiler technologies to it.
> On one side it will make the bar to understand higher and on the other
> side it will cleanup the logic and reuse tens of years of data flow
> analysis theory and will make verifier more robust and mathematically
> solid.

See below.

>>> Because eBPF is available by non privileged users this would need a lot
>>> of effort to rewrite and verify (or indeed keep two verifiers in the
>>> kernel for priv and non-priv). The verifier itself is exposed to
>>> unprivileged users.
> 
> I certainly hear your concerns that people unfamiliar with it are simply
> scared that more and more verification logic being added. So I don't mind
> freezing current verifier for unpriv and let proper data flow analysis
> to be done in root only component.
> 
>>> Also, by design, if we keep the current limits, this would not give you
>>> more instructions to operate on compared to the flattened version of the
>>> program, it would merely reduce the numbers of optimizations in LLVM
>>> that let the verifier reject the program.
> 
> I think we most likely will keep 4k insn limit (since there were no
> requests to increase it). The bounded loops will improve performance
> and reduce I-cache misses.

I agree that bounded loops will increase performance and in general I
see lifting this limitation as something good if it works out.

>> The only solution to protect the verifier, which I saw, would be to
>> limit it by time and space, thus making loading of eBPF programs
>> depending on how fast and hot (thermal throttling) one CPU thread is.
> 
> the verifier already has time and space limits.
> See no reason to rely on physical cpu sensors.

Time and space is bounded by the DAG property. It is still bounded in
the directed cyclic case (some arbitrary upper limit), but can have a
combinatorical explosion because of the switch from proving properties
for each node+state to prove properties for each path+state.

Compiler algorithms maybe a help here, but historically have focused on
other properties, mainly optimization and thus are mostly heuristics.

Compiler developers don't write the algorithm under the assumption they
will execute in a security and resource sensitive environment (only the
generated code should be safe). I believe that optimization algorithms
increase the attack surface, as their big-O worst case might be an
additional cost, additionally to the worst case verification path. I
don't think compiler engineers think about the code to optimize
attacking the optimization algorithm itself.

Verification of a malicious BPF program (complexity bomb) in the kernel
should not disrupt nor create a security threat (we are also mostly
voluntary preemptible in this code and hold a lock). Verification might
fail when memory fragmentation becomes more probably, as the huge state
table for path sensitive verification cannot be allocated.

In user space, instead of verification of many properties regarding
program state (which you actually need in the BPF verifier), the
development effort concentrates on sanitizers. Otherwise look how good
gcc is in finding uninitialized variables.

Most mathematical proofs that are written for compiler optimization I
know of are written to show equivalence between program text and the
optimization. Compile time recently became a more important aspect of
the open source compilers at least.

I am happy to be shown wrong here and assume there is no better
algorithm, which is reasonable easy for the kernel - I am a pessimist
but am happy to look at proposals.

>> Those are the complexity problems I am talking and concerned about.
> 
> Do you have concerns when people implement encryption algorithm
> that you're unfamiliar with?

Absolutely not, this can already be done in user space very much the
same, I can remove it, delete it, uninstall it. APIs in the kernel stay
forever.

> Isn't it much bigger concern, since any bugs in the algorithm
> are directly exploitable and when encryption is actually used
> it's protecting sensitive data, whereas here the verifier
> protects kernel from crashing.

Bugs happen everywhere, but a bug in the kernel itself creates a whole
bigger mess than a bug in a security constrained user space application
with proper privilege drops (at least it should).

The complexity arguments in random order:

1) The code itself - if the route is taken to provide two verifiers for
eBPF, this certainly comes with a maintenance cost to keep them secure
(secure as in the verifier itself should not expose a threat neither is
the to be verified program allowed to leak data or exceed its address
space nor some time budget).

If one of those eBPF verifiers only accepts a certain number of INSN, as
fundamental as backwards jumps, we might end up with two compiler?

At some point Google might start to fuzz them like right now other
fundamental parts of the kernel and someone must review all the code and
fix them in both of them.

If we shift verification to new heuristics programs will still fail,
just under new conditions, this exposes complexity to the users.
Libraries cannot even be exchanged between those twos.

2) Integration and API considerations:

E.g. the newly added cgroup socket bpf interface, which can modify
bound_dev_if. If you restart networking (old way
/etc/init.d/networking), those if_indexes are all outdated and the
system will behave strangely. Network management software needs to walk
cgroup (a different kernel subsystem) now, too, in order to refresh
outdated information, recompile and insert. Even worse, all user space
programs which might be in the cgroups need to be restarted, too, as the
change is permanent per socket lifetime. Instead of reconfiguring
networking, I will probably now restart the whole box.
Furthermore it gets even more complicated because cgroup can be
namespace'd nowadays and doesn't necessarily need to have no 1:1
relation with the network namespace, so this gets really complicated.

Misconfiguration causes security problems.

In the past we had dependency trees and notifiers to e.g. clean up IPs,
multicast, disable netfilter rules, routes, etc., because the current
kernel tries to keep a specific semantic when you disable an interface
and bring it back up or do some other configuration change.

Like in the eBPF example with cgroups above, networking stack state
might become hard coded in XDP programs because of simplicity and
regularly needs to be refreshed to correspond linux networking changes
from user space, too. User space program crashes or gets killed by an
admin and doesn't remember to clean-up, the kernel is partly tainted
with some state you can search in different subsystems, cgroups, XDP,
qdiscs and clean up yourself (if we assume we can't do everything in XDP).

All those changes pretty much speak for a new user space control plane,
which wasn't that much necessary so far. Two places to keep your data up
to date creates intermediate states where it is not up to date and also
the update process itself might be complex (e.g. simple things like
unicast/multicast address filter changes on the NIC vs. what the XDP
program thinks). Ergo, more complexity. What do you do when one of those
two systems fail? What is the reference data? What do you do if on a
highly busy box during DoS constant reloading of your vmalloc happens (I
don't know if it is a problem under DoS)?

Lot's of effort went into transparent hw offloading to still retain this
property of transparency and behave as a proper citizen.

If XDP should not be considered an outsider of the networking stack, it
should as well understand reconfiguration events which leads to leakage
of kernel internals into XDP (e.g. Jesper's example of writing a bridge
in XDP, or offloading examples where we need some network config pieces).

I find it strange that hw offloading people try as much as possible to
discover the state of the linux networking stack inside the kernel and
apply this state transparently to hw offloads. Will there be shulti
switch available to allow user space to sniff switchdev events to
compile them to XDP or update hashtables?

"Software offloading" basically brings the control and dataplane into
user space (not technically but from the users PoV), making it
abnormally difficult to fit in with the traditional network admin tools.

3) Security complexity:

I was e.g. wondering if there is an architectural TOCTTOU in
"env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);" in the verifier,
because bpf objects can be easily passed around and be attached to file
descriptors etc. of processes that might be sandboxed.

Leaks of pointers can e.g. lead to discover the state of address space
layout randomization. Can we now pass verifier-ng verified eBPF programs
around to untrusted programs? To sandboxes? Need they be tainted? Do we
need to consider that for bpffs? What about user namespaces? If it turns
out to be a problem now, would a change like this be backwards compatible?

Helpers for eBPF need to be checked regularly if they still provide
their safety guarantees, especially if they call into other subsystems,
also when new features get added.

4) Complexity for users of this framework (compiler problems solved):

I tried to argue that someone wanting to build netmap/DPDK-alike things
in XDP, one faces the problem of synchronized IPC. Hashmaps solve this
to some degree but cannot be synchronized. Recent offloading series
showed e.g. how hard it is to keep state up-to-date in multiple places.
If XDP-user space sharing happens this might be solved.

Adding external functions to eBPF might also freeze some kernel
internals or the eBPF wrappers might become complex down some years.

DPDK even can configure various hw offloads already before the kernel
can do so. If users want to use those, they switch to DPDK also, as I
have seen the industry always wanting the best performance. DPDK can use
SIMD instructions, all AVX, SSE and MMX stuff, and they do it.

Users still depend on specific versions of the kernel due to which
functions are exported to XDP.

Debugging is harder but currently worked on. But will probably always be
harder than simply using a debugger.

This all leads to gigantic user space control planes like neutron and
others that just make everyone's life much harder. The model requires
this. And that is what I fear.

I am not at all that negative against a hook before allocating the
packet, but making everyone using it and marketing as an alternative to
DPDK doesn't seem to fit for me.

It seems to me if XDP should even remotely be comparable to DPDK a lot
of complexity has to be added. It is not Linux network stack for me
either so far.

Sorry if I hijacked the bounded loop discussion for the rant. I am happy
to discuss or follow-up on ideas regarding lifting the looping limit in
eBPF.

Bye,
Hannes

^ permalink raw reply

* [patch net] net: fec: fix compile with CONFIG_M5272
From: Nikita Yushchenko @ 2016-12-04 15:17 UTC (permalink / raw)
  To: David S. Miller, Fugang Duan, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg, netdev
  Cc: Chris Healy, Fabio Estevam, linux-kernel, Nikita Yushchenko

Commit 4dfb80d18d05 ("net: fec: cache statistics while device is down")
introduced unconditional statistics-related actions.

However, when driver is compiled with CONFIG_M5272, staticsics-related
definitions do not exist, which results into build errors.

Fix that by adding needed #if !defined(CONFIG_M5272).

Fixes: 4dfb80d18d05 ("net: fec: cache statistics while device is down")
Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
---
 drivers/net/ethernet/freescale/fec_main.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6a20c24a2003..89e902767abb 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2884,7 +2884,9 @@ fec_enet_close(struct net_device *ndev)
 	if (fep->quirks & FEC_QUIRK_ERR006687)
 		imx6q_cpuidle_fec_irqs_unused();
 
+#if !defined(CONFIG_M5272)
 	fec_enet_update_ethtool_stats(ndev);
+#endif
 
 	fec_enet_clk_enable(ndev, false);
 	pinctrl_pm_select_sleep_state(&fep->pdev->dev);
@@ -3192,7 +3194,9 @@ static int fec_enet_init(struct net_device *ndev)
 
 	fec_restart(ndev);
 
+#if !defined(CONFIG_M5272)
 	fec_enet_update_ethtool_stats(ndev);
+#endif
 
 	return 0;
 }
@@ -3292,9 +3296,11 @@ fec_probe(struct platform_device *pdev)
 	fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs);
 
 	/* Init network device */
-	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) +
-				  ARRAY_SIZE(fec_stats) * sizeof(u64),
-				  num_tx_qs, num_rx_qs);
+	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private)
+#if !defined(CONFIG_M5272)
+				  + ARRAY_SIZE(fec_stats) * sizeof(u64)
+#endif
+				  , num_tx_qs, num_rx_qs);
 	if (!ndev)
 		return -ENOMEM;
 
-- 
2.1.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox