Netdev List
 help / color / mirror / Atom feed
* Re: [net-next PATCH v4 1/6] net: virtio dynamically disable/enable LRO
From: John Fastabend @ 2016-12-04 18:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: daniel, shm, davem, tgraf, alexei.starovoitov, john.r.fastabend,
	netdev, bblanco, brouer
In-Reply-To: <20161203010423-mutt-send-email-mst@kernel.org>

On 16-12-03 09:36 PM, Michael S. Tsirkin wrote:
> On Fri, Dec 02, 2016 at 12:49:45PM -0800, John Fastabend wrote:
>> This adds support for dynamically setting the LRO feature flag. The
>> message to control guest features in the backend uses the
>> CTRL_GUEST_OFFLOADS msg type.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>  drivers/net/virtio_net.c |   45 ++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 44 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index a21d93a..d814e7cb 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -1419,6 +1419,41 @@ static void virtnet_init_settings(struct net_device *dev)
>>  	.set_settings = virtnet_set_settings,
>>  };
>>  
>> +static int virtnet_set_features(struct net_device *netdev,
>> +				netdev_features_t features)
>> +{
>> +	struct virtnet_info *vi = netdev_priv(netdev);
>> +	struct virtio_device *vdev = vi->vdev;
>> +	struct scatterlist sg;
>> +	u64 offloads = 0;
>> +
>> +	if (features & NETIF_F_LRO)
>> +		offloads |= (1 << VIRTIO_NET_F_GUEST_TSO4) |
>> +			    (1 << VIRTIO_NET_F_GUEST_TSO6);
>> +
>> +	if (features & NETIF_F_RXCSUM)
>> +		offloads |= (1 << VIRTIO_NET_F_GUEST_CSUM);
>> +
>> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
>> +		sg_init_one(&sg, &offloads, sizeof(uint64_t));
>> +		if (!virtnet_send_command(vi,
>> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS,
>> +					  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
>> +					  &sg)) {
>> +			dev_warn(&netdev->dev,
>> +				 "Failed to set guest offloads by virtnet command.\n");
>> +			return -EINVAL;
>> +		}
>> +	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) &&
>> +		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> 
> No need for VIRTIO_F_VERSION_1 here.
> 

Yep Dave pointed it out as well. Also I found a bug on driver unload
path where XDP tx queues are not cleaned up correctly so will need a v5
for that.

Thanks,
John

^ permalink raw reply

* [PATCH v2 net-next] bnx2x: ethtool -x full support
From: Eric Dumazet @ 2016-12-04 18:15 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ariel Elior
In-Reply-To: <1476810487.5650.68.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

Implement ethtool -x full support, so that rss key can be fetched
instead of assuming it matches /proc/sys/net/core/netdev_rss_key
content.

We might add "ethtool --rxfh" support later to set a different rss key.

Tested:

lpk51:~# ethtool --show-rxfh eth0 | grep -A1 'hash key'
RSS hash key:
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac
lpk51:~# cat /proc/sys/net/core/netdev_rss_key
8b:a9:3a:ff:3e:f8:44:bd:5a:44:b7:b5:6d:e8:2d:f0:f0:72:98:54:03:86:8f:39:a4:42:5a:b3:84:71:5c:4f:1c:18:d6:a3:04:68:85:ac:22:1f:50:76:d4:c8:a5:20:7b:61:3c:0c

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ariel Elior <ariel.elior@qlogic.com>
---
v2: support CONFIG_BNX2X_SRIOV=y

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c     |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c |   47 ++++++----
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c      |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h      |    5 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c    |    5 -
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index ed42c1009685..28af24ae0092 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -2106,7 +2106,7 @@ int bnx2x_rss(struct bnx2x *bp, struct bnx2x_rss_config_obj *rss_obj,
 
 	if (config_hash) {
 		/* RSS keys */
-		netdev_rss_key_fill(params.rss_key, T_ETH_RSS_KEY * 4);
+		netdev_rss_key_fill(&rss_obj->rss_key, T_ETH_RSS_KEY * 4);
 		__set_bit(BNX2X_RSS_SET_SRCH, &params.rss_flags);
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 85a7800bfc12..28bc9479fc74 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3421,6 +3421,13 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev)
 	return T_ETH_INDIRECTION_TABLE_SIZE;
 }
 
+static u32 bnx2x_get_rxfh_key_size(struct net_device *dev)
+{
+	struct bnx2x *bp = netdev_priv(dev);
+
+	return (bp->port.pmf || !CHIP_IS_E1x(bp)) ? T_ETH_RSS_KEY * 4 : 0;
+}
+
 static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 			  u8 *hfunc)
 {
@@ -3430,23 +3437,30 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
 
 	if (hfunc)
 		*hfunc = ETH_RSS_HASH_TOP;
-	if (!indir)
-		return 0;
 
-	/* Get the current configuration of the RSS indirection table */
-	bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
-
-	/*
-	 * We can't use a memcpy() as an internal storage of an
-	 * indirection table is a u8 array while indir->ring_index
-	 * points to an array of u32.
-	 *
-	 * Indirection table contains the FW Client IDs, so we need to
-	 * align the returned table to the Client ID of the leading RSS
-	 * queue.
-	 */
-	for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
-		indir[i] = ind_table[i] - bp->fp->cl_id;
+	if (key) {
+		if (bp->port.pmf || !CHIP_IS_E1x(bp))
+			memcpy(key, &bp->rss_conf_obj.rss_key, T_ETH_RSS_KEY * 4);
+		else
+			memset(key, 0, T_ETH_RSS_KEY * 4);
+	}
+
+	if (indir) {
+		/* Get the current configuration of the RSS indirection table */
+		bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
+
+		/*
+		 * We can't use a memcpy() as an internal storage of an
+		 * indirection table is a u8 array while indir->ring_index
+		 * points to an array of u32.
+		 *
+		 * Indirection table contains the FW Client IDs, so we need to
+		 * align the returned table to the Client ID of the leading RSS
+		 * queue.
+		 */
+		for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
+			indir[i] = ind_table[i] - bp->fp->cl_id;
+	}
 
 	return 0;
 }
@@ -3628,6 +3642,7 @@ static const struct ethtool_ops bnx2x_ethtool_ops = {
 	.get_ethtool_stats	= bnx2x_get_ethtool_stats,
 	.get_rxnfc		= bnx2x_get_rxnfc,
 	.set_rxnfc		= bnx2x_set_rxnfc,
+	.get_rxfh_key_size	= bnx2x_get_rxfh_key_size,
 	.get_rxfh_indir_size	= bnx2x_get_rxfh_indir_size,
 	.get_rxfh		= bnx2x_get_rxfh,
 	.set_rxfh		= bnx2x_set_rxfh,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index cea6bdcde33f..3b1becc23160 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -4538,7 +4538,7 @@ static int bnx2x_setup_rss(struct bnx2x *bp,
 	/* RSS keys */
 	if (test_bit(BNX2X_RSS_SET_SRCH, &p->rss_flags)) {
 		u8 *dst = (u8 *)(data->rss_key) + sizeof(data->rss_key);
-		const u8 *src = (const u8 *)p->rss_key;
+		const u8 *src = (const u8 *)o->rss_key;
 		int i;
 
 		/* Apparently, bnx2x reads this array in reverse order
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
index 0bf2fd470819..0092991796d3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
@@ -744,9 +744,6 @@ struct bnx2x_config_rss_params {
 	/* Indirection table */
 	u8		ind_table[T_ETH_INDIRECTION_TABLE_SIZE];
 
-	/* RSS hash values */
-	u32		rss_key[10];
-
 	/* valid only iff BNX2X_RSS_UPDATE_TOE is set */
 	u16		toe_rss_bitmap;
 };
@@ -760,6 +757,8 @@ struct bnx2x_rss_config_obj {
 	/* Last configured indirection table */
 	u8			ind_table[T_ETH_INDIRECTION_TABLE_SIZE];
 
+	u32     rss_key[T_ETH_RSS_KEY];
+
 	/* flags for enabling 4-tupple hash on UDP */
 	u8			udp_rss_v4;
 	u8			udp_rss_v6;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index bfae300cf25f..9a8f36f12b07 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -811,7 +811,8 @@ int bnx2x_vfpf_config_rss(struct bnx2x *bp,
 		      sizeof(struct channel_list_end_tlv));
 
 	memcpy(req->ind_table, params->ind_table, T_ETH_INDIRECTION_TABLE_SIZE);
-	memcpy(req->rss_key, params->rss_key, sizeof(params->rss_key));
+	memcpy(req->rss_key, params->rss_obj->rss_key,
+	       sizeof(params->rss_obj->rss_key));
 	req->ind_table_size = T_ETH_INDIRECTION_TABLE_SIZE;
 	req->rss_key_size = T_ETH_RSS_KEY;
 	req->rss_result_mask = params->rss_result_mask;
@@ -1985,7 +1986,7 @@ static void bnx2x_vf_mbx_update_rss(struct bnx2x *bp, struct bnx2x_virtf *vf,
 	/* set vfop params according to rss tlv */
 	memcpy(rss.ind_table, rss_tlv->ind_table,
 	       T_ETH_INDIRECTION_TABLE_SIZE);
-	memcpy(rss.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
+	memcpy(&vf->rss_conf_obj.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
 	rss.rss_obj = &vf->rss_conf_obj;
 	rss.rss_result_mask = rss_tlv->rss_result_mask;
 

^ permalink raw reply related

* Re: Trigger EHOSTUNREACH
From: Neal Cardwell @ 2016-12-04 17:51 UTC (permalink / raw)
  To: Marco Zunino; +Cc: Netdev
In-Reply-To: <CAP__gDkhM82EP6pP0NBHAYME0iPkEf49aeOp9C3dYQcCpdetrA@mail.gmail.com>

On Sun, Dec 4, 2016 at 7:04 AM, Marco Zunino <eng.marco.zunino@gmail.com> wrote:
> Hallo everyone, hope you are having a good day
> we are building a networking testing tool to simulate network error
> condition, and we are having difficulties triggering the EHOSTUNREACH
> socket error.
>
> We are trying to trigger this error by sending an ICMP packet type=3
> code=3 on an open STREAM socket, but it has no effect.
>
> Based on RFC1122 and the code here
>
> https://github.com/torvalds/linux/blob/e76d21c40bd6c67fd4e2c1540d77e113df962b4d/net/ipv4/tcp_ipv4.c#L353
>
> I would expect the this ICMP packet to abort the socket connection
> with a EHOSTUNREACH error on the client side, but this does not
> happen.

In my quick tests with packetdrill, it looks like Linux will not
immediately pass EHOSTUNREACH to the application unless the
application has requested this with setsockopt(SOL_IP, IP_RECVERR).

Specifically, the following packetdrill test passes for me:
---
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+.020 < . 1:1(0) ack 1 win 257
   +0 accept(3, ..., ...) = 4
   +0 setsockopt(4, SOL_IP, IP_RECVERR, [1], 4) = 0
   +0 write(4, ..., 1000) = 1000
   +0 > P. 1:1001(1000) ack 1

+.010 < icmp unreachable host_unreachable [1:1461(1460)]

   +0 write(4, ..., 1) = -1 EHOSTUNREACH (No route to host)
---

But without the setsockopt(SOL_IP, IP_RECVERR) there is no error upon
the second write().

My reading of RFC 1122 is that this is consistent with the RFC.

RFC 1122 section 3.2.2.1 says:

            A Destination Unreachable message that is received with code
            0 (Net), 1 (Host), or 5 (Bad Source Route) may result from a
            routing transient and MUST therefore be interpreted as only
            a hint, not proof, that the specified destination is
            unreachable [IP:11].

So it seems that the RFC is suggesting that by default an ICMP host
unreachable should not cause an immediate error for the connection.
Instead, it should be used as a hint as to the cause of the problem if
TCP's normal reliable delivery mechanisms ultimately timeout and fail.

neal

^ permalink raw reply

* [PATCH v3 net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 17:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835882.18162.462.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)

In this rewrite :

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory need when estimators are not used, since
  we store a pointer, instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
v3: Renamed some parameters to please make htmldocs
v2: Removed unwanted changes to tcp_output.c

 include/net/act_api.h              |    2 
 include/net/gen_stats.h            |   17 -
 include/net/netfilter/xt_rateest.h |   10 
 include/net/sch_generic.h          |    2 
 net/core/gen_estimator.c           |  299 +++++++++------------------
 net/core/gen_stats.c               |   20 -
 net/netfilter/xt_RATEEST.c         |    4 
 net/netfilter/xt_rateest.c         |   28 +-
 net/sched/act_api.c                |    9 
 net/sched/act_police.c             |   21 +
 net/sched/sch_api.c                |    2 
 net/sched/sch_cbq.c                |    6 
 net/sched/sch_drr.c                |    6 
 net/sched/sch_generic.c            |    2 
 net/sched/sch_hfsc.c               |    6 
 net/sched/sch_htb.c                |    6 
 net/sched/sch_qfq.c                |    8 
 17 files changed, 182 insertions(+), 266 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69ccbcb003cfa66bcc0de337f78f3dae..1d716449209e4753a297c61a287077a1eb96e6d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
 	struct gnet_stats_queue		tcfa_qstats;
-	struct gnet_stats_rate_est64	tcfa_rate_est;
+	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9c72075e7e6dde3655d631f64a1c4..8b7aa370e7a4af61fcb71ed751dba72ebead6143 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct net_rate_estimator;
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
-			     const struct gnet_stats_basic_packed *b,
-			     struct gnet_stats_rate_est64 *r);
+			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+			struct gnet_stats_rate_est64 *sample);
 #endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31eaa54deb8437ef4b883bec78def84..130e58361f998ecdf361910b196e69778224d955 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
 #ifndef _XT_RATEEST_H
 #define _XT_RATEEST_H
 
+#include <net/gen_stats.h>
+
 struct xt_rateest {
 	/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
-	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est64	rstats;
+
 
 	/* following fields not accessed in hot path */
+	unsigned int			refcnt;
 	struct hlist_node		list;
 	char				name[IFNAMSIZ];
-	unsigned int			refcnt;
 	struct gnet_estimator		params;
 	struct rcu_head			rcu;
+
+	/* keep this field far away to speedup xt_rateest_mt() */
+	struct net_rate_estimator __rcu *rate_est;
 };
 
 struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672a802dce583166f4f808154b77a96..498f81b229a4565e140f1a054ce931e80361e8b2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
 
 	struct netdev_queue	*dev_queue;
 
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0993844faeeaefb221ea619fd9d796600b82fbb9..101b5d0e2142e6a813f6b524a59945379f02bcb3 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Eric Dumazet <edumazet@google.com>
  *
  * Changes:
  *              Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,171 +31,79 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <net/gen_stats.h>
 
-/*
-   This code is NOT intended to be used for statistics collection,
-   its purpose is to provide a base for statistical multiplexing
-   for controlled load service.
-   If you need only statistics, run a user level daemon which
-   periodically reads byte counters.
-
-   Unfortunately, rate estimation is not a very easy task.
-   F.e. I did not find a simple way to estimate the current peak rate
-   and even failed to formulate the problem 8)8)
-
-   So I preferred not to built an estimator into the scheduler,
-   but run this task separately.
-   Ideally, it should be kernel thread(s), but for now it runs
-   from timers, which puts apparent top bounds on the number of rated
-   flows, has minimal overhead on small, but is enough
-   to handle controlled load service, sets of aggregates.
-
-   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
-   avrate = avrate*(1-W) + rate*W
-
-   where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
-   The resulting time constant is:
-
-   T = A/(-ln(1-W))
-
-
-   NOTES.
-
-   * avbps and avpps are scaled by 2^5.
-   * both values are reported as 32 bit unsigned values. bps can
-     overflow for fast links : max speed being 34360Mbit/sec
-   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
-     for HZ=100 and HZ=1024 8)), maximal interval
-     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
-     are too expensive, longer ones can be implemented
-     at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
  */
 
-#define EST_MAX_INTERVAL	5
-
-struct gen_estimator {
-	struct list_head	list;
+struct net_rate_estimator {
 	struct gnet_stats_basic_packed	*bstats;
-	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
 	seqcount_t		*running;
-	int			ewma_log;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	u8			ewma_log;
+	u8			intvl_log; /* period : (250ms << intvl_log) */
+
+	seqcount_t		seq;
 	u32			last_packets;
-	unsigned long		avpps;
 	u64			last_bytes;
+
+	u64			avpps;
 	u64			avbps;
-	struct rcu_head		e_rcu;
-	struct rb_node		node;
-	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
-	struct rcu_head		head;
-};
 
-struct gen_estimator_head {
-	unsigned long		next_jiffies;
-	struct timer_list	timer;
-	struct list_head	list;
+	unsigned long           next_jiffies;
+	struct timer_list       timer;
+	struct rcu_head		rcu;
 };
 
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+			       struct gnet_stats_basic_packed *b)
 {
-	int idx = (int)arg;
-	struct gen_estimator *e;
+	if (e->stats_lock)
+		spin_lock(e->stats_lock);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		struct gnet_stats_basic_packed b = {0};
-		unsigned long rate;
-		u64 brate;
-
-		if (e->stats_lock)
-			spin_lock(e->stats_lock);
-		read_lock(&est_lock);
-		if (e->bstats == NULL)
-			goto skip;
-
-		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
-		brate = (b.bytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = b.bytes;
-		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
-		rate = b.packets - e->last_packets;
-		rate <<= (7 - idx);
-		e->last_packets = b.packets;
-		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
-		read_unlock(&est_lock);
-		if (e->stats_lock)
-			spin_unlock(e->stats_lock);
-	}
+	__gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
 
-	if (!list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies += ((HZ/4) << idx);
+	if (e->stats_lock)
+		spin_unlock(e->stats_lock);
 
-		if (unlikely(time_after_eq(jiffies, elist[idx].next_jiffies))) {
-			/* Ouch... timer was delayed. */
-			elist[idx].next_jiffies = jiffies + 1;
-		}
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	rcu_read_unlock();
 }
 
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
 {
-	struct rb_node **p = &est_root.rb_node, *parent = NULL;
+	struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+	struct gnet_stats_basic_packed b;
+	u64 rate, brate;
 
-	while (*p) {
-		struct gen_estimator *e;
+	est_fetch_counters(est, &b);
+	brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+	brate -= (est->avbps >> est->ewma_log);
 
-		parent = *p;
-		e = rb_entry(parent, struct gen_estimator, node);
+	rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+	rate -= (est->avpps >> est->ewma_log);
 
-		if (est->bstats > e->bstats)
-			p = &parent->rb_right;
-		else
-			p = &parent->rb_left;
-	}
-	rb_link_node(&est->node, parent, p);
-	rb_insert_color(&est->node, &est_root);
-}
-
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
-				    const struct gnet_stats_rate_est64 *rate_est)
-{
-	struct rb_node *p = est_root.rb_node;
+	write_seqcount_begin(&est->seq);
+	est->avbps += brate;
+	est->avpps += rate;
+	write_seqcount_end(&est->seq);
 
-	while (p) {
-		struct gen_estimator *e;
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
 
-		e = rb_entry(p, struct gen_estimator, node);
+	est->next_jiffies += ((HZ/4) << est->intvl_log);
 
-		if (bstats > e->bstats)
-			p = p->rb_right;
-		else if (bstats < e->bstats || rate_est != e->rate_est)
-			p = p->rb_left;
-		else
-			return e;
+	if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+		/* Ouch... timer was delayed. */
+		est->next_jiffies = jiffies + 1;
 	}
-	return NULL;
+	mod_timer(&est->timer, est->next_jiffies);
 }
 
 /**
@@ -217,84 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running,
 		      struct nlattr *opt)
 {
-	struct gen_estimator *est;
 	struct gnet_estimator *parm = nla_data(opt);
-	struct gnet_stats_basic_packed b = {0};
-	int idx;
+	struct net_rate_estimator *old, *est;
+	struct gnet_stats_basic_packed b;
+	int intvl_log;
 
 	if (nla_len(opt) < sizeof(*parm))
 		return -EINVAL;
 
+	/* allowed timer periods are :
+	 * -2 : 250ms,   -1 : 500ms,    0 : 1 sec
+	 *  1 : 2 sec,    2 : 4 sec,    3 : 8 sec
+	 */
 	if (parm->interval < -2 || parm->interval > 3)
 		return -EINVAL;
 
 	est = kzalloc(sizeof(*est), GFP_KERNEL);
-	if (est == NULL)
+	if (!est)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
-	idx = parm->interval + 2;
+	seqcount_init(&est->seq);
+	intvl_log = parm->interval + 2;
 	est->bstats = bstats;
-	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
 	est->running  = running;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = b.bytes;
-	est->avbps = rate_est->bps<<5;
-	est->last_packets = b.packets;
-	est->avpps = rate_est->pps<<10;
+	est->intvl_log = intvl_log;
 	est->cpu_bstats = cpu_bstats;
 
-	spin_lock_bh(&est_tree_lock);
-	if (!elist[idx].timer.function) {
-		INIT_LIST_HEAD(&elist[idx].list);
-		setup_timer(&elist[idx].timer, est_timer, idx);
+	est_fetch_counters(est, &b);
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
+	old = rcu_dereference_protected(*rate_est, 1);
+	if (old) {
+		del_timer_sync(&old->timer);
+		est->avbps = old->avbps;
+		est->avpps = old->avpps;
 	}
 
-	if (list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies = jiffies + ((HZ/4) << idx);
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	list_add_rcu(&est->list, &elist[idx].list);
-	gen_add_node(est);
-	spin_unlock_bh(&est_tree_lock);
+	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+	setup_timer(&est->timer, est_timer, (unsigned long)est);
+	mod_timer(&est->timer, est->next_jiffies);
 
+	rcu_assign_pointer(*rate_est, est);
+	if (old)
+		kfree_rcu(old, rcu);
 	return 0;
 }
 EXPORT_SYMBOL(gen_new_estimator);
 
 /**
  * gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
  *
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
  *
- * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
 {
-	struct gen_estimator *e;
-
-	spin_lock_bh(&est_tree_lock);
-	while ((e = gen_find_node(bstats, rate_est))) {
-		rb_erase(&e->node, &est_root);
+	struct net_rate_estimator *est;
 
-		write_lock(&est_lock);
-		e->bstats = NULL;
-		write_unlock(&est_lock);
-
-		list_del_rcu(&e->list);
-		kfree_rcu(e, e_rcu);
+	est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+	if (est) {
+		del_timer_sync(&est->timer);
+		kfree_rcu(est, rcu);
 	}
-	spin_unlock_bh(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 
@@ -314,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **rate_est,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt)
 {
-	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est,
+				 stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
 /**
  * gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
  *
  * Returns true if estimator is active, and false if not.
  */
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
 {
-	bool res;
+	return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
 
-	ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+			struct gnet_stats_rate_est64 *sample)
+{
+	struct net_rate_estimator *est;
+	unsigned seq;
+
+	rcu_read_lock();
+	est = rcu_dereference(*rate_est);
+	if (!est) {
+		rcu_read_unlock();
+		return false;
+	}
 
-	spin_lock_bh(&est_tree_lock);
-	res = gen_find_node(bstats, rate_est) != NULL;
-	spin_unlock_bh(&est_tree_lock);
+	do {
+		seq = read_seqcount_begin(&est->seq);
+		sample->bps = est->avbps >> 8;
+		sample->pps = est->avpps >> 8;
+	} while (read_seqcount_retry(&est->seq, seq));
 
-	return res;
+	rcu_read_unlock();
+	return true;
 }
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb62627e61b5065b2325edd1b84f2e..87f28557b3298f30b1cf79dccf3bc6b761cd6623 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
  *
  * Appends the rate estimator statistics to the top level TLV created by
  * gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
  */
 int
 gnet_stats_copy_rate_est(struct gnet_dump *d,
-			 const struct gnet_stats_basic_packed *b,
-			 struct gnet_stats_rate_est64 *r)
+			 struct net_rate_estimator __rcu **rate_est)
 {
+	struct gnet_stats_rate_est64 sample;
 	struct gnet_stats_rate_est est;
 	int res;
 
-	if (b && !gen_estimator_active(b, r))
+	if (!gen_estimator_read(rate_est, &sample))
 		return 0;
-
-	est.bps = min_t(u64, UINT_MAX, r->bps);
+	est.bps = min_t(u64, UINT_MAX, sample.bps);
 	/* we have some time before reaching 2^32 packets per second */
-	est.pps = r->pps;
+	est.pps = sample.pps;
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
 	if (d->tail) {
 		res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
 				      TCA_STATS_PAD);
-		if (res < 0 || est.bps == r->bps)
+		if (res < 0 || est.bps == sample.bps)
 			return res;
 		/* emit 64bit stats only if needed */
-		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
-				       TCA_STATS_PAD);
+		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+				       sizeof(sample), TCA_STATS_PAD);
 	}
 
 	return 0;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dbd6c4a12b97435d2937c92fd79a035fd5828965..91a373a3f534de8d8641341c33c08d8fc49cbd29 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
-		gen_kill_estimator(&est->bstats, &est->rstats);
+		gen_kill_estimator(&est->rate_est);
 		/*
 		 * gen_estimator est_timer() might access est->lock or bstats,
 		 * wait a RCU grace period before freeing 'est'
@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 	cfg.est.interval	= info->interval;
 	cfg.est.ewma_log	= info->ewma_log;
 
-	ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
+	ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
 				&est->lock, NULL, &cfg.opt);
 	if (ret < 0)
 		goto err2;
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 7720b036d76a84c949080c8c32827b604c80da64..1db02f6fca54d7eb5d60c2d8c5cb8ab11656fbcb 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -18,35 +18,33 @@ static bool
 xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_rateest_match_info *info = par->matchinfo;
-	struct gnet_stats_rate_est64 *r;
+	struct gnet_stats_rate_est64 sample = {0};
 	u_int32_t bps1, bps2, pps1, pps2;
 	bool ret = true;
 
-	spin_lock_bh(&info->est1->lock);
-	r = &info->est1->rstats;
+	gen_estimator_read(&info->est1->rate_est, &sample);
+
 	if (info->flags & XT_RATEEST_MATCH_DELTA) {
-		bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
-		pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
+		bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
+		pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
 	} else {
-		bps1 = r->bps;
-		pps1 = r->pps;
+		bps1 = sample.bps;
+		pps1 = sample.pps;
 	}
-	spin_unlock_bh(&info->est1->lock);
 
 	if (info->flags & XT_RATEEST_MATCH_ABS) {
 		bps2 = info->bps2;
 		pps2 = info->pps2;
 	} else {
-		spin_lock_bh(&info->est2->lock);
-		r = &info->est2->rstats;
+		gen_estimator_read(&info->est2->rate_est, &sample);
+
 		if (info->flags & XT_RATEEST_MATCH_DELTA) {
-			bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
-			pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
+			bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
+			pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
 		} else {
-			bps2 = r->bps;
-			pps2 = r->pps;
+			bps2 = sample.bps;
+			pps2 = sample.pps;
 		}
-		spin_unlock_bh(&info->est2->lock);
 	}
 
 	switch (info->mode) {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f893d180da1caa3b6dd1cc8773920beb1885f9b0..2095c83ce7730d49550103a8efad943d28815617 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
 	spin_lock_bh(&hinfo->lock);
 	hlist_del(&p->tcfa_head);
 	spin_unlock_bh(&hinfo->lock);
-	gen_kill_estimator(&p->tcfa_bstats,
-			   &p->tcfa_rate_est);
+	gen_kill_estimator(&p->tcfa_rate_est);
 	/*
 	 * gen_estimator est_timer() might access p->tcfa_lock
 	 * or bstats, wait a RCU grace period before freeing p
@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
 {
 	if (est)
-		gen_kill_estimator(&a->tcfa_bstats,
-				   &a->tcfa_rate_est);
+		gen_kill_estimator(&a->tcfa_rate_est);
 	call_rcu(&a->tcfa_rcu, free_tcf);
 }
 EXPORT_SYMBOL(tcf_hash_cleanup);
@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 		goto errout;
 
 	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
-				     &p->tcfa_rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
 				  &p->tcfa_qstats,
 				  p->tcfa_qstats.qlen) < 0)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index c990b73a6c85151862c30971bd3787d20dbcecd1..0ba91d1ce99480c4817684dbe0b4fde61811e03e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 			goto failure_unlock;
 	} else if (tb[TCA_POLICE_AVRATE] &&
 		   (ret == ACT_P_CREATED ||
-		    !gen_estimator_active(&police->tcf_bstats,
-					  &police->tcf_rate_est))) {
+		    !gen_estimator_active(&police->tcf_rate_est))) {
 		err = -EINVAL;
 		goto failure_unlock;
 	}
@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
 	bstats_update(&police->tcf_bstats, skb);
 	tcf_lastuse_update(&police->tcf_tm);
 
-	if (police->tcfp_ewma_rate &&
-	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
-		police->tcf_qstats.overlimits++;
-		if (police->tcf_action == TC_ACT_SHOT)
-			police->tcf_qstats.drops++;
-		spin_unlock(&police->tcf_lock);
-		return police->tcf_action;
+	if (police->tcfp_ewma_rate) {
+		struct gnet_stats_rate_est64 sample;
+
+		if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
+		    sample.bps >= police->tcfp_ewma_rate) {
+			police->tcf_qstats.overlimits++;
+			if (police->tcf_action == TC_ACT_SHOT)
+				police->tcf_qstats.drops++;
+			spin_unlock(&police->tcf_lock);
+			return police->tcf_action;
+		}
 	}
 
 	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f337f1bdd1d4a4cac738f9fe1fbd42e5984174fe..d7b93429f0cca61ff489536b4247f31f3690fdd8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
 				  &d, cpu_bstats, &q->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
 		goto nla_put_failure;
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index beb554aa8cfbb4acb5d89511b5e0030b4c9cf26e..9ffe1c220b02848032474fa7b9b2c2f09d40b110 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -122,7 +122,7 @@ struct cbq_class {
 	psched_time_t		penalized;
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct tc_cbq_xstats	xstats;
 
 	struct tcf_proto __rcu	*filter_list;
@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
 		return -1;
 
@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->link)
 		kfree(cl);
 }
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8af5c59eef848db47cc33a878296693dabb44955..bb4cbdf7500482b170eef6e7923cf2f2259e52b5 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -25,7 +25,7 @@ struct drr_class {
 
 	struct gnet_stats_basic_packed		bstats;
 	struct gnet_stats_queue		qstats;
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct list_head		alist;
 	struct Qdisc			*qdisc;
 
@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
 {
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c28187cc888cebf004e61270f00871..6eb9c8e88519a36fc3ef82be7c7f1dbe0ef1e13c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
 
 	qdisc_put_stab(rtnl_dereference(qdisc->stab));
 #endif
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	gen_kill_estimator(&qdisc->rate_est);
 	if (ops->reset)
 		ops->reset(qdisc);
 	if (ops->destroy)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 000f1d36128e1abfc0657bce779a7df852f66384..3ffaa6fb0990f0aa31487a2f1829b1f1accf8b21 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -114,7 +114,7 @@ struct hfsc_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct tcf_proto __rcu *filter_list; /* filter list */
 	unsigned int	filter_cnt;	/* filter count */
 	unsigned int	level;		/* class level in hierarchy */
@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 
 	tcf_destroy_chain(&cl->filter_list);
 	qdisc_destroy(cl->qdisc);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	if (cl != &q->root)
 		kfree(cl);
 }
@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 	xstats.rtwork  = cl->cl_cumul;
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
 		return -1;
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 9926fe4f3b6f7db9aeba22fbbfae3d47c4e2af60..760f39e7caeeb91b6c34d561f64f1ed97c884a32 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -111,7 +111,7 @@ struct htb_class {
 	unsigned int		children;
 	struct htb_class	*parent;	/* parent class */
 
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 
 	/*
 	 * Written often fields
@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
 		return -1;
 
@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 		WARN_ON(!cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	tcf_destroy_chain(&cl->filter_list);
 	kfree(cl);
 }
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index ca0516e6f74356f47dffcc2262f233ee50f0c47c..f9e712ce2d15ce9280c31d2f75d62b84034ae51d 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -137,7 +137,7 @@ struct qfq_class {
 
 	struct gnet_stats_basic_packed bstats;
 	struct gnet_stats_queue qstats;
-	struct gnet_stats_rate_est64 rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct Qdisc *qdisc;
 	struct list_head alist;		/* Link for active-classes list. */
 	struct qfq_aggregate *agg;	/* Parent aggregate. */
@@ -508,7 +508,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 		new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
 		if (new_agg == NULL) {
 			err = -ENOBUFS;
-			gen_kill_estimator(&cl->bstats, &cl->rate_est);
+			gen_kill_estimator(&cl->rate_est);
 			goto destroy_class;
 		}
 		sch_tree_lock(sch);
@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
 	struct qfq_sched *q = qdisc_priv(sch);
 
 	qfq_rm_from_agg(q, cl);
-	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	gen_kill_estimator(&cl->rate_est);
 	qdisc_destroy(cl->qdisc);
 	kfree(cl);
 }
@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
 
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
 				  d, NULL, &cl->bstats) < 0 ||
-	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
 	    gnet_stats_copy_queue(d, NULL,
 				  &cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
 		return -1;



^ permalink raw reply related

* Re: [PATCN v2 net-next] net_sched: gen_estimator: complete rewrite of rate estimators
From: Eric Dumazet @ 2016-12-04 17:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, netfilter-devel
In-Reply-To: <1480835882.18162.462.camel@edumazet-glaptop3.roam.corp.google.com>

On Sat, 2016-12-03 at 23:18 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> 1) Old code was hard to maintain, due to complex lock chains.
>    (We probably will be able to remove some kfree_rcu() in callers)
> 
> 2) Using a single timer to update all estimators does not scale.
> 
> 3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
>    is not supposed to work well)

A v3 is under way, fixing a last "make htmldocs" warning.

^ permalink raw reply

* Re: [flamebait] xdp Was: Re: bpf bounded loops. Was: [flamebait] xdp
From: Hannes Frederic Sowa @ 2016-12-04 16:05 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tom Herbert, Thomas Graf, Linux Kernel Network Developers,
	Daniel Borkmann, David S. Miller
In-Reply-To: <20161202233430.GA58522@ast-mbp>

Hello,

On 03.12.2016 00:34, Alexei Starovoitov wrote:
> On Fri, Dec 02, 2016 at 08:42:41PM +0100, Hannes Frederic Sowa wrote:
>> On Fri, Dec 2, 2016, at 20:25, Hannes Frederic Sowa wrote:
>>> On 02.12.2016 19:39, Alexei Starovoitov wrote:
>>>> On Thu, Dec 01, 2016 at 10:27:12PM +0100, Hannes Frederic Sowa wrote:
>>>>> like") and the problematic of parsing DNS packets in XDP due to string
>>>>> processing and looping inside eBPF.
>>>>
>>>> Hannes,
>>>> Not too long ago you proposed a very interesting idea to add
>>>> support for bounded loops without adding any new bpf instructions and
>>>> changing llvm (which was way better than my 'rep' like instructions
>>>> I was experimenting with). I thought systemtap guys also wanted bounded
>>>> loops and you were cooperating on the design, so I gave up on my work and
>>>> was expecting an imminent patch from you. I guess it sounds like you know
>>>> believe that bounded loops are impossible or I misunderstand your statement ?
>>>
>>> Your argument was that it would need a new verifier as the current first
>>> pass checks that we indeed can lay out the basic blocks as a DAG which
>>> the second pass depends on. This would be violated.
> 
> yes. today the main part of verifier depends on cfg check that confirms DAG
> property of the program. This was done as a simplification for the algorithm,
> so any programmer that understands C can understand the verifier code.
> It certainly was the case, since most of the people who hacked
> verifier had zero compiler background.
> Now I'm thinking to introduce proper compiler technologies to it.
> On one side it will make the bar to understand higher and on the other
> side it will cleanup the logic and reuse tens of years of data flow
> analysis theory and will make verifier more robust and mathematically
> solid.

See below.

>>> Because eBPF is available by non privileged users this would need a lot
>>> of effort to rewrite and verify (or indeed keep two verifiers in the
>>> kernel for priv and non-priv). The verifier itself is exposed to
>>> unprivileged users.
> 
> I certainly hear your concerns that people unfamiliar with it are simply
> scared that more and more verification logic being added. So I don't mind
> freezing current verifier for unpriv and let proper data flow analysis
> to be done in root only component.
> 
>>> Also, by design, if we keep the current limits, this would not give you
>>> more instructions to operate on compared to the flattened version of the
>>> program, it would merely reduce the numbers of optimizations in LLVM
>>> that let the verifier reject the program.
> 
> I think we most likely will keep 4k insn limit (since there were no
> requests to increase it). The bounded loops will improve performance
> and reduce I-cache misses.

I agree that bounded loops will increase performance and in general I
see lifting this limitation as something good if it works out.

>> The only solution to protect the verifier, which I saw, would be to
>> limit it by time and space, thus making loading of eBPF programs
>> depending on how fast and hot (thermal throttling) one CPU thread is.
> 
> the verifier already has time and space limits.
> See no reason to rely on physical cpu sensors.

Time and space is bounded by the DAG property. It is still bounded in
the directed cyclic case (some arbitrary upper limit), but can have a
combinatorical explosion because of the switch from proving properties
for each node+state to prove properties for each path+state.

Compiler algorithms maybe a help here, but historically have focused on
other properties, mainly optimization and thus are mostly heuristics.

Compiler developers don't write the algorithm under the assumption they
will execute in a security and resource sensitive environment (only the
generated code should be safe). I believe that optimization algorithms
increase the attack surface, as their big-O worst case might be an
additional cost, additionally to the worst case verification path. I
don't think compiler engineers think about the code to optimize
attacking the optimization algorithm itself.

Verification of a malicious BPF program (complexity bomb) in the kernel
should not disrupt nor create a security threat (we are also mostly
voluntary preemptible in this code and hold a lock). Verification might
fail when memory fragmentation becomes more probably, as the huge state
table for path sensitive verification cannot be allocated.

In user space, instead of verification of many properties regarding
program state (which you actually need in the BPF verifier), the
development effort concentrates on sanitizers. Otherwise look how good
gcc is in finding uninitialized variables.

Most mathematical proofs that are written for compiler optimization I
know of are written to show equivalence between program text and the
optimization. Compile time recently became a more important aspect of
the open source compilers at least.

I am happy to be shown wrong here and assume there is no better
algorithm, which is reasonable easy for the kernel - I am a pessimist
but am happy to look at proposals.

>> Those are the complexity problems I am talking and concerned about.
> 
> Do you have concerns when people implement encryption algorithm
> that you're unfamiliar with?

Absolutely not, this can already be done in user space very much the
same, I can remove it, delete it, uninstall it. APIs in the kernel stay
forever.

> Isn't it much bigger concern, since any bugs in the algorithm
> are directly exploitable and when encryption is actually used
> it's protecting sensitive data, whereas here the verifier
> protects kernel from crashing.

Bugs happen everywhere, but a bug in the kernel itself creates a whole
bigger mess than a bug in a security constrained user space application
with proper privilege drops (at least it should).

The complexity arguments in random order:

1) The code itself - if the route is taken to provide two verifiers for
eBPF, this certainly comes with a maintenance cost to keep them secure
(secure as in the verifier itself should not expose a threat neither is
the to be verified program allowed to leak data or exceed its address
space nor some time budget).

If one of those eBPF verifiers only accepts a certain number of INSN, as
fundamental as backwards jumps, we might end up with two compiler?

At some point Google might start to fuzz them like right now other
fundamental parts of the kernel and someone must review all the code and
fix them in both of them.

If we shift verification to new heuristics programs will still fail,
just under new conditions, this exposes complexity to the users.
Libraries cannot even be exchanged between those twos.

2) Integration and API considerations:

E.g. the newly added cgroup socket bpf interface, which can modify
bound_dev_if. If you restart networking (old way
/etc/init.d/networking), those if_indexes are all outdated and the
system will behave strangely. Network management software needs to walk
cgroup (a different kernel subsystem) now, too, in order to refresh
outdated information, recompile and insert. Even worse, all user space
programs which might be in the cgroups need to be restarted, too, as the
change is permanent per socket lifetime. Instead of reconfiguring
networking, I will probably now restart the whole box.
Furthermore it gets even more complicated because cgroup can be
namespace'd nowadays and doesn't necessarily need to have no 1:1
relation with the network namespace, so this gets really complicated.

Misconfiguration causes security problems.

In the past we had dependency trees and notifiers to e.g. clean up IPs,
multicast, disable netfilter rules, routes, etc., because the current
kernel tries to keep a specific semantic when you disable an interface
and bring it back up or do some other configuration change.

Like in the eBPF example with cgroups above, networking stack state
might become hard coded in XDP programs because of simplicity and
regularly needs to be refreshed to correspond linux networking changes
from user space, too. User space program crashes or gets killed by an
admin and doesn't remember to clean-up, the kernel is partly tainted
with some state you can search in different subsystems, cgroups, XDP,
qdiscs and clean up yourself (if we assume we can't do everything in XDP).

All those changes pretty much speak for a new user space control plane,
which wasn't that much necessary so far. Two places to keep your data up
to date creates intermediate states where it is not up to date and also
the update process itself might be complex (e.g. simple things like
unicast/multicast address filter changes on the NIC vs. what the XDP
program thinks). Ergo, more complexity. What do you do when one of those
two systems fail? What is the reference data? What do you do if on a
highly busy box during DoS constant reloading of your vmalloc happens (I
don't know if it is a problem under DoS)?

Lot's of effort went into transparent hw offloading to still retain this
property of transparency and behave as a proper citizen.

If XDP should not be considered an outsider of the networking stack, it
should as well understand reconfiguration events which leads to leakage
of kernel internals into XDP (e.g. Jesper's example of writing a bridge
in XDP, or offloading examples where we need some network config pieces).

I find it strange that hw offloading people try as much as possible to
discover the state of the linux networking stack inside the kernel and
apply this state transparently to hw offloads. Will there be shulti
switch available to allow user space to sniff switchdev events to
compile them to XDP or update hashtables?

"Software offloading" basically brings the control and dataplane into
user space (not technically but from the users PoV), making it
abnormally difficult to fit in with the traditional network admin tools.

3) Security complexity:

I was e.g. wondering if there is an architectural TOCTTOU in
"env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);" in the verifier,
because bpf objects can be easily passed around and be attached to file
descriptors etc. of processes that might be sandboxed.

Leaks of pointers can e.g. lead to discover the state of address space
layout randomization. Can we now pass verifier-ng verified eBPF programs
around to untrusted programs? To sandboxes? Need they be tainted? Do we
need to consider that for bpffs? What about user namespaces? If it turns
out to be a problem now, would a change like this be backwards compatible?

Helpers for eBPF need to be checked regularly if they still provide
their safety guarantees, especially if they call into other subsystems,
also when new features get added.

4) Complexity for users of this framework (compiler problems solved):

I tried to argue that someone wanting to build netmap/DPDK-alike things
in XDP, one faces the problem of synchronized IPC. Hashmaps solve this
to some degree but cannot be synchronized. Recent offloading series
showed e.g. how hard it is to keep state up-to-date in multiple places.
If XDP-user space sharing happens this might be solved.

Adding external functions to eBPF might also freeze some kernel
internals or the eBPF wrappers might become complex down some years.

DPDK even can configure various hw offloads already before the kernel
can do so. If users want to use those, they switch to DPDK also, as I
have seen the industry always wanting the best performance. DPDK can use
SIMD instructions, all AVX, SSE and MMX stuff, and they do it.

Users still depend on specific versions of the kernel due to which
functions are exported to XDP.

Debugging is harder but currently worked on. But will probably always be
harder than simply using a debugger.

This all leads to gigantic user space control planes like neutron and
others that just make everyone's life much harder. The model requires
this. And that is what I fear.

I am not at all that negative against a hook before allocating the
packet, but making everyone using it and marketing as an alternative to
DPDK doesn't seem to fit for me.

It seems to me if XDP should even remotely be comparable to DPDK a lot
of complexity has to be added. It is not Linux network stack for me
either so far.

Sorry if I hijacked the bounded loop discussion for the rant. I am happy
to discuss or follow-up on ideas regarding lifting the looping limit in
eBPF.

Bye,
Hannes

^ permalink raw reply

* [patch net] net: fec: fix compile with CONFIG_M5272
From: Nikita Yushchenko @ 2016-12-04 15:17 UTC (permalink / raw)
  To: David S. Miller, Fugang Duan, Troy Kisky, Andrew Lunn,
	Eric Nelson, Philippe Reynes, Johannes Berg, netdev
  Cc: Chris Healy, Fabio Estevam, linux-kernel, Nikita Yushchenko

Commit 4dfb80d18d05 ("net: fec: cache statistics while device is down")
introduced unconditional statistics-related actions.

However, when driver is compiled with CONFIG_M5272, staticsics-related
definitions do not exist, which results into build errors.

Fix that by adding needed #if !defined(CONFIG_M5272).

Fixes: 4dfb80d18d05 ("net: fec: cache statistics while device is down")
Signed-off-by: Nikita Yushchenko <nikita.yoush@cogentembedded.com>
---
 drivers/net/ethernet/freescale/fec_main.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 6a20c24a2003..89e902767abb 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2884,7 +2884,9 @@ fec_enet_close(struct net_device *ndev)
 	if (fep->quirks & FEC_QUIRK_ERR006687)
 		imx6q_cpuidle_fec_irqs_unused();
 
+#if !defined(CONFIG_M5272)
 	fec_enet_update_ethtool_stats(ndev);
+#endif
 
 	fec_enet_clk_enable(ndev, false);
 	pinctrl_pm_select_sleep_state(&fep->pdev->dev);
@@ -3192,7 +3194,9 @@ static int fec_enet_init(struct net_device *ndev)
 
 	fec_restart(ndev);
 
+#if !defined(CONFIG_M5272)
 	fec_enet_update_ethtool_stats(ndev);
+#endif
 
 	return 0;
 }
@@ -3292,9 +3296,11 @@ fec_probe(struct platform_device *pdev)
 	fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs);
 
 	/* Init network device */
-	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) +
-				  ARRAY_SIZE(fec_stats) * sizeof(u64),
-				  num_tx_qs, num_rx_qs);
+	ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private)
+#if !defined(CONFIG_M5272)
+				  + ARRAY_SIZE(fec_stats) * sizeof(u64)
+#endif
+				  , num_tx_qs, num_rx_qs);
 	if (!ndev)
 		return -ENOMEM;
 
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH v2 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Daniel Borkmann @ 2016-12-04 15:11 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev
  Cc: Alexei Starovoitov, Brenden Blanco, David Miller,
	Jesper Dangaard Brouer, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <1480821446-4122277-2-git-send-email-kafai@fb.com>

On 12/04/2016 04:17 AM, Martin KaFai Lau wrote:
> This patch allows XDP prog to extend/remove the packet
> data at the head (like adding or removing header).  It is
> done by adding a new XDP helper bpf_xdp_adjust_head().
>
> It also renames bpf_helper_changes_skb_data() to
> bpf_helper_changes_pkt_data() to better reflect
> that XDP prog does not work on skb.
>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>

Yeah, looks good like that:

Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Should there one day be different requirements wrt min length,
we could always pass dev pointer via struct xdp_buff and then
use dev->hard_header_len, for example, but not needed right now.

^ permalink raw reply

* [PATCH net-next] net/sched: cls_flower: Set the filter Hardware device for all use-cases
From: Hadar Hen Zion @ 2016-12-04 13:25 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Simon Horman, Jiri Pirko, Or Gerlitz, Hadar Hen Zion

Check if the returned device from tcf_exts_get_dev function supports tc
offload and in case the rule can't be offloaded, set the filter hw_dev
parameter to the original device given by the user.

The filter hw_device parameter should always be set by fl_hw_replace_filter
function, since this pointer is used by dump stats and destroy
filter for each flower rule (offloaded or not).

Fixes: 7091d8c7055d ('net/sched: cls_flower: Add offload support using egress Hardware device')
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reported-by: Simon Horman <horms@verge.net.au>
---
 net/sched/cls_flower.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index c5cea78..29a9e6d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -236,8 +236,11 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	int err;
 
 	if (!tc_can_offload(dev, tp)) {
-		if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev))
+		if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) ||
+		    (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) {
+			f->hw_dev = dev;
 			return tc_skip_sw(f->flags) ? -EINVAL : 0;
+		}
 		dev = f->hw_dev;
 		tc->egress_dev = true;
 	} else {
-- 
1.8.3.1

^ permalink raw reply related

* 967799855664878
From: ??? @ 2016-12-04 14:03 UTC (permalink / raw)
  To: netdemon; +Cc: netdev

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="l8", Size: 37 bytes --]

ÎÄ»ÝÜŠnetdemon

967799855664878
23

[-- Attachment #2: ???????????.xls --]
[-- Type: application/x-msexcel, Size: 37376 bytes --]

^ permalink raw reply

* [PATCH net 2/2] bnx2x: Prevent tunnel config for 577xx
From: Yuval Mintz @ 2016-12-04 13:30 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480858218-13187-1-git-send-email-Yuval.Mintz@cavium.com>

Only the 578xx adapters are capable of configuring UDP ports for
the purpose of tunnelling - doing the same on 577xx might lead to
a firmware assertion.
We're already not claiming support for any related feature for such
devices, but we also need to prevent the configuration of the UDP
ports to the device in this case.

Fixes: f34fa14cc033 ("bnx2x: Add vxlan RSS support")
Reported-by: Anikina Anna <anikina@gmail.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 0cee4c0..42f4611 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -10138,7 +10138,7 @@ static void __bnx2x_add_udp_port(struct bnx2x *bp, u16 port,
 {
 	struct bnx2x_udp_tunnel *udp_port = &bp->udp_tunnel_ports[type];
 
-	if (!netif_running(bp->dev) || !IS_PF(bp))
+	if (!netif_running(bp->dev) || !IS_PF(bp) || CHIP_IS_E1x(bp))
 		return;
 
 	if (udp_port->count && udp_port->dst_port == port) {
@@ -10163,7 +10163,7 @@ static void __bnx2x_del_udp_port(struct bnx2x *bp, u16 port,
 {
 	struct bnx2x_udp_tunnel *udp_port = &bp->udp_tunnel_ports[type];
 
-	if (!IS_PF(bp))
+	if (!IS_PF(bp) || CHIP_IS_E1x(bp))
 		return;
 
 	if (!udp_port->count || udp_port->dst_port != port) {
-- 
1.9.3

^ permalink raw reply related

* [PATCH net 1/2] bnx2x: Correct ringparam estimate when DOWN
From: Yuval Mintz @ 2016-12-04 13:30 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480858218-13187-1-git-send-email-Yuval.Mintz@cavium.com>

Until interface is up [and assuming ringparams weren't explicitly
configured] when queried for the size of its rings bnx2x would
claim they're the maximal size by default.
That is incorrect as by default the maximal number of buffers would
be equally divided between the various rx rings.

This prevents the user from actually setting the number of elements
on each rx ring to be of maximal size prior to transitioning the
interface into up state.

To fix this, make a rough estimation about the number of buffers.
It wouldn't always be accurate, but it would be much better than
current estimation and would allow users to increase number of
buffers during early initialization of the interface.

Reported-by: Seymour, Shane <shane.seymour@hpe.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 85a7800..5f19427 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1872,8 +1872,16 @@ static void bnx2x_get_ringparam(struct net_device *dev,
 
 	ering->rx_max_pending = MAX_RX_AVAIL;
 
+	/* If size isn't already set, we give an estimation of the number
+	 * of buffers we'll have. We're neglecting some possible conditions
+	 * [we couldn't know for certain at this point if number of queues
+	 * might shrink] but the number would be correct for the likely
+	 * scenario.
+	 */
 	if (bp->rx_ring_size)
 		ering->rx_pending = bp->rx_ring_size;
+	else if (BNX2X_NUM_RX_QUEUES(bp))
+		ering->rx_pending = MAX_RX_AVAIL / BNX2X_NUM_RX_QUEUES(bp);
 	else
 		ering->rx_pending = MAX_RX_AVAIL;
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net 0/2] bnx2x: fixes series
From: Yuval Mintz @ 2016-12-04 13:30 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz

Hi Dave,

Two unrelated fixes for bnx2x - the first one is nice-to-have,
while the other fixes fatal behaviour in older adapters.

Please consider applying them to `net'.

Thanks,
Yuval

Yuval Mintz (2):
  bnx2x: Correct ringparam estimate when interface is down
  bnx2x: Prevent tunnel configuration for 577xx adapters.

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 8 ++++++++
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c    | 4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

-- 
1.9.3

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Saeed Mahameed @ 2016-12-04 13:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <1480613764.18162.324.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, Dec 1, 2016 at 7:36 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 08:08 -0800, Eric Dumazet wrote:
>> On Thu, 2016-12-01 at 07:55 -0800, Eric Dumazet wrote:
>>
>> > So removing the spinlock is doable, but needs to add a new parameter
>> > to mlx4_en_fold_software_stats() and call netdev_stats_to_stats64()
>> > before mlx4_en_fold_software_stats(dev)
>>
>> Untested patch would be :
>>
>>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |    2 -
>>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |   10 +----
>>  drivers/net/ethernet/mellanox/mlx4/en_port.c    |   24 +++++++++-----
>>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |    3 +
>>  4 files changed, 23 insertions(+), 16 deletions(-)
>
> The patch is wrong, since priv->port_up could change to false while we
> are running and using the about to be deleted tx/rx rings.
>

Right, hence the regression Jesper saw ;).

>
> So the only safe thing to do is to remove the _bh suffix.
>
> Not worth trying to avoid taking a spinlock in this code.
>

Ack.

^ permalink raw reply

* Re: Regression: [PATCH] mlx4: give precise rx/tx bytes/packets counters
From: Saeed Mahameed @ 2016-12-04 13:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, David Miller, netdev, Tariq Toukan
In-Reply-To: <1480612130.18162.321.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, Dec 1, 2016 at 7:08 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-12-01 at 18:33 +0200, Saeed Mahameed wrote:
>
>> Thanks for the detailed answer !!
>
> You're welcome.
>
>>
>> BTW you went 5 steps ahead of my original question :)), so far you
>> already have a patch without locking at all (really impressive).
>>
>> What i wanted to ask originally, was regarding the "_bh", i didn't
>> mean to completely remove the "spin_lock_bh",
>> I meant, what happens if we replace "spin_lock_bh"  with "spin_lock",
>> without disabling bh ?
>> I gues raw "sping_lock" handles points (2 to 4) from above, but it
>> won't handle long irqs.
>
> Thats a very good point, the _bh prefix can totally be removed, since
> stats_lock is only acquired from process context.
>
>

That was my initial point, Thanks for the help.
will provide a fix patch later once 4.9 is release.

^ permalink raw reply

* [PATCH V2 net 18/20] net/ena: remove affinity hint from the driver
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

To allow irqbalance to better distribute the napi handler,
remove the smp affinity hint from the driver.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 6c49529..ee80472 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1323,8 +1323,6 @@ static int ena_request_mgmnt_irq(struct ena_adapter *adapter)
 		  "set affinity hint of mgmnt irq.to 0x%lx (irq vector: %d)\n",
 		  irq->affinity_hint_mask.bits[0], irq->vector);
 
-	irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask);
-
 	return rc;
 }
 
@@ -1354,8 +1352,6 @@ static int ena_request_io_irq(struct ena_adapter *adapter)
 		netif_dbg(adapter, ifup, adapter->netdev,
 			  "set affinity hint of irq. index %d to 0x%lx (irq vector: %d)\n",
 			  i, irq->affinity_hint_mask.bits[0], irq->vector);
-
-		irq_set_affinity_hint(irq->vector, &irq->affinity_hint_mask);
 	}
 
 	return rc;
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 15/20] net/ena: change sizeof() argument to be the type pointer
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

Instead of using:
memset(ptr, 0x0, sizeof(struct ...))
use:
memset(ptr, 0x0, sizeor(*ptr))

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_com.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 9deb0dd..f2e167b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -329,7 +329,7 @@ static int ena_com_init_io_sq(struct ena_com_dev *ena_dev,
 	size_t size;
 	int dev_node = 0;
 
-	memset(&io_sq->desc_addr, 0x0, sizeof(struct ena_com_io_desc_addr));
+	memset(&io_sq->desc_addr, 0x0, sizeof(io_sq->desc_addr));
 
 	io_sq->desc_entry_size =
 		(io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX) ?
@@ -383,7 +383,7 @@ static int ena_com_init_io_cq(struct ena_com_dev *ena_dev,
 	size_t size;
 	int prev_node = 0;
 
-	memset(&io_cq->cdesc_addr, 0x0, sizeof(struct ena_com_io_desc_addr));
+	memset(&io_cq->cdesc_addr, 0x0, sizeof(io_cq->cdesc_addr));
 
 	/* Use the basic completion descriptor for Rx */
 	io_cq->cdesc_entry_size_in_bytes =
@@ -681,7 +681,7 @@ static int ena_com_destroy_io_sq(struct ena_com_dev *ena_dev,
 	u8 direction;
 	int ret;
 
-	memset(&destroy_cmd, 0x0, sizeof(struct ena_admin_aq_destroy_sq_cmd));
+	memset(&destroy_cmd, 0x0, sizeof(destroy_cmd));
 
 	if (io_sq->direction == ENA_COM_IO_QUEUE_DIRECTION_TX)
 		direction = ENA_ADMIN_SQ_DIRECTION_TX;
@@ -963,7 +963,7 @@ static int ena_com_create_io_sq(struct ena_com_dev *ena_dev,
 	u8 direction;
 	int ret;
 
-	memset(&create_cmd, 0x0, sizeof(struct ena_admin_aq_create_sq_cmd));
+	memset(&create_cmd, 0x0, sizeof(create_cmd));
 
 	create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_SQ;
 
@@ -1155,7 +1155,7 @@ int ena_com_create_io_cq(struct ena_com_dev *ena_dev,
 	struct ena_admin_acq_create_cq_resp_desc cmd_completion;
 	int ret;
 
-	memset(&create_cmd, 0x0, sizeof(struct ena_admin_aq_create_cq_cmd));
+	memset(&create_cmd, 0x0, sizeof(create_cmd));
 
 	create_cmd.aq_common_descriptor.opcode = ENA_ADMIN_CREATE_CQ;
 
@@ -1263,7 +1263,7 @@ int ena_com_destroy_io_cq(struct ena_com_dev *ena_dev,
 	struct ena_admin_acq_destroy_cq_resp_desc destroy_resp;
 	int ret;
 
-	memset(&destroy_cmd, 0x0, sizeof(struct ena_admin_aq_destroy_sq_cmd));
+	memset(&destroy_cmd, 0x0, sizeof(destroy_cmd));
 
 	destroy_cmd.cq_idx = io_cq->idx;
 	destroy_cmd.aq_common_descriptor.opcode = ENA_ADMIN_DESTROY_CQ;
@@ -1613,8 +1613,8 @@ int ena_com_create_io_queue(struct ena_com_dev *ena_dev,
 	io_sq = &ena_dev->io_sq_queues[ctx->qid];
 	io_cq = &ena_dev->io_cq_queues[ctx->qid];
 
-	memset(io_sq, 0x0, sizeof(struct ena_com_io_sq));
-	memset(io_cq, 0x0, sizeof(struct ena_com_io_cq));
+	memset(io_sq, 0x0, sizeof(*io_sq));
+	memset(io_cq, 0x0, sizeof(*io_cq));
 
 	/* Init CQ */
 	io_cq->q_depth = ctx->queue_size;
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 09/20] net/ena: fix potential access to freed memory during device reset
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

If the ena driver detects that the device is not behave as expected,
it tries to reset the device.
The reset flow calls ena_down, which will frees all the resources
the driver allocates and then it will reset the device.

This flow can cause memory corruption if the device is still writes
to the driver's memory space.
To overcome this potential race, move the reset before the device
resources are freed.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 56 +++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 0b718ee..bb7eeea 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -80,14 +80,18 @@ static void ena_tx_timeout(struct net_device *dev)
 {
 	struct ena_adapter *adapter = netdev_priv(dev);
 
+	/* Change the state of the device to trigger reset
+	 * Check that we are not in the middle or a trigger already
+	 */
+
+	if (test_and_set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+		return;
+
 	u64_stats_update_begin(&adapter->syncp);
 	adapter->dev_stats.tx_timeout++;
 	u64_stats_update_end(&adapter->syncp);
 
 	netif_err(adapter, tx_err, dev, "Transmit time out\n");
-
-	/* Change the state of the device to trigger reset */
-	set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
 }
 
 static void update_rx_ring_mtu(struct ena_adapter *adapter, int mtu)
@@ -1116,7 +1120,8 @@ static int ena_io_poll(struct napi_struct *napi, int budget)
 
 	tx_budget = tx_ring->ring_size / ENA_TX_POLL_BUDGET_DIVIDER;
 
-	if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags)) {
+	if (!test_bit(ENA_FLAG_DEV_UP, &tx_ring->adapter->flags) ||
+	    test_bit(ENA_FLAG_TRIGGER_RESET, &tx_ring->adapter->flags)) {
 		napi_complete_done(napi, 0);
 		return 0;
 	}
@@ -1705,12 +1710,22 @@ static void ena_down(struct ena_adapter *adapter)
 	adapter->dev_stats.interface_down++;
 	u64_stats_update_end(&adapter->syncp);
 
-	/* After this point the napi handler won't enable the tx queue */
-	ena_napi_disable_all(adapter);
 	netif_carrier_off(adapter->netdev);
 	netif_tx_disable(adapter->netdev);
 
+	/* After this point the napi handler won't enable the tx queue */
+	ena_napi_disable_all(adapter);
+
 	/* After destroy the queue there won't be any new interrupts */
+
+	if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) {
+		int rc;
+
+		rc = ena_com_dev_reset(adapter->ena_dev);
+		if (rc)
+			dev_err(&adapter->pdev->dev, "Device reset failed\n");
+	}
+
 	ena_destroy_all_io_queues(adapter);
 
 	ena_disable_io_intr_sync(adapter);
@@ -2073,6 +2088,14 @@ static void ena_netpoll(struct net_device *netdev)
 	struct ena_adapter *adapter = netdev_priv(netdev);
 	int i;
 
+	/* Dont schedule NAPI if the driver is in the middle of reset
+	 * or netdev is down.
+	 */
+
+	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags) ||
+	    test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+		return;
+
 	for (i = 0; i < adapter->num_queues; i++)
 		napi_schedule(&adapter->ena_napi[i].napi);
 }
@@ -2459,6 +2482,14 @@ static void ena_fw_reset_device(struct work_struct *work)
 	bool dev_up, wd_state;
 	int rc;
 
+	if (unlikely(!test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
+		dev_err(&pdev->dev,
+			"device reset schedule while reset bit is off\n");
+		return;
+	}
+
+	netif_carrier_off(netdev);
+
 	del_timer_sync(&adapter->timer_service);
 
 	rtnl_lock();
@@ -2472,12 +2503,6 @@ static void ena_fw_reset_device(struct work_struct *work)
 	 */
 	ena_close(netdev);
 
-	rc = ena_com_dev_reset(ena_dev);
-	if (rc) {
-		dev_err(&pdev->dev, "Device reset failed\n");
-		goto err;
-	}
-
 	ena_free_mgmnt_irq(adapter);
 
 	ena_disable_msix(adapter);
@@ -2490,6 +2515,8 @@ static void ena_fw_reset_device(struct work_struct *work)
 
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
 
+	clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+
 	/* Finish with the destroy part. Start the init part */
 
 	rc = ena_device_init(ena_dev, adapter->pdev, &get_feat_ctx, &wd_state);
@@ -2555,6 +2582,9 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
 	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
 		return;
 
+	if (test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))
+		return;
+
 	if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT)
 		return;
 
@@ -2696,7 +2726,7 @@ static void ena_timer_service(unsigned long data)
 	if (host_info)
 		ena_update_host_info(host_info, adapter->netdev);
 
-	if (unlikely(test_and_clear_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
+	if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) {
 		netif_err(adapter, drv, adapter->netdev,
 			  "Trigger reset is on\n");
 		ena_dump_stats_to_dmesg(adapter);
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 03/20] net/ena: fix queues number calculation
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

The ENA driver tries to open a queue per vCPU.
To determine how many vCPUs the instance have it uses num_possible_cpus
while it should have use num_online_cpus instead.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 397c9bc..224302c 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2667,7 +2667,7 @@ static int ena_calc_io_queue_num(struct pci_dev *pdev,
 		io_sq_num = get_feat_ctx->max_queues.max_sq_num;
 	}
 
-	io_queue_num = min_t(int, num_possible_cpus(), ENA_MAX_NUM_IO_QUEUES);
+	io_queue_num = min_t(int, num_online_cpus(), ENA_MAX_NUM_IO_QUEUES);
 	io_queue_num = min_t(int, io_queue_num, io_sq_num);
 	io_queue_num = min_t(int, io_queue_num,
 			     get_feat_ctx->max_queues.max_cq_num);
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 20/20] net/ena: increase driver version to 1.1.2
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index ed42e07..de1e5ac 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -45,7 +45,7 @@
 #include "ena_eth_com.h"
 
 #define DRV_MODULE_VER_MAJOR	1
-#define DRV_MODULE_VER_MINOR	0
+#define DRV_MODULE_VER_MINOR	1
 #define DRV_MODULE_VER_SUBMINOR 2
 
 #define DRV_MODULE_NAME		"ena"
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 19/20] net/ena: restructure skb allocation
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

To increase readability, refactor skb allocation to dedicated function
This change does not impact the performance since the compiler optimize
the code and elimitate the if condition.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 46 ++++++++++++++++------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index ee80472..8d42960 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -787,6 +787,28 @@ static int ena_clean_tx_irq(struct ena_ring *tx_ring, u32 budget)
 	return tx_pkts;
 }
 
+static struct sk_buff *ena_alloc_skb(struct ena_ring *rx_ring, bool frags)
+{
+	struct sk_buff *skb;
+
+	if (frags)
+		skb = napi_get_frags(rx_ring->napi);
+	else
+		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+						rx_ring->rx_copybreak);
+
+	if (unlikely(!skb)) {
+		u64_stats_update_begin(&rx_ring->syncp);
+		rx_ring->rx_stats.skb_alloc_fail++;
+		u64_stats_update_end(&rx_ring->syncp);
+		netif_err(rx_ring->adapter, rx_err, rx_ring->netdev,
+			  "Failed to allocate skb. frags: %d\n", frags);
+		return NULL;
+	}
+
+	return skb;
+}
+
 static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
 				  struct ena_com_rx_buf_info *ena_bufs,
 				  u32 descs,
@@ -795,8 +817,7 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
 	struct sk_buff *skb;
 	struct ena_rx_buffer *rx_info =
 		&rx_ring->rx_buffer_info[*next_to_clean];
-	u32 len;
-	u32 buf = 0;
+	u32 len, buf = 0;
 	void *va;
 
 	len = ena_bufs[0].len;
@@ -815,16 +836,9 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
 	prefetch(va + NET_IP_ALIGN);
 
 	if (len <= rx_ring->rx_copybreak) {
-		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
-						rx_ring->rx_copybreak);
-		if (unlikely(!skb)) {
-			u64_stats_update_begin(&rx_ring->syncp);
-			rx_ring->rx_stats.skb_alloc_fail++;
-			u64_stats_update_end(&rx_ring->syncp);
-			netif_err(rx_ring->adapter, rx_err, rx_ring->netdev,
-				  "Failed to allocate skb\n");
+		skb = ena_alloc_skb(rx_ring, false);
+		if (unlikely(!skb))
 			return NULL;
-		}
 
 		netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
 			  "rx allocated small packet. len %d. data_len %d\n",
@@ -848,15 +862,9 @@ static struct sk_buff *ena_rx_skb(struct ena_ring *rx_ring,
 		return skb;
 	}
 
-	skb = napi_get_frags(rx_ring->napi);
-	if (unlikely(!skb)) {
-		netif_dbg(rx_ring->adapter, rx_status, rx_ring->netdev,
-			  "Failed allocating skb\n");
-		u64_stats_update_begin(&rx_ring->syncp);
-		rx_ring->rx_stats.skb_alloc_fail++;
-		u64_stats_update_end(&rx_ring->syncp);
+	skb = ena_alloc_skb(rx_ring, true);
+	if (unlikely(!skb))
 		return NULL;
-	}
 
 	do {
 		dma_unmap_page(rx_ring->dev,
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 17/20] net/ena: add IPv6 extended protocols to ena_admin_flow_hash_proto
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

We intend to use those fields in the future.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_admin_defs.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
index 35ae511..92bba08 100644
--- a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
@@ -627,6 +627,12 @@ enum ena_admin_flow_hash_proto {
 
 	ENA_ADMIN_RSS_NOT_IP	= 7,
 
+	/* TCPv6 with extension header */
+	ENA_ADMIN_RSS_TCP6_EX	= 8,
+
+	/* IPv6 with extension header */
+	ENA_ADMIN_RSS_IP6_EX	= 9,
+
 	ENA_ADMIN_RSS_PROTO_NUM	= 16,
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 16/20] net/ena: use napi_schedule_irqoff when possible
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 8c1e14b..6c49529 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1210,7 +1210,7 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data)
 	struct ena_napi *ena_napi = data;
 
 	atomic_set(&ena_napi->unmask_interrupt, 1);
-	napi_schedule(&ena_napi->napi);
+	napi_schedule_irqoff(&ena_napi->napi);
 
 	return IRQ_HANDLED;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 14/20] net/ena: change condition for host attribute configuration
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

Move the host info config to be the first admin command that is executed.
This change require the driver to remove the 'feature check'
from host info configuration flow.
The check is removed since the supported features bitmask field
is retrieved only after calling ENA_ADMIN_DEVICE_ATTRIBUTES admin command.

If set host info is not supported an error will be returned by the device.

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_com.c    | 8 +++-----
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 5 +++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index a550c8a..9deb0dd 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -2474,11 +2474,9 @@ int ena_com_set_host_attributes(struct ena_com_dev *ena_dev)
 
 	int ret;
 
-	if (!ena_com_check_supported_feature_id(ena_dev,
-						ENA_ADMIN_HOST_ATTR_CONFIG)) {
-		pr_warn("Set host attribute isn't supported\n");
-		return -EPERM;
-	}
+	/* Host attribute config is called before ena_com_get_dev_attr_feat
+	 * so ena_com can't check if the feature is supported.
+	 */
 
 	memset(&cmd, 0x0, sizeof(cmd));
 	admin_queue = &ena_dev->admin_queue;
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 7ae1fce..8c1e14b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2426,6 +2426,8 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev,
 	 */
 	ena_com_set_admin_polling_mode(ena_dev, true);
 
+	ena_config_host_info(ena_dev);
+
 	/* Get Device Attributes*/
 	rc = ena_com_get_dev_attr_feat(ena_dev, get_feat_ctx);
 	if (rc) {
@@ -2450,11 +2452,10 @@ static int ena_device_init(struct ena_com_dev *ena_dev, struct pci_dev *pdev,
 
 	*wd_state = !!(aenq_groups & BIT(ENA_ADMIN_KEEP_ALIVE));
 
-	ena_config_host_info(ena_dev);
-
 	return 0;
 
 err_admin_init:
+	ena_com_delete_host_info(ena_dev);
 	ena_com_admin_destroy(ena_dev);
 err_mmio_read_less:
 	ena_com_mmio_reg_read_request_destroy(ena_dev);
-- 
2.7.4

^ permalink raw reply related

* [PATCH V2 net 13/20] net/ena: change driver's default timeouts
From: Netanel Belgazal @ 2016-12-04 13:19 UTC (permalink / raw)
  To: linux-kernel, davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, alex, saeed, msw, aliguori, nafea
In-Reply-To: <1480857578-5065-1-git-send-email-netanel@annapurnalabs.com>

Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
---
 drivers/net/ethernet/amazon/ena/ena_com.c    | 4 ++--
 drivers/net/ethernet/amazon/ena/ena_netdev.h | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 4147d6e..a550c8a 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c
@@ -36,9 +36,9 @@
 /*****************************************************************************/
 
 /* Timeout in micro-sec */
-#define ADMIN_CMD_TIMEOUT_US (1000000)
+#define ADMIN_CMD_TIMEOUT_US (3000000)
 
-#define ENA_ASYNC_QUEUE_DEPTH 4
+#define ENA_ASYNC_QUEUE_DEPTH 16
 #define ENA_ADMIN_QUEUE_DEPTH 32
 
 #define MIN_ENA_VER (((ENA_COMMON_SPEC_VERSION_MAJOR) << \
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index c081fd3..ed42e07 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -39,6 +39,7 @@
 #include <linux/interrupt.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/u64_stats_sync.h>
 
 #include "ena_com.h"
 #include "ena_eth_com.h"
@@ -100,7 +101,7 @@
 /* Number of queues to check for missing queues per timer service */
 #define ENA_MONITORED_TX_QUEUES	4
 /* Max timeout packets before device reset */
-#define MAX_NUM_OF_TIMEOUTED_PACKETS 32
+#define MAX_NUM_OF_TIMEOUTED_PACKETS 128
 
 #define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1))
 
@@ -116,9 +117,9 @@
 #define ENA_IO_IRQ_IDX(q)		(ENA_IO_IRQ_FIRST_IDX + (q))
 
 /* ENA device should send keep alive msg every 1 sec.
- * We wait for 3 sec just to be on the safe side.
+ * We wait for 6 sec just to be on the safe side.
  */
-#define ENA_DEVICE_KALIVE_TIMEOUT	(3 * HZ)
+#define ENA_DEVICE_KALIVE_TIMEOUT	(6 * HZ)
 
 #define ENA_MMIO_DISABLE_REG_READ	BIT(0)
 
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox