Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net 2/4] net:ethernet:aquantia: Fix Tx queue hangups
From: Yunsheng Lin @ 2017-09-21 11:19 UTC (permalink / raw)
  To: Igor Russkikh, David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus
In-Reply-To: <cef3863edd8d504d7406f781c97260c52f21e156.1505915085.git.igor.russkikh@aquantia.com>

Hi, Igor

On 2017/9/21 18:53, Igor Russkikh wrote:
> Driver did a poor job in managing its Tx queues: Sometimes it could stop
> tx queues due to link down condition in aq_nic_xmit - but never waked up
> them. That led to Tx path total suspend.
> This patch fixes this and improves generic queue management:
> - introduces queue restart counter
> - uses generic netif_ interface to disable and enable tx path
> - refactors link up/down condition and introduces dmesg log event when
>   link changes.
> - introduces new constant for minimum descriptors count required for queue
>   wakeup
> 
> Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
> ---
>  drivers/net/ethernet/aquantia/atlantic/aq_cfg.h  |  4 ++
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 91 +++++++++++-------------
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.h  |  2 -
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  4 ++
>  drivers/net/ethernet/aquantia/atlantic/aq_vec.c  |  8 +--
>  6 files changed, 76 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> index 2149864..0fdaaa6 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> @@ -51,6 +51,10 @@
>  
>  #define AQ_CFG_SKB_FRAGS_MAX   32U
>  
> +/* Number of descriptors available in one ring to resume this ring queue
> + */
> +#define AQ_CFG_RESTART_DESC_THRES   (AQ_CFG_SKB_FRAGS_MAX * 2)
> +
>  #define AQ_CFG_NAPI_WEIGHT     64U
>  
>  #define AQ_CFG_MULTICAST_ADDRESS_MAX     32U
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> index f281392..24f573c 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> @@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self)
>  	return 0;
>  }
>  
> +static int aq_nic_update_link_status(struct aq_nic_s *self)
> +{
> +	int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
> +
> +	if (err < 0)
> +		return -1;


why not just return err?

> +
> +	if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps)
> +		pr_info("%s: link change old %d new %d\n",
> +			AQ_CFG_DRV_NAME, self->link_status.mbps,
> +			self->aq_hw->aq_link_status.mbps);

You has ndev in struct aq_nic_s *self, why not use netdev_*?


> +
> +	self->link_status = self->aq_hw->aq_link_status;
> +	if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) {
> +		aq_utils_obj_set(&self->header.flags,
> +				 AQ_NIC_FLAG_STARTED);
> +		aq_utils_obj_clear(&self->header.flags,
> +				   AQ_NIC_LINK_DOWN);
> +		netif_carrier_on(self->ndev);
> +		netif_tx_wake_all_queues(self->ndev);
> +	}
> +	if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
> +		netif_carrier_off(self->ndev);
> +		netif_tx_disable(self->ndev);
> +		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
> +	}
> +	return 0;
> +}
> +
>  static void aq_nic_service_timer_cb(unsigned long param)
>  {
>  	struct aq_nic_s *self = (struct aq_nic_s *)param;
> @@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param)
>  	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
>  		goto err_exit;
>  
> -	err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
> -	if (err < 0)
> +	err = aq_nic_update_link_status(self);
> +	if (err)
>  		goto err_exit;
>  
> -	self->link_status = self->aq_hw->aq_link_status;
> -
>  	self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw,
>  		    self->aq_nic_cfg.is_interrupt_moderation);
>  
> -	if (self->link_status.mbps) {
> -		aq_utils_obj_set(&self->header.flags,
> -				 AQ_NIC_FLAG_STARTED);
> -		aq_utils_obj_clear(&self->header.flags,
> -				   AQ_NIC_LINK_DOWN);
> -		netif_carrier_on(self->ndev);
> -	} else {
> -		netif_carrier_off(self->ndev);
> -		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
> -	}
> -
>  	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
>  	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
>  	for (i = AQ_DIMOF(self->aq_vec); i--;) {
> @@ -240,7 +256,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
>  int aq_nic_ndev_register(struct aq_nic_s *self)
>  {
>  	int err = 0;
> -	unsigned int i = 0U;
>  
>  	if (!self->ndev) {
>  		err = -EINVAL;
> @@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self)
>  
>  	netif_carrier_off(self->ndev);
>  
> -	for (i = AQ_CFG_VECS_MAX; i--;)
> -		aq_nic_ndev_queue_stop(self, i);
> +	netif_tx_disable(self->ndev);
>  
>  	err = register_netdev(self->ndev);
>  	if (err < 0)
> @@ -319,12 +333,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev)
>  		err = -EINVAL;
>  		goto err_exit;
>  	}
> -	if (netif_running(ndev)) {
> -		unsigned int i;
> -
> -		for (i = AQ_CFG_VECS_MAX; i--;)
> -			netif_stop_subqueue(ndev, i);
> -	}
> +	if (netif_running(ndev))
> +		netif_tx_disable(ndev);
>  
>  	for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs;
>  		self->aq_vecs++) {
> @@ -384,16 +394,6 @@ int aq_nic_init(struct aq_nic_s *self)
>  	return err;
>  }
>  
> -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx)
> -{
> -	netif_start_subqueue(self->ndev, idx);
> -}
> -
> -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx)
> -{
> -	netif_stop_subqueue(self->ndev, idx);
> -}
> -
>  int aq_nic_start(struct aq_nic_s *self)
>  {
>  	struct aq_vec_s *aq_vec = NULL;
> @@ -452,10 +452,6 @@ int aq_nic_start(struct aq_nic_s *self)
>  			goto err_exit;
>  	}
>  
> -	for (i = 0U, aq_vec = self->aq_vec[0];
> -		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
> -		aq_nic_ndev_queue_start(self, i);
> -
>  	err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
>  	if (err < 0)
>  		goto err_exit;
> @@ -464,6 +460,8 @@ int aq_nic_start(struct aq_nic_s *self)
>  	if (err < 0)
>  		goto err_exit;
>  
> +	netif_tx_start_all_queues(self->ndev);
> +
>  err_exit:
>  	return err;
>  }
> @@ -603,7 +601,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  	unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
>  	unsigned int tc = 0U;
>  	int err = NETDEV_TX_OK;
> -	bool is_nic_in_bad_state;
>  
>  	frags = skb_shinfo(skb)->nr_frags + 1;
>  
> @@ -614,13 +611,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  		goto err_exit;
>  	}
>  
> -	is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags,
> -						AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
> -						(aq_ring_avail_dx(ring) <
> -						AQ_CFG_SKB_FRAGS_MAX);
> +	aq_ring_update_queue_state(ring);
>  
> -	if (is_nic_in_bad_state) {
> -		aq_nic_ndev_queue_stop(self, ring->idx);
> +	/* Above status update may stop the queue. Check this. */
> +	if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
>  		err = NETDEV_TX_BUSY;
>  		goto err_exit;
>  	}
> @@ -632,9 +626,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  						      ring,
>  						      frags);
>  		if (err >= 0) {
> -			if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1)
> -				aq_nic_ndev_queue_stop(self, ring->idx);
> -
>  			++ring->stats.tx.packets;
>  			ring->stats.tx.bytes += skb->len;
>  		}
> @@ -906,9 +897,7 @@ int aq_nic_stop(struct aq_nic_s *self)
>  	struct aq_vec_s *aq_vec = NULL;
>  	unsigned int i = 0U;
>  
> -	for (i = 0U, aq_vec = self->aq_vec[0];
> -		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
> -		aq_nic_ndev_queue_stop(self, i);
> +	netif_tx_disable(self->ndev);
>  
>  	del_timer_sync(&self->service_timer);
>  
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> index 7fc2a5e..0ddd556 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> @@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self);
>  int aq_nic_init(struct aq_nic_s *self);
>  int aq_nic_cfg_start(struct aq_nic_s *self);
>  int aq_nic_ndev_register(struct aq_nic_s *self);
> -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx);
> -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx);
>  void aq_nic_ndev_free(struct aq_nic_s *self);
>  int aq_nic_start(struct aq_nic_s *self);
>  int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb);
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> index 4eee199..02f79b0 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> @@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self)
>  	return 0;
>  }
>  
> +void aq_ring_update_queue_state(struct aq_ring_s *ring)
> +{
> +	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
> +		aq_ring_queue_stop(ring);
> +	else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES)
> +		aq_ring_queue_wake(ring);
> +}
> +
> +void aq_ring_queue_wake(struct aq_ring_s *ring)
> +{
> +	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
> +
> +	if (__netif_subqueue_stopped(ndev, ring->idx)) {
> +		netif_wake_subqueue(ndev, ring->idx);
> +		ring->stats.tx.queue_restarts++;
> +	}
> +}
> +
> +void aq_ring_queue_stop(struct aq_ring_s *ring)
> +{
> +	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
> +
> +	if (!__netif_subqueue_stopped(ndev, ring->idx))
> +		netif_stop_subqueue(ndev, ring->idx);
> +}
> +
>  void aq_ring_tx_clean(struct aq_ring_s *self)
>  {
>  	struct device *dev = aq_nic_get_dev(self->aq_nic);
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> index 782176c..24523b5 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> @@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s {
>  	u64 errors;
>  	u64 packets;
>  	u64 bytes;
> +	u64 queue_restarts;
>  };
>  
>  union aq_ring_stats_s {
> @@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
>  int aq_ring_init(struct aq_ring_s *self);
>  void aq_ring_rx_deinit(struct aq_ring_s *self);
>  void aq_ring_free(struct aq_ring_s *self);
> +void aq_ring_update_queue_state(struct aq_ring_s *ring);
> +void aq_ring_queue_wake(struct aq_ring_s *ring);
> +void aq_ring_queue_stop(struct aq_ring_s *ring);
>  void aq_ring_tx_clean(struct aq_ring_s *self);
>  int aq_ring_rx_clean(struct aq_ring_s *self,
>  		     struct napi_struct *napi,
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> index ebf5880..305ff8f 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> @@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
>  			if (ring[AQ_VEC_TX_ID].sw_head !=
>  			    ring[AQ_VEC_TX_ID].hw_head) {
>  				aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
> -
> -				if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) >
> -				    AQ_CFG_SKB_FRAGS_MAX) {
> -					aq_nic_ndev_queue_start(self->aq_nic,
> -						ring[AQ_VEC_TX_ID].idx);
> -				}
> +				aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]);
>  				was_tx_cleaned = true;
>  			}
>  
> @@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self,
>  		stats_tx->packets += tx->packets;
>  		stats_tx->bytes += tx->bytes;
>  		stats_tx->errors += tx->errors;
> +		stats_tx->queue_restarts += tx->queue_restarts;
>  	}
>  }
>  
> 

^ permalink raw reply

* Re: [patch net-next 03/12] ipmr: Add FIB notification access functions
From: Nikolay Aleksandrov @ 2017-09-21 11:19 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, yotamg, idosch, mlxsw
In-Reply-To: <20170921064338.1282-4-jiri@resnulli.us>

On 21/09/17 09:43, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Make the ipmr module register as a FIB notifier. To do that, implement both
> the ipmr_seq_read and ipmr_dump ops.
> 
> The ipmr_seq_read op returns a sequence counter that is incremented on
> every notification related operation done by the ipmr. To implement that,
> add a sequence counter in the netns_ipv4 struct and increment it whenever a
> new MFC route or VIF are added or deleted. The sequence operations are
> protected by the RTNL lock.
> 
> The ipmr_dump iterates the list of MFC routes and the list of VIF entries
> and sends notifications about them. The entries dump is done under RCU.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/linux/mroute.h   |  15 ++++++
>  include/net/netns/ipv4.h |   3 ++
>  net/ipv4/ipmr.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 151 insertions(+), 2 deletions(-)
> 
[snip]
> +
> +static int ipmr_dump(struct net *net, struct notifier_block *nb)
> +{
> +	struct mr_table *mrt;
> +	int err;
> +
> +	err = ipmr_rules_dump(net, nb);
> +	if (err)
> +		return err;
> +
> +	ipmr_for_each_table(mrt, net) {
> +		struct vif_device *v = &mrt->vif_table[0];
> +		struct mfc_cache *mfc;
> +		int vifi;
> +
> +		/* Notifiy on table VIF entries */
> +		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
> +			if (!v->dev)
> +				continue;
> +
> +			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
> +						     v, vifi, mrt->id);
> +		}

The VIF table is protected by mrt_lock (rwlock), here with RCU only
you're not guaranteed to keep v->dev. It can become NULL after the check above.
For details you can see vif_delete() in net/ipv4/ipmr.c. You need at least
mrt_lock for reading.

> +
> +		/* Notify on table MFC entries */
> +		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
> +			call_ipmr_mfc_entry_notifier(nb, net,
> +						     FIB_EVENT_ENTRY_ADD, mfc,
> +						     mrt->id);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct fib_notifier_ops ipmr_notifier_ops_template = {
> +	.family		= RTNL_FAMILY_IPMR,
> +	.fib_seq_read	= ipmr_seq_read,
> +	.fib_dump	= ipmr_dump,
> +	.owner		= THIS_MODULE,
> +};
> +
> +int __net_init ipmr_notifier_init(struct net *net)
> +{
> +	struct fib_notifier_ops *ops;
> +
> +	net->ipv4.ipmr_seq = 0;
> +
> +	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
> +	if (IS_ERR(ops))
> +		return PTR_ERR(ops);
> +	net->ipv4.ipmr_notifier_ops = ops;
> +
> +	return 0;
> +}
> +
> +static void __net_exit ipmr_notifier_exit(struct net *net)
> +{
> +	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
> +	net->ipv4.ipmr_notifier_ops = NULL;
> +}
> +
>  /* Setup for IP multicast routing */
>  static int __net_init ipmr_net_init(struct net *net)
>  {
>  	int err;
>  
> +	err = ipmr_notifier_init(net);
> +	if (err)
> +		goto ipmr_notifier_fail;
> +
>  	err = ipmr_rules_init(net);
>  	if (err < 0)
> -		goto fail;
> +		goto ipmr_rules_fail;
>  
>  #ifdef CONFIG_PROC_FS
>  	err = -ENOMEM;
> @@ -3074,7 +3202,9 @@ static int __net_init ipmr_net_init(struct net *net)
>  proc_vif_fail:
>  	ipmr_rules_exit(net);
>  #endif
> -fail:
> +ipmr_rules_fail:
> +	ipmr_notifier_exit(net);
> +ipmr_notifier_fail:
>  	return err;
>  }
>  
> @@ -3084,6 +3214,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
>  	remove_proc_entry("ip_mr_cache", net->proc_net);
>  	remove_proc_entry("ip_mr_vif", net->proc_net);
>  #endif
> +	ipmr_notifier_exit(net);
>  	ipmr_rules_exit(net);
>  }
>  
> 

^ permalink raw reply

* Re: [PATCH net] bpf: one perf event close won't free bpf program attached by another perf event
From: Peter Zijlstra @ 2017-09-21 11:17 UTC (permalink / raw)
  To: Yonghong Song; +Cc: Steven Rostedt, ast, daniel, netdev, kernel-team
In-Reply-To: <9e968490-87ae-7a79-9e59-0dcc840a93f5@fb.com>

On Wed, Sep 20, 2017 at 10:20:13PM -0700, Yonghong Song wrote:
> > (2). trace_event_call->perf_events are per cpu data structure, that
> > means, some filtering logic is needed to avoid the same perf_event prog
> > is executing twice.
> 
> What I mean here is that the trace_event_call->perf_events need to be
> checked on ALL cpus since bpf prog should be executed regardless of
> cpu affiliation. It is possible that the same perf_event in different
> per_cpu bucket and hence filtering is needed to avoid the same
> perf_event bpf_prog is executed twice.

An event will only ever be on a single CPU's list at any one time IIRC.

Now, hysterically perf_event_set_bpf_prog used the tracepoint crud
because that already had bpf bits in. But it might make sense to look at
unifying the bpf stuff across all the different event types. Have them
all use event->prog.

I suspect that would break a fair bunch of bpf proglets, since the data
access to the trace data would be completely different, but it would be
much nicer to not have this distinction based on event type.

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-21 11:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <22cde020-e13a-3635-512c-25532f754bda@itcare.pl>



W dniu 2017-09-21 o 13:12, Paweł Staszewski pisze:
>
>
> W dniu 2017-09-21 o 13:03, Eric Dumazet pisze:
>> On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
>>> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
>>>> On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
>>>>>> Thanks very much Pawel for the feedback.
>>>>>>
>>>>>> I was looking into the code (specifically IPv4 part) and found 
>>>>>> that in
>>>>>> free_fib_info_rcu(), we call free_nh_exceptions() without holding 
>>>>>> the
>>>>>> fnhe_lock. I am wondering if that could cause some race condition on
>>>>>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
>>>>>> same dst could be happening.
>>>>>>
>>>>>> But as we call free_fib_info_rcu() only after the grace period, and
>>>>>> the lookup code which could potentially modify
>>>>>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
>>>>>> fine...
>>>>>>
>>>>> Hi Pawel,
>>>>>
>>>>> Could you try the following debug patch on top of net-next branch and
>>>>> reproduce the issue check if there are warning msg showing?
>>>>>
>>>>> diff --git a/include/net/dst.h b/include/net/dst.h
>>>>> index 93568bd0a352..82aff41c6f63 100644
>>>>> --- a/include/net/dst.h
>>>>> +++ b/include/net/dst.h
>>>>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
>>>>> *dst, unsigned long time)
>>>>>    static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>>>>    {
>>>>>           if (dst)
>>>>> -               atomic_inc(&dst->__refcnt);
>>>>> +               dst_hold(dst);
>>>>>           return dst;
>>>>>    }
>>>>>
>>>>> Thanks.
>>>>> Wei
>>>>>
>>>> Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
>>>> unified  (to the 'safe' version)
>>>>
>>>> We no longer have gc to protect from 0 -> 1 transition of dst 
>>>> refcount.
>>>>
>>>>
>>>>
>>>>
>>> After adding patch from Wei
>>> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
>>>
>> OK we have two problems here
>>
>> 1) We need to unify skb_dst_force()  ( for net tree )
>>
>> 2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
>> lower device. This will considerably help your performance.
>>
>>
>> For 1), this is what I had in mind, can you try it ?
>>
>> Thanks a lot !
>>
>> diff --git a/include/net/dst.h b/include/net/dst.h
>> index 
>> 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 
>> 100644
>> --- a/include/net/dst.h
>> +++ b/include/net/dst.h
>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry 
>> *dst, unsigned long time)
>>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>   {
>>       if (dst)
>> -        atomic_inc(&dst->__refcnt);
>> +        dst_hold(dst);
>>       return dst;
>>   }
>>   @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff 
>> *nskb, const struct sk_buff *oskb
>>       __skb_dst_copy(nskb, oskb->_skb_refdst);
>>   }
>>   -/**
>> - * skb_dst_force - makes sure skb dst is refcounted
>> - * @skb: buffer
>> - *
>> - * If dst is not yet refcounted, let's do it
>> - */
>> -static inline void skb_dst_force(struct sk_buff *skb)
>> -{
>> -    if (skb_dst_is_noref(skb)) {
>> -        WARN_ON(!rcu_read_lock_held());
>> -        skb->_skb_refdst &= ~SKB_DST_NOREF;
>> -        dst_clone(skb_dst(skb));
>> -    }
>> -}
>> -
>>   /**
>>    * dst_hold_safe - Take a reference on a dst if possible
>>    * @dst: pointer to dst entry
>> @@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct 
>> sk_buff *skb)
>>       }
>>   }
>>   +/**
>> + * skb_dst_force - makes sure skb dst is refcounted
>> + * @skb: buffer
>> + *
>> + * If dst is not yet refcounted, let's do it
>> + */
>> +static inline void skb_dst_force(struct sk_buff *skb)
>> +{
>> +    if (skb_dst_is_noref(skb)) {
>> +        struct dst_entry *dst = skb_dst(skb);
>> +
>> +        WARN_ON(!rcu_read_lock_held());
>> +        if (!dst_hold_safe(dst))
>> +            dst = NULL;
>> +        skb->_skb_refdst = (unsigned long)dst;
>> +    }
>> +}
>>     /**
>>    *    __skb_tunnel_rx - prepare skb for rx reinsert
>>
>>
>>
> Thanks
>
> What is weird i have this part in my net-next from git:
> /**
>  * skb_dst_force_safe - makes sure skb dst is refcounted
>  * @skb: buffer
>  *
>  * If dst is not yet refcounted and not destroyed, grab a ref on it.
>  */
> static inline void skb_dst_force_safe(struct sk_buff *skb)
> {
>         if (skb_dst_is_noref(skb)) {
>                 struct dst_entry *dst = skb_dst(skb);
>
>                 if (!dst_hold_safe(dst))
>                         dst = NULL;
>
>                 skb->_skb_refdst = (unsigned long)dst;
>         }
> }
>
>
>
ok the difference is skb_dst_force_safe not skb_dst_force

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-21 11:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <1505991826.29839.124.camel@edumazet-glaptop3.roam.corp.google.com>



W dniu 2017-09-21 o 13:03, Eric Dumazet pisze:
> On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
>> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
>>> On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
>>>>> Thanks very much Pawel for the feedback.
>>>>>
>>>>> I was looking into the code (specifically IPv4 part) and found that in
>>>>> free_fib_info_rcu(), we call free_nh_exceptions() without holding the
>>>>> fnhe_lock. I am wondering if that could cause some race condition on
>>>>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
>>>>> same dst could be happening.
>>>>>
>>>>> But as we call free_fib_info_rcu() only after the grace period, and
>>>>> the lookup code which could potentially modify
>>>>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
>>>>> fine...
>>>>>
>>>> Hi Pawel,
>>>>
>>>> Could you try the following debug patch on top of net-next branch and
>>>> reproduce the issue check if there are warning msg showing?
>>>>
>>>> diff --git a/include/net/dst.h b/include/net/dst.h
>>>> index 93568bd0a352..82aff41c6f63 100644
>>>> --- a/include/net/dst.h
>>>> +++ b/include/net/dst.h
>>>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
>>>> *dst, unsigned long time)
>>>>    static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>>>    {
>>>>           if (dst)
>>>> -               atomic_inc(&dst->__refcnt);
>>>> +               dst_hold(dst);
>>>>           return dst;
>>>>    }
>>>>
>>>> Thanks.
>>>> Wei
>>>>
>>> Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
>>> unified  (to the 'safe' version)
>>>
>>> We no longer have gc to protect from 0 -> 1 transition of dst refcount.
>>>
>>>
>>>
>>>
>> After adding patch from Wei
>> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
>>
> OK we have two problems here
>
> 1) We need to unify skb_dst_force()  ( for net tree )
>
> 2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
> lower device. This will considerably help your performance.
>
>
> For 1), this is what I had in mind, can you try it ?
>
> Thanks a lot !
>
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>   {
>   	if (dst)
> -		atomic_inc(&dst->__refcnt);
> +		dst_hold(dst);
>   	return dst;
>   }
>   
> @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
>   	__skb_dst_copy(nskb, oskb->_skb_refdst);
>   }
>   
> -/**
> - * skb_dst_force - makes sure skb dst is refcounted
> - * @skb: buffer
> - *
> - * If dst is not yet refcounted, let's do it
> - */
> -static inline void skb_dst_force(struct sk_buff *skb)
> -{
> -	if (skb_dst_is_noref(skb)) {
> -		WARN_ON(!rcu_read_lock_held());
> -		skb->_skb_refdst &= ~SKB_DST_NOREF;
> -		dst_clone(skb_dst(skb));
> -	}
> -}
> -
>   /**
>    * dst_hold_safe - Take a reference on a dst if possible
>    * @dst: pointer to dst entry
> @@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct sk_buff *skb)
>   	}
>   }
>   
> +/**
> + * skb_dst_force - makes sure skb dst is refcounted
> + * @skb: buffer
> + *
> + * If dst is not yet refcounted, let's do it
> + */
> +static inline void skb_dst_force(struct sk_buff *skb)
> +{
> +	if (skb_dst_is_noref(skb)) {
> +		struct dst_entry *dst = skb_dst(skb);
> +
> +		WARN_ON(!rcu_read_lock_held());
> +		if (!dst_hold_safe(dst))
> +			dst = NULL;
> +		skb->_skb_refdst = (unsigned long)dst;
> +	}
> +}
>   
>   /**
>    *	__skb_tunnel_rx - prepare skb for rx reinsert
>
>
>
Thanks

What is weird i have this part in my net-next from git:
/**
  * skb_dst_force_safe - makes sure skb dst is refcounted
  * @skb: buffer
  *
  * If dst is not yet refcounted and not destroyed, grab a ref on it.
  */
static inline void skb_dst_force_safe(struct sk_buff *skb)
{
         if (skb_dst_is_noref(skb)) {
                 struct dst_entry *dst = skb_dst(skb);

                 if (!dst_hold_safe(dst))
                         dst = NULL;

                 skb->_skb_refdst = (unsigned long)dst;
         }
}

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Eric Dumazet @ 2017-09-21 11:03 UTC (permalink / raw)
  To: Paweł Staszewski
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <a016d5bc-1cbb-44d7-9ebf-e7e5428e6f98@itcare.pl>

On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
> 
> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
> > On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
> >>> Thanks very much Pawel for the feedback.
> >>>
> >>> I was looking into the code (specifically IPv4 part) and found that in
> >>> free_fib_info_rcu(), we call free_nh_exceptions() without holding the
> >>> fnhe_lock. I am wondering if that could cause some race condition on
> >>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
> >>> same dst could be happening.
> >>>
> >>> But as we call free_fib_info_rcu() only after the grace period, and
> >>> the lookup code which could potentially modify
> >>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
> >>> fine...
> >>>
> >> Hi Pawel,
> >>
> >> Could you try the following debug patch on top of net-next branch and
> >> reproduce the issue check if there are warning msg showing?
> >>
> >> diff --git a/include/net/dst.h b/include/net/dst.h
> >> index 93568bd0a352..82aff41c6f63 100644
> >> --- a/include/net/dst.h
> >> +++ b/include/net/dst.h
> >> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
> >> *dst, unsigned long time)
> >>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
> >>   {
> >>          if (dst)
> >> -               atomic_inc(&dst->__refcnt);
> >> +               dst_hold(dst);
> >>          return dst;
> >>   }
> >>
> >> Thanks.
> >> Wei
> >>
> >
> > Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
> > unified  (to the 'safe' version)
> >
> > We no longer have gc to protect from 0 -> 1 transition of dst refcount.
> >
> >
> >
> >
> 
> After adding patch from Wei
> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
> 

OK we have two problems here 

1) We need to unify skb_dst_force()  ( for net tree )

2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
lower device. This will considerably help your performance.


For 1), this is what I had in mind, can you try it ?

Thanks a lot !

diff --git a/include/net/dst.h b/include/net/dst.h
index 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
 static inline struct dst_entry *dst_clone(struct dst_entry *dst)
 {
 	if (dst)
-		atomic_inc(&dst->__refcnt);
+		dst_hold(dst);
 	return dst;
 }
 
@@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
 	__skb_dst_copy(nskb, oskb->_skb_refdst);
 }
 
-/**
- * skb_dst_force - makes sure skb dst is refcounted
- * @skb: buffer
- *
- * If dst is not yet refcounted, let's do it
- */
-static inline void skb_dst_force(struct sk_buff *skb)
-{
-	if (skb_dst_is_noref(skb)) {
-		WARN_ON(!rcu_read_lock_held());
-		skb->_skb_refdst &= ~SKB_DST_NOREF;
-		dst_clone(skb_dst(skb));
-	}
-}
-
 /**
  * dst_hold_safe - Take a reference on a dst if possible
  * @dst: pointer to dst entry
@@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct sk_buff *skb)
 	}
 }
 
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		struct dst_entry *dst = skb_dst(skb);
+
+		WARN_ON(!rcu_read_lock_held());
+		if (!dst_hold_safe(dst))
+			dst = NULL;
+		skb->_skb_refdst = (unsigned long)dst;
+	}
+}
 
 /**
  *	__skb_tunnel_rx - prepare skb for rx reinsert

^ permalink raw reply related

* [PATCH 1/1] net:nfc: use setup_timer
From: Allen Pais @ 2017-09-21 10:59 UTC (permalink / raw)
  To: linux-kernel; +Cc: sameo, netdev, Allen Pais

    Use setup_timer function instead of initializing timer with the
    function and data fields.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
---
 net/nfc/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/nfc/core.c b/net/nfc/core.c
index 5cf33df..e5e23c2 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
 	dev->targets_generation = 1;
 
 	if (ops->check_presence) {
-		init_timer(&dev->check_pres_timer);
-		dev->check_pres_timer.data = (unsigned long)dev;
-		dev->check_pres_timer.function = nfc_check_pres_timeout;
+		setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout,
+			    (unsigned long)dev);
 
 		INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
 	}
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 2/4] cxgb4: add basic tc flower offload support
From: Yunsheng Lin @ 2017-09-21 10:55 UTC (permalink / raw)
  To: Rahul Lakkireddy, netdev; +Cc: davem, kumaras, ganeshgr, nirranjan, indranil
In-Reply-To: <dc5f8e6419ad3439b14f39306245d98537be3306.1505977744.git.rahul.lakkireddy@chelsio.com>

Hi, Kumar

On 2017/9/21 15:33, Rahul Lakkireddy wrote:
> From: Kumar Sanghvi <kumaras@chelsio.com>
> 
> Add support to add/remove flows for offload.  Following match
> and action are supported for offloading a flow:
> 
> Match: ether-protocol, IPv4/IPv6 addresses, L4 ports (TCP/UDP)
> Action: drop, redirect to another port on the device.
> 
> The qualifying flows can have accompanying mask information.
> 
> Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
> Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> ---
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |   3 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  |  26 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |   2 +
>  .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 285 ++++++++++++++++++++-
>  .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  17 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h     |   1 +
>  6 files changed, 332 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> index ea72d2d2e1b4..26eac599ab2c 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> @@ -904,6 +904,9 @@ struct adapter {
>  	/* TC u32 offload */
>  	struct cxgb4_tc_u32_table *tc_u32;
>  	struct chcr_stats_debug chcr_stats;
> +
> +	/* TC flower offload */
> +	DECLARE_HASHTABLE(flower_anymatch_tbl, 9);
>  };
>  
>  /* Support for "sched-class" command to allow a TX Scheduling Class to be
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> index 45b5853ca2f1..07a4619e2164 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> @@ -148,6 +148,32 @@ static int get_filter_steerq(struct net_device *dev,
>  	return iq;
>  }
>  
> +int cxgb4_get_free_ftid(struct net_device *dev, int family)
> +{
> +	struct adapter *adap = netdev2adap(dev);
> +	struct tid_info *t = &adap->tids;
> +	int ftid;
> +
> +	spin_lock_bh(&t->ftid_lock);
> +	if (family == PF_INET) {
> +		ftid = find_first_zero_bit(t->ftid_bmap, t->nftids);
> +		if (ftid >= t->nftids)
> +			ftid = -1;
> +	} else {
> +		ftid = bitmap_find_free_region(t->ftid_bmap, t->nftids, 2);
> +		if (ftid < 0) {
> +			ftid = -1;

ftid = -1 is not needed?

> +			goto out_unlock;
> +		}
> +
> +		/* this is only a lookup, keep the found region unallocated */
> +		bitmap_release_region(t->ftid_bmap, ftid, 2);
> +	}
> +out_unlock:
> +	spin_unlock_bh(&t->ftid_lock);
> +	return ftid;
> +}
> +
>  static int cxgb4_set_ftid(struct tid_info *t, int fidx, int family)
>  {
>  	spin_lock_bh(&t->ftid_lock);
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index 8923affbdaf8..3ba4e1ff8486 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -5105,6 +5105,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
>  		if (!adapter->tc_u32)
>  			dev_warn(&pdev->dev,
>  				 "could not offload tc u32, continuing\n");
> +
> +		cxgb4_init_tc_flower(adapter);
>  	}
>  
>  	if (is_offload(adapter)) {
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> index 16dff71e4d02..1af01101faaf 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> @@ -38,16 +38,292 @@
>  #include "cxgb4.h"
>  #include "cxgb4_tc_flower.h"
>  
> +static struct ch_tc_flower_entry *allocate_flower_entry(void)
> +{
> +	struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL);
> +	return new;
> +}
> +
> +/* Must be called with either RTNL or rcu_read_lock */
> +static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap,
> +						   unsigned long flower_cookie)
> +{
> +	struct ch_tc_flower_entry *flower_entry;
> +
> +	hash_for_each_possible_rcu(adap->flower_anymatch_tbl, flower_entry,
> +				   link, flower_cookie)
> +		if (flower_entry->tc_flower_cookie == flower_cookie)
> +			return flower_entry;
> +	return NULL;
> +}
> +
> +static void cxgb4_process_flow_match(struct net_device *dev,
> +				     struct tc_cls_flower_offload *cls,
> +				     struct ch_filter_specification *fs)
> +{
> +	u16 addr_type = 0;
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
> +		struct flow_dissector_key_control *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_CONTROL,
> +						  cls->key);
> +
> +		addr_type = key->addr_type;
> +	}
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
> +		struct flow_dissector_key_basic *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_BASIC,
> +						  cls->key);
> +		struct flow_dissector_key_basic *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_BASIC,
> +						  cls->mask);
> +		u16 ethtype_key = ntohs(key->n_proto);
> +		u16 ethtype_mask = ntohs(mask->n_proto);
> +
> +		if (ethtype_key == ETH_P_ALL) {
> +			ethtype_key = 0;
> +			ethtype_mask = 0;
> +		}
> +
> +		fs->val.ethtype = ethtype_key;
> +		fs->mask.ethtype = ethtype_mask;
> +		fs->val.proto = key->ip_proto;
> +		fs->mask.proto = mask->ip_proto;
> +	}
> +
> +	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
> +		struct flow_dissector_key_ipv4_addrs *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> +						  cls->key);
> +		struct flow_dissector_key_ipv4_addrs *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> +						  cls->mask);
> +		fs->type = 0;
> +		memcpy(&fs->val.lip[0], &key->dst, sizeof(key->dst));
> +		memcpy(&fs->val.fip[0], &key->src, sizeof(key->src));
> +		memcpy(&fs->mask.lip[0], &mask->dst, sizeof(mask->dst));
> +		memcpy(&fs->mask.fip[0], &mask->src, sizeof(mask->src));
> +	}
> +
> +	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
> +		struct flow_dissector_key_ipv6_addrs *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> +						  cls->key);
> +		struct flow_dissector_key_ipv6_addrs *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> +						  cls->mask);
> +
> +		fs->type = 1;
> +		memcpy(&fs->val.lip[0], key->dst.s6_addr, sizeof(key->dst));
> +		memcpy(&fs->val.fip[0], key->src.s6_addr, sizeof(key->src));
> +		memcpy(&fs->mask.lip[0], mask->dst.s6_addr, sizeof(mask->dst));
> +		memcpy(&fs->mask.fip[0], mask->src.s6_addr, sizeof(mask->src));
> +	}
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
> +		struct flow_dissector_key_ports *key, *mask;
> +
> +		key = skb_flow_dissector_target(cls->dissector,
> +						FLOW_DISSECTOR_KEY_PORTS,
> +						cls->key);
> +		mask = skb_flow_dissector_target(cls->dissector,
> +						 FLOW_DISSECTOR_KEY_PORTS,
> +						 cls->mask);
> +		fs->val.lport = cpu_to_be16(key->dst);
> +		fs->mask.lport = cpu_to_be16(mask->dst);
> +		fs->val.fport = cpu_to_be16(key->src);
> +		fs->mask.fport = cpu_to_be16(mask->src);
> +	}
> +
> +	/* Match only packets coming from the ingress port where this
> +	 * filter will be created.
> +	 */
> +	fs->val.iport = netdev2pinfo(dev)->port_id;
> +	fs->mask.iport = ~0;
> +}
> +
> +static int cxgb4_validate_flow_match(struct net_device *dev,
> +				     struct tc_cls_flower_offload *cls)
> +{
> +	if (cls->dissector->used_keys &
> +	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
> +	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
> +	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
> +	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
> +	      BIT(FLOW_DISSECTOR_KEY_PORTS))) {
> +		netdev_warn(dev, "Unsupported key used: 0x%x\n",
> +			    cls->dissector->used_keys);
> +		return -EOPNOTSUPP;
> +	}
> +	return 0;
> +}
> +
> +static void cxgb4_process_flow_actions(struct net_device *in,
> +				       struct tc_cls_flower_offload *cls,
> +				       struct ch_filter_specification *fs)
> +{
> +	const struct tc_action *a;
> +	LIST_HEAD(actions);
> +
> +	tcf_exts_to_list(cls->exts, &actions);
> +	list_for_each_entry(a, &actions, list) {
> +		if (is_tcf_gact_shot(a)) {
> +			fs->action = FILTER_DROP;
> +		} else if (is_tcf_mirred_egress_redirect(a)) {
> +			int ifindex = tcf_mirred_ifindex(a);
> +			struct net_device *out = __dev_get_by_index(dev_net(in),
> +								    ifindex);
> +			struct port_info *pi = netdev_priv(out);
> +
> +			fs->action = FILTER_SWITCH;
> +			fs->eport = pi->port_id;
> +		}
> +	}
> +}
> +
> +static int cxgb4_validate_flow_actions(struct net_device *dev,
> +				       struct tc_cls_flower_offload *cls)
> +{
> +	const struct tc_action *a;
> +	LIST_HEAD(actions);
> +
> +	tcf_exts_to_list(cls->exts, &actions);
> +	list_for_each_entry(a, &actions, list) {
> +		if (is_tcf_gact_shot(a)) {
> +			/* Do nothing */
> +		} else if (is_tcf_mirred_egress_redirect(a)) {
> +			struct adapter *adap = netdev2adap(dev);
> +			struct net_device *n_dev;
> +			unsigned int i, ifindex;
> +			bool found = false;
> +
> +			ifindex = tcf_mirred_ifindex(a);
> +			for_each_port(adap, i) {
> +				n_dev = adap->port[i];
> +				if (ifindex == n_dev->ifindex) {
> +					found = true;
> +					break;
> +				}
> +			}
> +
> +			/* If interface doesn't belong to our hw, then
> +			 * the provided output port is not valid
> +			 */
> +			if (!found) {
> +				netdev_err(dev, "%s: Out port invalid\n",
> +					   __func__);
> +				return -EINVAL;
> +			}
> +		} else {
> +			netdev_err(dev, "%s: Unsupported action\n", __func__);
> +			return -EOPNOTSUPP;
> +		}
> +	}
> +	return 0;
> +}
> +
>  int cxgb4_tc_flower_replace(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls)
>  {
> -	return -EOPNOTSUPP;
> +	struct adapter *adap = netdev2adap(dev);
> +	struct ch_tc_flower_entry *ch_flower;
> +	struct ch_filter_specification *fs;
> +	struct filter_ctx ctx;
> +	int fidx;
> +	int ret;
> +
> +	if (cxgb4_validate_flow_actions(dev, cls))
> +		return -EOPNOTSUPP;
> +
> +	if (cxgb4_validate_flow_match(dev, cls))
> +		return -EOPNOTSUPP;
> +
> +	ch_flower = allocate_flower_entry();
> +	if (!ch_flower) {
> +		netdev_err(dev, "%s: ch_flower alloc failed.\n", __func__);
> +		ret = -ENOMEM;
> +		goto err;

Just return, err label is needed?

> +	}
> +
> +	fs = &ch_flower->fs;
> +	fs->hitcnts = 1;
> +	cxgb4_process_flow_actions(dev, cls, fs);
> +	cxgb4_process_flow_match(dev, cls, fs);
> +
> +	fidx = cxgb4_get_free_ftid(dev, fs->type ? PF_INET6 : PF_INET);
> +	if (fidx < 0) {
> +		netdev_err(dev, "%s: No fidx for offload.\n", __func__);
> +		ret = -ENOMEM;
> +		goto free_entry;
> +	}
> +
> +	init_completion(&ctx.completion);
> +	ret = __cxgb4_set_filter(dev, fidx, fs, &ctx);
> +	if (ret) {
> +		netdev_err(dev, "%s: filter creation err %d\n",
> +			   __func__, ret);
> +		goto free_entry;
> +	}
> +
> +	/* Wait for reply */
> +	ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ);
> +	if (!ret) {
> +		ret = -ETIMEDOUT;
> +		goto free_entry;
> +	}
> +
> +	ret = ctx.result;
> +	/* Check if hw returned error for filter creation */
> +	if (ret) {
> +		netdev_err(dev, "%s: filter creation err %d\n",
> +			   __func__, ret);
> +		goto free_entry;
> +	}
> +
> +	INIT_HLIST_NODE(&ch_flower->link);
> +	ch_flower->tc_flower_cookie = cls->cookie;
> +	ch_flower->filter_id = ctx.tid;
> +	hash_add_rcu(adap->flower_anymatch_tbl, &ch_flower->link, cls->cookie);
> +
> +	return ret;
> +
> +free_entry:
> +	kfree(ch_flower);
> +err:
> +	return ret;
>  }
>  
>  int cxgb4_tc_flower_destroy(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls)
>  {
> -	return -EOPNOTSUPP;
> +	struct adapter *adap = netdev2adap(dev);
> +	struct ch_tc_flower_entry *ch_flower;
> +	int ret;
> +
> +	ch_flower = ch_flower_lookup(adap, cls->cookie);
> +	if (!ch_flower) {
> +		ret = -ENOENT;
> +		goto err;

Same as above

> +	}
> +
> +	ret = cxgb4_del_filter(dev, ch_flower->filter_id);
> +	if (ret)
> +		goto err;
> +
> +	hash_del_rcu(&ch_flower->link);
> +	kfree_rcu(ch_flower, rcu);
> +	return ret;
> +
> +err:
> +	return ret;
>  }
>  
>  int cxgb4_tc_flower_stats(struct net_device *dev,
> @@ -55,3 +331,8 @@ int cxgb4_tc_flower_stats(struct net_device *dev,
>  {
>  	return -EOPNOTSUPP;
>  }
> +
> +void cxgb4_init_tc_flower(struct adapter *adap)
> +{
> +	hash_init(adap->flower_anymatch_tbl);
> +}
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> index b321fc205b5a..6145a9e056eb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> @@ -37,10 +37,27 @@
>  
>  #include <net/pkt_cls.h>
>  
> +struct ch_tc_flower_stats {
> +	u64 packet_count;
> +	u64 byte_count;
> +	u64 last_used;
> +};
> +
> +struct ch_tc_flower_entry {
> +	struct ch_filter_specification fs;
> +	struct ch_tc_flower_stats stats;
> +	unsigned long tc_flower_cookie;
> +	struct hlist_node link;
> +	struct rcu_head rcu;
> +	u32 filter_id;
> +};
> +
>  int cxgb4_tc_flower_replace(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls);
>  int cxgb4_tc_flower_destroy(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls);
>  int cxgb4_tc_flower_stats(struct net_device *dev,
>  			  struct tc_cls_flower_offload *cls);
> +
> +void cxgb4_init_tc_flower(struct adapter *adap);
>  #endif /* __CXGB4_TC_FLOWER_H */
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> index 84541fce94c5..88487095d14f 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> @@ -212,6 +212,7 @@ struct filter_ctx {
>  
>  struct ch_filter_specification;
>  
> +int cxgb4_get_free_ftid(struct net_device *dev, int family);
>  int __cxgb4_set_filter(struct net_device *dev, int filter_id,
>  		       struct ch_filter_specification *fs,
>  		       struct filter_ctx *ctx);
> 

^ permalink raw reply

* [PATCH net 4/4] net:ethernet:atlantic: fix iommu errors
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Pavel Belous, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

From: Pavel Belous <pavel.belous@aquantia.com>

Call skb_frag_dma_map multiple times if tx length is greater than
device max and avoid processing tx ring until entire packet has been
sent.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: Pavel Belous <pavel.belous@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 43 ++++++++++++++----------
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 27 ++++++++++-----
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  6 ++--
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 24f573c..5b18ffc 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -474,6 +474,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 	unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
 	unsigned int frag_count = 0U;
 	unsigned int dx = ring->sw_tail;
+	struct aq_ring_buff_s *first = NULL;
 	struct aq_ring_buff_s *dx_buff = &ring->buff_ring[dx];
 
 	if (unlikely(skb_is_gso(skb))) {
@@ -484,6 +485,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 		dx_buff->len_l4 = tcp_hdrlen(skb);
 		dx_buff->mss = skb_shinfo(skb)->gso_size;
 		dx_buff->is_txc = 1U;
+		dx_buff->eop_index = 0xffffU;
 
 		dx_buff->is_ipv6 =
 			(ip_hdr(skb)->version == 6) ? 1U : 0U;
@@ -503,6 +505,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 	if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa)))
 		goto exit;
 
+	first = dx_buff;
 	dx_buff->len_pkt = skb->len;
 	dx_buff->is_sop = 1U;
 	dx_buff->is_mapped = 1U;
@@ -531,40 +534,46 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 
 	for (; nr_frags--; ++frag_count) {
 		unsigned int frag_len = 0U;
+		unsigned int buff_offset = 0U;
+		unsigned int buff_size = 0U;
 		dma_addr_t frag_pa;
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_count];
 
 		frag_len = skb_frag_size(frag);
-		frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), frag, 0,
-					   frag_len, DMA_TO_DEVICE);
 
-		if (unlikely(dma_mapping_error(aq_nic_get_dev(self), frag_pa)))
-			goto mapping_error;
+		while (frag_len) {
+			if (frag_len > AQ_CFG_TX_FRAME_MAX)
+				buff_size = AQ_CFG_TX_FRAME_MAX;
+			else
+				buff_size = frag_len;
+
+			frag_pa = skb_frag_dma_map(aq_nic_get_dev(self),
+						   frag,
+						   buff_offset,
+						   buff_size,
+						   DMA_TO_DEVICE);
+
+			if (unlikely(dma_mapping_error(aq_nic_get_dev(self),
+						       frag_pa)))
+				goto mapping_error;
 
-		while (frag_len > AQ_CFG_TX_FRAME_MAX) {
 			dx = aq_ring_next_dx(ring, dx);
 			dx_buff = &ring->buff_ring[dx];
 
 			dx_buff->flags = 0U;
-			dx_buff->len = AQ_CFG_TX_FRAME_MAX;
+			dx_buff->len = buff_size;
 			dx_buff->pa = frag_pa;
 			dx_buff->is_mapped = 1U;
+			dx_buff->eop_index = 0xffffU;
+
+			frag_len -= buff_size;
+			buff_offset += buff_size;
 
-			frag_len -= AQ_CFG_TX_FRAME_MAX;
-			frag_pa += AQ_CFG_TX_FRAME_MAX;
 			++ret;
 		}
-
-		dx = aq_ring_next_dx(ring, dx);
-		dx_buff = &ring->buff_ring[dx];
-
-		dx_buff->flags = 0U;
-		dx_buff->len = frag_len;
-		dx_buff->pa = frag_pa;
-		dx_buff->is_mapped = 1U;
-		++ret;
 	}
 
+	first->eop_index = dx;
 	dx_buff->is_eop = 1U;
 	dx_buff->skb = skb;
 	goto exit;
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 02f79b0..0654e0c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -104,6 +104,12 @@ int aq_ring_init(struct aq_ring_s *self)
 	return 0;
 }
 
+static inline bool aq_ring_dx_in_range(unsigned int h, unsigned int i,
+				       unsigned int t)
+{
+	return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t));
+}
+
 void aq_ring_update_queue_state(struct aq_ring_s *ring)
 {
 	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
@@ -139,23 +145,28 @@ void aq_ring_tx_clean(struct aq_ring_s *self)
 		struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];
 
 		if (likely(buff->is_mapped)) {
-			if (unlikely(buff->is_sop))
+			if (unlikely(buff->is_sop)) {
+				if (!buff->is_eop &&
+				    buff->eop_index != 0xffffU &&
+				    (!aq_ring_dx_in_range(self->sw_head,
+						buff->eop_index,
+						self->hw_head)))
+					break;
+
 				dma_unmap_single(dev, buff->pa, buff->len,
 						 DMA_TO_DEVICE);
-			else
+			} else {
 				dma_unmap_page(dev, buff->pa, buff->len,
 					       DMA_TO_DEVICE);
+			}
 		}
 
 		if (unlikely(buff->is_eop))
 			dev_kfree_skb_any(buff->skb);
-	}
-}
 
-static inline unsigned int aq_ring_dx_in_range(unsigned int h, unsigned int i,
-					       unsigned int t)
-{
-	return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t));
+		buff->pa = 0U;
+		buff->eop_index = 0xffffU;
+	}
 }
 
 #define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
index 24523b5..5844078 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
@@ -65,7 +65,7 @@ struct __packed aq_ring_buff_s {
 	};
 	union {
 		struct {
-			u32 len:16;
+			u16 len;
 			u32 is_ip_cso:1;
 			u32 is_udp_cso:1;
 			u32 is_tcp_cso:1;
@@ -77,8 +77,10 @@ struct __packed aq_ring_buff_s {
 			u32 is_cleaned:1;
 			u32 is_error:1;
 			u32 rsvd3:6;
+			u16 eop_index;
+			u16 rsvd4;
 		};
-		u32 flags;
+		u64 flags;
 	};
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 3/4] net:ethernet:aquantia: Fix transient invalid link down/up indications
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Due to a bug in aquantia atlantic card firmware, it sometimes reports
invalid link speed bits. That caused driver to report link down events,
although link itself is totally fine.

This patch ignores such out of blue readings.

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index 4f5ec9a..ab5d3cb 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -351,8 +351,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self)
 			break;
 
 		default:
-			link_status->mbps = 0U;
-			break;
+			return -1;
 		}
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 2/4] net:ethernet:aquantia: Fix Tx queue hangups
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Driver did a poor job in managing its Tx queues: Sometimes it could stop
tx queues due to link down condition in aq_nic_xmit - but never waked up
them. That led to Tx path total suspend.
This patch fixes this and improves generic queue management:
- introduces queue restart counter
- uses generic netif_ interface to disable and enable tx path
- refactors link up/down condition and introduces dmesg log event when
  link changes.
- introduces new constant for minimum descriptors count required for queue
  wakeup

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h  |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 91 +++++++++++-------------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h  |  2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c  |  8 +--
 6 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 2149864..0fdaaa6 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -51,6 +51,10 @@
 
 #define AQ_CFG_SKB_FRAGS_MAX   32U
 
+/* Number of descriptors available in one ring to resume this ring queue
+ */
+#define AQ_CFG_RESTART_DESC_THRES   (AQ_CFG_SKB_FRAGS_MAX * 2)
+
 #define AQ_CFG_NAPI_WEIGHT     64U
 
 #define AQ_CFG_MULTICAST_ADDRESS_MAX     32U
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index f281392..24f573c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self)
 	return 0;
 }
 
+static int aq_nic_update_link_status(struct aq_nic_s *self)
+{
+	int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
+
+	if (err < 0)
+		return -1;
+
+	if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps)
+		pr_info("%s: link change old %d new %d\n",
+			AQ_CFG_DRV_NAME, self->link_status.mbps,
+			self->aq_hw->aq_link_status.mbps);
+
+	self->link_status = self->aq_hw->aq_link_status;
+	if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) {
+		aq_utils_obj_set(&self->header.flags,
+				 AQ_NIC_FLAG_STARTED);
+		aq_utils_obj_clear(&self->header.flags,
+				   AQ_NIC_LINK_DOWN);
+		netif_carrier_on(self->ndev);
+		netif_tx_wake_all_queues(self->ndev);
+	}
+	if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
+		netif_carrier_off(self->ndev);
+		netif_tx_disable(self->ndev);
+		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
+	}
+	return 0;
+}
+
 static void aq_nic_service_timer_cb(unsigned long param)
 {
 	struct aq_nic_s *self = (struct aq_nic_s *)param;
@@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param)
 	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
 		goto err_exit;
 
-	err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
-	if (err < 0)
+	err = aq_nic_update_link_status(self);
+	if (err)
 		goto err_exit;
 
-	self->link_status = self->aq_hw->aq_link_status;
-
 	self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw,
 		    self->aq_nic_cfg.is_interrupt_moderation);
 
-	if (self->link_status.mbps) {
-		aq_utils_obj_set(&self->header.flags,
-				 AQ_NIC_FLAG_STARTED);
-		aq_utils_obj_clear(&self->header.flags,
-				   AQ_NIC_LINK_DOWN);
-		netif_carrier_on(self->ndev);
-	} else {
-		netif_carrier_off(self->ndev);
-		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
-	}
-
 	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
 	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
 	for (i = AQ_DIMOF(self->aq_vec); i--;) {
@@ -240,7 +256,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 int aq_nic_ndev_register(struct aq_nic_s *self)
 {
 	int err = 0;
-	unsigned int i = 0U;
 
 	if (!self->ndev) {
 		err = -EINVAL;
@@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self)
 
 	netif_carrier_off(self->ndev);
 
-	for (i = AQ_CFG_VECS_MAX; i--;)
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	err = register_netdev(self->ndev);
 	if (err < 0)
@@ -319,12 +333,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev)
 		err = -EINVAL;
 		goto err_exit;
 	}
-	if (netif_running(ndev)) {
-		unsigned int i;
-
-		for (i = AQ_CFG_VECS_MAX; i--;)
-			netif_stop_subqueue(ndev, i);
-	}
+	if (netif_running(ndev))
+		netif_tx_disable(ndev);
 
 	for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs;
 		self->aq_vecs++) {
@@ -384,16 +394,6 @@ int aq_nic_init(struct aq_nic_s *self)
 	return err;
 }
 
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_start_subqueue(self->ndev, idx);
-}
-
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_stop_subqueue(self->ndev, idx);
-}
-
 int aq_nic_start(struct aq_nic_s *self)
 {
 	struct aq_vec_s *aq_vec = NULL;
@@ -452,10 +452,6 @@ int aq_nic_start(struct aq_nic_s *self)
 			goto err_exit;
 	}
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_start(self, i);
-
 	err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
 	if (err < 0)
 		goto err_exit;
@@ -464,6 +460,8 @@ int aq_nic_start(struct aq_nic_s *self)
 	if (err < 0)
 		goto err_exit;
 
+	netif_tx_start_all_queues(self->ndev);
+
 err_exit:
 	return err;
 }
@@ -603,7 +601,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 	unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
 	unsigned int tc = 0U;
 	int err = NETDEV_TX_OK;
-	bool is_nic_in_bad_state;
 
 	frags = skb_shinfo(skb)->nr_frags + 1;
 
@@ -614,13 +611,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 		goto err_exit;
 	}
 
-	is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags,
-						AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
-						(aq_ring_avail_dx(ring) <
-						AQ_CFG_SKB_FRAGS_MAX);
+	aq_ring_update_queue_state(ring);
 
-	if (is_nic_in_bad_state) {
-		aq_nic_ndev_queue_stop(self, ring->idx);
+	/* Above status update may stop the queue. Check this. */
+	if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
 		err = NETDEV_TX_BUSY;
 		goto err_exit;
 	}
@@ -632,9 +626,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 						      ring,
 						      frags);
 		if (err >= 0) {
-			if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1)
-				aq_nic_ndev_queue_stop(self, ring->idx);
-
 			++ring->stats.tx.packets;
 			ring->stats.tx.bytes += skb->len;
 		}
@@ -906,9 +897,7 @@ int aq_nic_stop(struct aq_nic_s *self)
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	del_timer_sync(&self->service_timer);
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 7fc2a5e..0ddd556 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self);
 int aq_nic_init(struct aq_nic_s *self);
 int aq_nic_cfg_start(struct aq_nic_s *self);
 int aq_nic_ndev_register(struct aq_nic_s *self);
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx);
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx);
 void aq_nic_ndev_free(struct aq_nic_s *self);
 int aq_nic_start(struct aq_nic_s *self);
 int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 4eee199..02f79b0 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self)
 	return 0;
 }
 
+void aq_ring_update_queue_state(struct aq_ring_s *ring)
+{
+	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
+		aq_ring_queue_stop(ring);
+	else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES)
+		aq_ring_queue_wake(ring);
+}
+
+void aq_ring_queue_wake(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (__netif_subqueue_stopped(ndev, ring->idx)) {
+		netif_wake_subqueue(ndev, ring->idx);
+		ring->stats.tx.queue_restarts++;
+	}
+}
+
+void aq_ring_queue_stop(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (!__netif_subqueue_stopped(ndev, ring->idx))
+		netif_stop_subqueue(ndev, ring->idx);
+}
+
 void aq_ring_tx_clean(struct aq_ring_s *self)
 {
 	struct device *dev = aq_nic_get_dev(self->aq_nic);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
index 782176c..24523b5 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
@@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s {
 	u64 errors;
 	u64 packets;
 	u64 bytes;
+	u64 queue_restarts;
 };
 
 union aq_ring_stats_s {
@@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
 int aq_ring_init(struct aq_ring_s *self);
 void aq_ring_rx_deinit(struct aq_ring_s *self);
 void aq_ring_free(struct aq_ring_s *self);
+void aq_ring_update_queue_state(struct aq_ring_s *ring);
+void aq_ring_queue_wake(struct aq_ring_s *ring);
+void aq_ring_queue_stop(struct aq_ring_s *ring);
 void aq_ring_tx_clean(struct aq_ring_s *self);
 int aq_ring_rx_clean(struct aq_ring_s *self,
 		     struct napi_struct *napi,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index ebf5880..305ff8f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
 			if (ring[AQ_VEC_TX_ID].sw_head !=
 			    ring[AQ_VEC_TX_ID].hw_head) {
 				aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
-
-				if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) >
-				    AQ_CFG_SKB_FRAGS_MAX) {
-					aq_nic_ndev_queue_start(self->aq_nic,
-						ring[AQ_VEC_TX_ID].idx);
-				}
+				aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]);
 				was_tx_cleaned = true;
 			}
 
@@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self,
 		stats_tx->packets += tx->packets;
 		stats_tx->bytes += tx->bytes;
 		stats_tx->errors += tx->errors;
+		stats_tx->queue_restarts += tx->queue_restarts;
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/4] net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Although hardware is capable for almost 16K MTU, without max_mtu field
correctly set it only allows standard MTU to be used.
This patch enables max MTU, calculating it from hardware maximum frame size
of 16352 octets (including FCS).

Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu")

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c                    | 5 +++--
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 6ac9e26..f281392 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -214,7 +214,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 	SET_NETDEV_DEV(ndev, dev);
 
 	ndev->if_port = port;
-	ndev->min_mtu = ETH_MIN_MTU;
 	self->ndev = ndev;
 
 	self->aq_pci_func = aq_pci_func;
@@ -283,6 +282,8 @@ int aq_nic_ndev_init(struct aq_nic_s *self)
 	self->ndev->features = aq_hw_caps->hw_features;
 	self->ndev->priv_flags = aq_hw_caps->hw_priv_flags;
 	self->ndev->mtu = aq_nic_cfg->mtu - ETH_HLEN;
+	self->ndev->min_mtu = ETH_MIN_MTU;
+	self->ndev->max_mtu = self->aq_hw_caps.mtu - ETH_FCS_LEN - ETH_HLEN;
 
 	return 0;
 }
@@ -695,7 +696,7 @@ int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu)
 {
 	int err = 0;
 
-	if (new_mtu > self->aq_hw_caps.mtu) {
+	if (new_mtu + ETH_FCS_LEN > self->aq_hw_caps.mtu) {
 		err = -EINVAL;
 		goto err_exit;
 	}
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
index f3957e93..fcf89e2 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
@@ -16,7 +16,7 @@
 
 #include "../aq_common.h"
 
-#define HW_ATL_B0_MTU_JUMBO (16000U)
+#define HW_ATL_B0_MTU_JUMBO  16352U
 #define HW_ATL_B0_MTU        1514U
 
 #define HW_ATL_B0_TX_RINGS 4U
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 0/4] net:ethernet:aquantia: Atlantic driver bugfixes and improvements
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh

This series contains bugfixes for aQuantia Atlantic driver.

Igor Russkikh (3):
  net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
  net:ethernet:aquantia: Fix Tx queue hangups
  net:ethernet:aquantia: Fix transient invalid link down/up indications

Pavel Belous (1):
  net:ethernet:atlantic: fix iommu errors

 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h    |   4 +
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c    | 139 ++++++++++-----------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h    |   2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c   |  53 ++++++--
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h   |  10 +-
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c    |   8 +-
 .../aquantia/atlantic/hw_atl/hw_atl_b0_internal.h  |   2 +-
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c        |   3 +-
 8 files changed, 130 insertions(+), 91 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Eric Dumazet @ 2017-09-21 10:37 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: David Miller, netdev, pablo, fw, edumazet, hannes
In-Reply-To: <1505986931.2560.51.camel@redhat.com>

On Thu, 2017-09-21 at 11:42 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thanks for the feedback!
> 
> On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Wed, 20 Sep 2017 18:54:00 +0200
> > 
> > > This series introduce the infrastructure to store inside the skb a socket
> > > pointer without carrying a refcount to the socket.
> > > 
> > > Such infrastructure is then used in the network receive path - and
> > > specifically the early demux operation.
> > > 
> > > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > > with many benefits:
> > > 
> > > - the UDP early demux code is now much simpler
> > > - the early demux does not hit any performance penalties in case of UDP hash
> > >   table collision - previously the early demux performed a partial, unsuccesful,
> > >   lookup
> > > - early demux is now operational also for unconnected sockets.
> > > 
> > > This infrastrcture will be used in follow-up series to allow dst caching for
> > > unconnected UDP sockets, and than to extend the same features to TCP listening
> > > sockets.
> > 
> > Like Eric, I find this series (while exciting) quite scary :-)
> > 
> > You really have to post some kind of performance numbers in your
> > header posting in order to justify something with these ramifications
> > and scale.
> 
> This is actually a preparatory work for the next series which will
> bring in the real gain. The next patches are still to be polished so we
>  posted this separately to get some early feedback. 
> 
> If that would help, I can post the follow-up soon as RFC. Overall -
> with the follow-up appplied, too - when using a single rx ingress
> queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
> rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
> ingress queues, the gain is smaller but still measurable (roughly 5%). 
> 
> Please let me know if you prefer the see the full work early. 

I want to see the full work yes. Ipv6, and everything.

I do not want ~1000 lines of changed code in the stack for some corner
cases, where people do not properly use existing infra, like proper
SO_REUSEPORT with proper BPF filter to have as many clean siloes (proper
CPU/NUMA affinities to avoid QPI traffic)

The complexity of your patches reached a point where I am extremely
nervous.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 1/5] net: add support for noref skb->sk
From: Eric Dumazet @ 2017-09-21 10:35 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <1505985297.2560.39.camel@redhat.com>

On Thu, 2017-09-21 at 11:14 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thank you for looking at it!
> 
> On Wed, 2017-09-20 at 10:41 -0700, Eric Dumazet wrote:
> > On Wed, 2017-09-20 at 18:54 +0200, Paolo Abeni wrote:
> > > Noref sk do not carry a socket refcount, are valid
> > > only inside the current RCU section and must be
> > > explicitly cleared before exiting such section.
> > > 
> > > They will be used in a later patch to allow early demux
> > > without sock refcounting.
> > 
> > 
> > 
> > 
> > > +/* dummy destructor used by noref sockets */
> > > +void sock_dummyfree(struct sk_buff *skb)
> > > +{
> > 
> > BUG();
> > 
> > > +}
> > > +EXPORT_SYMBOL(sock_dummyfree);
> > > +
> 
> We can call sock_dummyfree() in legitimate paths, see below, but we can
> add a:
> 
> WARN_ON_ONCE(!rcu_read_lock_held());

This wont be enough see below.

> 
> here and in  skb_clear_noref_sk(). That should help much to catch
> possible bugs.
> 
> > I do not see how you ensure we do not leave RCU section with an skb
> > destructor pointing to this sock_dummyfree()
> > 
> > This patch series looks quite dangerous to me.
> 
> The idea is to explicitly clear the sknoref references before leaving
> the RCU section. Quite alike what we currently do for dst noref, but
> here the only place where we get a noref socket is the socket early
> demux, thus the scope of this change is more limited to what we have
> with noref dst_entries.
> 
> The relevant code is in the next 2 patches; after the demux we preserve
> the sknoref only if the skb has a local destination. The UDP socket
> will then set the noref on early demux lookup, and the skb will either:
> 
> * land on the corresponding UDP socket, the receive function will steal
> the sknoref
> * be dropped by some nft/iptables target - the dummy destructor is
> called
> * forwarded by some nft/iptables target outside the input path; we
> clear the skref explicitly in such targets. 
> 
> Currently there are an handful of places affected, and we can simplify
> the code dropping the early demux result for locally terminated
> multicast sockets on a host acting as a multicast router, please see
> the comment on the next patch.
> 
> > Do we really have real applications using connected UDP sockets and
> > wanting very high pps throughput ?
> 
> The ultimate goal is to improve the unconnected UDP sockets scenario,
> we do actually have use cases for that - DNS servers and VoIP SBCs.

Unconnected UDP traffic does not use refcounting on sk _already_.

And SO_REUSEPORT already allows us to handle all the traffic we want
_already_.


Please take a look at 71563f3414e917c62acd8e0fb0edf8ed6af63e4b

This might tell you why I am so nervous about your changes.

Checking WARN_ON_ONCE(!rcu_read_lock_held());
is not enough.

rcu_read_lock()
skb->destructor = sock_dummyfree;

queue the packet into an intermediate queue.
rcu_read_unlock();

....

rcu_read_lock()
...
if (skb->sk && skb->sk->state == ...) // crash

Also you covered IPv4, but really we need to forget about IPv4 and focus
on IPv6 only. And _then_ take care of IPv4 compat.

^ permalink raw reply

* [net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl
From: Vincent Bernat @ 2017-09-21 10:05 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern, David Miller, bridge, netdev
  Cc: Vincent Bernat
In-Reply-To: <20170920162140.369bb198@xeon-e3>

Currently, there is a difference in netlink events received when an
interface is modified through bridge ioctl() or through netlink. This
patch generates additional events when an interface is added to or
removed from a bridge via ioctl().

When adding then removing an interface from a bridge with netlink, we
get:

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

When using ioctl():

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

Without this patch, the last netlink notification is not sent.

Signed-off-by: Vincent Bernat <vincent@bernat.im>
---
 net/bridge/br_ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..66cd98772051 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 	else
 		ret = br_del_if(br, dev);
 
+	if (!ret)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
 	return ret;
 }
 
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
From: Vincent Bernat @ 2017-09-21 10:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Ahern, David Miller, bridge, netdev
In-Reply-To: <20170920162140.369bb198@xeon-e3>

 ❦ 20 septembre 2017 16:21 -0700, Stephen Hemminger <stephen@networkplumber.org> :

> The one concern is that ports added or removed through ioctl should
> cause same events as doing the same thing via netlink. Some users use
> brctl (ioctl) and others use newer bridge (netlink) API.

I'll make a third iteration to have the same notifications when using
ioctl() with details in the commit message.
-- 
When in doubt, tell the truth.
		-- Mark Twain

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Paolo Abeni @ 2017-09-21  9:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, pablo, fw, edumazet, hannes
In-Reply-To: <20170920.202022.2073272587145961156.davem@davemloft.net>

Hi,

Thanks for the feedback!

On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Wed, 20 Sep 2017 18:54:00 +0200
> 
> > This series introduce the infrastructure to store inside the skb a socket
> > pointer without carrying a refcount to the socket.
> > 
> > Such infrastructure is then used in the network receive path - and
> > specifically the early demux operation.
> > 
> > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > with many benefits:
> > 
> > - the UDP early demux code is now much simpler
> > - the early demux does not hit any performance penalties in case of UDP hash
> >   table collision - previously the early demux performed a partial, unsuccesful,
> >   lookup
> > - early demux is now operational also for unconnected sockets.
> > 
> > This infrastrcture will be used in follow-up series to allow dst caching for
> > unconnected UDP sockets, and than to extend the same features to TCP listening
> > sockets.
> 
> Like Eric, I find this series (while exciting) quite scary :-)
> 
> You really have to post some kind of performance numbers in your
> header posting in order to justify something with these ramifications
> and scale.

This is actually a preparatory work for the next series which will
bring in the real gain. The next patches are still to be polished so we
 posted this separately to get some early feedback. 

If that would help, I can post the follow-up soon as RFC. Overall -
with the follow-up appplied, too - when using a single rx ingress
queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
ingress queues, the gain is smaller but still measurable (roughly 5%). 

Please let me know if you prefer the see the full work early. 

Thanks,

Paolo

^ permalink raw reply

* [PATCH net-next 2/2] net: dsa: lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20170921094139.4250-1-privat@egil-hjelmeland.no>

When both user ports are joined to the same bridge, the normal
HW MAC learning is enabled. This means that unicast traffic is forwarded
in HW.

If one of the user ports leave the bridge,
the ports goes back to the initial separated operation.

Port separation relies on disabled HW MAC learning. Hence the condition
that both ports must join same bridge.

Add brigde methods port_bridge_join, port_bridge_leave and
port_stp_state_set.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
 drivers/net/dsa/lan9303-core.c | 88 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/lan9303.h      |  1 +
 2 files changed, 89 insertions(+)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index bba875e114e7..76112674fe6a 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <linux/if_bridge.h>
 
 #include "lan9303.h"
 
@@ -146,6 +147,7 @@
 # define LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 (0)
 # define LAN9303_SWE_PORT_STATE_LEARNING_PORT0 BIT(1)
 # define LAN9303_SWE_PORT_STATE_BLOCKING_PORT0 BIT(0)
+# define LAN9303_SWE_PORT_STATE_DISABLED_PORT0 (3)
 #define LAN9303_SWE_PORT_MIRROR 0x1846
 # define LAN9303_SWE_PORT_MIRROR_SNIFF_ALL BIT(8)
 # define LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT2 BIT(7)
@@ -431,6 +433,20 @@ static int lan9303_read_switch_reg(struct lan9303 *chip, u16 regnum, u32 *val)
 	return ret;
 }
 
+static int lan9303_write_switch_reg_mask(
+	struct lan9303 *chip, u16 regnum, u32 val, u32 mask)
+{
+	int ret;
+	u32 reg;
+
+	ret = lan9303_read_switch_reg(chip, regnum, &reg);
+	if (ret)
+		return ret;
+	reg = (reg & ~mask) | val;
+
+	return lan9303_write_switch_reg(chip, regnum, reg);
+}
+
 static int lan9303_write_switch_port(struct lan9303 *chip, int port,
 				     u16 regnum, u32 val)
 {
@@ -556,6 +572,12 @@ static int lan9303_separate_ports(struct lan9303 *chip)
 				LAN9303_SWE_PORT_STATE_BLOCKING_PORT2);
 }
 
+static void lan9303_bridge_ports(struct lan9303 *chip)
+{
+	/* ports bridged: remove mirroring */
+	lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, 0);
+}
+
 static int lan9303_handle_reset(struct lan9303 *chip)
 {
 	if (!chip->reset_gpio)
@@ -844,6 +866,69 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port,
 	}
 }
 
+static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
+				    struct net_device *br)
+{
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+	if (ds->ports[1].bridge_dev ==  ds->ports[2].bridge_dev) {
+		lan9303_bridge_ports(chip);
+		chip->is_bridged = true;  /* unleash stp_state_set() */
+	}
+
+	return 0;
+}
+
+static void lan9303_port_bridge_leave(struct dsa_switch *ds, int port,
+				      struct net_device *br)
+{
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
+	if (chip->is_bridged) {
+		lan9303_separate_ports(chip);
+		chip->is_bridged = false;
+	}
+}
+
+static void lan9303_port_stp_state_set(struct dsa_switch *ds, int port,
+				       u8 state)
+{
+	int portmask, portstate;
+	struct lan9303 *chip = ds->priv;
+
+	dev_dbg(chip->dev, "%s(port %d, state %d)\n",
+		__func__, port, state);
+	if (!chip->is_bridged)
+		return; /* touching SWE_PORT_STATE will break port separation */
+
+	switch (state) {
+	case BR_STATE_DISABLED:
+		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+		break;
+	case BR_STATE_BLOCKING:
+	case BR_STATE_LISTENING:
+		portstate = LAN9303_SWE_PORT_STATE_BLOCKING_PORT0;
+		break;
+	case BR_STATE_LEARNING:
+		portstate = LAN9303_SWE_PORT_STATE_LEARNING_PORT0;
+		break;
+	case BR_STATE_FORWARDING:
+		portstate = LAN9303_SWE_PORT_STATE_FORWARDING_PORT0;
+		break;
+	default:
+		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
+		dev_err(chip->dev, "unknown stp state: port %d, state %d\n",
+			port, state);
+	}
+
+	portmask = 0x3 << (port * 2);
+	portstate     <<= (port * 2);
+	lan9303_write_switch_reg_mask(chip, LAN9303_SWE_PORT_STATE,
+				      portstate, portmask);
+}
+
 static const struct dsa_switch_ops lan9303_switch_ops = {
 	.get_tag_protocol = lan9303_get_tag_protocol,
 	.setup = lan9303_setup,
@@ -855,6 +940,9 @@ static const struct dsa_switch_ops lan9303_switch_ops = {
 	.get_sset_count = lan9303_get_sset_count,
 	.port_enable = lan9303_port_enable,
 	.port_disable = lan9303_port_disable,
+	.port_bridge_join       = lan9303_port_bridge_join,
+	.port_bridge_leave      = lan9303_port_bridge_leave,
+	.port_stp_state_set     = lan9303_port_stp_state_set,
 };
 
 static int lan9303_register_switch(struct lan9303 *chip)
diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h
index 4d8be555ff4d..5be246f05965 100644
--- a/drivers/net/dsa/lan9303.h
+++ b/drivers/net/dsa/lan9303.h
@@ -21,6 +21,7 @@ struct lan9303 {
 	struct dsa_switch *ds;
 	struct mutex indirect_mutex; /* protect indexed register access */
 	const struct lan9303_phy_ops *ops;
+	bool is_bridged; /* true if port 1 and 2 is bridged */
 };
 
 extern const struct regmap_access_table lan9303_register_set;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland
In-Reply-To: <20170921094139.4250-1-privat@egil-hjelmeland.no>

Prepare for next patch:
Move tag setup from lan9303_separate_ports() to new function
lan9303_setup_tagging()

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
---
 drivers/net/dsa/lan9303-core.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index 07355db2ad81..bba875e114e7 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -157,6 +157,7 @@
 # define LAN9303_SWE_PORT_MIRROR_ENABLE_RX_MIRRORING BIT(1)
 # define LAN9303_SWE_PORT_MIRROR_ENABLE_TX_MIRRORING BIT(0)
 #define LAN9303_SWE_INGRESS_PORT_TYPE 0x1847
+#define  LAN9303_SWE_INGRESS_PORT_TYPE_VLAN 3
 #define LAN9303_BM_CFG 0x1c00
 #define LAN9303_BM_EGRSS_PORT_TYPE 0x1c0c
 # define LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT2 (BIT(17) | BIT(16))
@@ -510,11 +511,30 @@ static int lan9303_enable_processing_port(struct lan9303 *chip,
 				LAN9303_MAC_TX_CFG_X_TX_ENABLE);
 }
 
+/* forward special tagged packets from port 0 to port 1 *or* port 2 */
+static int lan9303_setup_tagging(struct lan9303 *chip)
+{
+	int ret;
+	/* enable defining the destination port via special VLAN tagging
+	 * for port 0
+	 */
+	ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
+				       LAN9303_SWE_INGRESS_PORT_TYPE_VLAN);
+	if (ret)
+		return ret;
+
+	/* tag incoming packets at port 1 and 2 on their way to port 0 to be
+	 * able to discover their source port
+	 */
+	return lan9303_write_switch_reg(
+		chip, LAN9303_BM_EGRSS_PORT_TYPE,
+		LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0);
+}
+
 /* We want a special working switch:
  * - do not forward packets between port 1 and 2
  * - forward everything from port 1 to port 0
  * - forward everything from port 2 to port 0
- * - forward special tagged packets from port 0 to port 1 *or* port 2
  */
 static int lan9303_separate_ports(struct lan9303 *chip)
 {
@@ -529,22 +549,6 @@ static int lan9303_separate_ports(struct lan9303 *chip)
 	if (ret)
 		return ret;
 
-	/* enable defining the destination port via special VLAN tagging
-	 * for port 0
-	 */
-	ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE,
-				       0x03);
-	if (ret)
-		return ret;
-
-	/* tag incoming packets at port 1 and 2 on their way to port 0 to be
-	 * able to discover their source port
-	 */
-	ret = lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE,
-			LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0);
-	if (ret)
-		return ret;
-
 	/* prevent port 1 and 2 from forwarding packets by their own */
 	return lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE,
 				LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 |
@@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
 		return -EINVAL;
 	}
 
+	ret = lan9303_setup_tagging(chip);
+	if (ret)
+		dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
+
 	ret = lan9303_separate_ports(chip);
 	if (ret)
 		dev_err(chip->dev, "failed to separate ports %d\n", ret);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/2] lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-09-21  9:41 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland

This series add basic offloading of unicast traffic to the lan9303
DSA driver.

Comments welcome!

Egil Hjelmeland (2):
  net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
  net: dsa: lan9303: Add basic offloading of unicast traffic

 drivers/net/dsa/lan9303-core.c | 130 +++++++++++++++++++++++++++++++++++------
 drivers/net/dsa/lan9303.h      |   1 +
 2 files changed, 114 insertions(+), 17 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH v3 10/31] befs: Define usercopy region in befs_inode_cache slab cache
From: Luis de Bethencourt @ 2017-09-21  9:34 UTC (permalink / raw)
  To: Kees Cook, linux-kernel
  Cc: David Windsor, Salah Triki, linux-fsdevel, netdev, linux-mm,
	kernel-hardening
In-Reply-To: <1505940337-79069-11-git-send-email-keescook@chromium.org>

On 09/20/2017 09:45 PM, Kees Cook wrote:
> From: David Windsor <dave@nullcore.net>
> 
> befs symlink pathnames, stored in struct befs_inode_info.i_data.symlink
> and therefore contained in the befs_inode_cache slab cache, need to be
> copied to/from userspace.
> 
> cache object allocation:
>      fs/befs/linuxvfs.c:
>          befs_alloc_inode(...):
>              ...
>              bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
>              ...
>              return &bi->vfs_inode;
> 
>          befs_iget(...):
>              ...
>              strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
>                      BEFS_SYMLINK_LEN);
>              ...
>              inode->i_link = befs_ino->i_data.symlink;
> 
> example usage trace:
>      readlink_copy+0x43/0x70
>      vfs_readlink+0x62/0x110
>      SyS_readlinkat+0x100/0x130
> 
>      fs/namei.c:
>          readlink_copy(..., link):
>              ...
>              copy_to_user(..., link, len);
> 
>          (inlined in vfs_readlink)
>          generic_readlink(dentry, ...):
>              struct inode *inode = d_inode(dentry);
>              const char *link = inode->i_link;
>              ...
>              readlink_copy(..., link);
> 
> In support of usercopy hardening, this patch defines a region in the
> befs_inode_cache slab cache in which userspace copy operations are
> allowed.
> 
> This region is known as the slab cache's usercopy region. Slab caches can
> now check that each copy operation involving cache-managed memory falls
> entirely within the slab's usercopy region.
> 
> This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> whitelisting code in the last public patch of grsecurity/PaX based on my
> understanding of the code. Changes or omissions from the original code are
> mine and don't reflect the original grsecurity/PaX code.
> 
> Signed-off-by: David Windsor <dave@nullcore.net>
> [kees: adjust commit log, provide usage trace]
> Cc: Luis de Bethencourt <luisbg@kernel.org>
> Cc: Salah Triki <salah.triki@gmail.com>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Acked-by: Luis de Bethencourt <luisbg@kernel.org>
> ---
>   fs/befs/linuxvfs.c | 14 +++++++++-----
>   1 file changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
> index a92355cc453b..e5dcd26003dc 100644
> --- a/fs/befs/linuxvfs.c
> +++ b/fs/befs/linuxvfs.c
> @@ -444,11 +444,15 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
>   static int __init
>   befs_init_inodecache(void)
>   {
> -	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
> -					      sizeof (struct befs_inode_info),
> -					      0, (SLAB_RECLAIM_ACCOUNT|
> -						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
> -					      init_once);
> +	befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache",
> +				sizeof(struct befs_inode_info), 0,
> +				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
> +					SLAB_ACCOUNT),
> +				offsetof(struct befs_inode_info,
> +					i_data.symlink),
> +				sizeof_field(struct befs_inode_info,
> +					i_data.symlink),
> +				init_once);
>   	if (befs_inode_cachep == NULL)
>   		return -ENOMEM;
>   
> 

No changes in the befs patch in v3. It goes without saying I continue to 
Ack this.

Thanks Kees and David,
Luis

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 2/2] ip_tunnel: add mpls over gre encapsulation
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox
In-Reply-To: <1505985924-12479-1-git-send-email-amine.kherbouche@6wind.com>

This commit introduces the MPLSoGRE support (RFC 4023), using ip tunnel
API.

Encap:
  - Add a new iptunnel type mpls.

Decap:
  - pull gre hdr and call mpls_forward().

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
---
 include/net/gre.h              |  3 +++
 include/uapi/linux/if_tunnel.h |  1 +
 net/ipv4/gre_demux.c           | 22 ++++++++++++++++++++++
 net/ipv4/ip_gre.c              |  9 +++++++++
 net/ipv6/ip6_gre.c             |  7 +++++++
 net/mpls/af_mpls.c             | 37 +++++++++++++++++++++++++++++++++++++
 6 files changed, 79 insertions(+)

diff --git a/include/net/gre.h b/include/net/gre.h
index d25d836..88a8343 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -35,6 +35,9 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 				       u8 name_assign_type);
 int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 		     bool *csum_err, __be16 proto, int nhs);
+#if IS_ENABLED(CONFIG_MPLS)
+int mpls_gre_rcv(struct sk_buff *skb, int gre_hdr_len);
+#endif
 
 static inline int gre_calc_hlen(__be16 o_flags)
 {
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 2e52088..a2f48c0 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
 	TUNNEL_ENCAP_NONE,
 	TUNNEL_ENCAP_FOU,
 	TUNNEL_ENCAP_GUE,
+	TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM		(1<<0)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862..a6a937e 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -23,6 +23,9 @@
 #include <linux/netdevice.h>
 #include <linux/if_tunnel.h>
 #include <linux/spinlock.h>
+#if IS_ENABLED(CONFIG_MPLS)
+#include <linux/mpls.h>
+#endif
 #include <net/protocol.h>
 #include <net/gre.h>
 
@@ -122,6 +125,25 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 }
 EXPORT_SYMBOL(gre_parse_header);
 
+#if IS_ENABLED(CONFIG_MPLS)
+int mpls_gre_rcv(struct sk_buff *skb, int gre_hdr_len)
+{
+	if (unlikely(!pskb_may_pull(skb, gre_hdr_len)))
+		goto drop;
+
+	/* Pop GRE hdr and reset the skb */
+	skb_pull(skb, gre_hdr_len);
+	skb_reset_network_header(skb);
+
+	mpls_forward(skb, skb->dev, NULL, NULL);
+
+	return 0;
+drop:
+	return NET_RX_DROP;
+}
+EXPORT_SYMBOL(mpls_gre_rcv);
+#endif
+
 static int gre_rcv(struct sk_buff *skb)
 {
 	const struct gre_protocol *proto;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9cee986..dd4431c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -412,10 +412,19 @@ static int gre_rcv(struct sk_buff *skb)
 			return 0;
 	}
 
+#if IS_ENABLED(CONFIG_MPLS)
+	if (unlikely(tpi.proto == htons(ETH_P_MPLS_UC))) {
+		if (mpls_gre_rcv(skb, hdr_len))
+			goto drop;
+		return 0;
+	}
+#endif
+
 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 		return 0;
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
 drop:
 	kfree_skb(skb);
 	return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index c82d41e..e52396d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -476,6 +476,13 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
+#if IS_ENABLED(CONFIG_MPLS)
+	if (unlikely(tpi.proto == htons(ETH_P_MPLS_UC))) {
+		if (mpls_gre_rcv(skb, hdr_len))
+			goto drop;
+		return 0;
+	}
+#endif
 	if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
 		goto drop;
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 36ea2ad..060ed07 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -16,6 +16,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/netevent.h>
+#include <net/ip_tunnels.h>
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -39,6 +40,40 @@ static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
+size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e)
+{
+	return sizeof(struct mpls_shim_hdr);
+}
+
+int ipgre_mpls_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+			    u8 *protocol, struct flowi4 *fl4)
+{
+	return 0;
+}
+
+static const struct ip_tunnel_encap_ops mpls_iptun_ops = {
+	.encap_hlen = ipgre_mpls_encap_hlen,
+	.build_header = ipgre_mpls_build_header,
+};
+
+int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+	int ret;
+
+	ret = ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+	if (ret < 0) {
+		pr_err("can't add mplsgre ops\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+	ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 		       struct nlmsghdr *nlh, struct net *net, u32 portid,
 		       unsigned int nlm_flags);
@@ -2486,6 +2521,7 @@ static int __init mpls_init(void)
 		      0);
 	rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
 		      mpls_netconf_dump_devconf, 0);
+	ipgre_tunnel_encap_add_mpls_ops();
 	err = 0;
 out:
 	return err;
@@ -2503,6 +2539,7 @@ static void __exit mpls_exit(void)
 	dev_remove_pack(&mpls_packet_type);
 	unregister_netdevice_notifier(&mpls_dev_notifier);
 	unregister_pernet_subsys(&mpls_net_ops);
+	ipgre_tunnel_encap_del_mpls_ops();
 }
 module_exit(mpls_exit);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 1/2] mpls: expose stack entry function
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox
In-Reply-To: <1505985924-12479-1-git-send-email-amine.kherbouche@6wind.com>

Exposing mpls_forward() function to be able to be called from elsewhere
such as MPLS over GRE in the next commit.

Signed-off-by: Amine Kherbouche <amine.kherbouche@6wind.com>
---
 include/linux/mpls.h | 3 +++
 net/mpls/af_mpls.c   | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mpls.h b/include/linux/mpls.h
index 384fb22..d5c7599 100644
--- a/include/linux/mpls.h
+++ b/include/linux/mpls.h
@@ -2,10 +2,13 @@
 #define _LINUX_MPLS_H
 
 #include <uapi/linux/mpls.h>
+#include <linux/netdevice.h>
 
 #define MPLS_TTL_MASK		(MPLS_LS_TTL_MASK >> MPLS_LS_TTL_SHIFT)
 #define MPLS_BOS_MASK		(MPLS_LS_S_MASK >> MPLS_LS_S_SHIFT)
 #define MPLS_TC_MASK		(MPLS_LS_TC_MASK >> MPLS_LS_TC_SHIFT)
 #define MPLS_LABEL_MASK		(MPLS_LS_LABEL_MASK >> MPLS_LS_LABEL_SHIFT)
 
+int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+		 struct packet_type *pt, struct net_device *orig_dev);
 #endif  /* _LINUX_MPLS_H */
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c5b9ce4..36ea2ad 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -307,8 +307,8 @@ static bool mpls_egress(struct net *net, struct mpls_route *rt,
 	return success;
 }
 
-static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
-			struct packet_type *pt, struct net_device *orig_dev)
+int mpls_forward(struct sk_buff *skb, struct net_device *dev,
+		 struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct net *net = dev_net(dev);
 	struct mpls_shim_hdr *hdr;
@@ -442,6 +442,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
+EXPORT_SYMBOL(mpls_forward);
 
 static struct packet_type mpls_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_UC),
-- 
2.1.4

^ permalink raw reply related

* [RFC PATCH 0/0] Introduce MPLS over GRE
From: Amine Kherbouche @ 2017-09-21  9:25 UTC (permalink / raw)
  To: netdev, xeb, roopa; +Cc: amine.kherbouche, equinox

This series introduces the MPLS over GRE encapsulation (RFC 4023).

Various applications of MPLS make use of label stacks with multiple
entries.  In some cases, it is possible to replace the top label of
the stack with an IP-based encapsulation, thereby, it is possible for
two LSRs that are adjacent on an LSP to be separated by an IP network,
even if that IP network does not provide MPLS.

An example of configuration:


         node1                LER1                       LER2                node2
        +-----+             +------+                   +------+             +-----+
        |     |             |      |                   |      |             |     |
        |     |             |      |p3  GRE tunnel   p4|      |             |     |
        |     |p1         p2|      +-------------------+      |p5         p6|     |
        |     +-------------+      +-------------------+      +------------+|     |
        |     |10.100.0.0/24|      |                   |      |10.200.0.0/24|     |
        |     |fd00:100::/64|      |  10.125.0.0/24    |      |fd00:200::/64|     |
        |     |             |      |  fd00:125::/64    |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        |     |             |      |                   |      |             |     |
        +-----+             +------+                   +------+             +-----+


		###	node1	###

ip link set p1 up
ip addr add 10.100.0.1/24 dev p1

		###	LER1	###

ip link set p2 up
ip addr add 10.100.0.2/24 dev p2

ip link set p3 up
ip addr add 10.125.0.1/24 dev p3

modprobe mpls_router
sysctl -w net.mpls.conf.p2.input=1
sysctl -w net.mpls.conf.p3.input=1
sysctl -w net.mpls.platform_labels=1000

ip link add gre1 type gre ttl 64 local 10.125.0.1 remote 10.125.0.2 dev p3
ip link set dev gre1 up

ip -M route add 111 as 222 dev gre1
ip -M route add 555 as 666 via inet 10.100.0.1 dev p2

		###	LER2	###

ip link set p5 up
ip addr add 10.200.0.2/24 dev p5

ip link set p4 up
ip addr add 10.125.0.2/24 dev p4

modprobe mpls_router
sysctl -w net.mpls.conf.p4.input=1
sysctl -w net.mpls.conf.p5.input=1
sysctl -w net.mpls.platform_labels=1000

ip link add gre1 type gre ttl 64 local 10.125.0.2 remote 10.125.0.1 dev p4
ip link set dev gre1 up

ip -M route add 444 as 555 dev gre1
ip -M route add 222 as 333 via inet 10.200.0.1 dev p5

		###	node2	###

ip link set p6 up
ip addr add 10.200.0.1/24 dev p6


Now using this scapy to forge and send packets from the port p1 of node1:

p = Ether(src='de:ed:01:0c:41:09', dst='de:ed:01:2f:3b:ba')
p /= MPLS(s=1, ttl=64, label=111)/Raw(load='\xde')
sendp(p, iface="p1", count=20, inter=0.1)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox