Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next v3 5/6] bonding: restructure and add rcu for bond_for_each_slave_next()
From: Ding Tianhong @ 2013-09-05 14:29 UTC (permalink / raw)
  To: Veaceslav Falico
  Cc: Ding Tianhong, Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <20130905133027.GA26163@redhat.com>

于 2013/9/5 21:30, Veaceslav Falico 写道:
> On Thu, Sep 05, 2013 at 03:49:01PM +0800, Ding Tianhong wrote:
>> Remove the wordy int and add bond_for_each_slave_next_rcu() for 
>> future use.
>>
>> Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>> Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>> Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>> Cc: Veaceslav Falico <vfalico@redhat.com>
>> ---
>> drivers/net/bonding/bond_alb.c | 3 +--
>> drivers/net/bonding/bond_main.c | 6 ++----
>> drivers/net/bonding/bonding.h | 23 +++++++++++++++++++----
>> 3 files changed, 22 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_alb.c 
>> b/drivers/net/bonding/bond_alb.c
>> index 91f179d..c75d383 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -383,7 +383,6 @@ static struct slave *rlb_next_rx_slave(struct 
>> bonding *bond)
>> {
>> struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>> struct slave *rx_slave, *slave, *start_at;
>> - int i = 0;
>>
>> if (bond_info->next_rx_slave)
>> start_at = bond_info->next_rx_slave;
>> @@ -392,7 +391,7 @@ static struct slave *rlb_next_rx_slave(struct 
>> bonding *bond)
>>
>> rx_slave = NULL;
>>
>> - bond_for_each_slave_from(bond, slave, i, start_at) {
>> + bond_for_each_slave_from(bond, slave, start_at) {
>> if (SLAVE_IS_OK(slave)) {
>> if (!rx_slave) {
>> rx_slave = slave;
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 39e5b1c..4190389 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -782,7 +782,6 @@ static struct slave *bond_find_best_slave(struct 
>> bonding *bond)
>> struct slave *new_active, *old_active;
>> struct slave *bestslave = NULL;
>> int mintime = bond->params.updelay;
>> - int i;
>>
>> new_active = bond->curr_active_slave;
>>
>> @@ -801,7 +800,7 @@ static struct slave *bond_find_best_slave(struct 
>> bonding *bond)
>> /* remember where to stop iterating over the slaves */
>> old_active = new_active;
>>
>> - bond_for_each_slave_from(bond, new_active, i, old_active) {
>> + bond_for_each_slave_from(bond, new_active, old_active) {
>> if (new_active->link == BOND_LINK_UP) {
>> return new_active;
>> } else if (new_active->link == BOND_LINK_BACK &&
>> @@ -2756,7 +2755,6 @@ do_failover:
>> static void bond_ab_arp_probe(struct bonding *bond)
>> {
>> struct slave *slave, *next_slave;
>> - int i;
>>
>> read_lock(&bond->curr_slave_lock);
>>
>> @@ -2788,7 +2786,7 @@ static void bond_ab_arp_probe(struct bonding 
>> *bond)
>>
>> /* search for next candidate */
>> next_slave = bond_next_slave(bond, bond->current_arp_slave);
>> - bond_for_each_slave_from(bond, slave, i, next_slave) {
>> + bond_for_each_slave_from(bond, slave, next_slave) {
>> if (IS_UP(slave->dev)) {
>> slave->link = BOND_LINK_BACK;
>> bond_set_slave_active_flags(slave);
>> diff --git a/drivers/net/bonding/bonding.h 
>> b/drivers/net/bonding/bonding.h
>> index f013b12..48fd41a 100644
>> --- a/drivers/net/bonding/bonding.h
>> +++ b/drivers/net/bonding/bonding.h
>> @@ -123,18 +123,33 @@
>> (bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
>> bond_to_slave_rcu((pos)->list.prev))
>>
>> +/* Check whether the slave is the only one in bond */
>> +
>> /**
>> * bond_for_each_slave_from - iterate the slaves list from a starting 
>> point
>> * @bond: the bond holding this list.
>> * @pos: current slave.
>> - * @cnt: counter for max number of moves
>> * @start: starting point.
>> *
>> * Caller must hold bond->lock
>> */
>> -#define bond_for_each_slave_from(bond, pos, cnt, start) \
>> - for (cnt = 0, pos = start; pos && cnt < (bond)->slave_cnt; \
>> - cnt++, pos = bond_next_slave(bond, pos))
>> +#define bond_for_each_slave_from(bond, pos, start) \
>> + for (pos = start; pos && &pos->list != &bond->slave_list; \
>> + (pos = bond_next_slave(bond, pos)) != start ? pos : \
>> + (pos = list_entry(&bond->slave_list, typeof(*pos), list)))
>
> Do you understand the differences of these two implementations?
>
> pos && cnt < (bond)->slave_cnt
>
> vs
>
> pos && &pos->list != &bond->slave_list
>
> I'm sorry, but you should listen to comments.
>
If there is only one slave, the loop runs once and then exits; if there
is more than one slave, it runs until it reaches the start again. Here
I use &bond->slave_list to control the loop, as it does not change at
any time; maybe it is hard to understand at first.

OK, I think they are the same; the only purpose is to remove the cnt,
so maybe it makes you uncomfortable, sorry about that. If you strongly
disagree, I will drop this patch, but thanks again for your different
opinions. :)

>> +
>> +/**
>> + * bond_for_each_slave_from_rcu - iterate the slaves list from a 
>> starting point
>> + * @bond: the bond holding this list.
>> + * @pos: current slave.
>> + * @start: starting point.
>> + *
>> + * Caller must hold rcu_read_lock
>> + */
>> +#define bond_for_each_slave_from_rcu(bond, pos, start) \
>> + for (pos = start; pos && &pos->list != &bond->slave_list; \
>> + (pos = bond_next_slave_rcu(bond, pos)) != start ? pos : \
>> + (pos = list_entry_rcu(&bond->slave_list, typeof(*pos), list)))
>>
>> /**
>> * bond_for_each_slave - iterate over all slaves
>> -- 
>> 1.8.2.1
>>
>>
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH 1/2] ipv6 mcast: use del_timer_sync instead of del_timer in ipv6_mc_down
From: Ben Hutchings @ 2013-09-05 14:30 UTC (permalink / raw)
  To: Salam Noureddine
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <1378363359-37716-1-git-send-email-noureddine@aristanetworks.com>

On Wed, 2013-09-04 at 23:42 -0700, Salam Noureddine wrote:
> Delete timers using del_timer_sync in ipv6_mc_down. Otherwise, it is
> possible for the timer to be the last to release its reference to the
> inet6_dev and since __in6_dev_put doesn't destroy the inet6_dev we
> would end up leaking a reference to the net_device and see messages
> like the following,
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 1
> 
> Tested on linux-3.4.43.
> 
> Signed-off-by: Salam Noureddine <noureddine@aristanetworks.com>
> ---
>  net/ipv6/mcast.c |    6 +++---
>  1 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
> index 99cd65c..5c8d49d 100644
> --- a/net/ipv6/mcast.c
> +++ b/net/ipv6/mcast.c
> @@ -2277,12 +2277,12 @@ void ipv6_mc_down(struct inet6_dev *idev)
>  
>  	read_lock_bh(&idev->lock);
>  	idev->mc_ifc_count = 0;
> -	if (del_timer(&idev->mc_ifc_timer))
> +	if (del_timer_sync(&idev->mc_ifc_timer))

Are you sure this doesn't introduce a potential deadlock?  Have you
tested this with lockdep enabled?

Ben.

>  		__in6_dev_put(idev);
>  	idev->mc_gq_running = 0;
> -	if (del_timer(&idev->mc_gq_timer))
> +	if (del_timer_sync(&idev->mc_gq_timer))
>  		__in6_dev_put(idev);
> -	if (del_timer(&idev->mc_dad_timer))
> +	if (del_timer_sync(&idev->mc_dad_timer))
>  		__in6_dev_put(idev);
>  
>  	for (i = idev->mc_list; i; i=i->next)

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH net-next v3 1/6] bonding: simplify and use RCU protection for 3ad xmit path
From: Ding Tianhong @ 2013-09-05 14:17 UTC (permalink / raw)
  To: Veaceslav Falico
  Cc: Ding Tianhong, Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <20130905134258.GB26163@redhat.com>

于 2013/9/5 21:42, Veaceslav Falico 写道:
> On Thu, Sep 05, 2013 at 03:48:44PM +0800, Ding Tianhong wrote:
>> The commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
>> (bonding: initial RCU conversion) has convert the roundrobin, 
>> active-backup,
>> broadcast and xor xmit path to rcu protection, the performance will 
>> be better
>> for these mode, so this time, convert xmit path for 3ad mode.
>>
>> Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>> Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>> Signed-off-by: Wang Yufen <wangyufen@huawei.com>
>> Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>> Cc: Veaceslav Falico <vfalico@redhat.com>
>> ---
>> drivers/net/bonding/bond_3ad.c | 32 ++++++++++++++------------------
>> drivers/net/bonding/bonding.h | 32 +++++++++++++++++++++++++++++++-
>> 2 files changed, 45 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_3ad.c 
>> b/drivers/net/bonding/bond_3ad.c
>> index 0d8f427..13f1deb 100644
>> --- a/drivers/net/bonding/bond_3ad.c
>> +++ b/drivers/net/bonding/bond_3ad.c
>> @@ -143,7 +143,7 @@ static inline struct bonding 
>> *__get_bond_by_port(struct port *port)
>> */
>> static inline struct port *__get_first_port(struct bonding *bond)
>> {
>> - struct slave *first_slave = bond_first_slave(bond);
>> + struct slave *first_slave = bond_first_slave_rcu(bond);
>>
>> return first_slave ? &(SLAVE_AD_INFO(first_slave).port) : NULL;
>> }
>> @@ -163,7 +163,7 @@ static inline struct port *__get_next_port(struct 
>> port *port)
>> // If there's no bond for this port, or this is the last slave
>> if (bond == NULL)
>> return NULL;
>> - slave_next = bond_next_slave(bond, slave);
>> + slave_next = bond_next_slave_rcu(bond, slave);
>> if (!slave_next || bond_is_first_slave(bond, slave_next))
>> return NULL;
>>
>> @@ -2417,16 +2417,14 @@ int bond_3ad_get_active_agg_info(struct 
>> bonding *bond, struct ad_info *ad_info)
>>
>> int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
>> {
>> - struct slave *slave, *start_at;
>> struct bonding *bond = netdev_priv(dev);
>> + struct slave *slave;
>> int slave_agg_no;
>> int slaves_in_agg;
>> int agg_id;
>> - int i;
>> struct ad_info ad_info;
>> int res = 1;
>>
>> - read_lock(&bond->lock);
>> if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
>> pr_debug("%s: Error: __bond_3ad_get_active_agg_info failed\n",
>> dev->name);
>> @@ -2444,13 +2442,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, 
>> struct net_device *dev)
>>
>> slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg);
>>
>> - bond_for_each_slave(bond, slave) {
>> + bond_for_each_slave_rcu(bond, slave) {
>> struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>>
>> if (agg && (agg->aggregator_identifier == agg_id)) {
>> - slave_agg_no--;
>> - if (slave_agg_no < 0)
>> - break;
>> + if (--slave_agg_no < 0) {
>> + if (SLAVE_IS_OK(slave)) {
>> + res = bond_dev_queue_xmit(bond,
>> + skb, slave->dev);
>> + goto out;
>> + }
>> + }
>> }
>> }
>>
>> @@ -2460,23 +2462,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, 
>> struct net_device *dev)
>> goto out;
>> }
>>
>> - start_at = slave;
>> -
>> - bond_for_each_slave_from(bond, slave, i, start_at) {
>> - int slave_agg_id = 0;
>> + bond_for_each_slave_rcu(bond, slave) {
>> struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>>
>> - if (agg)
>> - slave_agg_id = agg->aggregator_identifier;
>> -
>> - if (SLAVE_IS_OK(slave) && agg && (slave_agg_id == agg_id)) {
>> + if (SLAVE_IS_OK(slave) && agg &&
>> + agg->aggregator_identifier == agg_id) {
>> res = bond_dev_queue_xmit(bond, skb, slave->dev);
>> break;
>> }
>> }
>>
>> out:
>> - read_unlock(&bond->lock);
>> if (res) {
>> /* no suitable interface, frame not sent */
>> kfree_skb(skb);
>> diff --git a/drivers/net/bonding/bonding.h 
>> b/drivers/net/bonding/bonding.h
>> index f7ab161..f013b12 100644
>> --- a/drivers/net/bonding/bonding.h
>> +++ b/drivers/net/bonding/bonding.h
>> @@ -74,13 +74,34 @@
>> /* slave list primitives */
>> #define bond_to_slave(ptr) list_entry(ptr, struct slave, list)
>>
>> +/* slave list primitives, Caller must hold rcu_read_lock */
>> +#define bond_to_slave_rcu(ptr) list_entry_rcu(ptr, struct slave, list)
>> +
>> +/* bond_is_empty return NULL if slave list is empty*/
>> +#define bond_is_empty(bond) \
>> + (list_empty(&(bond)->slave_list))
>> +
>> +/* bond_is_empty_rcu return NULL if slave list is empty*/
>> +#define bond_is_empty_rcu(bond) \
>> + (!list_first_or_null_rcu(&(bond)->slave_list, struct slave, list))
>> +
>> /* IMPORTANT: bond_first/last_slave can return NULL in case of an 
>> empty list */
>> #define bond_first_slave(bond) \
>> list_first_entry_or_null(&(bond)->slave_list, struct slave, list)
>> #define bond_last_slave(bond) \
>> - (list_empty(&(bond)->slave_list) ? NULL : \
>> + (bond_is_empty(bond) ? NULL : \
>> bond_to_slave((bond)->slave_list.prev))
>>
>> +/**
>> + * IMPORTANT: bond_first/last_slave_rcu can return NULL in case of 
>> an empty list
>> + * Caller must hold rcu_read_lock
>> + */
>> +#define bond_first_slave_rcu(bond) \
>> + list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)
>> +#define bond_last_slave_rcu(bond) \
>> + (bond_is_empty_rcu(bond) ? NULL : \
>> + bond_to_slave_rcu((bond)->slave_list.prev))
>
> Really?
>
> No. Again - take a look at list_first_or_null_rcu. And its comments.
>
> I'm sorry that I'm acting that negatively... But if that gets accepted -
> I'll have days of nightmares.
>
> Try to understand how RCU works, please, before sending patches using it.
>
I am sad to hear that. I think I have a good understanding of RCU, but
maybe I missed something.
#define bond_first_slave_rcu(bond) \
(bond_is_empty_rcu(bond) ? NULL : \
bond_to_slave_rcu((bond)->slave_list.next))

vs
list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)

Maybe the first one is more comfortable for you, but I think the second
is racy and simple. Maybe I am still missing something; I need a coffee
to clear my head.


>> +
>> #define bond_is_first_slave(bond, pos) ((pos)->list.prev == 
>> &(bond)->slave_list)
>> #define bond_is_last_slave(bond, pos) ((pos)->list.next == 
>> &(bond)->slave_list)
>>
>> @@ -93,6 +114,15 @@
>> (bond_is_first_slave(bond, pos) ? bond_last_slave(bond) : \
>> bond_to_slave((pos)->list.prev))
>>
>> +/* Since bond_first/last_slave_rcu can return NULL, these can return 
>> NULL too */
>> +#define bond_next_slave_rcu(bond, pos) \
>> + (bond_is_last_slave(bond, pos) ? bond_first_slave_rcu(bond) : \
>> + bond_to_slave_rcu((pos)->list.next))
>> +
>> +#define bond_prev_slave_rcu(bond, pos) \
>> + (bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
>> + bond_to_slave_rcu((pos)->list.prev))
>> +
>> /**
>> * bond_for_each_slave_from - iterate the slaves list from a starting 
>> point
>> * @bond: the bond holding this list.
>> -- 
>> 1.8.2.1
>>
>>
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH net-next v3 1/6] bonding: simplify and use RCU protection for 3ad xmit path
From: Eric Dumazet @ 2013-09-05 14:26 UTC (permalink / raw)
  To: Veaceslav Falico
  Cc: Ding Tianhong, Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <20130905134258.GB26163@redhat.com>

On Thu, 2013-09-05 at 15:42 +0200, Veaceslav Falico wrote:

> Really?
> 
> No. Again - take a look at list_first_or_null_rcu. And its comments.
> 
> I'm sorry that I'm acting that negatively... But if that gets accepted -
> I'll have days of nightmares.
> 
> Try to understand how RCU works, please, before sending patches using it.

BTW, RCU is really hard, even for experienced kernel developers.

^ permalink raw reply

* Re: [patch v2] sfc: check for allocation failure
From: Ben Hutchings @ 2013-09-04 15:51 UTC (permalink / raw)
  To: Dan Carpenter; +Cc: Solarflare linux maintainers, netdev, kernel-janitors
In-Reply-To: <20130904150727.GA32327@elgon.mountain>

On Wed, 2013-09-04 at 18:07 +0300, Dan Carpenter wrote:
> It upsets static analyzers when we don't check for allocation failure.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Acked-by: Ben Hutchings <bhutchings@solarflare.com>

> ---
> v2: rebased on latest linux-next
> 
> diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c
> index 38d179c..75799f8 100644
> --- a/drivers/net/ethernet/sfc/falcon.c
> +++ b/drivers/net/ethernet/sfc/falcon.c
> @@ -894,6 +894,8 @@ static int falcon_mtd_probe(struct efx_nic *efx)
>  
>  	/* Allocate space for maximum number of partitions */
>  	parts = kcalloc(2, sizeof(*parts), GFP_KERNEL);
> +	if (!parts)
> +		return -ENOMEM;
>  	n_parts = 0;
>  
>  	spi = &nic_data->spi_flash;

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH net-next v3 1/6] bonding: simplify and use RCU protection for 3ad xmit path
From: Veaceslav Falico @ 2013-09-05 13:42 UTC (permalink / raw)
  To: Ding Tianhong
  Cc: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <5228375C.1050105@huawei.com>

On Thu, Sep 05, 2013 at 03:48:44PM +0800, Ding Tianhong wrote:
>The commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
>(bonding: initial RCU conversion) has convert the roundrobin, active-backup,
>broadcast and xor xmit path to rcu protection, the performance will be better
>for these mode, so this time, convert xmit path for 3ad mode.
>
>Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>Signed-off-by: Wang Yufen <wangyufen@huawei.com>
>Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>Cc: Veaceslav Falico <vfalico@redhat.com>
>---
> drivers/net/bonding/bond_3ad.c | 32 ++++++++++++++------------------
> drivers/net/bonding/bonding.h  | 32 +++++++++++++++++++++++++++++++-
> 2 files changed, 45 insertions(+), 19 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index 0d8f427..13f1deb 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -143,7 +143,7 @@ static inline struct bonding *__get_bond_by_port(struct port *port)
>  */
> static inline struct port *__get_first_port(struct bonding *bond)
> {
>-	struct slave *first_slave = bond_first_slave(bond);
>+	struct slave *first_slave = bond_first_slave_rcu(bond);
>
> 	return first_slave ? &(SLAVE_AD_INFO(first_slave).port) : NULL;
> }
>@@ -163,7 +163,7 @@ static inline struct port *__get_next_port(struct port *port)
> 	// If there's no bond for this port, or this is the last slave
> 	if (bond == NULL)
> 		return NULL;
>-	slave_next = bond_next_slave(bond, slave);
>+	slave_next = bond_next_slave_rcu(bond, slave);
> 	if (!slave_next || bond_is_first_slave(bond, slave_next))
> 		return NULL;
>
>@@ -2417,16 +2417,14 @@ int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info)
>
> int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
> {
>-	struct slave *slave, *start_at;
> 	struct bonding *bond = netdev_priv(dev);
>+	struct slave *slave;
> 	int slave_agg_no;
> 	int slaves_in_agg;
> 	int agg_id;
>-	int i;
> 	struct ad_info ad_info;
> 	int res = 1;
>
>-	read_lock(&bond->lock);
> 	if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
> 		pr_debug("%s: Error: __bond_3ad_get_active_agg_info failed\n",
> 			 dev->name);
>@@ -2444,13 +2442,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
>
> 	slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg);
>
>-	bond_for_each_slave(bond, slave) {
>+	bond_for_each_slave_rcu(bond, slave) {
> 		struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>
> 		if (agg && (agg->aggregator_identifier == agg_id)) {
>-			slave_agg_no--;
>-			if (slave_agg_no < 0)
>-				break;
>+			if (--slave_agg_no < 0) {
>+				if (SLAVE_IS_OK(slave)) {
>+					res = bond_dev_queue_xmit(bond,
>+						skb, slave->dev);
>+					goto out;
>+				}
>+			}
> 		}
> 	}
>
>@@ -2460,23 +2462,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
> 		goto out;
> 	}
>
>-	start_at = slave;
>-
>-	bond_for_each_slave_from(bond, slave, i, start_at) {
>-		int slave_agg_id = 0;
>+	bond_for_each_slave_rcu(bond, slave) {
> 		struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>
>-		if (agg)
>-			slave_agg_id = agg->aggregator_identifier;
>-
>-		if (SLAVE_IS_OK(slave) && agg && (slave_agg_id == agg_id)) {
>+		if (SLAVE_IS_OK(slave) && agg &&
>+			agg->aggregator_identifier == agg_id) {
> 			res = bond_dev_queue_xmit(bond, skb, slave->dev);
> 			break;
> 		}
> 	}
>
> out:
>-	read_unlock(&bond->lock);
> 	if (res) {
> 		/* no suitable interface, frame not sent */
> 		kfree_skb(skb);
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index f7ab161..f013b12 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -74,13 +74,34 @@
> /* slave list primitives */
> #define bond_to_slave(ptr) list_entry(ptr, struct slave, list)
>
>+/* slave list primitives, Caller must hold rcu_read_lock */
>+#define bond_to_slave_rcu(ptr) list_entry_rcu(ptr, struct slave, list)
>+
>+/* bond_is_empty return NULL if slave list is empty*/
>+#define bond_is_empty(bond) \
>+	(list_empty(&(bond)->slave_list))
>+
>+/* bond_is_empty_rcu return NULL if slave list is empty*/
>+#define bond_is_empty_rcu(bond) \
>+	(!list_first_or_null_rcu(&(bond)->slave_list, struct slave, list))
>+
> /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */
> #define bond_first_slave(bond) \
> 	list_first_entry_or_null(&(bond)->slave_list, struct slave, list)
> #define bond_last_slave(bond) \
>-	(list_empty(&(bond)->slave_list) ? NULL : \
>+	(bond_is_empty(bond) ? NULL : \
> 					   bond_to_slave((bond)->slave_list.prev))
>
>+/**
>+ * IMPORTANT: bond_first/last_slave_rcu can return NULL in case of an empty list
>+ * Caller must hold rcu_read_lock
>+ */
>+#define bond_first_slave_rcu(bond) \
>+	list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)
>+#define bond_last_slave_rcu(bond) \
>+	(bond_is_empty_rcu(bond) ? NULL : \
>+					bond_to_slave_rcu((bond)->slave_list.prev))

Really?

No. Again - take a look at list_first_or_null_rcu. And its comments.

I'm sorry that I'm acting that negatively... But if that gets accepted -
I'll have days of nightmares.

Try to understand how RCU works, please, before sending patches using it.

>+
> #define bond_is_first_slave(bond, pos) ((pos)->list.prev == &(bond)->slave_list)
> #define bond_is_last_slave(bond, pos) ((pos)->list.next == &(bond)->slave_list)
>
>@@ -93,6 +114,15 @@
> 	(bond_is_first_slave(bond, pos) ? bond_last_slave(bond) : \
> 					  bond_to_slave((pos)->list.prev))
>
>+/* Since bond_first/last_slave_rcu can return NULL, these can return NULL too */
>+#define bond_next_slave_rcu(bond, pos) \
>+	(bond_is_last_slave(bond, pos) ? bond_first_slave_rcu(bond) : \
>+					 bond_to_slave_rcu((pos)->list.next))
>+
>+#define bond_prev_slave_rcu(bond, pos) \
>+	(bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
>+					  bond_to_slave_rcu((pos)->list.prev))
>+
> /**
>  * bond_for_each_slave_from - iterate the slaves list from a starting point
>  * @bond:	the bond holding this list.
>-- 
>1.8.2.1
>
>
>

^ permalink raw reply

* Re: [Xen-devel] Is fallback vhost_net to qemu for live migrate available？
From: Stefano Stabellini @ 2013-09-05 13:33 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Wei Liu, qianhuibin, KVM list, likunyun, netdev, jasowang,
	xen-devel@lists.xen.org, liuyongan, liuyingdong, wangfuhai,
	Anthony Liguori, Qin Chuanyu
In-Reply-To: <20130903085556.GD18901@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2813 bytes --]

On Tue, 3 Sep 2013, Michael S. Tsirkin wrote:
> On Tue, Sep 03, 2013 at 09:40:48AM +0100, Wei Liu wrote:
> > On Tue, Sep 03, 2013 at 09:28:11AM +0800, Qin Chuanyu wrote:
> > > On 2013/9/2 15:57, Wei Liu wrote:
> > > >On Sat, Aug 31, 2013 at 12:45:11PM +0800, Qin Chuanyu wrote:
> > > >>On 2013/8/30 0:08, Anthony Liguori wrote:
> > > >>>Hi Qin,
> > > >>
> > > >>>>By change the memory copy and notify mechanism ，currently virtio-net with
> > > >>>>vhost_net could run on Xen with good performance。
> > > >>>
> > > >>>I think the key in doing this would be to implement a property
> > > >>>ioeventfd and irqfd interface in the driver domain kernel.  Just
> > > >>>hacking vhost_net with Xen specific knowledge would be pretty nasty
> > > >>>IMHO.
> > > >>>
> > > >>Yes, I add a kernel module which persist virtio-net pio_addr and
> > > >>msix address as what kvm module did. Guest wake up vhost thread by
> > > >>adding a hook func in evtchn_interrupt.
> > > >>
> > > >>>Did you modify the front end driver to do grant table mapping or is
> > > >>>this all being done by mapping the domain's memory?
> > > >>>
> > > >>There is nothing changed in front end driver. Currently I use
> > > >>alloc_vm_area to get address space， and map the domain's memory as
> > > >>what what qemu did.
> > > >>
> > > >
> > > >You mean you're using xc_map_foreign_range and friends in the backend to
> > > >map guest memory? That's not very desirable as it violates Xen's
> > > >security model. It would not be too hard to pass grant references
> > > >instead of guest physical memory address IMHO.
> > > >
> > > In fact, I did what virtio-net have done in Qemu. I think security
> > > is a pseudo question because Dom0 is under control.

Right, but we are trying to move the backends out of Dom0, for
scalability and security.
Setting up a network driver domain is pretty easy and should work out of
the box with Xen 4.3.
That said, I agree that using xc_map_foreign_range is a good way to start.


> > Consider that you might have driver domains. Not every domain is under
> > control or trusted.
> 
> I don't see anything that will prevent using driver domains here.

Driver domains are not privileged, therefore cannot map random guest
pages (unless they have been granted by the guest via the grant table).
xc_map_foreign_range can't work from a driver domain.


> > Also consider that security model like XSM can be
> > used to audit operations to enhance security so your foreign mapping
> > approach might not always work.
> 
> It could be nice to have as an option, sure.
> XSM is disabled by default though so I don't think lack of support for
> that makes it a prototype.

There are some security aware Xen based products in the market today
that use XSM.

^ permalink raw reply

* Re: [PATCH net-next v3 5/6] bonding: restructure and add rcu for bond_for_each_slave_next()
From: Veaceslav Falico @ 2013-09-05 13:30 UTC (permalink / raw)
  To: Ding Tianhong
  Cc: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <5228376D.2040608@huawei.com>

On Thu, Sep 05, 2013 at 03:49:01PM +0800, Ding Tianhong wrote:
>Remove the wordy int and add bond_for_each_slave_next_rcu() for future use.
>
>Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>Cc: Veaceslav Falico <vfalico@redhat.com>
>---
> drivers/net/bonding/bond_alb.c  |  3 +--
> drivers/net/bonding/bond_main.c |  6 ++----
> drivers/net/bonding/bonding.h   | 23 +++++++++++++++++++----
> 3 files changed, 22 insertions(+), 10 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
>index 91f179d..c75d383 100644
>--- a/drivers/net/bonding/bond_alb.c
>+++ b/drivers/net/bonding/bond_alb.c
>@@ -383,7 +383,6 @@ static struct slave *rlb_next_rx_slave(struct bonding *bond)
> {
> 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
> 	struct slave *rx_slave, *slave, *start_at;
>-	int i = 0;
>
> 	if (bond_info->next_rx_slave)
> 		start_at = bond_info->next_rx_slave;
>@@ -392,7 +391,7 @@ static struct slave *rlb_next_rx_slave(struct bonding *bond)
>
> 	rx_slave = NULL;
>
>-	bond_for_each_slave_from(bond, slave, i, start_at) {
>+	bond_for_each_slave_from(bond, slave, start_at) {
> 		if (SLAVE_IS_OK(slave)) {
> 			if (!rx_slave) {
> 				rx_slave = slave;
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 39e5b1c..4190389 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -782,7 +782,6 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
> 	struct slave *new_active, *old_active;
> 	struct slave *bestslave = NULL;
> 	int mintime = bond->params.updelay;
>-	int i;
>
> 	new_active = bond->curr_active_slave;
>
>@@ -801,7 +800,7 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
> 	/* remember where to stop iterating over the slaves */
> 	old_active = new_active;
>
>-	bond_for_each_slave_from(bond, new_active, i, old_active) {
>+	bond_for_each_slave_from(bond, new_active, old_active) {
> 		if (new_active->link == BOND_LINK_UP) {
> 			return new_active;
> 		} else if (new_active->link == BOND_LINK_BACK &&
>@@ -2756,7 +2755,6 @@ do_failover:
> static void bond_ab_arp_probe(struct bonding *bond)
> {
> 	struct slave *slave, *next_slave;
>-	int i;
>
> 	read_lock(&bond->curr_slave_lock);
>
>@@ -2788,7 +2786,7 @@ static void bond_ab_arp_probe(struct bonding *bond)
>
> 	/* search for next candidate */
> 	next_slave = bond_next_slave(bond, bond->current_arp_slave);
>-	bond_for_each_slave_from(bond, slave, i, next_slave) {
>+	bond_for_each_slave_from(bond, slave, next_slave) {
> 		if (IS_UP(slave->dev)) {
> 			slave->link = BOND_LINK_BACK;
> 			bond_set_slave_active_flags(slave);
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index f013b12..48fd41a 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -123,18 +123,33 @@
> 	(bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
> 					  bond_to_slave_rcu((pos)->list.prev))
>
>+/* Check whether the slave is the only one in bond */
>+
> /**
>  * bond_for_each_slave_from - iterate the slaves list from a starting point
>  * @bond:	the bond holding this list.
>  * @pos:	current slave.
>- * @cnt:	counter for max number of moves
>  * @start:	starting point.
>  *
>  * Caller must hold bond->lock
>  */
>-#define bond_for_each_slave_from(bond, pos, cnt, start) \
>-	for (cnt = 0, pos = start; pos && cnt < (bond)->slave_cnt; \
>-	     cnt++, pos = bond_next_slave(bond, pos))
>+#define bond_for_each_slave_from(bond, pos, start) \
>+	for (pos = start; pos && &pos->list != &bond->slave_list; \
>+	    (pos = bond_next_slave(bond, pos)) != start ? pos : \
>+	    (pos = list_entry(&bond->slave_list, typeof(*pos), list)))

Do you understand the differences of these two implementations?

pos && cnt < (bond)->slave_cnt

vs

pos && &pos->list != &bond->slave_list

I'm sorry, but you should listen to comments.

>+
>+/**
>+ * bond_for_each_slave_from_rcu - iterate the slaves list from a starting point
>+ * @bond:	the bond holding this list.
>+ * @pos:	current slave.
>+ * @start:	starting point.
>+ *
>+ * Caller must hold rcu_read_lock
>+ */
>+#define bond_for_each_slave_from_rcu(bond, pos, start) \
>+	for (pos = start; pos && &pos->list != &bond->slave_list; \
>+	    (pos = bond_next_slave_rcu(bond, pos)) != start ? pos : \
>+	    (pos = list_entry_rcu(&bond->slave_list, typeof(*pos), list)))
>
> /**
>  * bond_for_each_slave - iterate over all slaves
>-- 
>1.8.2.1
>
>
>

^ permalink raw reply

* Re: [PATCH] net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
From: Jason Cooper @ 2013-09-05 11:38 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: Willy Tarreau, Lior Amsalem, Jochen De Smet, Simon Guinot,
	Ryan Press, netdev, vdonnefort, Ethan Tuttle, stable,
	Ezequiel Garcia, Chény Yves-Gael, Gregory Clement,
	Peter Sanford, David S. Miller, linux-arm-kernel
In-Reply-To: <20130905102659.25a0f064@skate>

On Thu, Sep 05, 2013 at 10:26:59AM +0200, Thomas Petazzoni wrote:
> On Thu, 5 Sep 2013 09:44:26 +0200, Willy Tarreau wrote:
> > One simpler solution for them could be to slightly modify the boot loader
> > so that it sets the MAC address on the two ethernet controllers prior to
> > boot. Then your code which checks if a MAC is already set will simply
> > work.
> 
> This works when the network driver is compiled 'statically' inside the
> kernel. When compiled as a module, then the gatable clock of the
> network interface will be gated at the end of the kernel boot, before
> the mvneta module is probe. And gating the network interface clocks
> means that it will loose its state, including its MAC address. So it's
> not an entirely perfect solution either, but I admit that on such
> platforms, the network driver is most likely compiled statically, so it
> would probably suit the needs of most people.

Up until sleep and standby modes are supported.  Proper power savings
would include gating the clock...

thx,

Jason.

^ permalink raw reply

* Re: [PATCH] net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
From: Jason Cooper @ 2013-09-05 11:36 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: Lior Amsalem, Jochen De Smet, Simon Guinot, Ryan Press, netdev,
	vdonnefort, David S. Miller, Ethan Tuttle, stable,
	Ezequiel Garcia, Chény Yves-Gael, Gregory Clement,
	Peter Sanford, Willy Tarreau, linux-arm-kernel
In-Reply-To: <20130905092808.617da416@skate>

On Thu, Sep 05, 2013 at 09:28:08AM +0200, Thomas Petazzoni wrote:
>  (2) Use the "impedance matcher" code written by Daniel Mack and
>      extended by Jason Cooper, available at
>      https://github.com/zonque/pxa-impedance-matcher. Essentially, it
>      inserts a small binary between the installed bootloader and the
>      kernel, that for example allows to choose a particular DTB amongst
>      several, depending on the board that is detected. I believe it
>      could probably be extended to cover other use cases such as
>      modifying the DTB to add the MAC addresses where appropriate. I've
>      added Jason Cooper in the Cc list if he wants to comment on that.

Yes, I'm hoping to add the dtb editing support sometime over the next
few weeks.  That way, we can use Willy's atags patch in there, and the
kernel never knows the difference.

It isn't high on my priority list atm, so if someone else is motivated,
I've been borrowing code from barebox to flesh out impedance-matcher.
Patches are welcomed ;-)

thx,

Jason.

^ permalink raw reply

* Re: [E1000-devel] [net-next v4 7/8] i40e: sysfs and debugfs interfaces
From: Bjørn Mork @ 2013-09-05 11:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Nelson, Shannon, Kirsher, Jeffrey T,
	e1000-devel@lists.sourceforge.net, netdev@vger.kernel.org,
	Brandeburg, Jesse, gospo@redhat.com, davem@davemloft.net,
	sassmann@redhat.com
In-Reply-To: <CAOaVG17stH4K=MOfB0-OfqerVOmXpGQtERD7kgCF97AOtpn18A@mail.gmail.com>

Stephen Hemminger <stephen@networkplumber.org> writes:

> More surprising is that others did not see the same things.

If I were to guess the 3 primary reasons for that:
  1) the series is too large for anyone to actually read it all without
     being paid to do so
  2) the sysfs code is at the end of a huge debugfs patch, which
     probably no one cares about whether they are paid or not
  3) the small sysfs patch is so full of minor issues to nit about that
     it seems pointless to even start

But since I looked at it, I can name a few other obvious issues with the
sysfs patch:
 - mix of macro and octal mode values
 - no documentation (required for all new sysfs attributes according to
    Documentation/filesystems/sysfs.txt)
 - extremely dubious kobject usage and attribute creation/deletion (How
    does this fit into the driver model?  How will userspace race with
    these attributes being added and removed?)

I am sure there are plenty more.


Bjørn

^ permalink raw reply

* Re: [PATCH] xen-netback: count number required slots for an skb more carefully
From: Wei Liu @ 2013-09-05 10:27 UTC (permalink / raw)
  To: David Vrabel
  Cc: Wei Liu, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky,
	Ian Campbell, netdev, msw, annie.li
In-Reply-To: <522858FB.807@citrix.com>

On Thu, Sep 05, 2013 at 11:12:11AM +0100, David Vrabel wrote:
> On 04/09/13 16:44, Wei Liu wrote:
> > On Wed, Sep 04, 2013 at 03:02:01PM +0100, David Vrabel wrote:
> > [...]
> >>>>
> >>>> I think I prefer fixing the counting for backporting to stable kernels.
> >>>
> >>> The original patch has coding style change. Sans that contextual change
> >>> it's not a very long patch.
> >>
> >> The size of the patch isn't the main concern for backport-ability.  It's
> >> the frontend visible changes and thus any (unexpected) impacts on
> >> frontends -- this is especially important as only a small fraction of
> >> frontends in use will be tested with these changes.
> >>
> >>>>  Xi's approach of packing the ring differently is a change in frontend
> >>>> visible behaviour and seems more risky. e.g., possible performance
> >>>> impact so I would like to see some performance analysis of that approach.
> >>>>
> >>>
> >>> With Xi's approach it is more efficient for backend to process. As we
> >>> now use one less grant copy operation which means we copy the same
> >>> amount of data with less grant ops.
> >>
> >> It think it uses more grant ops because the copies of the linear
> >> portion are in chunks that do not cross source page boundaries.
> >>
> >> i.e., in netbk_gop_skb():
> >>
> >> 	data = skb->data;
> >> 	while (data < skb_tail_pointer(skb)) {
> >> 		unsigned int offset = offset_in_page(data);
> >> 		unsigned int len = PAGE_SIZE - offset;
> >>                 [...]
> >>
> >> It wasn't clear from the patch that this had been considered and that
> >> any extra space needed in the grant op array was made available.
> >>
> > 
> > If I'm not mistaken the grant op array is already enormous. See the
> > comment in struct xen_netbk for grant_copy_op. The case that a buffer
> > straddles two slots was taken into consideration long ago -- that's
> > why you don't see any comment or code change WRT that..
> 
> I'm not convinced that even that is enough for the current
> implementation in the (very) worse case.
> 
> Consider a skb with 8 frags all 512 in length.  The linear data will be
> placed into 1 slot, and the frags will be packed into 1 slot so 9 grant
> ops and 2 slots.
> 

That is still less than 8 * 2 = 16 slots in the grant copy array, right?
Basically each paged buffer is allowed for 2 slots in the grant copy
array.

> I definitely think we do not want to potentially regress any further in
> this area.
> 

Sure.

> David

^ permalink raw reply

* Re: [PATCH] xen-netback: count number required slots for an skb more carefully
From: David Vrabel @ 2013-09-05 10:12 UTC (permalink / raw)
  To: Wei Liu
  Cc: xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky, Ian Campbell,
	netdev, msw, annie.li
In-Reply-To: <20130904154400.GS14104@zion.uk.xensource.com>

On 04/09/13 16:44, Wei Liu wrote:
> On Wed, Sep 04, 2013 at 03:02:01PM +0100, David Vrabel wrote:
> [...]
>>>>
>>>> I think I prefer fixing the counting for backporting to stable kernels.
>>>
>>> The original patch has coding style change. Sans that contextual change
>>> it's not a very long patch.
>>
>> The size of the patch isn't the main concern for backport-ability.  It's
>> the frontend visible changes and thus any (unexpected) impacts on
>> frontends -- this is especially important as only a small fraction of
>> frontends in use will be tested with these changes.
>>
>>>>  Xi's approach of packing the ring differently is a change in frontend
>>>> visible behaviour and seems more risky. e.g., possible performance
>>>> impact so I would like to see some performance analysis of that approach.
>>>>
>>>
>>> With Xi's approach it is more efficient for backend to process. As we
>>> now use one less grant copy operation which means we copy the same
>>> amount of data with less grant ops.
>>
>> It think it uses more grant ops because the copies of the linear
>> portion are in chunks that do not cross source page boundaries.
>>
>> i.e., in netbk_gop_skb():
>>
>> 	data = skb->data;
>> 	while (data < skb_tail_pointer(skb)) {
>> 		unsigned int offset = offset_in_page(data);
>> 		unsigned int len = PAGE_SIZE - offset;
>>                 [...]
>>
>> It wasn't clear from the patch that this had been considered and that
>> any extra space needed in the grant op array was made available.
>>
> 
> If I'm not mistaken the grant op array is already enormous. See the
> comment in struct xen_netbk for grant_copy_op. The case that a buffer
> straddles two slots was taken into consideration long ago -- that's
> why you don't see any comment or code change WRT that..

I'm not convinced that even that is enough for the current
implementation in the (very) worst case.

Consider a skb with 8 frags all 512 in length.  The linear data will be
placed into 1 slot, and the frags will be packed into 1 slot so 9 grant
ops and 2 slots.

I definitely think we do not want to potentially regress any further in
this area.

David

^ permalink raw reply

* [PATCH V2 net-next 2/2] tuntap: orphan frags before trying to set tx timestamp
From: Jason Wang @ 2013-09-05  9:54 UTC (permalink / raw)
  To: davem, netdev, linux-kernel
  Cc: mst, Jason Wang, Richard Cochran, Sergei Shtylyov
In-Reply-To: <1378374840-39130-1-git-send-email-jasowang@redhat.com>

sock_tx_timestamp() will clear all zerocopy flags of skb which may lead the
frags never to be orphaned. This will break guest to guest traffic when zerocopy
is enabled. Fix this by orphaning the frags before trying to set tx time stamp.

The issue was introduced by commit eda297729171fe16bf34fe5b0419dfb69060f623
(tun: Support software transmit time stamping).

Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from v1:
- correct the comment location and style
---
 drivers/net/tun.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2dddb1b..a639de8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -749,15 +749,17 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 			  >= dev->tx_queue_len / tun->numqueues)
 		goto drop;
 
+	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+		goto drop;
+
 	if (skb->sk) {
 		sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
 		sw_tx_timestamp(skb);
 	}
 
 	/* Orphan the skb - required as we might hang on to it
-	 * for indefinite time. */
-	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
-		goto drop;
+	 * for indefinite time.
+	 */
 	skb_orphan(skb);
 
 	nf_reset(skb);
-- 
1.7.1

^ permalink raw reply related

* Re: rtnl_lock deadlock on 3.10
From: Bart Van Assche @ 2013-09-05 10:02 UTC (permalink / raw)
  To: Steve Wise
  Cc: Shawn Bohrer, Or Gerlitz, Shawn Bohrer, Cong Wang,
	netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	roland-BHEL68pLQRGGvPXPguhicg, swise-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <51F7B792.7030803-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>

On 07/30/13 14:54, Steve Wise wrote:
> On 7/29/2013 6:02 PM, Shawn Bohrer wrote:
>> On Mon, Jul 15, 2013 at 09:38:19AM -0500, Shawn Bohrer wrote:
>>> On Wed, Jul 03, 2013 at 08:26:11PM +0300, Or Gerlitz wrote:
>>>> On 03/07/2013 20:22, Shawn Bohrer wrote:
>>>>> On Wed, Jul 03, 2013 at 07:33:07AM +0200, Hannes Frederic Sowa wrote:
>>>>>> On Wed, Jul 03, 2013 at 07:11:52AM +0200, Hannes Frederic Sowa wrote:
>>>>>>> On Tue, Jul 02, 2013 at 01:38:26PM +0000, Cong Wang wrote:
>>>>>>>> On Tue, 02 Jul 2013 at 08:28 GMT, Hannes Frederic Sowa
>>>>>>>> <hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r@public.gmane.org> wrote:
>>>>>>>>> On Mon, Jul 01, 2013 at 09:54:56AM -0500, Shawn Bohrer wrote:
>>>>>>>>>> I've managed to hit a deadlock at boot a couple times while
>>>>>>>>>> testing
>>>>>>>>>> the 3.10 rc kernels.  It seems to always happen when my network
>>>>>>>>>> devices are initializing.  This morning I updated to v3.10 and
>>>>>>>>>> made a
>>>>>>>>>> few config tweaks and so far I've hit it 4 out of 5 reboots.
>>>>>>>>>> It looks
>>>>>>>>>> like most processes are getting stuck on rtnl_lock.  Below is
>>>>>>>>>> a boot
>>>>>>>>>> log with the soft lockup prints.  Please let know if there is any
>>>>>>>>>> other information I can provide:
>>>>>>>>> Could you try a build with CONFIG_LOCKDEP enabled?
>>>>>>>>>
>>>>>>>> The problem is clear: ib_register_device() is called with
>>>>>>>> rtnl_lock,
>>>>>>>> but itself needs device_mutex, however, ib_register_client() first
>>>>>>>> acquires device_mutex, then indirectly calls register_netdev()
>>>>>>>> which
>>>>>>>> takes rtnl_lock. Deadlock!
>>>>>>>>
>>>>>>>> One possible fix is always taking rtnl_lock before taking
>>>>>>>> device_mutex, something like below:
>>>>>>>>
>>>>>>>> diff --git a/drivers/infiniband/core/device.c
>>>>>>>> b/drivers/infiniband/core/device.c
>>>>>>>> index 18c1ece..890870b 100644
>>>>>>>> --- a/drivers/infiniband/core/device.c
>>>>>>>> +++ b/drivers/infiniband/core/device.c
>>>>>>>> @@ -381,6 +381,7 @@ int ib_register_client(struct ib_client
>>>>>>>> *client)
>>>>>>>>   {
>>>>>>>>       struct ib_device *device;
>>>>>>>> +    rtnl_lock();
>>>>>>>>       mutex_lock(&device_mutex);
>>>>>>>>       list_add_tail(&client->list, &client_list);
>>>>>>>> @@ -389,6 +390,7 @@ int ib_register_client(struct ib_client
>>>>>>>> *client)
>>>>>>>>               client->add(device);
>>>>>>>>       mutex_unlock(&device_mutex);
>>>>>>>> +    rtnl_unlock();
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>> index b6e049a..5a7a048 100644
>>>>>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>> @@ -1609,7 +1609,7 @@ static struct net_device
>>>>>>>> *ipoib_add_port(const char *format,
>>>>>>>>           goto event_failed;
>>>>>>>>       }
>>>>>>>> -    result = register_netdev(priv->dev);
>>>>>>>> +    result = register_netdevice(priv->dev);
>>>>>>>>       if (result) {
>>>>>>>>           printk(KERN_WARNING "%s: couldn't register ipoib port
>>>>>>>> %d; error %d\n",
>>>>>>>>                  hca->name, port, result);
>>>>>>> Looks good to me. Shawn, could you test this patch?
>>>>>> ib_unregister_device/ib_unregister_client would need the same change,
>>>>>> too. I have not checked the other ->add() and ->remove()
>>>>>> functions. Also
>>>>>> cc'ed linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Roland Dreier.
>>>>> Cong's patch is missing the #include <linux/rtnetlink.h> but otherwise
>>>>> I've had 34 successful reboots with no deadlocks which is a good sign.
>>>>> It sounds like there are more paths that need to be audited and a
>>>>> proper patch submitted.  I can do more testing later if needed.
>>>>>
>>>>> Thanks,
>>>>> Shawn
>>>>>
>>>> Guys, I was a bit busy today looking into that, but I don't think we
>>>> want the IB core layer  (core/device.c) to
>>>> use rtnl locking which is something that belongs to the network stack.
>>> Has anymore thought been put into a proper fix for this issue?
>> I'm no expert in this area but I'm having a hard time seeing a
>> different solution than the one Cong suggested.  Just to be clear the
>> deadlock I hit was between cxgb3 and the ipoib module, so I've Cc'd
>> Steve Wise in case he has a better solution from the Chelsio side.
>
> I don't know of another way to resolve this.   The rtnl lock is used in
> ipoib and mlx4 already.  I think we should go forward with the proposed
> patch.

(replying to an e-mail of one month ago)

Hello,

It would be appreciated if anyone could report what the current status 
of this issue is. I think a deadlock I ran into with kernels 3.10 and 
3.11 and PCI pass-through is related to this issue. See also 
http://bugzilla.kernel.org/show_bug.cgi?id=60856 for the lockdep report.

Thanks,

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next 2/2] tuntap: orphan frags before trying to set tx timestamp
From: Jason Wang @ 2013-09-05  9:59 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: davem, netdev, linux-kernel, mst, Richard Cochran
In-Reply-To: <52273525.7090501@cogentembedded.com>

On 09/04/2013 09:27 PM, Sergei Shtylyov wrote:
> Hello.
>
> On 04-09-2013 8:33, Jason Wang wrote:
>
>> sock_tx_timestamp() will clear all zerocopy flags of skb which may
>> lead the
>> frags never to be orphaned. This will break guest to guest traffic
>> when zerocopy
>> is enabled. Fix this by orphaning the frags before trying to set tx
>> time stamp.
>
>> The issue were introduced by commit
>> eda297729171fe16bf34fe5b0419dfb69060f623
>> (tun: Support software transmit time stamping).
>
>> Cc: Richard Cochran <richardcochran@gmail.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tun.c |    9 +++++----
>>   1 files changed, 5 insertions(+), 4 deletions(-)
>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 2dddb1b..af9a096 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -749,15 +749,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff
>> *skb, struct net_device *dev)
>>                 >= dev->tx_queue_len / tun->numqueues)
>>           goto drop;
>>
>> +    /* Orphan the skb - required as we might hang on to it
>> +     * for indefinite time. */
>
>    You could fix the comment style to the networking code default,
> while at: it:
>
> /* bla
>  * bla
>  */
>
>> +    if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
>> +        goto drop;
>> +
>>       if (skb->sk) {
>>           sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
>>           sw_tx_timestamp(skb);
>>       }
>>
>> -    /* Orphan the skb - required as we might hang on to it
>> -     * for indefinite time. */
>> -    if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
>> -        goto drop;
>
> WBR, Sergei
>
>
>

Sure will post V2.

^ permalink raw reply

* [PATCH V2 net-next 1/2] tuntap: purge socket error queue on detach
From: Jason Wang @ 2013-09-05  9:53 UTC (permalink / raw)
  To: davem, netdev, linux-kernel; +Cc: mst, Jason Wang, Richard Cochran

Commit eda297729171fe16bf34fe5b0419dfb69060f623
(tun: Support software transmit time stamping) will queue skbs into error queue
when tx stamping is enabled. But it forgets to purge the error queue during
detach. This patch fixes this.

Cc: Richard Cochran <richardcochran@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |   12 +++++++++---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 60a1e93..2dddb1b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -409,6 +409,12 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 	return tun;
 }
 
+static void tun_queue_purge(struct tun_file *tfile)
+{
+	skb_queue_purge(&tfile->sk.sk_receive_queue);
+	skb_queue_purge(&tfile->sk.sk_error_queue);
+}
+
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -435,7 +441,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		synchronize_net();
 		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
 		/* Drop read queue */
-		skb_queue_purge(&tfile->sk.sk_receive_queue);
+		tun_queue_purge(tfile);
 		tun_set_real_num_queues(tun);
 	} else if (tfile->detached && clean) {
 		tun = tun_enable_queue(tfile);
@@ -487,12 +493,12 @@ static void tun_detach_all(struct net_device *dev)
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		/* Drop read queue */
-		skb_queue_purge(&tfile->sk.sk_receive_queue);
+		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
 	}
 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 		tun_enable_queue(tfile);
-		skb_queue_purge(&tfile->sk.sk_receive_queue);
+		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
 	}
 	BUG_ON(tun->numdisabled != 0);
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH] can: pcan_usb_core: fix memory leak on failure paths in peak_usb_start()
From: Marc Kleine-Budde @ 2013-09-05  9:33 UTC (permalink / raw)
  To: Alexey Khoroshilov
  Cc: Wolfgang Grandegger, linux-can, netdev, linux-kernel, ldv-project,
	s.grosjean@peak-system.com
In-Reply-To: <1378331169-26285-1-git-send-email-khoroshilov@ispras.ru>

[-- Attachment #1: Type: text/plain, Size: 2417 bytes --]

Added Stephane to Cc.

On 09/04/2013 11:46 PM, Alexey Khoroshilov wrote:
> Tx and rx urbs are not deallocated if something goes wrong in peak_usb_start().
> The patch fixes error handling to deallocate all the resources.
> 
> Found by Linux Driver Verification project (linuxtesting.org).
> 
> Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>

Stephane, can you have a look at the patch and give your Acked-by.

Looks good to me.

tnx,
Marc
> ---
>  drivers/net/can/usb/peak_usb/pcan_usb_core.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
> index a0f647f..0b7a4c3 100644
> --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
> +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
> @@ -463,7 +463,7 @@ static int peak_usb_start(struct peak_usb_device *dev)
>  	if (i < PCAN_USB_MAX_TX_URBS) {
>  		if (i == 0) {
>  			netdev_err(netdev, "couldn't setup any tx URB\n");
> -			return err;
> +			goto err_tx;
>  		}
>  
>  		netdev_warn(netdev, "tx performance may be slow\n");
> @@ -472,7 +472,7 @@ static int peak_usb_start(struct peak_usb_device *dev)
>  	if (dev->adapter->dev_start) {
>  		err = dev->adapter->dev_start(dev);
>  		if (err)
> -			goto failed;
> +			goto err_adapter;
>  	}
>  
>  	dev->state |= PCAN_USB_STATE_STARTED;
> @@ -481,19 +481,26 @@ static int peak_usb_start(struct peak_usb_device *dev)
>  	if (dev->adapter->dev_set_bus) {
>  		err = dev->adapter->dev_set_bus(dev, 1);
>  		if (err)
> -			goto failed;
> +			goto err_adapter;
>  	}
>  
>  	dev->can.state = CAN_STATE_ERROR_ACTIVE;
>  
>  	return 0;
>  
> -failed:
> +err_adapter:
>  	if (err == -ENODEV)
>  		netif_device_detach(dev->netdev);
>  
>  	netdev_warn(netdev, "couldn't submit control: %d\n", err);
>  
> +	for (i = 0; i < PCAN_USB_MAX_TX_URBS; i++) {
> +		usb_free_urb(dev->tx_contexts[i].urb);
> +		dev->tx_contexts[i].urb = NULL;
> +	}
> +err_tx:
> +	usb_kill_anchored_urbs(&dev->rx_submitted);
> +
>  	return err;
>  }
>  
> 


-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply

* Re: [PATCH] net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
From: Thomas Petazzoni @ 2013-09-05  9:24 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Ethan Tuttle, David S. Miller, netdev, linux-arm-kernel,
	Lior Amsalem, Gregory Clement, Ezequiel Garcia, Jochen De Smet,
	Peter Sanford, Chény Yves-Gael, Ryan Press, Simon Guinot,
	vdonnefort, stable, Jason Cooper
In-Reply-To: <20130905091147.GA28658@1wt.eu>

Dear Willy Tarreau,

On Thu, 5 Sep 2013 11:11:47 +0200, Willy Tarreau wrote:

> 
> > Note that this can be done without doing any change in the bootloader.
> > For example, on a Mirabox, you can do:
> > 
> >    mw.l 0xD0072414 0x5C93; mw.l 0xD0072418 0xF0AD4E01; mw.l 0xD0076414 0x5C94; mw.l 0xD0076418 0xF0AD4E01; bootm
> > 
> > to boot your kernel. This will program the MAC addresses for both
> > network interfaces in the network controllers, so that when booting
> > Linux, you get:
> > 
> > [   42.122881] mvneta d0070000.ethernet eth0: Using hardware mac address f0:ad:4e:01:5c:93
> > [   42.385398] mvneta d0074000.ethernet eth1: Using hardware mac address f0:ad:4e:01:5c:94
> > 
> > You add that to your default U-Boot boot script, and that's it, you
> > have stable MAC addresses.
> 
> Hmmm that's quite interesting. Unfortunately I don't see an easy way to
> make this directly rely on the ethaddr/eth1addr so that end users can
> simply cut-n-paste a few lines into the u-boot config. But anyway that
> can be useful.

I thought about this as well, but I don't think that's possible, the
U-Boot scripting/parsing capabilities seem to be too limited to
achieve that, unfortunately.

Best regards,

Thomas
-- 
Thomas Petazzoni, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH] net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
From: Willy Tarreau @ 2013-09-05  9:11 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: Lior Amsalem, Jochen De Smet, Simon Guinot, Ryan Press, netdev,
	vdonnefort, Ethan Tuttle, stable, Ezequiel Garcia,
	Chény Yves-Gael, Gregory Clement, Peter Sanford,
	David S. Miller, linux-arm-kernel, Jason Cooper
In-Reply-To: <20130905102659.25a0f064@skate>

On Thu, Sep 05, 2013 at 10:26:59AM +0200, Thomas Petazzoni wrote:
> > One simpler solution for them could be to slightly modify the boot loader
> > so that it sets the MAC address on the two ethernet controllers prior to
> > boot. Then your code which checks if a MAC is already set will simply
> > work.
> 
> This works when the network driver is compiled 'statically' inside the
> kernel. When compiled as a module, then the gatable clock of the
> network interface will be gated at the end of the kernel boot, before
> the mvneta module is probe. And gating the network interface clocks
> means that it will loose its state, including its MAC address. So it's
> not an entirely perfect solution either, but I admit that on such
> platforms, the network driver is most likely compiled statically, so it
> would probably suit the needs of most people.

Agreed.

> Note that this can be done without doing any change in the bootloader.
> For example, on a Mirabox, you can do:
> 
>    mw.l 0xD0072414 0x5C93; mw.l 0xD0072418 0xF0AD4E01; mw.l 0xD0076414 0x5C94; mw.l 0xD0076418 0xF0AD4E01; bootm
> 
> to boot your kernel. This will program the MAC addresses for both
> network interfaces in the network controllers, so that when booting
> Linux, you get:
> 
> [   42.122881] mvneta d0070000.ethernet eth0: Using hardware mac address f0:ad:4e:01:5c:93
> [   42.385398] mvneta d0074000.ethernet eth1: Using hardware mac address f0:ad:4e:01:5c:94
> 
> You add that to your default U-Boot boot script, and that's it, you
> have stable MAC addresses.

Hmmm that's quite interesting. Unfortunately I don't see an easy way to
make this directly rely on the ethaddr/eth1addr so that end users can
simply cut-n-paste a few lines into the u-boot config. But anyway that
can be useful.

> > A last one would be to have the mvneta module accept an array of addresses
> > as a module parameter. This way it would just require a minor change in the
> > kernel's cmdline to pass the MAC addresses. I remember seeing this in the
> > past, I don't remember the platform (maybe the NSLU2 but I could be wrong).
> 
> The situation of module parameters to pass MAC addresses was a bit
> fuzzy. There was once a proposal to add a generic kernel parameter to
> do this, but it was rejected by David Miller (I believe not on specific
> implementation details, but on the general idea). However, there are
> numerous drivers in the tree that do provide a custom module parameter
> to set MAC addresses.

Yes, I remember using this with the sunhme driver many years ago when
we did not have access to the onboard rom to retrieve the MAC address.

> However, with the above suggestion of U-Boot scripting, I believe we
> have a relatively easy solution for people to use.

We could provide a script to do it more conveniently for the user :-)

Best regards,
Willy

^ permalink raw reply

* Re: [PATCH] net: mvneta: properly disable HW PHY polling and ensure adjust_link() works
From: Thomas Petazzoni @ 2013-09-05  8:26 UTC (permalink / raw)
  To: Willy Tarreau
  Cc: Ethan Tuttle, David S. Miller, netdev, linux-arm-kernel,
	Lior Amsalem, Gregory Clement, Ezequiel Garcia, Jochen De Smet,
	Peter Sanford, Chény Yves-Gael, Ryan Press, Simon Guinot,
	vdonnefort, stable, Jason Cooper
In-Reply-To: <20130905074426.GD26000@1wt.eu>

Dear Willy Tarreau,

On Thu, 5 Sep 2013 09:44:26 +0200, Willy Tarreau wrote:

> On Thu, Sep 05, 2013 at 09:28:08AM +0200, Thomas Petazzoni wrote:
> > I indeed submitted a revised/improved version of your patches some time
> > ago, but they were rejected. See
> > http://lists.infradead.org/pipermail/linux-arm-kernel/2013-June/173201.html.
> 
> Thanks for the link, that's indeed what I was referring to. Well, at least
> the most boring part that constantly needed to be rebased was the DT patch.
> Now it's much easier to keep the small remaining patches in one's local tree.

Right.

> > Since this has been rejected, the available options are:
> > 
> >  (1) Use a DT-capable bootloader that will properly set the MAC
> >      addresses in the DT. Such DT capable bootloaders will soon be made
> >      available by Marvell, but I am really unsure Globalscale will
> >      provide an update for the Mirabox bootloader, rebased on the new
> >      Marvell bootloader that will be DT capable.
> 
> One simpler solution for them could be to slightly modify the boot loader
> so that it sets the MAC address on the two ethernet controllers prior to
> boot. Then your code which checks if a MAC is already set will simply
> work.

This works when the network driver is compiled 'statically' inside the
kernel. When compiled as a module, then the gatable clock of the
network interface will be gated at the end of the kernel boot, before
the mvneta module is probed. And gating the network interface clocks
means that it will lose its state, including its MAC address. So it's
not an entirely perfect solution either, but I admit that on such
platforms, the network driver is most likely compiled statically, so it
would probably suit the needs of most people.

Note that this can be done without doing any change in the bootloader.
For example, on a Mirabox, you can do:

   mw.l 0xD0072414 0x5C93; mw.l 0xD0072418 0xF0AD4E01; mw.l 0xD0076414 0x5C94; mw.l 0xD0076418 0xF0AD4E01; bootm

to boot your kernel. This will program the MAC addresses for both
network interfaces in the network controllers, so that when booting
Linux, you get:

[   42.122881] mvneta d0070000.ethernet eth0: Using hardware mac address f0:ad:4e:01:5c:93
[   42.385398] mvneta d0074000.ethernet eth1: Using hardware mac address f0:ad:4e:01:5c:94

You add that to your default U-Boot boot script, and that's it, you
have stable MAC addresses. Of course, as explained above, that doesn't
work if you build mvneta as a module.

> >      As an alternative, we (mainly Sebastian Hesselbarth and myself)
> >      have started adding Armada 370/XP support in the Barebox
> >      bootloader. We can already start Barebox on the Mirabox, but for
> >      now it's quite useless since only the serial port is supported,
> >      there is still no support for the network, SD card, USB or NAND.
> >      This will probably come over time, but it's not going to happen
> >      overnight.
> 
> And we must keep in mind that people are generally scared by boot loader
> upgrades, especially when it's for a different one. At least on this
> platform now we have a solution to reflash even after complete failures
> so this is less of a problem.

Right, with kwboot working very reliably on Mirabox, I believe this is
not really an issue.

> >  (2) Use the "impedance matcher" code written by Daniel Mack and
> >      extended by Jason Cooper, available at
> >      https://github.com/zonque/pxa-impedance-matcher. Essentially, it
> >      inserts a small binary between the installed bootloader and the
> >      kernel, that for example allows to choose a particular DTB amongst
> >      several, depending on the board that is detected. I believe it
> >      could probably be extended to cover other use cases such as
> >      modifying the DTB to add the MAC addresses where appropriate. I've
> >      added Jason Cooper in the Cc list if he wants to comment on that.
> 
> Could be a nice solution as well, indeed.
> 
> A last one would be to have the mvneta module accept an array of addresses
> as a module parameter. This way it would just require a minor change in the
> kernel's cmdline to pass the MAC addresses. I remember seeing this in the
> past, I don't remember the platform (maybe the NSLU2 but I could be wrong).

The situation of module parameters to pass MAC addresses was a bit
fuzzy. There was once a proposal to add a generic kernel parameter to
do this, but it was rejected by David Miller (I believe not on specific
implementation details, but on the general idea). However, there are
numerous drivers in the tree that do provide a custom module parameter
to set MAC addresses.

However, with the above suggestion of U-Boot scripting, I believe we
have a relatively easy solution for people to use.

Best regards,

Thomas
-- 
Thomas Petazzoni, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply

* [PATCH net-next v3 1/6] bonding: simplify and use RCU protection for 3ad xmit path
From: Ding Tianhong @ 2013-09-05  7:48 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

Commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
(bonding: initial RCU conversion) converted the roundrobin, active-backup,
broadcast and xor xmit paths to RCU protection, which improves performance
for these modes. This patch converts the xmit path for 3ad mode as well.

Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
Suggested-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
---
 drivers/net/bonding/bond_3ad.c | 32 ++++++++++++++------------------
 drivers/net/bonding/bonding.h  | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 0d8f427..13f1deb 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -143,7 +143,7 @@ static inline struct bonding *__get_bond_by_port(struct port *port)
  */
 static inline struct port *__get_first_port(struct bonding *bond)
 {
-	struct slave *first_slave = bond_first_slave(bond);
+	struct slave *first_slave = bond_first_slave_rcu(bond);
 
 	return first_slave ? &(SLAVE_AD_INFO(first_slave).port) : NULL;
 }
@@ -163,7 +163,7 @@ static inline struct port *__get_next_port(struct port *port)
 	// If there's no bond for this port, or this is the last slave
 	if (bond == NULL)
 		return NULL;
-	slave_next = bond_next_slave(bond, slave);
+	slave_next = bond_next_slave_rcu(bond, slave);
 	if (!slave_next || bond_is_first_slave(bond, slave_next))
 		return NULL;
 
@@ -2417,16 +2417,14 @@ int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info)
 
 int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 {
-	struct slave *slave, *start_at;
 	struct bonding *bond = netdev_priv(dev);
+	struct slave *slave;
 	int slave_agg_no;
 	int slaves_in_agg;
 	int agg_id;
-	int i;
 	struct ad_info ad_info;
 	int res = 1;
 
-	read_lock(&bond->lock);
 	if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
 		pr_debug("%s: Error: __bond_3ad_get_active_agg_info failed\n",
 			 dev->name);
@@ -2444,13 +2442,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 
 	slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg);
 
-	bond_for_each_slave(bond, slave) {
+	bond_for_each_slave_rcu(bond, slave) {
 		struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
 
 		if (agg && (agg->aggregator_identifier == agg_id)) {
-			slave_agg_no--;
-			if (slave_agg_no < 0)
-				break;
+			if (--slave_agg_no < 0) {
+				if (SLAVE_IS_OK(slave)) {
+					res = bond_dev_queue_xmit(bond,
+						skb, slave->dev);
+					goto out;
+				}
+			}
 		}
 	}
 
@@ -2460,23 +2462,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 		goto out;
 	}
 
-	start_at = slave;
-
-	bond_for_each_slave_from(bond, slave, i, start_at) {
-		int slave_agg_id = 0;
+	bond_for_each_slave_rcu(bond, slave) {
 		struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
 
-		if (agg)
-			slave_agg_id = agg->aggregator_identifier;
-
-		if (SLAVE_IS_OK(slave) && agg && (slave_agg_id == agg_id)) {
+		if (SLAVE_IS_OK(slave) && agg &&
+			agg->aggregator_identifier == agg_id) {
 			res = bond_dev_queue_xmit(bond, skb, slave->dev);
 			break;
 		}
 	}
 
 out:
-	read_unlock(&bond->lock);
 	if (res) {
 		/* no suitable interface, frame not sent */
 		kfree_skb(skb);
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index f7ab161..f013b12 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -74,13 +74,34 @@
 /* slave list primitives */
 #define bond_to_slave(ptr) list_entry(ptr, struct slave, list)
 
+/* slave list primitives, Caller must hold rcu_read_lock */
+#define bond_to_slave_rcu(ptr) list_entry_rcu(ptr, struct slave, list)
+
+/* bond_is_empty returns true if the slave list is empty */
+#define bond_is_empty(bond) \
+	(list_empty(&(bond)->slave_list))
+
+/* bond_is_empty_rcu returns true if the slave list is empty */
+#define bond_is_empty_rcu(bond) \
+	(!list_first_or_null_rcu(&(bond)->slave_list, struct slave, list))
+
 /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */
 #define bond_first_slave(bond) \
 	list_first_entry_or_null(&(bond)->slave_list, struct slave, list)
 #define bond_last_slave(bond) \
-	(list_empty(&(bond)->slave_list) ? NULL : \
+	(bond_is_empty(bond) ? NULL : \
 					   bond_to_slave((bond)->slave_list.prev))
 
+/**
+ * IMPORTANT: bond_first/last_slave_rcu can return NULL in case of an empty list
+ * Caller must hold rcu_read_lock
+ */
+#define bond_first_slave_rcu(bond) \
+	list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)
+#define bond_last_slave_rcu(bond) \
+	(bond_is_empty_rcu(bond) ? NULL : \
+					bond_to_slave_rcu((bond)->slave_list.prev))
+
 #define bond_is_first_slave(bond, pos) ((pos)->list.prev == &(bond)->slave_list)
 #define bond_is_last_slave(bond, pos) ((pos)->list.next == &(bond)->slave_list)
 
@@ -93,6 +114,15 @@
 	(bond_is_first_slave(bond, pos) ? bond_last_slave(bond) : \
 					  bond_to_slave((pos)->list.prev))
 
+/* Since bond_first/last_slave_rcu can return NULL, these can return NULL too */
+#define bond_next_slave_rcu(bond, pos) \
+	(bond_is_last_slave(bond, pos) ? bond_first_slave_rcu(bond) : \
+					 bond_to_slave_rcu((pos)->list.next))
+
+#define bond_prev_slave_rcu(bond, pos) \
+	(bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
+					  bond_to_slave_rcu((pos)->list.prev))
+
 /**
  * bond_for_each_slave_from - iterate the slaves list from a starting point
  * @bond:	the bond holding this list.
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH net-next v3 3/6] bonding: replace read_lock to rcu_read_lock for bond_3ad_get_active_agg_info()
From: Ding Tianhong @ 2013-09-05  7:48 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

The bond slave list will be protected by RCU, so the read lock should be
replaced by the RCU read lock.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_3ad.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index c134f43..b678502 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2408,9 +2408,9 @@ int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info)
 {
 	int ret;
 
-	read_lock(&bond->lock);
+	rcu_read_lock();
 	ret = __bond_3ad_get_active_agg_info(bond, ad_info);
-	read_unlock(&bond->lock);
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH net-next v3 2/6] bonding: remove the no effect lock for bond_3ad_lacpdu_recv()
From: Ding Tianhong @ 2013-09-05  7:48 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

No pointer here needs read lock protection, so remove the unnecessary lock
and improve performance for the 3ad recv path.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_3ad.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 7a3860f..c134f43 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2494,9 +2494,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
 	if (!lacpdu)
 		return ret;
 
-	read_lock(&bond->lock);
 	ret = bond_3ad_rx_indication(lacpdu, slave, skb->len);
-	read_unlock(&bond->lock);
 	return ret;
 }
 
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH net-next v3 6/6] bonding: use RCU protection for alb xmit path
From: Ding Tianhong @ 2013-09-05  7:49 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev, Hideaki YOSHIFUJI

Commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
(bonding: initial RCU conversion) converted the roundrobin, active-backup,
broadcast and xor xmit paths to RCU protection, which improves performance
for these modes. This patch converts the xmit path for alb mode as well.

Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
---
 drivers/net/bonding/bond_alb.c | 20 +++++++++-----------
 drivers/net/bonding/bonding.h  |  2 +-
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c75d383..c0bfd56 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -229,7 +229,7 @@ static struct slave *tlb_get_least_loaded_slave(struct bonding *bond)
 	max_gap = LLONG_MIN;
 
 	/* Find the slave with the largest gap */
-	bond_for_each_slave(bond, slave) {
+	bond_for_each_slave_rcu(bond, slave) {
 		if (SLAVE_IS_OK(slave)) {
 			long long gap = compute_gap(slave);
 
@@ -625,12 +625,14 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct arp_pkt *arp = arp_pkt(skb);
-	struct slave *assigned_slave;
+	struct slave *assigned_slave, *curr_active_slave;
 	struct rlb_client_info *client_info;
 	u32 hash_index = 0;
 
 	_lock_rx_hashtbl(bond);
 
+	curr_active_slave = rcu_dereference(bond->curr_active_slave);
+
 	hash_index = _simple_hash((u8 *)&arp->ip_dst, sizeof(arp->ip_dst));
 	client_info = &(bond_info->rx_hashtbl[hash_index]);
 
@@ -654,9 +656,9 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 			 * move the old client to primary (curr_active_slave) so
 			 * that the new client can be assigned to this entry.
 			 */
-			if (bond->curr_active_slave &&
-			    client_info->slave != bond->curr_active_slave) {
-				client_info->slave = bond->curr_active_slave;
+			if (curr_active_slave &&
+			    client_info->slave != curr_active_slave) {
+				client_info->slave = curr_active_slave;
 				rlb_update_client(client_info);
 			}
 		}
@@ -1338,8 +1340,6 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 
 	/* make sure that the curr_active_slave do not change during tx
 	 */
-	read_lock(&bond->lock);
-	read_lock(&bond->curr_slave_lock);
 
 	switch (ntohs(skb->protocol)) {
 	case ETH_P_IP: {
@@ -1422,12 +1422,12 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 
 	if (!tx_slave) {
 		/* unbalanced or unassigned, send through primary */
-		tx_slave = bond->curr_active_slave;
+		tx_slave = rcu_dereference(bond->curr_active_slave);
 		bond_info->unbalanced_load += skb->len;
 	}
 
 	if (tx_slave && SLAVE_IS_OK(tx_slave)) {
-		if (tx_slave != bond->curr_active_slave) {
+		if (tx_slave != rcu_dereference(bond->curr_active_slave)) {
 			memcpy(eth_data->h_source,
 			       tx_slave->dev->dev_addr,
 			       ETH_ALEN);
@@ -1442,8 +1442,6 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		}
 	}
 
-	read_unlock(&bond->curr_slave_lock);
-	read_unlock(&bond->lock);
 	if (res) {
 		/* no suitable interface, frame not sent */
 		kfree_skb(skb);
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 80b170a..4282e65 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -536,7 +536,7 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond,
 {
 	struct slave *tmp;
 
-	bond_for_each_slave(bond, tmp)
+	bond_for_each_slave_rcu(bond, tmp)
 		if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
 			return tmp;
 
-- 
1.8.2.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox