Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next v3 5/6] bonding: restructure and add rcu for bond_for_each_slave_next()
From: Ding Tianhong @ 2013-09-05 14:29 UTC (permalink / raw)
  To: Veaceslav Falico
  Cc: Ding Tianhong, Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <20130905133027.GA26163@redhat.com>

于 2013/9/5 21:30, Veaceslav Falico 写道:
> On Thu, Sep 05, 2013 at 03:49:01PM +0800, Ding Tianhong wrote:
>> Remove the wordy int and add bond_for_each_slave_next_rcu() for 
>> future use.
>>
>> Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>> Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>> Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>> Cc: Veaceslav Falico <vfalico@redhat.com>
>> ---
>> drivers/net/bonding/bond_alb.c | 3 +--
>> drivers/net/bonding/bond_main.c | 6 ++----
>> drivers/net/bonding/bonding.h | 23 +++++++++++++++++++----
>> 3 files changed, 22 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_alb.c 
>> b/drivers/net/bonding/bond_alb.c
>> index 91f179d..c75d383 100644
>> --- a/drivers/net/bonding/bond_alb.c
>> +++ b/drivers/net/bonding/bond_alb.c
>> @@ -383,7 +383,6 @@ static struct slave *rlb_next_rx_slave(struct 
>> bonding *bond)
>> {
>> struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
>> struct slave *rx_slave, *slave, *start_at;
>> - int i = 0;
>>
>> if (bond_info->next_rx_slave)
>> start_at = bond_info->next_rx_slave;
>> @@ -392,7 +391,7 @@ static struct slave *rlb_next_rx_slave(struct 
>> bonding *bond)
>>
>> rx_slave = NULL;
>>
>> - bond_for_each_slave_from(bond, slave, i, start_at) {
>> + bond_for_each_slave_from(bond, slave, start_at) {
>> if (SLAVE_IS_OK(slave)) {
>> if (!rx_slave) {
>> rx_slave = slave;
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 39e5b1c..4190389 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -782,7 +782,6 @@ static struct slave *bond_find_best_slave(struct 
>> bonding *bond)
>> struct slave *new_active, *old_active;
>> struct slave *bestslave = NULL;
>> int mintime = bond->params.updelay;
>> - int i;
>>
>> new_active = bond->curr_active_slave;
>>
>> @@ -801,7 +800,7 @@ static struct slave *bond_find_best_slave(struct 
>> bonding *bond)
>> /* remember where to stop iterating over the slaves */
>> old_active = new_active;
>>
>> - bond_for_each_slave_from(bond, new_active, i, old_active) {
>> + bond_for_each_slave_from(bond, new_active, old_active) {
>> if (new_active->link == BOND_LINK_UP) {
>> return new_active;
>> } else if (new_active->link == BOND_LINK_BACK &&
>> @@ -2756,7 +2755,6 @@ do_failover:
>> static void bond_ab_arp_probe(struct bonding *bond)
>> {
>> struct slave *slave, *next_slave;
>> - int i;
>>
>> read_lock(&bond->curr_slave_lock);
>>
>> @@ -2788,7 +2786,7 @@ static void bond_ab_arp_probe(struct bonding 
>> *bond)
>>
>> /* search for next candidate */
>> next_slave = bond_next_slave(bond, bond->current_arp_slave);
>> - bond_for_each_slave_from(bond, slave, i, next_slave) {
>> + bond_for_each_slave_from(bond, slave, next_slave) {
>> if (IS_UP(slave->dev)) {
>> slave->link = BOND_LINK_BACK;
>> bond_set_slave_active_flags(slave);
>> diff --git a/drivers/net/bonding/bonding.h 
>> b/drivers/net/bonding/bonding.h
>> index f013b12..48fd41a 100644
>> --- a/drivers/net/bonding/bonding.h
>> +++ b/drivers/net/bonding/bonding.h
>> @@ -123,18 +123,33 @@
>> (bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
>> bond_to_slave_rcu((pos)->list.prev))
>>
>> +/* Check whether the slave is the only one in bond */
>> +
>> /**
>> * bond_for_each_slave_from - iterate the slaves list from a starting 
>> point
>> * @bond: the bond holding this list.
>> * @pos: current slave.
>> - * @cnt: counter for max number of moves
>> * @start: starting point.
>> *
>> * Caller must hold bond->lock
>> */
>> -#define bond_for_each_slave_from(bond, pos, cnt, start) \
>> - for (cnt = 0, pos = start; pos && cnt < (bond)->slave_cnt; \
>> - cnt++, pos = bond_next_slave(bond, pos))
>> +#define bond_for_each_slave_from(bond, pos, start) \
>> + for (pos = start; pos && &pos->list != &bond->slave_list; \
>> + (pos = bond_next_slave(bond, pos)) != start ? pos : \
>> + (pos = list_entry(&bond->slave_list, typeof(*pos), list)))
>
> Do you understand the differences of these two implementations?
>
> pos && cnt < (bond)->slave_cnt
>
> vs
>
> pos && &pos->list != &bond->slave_list
>
> I'm sorry, but you should listen to comments.
>
if the slave is only one, it will run once, and out the loop, else if
the slave is more than one, it will run until reach the start again,
here I use the &bond->slave_list to control the loop, as it will not
change any time, maybe it is hard to understand first.

ok, I think they are same, just in the purpose to remove the cnt, so 
maybe it make you
unconfortable, sorry about that, if you strangly disagree, I will miss 
this patch, but
thanks again for your different opinions. :)

>> +
>> +/**
>> + * bond_for_each_slave_from_rcu - iterate the slaves list from a 
>> starting point
>> + * @bond: the bond holding this list.
>> + * @pos: current slave.
>> + * @start: starting point.
>> + *
>> + * Caller must hold rcu_read_lock
>> + */
>> +#define bond_for_each_slave_from_rcu(bond, pos, start) \
>> + for (pos = start; pos && &pos->list != &bond->slave_list; \
>> + (pos = bond_next_slave_rcu(bond, pos)) != start ? pos : \
>> + (pos = list_entry_rcu(&bond->slave_list, typeof(*pos), list)))
>>
>> /**
>> * bond_for_each_slave - iterate over all slaves
>> -- 
>> 1.8.2.1
>>
>>
>>
> -- 
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH net-next v3 1/6] bonding: simplify and use RCU protection for 3ad xmit path
From: Ding Tianhong @ 2013-09-05 14:42 UTC (permalink / raw)
  To: Veaceslav Falico
  Cc: Ding Tianhong, Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Netdev
In-Reply-To: <52289289.2030606@gmail.com>

于 2013/9/5 22:17, Ding Tianhong 写道:
> 于 2013/9/5 21:42, Veaceslav Falico 写道:
>> On Thu, Sep 05, 2013 at 03:48:44PM +0800, Ding Tianhong wrote:
>>> The commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
>>> (bonding: initial RCU conversion) has convert the roundrobin, 
>>> active-backup,
>>> broadcast and xor xmit path to rcu protection, the performance will 
>>> be better
>>> for these mode, so this time, convert xmit path for 3ad mode.
>>>
>>> Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
>>> Suggested-by: Veaceslav Falico <vfalico@redhat.com>
>>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>>> Signed-off-by: Wang Yufen <wangyufen@huawei.com>
>>> Cc: Nikolay Aleksandrov <nikolay@redhat.com>
>>> Cc: Veaceslav Falico <vfalico@redhat.com>
>>> ---
>>> drivers/net/bonding/bond_3ad.c | 32 ++++++++++++++------------------
>>> drivers/net/bonding/bonding.h | 32 +++++++++++++++++++++++++++++++-
>>> 2 files changed, 45 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/net/bonding/bond_3ad.c 
>>> b/drivers/net/bonding/bond_3ad.c
>>> index 0d8f427..13f1deb 100644
>>> --- a/drivers/net/bonding/bond_3ad.c
>>> +++ b/drivers/net/bonding/bond_3ad.c
>>> @@ -143,7 +143,7 @@ static inline struct bonding 
>>> *__get_bond_by_port(struct port *port)
>>> */
>>> static inline struct port *__get_first_port(struct bonding *bond)
>>> {
>>> - struct slave *first_slave = bond_first_slave(bond);
>>> + struct slave *first_slave = bond_first_slave_rcu(bond);
>>>
>>> return first_slave ? &(SLAVE_AD_INFO(first_slave).port) : NULL;
>>> }
>>> @@ -163,7 +163,7 @@ static inline struct port 
>>> *__get_next_port(struct port *port)
>>> // If there's no bond for this port, or this is the last slave
>>> if (bond == NULL)
>>> return NULL;
>>> - slave_next = bond_next_slave(bond, slave);
>>> + slave_next = bond_next_slave_rcu(bond, slave);
>>> if (!slave_next || bond_is_first_slave(bond, slave_next))
>>> return NULL;
>>>
>>> @@ -2417,16 +2417,14 @@ int bond_3ad_get_active_agg_info(struct 
>>> bonding *bond, struct ad_info *ad_info)
>>>
>>> int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
>>> {
>>> - struct slave *slave, *start_at;
>>> struct bonding *bond = netdev_priv(dev);
>>> + struct slave *slave;
>>> int slave_agg_no;
>>> int slaves_in_agg;
>>> int agg_id;
>>> - int i;
>>> struct ad_info ad_info;
>>> int res = 1;
>>>
>>> - read_lock(&bond->lock);
>>> if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
>>> pr_debug("%s: Error: __bond_3ad_get_active_agg_info failed\n",
>>> dev->name);
>>> @@ -2444,13 +2442,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, 
>>> struct net_device *dev)
>>>
>>> slave_agg_no = bond->xmit_hash_policy(skb, slaves_in_agg);
>>>
>>> - bond_for_each_slave(bond, slave) {
>>> + bond_for_each_slave_rcu(bond, slave) {
>>> struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>>>
>>> if (agg && (agg->aggregator_identifier == agg_id)) {
>>> - slave_agg_no--;
>>> - if (slave_agg_no < 0)
>>> - break;
>>> + if (--slave_agg_no < 0) {
>>> + if (SLAVE_IS_OK(slave)) {
>>> + res = bond_dev_queue_xmit(bond,
>>> + skb, slave->dev);
>>> + goto out;
>>> + }
>>> + }
>>> }
>>> }
>>>
>>> @@ -2460,23 +2462,17 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, 
>>> struct net_device *dev)
>>> goto out;
>>> }
>>>
>>> - start_at = slave;
>>> -
>>> - bond_for_each_slave_from(bond, slave, i, start_at) {
>>> - int slave_agg_id = 0;
>>> + bond_for_each_slave_rcu(bond, slave) {
>>> struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
>>>
>>> - if (agg)
>>> - slave_agg_id = agg->aggregator_identifier;
>>> -
>>> - if (SLAVE_IS_OK(slave) && agg && (slave_agg_id == agg_id)) {
>>> + if (SLAVE_IS_OK(slave) && agg &&
>>> + agg->aggregator_identifier == agg_id) {
>>> res = bond_dev_queue_xmit(bond, skb, slave->dev);
>>> break;
>>> }
>>> }
>>>
>>> out:
>>> - read_unlock(&bond->lock);
>>> if (res) {
>>> /* no suitable interface, frame not sent */
>>> kfree_skb(skb);
>>> diff --git a/drivers/net/bonding/bonding.h 
>>> b/drivers/net/bonding/bonding.h
>>> index f7ab161..f013b12 100644
>>> --- a/drivers/net/bonding/bonding.h
>>> +++ b/drivers/net/bonding/bonding.h
>>> @@ -74,13 +74,34 @@
>>> /* slave list primitives */
>>> #define bond_to_slave(ptr) list_entry(ptr, struct slave, list)
>>>
>>> +/* slave list primitives, Caller must hold rcu_read_lock */
>>> +#define bond_to_slave_rcu(ptr) list_entry_rcu(ptr, struct slave, list)
>>> +
>>> +/* bond_is_empty return NULL if slave list is empty*/
>>> +#define bond_is_empty(bond) \
>>> + (list_empty(&(bond)->slave_list))
>>> +
>>> +/* bond_is_empty_rcu return NULL if slave list is empty*/
>>> +#define bond_is_empty_rcu(bond) \
>>> + (!list_first_or_null_rcu(&(bond)->slave_list, struct slave, list))
>>> +
>>> /* IMPORTANT: bond_first/last_slave can return NULL in case of an 
>>> empty list */
>>> #define bond_first_slave(bond) \
>>> list_first_entry_or_null(&(bond)->slave_list, struct slave, list)
>>> #define bond_last_slave(bond) \
>>> - (list_empty(&(bond)->slave_list) ? NULL : \
>>> + (bond_is_empty(bond) ? NULL : \
>>> bond_to_slave((bond)->slave_list.prev))
>>>
>>> +/**
>>> + * IMPORTANT: bond_first/last_slave_rcu can return NULL in case of 
>>> an empty list
>>> + * Caller must hold rcu_read_lock
>>> + */
>>> +#define bond_first_slave_rcu(bond) \
>>> + list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)
>>> +#define bond_last_slave_rcu(bond) \
>>> + (bond_is_empty_rcu(bond) ? NULL : \
>>> + bond_to_slave_rcu((bond)->slave_list.prev))
>>
>> Really?
>>
>> No. Again - take a look at list_first_or_null_rcu. And its comments.
>>
>> I'm sorry that I'm acting that negatively... But if that gets accepted -
>> I'll have days of nightmares.
>>
>> Try to understand how RCU works, please, before sending patches using 
>> it.
>>
> I am sad to here that, I think I had good understand of rcu, but maybe 
> miss something.
> #define bond_first_slave_rcu(bond) \
> (bond_is_empty_rcu(bond) ? NULL : \
> bond_to_slave_rcu((bond)->slave_list.next))
>
> vs
> list_first_or_null_rcu(&(bond)->slave_list, struct slave, list)
>
> maybe the first one is confort you, but I think the second is racy and 
> simple, maybe I still
> miss something, I need a coffee to clear up.
>

fprgot it

best regards
Ding Tianhong

>
>>> +
>>> #define bond_is_first_slave(bond, pos) ((pos)->list.prev == 
>>> &(bond)->slave_list)
>>> #define bond_is_last_slave(bond, pos) ((pos)->list.next == 
>>> &(bond)->slave_list)
>>>
>>> @@ -93,6 +114,15 @@
>>> (bond_is_first_slave(bond, pos) ? bond_last_slave(bond) : \
>>> bond_to_slave((pos)->list.prev))
>>>
>>> +/* Since bond_first/last_slave_rcu can return NULL, these can 
>>> return NULL too */
>>> +#define bond_next_slave_rcu(bond, pos) \
>>> + (bond_is_last_slave(bond, pos) ? bond_first_slave_rcu(bond) : \
>>> + bond_to_slave_rcu((pos)->list.next))
>>> +
>>> +#define bond_prev_slave_rcu(bond, pos) \
>>> + (bond_is_first_slave(bond, pos) ? bond_last_slave_rcu(bond) : \
>>> + bond_to_slave_rcu((pos)->list.prev))
>>> +
>>> /**
>>> * bond_for_each_slave_from - iterate the slaves list from a starting 
>>> point
>>> * @bond: the bond holding this list.
>>> -- 
>>> 1.8.2.1
>>>
>>>
>>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>

^ permalink raw reply

* Fw: [Bug 60853] New: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]
From: Stephen Hemminger @ 2013-09-05 15:04 UTC (permalink / raw)
  To: netdev



Begin forwarded message:

Date: Wed, 4 Sep 2013 20:02:15 -0700
From: "bugzilla-daemon@bugzilla.kernel.org" <bugzilla-daemon@bugzilla.kernel.org>
To: "stephen@networkplumber.org" <stephen@networkplumber.org>
Subject: [Bug 60853] New: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]


https://bugzilla.kernel.org/show_bug.cgi?id=60853

            Bug ID: 60853
           Summary: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]
           Product: Networking
           Version: 2.5
    Kernel Version: 2.6.32.43-0.4-default
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: IPV4
          Assignee: shemminger@linux-foundation.org
          Reporter: lizhao09@huawei.com
        Regression: No

[10542399.515396] BUG: unable to handle kernel NULL pointer dereference at
000000000000003e
[10542399.523469] IP: [<ffffffffa1491a4b>] find_appropriate_src+0xdb/0x1a0
[nf_nat]
[10542399.530843] PGD 17f55ec067 PUD 17fba37067 PMD 0
[10542399.535727] Oops: 0000 [#1] SMP
[10542399.539220] last sysfs file:
/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
[10542399.547355] CPU 8
[10542399.647544] Supported: Yes, External
[10542399.651361] Pid: 0, comm: swapper Tainted: P          NX
2.6.32.43-0.4-default #1 Thurley
[10542399.659755] RIP: 0010:[<ffffffffa1491a4b>]  [<ffffffffa1491a4b>]
find_appropriate_src+0xdb/0x1a0 [nf_nat]
[10542399.669552] RSP: 0018:ffff88002c3039f0  EFLAGS: 00010286
[10542399.675095] RAX: 0000000000000000 RBX: ffff8817814beb90 RCX:
0000000024852261
[10542399.682454] RDX: 0000000000000000 RSI: 00000000327c4d71 RDI:
ffffffff81cd4dc0
[10542399.689812] RBP: ffff88002c303ad0 R08: 0000000000000011 R09:
0000000000000002
[10542399.697170] R10: 0000000000004000 R11: ffffffffa14726e0 R12:
ffff88002c303aa0
[10542399.704529] R13: ffff88002c303b40 R14: ffff88002c303b4c R15:
ffff88002c303b4e
[10542399.711888] FS:  0000000000000000(0000) GS:ffff88002c300000(0000)
knlGS:0000000000000000
[10542399.720199] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[10542399.726175] CR2: 000000000000003e CR3: 00000017f67f1000 CR4:
00000000000006e0
[10542399.733534] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[10542399.740893] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[10542399.748254] Process swapper (pid: 0, threadinfo ffff881810db2000, task
ffff881810db0080)
[10542399.756560] Stack:
[10542399.758821]  00000000ffffffff ffff88002c303aa0 ffff88002c303ad0
ffff88002c303b40
[10542399.766301] <0> 0000000000000000 ffff8817f7d639e8 0000000000000100
ffffffffa1491beb
[10542399.774237] <0> ffff88002c303ad0 ffff8817f7d639e8 ffff88002c303b40
ffff88002c303aa0
[10542399.782365] Call Trace:
[10542399.785085]  [<ffffffffa1491beb>] get_unique_tuple+0xdb/0x240 [nf_nat]
[10542399.791847]  [<ffffffffa1491de9>] nf_nat_setup_info+0x99/0x350 [nf_nat]
[10542399.798697]  [<ffffffffa149e162>] alloc_null_binding+0x52/0x90
[iptable_nat]
[10542399.805977]  [<ffffffffa149e519>] nf_nat_fn+0x1e9/0x280 [iptable_nat]
[10542399.812654]  [<ffffffff81318d18>] nf_iterate+0x68/0xa0
[10542399.818031]  [<ffffffff81318db2>] nf_hook_slow+0x62/0xf0
[10542399.823582]  [<ffffffff813214a1>] ip_local_deliver+0x51/0x80
[10542399.829477]  [<ffffffff81320a59>] ip_rcv_finish+0x1b9/0x440
[10542399.835288]  [<ffffffff812f5f89>] netif_receive_skb+0x599/0x6a0
[10542399.841454]  [<ffffffffa0ea4837>] ixgbe_clean_rx_irq+0x3d7/0xe50 [ixgbe]
[10542399.848397]  [<ffffffffa0ea53e4>] ixgbe_clean_rxtx_many+0x134/0x270
[ixgbe]
[10542399.855595]  [<ffffffff812f6863>] net_rx_action+0xe3/0x1a0
[10542399.861318]  [<ffffffff810533ef>] __do_softirq+0xbf/0x170
[10542399.866956]  [<ffffffff810040bc>] call_softirq+0x1c/0x30
[10542399.872506]  [<ffffffff81005cfd>] do_softirq+0x4d/0x80
[10542399.877883]  [<ffffffff81053275>] irq_exit+0x85/0x90
[10542399.883087]  [<ffffffff8100525e>] do_IRQ+0x6e/0xe0
[10542399.888120]  [<ffffffff81003913>] ret_from_intr+0x0/0xa
[10542399.893582]  [<ffffffff8100ae42>] mwait_idle+0x62/0x70
[10542399.898957]  [<ffffffff8100204a>] cpu_idle+0x5a/0xb0
[10542399.904159] Code: 00 00 00 4d 8d 7d 0e 4d 8d 75 0c 48 89 c3 eb 14 48 8b
03 48 85 c0 0f 84 84 00 00 00 44 0f b6 45 26 48 89 c3 48 8b 53 20 48 8b 03 <44>
38 42 3e 0f 18 08 75 dc 8b 42 18 3b 45 00 75 d4 0f b7 42 28

From the vmcore,we found that: 

1 OOPS occured at the statement 't->dst.protonum == tuple->dst.protonum' in
inline function same_src. 

2 The first parameter of same_src "ct" is NULL,The value of 'ct' came from 'ct
= nat->ct'.

3 Read the content of the 'nat', all member's value are zero.


static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
    struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

    if (nat == NULL || nat->ct == NULL)
        return;

    NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);

    spin_lock_bh(&nf_nat_lock);
    hlist_del_rcu(&nat->bysource); 
    spin_unlock_bh(&nf_nat_lock);
}

void nf_conntrack_free(struct nf_conn *ct)
{
  struct net *net = nf_ct_net(ct);
  nf_ct_ext_destroy(ct); //For NAT，it will call nf_nat_cleanup_conntrack
  atomic_dec(&net->ct.count);    
  nf_ct_ext_free(ct);  // Free nat-extention memory by kfree; is it possible
that the extention was still used in a RCU read side ?
  kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Fw: [Bug 60856] New: Enabling PCI pass-through triggers circular locking complaint
From: Stephen Hemminger @ 2013-09-05 15:06 UTC (permalink / raw)
  To: Amir Vadai; +Cc: netdev



Begin forwarded message:

Date: Thu, 5 Sep 2013 02:54:21 -0700
From: "bugzilla-daemon@bugzilla.kernel.org" <bugzilla-daemon@bugzilla.kernel.org>
To: "stephen@networkplumber.org" <stephen@networkplumber.org>
Subject: [Bug 60856] New: Enabling PCI pass-through triggers circular locking complaint


https://bugzilla.kernel.org/show_bug.cgi?id=60856

            Bug ID: 60856
           Summary: Enabling PCI pass-through triggers circular locking
                    complaint
           Product: Networking
           Version: 2.5
    Kernel Version: 3.11
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: Other
          Assignee: shemminger@linux-foundation.org
          Reporter: bvanassche@acm.org
        Regression: Yes

When I enable PCI pass-through for an mlx4 HCA, a circular locking complaint is
reported.

PCI pass-through was enabled with the following script:

#!/bin/bash
vendor_id="15b3" # Mellanox
device_id="1003" # MT27500 Family [ConnectX-3]

modprobe pci_stub &&
echo "$vendor_id $device_id" >/sys/bus/pci/drivers/pci-stub/new_id &&
lspci -n -mm |
while read slot class vendor device rest; do
    slot="0000:${slot}"
    vendor="${vendor#\"}"
    vendor="${vendor%\"}"
    device="${device#\"}"
    device="${device%\"}"
    if [ "$vendor" = "$vendor_id" -a "$device" = "$device_id" ]; then
    echo "$slot" >/sys/bus/pci/devices/$slot/driver/unbind
    echo "$slot" >/sys/bus/pci/drivers/pci-stub/bind
    fi
done

Running the above script triggered the following lockdep complaint:

======================================================
[ INFO: possible circular locking dependency detected ]
3.11.0-debug+ #1 Not tainted
-------------------------------------------------------
assign-pci-dev-/3065 is trying to acquire lock:
 (s_active#79){++++.+}, at: [<ffffffff811d816b>] sysfs_addrm_finish+0x3b/0x70

but task is already holding lock:
 (rtnl_mutex){+.+.+.}, at: [<ffffffff81384117>] rtnl_lock+0x17/0x20

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (rtnl_mutex){+.+.+.}:
       [<ffffffff810a5eca>] lock_acquire+0x8a/0x120
       [<ffffffff8144c16d>] mutex_lock_nested+0x7d/0x380
       [<ffffffff81384117>] rtnl_lock+0x17/0x20
       [<ffffffffa048fb7e>] ipoib_set_mode+0xde/0xf0 [ib_ipoib]
       [<ffffffffa049686a>] set_mode+0x3a/0x90 [ib_ipoib]
       [<ffffffff812eec88>] dev_attr_store+0x18/0x30
       [<ffffffff811d6654>] sysfs_write_file+0xe4/0x150
       [<ffffffff81167f24>] vfs_write+0xc4/0x1e0
       [<ffffffff811683e5>] SyS_write+0x55/0xa0
       [<ffffffff81459fc2>] system_call_fastpath+0x16/0x1b

-> #0 (s_active#79){++++.+}:
       [<ffffffff810a57a6>] __lock_acquire+0x1d36/0x1e40
       [<ffffffff810a5eca>] lock_acquire+0x8a/0x120
       [<ffffffff811d75b6>] sysfs_deactivate+0x126/0x180
       [<ffffffff811d816b>] sysfs_addrm_finish+0x3b/0x70
       [<ffffffff811d86af>] sysfs_remove_dir+0x9f/0xd0
       [<ffffffff81228666>] kobject_del+0x16/0x40
       [<ffffffff812f037a>] device_del+0x18a/0x1d0
       [<ffffffff8138daf1>] netdev_unregister_kobject+0x71/0x80
       [<ffffffff8137339c>] rollback_registered_many+0x16c/0x220
       [<ffffffff81373661>] rollback_registered+0x31/0x40
       [<ffffffff81373ce8>] unregister_netdevice_queue+0x58/0xa0
       [<ffffffff81373e20>] unregister_netdev+0x20/0x30
       [<ffffffffa048dc01>] ipoib_remove_one+0xb1/0xf0 [ib_ipoib]
       [<ffffffffa035e57e>] ib_unregister_device+0x4e/0x110 [ib_core]
       [<ffffffffa044ebde>] mlx4_ib_remove+0x2e/0x1a0 [mlx4_ib]
       [<ffffffffa037e24b>] mlx4_remove_device+0x7b/0x90 [mlx4_core]
       [<ffffffffa037e5ab>] mlx4_unregister_device+0x4b/0x90 [mlx4_core]
       [<ffffffffa037fca4>] mlx4_remove_one+0x54/0x330 [mlx4_core]
       [<ffffffff812542a6>] pci_device_remove+0x46/0xc0
       [<ffffffff812f39ef>] __device_release_driver+0x7f/0xf0
       [<ffffffff812f3d4e>] device_release_driver+0x2e/0x40
       [<ffffffff812f28e3>] driver_unbind+0xa3/0xc0
       [<ffffffff812f1c24>] drv_attr_store+0x24/0x40
       [<ffffffff811d6654>] sysfs_write_file+0xe4/0x150
       [<ffffffff81167f24>] vfs_write+0xc4/0x1e0
       [<ffffffff811683e5>] SyS_write+0x55/0xa0
       [<ffffffff81459fc2>] system_call_fastpath+0x16/0x1b

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(rtnl_mutex);
                               lock(s_active#79);
                               lock(rtnl_mutex);
  lock(s_active#79);

 *** DEADLOCK ***

8 locks held by assign-pci-dev-/3065:
 #0:  (sb_writers#6){.+.+.+}, at: [<ffffffff81168003>] vfs_write+0x1a3/0x1e0
 #1:  (&buffer->mutex){+.+.+.}, at: [<ffffffff811d65b8>]
sysfs_write_file+0x48/0x150
 #2:  (s_active#185){.+.+.+}, at: [<ffffffff811d663c>]
sysfs_write_file+0xcc/0x150
 #3:  (&__lockdep_no_validate__){......}, at: [<ffffffff812f28db>]
driver_unbind+0x9b/0xc0
 #4:  (&__lockdep_no_validate__){......}, at: [<ffffffff812f3d46>]
device_release_driver+0x26/0x40
 #5:  (intf_mutex){+.+.+.}, at: [<ffffffffa037e583>]
mlx4_unregister_device+0x23/0x90 [mlx4_core]
 #6:  (device_mutex){+.+.+.}, at: [<ffffffffa035e557>]
ib_unregister_device+0x27/0x110 [ib_core]
 #7:  (rtnl_mutex){+.+.+.}, at: [<ffffffff81384117>] rtnl_lock+0x17/0x20

stack backtrace:
CPU: 1 PID: 3065 Comm: assign-pci-dev- Not tainted 3.11.0-debug+ #1
Hardware name: System manufacturer P5Q DELUXE/P5Q DELUXE, BIOS 2301
07/10/2009
 ffffffff81d67910 ffff8801b265d848 ffffffff8144973f 0000000000000007
 ffffffff81d67910 ffff8801b265d898 ffffffff814467da 0000000000000086
 ffff8801b265d928 ffff8801b37f5278 ffff8801b37f52b0 ffff8801b37f5278
Call Trace:
 [<ffffffff8144973f>] dump_stack+0x55/0x76
 [<ffffffff814467da>] print_circular_bug+0x1fb/0x20c
 [<ffffffff810a57a6>] __lock_acquire+0x1d36/0x1e40
 [<ffffffff8107c9e5>] ? sched_clock_local+0x25/0xa0
 [<ffffffff810a5eca>] lock_acquire+0x8a/0x120
 [<ffffffff811d816b>] ? sysfs_addrm_finish+0x3b/0x70
 [<ffffffff811d75b6>] sysfs_deactivate+0x126/0x180
 [<ffffffff811d816b>] ? sysfs_addrm_finish+0x3b/0x70
 [<ffffffff810a6829>] ? mark_held_locks+0xb9/0x140
 [<ffffffff811d816b>] sysfs_addrm_finish+0x3b/0x70
 [<ffffffff811d86af>] sysfs_remove_dir+0x9f/0xd0
 [<ffffffff81228666>] kobject_del+0x16/0x40
 [<ffffffff812f037a>] device_del+0x18a/0x1d0
 [<ffffffff8138daf1>] netdev_unregister_kobject+0x71/0x80
 [<ffffffff8137339c>] rollback_registered_many+0x16c/0x220
 [<ffffffff81384117>] ? rtnl_lock+0x17/0x20
 [<ffffffff81373661>] rollback_registered+0x31/0x40
 [<ffffffff81373ce8>] unregister_netdevice_queue+0x58/0xa0
 [<ffffffff81373e20>] unregister_netdev+0x20/0x30
 [<ffffffffa048dc01>] ipoib_remove_one+0xb1/0xf0 [ib_ipoib]
 [<ffffffffa035e57e>] ib_unregister_device+0x4e/0x110 [ib_core]
 [<ffffffffa044ebde>] mlx4_ib_remove+0x2e/0x1a0 [mlx4_ib]
 [<ffffffffa037e24b>] mlx4_remove_device+0x7b/0x90 [mlx4_core]
 [<ffffffffa037e5ab>] mlx4_unregister_device+0x4b/0x90 [mlx4_core]
 [<ffffffffa037fca4>] mlx4_remove_one+0x54/0x330 [mlx4_core]
 [<ffffffff812542a6>] pci_device_remove+0x46/0xc0
 [<ffffffff812f39ef>] __device_release_driver+0x7f/0xf0
 [<ffffffff812f3d4e>] device_release_driver+0x2e/0x40
 [<ffffffff812f28e3>] driver_unbind+0xa3/0xc0
 [<ffffffff812f1c24>] drv_attr_store+0x24/0x40
 [<ffffffff811d6654>] sysfs_write_file+0xe4/0x150
 [<ffffffff81167f24>] vfs_write+0xc4/0x1e0
 [<ffffffff811683e5>] SyS_write+0x55/0xa0
 [<ffffffff81459fc2>] system_call_fastpath+0x16/0x1b

--
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Fw: [Bug 60853] New: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]
From: Stephen Hemminger @ 2013-09-05 15:06 UTC (permalink / raw)
  To: netdev



Begin forwarded message:

Date: Wed, 4 Sep 2013 20:02:15 -0700
From: "bugzilla-daemon@bugzilla.kernel.org" <bugzilla-daemon@bugzilla.kernel.org>
To: "stephen@networkplumber.org" <stephen@networkplumber.org>
Subject: [Bug 60853] New: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]


https://bugzilla.kernel.org/show_bug.cgi?id=60853

            Bug ID: 60853
           Summary: OOPS at find_appropriate_src+0xdb/0x1a0 [nf_nat]
           Product: Networking
           Version: 2.5
    Kernel Version: 2.6.32.43-0.4-default
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: normal
          Priority: P1
         Component: IPV4
          Assignee: shemminger@linux-foundation.org
          Reporter: lizhao09@huawei.com
        Regression: No

[10542399.515396] BUG: unable to handle kernel NULL pointer dereference at
000000000000003e
[10542399.523469] IP: [<ffffffffa1491a4b>] find_appropriate_src+0xdb/0x1a0
[nf_nat]
[10542399.530843] PGD 17f55ec067 PUD 17fba37067 PMD 0
[10542399.535727] Oops: 0000 [#1] SMP
[10542399.539220] last sysfs file:
/sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
[10542399.547355] CPU 8
[10542399.647544] Supported: Yes, External
[10542399.651361] Pid: 0, comm: swapper Tainted: P          NX
2.6.32.43-0.4-default #1 Thurley
[10542399.659755] RIP: 0010:[<ffffffffa1491a4b>]  [<ffffffffa1491a4b>]
find_appropriate_src+0xdb/0x1a0 [nf_nat]
[10542399.669552] RSP: 0018:ffff88002c3039f0  EFLAGS: 00010286
[10542399.675095] RAX: 0000000000000000 RBX: ffff8817814beb90 RCX:
0000000024852261
[10542399.682454] RDX: 0000000000000000 RSI: 00000000327c4d71 RDI:
ffffffff81cd4dc0
[10542399.689812] RBP: ffff88002c303ad0 R08: 0000000000000011 R09:
0000000000000002
[10542399.697170] R10: 0000000000004000 R11: ffffffffa14726e0 R12:
ffff88002c303aa0
[10542399.704529] R13: ffff88002c303b40 R14: ffff88002c303b4c R15:
ffff88002c303b4e
[10542399.711888] FS:  0000000000000000(0000) GS:ffff88002c300000(0000)
knlGS:0000000000000000
[10542399.720199] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[10542399.726175] CR2: 000000000000003e CR3: 00000017f67f1000 CR4:
00000000000006e0
[10542399.733534] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[10542399.740893] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[10542399.748254] Process swapper (pid: 0, threadinfo ffff881810db2000, task
ffff881810db0080)
[10542399.756560] Stack:
[10542399.758821]  00000000ffffffff ffff88002c303aa0 ffff88002c303ad0
ffff88002c303b40
[10542399.766301] <0> 0000000000000000 ffff8817f7d639e8 0000000000000100
ffffffffa1491beb
[10542399.774237] <0> ffff88002c303ad0 ffff8817f7d639e8 ffff88002c303b40
ffff88002c303aa0
[10542399.782365] Call Trace:
[10542399.785085]  [<ffffffffa1491beb>] get_unique_tuple+0xdb/0x240 [nf_nat]
[10542399.791847]  [<ffffffffa1491de9>] nf_nat_setup_info+0x99/0x350 [nf_nat]
[10542399.798697]  [<ffffffffa149e162>] alloc_null_binding+0x52/0x90
[iptable_nat]
[10542399.805977]  [<ffffffffa149e519>] nf_nat_fn+0x1e9/0x280 [iptable_nat]
[10542399.812654]  [<ffffffff81318d18>] nf_iterate+0x68/0xa0
[10542399.818031]  [<ffffffff81318db2>] nf_hook_slow+0x62/0xf0
[10542399.823582]  [<ffffffff813214a1>] ip_local_deliver+0x51/0x80
[10542399.829477]  [<ffffffff81320a59>] ip_rcv_finish+0x1b9/0x440
[10542399.835288]  [<ffffffff812f5f89>] netif_receive_skb+0x599/0x6a0
[10542399.841454]  [<ffffffffa0ea4837>] ixgbe_clean_rx_irq+0x3d7/0xe50 [ixgbe]
[10542399.848397]  [<ffffffffa0ea53e4>] ixgbe_clean_rxtx_many+0x134/0x270
[ixgbe]
[10542399.855595]  [<ffffffff812f6863>] net_rx_action+0xe3/0x1a0
[10542399.861318]  [<ffffffff810533ef>] __do_softirq+0xbf/0x170
[10542399.866956]  [<ffffffff810040bc>] call_softirq+0x1c/0x30
[10542399.872506]  [<ffffffff81005cfd>] do_softirq+0x4d/0x80
[10542399.877883]  [<ffffffff81053275>] irq_exit+0x85/0x90
[10542399.883087]  [<ffffffff8100525e>] do_IRQ+0x6e/0xe0
[10542399.888120]  [<ffffffff81003913>] ret_from_intr+0x0/0xa
[10542399.893582]  [<ffffffff8100ae42>] mwait_idle+0x62/0x70
[10542399.898957]  [<ffffffff8100204a>] cpu_idle+0x5a/0xb0
[10542399.904159] Code: 00 00 00 4d 8d 7d 0e 4d 8d 75 0c 48 89 c3 eb 14 48 8b
03 48 85 c0 0f 84 84 00 00 00 44 0f b6 45 26 48 89 c3 48 8b 53 20 48 8b 03 <44>
38 42 3e 0f 18 08 75 dc 8b 42 18 3b 45 00 75 d4 0f b7 42 28

From the vmcore,we found that: 

1 OOPS occured at the statement 't->dst.protonum == tuple->dst.protonum' in
inline function same_src. 

2 The first parameter of same_src "ct" is NULL,The value of 'ct' came from 'ct
= nat->ct'.

3 Read the content of the 'nat', all member's value are zero.


static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
    struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

    if (nat == NULL || nat->ct == NULL)
        return;

    NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);

    spin_lock_bh(&nf_nat_lock);
    hlist_del_rcu(&nat->bysource); 
    spin_unlock_bh(&nf_nat_lock);
}

void nf_conntrack_free(struct nf_conn *ct)
{
  struct net *net = nf_ct_net(ct);
  nf_ct_ext_destroy(ct); //For NAT，it will call nf_nat_cleanup_conntrack
  atomic_dec(&net->ct.count);    
  nf_ct_ext_free(ct);  // Free nat-extention memory by kfree; is it possible
that the extention was still used in a RCU read side ?
  kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Re: rtnl_lock deadlock on 3.10
From: Steve Wise @ 2013-09-05 15:14 UTC (permalink / raw)
  To: roland
  Cc: Bart Van Assche, Shawn Bohrer, Or Gerlitz, Shawn Bohrer,
	Cong Wang, netdev, linux-rdma, swise
In-Reply-To: <522856A4.8040800@acm.org>

On 9/5/2013 5:02 AM, Bart Van Assche wrote:
> On 07/30/13 14:54, Steve Wise wrote:
>> On 7/29/2013 6:02 PM, Shawn Bohrer wrote:
>>> On Mon, Jul 15, 2013 at 09:38:19AM -0500, Shawn Bohrer wrote:
>>>> On Wed, Jul 03, 2013 at 08:26:11PM +0300, Or Gerlitz wrote:
>>>>> On 03/07/2013 20:22, Shawn Bohrer wrote:
>>>>>> On Wed, Jul 03, 2013 at 07:33:07AM +0200, Hannes Frederic Sowa 
>>>>>> wrote:
>>>>>>> On Wed, Jul 03, 2013 at 07:11:52AM +0200, Hannes Frederic Sowa 
>>>>>>> wrote:
>>>>>>>> On Tue, Jul 02, 2013 at 01:38:26PM +0000, Cong Wang wrote:
>>>>>>>>> On Tue, 02 Jul 2013 at 08:28 GMT, Hannes Frederic Sowa
>>>>>>>>> <hannes@stressinduktion.org> wrote:
>>>>>>>>>> On Mon, Jul 01, 2013 at 09:54:56AM -0500, Shawn Bohrer wrote:
>>>>>>>>>>> I've managed to hit a deadlock at boot a couple times while
>>>>>>>>>>> testing
>>>>>>>>>>> the 3.10 rc kernels.  It seems to always happen when my network
>>>>>>>>>>> devices are initializing.  This morning I updated to v3.10 and
>>>>>>>>>>> made a
>>>>>>>>>>> few config tweaks and so far I've hit it 4 out of 5 reboots.
>>>>>>>>>>> It looks
>>>>>>>>>>> like most processes are getting stuck on rtnl_lock.  Below is
>>>>>>>>>>> a boot
>>>>>>>>>>> log with the soft lockup prints.  Please let know if there 
>>>>>>>>>>> is any
>>>>>>>>>>> other information I can provide:
>>>>>>>>>> Could you try a build with CONFIG_LOCKDEP enabled?
>>>>>>>>>>
>>>>>>>>> The problem is clear: ib_register_device() is called with
>>>>>>>>> rtnl_lock,
>>>>>>>>> but itself needs device_mutex, however, ib_register_client() 
>>>>>>>>> first
>>>>>>>>> acquires device_mutex, then indirectly calls register_netdev()
>>>>>>>>> which
>>>>>>>>> takes rtnl_lock. Deadlock!
>>>>>>>>>
>>>>>>>>> One possible fix is always taking rtnl_lock before taking
>>>>>>>>> device_mutex, something like below:
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/infiniband/core/device.c
>>>>>>>>> b/drivers/infiniband/core/device.c
>>>>>>>>> index 18c1ece..890870b 100644
>>>>>>>>> --- a/drivers/infiniband/core/device.c
>>>>>>>>> +++ b/drivers/infiniband/core/device.c
>>>>>>>>> @@ -381,6 +381,7 @@ int ib_register_client(struct ib_client
>>>>>>>>> *client)
>>>>>>>>>   {
>>>>>>>>>       struct ib_device *device;
>>>>>>>>> +    rtnl_lock();
>>>>>>>>>       mutex_lock(&device_mutex);
>>>>>>>>>       list_add_tail(&client->list, &client_list);
>>>>>>>>> @@ -389,6 +390,7 @@ int ib_register_client(struct ib_client
>>>>>>>>> *client)
>>>>>>>>>               client->add(device);
>>>>>>>>>       mutex_unlock(&device_mutex);
>>>>>>>>> +    rtnl_unlock();
>>>>>>>>>       return 0;
>>>>>>>>>   }
>>>>>>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>>> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>>> index b6e049a..5a7a048 100644
>>>>>>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>>>>>>>>> @@ -1609,7 +1609,7 @@ static struct net_device
>>>>>>>>> *ipoib_add_port(const char *format,
>>>>>>>>>           goto event_failed;
>>>>>>>>>       }
>>>>>>>>> -    result = register_netdev(priv->dev);
>>>>>>>>> +    result = register_netdevice(priv->dev);
>>>>>>>>>       if (result) {
>>>>>>>>>           printk(KERN_WARNING "%s: couldn't register ipoib port
>>>>>>>>> %d; error %d\n",
>>>>>>>>>                  hca->name, port, result);
>>>>>>>> Looks good to me. Shawn, could you test this patch?
>>>>>>> ib_unregister_device/ib_unregister_client would need the same 
>>>>>>> change,
>>>>>>> too. I have not checked the other ->add() and ->remove()
>>>>>>> functions. Also
>>>>>>> cc'ed linux-rdma@vger.kernel.org, Roland Dreier.
>>>>>> Cong's patch is missing the #include <linux/rtnetlink.h> but 
>>>>>> otherwise
>>>>>> I've had 34 successful reboots with no deadlocks which is a good 
>>>>>> sign.
>>>>>> It sounds like there are more paths that need to be audited and a
>>>>>> proper patch submitted.  I can do more testing later if needed.
>>>>>>
>>>>>> Thanks,
>>>>>> Shawn
>>>>>>
>>>>> Guys, I was a bit busy today looking into that, but I don't think we
>>>>> want the IB core layer  (core/device.c) to
>>>>> use rtnl locking which is something that belongs to the network 
>>>>> stack.
>>>> Has anymore thought been put into a proper fix for this issue?
>>> I'm no expert in this area but I'm having a hard time seeing a
>>> different solution than the one Cong suggested.  Just to be clear the
>>> deadlock I hit was between cxgb3 and the ipoib module, so I've Cc'd
>>> Steve Wise in case he has a better solution from the Chelsio side.
>>
>> I don't know of another way to resolve this.   The rtnl lock is used in
>> ipoib and mlx4 already.  I think we should go forward with the proposed
>> patch.
>
> (replying to an e-mail of one month ago)
>
> Hello,
>
> It would be appreciated if anyone could report what the current status 
> of this issue is. I think a deadlock I ran into with kernels 3.10 and 
> 3.11 and PCI pass-through is related to this issue. See also 
> http://bugzilla.kernel.org/show_bug.cgi?id=60856 for the lockdep report.
>
> Thanks,
>
> Bart.


Roland, what do you think?

As I've said, I think we should go ahead with using the rtnl lock in the 
core.  Is there a complete patch available for review?  looks like the 
original was a partial fix.

Steve.

^ permalink raw reply

* Re: r8169 OOPSen in rtl_rx
From: Peter Zijlstra @ 2013-09-05 15:20 UTC (permalink / raw)
  To: Francois Romieu; +Cc: nic_swsd, netdev
In-Reply-To: <20130814095233.GH24092@twins.programming.kicks-ass.net>

On Wed, Aug 14, 2013 at 11:52:33AM +0200, Peter Zijlstra wrote:
> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
> index 393f961..81e0bf4 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -6185,6 +6185,12 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
>  			else
>  				pkt_size = status & 0x00003fff;
>  
> +			if (!(pkt_size > 0 && pkt_size <= ETH_FRAME_LEN)) {
> +				dev->stats.rx_dropped++;
> +				printk("%s Funny sized packet: %d\n", dev->name, pkt_size);
> +				goto release_descriptor;
> +			}
> +
>  			/*
>  			 * The driver does not support incoming fragmented
>  			 * frames. They are seen as a symptom of over-mtu

Yay, it triggered..

$ dmesg | awk '/Funny sized packet/ { t[$6]++ } END { for (i in t) {
printf "%d %d\n", t[i], i; } }' | sort -n
1 4237
1 4983
1 5811
1 6062
1 6594
2 10709
2 12073
2 9197
4 14624
4 14870
266 16364

dev->name is always the same and the internal NIC (eth0, RTL8110s).

When it happens the NIC stops working as every packet is mal-sized,
however an ifconfig down; ifconfig up will restore it to working order.

It appears to happen when I saturate my outside link such that all
packets are fwd to the internal network -- I've got a 30Mbit/s down link
which isn't all that much given its a GBE capable card.

When I try and saturate the internal nic, with traffic from the firewall
to an internal machine we reach GBE speeds but nothing falls over.

^ permalink raw reply

* Re: [E1000-devel] [net-next v4 7/8] i40e: sysfs and debugfs interfaces
From: Stephen Hemminger @ 2013-09-05 15:21 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: Nelson, Shannon, Kirsher, Jeffrey T,
	e1000-devel@lists.sourceforge.net, netdev@vger.kernel.org,
	Brandeburg, Jesse, gospo@redhat.com, davem@davemloft.net,
	sassmann@redhat.com
In-Reply-To: <8761uf5xt0.fsf@nemi.mork.no>

Has anyone else noticed how big the drivers are getting.

Intel
	e100	2275
	e1000	11971
	e1000e	17502
	igb	15720
	ixgbe	22451
	i40e	22603



	

^ permalink raw reply

* Re: rtnl_lock deadlock on 3.10
From: Shawn Bohrer @ 2013-09-05 15:34 UTC (permalink / raw)
  To: Steve Wise
  Cc: roland, Bart Van Assche, Shawn Bohrer, Or Gerlitz, Cong Wang,
	netdev, linux-rdma, swise
In-Reply-To: <52289FEB.7060606@opengridcomputing.com>

On Thu, Sep 05, 2013 at 10:14:51AM -0500, Steve Wise wrote:
> On 9/5/2013 5:02 AM, Bart Van Assche wrote:
> >On 07/30/13 14:54, Steve Wise wrote:
> >>On 7/29/2013 6:02 PM, Shawn Bohrer wrote:
> >>>On Mon, Jul 15, 2013 at 09:38:19AM -0500, Shawn Bohrer wrote:
> >>>>On Wed, Jul 03, 2013 at 08:26:11PM +0300, Or Gerlitz wrote:
> >>>>>On 03/07/2013 20:22, Shawn Bohrer wrote:
> >>>>>>On Wed, Jul 03, 2013 at 07:33:07AM +0200, Hannes
> >>>>>>Frederic Sowa wrote:
> >>>>>>>On Wed, Jul 03, 2013 at 07:11:52AM +0200, Hannes
> >>>>>>>Frederic Sowa wrote:
> >>>>>>>>On Tue, Jul 02, 2013 at 01:38:26PM +0000, Cong Wang wrote:
> >>>>>>>>>On Tue, 02 Jul 2013 at 08:28 GMT, Hannes Frederic Sowa
> >>>>>>>>><hannes@stressinduktion.org> wrote:
> >>>>>>>>>>On Mon, Jul 01, 2013 at 09:54:56AM -0500, Shawn Bohrer wrote:
> >>>>>>>>>>>I've managed to hit a deadlock at boot a couple times while
> >>>>>>>>>>>testing
> >>>>>>>>>>>the 3.10 rc kernels.  It seems to always happen when my network
> >>>>>>>>>>>devices are initializing.  This morning I updated to v3.10 and
> >>>>>>>>>>>made a
> >>>>>>>>>>>few config tweaks and so far I've hit it 4 out of 5 reboots.
> >>>>>>>>>>>It looks
> >>>>>>>>>>>like most processes are getting stuck on rtnl_lock.  Below is
> >>>>>>>>>>>a boot
> >>>>>>>>>>>log with the soft lockup prints.  Please let
> >>>>>>>>>>>know if there is any
> >>>>>>>>>>>other information I can provide:
> >>>>>>>>>>Could you try a build with CONFIG_LOCKDEP enabled?
> >>>>>>>>>>
> >>>>>>>>>The problem is clear: ib_register_device() is called with
> >>>>>>>>>rtnl_lock,
> >>>>>>>>>but itself needs device_mutex, however,
> >>>>>>>>>ib_register_client() first
> >>>>>>>>>acquires device_mutex, then indirectly calls register_netdev()
> >>>>>>>>>which
> >>>>>>>>>takes rtnl_lock. Deadlock!
> >>>>>>>>>
> >>>>>>>>>One possible fix is always taking rtnl_lock before taking
> >>>>>>>>>device_mutex, something like below:
> >>>>>>>>>
> >>>>>>>>>diff --git a/drivers/infiniband/core/device.c
> >>>>>>>>>b/drivers/infiniband/core/device.c
> >>>>>>>>>index 18c1ece..890870b 100644
> >>>>>>>>>--- a/drivers/infiniband/core/device.c
> >>>>>>>>>+++ b/drivers/infiniband/core/device.c
> >>>>>>>>>@@ -381,6 +381,7 @@ int ib_register_client(struct ib_client
> >>>>>>>>>*client)
> >>>>>>>>>  {
> >>>>>>>>>      struct ib_device *device;
> >>>>>>>>>+    rtnl_lock();
> >>>>>>>>>      mutex_lock(&device_mutex);
> >>>>>>>>>      list_add_tail(&client->list, &client_list);
> >>>>>>>>>@@ -389,6 +390,7 @@ int ib_register_client(struct ib_client
> >>>>>>>>>*client)
> >>>>>>>>>              client->add(device);
> >>>>>>>>>      mutex_unlock(&device_mutex);
> >>>>>>>>>+    rtnl_unlock();
> >>>>>>>>>      return 0;
> >>>>>>>>>  }
> >>>>>>>>>diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> >>>>>>>>>b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> >>>>>>>>>index b6e049a..5a7a048 100644
> >>>>>>>>>--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> >>>>>>>>>+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> >>>>>>>>>@@ -1609,7 +1609,7 @@ static struct net_device
> >>>>>>>>>*ipoib_add_port(const char *format,
> >>>>>>>>>          goto event_failed;
> >>>>>>>>>      }
> >>>>>>>>>-    result = register_netdev(priv->dev);
> >>>>>>>>>+    result = register_netdevice(priv->dev);
> >>>>>>>>>      if (result) {
> >>>>>>>>>          printk(KERN_WARNING "%s: couldn't register ipoib port
> >>>>>>>>>%d; error %d\n",
> >>>>>>>>>                 hca->name, port, result);
> >>>>>>>>Looks good to me. Shawn, could you test this patch?
> >>>>>>>ib_unregister_device/ib_unregister_client would need
> >>>>>>>the same change,
> >>>>>>>too. I have not checked the other ->add() and ->remove()
> >>>>>>>functions. Also
> >>>>>>>cc'ed linux-rdma@vger.kernel.org, Roland Dreier.
> >>>>>>Cong's patch is missing the #include <linux/rtnetlink.h>
> >>>>>>but otherwise
> >>>>>>I've had 34 successful reboots with no deadlocks which
> >>>>>>is a good sign.
> >>>>>>It sounds like there are more paths that need to be audited and a
> >>>>>>proper patch submitted.  I can do more testing later if needed.
> >>>>>>
> >>>>>>Thanks,
> >>>>>>Shawn
> >>>>>>
> >>>>>Guys, I was a bit busy today looking into that, but I don't think we
> >>>>>want the IB core layer  (core/device.c) to
> >>>>>use rtnl locking which is something that belongs to the
> >>>>>network stack.
> >>>>Has anymore thought been put into a proper fix for this issue?
> >>>I'm no expert in this area but I'm having a hard time seeing a
> >>>different solution than the one Cong suggested.  Just to be clear the
> >>>deadlock I hit was between cxgb3 and the ipoib module, so I've Cc'd
> >>>Steve Wise in case he has a better solution from the Chelsio side.
> >>
> >>I don't know of another way to resolve this.   The rtnl lock is used in
> >>ipoib and mlx4 already.  I think we should go forward with the proposed
> >>patch.
> >
> >(replying to an e-mail of one month ago)
> >
> >Hello,
> >
> >It would be appreciated if anyone could report what the current
> >status of this issue is. I think a deadlock I ran into with
> >kernels 3.10 and 3.11 and PCI pass-through is related to this
> >issue. See also http://bugzilla.kernel.org/show_bug.cgi?id=60856
> >for the lockdep report.
> >
> >Thanks,
> >
> >Bart.
> 
> 
> Roland, what do you think?
> 
> As I've said, I think we should go ahead with using the rtnl lock in
> the core.  Is there a complete patch available for review?  looks
> like the original was a partial fix.

I've been running with Cong's partial fix for the past couple of
months, and I'm pretty sure no complete patch has been posted.
I may be able to look at the missing pieces tomorrow and see if I can
put together a patch but if someone else wants to run with this feel
free.

--
Shawn

^ permalink raw reply

* Re: [PATCH 2/2] ipv4 igmp: use del_timer_sync instead of del_timer in ip_mc_down
From: Stephen Hemminger @ 2013-09-05 15:43 UTC (permalink / raw)
  To: Salam Noureddine
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <1378363404-37749-1-git-send-email-noureddine@aristanetworks.com>

On Wed,  4 Sep 2013 23:43:24 -0700
Salam Noureddine <noureddine@aristanetworks.com> wrote:

> Delete timers using del_timer_sync in ip_mc_down. Otherwise, it is
> possible for the timer to be the last to release its reference to the
> in_device and since __in_dev_put doesn't destroy the in_device we
> would end up leaking a reference to the net_device and see messages
> like the following,
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 1
> 
> Tested on linux-3.4.43.
> 
> Signed-off-by: Salam Noureddine <noureddine@aristanetworks.com>

Why not just call in_dev_put instead which just proper cleanup.
It is less risky of deadlock than del_timer_sync.

^ permalink raw reply

* [PATCH net] net: netlink: filter particular protocols from analyzers
From: Daniel Borkmann @ 2013-09-05 15:48 UTC (permalink / raw)
  To: davem; +Cc: netdev, Daniel Borkmann

From: Daniel Borkmann <dborkmann@redhat.com>

Fix finer-grained control and let only a whitelist of allowed netlink
protocols pass, in our case related to networking. If later on, other
subsystems decide they want to add their protocol as well to the list
of allowed protocols they shall simply add it. While at it, we also
need to tell what protocol is in use otherwise BPF_S_ANC_PROTOCOL can
not pick it up (as it's not filled out).

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
 net/netlink/af_netlink.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0c61b59..350187e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -168,16 +168,43 @@ int netlink_remove_tap(struct netlink_tap *nt)
 }
 EXPORT_SYMBOL_GPL(netlink_remove_tap);
 
+static bool netlink_filter_tap(const struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	bool pass = false;
+
+	/* We take the more conservative approach and
+	 * whitelist socket protocols that may pass.
+	 */
+	switch (sk->sk_protocol) {
+	case NETLINK_ROUTE:
+	case NETLINK_USERSOCK:
+	case NETLINK_SOCK_DIAG:
+	case NETLINK_NFLOG:
+	case NETLINK_XFRM:
+	case NETLINK_FIB_LOOKUP:
+	case NETLINK_NETFILTER:
+	case NETLINK_GENERIC:
+		pass = true;
+		break;
+	}
+
+	return pass;
+}
+
 static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 				     struct net_device *dev)
 {
 	struct sk_buff *nskb;
+	struct sock *sk = skb->sk;
 	int ret = -ENOMEM;
 
 	dev_hold(dev);
 	nskb = skb_clone(skb, GFP_ATOMIC);
 	if (nskb) {
 		nskb->dev = dev;
+		nskb->protocol = htons((u16) sk->sk_protocol);
+
 		ret = dev_queue_xmit(nskb);
 		if (unlikely(ret > 0))
 			ret = net_xmit_errno(ret);
@@ -192,6 +219,9 @@ static void __netlink_deliver_tap(struct sk_buff *skb)
 	int ret;
 	struct netlink_tap *tmp;
 
+	if (!netlink_filter_tap(skb))
+		return;
+
 	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
 		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
 		if (unlikely(ret))
-- 
1.8.3.1

^ permalink raw reply related

* Re: [net-next v4 1/7] i40e: main driver core
From: Stephen Hemminger @ 2013-09-05 16:26 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, Jesse Brandeburg, netdev, gospo, sassmann, Shannon Nelson,
	PJ Waskiewicz, e1000-devel
In-Reply-To: <1378336629-24224-2-git-send-email-jeffrey.t.kirsher@intel.com>

More review comments, these are non-blocking, and can be fixed later.

Minor
-----
* style
    not a fan of doing own error codes and handling (ie. I40E_SUCCESS)
    which probably is side effect of OS abstraction.

* extra parens aren't needed on return
     return (budget > 0);

* i40e_config_netdev
    etherdev_size = (sizeof(struct i40e_netdev_priv));
	 Extra paren, only used once, unsigned vs signed, just
	 use sizeof()
    strlcpy(netdev->name, "eth%d", IFNAMSIZ-1);
        That is already done by alloc_etherdev
    memcpy(netdev->dev_addr, mac_addr, netdev->addr_len);
       netdev->addr_len is ETH_ALEN, mac_addr is ETH_ALEN
       just use constant size copy.
* i40e_init_module
    ret = pci_register_driver(&i40e_driver);
    return ret;
       assignment is unneeded, just do return pci_register_driver

* service timer
     since it is 1 HZ, you should use round_jiffies to align
     on clock boundary to save power.

* Why is Kbuild file used instead of Makefile like other drivers?

* Ethtool
  Need to use ethtool_cmd_speed_set othewise only lower bit set.

* Extension I40E_DEBUG_USER is non-standard

* What is is point of using snprintf() in i40e_get_drvinfo
	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
		 "%s", i40e_fw_version_str(&pf->hw));
  Already using strlcpy() for other fields.

* Ethtool ops can be const

* isn't vsi->netdev = NULL needed here.
s32 i40e_vsi_release(struct i40e_vsi *vsi)
	if (vsi->netdev_registered) {
			vsi->netdev_registered = false;
			if (vsi->netdev) {
				/* results in a call to i40e_close() */
				unregister_netdev(vsi->netdev);
				free_netdev(vsi->netdev);
			>>>>	vsi->netdev = NULL;


Enhancement
-----------
* Support Byte Queue Limits
  - might be tougher with VBE

^ permalink raw reply

* Re: bride: IPv6 multicast snooping enhancements
From: David Miller @ 2013-09-05 16:36 UTC (permalink / raw)
  To: linus.luessing
  Cc: amwang, netdev, bridge, linux-kernel, linux, stephen, herbert
In-Reply-To: <1378253619-23918-1-git-send-email-linus.luessing@web.de>

From: Linus Lüssing <linus.luessing@web.de>
Date: Wed,  4 Sep 2013 02:13:37 +0200

> Here are two, small feature changes I would like to submit to increase
> the usefulness of the multicast snooping of the bridge code.
> 
> The first patch is an unaltered one I had submitted before, but since it
> got no feedback I'm resubmitting it here for net-next. With the recently
> added patch to disable snooping if there is no querier (b00589af + 248ba8ec05
> + 8d50af4fb), it should be a safe choice now (without these, patch 1/2 would
> have introduced another potential for lost IPv6 multicast packets).
> 
> Both conceptually and also with some testing and fuzzing, I couldn't spot
> any more causes for potential packet loss. And since the multicast snooping
> code has now been tried by various people, I think it should be a safe
> choice to apply the multicast snooping not only for IPv6 multicast packets
> with a scope greater than link-local, but also for packets of exactly this
> scope. The IPv6 standard mandates MLD reports for link-local multicast, too,
> so we can safely snoop them as well (in contrast to IPv4 link-local).

Both patches applied, thanks!

^ permalink raw reply

* Re: [PATCH net-next 0/5] driver/net: enic: enic driver updates
From: David Miller @ 2013-09-05 16:40 UTC (permalink / raw)
  To: govindarajulu90; +Cc: netdev, linux-kernel, benve, ssujith, nistrive, umalhi
In-Reply-To: <1378273638-7780-1-git-send-email-govindarajulu90@gmail.com>

From: Govindarajulu Varadarajan <govindarajulu90@gmail.com>
Date: Wed,  4 Sep 2013 11:17:13 +0530

> This series includes support for multi-tx, rss_hash, RPS, RFS, DMA64, 
> export symbols for cisco low latency driver and update the driver version and 
> driver maintainers.

Series applied, thanks.

^ permalink raw reply

* Re: [GIT PULL] please pull infiniband.git
From: Linus Torvalds @ 2013-09-05 16:42 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Roland Dreier, Linux Kernel Mailing List, David Miller,
	Network Development
In-Reply-To: <20130905103116.6998495cd755f5ae2c3def78@canb.auug.org.au>

On Wed, Sep 4, 2013 at 5:31 PM, Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> I am carrying the following merge fix patch for a semantic conflict
> between this tree and Dave's net-next tree:

Ok. I haven't gotten David's pull request yet, and unless I get it
later today I'll probably forget this.

David, mind reminding me when you do send (or maybe cc'ing Stephen, so
that he will)?

                  Linus

^ permalink raw reply

* Re: [GIT PULL] please pull infiniband.git
From: David Miller @ 2013-09-05 16:43 UTC (permalink / raw)
  To: torvalds; +Cc: sfr, roland, linux-kernel, netdev
In-Reply-To: <CA+55aFwyJO10zX0DNgfqmsvPCfnaiT-7Fmwz8MVBTq2wtL4L0w@mail.gmail.com>

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 5 Sep 2013 09:42:05 -0700

> On Wed, Sep 4, 2013 at 5:31 PM, Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>>
>> I am carrying the following merge fix patch for a semantic conflict
>> between this tree and Dave's net-next tree:
> 
> Ok. I haven't gotten David's pull request yet, and unless I get it
> later today I'll probably forget this.
> 
> David, mind reminding me when you do send (or maybe cc'ing Stephen, so
> that he will)?

Sure, I was planning on send out the pull request by the end of today.

^ permalink raw reply

* Re: [net-next v4] vxlan: Notify drivers for listening UDP port changes
From: David Miller @ 2013-09-05 16:45 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: joseph.gasparakis, netdev, gospo, sassmann, john.r.fastabend,
	stephen
In-Reply-To: <1378286019-8719-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed,  4 Sep 2013 02:13:38 -0700

> From: Joseph Gasparakis <joseph.gasparakis@intel.com>
> 
> This patch adds two more ndo ops: ndo_add_rx_vxlan_port() and
> ndo_del_rx_vxlan_port().
> 
> Drivers can get notifications through the above functions about changes
> of the UDP listening port of VXLAN. Also, when physical ports come up,
> now they can call vxlan_get_rx_port() in order to obtain the port number(s)
> of the existing VXLAN interface in case they already up before them.
> 
> This information about the listening UDP port would be used for VXLAN
> related offloads.
> 
> A big thank you to John Fastabend (john.r.fastabend@intel.com) for his
> input and his suggestions on this patch set.
> 
> CC: John Fastabend <john.r.fastabend@intel.com>
> CC: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Joseph Gasparakis <joseph.gasparakis@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied.

^ permalink raw reply

* Re: [PATCH RESEND net-next] net: usbnet: update addr_assign_type if appropriate
From: David Miller @ 2013-09-05 16:45 UTC (permalink / raw)
  To: bjorn; +Cc: netdev, linux-usb
In-Reply-To: <1378280552-12632-1-git-send-email-bjorn@mork.no>

From: Bjørn Mork <bjorn@mork.no>
Date: Wed,  4 Sep 2013 09:42:32 +0200

> This module generates a common default address on init,
> using eth_random_addr. Set addr_assign_type to let
> userspace know the address is random unless it was
> overridden by the minidriver.
> 
> Signed-off-by: Bjørn Mork <bjorn@mork.no>
> Acked-by: Oliver Neukum <oliver@neukum.org>

Applied.

^ permalink raw reply

* Re: bnx2x: VF RSS support
From: David Miller @ 2013-09-05 16:45 UTC (permalink / raw)
  To: ariele; +Cc: netdev, eilong
In-Reply-To: <1378292962-5537-1-git-send-email-ariele@broadcom.com>

From: "Ariel Elior" <ariele@broadcom.com>
Date: Wed, 4 Sep 2013 14:09:20 +0300

> This patch series adds the capability for VF functions to use multiple queues
> and Receive / Transmit side scaling.
> 
> Patch #1 enhances the PF's side database to allow for multiple queues per PF
> and configure the HW appropriately, and the PF side of the VF PF channel
> message for configuring the RSS.
> 
> Patch #2 adds to the VF side the ability to request multiple queues, and if
> obtained to configure RSS for them over the VF PF channel.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH v5] ipv6:introduce function to find route for redirect
From: David Miller @ 2013-09-05 16:45 UTC (permalink / raw)
  To: duanj.fnst; +Cc: netdev, hannes
In-Reply-To: <52271D15.1000806@cn.fujitsu.com>

From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Wed, 04 Sep 2013 19:44:21 +0800

> From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
> 
> RFC 4861 says that the IP source address of the Redirect is the
> same as the current first-hop router for the specified ICMP
> Destination Address, so the gateway should be taken into
> consideration when we find the route for redirect.
> 
> There was once a check in commit
> a6279458c534d01ccc39498aba61c93083ee0372 ("NDISC: Search over
> all possible rules on receipt of redirect.") and the check
> went away in commit b94f1c0904da9b8bf031667afc48080ba7c3e8c9
> ("ipv6: Use icmpv6_notify() to propagate redirect, instead of
> rt6_redirect()").
> 
> The bug is only "exploitable" on layer-2 because the source
> address of the redirect is checked to be a valid link-local
> address but it makes spoofing a lot easier in the same L2
> domain nonetheless.
> 
> Thanks very much for Hannes's help.
> 
> Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next v2] qlcnic: use standard NAPI weights
From: David Miller @ 2013-09-05 16:46 UTC (permalink / raw)
  To: mschmidt
  Cc: himanshu.madhani, netdev, edumazet, rajesh.borundia,
	shahed.shaikh, jitendra.kalsaria, sony.chacko,
	sucheta.chakraborty, Linux-Driver
In-Reply-To: <52272F89.60505@redhat.com>

From: Michal Schmidt <mschmidt@redhat.com>
Date: Wed, 04 Sep 2013 15:03:05 +0200

> Since commit 82dc3c63 ("net: introduce NAPI_POLL_WEIGHT")
> netif_napi_add() produces an error message if a NAPI poll weight
> greater than 64 is requested.
> 
> qlcnic requests the weight as large as 256 for some of its rings, and
> smaller values for other rings. For instance in qlcnic_82xx_napi_add()
> I think the intention was to give the tx+rx ring a bigger weight than
> to rx-only rings, but it's actually doing the opposite. So I'm assuming
> the weights do not really matter much.
> 
> Just use the standard NAPI weights for all rings.
> 
> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
> Acked-by: Himanshu Madhani <himanshu.madhani@qlogic.com>

Applied.

^ permalink raw reply

* Re: [PATCH V2 net-next 1/2] tuntap: purge socket error queue on detach
From: David Miller @ 2013-09-05 16:46 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, linux-kernel, mst, richardcochran
In-Reply-To: <1378374840-39130-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Thu,  5 Sep 2013 17:53:59 +0800

> Commit eda297729171fe16bf34fe5b0419dfb69060f623
> (tun: Support software transmit time stamping) will queue skbs into error queue
> when tx stamping is enabled. But it forgets to purge the error queue during
> detach. This patch fixes this.
> 
> Cc: Richard Cochran <richardcochran@gmail.com>
> Acked-by: Richard Cochran <richardcochran@gmail.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied.

^ permalink raw reply

* Re: [PATCH V2 net-next 2/2] tuntap: orphan frags before trying to set tx timestamp
From: David Miller @ 2013-09-05 16:47 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, linux-kernel, mst, richardcochran, sergei.shtylyov
In-Reply-To: <1378374840-39130-2-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Thu,  5 Sep 2013 17:54:00 +0800

> sock_tx_timestamp() will clear all zerocopy flags of skb which may lead the
> frags never to be orphaned. This will break guest to guest traffic when zerocopy
> is enabled. Fix this by orphaning the frags before trying to set tx time stamp.
> 
> The issue were introduced by commit eda297729171fe16bf34fe5b0419dfb69060f623
> (tun: Support software transmit time stamping).
> 
> Cc: Richard Cochran <richardcochran@gmail.com>
> Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> Acked-by: Richard Cochran <richardcochran@gmail.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied.

I also queued both of these patches to -stable.

Thanks.

^ permalink raw reply

* Re: [PATCH 1/2] ipv6 mcast: use del_timer_sync instead of del_timer in ipv6_mc_down
From: Salam Noureddine @ 2013-09-05 17:20 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, netdev
In-Reply-To: <1378391415.3159.3.camel@bwh-desktop.uk.level5networks.com>

On Thu, Sep 5, 2013 at 7:30 AM, Ben Hutchings <bhutchings@solarflare.com> wrote:
>> diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
>> index 99cd65c..5c8d49d 100644
>> --- a/net/ipv6/mcast.c
>> +++ b/net/ipv6/mcast.c
>> @@ -2277,12 +2277,12 @@ void ipv6_mc_down(struct inet6_dev *idev)
>>
>>       read_lock_bh(&idev->lock);
>>       idev->mc_ifc_count = 0;
>> -     if (del_timer(&idev->mc_ifc_timer))
>> +     if (del_timer_sync(&idev->mc_ifc_timer))
>
> Are you sure this doesn't introduce a potential deadlock?  Have you
> tested this with lockdep enabled?
>
> Ben.

I will test with lockdep enabled and get back to you.

Thanks,

Salam

^ permalink raw reply

* [PATCH net-next] intel: pci_register_driver tail return
From: Stephen Hemminger @ 2013-09-05 17:22 UTC (permalink / raw)
  To: Jeff Kirsher, David S. Miller; +Cc: netdev, e1000-devel

Remove unnecessary assignment, just return result of pci_register_driver
Glad to see that Intel does lots of code reuse :-)

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

---
 drivers/net/ethernet/intel/e1000e/netdev.c        |    5 +----
 drivers/net/ethernet/intel/igb/igb_main.c         |    4 +---
 drivers/net/ethernet/intel/igbvf/netdev.c         |    5 +----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    4 +---
 4 files changed, 4 insertions(+), 14 deletions(-)

--- a/drivers/net/ethernet/intel/e1000e/netdev.c	2013-08-25 20:40:29.798995725 -0700
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c	2013-09-05 10:18:11.322040468 -0700
@@ -7054,13 +7054,10 @@ static struct pci_driver e1000_driver =
  **/
 static int __init e1000_init_module(void)
 {
-	int ret;
 	pr_info("Intel(R) PRO/1000 Network Driver - %s\n",
 		e1000e_driver_version);
 	pr_info("Copyright(c) 1999 - 2013 Intel Corporation.\n");
-	ret = pci_register_driver(&e1000_driver);
-
-	return ret;
+	return pci_register_driver(&e1000_driver);
 }
 module_init(e1000_init_module);
 
--- a/drivers/net/ethernet/intel/igb/igb_main.c	2013-09-05 08:10:16.494868469 -0700
+++ b/drivers/net/ethernet/intel/igb/igb_main.c	2013-09-05 09:52:11.914731506 -0700
@@ -680,7 +680,6 @@ struct net_device *igb_get_hw_dev(struct
  **/
 static int __init igb_init_module(void)
 {
-	int ret;
 	pr_info("%s - version %s\n",
 	       igb_driver_string, igb_driver_version);
 
@@ -689,8 +688,7 @@ static int __init igb_init_module(void)
 #ifdef CONFIG_IGB_DCA
 	dca_register_notify(&dca_notifier);
 #endif
-	ret = pci_register_driver(&igb_driver);
-	return ret;
+	return pci_register_driver(&igb_driver);
 }
 
 module_init(igb_init_module);
--- a/drivers/net/ethernet/intel/igbvf/netdev.c	2013-08-10 10:36:10.293516206 -0700
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c	2013-09-05 09:55:11.644534238 -0700
@@ -2891,13 +2891,10 @@ static struct pci_driver igbvf_driver =
  **/
 static int __init igbvf_init_module(void)
 {
-	int ret;
 	pr_info("%s - version %s\n", igbvf_driver_string, igbvf_driver_version);
 	pr_info("%s\n", igbvf_copyright);
 
-	ret = pci_register_driver(&igbvf_driver);
-
-	return ret;
+	return pci_register_driver(&igbvf_driver);
 }
 module_init(igbvf_init_module);
 
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c	2013-09-05 08:10:16.494868469 -0700
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c	2013-09-05 09:53:27.937592738 -0700
@@ -3614,14 +3614,12 @@ static struct pci_driver ixgbevf_driver
  **/
 static int __init ixgbevf_init_module(void)
 {
-	int ret;
 	pr_info("%s - version %s\n", ixgbevf_driver_string,
 		ixgbevf_driver_version);
 
 	pr_info("%s\n", ixgbevf_copyright);
 
-	ret = pci_register_driver(&ixgbevf_driver);
-	return ret;
+	return pci_register_driver(&ixgbevf_driver);
 }
 
 module_init(ixgbevf_init_module);

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox