Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH v3 net-next 2/2] bonding: restructure locking of bond_ab_arp_probe()
From: Veaceslav Falico @ 2014-01-27 13:36 UTC (permalink / raw)
  To: netdev; +Cc: Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1390829613-1842-3-git-send-email-vfalico@redhat.com>

On Mon, Jan 27, 2014 at 02:33:33PM +0100, Veaceslav Falico wrote:
>Currently we're calling it from under RCU context, however we're using some
>functions that require rtnl to be held.
>
>Fix this by restructuring the locking - don't call it under any locks,
>aquire rcu_read_lock() if we're sending _only_ (i.e. we have the active
>slave present), and use rtnl locking otherwise - if we need to modify
>(in)active flags of a slave.
>
>CC: Jay Vosburgh <fubar@us.ibm.com>
>CC: Andy Gospodarek <andy@greyhouse.net>
>Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
>---
>
>Notes:
>    v2->v3:
>    Use rtnl_trylock(), not to race with queue cancelling.
>
>    v1->v2:
>    Add two steps - one for sending/rcu, another for modifying/rtnl.
>
> drivers/net/bonding/bond_main.c | 38 ++++++++++++++++++++++++--------------
> 1 file changed, 24 insertions(+), 14 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 27e6fdd..7de0256 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -2605,11 +2605,14 @@ do_failover:
> static void bond_ab_arp_probe(struct bonding *bond)
> {
> 	struct slave *slave, *before = NULL, *new_slave = NULL,
>-		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
>-		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
>+		     *curr_arp_slave, *curr_active_slave;
> 	struct list_head *iter;
> 	bool found = false;
>
>+	rcu_read_lock();
>+	curr_arp_slave = rcu_dereference(bond->current_arp_slave);
>+	curr_active_slave = rcu_dereference(bond->curr_active_slave);
>+
> 	if (curr_arp_slave && curr_active_slave)
> 		pr_info("PROBE: c_arp %s && cas %s BAD\n",
> 			curr_arp_slave->dev->name,
>@@ -2617,23 +2620,31 @@ static void bond_ab_arp_probe(struct bonding *bond)
>
> 	if (curr_active_slave) {
> 		bond_arp_send_all(bond, curr_active_slave);
>+		rcu_read_unlock();
> 		return;
> 	}
>+	rcu_read_unlock();
>
> 	/* if we don't have a curr_active_slave, search for the next available
> 	 * backup slave from the current_arp_slave and make it the candidate
> 	 * for becoming the curr_active_slave
> 	 */
>
>+	rtnl_lock();

Right, git commit --amend would be great after git add...

Sorry, forgot to actually commit changes, will re-send.

>+	/* curr_arp_slave might have gone away */
>+	curr_arp_slave = rcu_dereference(bond->current_arp_slave);
>+
> 	if (!curr_arp_slave) {
>-		curr_arp_slave = bond_first_slave_rcu(bond);
>-		if (!curr_arp_slave)
>+		curr_arp_slave = bond_first_slave(bond);
>+		if (!curr_arp_slave) {
>+			rtnl_unlock();
> 			return;
>+		}
> 	}
>
> 	bond_set_slave_inactive_flags(curr_arp_slave);
>
>-	bond_for_each_slave_rcu(bond, slave, iter) {
>+	bond_for_each_slave(bond, slave, iter) {
> 		if (!found && !before && IS_UP(slave->dev))
> 			before = slave;
>
>@@ -2663,21 +2674,24 @@ static void bond_ab_arp_probe(struct bonding *bond)
> 	if (!new_slave && before)
> 		new_slave = before;
>
>-	if (!new_slave)
>+	if (!new_slave) {
>+		rtnl_unlock();
> 		return;
>+	}
>
> 	new_slave->link = BOND_LINK_BACK;
> 	bond_set_slave_active_flags(new_slave);
> 	bond_arp_send_all(bond, new_slave);
> 	new_slave->jiffies = jiffies;
> 	rcu_assign_pointer(bond->current_arp_slave, new_slave);
>+	rtnl_unlock();
> }
>
> static void bond_activebackup_arp_mon(struct work_struct *work)
> {
> 	struct bonding *bond = container_of(work, struct bonding,
> 					    arp_work.work);
>-	bool should_notify_peers = false;
>+	bool should_notify_peers = false, should_commit = false;
> 	int delta_in_ticks;
>
> 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
>@@ -2686,12 +2700,11 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
> 		goto re_arm;
>
> 	rcu_read_lock();
>-
> 	should_notify_peers = bond_should_notify_peers(bond);
>+	should_commit = bond_ab_arp_inspect(bond);
>+	rcu_read_unlock();
>
>-	if (bond_ab_arp_inspect(bond)) {
>-		rcu_read_unlock();
>-
>+	if (should_commit) {
> 		/* Race avoidance with bond_close flush of workqueue */
> 		if (!rtnl_trylock()) {
> 			delta_in_ticks = 1;
>@@ -2700,13 +2713,10 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
> 		}
>
> 		bond_ab_arp_commit(bond);
>-
> 		rtnl_unlock();
>-		rcu_read_lock();
> 	}
>
> 	bond_ab_arp_probe(bond);
>-	rcu_read_unlock();
>
> re_arm:
> 	if (bond->params.arp_interval)
>-- 
>1.8.4
>

^ permalink raw reply

* [PATCH v3 net-next 1/2] bonding: RCUify bond_ab_arp_probe
From: Veaceslav Falico @ 2014-01-27 13:37 UTC (permalink / raw)
  To: netdev; +Cc: Veaceslav Falico, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1390829852-1943-1-git-send-email-vfalico@redhat.com>

Currently bond_ab_arp_probe() is always called under rcu_read_lock(),
however to work with curr_active_slave we're still holding the
curr_slave_lock.

To remove that curr_slave_lock - rcu_dereference the bond's
curr_active_slave and use it further - so that we're sure the slave won't
go away, and we don't care if it will change in the meanwhile.

CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
---
 drivers/net/bonding/bond_main.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a7db819..27e6fdd 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2605,25 +2605,21 @@ do_failover:
 static void bond_ab_arp_probe(struct bonding *bond)
 {
 	struct slave *slave, *before = NULL, *new_slave = NULL,
-		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave);
+		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
+		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
 	struct list_head *iter;
 	bool found = false;
 
-	read_lock(&bond->curr_slave_lock);
-
-	if (curr_arp_slave && bond->curr_active_slave)
+	if (curr_arp_slave && curr_active_slave)
 		pr_info("PROBE: c_arp %s && cas %s BAD\n",
 			curr_arp_slave->dev->name,
-			bond->curr_active_slave->dev->name);
+			curr_active_slave->dev->name);
 
-	if (bond->curr_active_slave) {
-		bond_arp_send_all(bond, bond->curr_active_slave);
-		read_unlock(&bond->curr_slave_lock);
+	if (curr_active_slave) {
+		bond_arp_send_all(bond, curr_active_slave);
 		return;
 	}
 
-	read_unlock(&bond->curr_slave_lock);
-
 	/* if we don't have a curr_active_slave, search for the next available
 	 * backup slave from the current_arp_slave and make it the candidate
 	 * for becoming the curr_active_slave
-- 
1.8.4

^ permalink raw reply related

* [PATCH v3 net-next 0/2] bonding: fix locking in bond_ab_arp_prob
From: Veaceslav Falico @ 2014-01-27 13:37 UTC (permalink / raw)
  To: netdev; +Cc: Jay Vosburgh, Andy Gospodarek, Veaceslav Falico

Hi,

After the latest patches, on every call of bond_ab_arp_probe() without an
active slave I see the following warning:

[    7.912314] RTNL: assertion failed at net/core/dev.c (4494)
...
[    7.922495]  [<ffffffff817acc6f>] dump_stack+0x51/0x72
[    7.923714]  [<ffffffff8168795e>] netdev_master_upper_dev_get+0x6e/0x70
[    7.924940]  [<ffffffff816a2a66>] rtnl_link_fill+0x116/0x260
[    7.926143]  [<ffffffff817acc6f>] ? dump_stack+0x51/0x72
[    7.927333]  [<ffffffff816a350c>] rtnl_fill_ifinfo+0x95c/0xb90
[    7.928529]  [<ffffffff8167af2b>] ? __kmalloc_reserve+0x3b/0xa0
[    7.929681]  [<ffffffff8167bfcf>] ? __alloc_skb+0x9f/0x1e0
[    7.930827]  [<ffffffff816a3b64>] rtmsg_ifinfo+0x84/0x100
[    7.931960]  [<ffffffffa00bca07>] bond_ab_arp_probe+0x1a7/0x370 [bonding]
[    7.933133]  [<ffffffffa00bcd78>] bond_activebackup_arp_mon+0x1a8/0x2f0 [bonding]
...

It happens because in bond_ab_arp_probe() we change the flags of a slave
without holding the RTNL lock.

To fix this - remove the useless curr_active_lock, RCUify it and lock RTNL
while changing the slave's flags. Also, remove bond_ab_arp_probe() from
under any locks in bond_ab_arp_mon().

CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>

---
 drivers/net/bonding/bond_main.c | 67 ++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 28 deletions(-)

^ permalink raw reply

* [PATCH v3 net-next 2/2] bonding: restructure locking of bond_ab_arp_probe()
From: Veaceslav Falico @ 2014-01-27 13:37 UTC (permalink / raw)
  To: netdev; +Cc: Veaceslav Falico, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1390829852-1943-1-git-send-email-vfalico@redhat.com>

Currently we're calling it from under RCU context, however we're using some
functions that require rtnl to be held.

Fix this by restructuring the locking - don't call it under any locks,
aquire rcu_read_lock() if we're sending _only_ (i.e. we have the active
slave present), and use rtnl locking otherwise - if we need to modify
(in)active flags of a slave.

CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
---

Notes:
    v2->v3:
    Use rtnl_trylock(), not to race with queue cancelling.
    
    v1->v2:
    Add two steps - one for sending/rcu, another for modifying/rtnl.

 drivers/net/bonding/bond_main.c | 57 ++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 27e6fdd..dd75615 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2599,17 +2599,18 @@ do_failover:
 
 /*
  * Send ARP probes for active-backup mode ARP monitor.
- *
- * Called with rcu_read_lock hold.
  */
-static void bond_ab_arp_probe(struct bonding *bond)
+static bool bond_ab_arp_probe(struct bonding *bond)
 {
 	struct slave *slave, *before = NULL, *new_slave = NULL,
-		     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
-		     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
+		     *curr_arp_slave, *curr_active_slave;
 	struct list_head *iter;
 	bool found = false;
 
+	rcu_read_lock();
+	curr_arp_slave = rcu_dereference(bond->current_arp_slave);
+	curr_active_slave = rcu_dereference(bond->curr_active_slave);
+
 	if (curr_arp_slave && curr_active_slave)
 		pr_info("PROBE: c_arp %s && cas %s BAD\n",
 			curr_arp_slave->dev->name,
@@ -2617,23 +2618,32 @@ static void bond_ab_arp_probe(struct bonding *bond)
 
 	if (curr_active_slave) {
 		bond_arp_send_all(bond, curr_active_slave);
-		return;
+		rcu_read_unlock();
+		return true;
 	}
+	rcu_read_unlock();
 
 	/* if we don't have a curr_active_slave, search for the next available
 	 * backup slave from the current_arp_slave and make it the candidate
 	 * for becoming the curr_active_slave
 	 */
 
+	if (!rtnl_trylock())
+		return false;
+	/* curr_arp_slave might have gone away */
+	curr_arp_slave = ACCESS_ONCE(bond->current_arp_slave);
+
 	if (!curr_arp_slave) {
-		curr_arp_slave = bond_first_slave_rcu(bond);
-		if (!curr_arp_slave)
-			return;
+		curr_arp_slave = bond_first_slave(bond);
+		if (!curr_arp_slave) {
+			rtnl_unlock();
+			return true;
+		}
 	}
 
 	bond_set_slave_inactive_flags(curr_arp_slave);
 
-	bond_for_each_slave_rcu(bond, slave, iter) {
+	bond_for_each_slave(bond, slave, iter) {
 		if (!found && !before && IS_UP(slave->dev))
 			before = slave;
 
@@ -2663,21 +2673,26 @@ static void bond_ab_arp_probe(struct bonding *bond)
 	if (!new_slave && before)
 		new_slave = before;
 
-	if (!new_slave)
-		return;
+	if (!new_slave) {
+		rtnl_unlock();
+		return true;
+	}
 
 	new_slave->link = BOND_LINK_BACK;
 	bond_set_slave_active_flags(new_slave);
 	bond_arp_send_all(bond, new_slave);
 	new_slave->jiffies = jiffies;
 	rcu_assign_pointer(bond->current_arp_slave, new_slave);
+	rtnl_unlock();
+
+	return true;
 }
 
 static void bond_activebackup_arp_mon(struct work_struct *work)
 {
 	struct bonding *bond = container_of(work, struct bonding,
 					    arp_work.work);
-	bool should_notify_peers = false;
+	bool should_notify_peers = false, should_commit = false;
 	int delta_in_ticks;
 
 	delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
@@ -2686,12 +2701,11 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
 		goto re_arm;
 
 	rcu_read_lock();
-
 	should_notify_peers = bond_should_notify_peers(bond);
+	should_commit = bond_ab_arp_inspect(bond);
+	rcu_read_unlock();
 
-	if (bond_ab_arp_inspect(bond)) {
-		rcu_read_unlock();
-
+	if (should_commit) {
 		/* Race avoidance with bond_close flush of workqueue */
 		if (!rtnl_trylock()) {
 			delta_in_ticks = 1;
@@ -2700,13 +2714,14 @@ static void bond_activebackup_arp_mon(struct work_struct *work)
 		}
 
 		bond_ab_arp_commit(bond);
-
 		rtnl_unlock();
-		rcu_read_lock();
 	}
 
-	bond_ab_arp_probe(bond);
-	rcu_read_unlock();
+	if (!bond_ab_arp_probe(bond)) {
+		/* rtnl locking failed, re-arm */
+		delta_in_ticks = 1;
+		should_notify_peers = false;
+	}
 
 re_arm:
 	if (bond->params.arp_interval)
-- 
1.8.4

^ permalink raw reply related

* Re: [PATCH] [RFC] netfilter: nf_conntrack: don't relase a conntrack with non-zero refcnt
From: Andrew Vagin @ 2014-01-27 13:44 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: netfilter-devel, Eric Dumazet, netfilter, coreteam, netdev,
	linux-kernel, vvs, Florian Westphal, Cyrill Gorcunov,
	Vasiliy Averin
In-Reply-To: <1389720948-7883-1-git-send-email-avagin@openvz.org>

On Tue, Jan 14, 2014 at 09:35:48PM +0400, Andrey Vagin wrote:
> ----
> Eric and Florian, could you look at this patch. When you say,
> that it looks good, I will ask the user to validate it.
> I can't reorder these actions, because it's reproduced on a real host
> with real users. Thanks.

We didn't get new reports from users for the last week, so these patches
fix the problem.

> ----
> 
> nf_conntrack_free can't be called for a conntract with non-zero ref-counter,
> because it can race with nf_conntrack_find_get().
> 
> A conntrack slab is created with SLAB_DESTROY_BY_RCU. Non-zero
> ref-conunter says that this conntrack is used now. So when we release a
> conntrack with non-zero counter, we break this assumption.
> 
> CPU1                                    CPU2
> ____nf_conntrack_find()
>                                         nf_ct_put()
>                                          destroy_conntrack()
>                                         ...
>                                         init_conntrack
>                                          __nf_conntrack_alloc (set use = 1)
> atomic_inc_not_zero(&ct->use) (use = 2)
>                                          if (!l4proto->new(ct, skb, dataoff, timeouts))
>                                           nf_conntrack_free(ct); (use = 2 !!!)
>                                         ...
>                                         __nf_conntrack_alloc (set use = 1)
>  if (!nf_ct_key_equal(h, tuple, zone))
>   nf_ct_put(ct); (use = 0)
>    destroy_conntrack()
>                                         /* continue to work with CT */
> 
> After applying the path "[PATCH] netfilter: nf_conntrack: fix RCU race
> in nf_conntrack_find_get (v3)" another bug was triggered in
> destroy_conntrack():
> <4>[67096.759334] ------------[ cut here ]------------
> <2>[67096.759353] kernel BUG at net/netfilter/nf_conntrack_core.c:211!
> ...
> <4>[67096.759837] Pid: 498649, comm: atdd veid: 666 Tainted: G         C ---------------    2.6.32-042stab084.18 #1 042stab084_18 /DQ45CB
> <4>[67096.759932] RIP: 0010:[<ffffffffa03d99ac>]  [<ffffffffa03d99ac>] destroy_conntrack+0x15c/0x190 [nf_conntrack]
> <4>[67096.760255] Call Trace:
> <4>[67096.760255]  [<ffffffff814844a7>] nf_conntrack_destroy+0x17/0x30
> <4>[67096.760255]  [<ffffffffa03d9bb5>] nf_conntrack_find_get+0x85/0x130 [nf_conntrack]
> <4>[67096.760255]  [<ffffffffa03d9fb2>] nf_conntrack_in+0x352/0xb60 [nf_conntrack]
> <4>[67096.760255]  [<ffffffffa048c771>] ipv4_conntrack_local+0x51/0x60 [nf_conntrack_ipv4]
> <4>[67096.760255]  [<ffffffff81484419>] nf_iterate+0x69/0xb0
> <4>[67096.760255]  [<ffffffff814b5b00>] ? dst_output+0x0/0x20
> <4>[67096.760255]  [<ffffffff814845d4>] nf_hook_slow+0x74/0x110
> <4>[67096.760255]  [<ffffffff814b5b00>] ? dst_output+0x0/0x20
> <4>[67096.760255]  [<ffffffff814b66d5>] raw_sendmsg+0x775/0x910
> <4>[67096.760255]  [<ffffffff8104c5a8>] ? flush_tlb_others_ipi+0x128/0x130
> <4>[67096.760255]  [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255]  [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255]  [<ffffffff814c136a>] inet_sendmsg+0x4a/0xb0
> <4>[67096.760255]  [<ffffffff81444e93>] ? sock_sendmsg+0x13/0x140
> <4>[67096.760255]  [<ffffffff81444f97>] sock_sendmsg+0x117/0x140
> <4>[67096.760255]  [<ffffffff8102e299>] ? native_smp_send_reschedule+0x49/0x60
> <4>[67096.760255]  [<ffffffff81519beb>] ? _spin_unlock_bh+0x1b/0x20
> <4>[67096.760255]  [<ffffffff8109d930>] ? autoremove_wake_function+0x0/0x40
> <4>[67096.760255]  [<ffffffff814960f0>] ? do_ip_setsockopt+0x90/0xd80
> <4>[67096.760255]  [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255]  [<ffffffff8100bc4e>] ? apic_timer_interrupt+0xe/0x20
> <4>[67096.760255]  [<ffffffff814457c9>] sys_sendto+0x139/0x190
> <4>[67096.760255]  [<ffffffff810efa77>] ? audit_syscall_entry+0x1d7/0x200
> <4>[67096.760255]  [<ffffffff810ef7c5>] ? __audit_syscall_exit+0x265/0x290
> <4>[67096.760255]  [<ffffffff81474daf>] compat_sys_socketcall+0x13f/0x210
> <4>[67096.760255]  [<ffffffff8104dea3>] ia32_sysret+0x0/0x5
> 
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Florian Westphal <fw@strlen.de>
> Cc: Cyrill Gorcunov <gorcunov@openvz.org>
> Cc: Vasiliy Averin <vvs@parallels.com>
> Signed-off-by: Andrey Vagin <avagin@openvz.org>
> ---
>  include/net/netfilter/nf_conntrack.h |  1 -
>  net/netfilter/nf_conntrack_core.c    | 18 +++++++++++-------
>  net/netfilter/nf_conntrack_netlink.c |  2 +-
>  net/netfilter/nf_synproxy_core.c     |  4 ++--
>  net/netfilter/xt_CT.c                |  2 +-
>  5 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 01ea6ee..d338316 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -243,7 +243,6 @@ void nf_ct_untracked_status_or(unsigned long bits);
>  void nf_ct_iterate_cleanup(struct net *net,
>  			   int (*iter)(struct nf_conn *i, void *data),
>  			   void *data, u32 portid, int report);
> -void nf_conntrack_free(struct nf_conn *ct);
>  struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
>  				   const struct nf_conntrack_tuple *orig,
>  				   const struct nf_conntrack_tuple *repl,
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index b56e53b..c38cc74 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -198,6 +198,8 @@ clean_from_lists(struct nf_conn *ct)
>  	nf_ct_remove_expectations(ct);
>  }
>  
> +static void nf_conntrack_free(struct nf_conn *ct);
> +
>  static void
>  destroy_conntrack(struct nf_conntrack *nfct)
>  {
> @@ -226,9 +228,8 @@ destroy_conntrack(struct nf_conntrack *nfct)
>  	 * too. */
>  	nf_ct_remove_expectations(ct);
>  
> -	/* We overload first tuple to link into unconfirmed or dying list.*/
> -	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
> -	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
> +	if (!hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode))
> +		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
>  
>  	NF_CT_STAT_INC(net, delete);
>  	spin_unlock_bh(&nf_conntrack_lock);
> @@ -772,18 +773,21 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
>  }
>  EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
>  
> -void nf_conntrack_free(struct nf_conn *ct)
> +static void nf_conntrack_free(struct nf_conn *ct)
>  {
>  	struct net *net = nf_ct_net(ct);
>  
> +	/* A freed object has refcnt == 0, thats
> +	 * the golden rule for SLAB_DESTROY_BY_RCU
> +	 */
> +	NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
> +
>  	nf_ct_ext_destroy(ct);
>  	nf_ct_ext_free(ct);
>  	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
>  	smp_mb__before_atomic_dec();
>  	atomic_dec(&net->ct.count);
>  }
> -EXPORT_SYMBOL_GPL(nf_conntrack_free);
> -
>  
>  /* Allocate a new conntrack: we return -ENOMEM if classification
>     failed due to stress.  Otherwise it really is unclassifiable. */
> @@ -835,7 +839,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
>  	}
>  
>  	if (!l4proto->new(ct, skb, dataoff, timeouts)) {
> -		nf_conntrack_free(ct);
> +		nf_ct_put(ct);
>  		pr_debug("init conntrack: can't track with proto module\n");
>  		return NULL;
>  	}
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 3e91ad3..fadd0f3 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -1732,7 +1732,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
>  err2:
>  	rcu_read_unlock();
>  err1:
> -	nf_conntrack_free(ct);
> +	nf_ct_put(ct);
>  	return ERR_PTR(err);
>  }
>  
> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
> index 9858e3e..d12234c 100644
> --- a/net/netfilter/nf_synproxy_core.c
> +++ b/net/netfilter/nf_synproxy_core.c
> @@ -381,7 +381,7 @@ static int __net_init synproxy_net_init(struct net *net)
>  err3:
>  	free_percpu(snet->stats);
>  err2:
> -	nf_conntrack_free(ct);
> +	nf_ct_put(ct);
>  err1:
>  	return err;
>  }
> @@ -390,7 +390,7 @@ static void __net_exit synproxy_net_exit(struct net *net)
>  {
>  	struct synproxy_net *snet = synproxy_pernet(net);
>  
> -	nf_conntrack_free(snet->tmpl);
> +	nf_ct_put(snet->tmpl);
>  	synproxy_proc_exit(net);
>  	free_percpu(snet->stats);
>  }
> diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
> index da35ac0..da4edfe 100644
> --- a/net/netfilter/xt_CT.c
> +++ b/net/netfilter/xt_CT.c
> @@ -237,7 +237,7 @@ out:
>  	return 0;
>  
>  err3:
> -	nf_conntrack_free(ct);
> +	nf_ct_put(ct);
>  err2:
>  	nf_ct_l3proto_module_put(par->family);
>  err1:
> -- 
> 1.8.4.2
> 

^ permalink raw reply

* Re: [PATCH 3/3] net: ethoc: document OF bindings
From: Rob Herring @ 2014-01-27 14:10 UTC (permalink / raw)
  To: Max Filippov
  Cc: linux-xtensa, netdev, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, Chris Zankel, Marc Gauthier,
	David S. Miller, Grant Likely, Rob Herring
In-Reply-To: <1390795167-6677-4-git-send-email-jcmvbkbc@gmail.com>

On Sun, Jan 26, 2014 at 9:59 PM, Max Filippov <jcmvbkbc@gmail.com> wrote:
> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> ---
>  .../devicetree/bindings/net/opencores-ethoc.txt    | 25 ++++++++++++++++++++++
>  1 file changed, 25 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/opencores-ethoc.txt
>
> diff --git a/Documentation/devicetree/bindings/net/opencores-ethoc.txt b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
> new file mode 100644
> index 0000000..f7c6c94
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
> @@ -0,0 +1,25 @@
> +* OpenCores MAC 10/100 Mbps
> +
> +Required properties:
> +- compatible: Should be "opencores,ethoc".

There are not different versions of IP or is the version probeable?

> +- reg: Address and length of the register set for the device and of its
> +  descriptor memory.
> +- interrupts: Should contain ethoc interrupt.
> +
> +Optional properties:
> +- local-mac-address: 6 bytes, mac address

There's a patch in progress to move all the standard network
properties to a common location, so you can remove this.

> +- clock-frequency: basic MAC frequency.
> +- mii-mgmt-clock-frequency: frequency of MII management bus. Together
> +  with clock-frequency determines the value programmed into the CLKDIV
> +  field of MIIMODER register.
> +
> +Examples:
> +
> +       enet0: ethoc@fd030000 {
> +               compatible = "opencores,ethoc";
> +               reg = <0xfd030000 0x4000 0xfd800000 0x4000>;
> +               interrupts = <1>;
> +               local-mac-address = [00 50 c2 13 6f 00];
> +               clock-frequency = <50000000>;
> +               mii-mgmt-clock-frequency = <5000000>;
> +        };
> --
> 1.8.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/3] net: ethoc: document OF bindings
From: Sergei Shtylyov @ 2014-01-27 14:21 UTC (permalink / raw)
  To: Rob Herring, Max Filippov
  Cc: linux-xtensa, netdev, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, Chris Zankel, Marc Gauthier,
	David S. Miller, Grant Likely, Rob Herring
In-Reply-To: <CAL_JsqLjn4FXrNGLGHs639TEqmQKPx_kk9JbAd-hn7mexzAjNg@mail.gmail.com>

Hello.

On 27-01-2014 18:10, Rob Herring wrote:

>> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
>> ---
>>   .../devicetree/bindings/net/opencores-ethoc.txt    | 25 ++++++++++++++++++++++
>>   1 file changed, 25 insertions(+)
>>   create mode 100644 Documentation/devicetree/bindings/net/opencores-ethoc.txt

>> diff --git a/Documentation/devicetree/bindings/net/opencores-ethoc.txt b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
>> new file mode 100644
>> index 0000000..f7c6c94
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
>> @@ -0,0 +1,25 @@
[...]

>> +- reg: Address and length of the register set for the device and of its
>> +  descriptor memory.
>> +- interrupts: Should contain ethoc interrupt.
>> +
>> +Optional properties:
>> +- local-mac-address: 6 bytes, mac address

> There's a patch in progress to move all the standard network
> properties to a common location, so you can remove this.

    Haven't you asked me to replace the descriptions of common Ethernet props 
with the references to the common bindings file? I'd prefer this prop to 
remain on its place in this case, just the description replaced. 
Unfortunately, my patch will now have to wait till 3.15 (I guess), the same as 
these ones.

WBR, Sergei

^ permalink raw reply

* Re: [PATCH RFC 4/6] net: rfkill: gpio: add device tree support
From: Maxime Ripard @ 2014-01-27 14:24 UTC (permalink / raw)
  To: Chen-Yu Tsai
  Cc: Johannes Berg, David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-sunxi-/JYPxA39Uh5TLH3MbocFFw,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1389941251-32692-5-git-send-email-wens-jdAy2FN1RRM@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1295 bytes --]

Hi,

On Fri, Jan 17, 2014 at 02:47:29PM +0800, Chen-Yu Tsai wrote:
> Signed-off-by: Chen-Yu Tsai <wens-jdAy2FN1RRM@public.gmane.org>
> ---
>  .../devicetree/bindings/rfkill/rfkill-gpio.txt     | 26 ++++++++++++++++++++++
>  net/rfkill/rfkill-gpio.c                           | 23 +++++++++++++++++++
>  2 files changed, 49 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/rfkill/rfkill-gpio.txt
> 
> diff --git a/Documentation/devicetree/bindings/rfkill/rfkill-gpio.txt b/Documentation/devicetree/bindings/rfkill/rfkill-gpio.txt
> new file mode 100644
> index 0000000..8a07ea4
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/rfkill/rfkill-gpio.txt
> @@ -0,0 +1,26 @@
> +GPIO controlled RFKILL devices
> +
> +Required properties:
> +- compatible	: Must be "rfkill-gpio".
> +- rfkill-name	: Name of RFKILL device
> +- rfkill-type	: Type of RFKILL device: 1 for WiFi, 2 for BlueTooth
> +- NAME_shutdown-gpios	: GPIO phandle to shutdown control
> +			  (phandle must be the second)

Can't it be handled by a regulator?

> +- NAME_reset-gpios	: GPIO phandle to reset control

And this one using the reset framework?

Maxime

-- 
Maxime Ripard, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 1/3] net: via-rhine: switch to generic DMA functions
From: Ben Hutchings @ 2014-01-27 14:49 UTC (permalink / raw)
  To: Alexey Charkov; +Cc: netdev, linux, devicetree, rl, linux-kernel
In-Reply-To: <1390823503-24087-2-git-send-email-alchark@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1803 bytes --]

On Mon, 2014-01-27 at 15:51 +0400, Alexey Charkov wrote:
> Remove legacy PCI DMA wrappers and instead use generic DMA functions
> directly in preparation for OF bus binding
> 
> Signed-off-by: Alexey Charkov <alchark@gmail.com>
> Signed-off-by: Roger Luethi <rl@hellgate.ch>
> ---
>  drivers/net/ethernet/via/via-rhine.c | 56 +++++++++++++++++++-----------------
>  1 file changed, 29 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
> index ef312bc..fee8732 100644
> --- a/drivers/net/ethernet/via/via-rhine.c
> +++ b/drivers/net/ethernet/via/via-rhine.c
> @@ -919,10 +919,10 @@ static int rhine_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
>  		goto err_out;
>  
>  	/* this should always be supported */
> -	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
> +	rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
>  	if (rc) {
>  		dev_err(&pdev->dev,
> -			"32-bit PCI DMA addresses not supported by the card!?\n");
> +			"32-bit DMA addresses not supported by the card!?\n");
>  		goto err_out;
>  	}
>  
> @@ -1094,20 +1094,22 @@ static int alloc_ring(struct net_device* dev)
>  	void *ring;
>  	dma_addr_t ring_dma;
>  
> -	ring = pci_alloc_consistent(rp->pdev,
> +	ring = dma_alloc_coherent(&rp->pdev->dev,
>  				    RX_RING_SIZE * sizeof(struct rx_desc) +
>  				    TX_RING_SIZE * sizeof(struct tx_desc),
> -				    &ring_dma);
> +				    &ring_dma,
> +					GFP_ATOMIC);
[...]

Indentation is messed up here (and in several other function calls
you're changing).  You should align the function arguments so each line
begins in the column after the opening parenthesis.

Ben.

-- 
Ben Hutchings
If at first you don't succeed, you're doing about average.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] net: via-rhine: add OF bus binding
From: Ben Hutchings @ 2014-01-27 14:57 UTC (permalink / raw)
  To: Alexey Charkov
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-ci5G2KO2hbZ+pU9mqzGVBQ,
	devicetree-u79uwXL29TY76Z2rM5mHXA, rl-7uj+XXdSDtwfv37vnLkPlQ,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1390823503-24087-4-git-send-email-alchark-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]

On Mon, 2014-01-27 at 15:51 +0400, Alexey Charkov wrote:
> This should make the driver usable with VIA/WonderMedia ARM-based
> Systems-on-Chip integrated Rhine III adapters. Note that these
> are always in MMIO mode, and don't have any known EEPROM.
[...]
> --- a/drivers/net/ethernet/via/Kconfig
> +++ b/drivers/net/ethernet/via/Kconfig
> @@ -19,7 +19,7 @@ if NET_VENDOR_VIA
>  
>  config VIA_RHINE
>         tristate "VIA Rhine support"
> -       depends on PCI
> +       depends on (PCI || USE_OF)
>         select CRC32
>         select MII
>         ---help---

This seems like the right thing to do, but it means you need to add
#ifdef CONFIG_PCI and #ifdef CONFIG_USE_OF around the driver structures
and related functions.

You should compile-test in configurations that have just one of those
dependencies enabled.

[...]
> --- a/drivers/net/ethernet/via/via-rhine.c
> +++ b/drivers/net/ethernet/via/via-rhine.c
[...]
> @@ -847,7 +856,8 @@ static void rhine_hw_init(struct net_device *dev, long pioaddr)
>  		msleep(5);
>  
>  	/* Reload EEPROM controlled bytes cleared by soft reset */
> -	rhine_reload_eeprom(pioaddr, dev);
> +	if (!strncmp(dev->dev.parent->bus->name, "pci", 3))
> +		rhine_reload_eeprom(pioaddr, dev);
[...]

Ew.  I think you should use dev_is_pci(), although you might also need
to guard that with #ifdef CONFIG_PCI.

Ben.

-- 
Ben Hutchings
If at first you don't succeed, you're doing about average.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: ath9k ARM build error with v3.13-8330-g4ba9920
From: Josh Boyer @ 2014-01-27 15:10 UTC (permalink / raw)
  To: Sujith Manoharan
  Cc: John W. Linville, ath9k-devel, netdev,
	Linux-Kernel@Vger. Kernel. Org
In-Reply-To: <21221.47960.726835.615315@gargle.gargle.HOWL>

On Sun, Jan 26, 2014 at 8:50 PM, Sujith Manoharan <sujith@msujith.org> wrote:
> Josh Boyer wrote:
>> adds a udelay(10000) call to the ath9k driver.  This will cause a
>> build error on various ARM configs because the value passed to udelay
>> is too large:
>>
>> ERROR: "__bad_udelay" [drivers/net/wireless/ath/ath9k/ath9k_hw.ko] undefined!
>> make[1]: *** [__modpost] Error 1
>> make: *** [modules] Error 2
>>
>> Is the 10000 microsecond udelay really required?  I believe the limit
>> on ARM is 2000.  Perhaps something else could be done in this case?
>
> The delay is a workaround for a HW issue. Does this patch fix the problem ?

It fixes the build error, yes.  I don't have actual HW to test, but
the driver compiles on ARM now with the same config as before.
Thanks!

josh

>
> diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
> index fbf43c0..11eab9f 100644
> --- a/drivers/net/wireless/ath/ath9k/hw.c
> +++ b/drivers/net/wireless/ath/ath9k/hw.c
> @@ -1316,7 +1316,7 @@ static bool ath9k_hw_set_reset(struct ath_hw *ah, int type)
>         if (AR_SREV_9300_20_OR_LATER(ah))
>                 udelay(50);
>         else if (AR_SREV_9100(ah))
> -               udelay(10000);
> +               mdelay(10);
>         else
>                 udelay(100);
>
> @@ -2051,9 +2051,8 @@ static bool ath9k_hw_set_power_awake(struct ath_hw *ah)
>
>         REG_SET_BIT(ah, AR_RTC_FORCE_WAKE,
>                     AR_RTC_FORCE_WAKE_EN);
> -
>         if (AR_SREV_9100(ah))
> -               udelay(10000);
> +               mdelay(10);
>         else
>                 udelay(50);
>
>
> Sujith

^ permalink raw reply

* [PATCH net] bnx2x: More Shutdown revisions
From: Yuval Mintz @ 2014-01-27 15:11 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz, Ariel Elior

Submission d9aee59 "bnx2x: Don't release PCI bars on shutdown" separated
the PCI remove and shutdown flows, but pci_disable_device() is still
being called on both.
As a result, a dev_WARN_ONCE will be hit during shutdown for every bnx2x
VF probed on a hypervisor (as its shutdown callback will be called and later
pci_disable_sriov() will call its remove callback).

This calls the pci_disable_device() only on the remove flow.

Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Ariel Elior <ariele@broadcom.com>
---
Hi Dave,

Please consider applying this to `net' (or possibly net-next;
not sure exactly how things currently work).

Thanks,
Yuval
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index e118a3e..c9c445e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13102,9 +13102,9 @@ static void __bnx2x_remove(struct pci_dev *pdev,
 
 		if (atomic_read(&pdev->enable_cnt) == 1)
 			pci_release_regions(pdev);
-	}
 
-	pci_disable_device(pdev);
+		pci_disable_device(pdev);
+	}
 }
 
 static void bnx2x_remove_one(struct pci_dev *pdev)
-- 
1.8.1.227.g44fe835

^ permalink raw reply related

* Re: [PATCH 1/3] net: via-rhine: switch to generic DMA functions
From: Alexey Charkov @ 2014-01-27 15:26 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Tony Prisk,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Roger Luethi,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1390834190.2735.143.camel-nDn/Rdv9kqW9Jme8/bJn5UCKIB8iOfG2tUK59QYPAWc@public.gmane.org>

2014/1/27 Ben Hutchings <ben-/+tVBieCtBitmTQ+vhA3Yw@public.gmane.org>:
> On Mon, 2014-01-27 at 15:51 +0400, Alexey Charkov wrote:
>> Remove legacy PCI DMA wrappers and instead use generic DMA functions
>> directly in preparation for OF bus binding
>>
>> Signed-off-by: Alexey Charkov <alchark-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> Signed-off-by: Roger Luethi <rl-7uj+XXdSDtwfv37vnLkPlQ@public.gmane.org>
>> ---
>>  drivers/net/ethernet/via/via-rhine.c | 56 +++++++++++++++++++-----------------
>>  1 file changed, 29 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
>> index ef312bc..fee8732 100644
>> --- a/drivers/net/ethernet/via/via-rhine.c
>> +++ b/drivers/net/ethernet/via/via-rhine.c
>> @@ -919,10 +919,10 @@ static int rhine_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
>>               goto err_out;
>>
>>       /* this should always be supported */
>> -     rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
>> +     rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
>>       if (rc) {
>>               dev_err(&pdev->dev,
>> -                     "32-bit PCI DMA addresses not supported by the card!?\n");
>> +                     "32-bit DMA addresses not supported by the card!?\n");
>>               goto err_out;
>>       }
>>
>> @@ -1094,20 +1094,22 @@ static int alloc_ring(struct net_device* dev)
>>       void *ring;
>>       dma_addr_t ring_dma;
>>
>> -     ring = pci_alloc_consistent(rp->pdev,
>> +     ring = dma_alloc_coherent(&rp->pdev->dev,
>>                                   RX_RING_SIZE * sizeof(struct rx_desc) +
>>                                   TX_RING_SIZE * sizeof(struct tx_desc),
>> -                                 &ring_dma);
>> +                                 &ring_dma,
>> +                                     GFP_ATOMIC);
> [...]
>
> Indentation is messed up here (and in several other function calls
> you're changing).  You should align the function arguments so each line
> begins in the column after the opening parenthesis.

Ben, thanks for pointing out. I actually just tried to follow the
style of surrounding code, but happy to adjust if that's the preferred
option. From what I can see, these lines should still fit in below 80
cols even with increased indents...

Should we then also adjust other function calls within the driver with
similar indentation (if any), that are currently not touched by this
patch series?

Thanks,
Alexey
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/3] net: via-rhine: switch to generic DMA functions
From: Ben Hutchings @ 2014-01-27 15:28 UTC (permalink / raw)
  To: Alexey Charkov
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Tony Prisk,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Roger Luethi,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <CABjd4Yyq_Mia=2DkC83Wpi6o1o3m5+yoMcDL4WdZmuLvRtu6qg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1590 bytes --]

On Mon, 2014-01-27 at 19:26 +0400, Alexey Charkov wrote:
> 2014/1/27 Ben Hutchings <ben-/+tVBieCtBitmTQ+vhA3Yw@public.gmane.org>:
> > On Mon, 2014-01-27 at 15:51 +0400, Alexey Charkov wrote:
[...]
> >> @@ -1094,20 +1094,22 @@ static int alloc_ring(struct net_device* dev)
> >>       void *ring;
> >>       dma_addr_t ring_dma;
> >>
> >> -     ring = pci_alloc_consistent(rp->pdev,
> >> +     ring = dma_alloc_coherent(&rp->pdev->dev,
> >>                                   RX_RING_SIZE * sizeof(struct rx_desc) +
> >>                                   TX_RING_SIZE * sizeof(struct tx_desc),
> >> -                                 &ring_dma);
> >> +                                 &ring_dma,
> >> +                                     GFP_ATOMIC);
> > [...]
> >
> > Indentation is messed up here (and in several other function calls
> > you're changing).  You should align the function arguments so each line
> > begins in the column after the opening parenthesis.
> 
> Ben, thanks for pointing out. I actually just tried to follow the
> style of surrounding code, but happy to adjust if that's the preferred
> option. From what I can see, these lines should still fit in below 80
> cols even with increased indents...
> 
> Should we then also adjust other function calls within the driver with
> similar indentation (if any), that are currently not touched by this
> patch series?

There is no need to do that at the same time, but it would be a nice bit
of cleanup.

Ben.

-- 
Ben Hutchings
If at first you don't succeed, you're doing about average.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] net: via-rhine: add OF bus binding
From: Alexey Charkov @ 2014-01-27 15:34 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Tony Prisk,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Roger Luethi,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1390834654.2735.148.camel-nDn/Rdv9kqW9Jme8/bJn5UCKIB8iOfG2tUK59QYPAWc@public.gmane.org>

2014/1/27 Ben Hutchings <ben-/+tVBieCtBitmTQ+vhA3Yw@public.gmane.org>:
> On Mon, 2014-01-27 at 15:51 +0400, Alexey Charkov wrote:
>> This should make the driver usable with VIA/WonderMedia ARM-based
>> Systems-on-Chip integrated Rhine III adapters. Note that these
>> are always in MMIO mode, and don't have any known EEPROM.
> [...]
>> --- a/drivers/net/ethernet/via/Kconfig
>> +++ b/drivers/net/ethernet/via/Kconfig
>> @@ -19,7 +19,7 @@ if NET_VENDOR_VIA
>>
>>  config VIA_RHINE
>>         tristate "VIA Rhine support"
>> -       depends on PCI
>> +       depends on (PCI || USE_OF)
>>         select CRC32
>>         select MII
>>         ---help---
>
> This seems like the right thing to do, but it means you need to add
> #ifdef CONFIG_PCI and #ifdef CONFIG_USE_OF around the driver structures
> and related functions.

Frankly, I would like to avoid that if possible (as pointed out in the
cover email), as I believe we would get a cleaner driver without
#ifdef. This is also the way it was done in via-velocity, and it works
just fine.

> You should compile-test in configurations that have just one of those
> dependencies enabled.

This has been compile-tested and runtime-tested in OF-only
configuration on WM8950, and Roger also tested it in PCI-only
configuration, so it seems to work fine.

> [...]
>> --- a/drivers/net/ethernet/via/via-rhine.c
>> +++ b/drivers/net/ethernet/via/via-rhine.c
> [...]
>> @@ -847,7 +856,8 @@ static void rhine_hw_init(struct net_device *dev, long pioaddr)
>>               msleep(5);
>>
>>       /* Reload EEPROM controlled bytes cleared by soft reset */
>> -     rhine_reload_eeprom(pioaddr, dev);
>> +     if (!strncmp(dev->dev.parent->bus->name, "pci", 3))
>> +             rhine_reload_eeprom(pioaddr, dev);
> [...]
>
> Ew.  I think you should use dev_is_pci(), although you might also need
> to guard that with #ifdef CONFIG_PCI.

Oh, cool. Didn't realize it existed :) Will adjust, thanks.

I believe the #ifdef is not strictly required, though, as we include
the PCI header anyway (and the macro expands to just a simple test).
Any specific concerns why we should do that, apart from the +3.8%
module size increase?

Thanks,
Alexey
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/3] net: ethoc: document OF bindings
From: Max Filippov @ 2014-01-27 15:52 UTC (permalink / raw)
  To: Rob Herring
  Cc: linux-xtensa@linux-xtensa.org, netdev, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, Chris Zankel, Marc Gauthier,
	David S. Miller, Grant Likely, Rob Herring
In-Reply-To: <CAL_JsqLjn4FXrNGLGHs639TEqmQKPx_kk9JbAd-hn7mexzAjNg@mail.gmail.com>

Hi Rob,

On Mon, Jan 27, 2014 at 6:10 PM, Rob Herring <robherring2@gmail.com> wrote:
> On Sun, Jan 26, 2014 at 9:59 PM, Max Filippov <jcmvbkbc@gmail.com> wrote:
>> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
>> ---
>>  .../devicetree/bindings/net/opencores-ethoc.txt    | 25 ++++++++++++++++++++++
>>  1 file changed, 25 insertions(+)
>>  create mode 100644 Documentation/devicetree/bindings/net/opencores-ethoc.txt
>>
>> diff --git a/Documentation/devicetree/bindings/net/opencores-ethoc.txt b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
>> new file mode 100644
>> index 0000000..f7c6c94
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/net/opencores-ethoc.txt
>> @@ -0,0 +1,25 @@
>> +* OpenCores MAC 10/100 Mbps
>> +
>> +Required properties:
>> +- compatible: Should be "opencores,ethoc".
>
> There are not different versions of IP or is the version probeable?

AFAIK there's single version of this 10/100 MAC.
It doesn't have any identification registers.

>> +- reg: Address and length of the register set for the device and of its
>> +  descriptor memory.
>> +- interrupts: Should contain ethoc interrupt.
>> +
>> +Optional properties:
>> +- local-mac-address: 6 bytes, mac address
>
> There's a patch in progress to move all the standard network
> properties to a common location, so you can remove this.

Will do.

>> +- clock-frequency: basic MAC frequency.
>> +- mii-mgmt-clock-frequency: frequency of MII management bus. Together
>> +  with clock-frequency determines the value programmed into the CLKDIV
>> +  field of MIIMODER register.
>> +
>> +Examples:
>> +
>> +       enet0: ethoc@fd030000 {
>> +               compatible = "opencores,ethoc";
>> +               reg = <0xfd030000 0x4000 0xfd800000 0x4000>;
>> +               interrupts = <1>;
>> +               local-mac-address = [00 50 c2 13 6f 00];
>> +               clock-frequency = <50000000>;
>> +               mii-mgmt-clock-frequency = <5000000>;
>> +        };
>> --
>> 1.8.1.4

-- 
Thanks.
-- Max

^ permalink raw reply

* Re: [PATCH 3/3] net: via-rhine: add OF bus binding
From: Rob Herring @ 2014-01-27 15:56 UTC (permalink / raw)
  To: Alexey Charkov
  Cc: netdev, Tony Prisk,
	devicetree-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	rl-7uj+XXdSDtwfv37vnLkPlQ,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1390823503-24087-4-git-send-email-alchark-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Mon, Jan 27, 2014 at 5:51 AM, Alexey Charkov <alchark-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> This should make the driver usable with VIA/WonderMedia ARM-based
> Systems-on-Chip integrated Rhine III adapters. Note that these
> are always in MMIO mode, and don't have any known EEPROM.
>
> Signed-off-by: Alexey Charkov <alchark-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Signed-off-by: Roger Luethi <rl-7uj+XXdSDtwfv37vnLkPlQ@public.gmane.org>
> ---
>  .../devicetree/bindings/net/via-rhine.txt          |  18 ++
>  drivers/net/ethernet/via/Kconfig                   |   2 +-
>  drivers/net/ethernet/via/via-rhine.c               | 293 +++++++++++++--------
>  3 files changed, 200 insertions(+), 113 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/via-rhine.txt
>
> diff --git a/Documentation/devicetree/bindings/net/via-rhine.txt b/Documentation/devicetree/bindings/net/via-rhine.txt
> new file mode 100644
> index 0000000..684dd3a
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/via-rhine.txt
> @@ -0,0 +1,18 @@
> +* VIA Rhine 10/100 Network Controller
> +
> +Required properties:
> +- compatible : Should be "via,rhine"

This should be more specific rather than...

> +- reg : Address and length of the io space
> +- interrupts : Should contain the controller interrupt line
> +- rhine,revision : Rhine core revision, used to inform the
> +       driver of quirks and capabilities to expect from
> +       the device. Mimics the respective PCI attribute.

having this property. The OF match table can then have the quirks set
based on compatible strings.

Rob
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [PATCH net] net: hyperv: initialize link status correctly
From: Haiyang Zhang @ 2014-01-27 16:05 UTC (permalink / raw)
  To: Jason Wang, KY Srinivasan, devel@linuxdriverproject.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1390807854-4469-1-git-send-email-jasowang@redhat.com>



> -----Original Message-----
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Monday, January 27, 2014 2:31 AM
> To: KY Srinivasan; Haiyang Zhang; devel@linuxdriverproject.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org
> Cc: Jason Wang
> Subject: [PATCH net] net: hyperv: initialize link status correctly
> 
> Call netif_carrier_on() after register_device(). Otherwise it won't work since
> the device was still in NETREG_UNINITIALIZED state.
> 
> Fixes a68f9614614749727286f675d15f1e09d13cb54a
> (hyperv: Fix race between probe and open calls)
> 
> Cc: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: K. Y. Srinivasan <kys@microsoft.com>
> Reported-by: Di Nie <dnie@redhat.com>
> Tested-by: Di Nie <dnie@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---

I'm working on a fix for this, and will submit it soon.

Thanks,
- Haiyang

^ permalink raw reply

* Many USB ethernet devices are broken over xhci
From: David Laight @ 2014-01-27 16:06 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Sarah Sharp

Many of the net/usb ethernet drivers (including common ones like
the smsc95xx) will fail to transmit all packet length properly
when connected to a USB3 port (ie using the xhci driver).

The underlying problem is that they assume the host controller
will honour the URB_ZERO_PACKET flag - which usbnet sets because
they specify FLAG_SEND_ZLP.

However no one has ever added support for URB_ZERO_PACKET to
the xhci driver - so packets that need padding (probably 512
bytes after any header is added) will not be sent correctly
and may have very adverse effects on the usb target.

The ax179_178a driver avoids this by not setting FLAG_SEND_ZLP,
modifying the packet header, and appending an extra zero byte.
(Which has been responsible for its own set of panics.)

I don't think this can be fixed by just clearing (or ignoring)
FLAG_SEND_ZLP as the extra byte will also confuse things.

It needs to be fixed in the xhci code.

I wrote this patch a while ago - worked for me with the ax179_178a
driver. http://www.spinics.net/lists/linux-usb/msg97370.html

The patch is a bit difficult to read, the v1 version contained a copy of
the new function. http://www.spinics.net/lists/linux-usb/msg97183.html

I don't think anything significant has changed (in the main kernel
sources) since I wrote the patch.

	David




--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/3] net: ethoc: don't advertise gigabit speed on attached PHY
From: Max Filippov @ 2014-01-27 16:17 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: linux-xtensa@linux-xtensa.org, netdev, devicetree@vger.kernel.org,
	LKML, Chris Zankel, Marc Gauthier, David S. Miller, Grant Likely,
	Rob Herring
In-Reply-To: <1390817904.2735.127.camel@deadeye.wl.decadent.org.uk>

Hi Ben,

On Mon, Jan 27, 2014 at 2:18 PM, Ben Hutchings <ben@decadent.org.uk> wrote:
> On Mon, 2014-01-27 at 07:59 +0400, Max Filippov wrote:
>> OpenCores 10/100 Mbps MAC does not support speeds above 100 Mbps, but does
>> not disable advertisement when PHY supports them. This results in
>> non-functioning network when the MAC is connected to a gigabit PHY connected
>> to a gigabit switch.
>>
>> The fix is to disable gigabit speed advertisement on attached PHY
>> unconditionally.
>>
>> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
>> ---
>>  drivers/net/ethernet/ethoc.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
>> index 4de8cfd..0aa1a05 100644
>> --- a/drivers/net/ethernet/ethoc.c
>> +++ b/drivers/net/ethernet/ethoc.c
>> @@ -712,6 +712,8 @@ static int ethoc_open(struct net_device *dev)
>>               netif_start_queue(dev);
>>       }
>>
>> +     priv->phy->advertising &= ~(ADVERTISED_1000baseT_Full |
>> +                                 ADVERTISED_1000baseT_Half);
>>       phy_start(priv->phy);
>>       napi_enable(&priv->napi);
>>
>
> This is not sufficient to disable gigabit speeds; the supported mask
> should also be limited.  And it should be done even before the net

I tried that, but when I also limit supported mask the phy driver doesn't
touch gigabit advertising register int the genphy_config_advert at all.
That's probably right for ethtool interface, but ethoc doesn't support
ethtool.

> device is registered.
>
> Rather than poking into the phy_device structure directly from this
> driver, I think you should add a function in phylib for this purpose.

Like below?

---8<---
>From 347331f399626ecaa9a8e54252f55e0b6788772f Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Mon, 27 Jan 2014 04:01:40 +0400
Subject: [PATCH 1/3] net: ethoc: don't advertise gigabit speed on attached PHY

OpenCores 10/100 Mbps MAC does not support speeds above 100 Mbps, but does
not disable advertisement when PHY supports them. This results in
non-functioning network when the MAC is connected to a gigabit PHY connected
to a gigabit switch.

The fix is to disable gigabit speed advertisement on attached PHY
unconditionally.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 drivers/net/ethernet/ethoc.c | 3 +++
 include/linux/phy.h          | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 4de8cfd..e817d58 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -688,6 +688,9 @@ static int ethoc_mdio_probe(struct net_device *dev)
 	}

 	priv->phy = phy;
+	phy_update_adv(phy,
+		       ~(ADVERTISED_1000baseT_Full |
+			 ADVERTISED_1000baseT_Half), 0);
 	return 0;
 }

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 48a4dc3..0282a8d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -559,6 +559,11 @@ static inline int phy_read_status(struct phy_device *phydev) {
 	return phydev->drv->read_status(phydev);
 }

+static inline void phy_update_adv(struct phy_device *phydev, u32 mask, u32 set)
+{
+	phydev->advertising = (phydev->advertising & mask) | set;
+}
+
 int genphy_setup_forced(struct phy_device *phydev);
 int genphy_restart_aneg(struct phy_device *phydev);
 int genphy_config_aneg(struct phy_device *phydev);
-- 
1.8.1.4


-- 
Thanks.
-- Max

^ permalink raw reply related

* [PATCH] AF_PACKET: Add documentation for queue mapping fanout mode
From: Neil Horman @ 2014-01-27 16:43 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, David S. Miller, Daniel Borkmann

Recently I added a new AF_PACKET fanout operation mode in commit
2d36097, but I forgot to document it.  Add PACKET_FANOUT_QM as an available mode
in the af_packet documentation.  Applies to net-next.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Daniel Borkmann <dborkman@redhat.com>
---
 Documentation/networking/packet_mmap.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 91ffe1d..1404674 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -583,6 +583,7 @@ Currently implemented fanout policies are:
   - PACKET_FANOUT_CPU: schedule to socket by CPU packet arrives on
   - PACKET_FANOUT_RND: schedule to socket by random selection
   - PACKET_FANOUT_ROLLOVER: if one socket is full, rollover to another
+  - PACKET_FANOUT_QM: schedule to socket by skbs recorded queue_mapping
 
 Minimal example code by David S. Miller (try things like "./test eth0 hash",
 "./test eth0 lb", etc.):
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH] AF_PACKET: Add documentation for queue mapping fanout mode
From: Daniel Borkmann @ 2014-01-27 16:59 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, David S. Miller
In-Reply-To: <1390840984-22246-1-git-send-email-nhorman@tuxdriver.com>

On 01/27/2014 05:43 PM, Neil Horman wrote:
> Recently I added a new AF_PACKET fanout operation mode in commit
> 2d36097, but I forgot to document it.  Add PACKET_FANOUT_QM as an available mode
> in the af_packet documentation.  Applies to net-next.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: Daniel Borkmann <dborkman@redhat.com>

Thanks a bunch, Neil.

Acked-by: Daniel Borkmann <dborkman@redhat.com>

^ permalink raw reply

* [PATCH V4] net: ip, ipv6: handle gso skbs in forwarding path
From: Florian Westphal @ 2014-01-27 17:58 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal

Marcelo Ricardo Leitner reported problems when the forwarding link
path has a lower mtu than the incoming link if the inbound interface
supports GRO.

Given:
Host <mtu1500> R1 <mtu1200> R2

Host sends tcp stream which is routed via R1 and R2.  R1 performs GRO.

In this case, the kernel will fail to send ICMP fragmentation needed
messages (or pkt too big for ipv6), as gso packets currently bypass the
dst mtu checks in forward path. Instead, Linux tries to send out packets
exceeding R1-R2 link mtu.

When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does
not fragment the packets when forwarding, and again tries to send out
packets exceeding R1-R2 link mtu.

This alters the forwarding dstmtu checks to take the individual gso
segment lengths into account.

For ipv6, we send out pkt too big error for gso if the individual
segments are too big.

For ipv4, we either send icmp fragmentation needed, or, if the DF bit
is not set, perform software segmentation and let the output path
create fragments when the packet is leaving the machine.
It is not 100% correct as the error message will contain the headers of
the GRO skb instead of the original/segmented one, but it seems to
work fine in my (limited) tests.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
Changes since V3:
 - use ip_dst_mtu_maybe_forward instead of dst_mtu
 - add comment wrt. DF bit not being set

Changes since V2:
 - make this thing apply to current -net tree
 - kill unused variables in ip_forward/ip6_output

Changes since V1:
 suggestions from Eric Dumazet:
  - skip more expensive computation for small packets in fwd path
  - use netif_skb_features() feature mask and remove GSO flags
    instead of using 0 feature set.

 include/linux/skbuff.h | 17 ++++++++++++
 net/ipv4/ip_forward.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_output.c  | 17 ++++++++++--
 3 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f589c9a..3ebbbe7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2916,5 +2916,22 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb)
 {
 	return !skb->head_frag || skb_cloned(skb);
 }
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) -
+			       skb_network_header(skb);
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index e9f1217..2a9602f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,6 +39,71 @@
 #include <net/route.h>
 #include <net/xfrm.h>
 
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+	return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+	       !skb->local_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu || skb->local_df)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+		return false;
+
+	return true;
+}
+
+static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
+{
+	unsigned int mtu;
+
+	if (skb->local_df || !skb_is_gso(skb))
+		return false;
+
+	mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true);
+
+	/* if seglen is > mtu, we must do software segmentation to
+	 * enable proper IP fragmentation on output.
+	 *
+	 * In > mtu case DF bit cannot be set since we'd have already
+	 * dropped skb and sent icmp frag needed error in ip_forward().
+	 */
+	return skb_gso_network_seglen(skb) > mtu;
+}
+
+/* called if GSO skb needs to be fragmented on forward */
+static int ip_forward_finish_gso(struct sk_buff *skb)
+{
+	netdev_features_t features = netif_skb_features(skb);
+	struct sk_buff *segs;
+	int ret = 0;
+
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+	if (IS_ERR(segs)) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	consume_skb(skb);
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		segs->next = NULL;
+		err = dst_output(segs);
+
+		if (err && ret == 0)
+			ret = err;
+		segs = nskb;
+	} while (segs);
+
+	return ret;
+}
+
 static int ip_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options *opt	= &(IPCB(skb)->opt);
@@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb)
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
+	if (ip_gso_exceeds_dst_mtu(skb))
+		return ip_forward_finish_gso(skb);
+
 	return dst_output(skb);
 }
 
@@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb)
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
 	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
-	if (unlikely(skb->len > mtu && !skb_is_gso(skb) &&
-		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
 		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(mtu));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ef02b26..070a2fa 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -342,6 +342,20 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 	return mtu;
 }
 
+static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu || skb->local_df)
+		return false;
+
+	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
+		return true;
+
+	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+		return false;
+
+	return true;
+}
+
 int ip6_forward(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -466,8 +480,7 @@ int ip6_forward(struct sk_buff *skb)
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 
-	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
-	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
+	if (ip6_pkt_too_big(skb, mtu)) {
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-- 
1.8.1.5

^ permalink raw reply related

* Re: [PATCH 2/2] net: ip, ipv6: handle gso skbs in forwarding path
From: Eric Dumazet @ 2014-01-27 18:22 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netdev
In-Reply-To: <1390810971-23959-2-git-send-email-fw@strlen.de>

On Mon, 2014-01-27 at 09:22 +0100, Florian Westphal wrote:

> +/* called if GSO skb needs to be fragmented on forward.  */
> +static int ip_forward_finish_gso(struct sk_buff *skb)
> +{
> +	netdev_features_t features = netif_skb_features(skb);
> +	struct sk_buff *segs;
> +	int ret = 0;
> +
> +	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
> +	if (IS_ERR(segs)) {
> +		kfree_skb(skb);
> +		return -ENOMEM;
> +	}
> +
> +	consume_skb(skb);
> +
> +	do {
> +		struct sk_buff *nskb = segs->next;
> +		int err;
> +
> +		segs->next = NULL;
> +		err = dst_output(segs);
> +
> +		if (err && ret == 0)
> +			ret = err;
> +		segs = nskb;
> +	} while (segs);
> +
> +	return ret;
> +}
> +

Its still unclear if this is the best strategy.

TCP stream not using DF flag are very unlikely to care if we adjust
their MTU (lowering gso_size) at this point ?

^ permalink raw reply

* Re: [PATCH 1/3] net: ethoc: don't advertise gigabit speed on attached PHY
From: Florian Fainelli @ 2014-01-27 19:36 UTC (permalink / raw)
  To: Max Filippov
  Cc: Ben Hutchings,
	linux-xtensa-PjhNF2WwrV/0Sa2dR60CXw@public.gmane.org, netdev,
	devicetree-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, LKML,
	Chris Zankel, Marc Gauthier, David S. Miller, Grant Likely,
	Rob Herring
In-Reply-To: <52E6868C.3070401-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

2014-01-27 Max Filippov <jcmvbkbc-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>:
> Hi Ben,
>
> On Mon, Jan 27, 2014 at 2:18 PM, Ben Hutchings <ben-/+tVBieCtBitmTQ+vhA3Yw@public.gmane.org> wrote:
>> On Mon, 2014-01-27 at 07:59 +0400, Max Filippov wrote:
>>> OpenCores 10/100 Mbps MAC does not support speeds above 100 Mbps, but does
>>> not disable advertisement when PHY supports them. This results in
>>> non-functioning network when the MAC is connected to a gigabit PHY connected
>>> to a gigabit switch.
>>>
>>> The fix is to disable gigabit speed advertisement on attached PHY
>>> unconditionally.
>>>
>>> Signed-off-by: Max Filippov <jcmvbkbc-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>> ---
>>>  drivers/net/ethernet/ethoc.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
>>> index 4de8cfd..0aa1a05 100644
>>> --- a/drivers/net/ethernet/ethoc.c
>>> +++ b/drivers/net/ethernet/ethoc.c
>>> @@ -712,6 +712,8 @@ static int ethoc_open(struct net_device *dev)
>>>               netif_start_queue(dev);
>>>       }
>>>
>>> +     priv->phy->advertising &= ~(ADVERTISED_1000baseT_Full |
>>> +                                 ADVERTISED_1000baseT_Half);
>>>       phy_start(priv->phy);
>>>       napi_enable(&priv->napi);
>>>
>>
>> This is not sufficient to disable gigabit speeds; the supported mask
>> should also be limited.  And it should be done even before the net
>
> I tried that, but when I also limit supported mask the phy driver doesn't
> touch gigabit advertising register int the genphy_config_advert at all.
> That's probably right for ethtool interface, but ethoc doesn't support
> ethtool.

This is not right for libphy either, phydev->supported must reflect
that you support Gigabit or not.

Since ethoc supports libphy, there really is no reason not to
implement the ethtool_{get,set}_settings callbacks using
phy_mii_ioctl().

>
>> device is registered.
>>
>> Rather than poking into the phy_device structure directly from this
>> driver, I think you should add a function in phylib for this purpose.

All drivers are currently modifying phydev->advertising and
phydev->supported directly, most of them do it correctly as far as I
checked. It does make some sense to add a helper function though.

>
> Like below?
>
> ---8<---
> From 347331f399626ecaa9a8e54252f55e0b6788772f Mon Sep 17 00:00:00 2001
> From: Max Filippov <jcmvbkbc-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Date: Mon, 27 Jan 2014 04:01:40 +0400
> Subject: [PATCH 1/3] net: ethoc: don't advertise gigabit speed on attached PHY
>
> OpenCores 10/100 Mbps MAC does not support speeds above 100 Mbps, but does
> not disable advertisement when PHY supports them. This results in
> non-functioning network when the MAC is connected to a gigabit PHY connected
> to a gigabit switch.
>
> The fix is to disable gigabit speed advertisement on attached PHY
> unconditionally.
>
> Signed-off-by: Max Filippov <jcmvbkbc-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> ---
>  drivers/net/ethernet/ethoc.c | 3 +++
>  include/linux/phy.h          | 5 +++++
>  2 files changed, 8 insertions(+)
>
> diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
> index 4de8cfd..e817d58 100644
> --- a/drivers/net/ethernet/ethoc.c
> +++ b/drivers/net/ethernet/ethoc.c
> @@ -688,6 +688,9 @@ static int ethoc_mdio_probe(struct net_device *dev)
>         }
>
>         priv->phy = phy;
> +       phy_update_adv(phy,
> +                      ~(ADVERTISED_1000baseT_Full |
> +                        ADVERTISED_1000baseT_Half), 0);
>         return 0;
>  }
>
> diff --git a/include/linux/phy.h b/include/linux/phy.h
> index 48a4dc3..0282a8d 100644
> --- a/include/linux/phy.h
> +++ b/include/linux/phy.h
> @@ -559,6 +559,11 @@ static inline int phy_read_status(struct phy_device *phydev) {
>         return phydev->drv->read_status(phydev);
>  }
>
> +static inline void phy_update_adv(struct phy_device *phydev, u32 mask, u32 set)
> +{
> +       phydev->advertising = (phydev->advertising & mask) | set;
> +}
> +

This should be a preliminary patch to this patchset, so you first
introduce the function, then use it in the driver, but the idea looks
good. There is room for updating quite some drivers out there since
those used to modify phydev->advertising and phydev->supported
directly without using an accessor.

>  int genphy_setup_forced(struct phy_device *phydev);
>  int genphy_restart_aneg(struct phy_device *phydev);
>  int genphy_config_aneg(struct phy_device *phydev);
> --
> 1.8.1.4
>
>
> --
> Thanks.
> -- Max
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



-- 
Florian
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox