Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-10  9:50 UTC (permalink / raw)
  To: remi.denis-courmont; +Cc: netdev, Dinesh Kumar Sharma, Hemant Ramdasi

From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>

This provides flexibility to set the pipe handle
using setsockopt and enable the same.

Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
 include/linux/phonet.h |    2 +
 net/phonet/pep.c       |   90 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..491caec 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,8 @@
 #define PNPIPE_ENCAP		1
 #define PNPIPE_IFINDEX		2
 #define PNPIPE_HANDLE		3
+#define PNPIPE_ENABLE		4
+#define PNPIPE_INITSTATE	5
 
 #define PNADDR_ANY		0
 #define PNADDR_BROADCAST	0xFC
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..f8057a1 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -167,6 +167,12 @@ static int pipe_handler_send_created_ind(struct sock *sk)
 				data, 4, GFP_ATOMIC);
 }
 
+static int pipe_handler_send_enabled_ind(struct sock *sk)
+{
+	return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+				NULL, 0, GFP_ATOMIC);
+}
+
 static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
 {
 	static const u8 data[20] = {
@@ -533,6 +539,17 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
 	return pipe_handler_send_created_ind(sk);
 }
 
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+
+	if (hdr->error_code != PN_PIPE_NO_ERROR)
+		return -ECONNREFUSED;
+
+	return pipe_handler_send_enabled_ind(sk);
+}
+
+
 /* Queue an skb to an actively connected sock.
  * Socket lock must be held. */
 static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,6 +595,28 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
 			sk->sk_state = TCP_CLOSE_WAIT;
 			break;
 		}
+		if (pn->init_enable == PN_PIPE_DISABLE)
+			sk->sk_state = TCP_SYN_RECV;
+		else {
+			sk->sk_state = TCP_ESTABLISHED;
+
+			if (!pn_flow_safe(pn->tx_fc)) {
+				atomic_set(&pn->tx_credits, 1);
+				sk->sk_write_space(sk);
+			}
+			pipe_grant_credits(sk, GFP_ATOMIC);
+
+		}
+		break;
+
+	case PNS_PEP_ENABLE_RESP:
+		if (sk->sk_state != TCP_SYN_SENT)
+			break;
+
+		if (pep_enableresp_rcv(sk, skb)) {
+			sk->sk_state = TCP_CLOSE_WAIT;
+			break;
+		}
 
 		sk->sk_state = TCP_ESTABLISHED;
 		if (!pn_flow_safe(pn->tx_fc)) {
@@ -863,9 +902,26 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
 	int err;
 	u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
 
-	pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+	if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+		pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
 	err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
-					PN_PIPE_ENABLE, data, 4);
+				pn->init_enable, data, 4);
+	if (err)
+		return err;
+
+	sk->sk_state = TCP_SYN_SENT;
+
+	return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int err;
+
+	err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+				NULL, 0);
 	if (err) {
 		pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
 		return err;
@@ -959,6 +1015,29 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 		}
 		goto out_norel;
 
+	case PNPIPE_HANDLE:
+		if ((val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
+			pn->pipe_handle = val;
+		else
+			err = -EINVAL;
+		break;
+
+	case PNPIPE_ENABLE:
+		if (sk->sk_state == TCP_SYN_SENT)
+			err = -EBUSY;
+		if (sk->sk_state == TCP_ESTABLISHED)
+			err = -EISCONN;
+		else
+			err = pep_sock_enable(sk, NULL, 0);
+		break;
+
+	case PNPIPE_INITSTATE:
+		if ((val == PN_PIPE_DISABLE) || (val == PN_PIPE_ENABLE))
+			pn->init_enable = val;
+		else
+			err = -EINVAL;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -994,6 +1073,13 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		break;
 
+	case PNPIPE_ENABLE:
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EINVAL;
+		else
+			val = 1;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.4.3

^ permalink raw reply related

* [PATCH RESUBMITTED net-next 2/2] IPv6 - support for NLM_F_* flags at IPv6 routing requests
From: Matti Vaittinen @ 2011-11-10 10:07 UTC (permalink / raw)
  To: ext David Miller; +Cc: netdev

The support for NLM_F_* flags at IPv6 routing requests.

If NLM_F_CREATE flag is not defined for RTM_NEWROUTE request,
warning is printed, but no error is returned. Instead new route is added.

Exception is when NLM_F_REPLACE flag is given without NLM_F_CREATE, and
no matching route is found. In this case it should be safe to assume
that the request issuer is familiar with NLM_F_* flags, and does really
not want route to be created.

Specifying NLM_F_REPLACE flag will now make the kernel to search for
matching route, and replace it with new one. If no route is found and
NLM_F_CREATE is specified as well, then new route is created.

Also, specifying NLM_F_EXCL will yield returning of error if matching route
is found.

Patch created against linux-3.2-rc1


Signed-off-by: Matti Vaittinen <Mazziesaccount@gmail.com>
--
diff -uNr Linux-3.2-rc1.orig/net/ipv6/ip6_fib.c Linux-3.2-rc1.new/net/ipv6/ip6_fib.c
--- Linux-3.2-rc1.orig/net/ipv6/ip6_fib.c	2011-11-10 08:44:18.000000000 +0200
+++ Linux-3.2-rc1.new/net/ipv6/ip6_fib.c	2011-11-10 08:46:00.000000000 +0200
@@ -425,7 +425,8 @@
 
 static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
 				     int addrlen, int plen,
-				     int offset)
+				     int offset, int allow_create,
+				     int replace_required)
 {
 	struct fib6_node *fn, *in, *ln;
 	struct fib6_node *pn = NULL;
@@ -447,8 +448,12 @@
 		 *	Prefix match
 		 */
 		if (plen < fn->fn_bit ||
-		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
+		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
+			if (!allow_create)
+				printk(KERN_WARNING
+				    "NLM_F_CREATE should be specified when creating new rt\n");
 			goto insert_above;
+		}
 
 		/*
 		 *	Exact match ?
@@ -477,10 +482,26 @@
 		fn = dir ? fn->right: fn->left;
 	} while (fn);
 
+	if (replace_required && !allow_create) {
+		/* We should not create new node because
+		 * NLM_F_REPLACE was specified without NLM_F_CREATE
+		 * I assume it is safe to require NLM_F_CREATE when
+		 * REPLACE flag is used! Later we may want to remove the
+		 * check for replace_required, because according
+		 * to netlink specification, NLM_F_CREATE
+		 * MUST be specified if new route is created.
+		 * That would keep IPv6 consistent with IPv4
+		 */
+		printk(KERN_WARNING
+		    "NLM_F_CREATE should be specified when creating new rt - ignoring request\n");
+		return ERR_PTR(-ENOENT);
+	}
 	/*
 	 *	We walked to the bottom of tree.
 	 *	Create new leaf node without children.
 	 */
+	if (!allow_create)
+		printk(KERN_WARNING "NLM_F_CREATE should be specified when creating new rt\n");
 
 	ln = node_alloc();
 
@@ -614,6 +635,12 @@
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
+	int replace = (NULL != info &&
+	    NULL != info->nlh &&
+	    (info->nlh->nlmsg_flags&NLM_F_REPLACE));
+	int add = ((NULL == info || NULL == info->nlh) ||
+	    (info->nlh->nlmsg_flags&NLM_F_CREATE));
+	int found = 0;
 
 	ins = &fn->leaf;
 
@@ -626,6 +653,13 @@
 			/*
 			 *	Same priority level
 			 */
+			if (NULL != info->nlh &&
+			    (info->nlh->nlmsg_flags&NLM_F_EXCL))
+				return -EEXIST;
+			if (replace) {
+				found++;
+				break;
+			}
 
 			if (iter->rt6i_dev == rt->rt6i_dev &&
 			    iter->rt6i_idev == rt->rt6i_idev &&
@@ -655,17 +689,40 @@
 	/*
 	 *	insert node
 	 */
+	if (!replace) {
+		if (!add)
+			printk(KERN_WARNING "NLM_F_CREATE should be specified when creating new rt\n");
+
+add:
+		rt->dst.rt6_next = iter;
+		*ins = rt;
+		rt->rt6i_node = fn;
+		atomic_inc(&rt->rt6i_ref);
+		inet6_rt_notify(RTM_NEWROUTE, rt, info);
+		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
+
+		if ((fn->fn_flags & RTN_RTINFO) == 0) {
+			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+			fn->fn_flags |= RTN_RTINFO;
+		}
 
-	rt->dst.rt6_next = iter;
-	*ins = rt;
-	rt->rt6i_node = fn;
-	atomic_inc(&rt->rt6i_ref);
-	inet6_rt_notify(RTM_NEWROUTE, rt, info);
-	info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
-
-	if ((fn->fn_flags & RTN_RTINFO) == 0) {
-		info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
-		fn->fn_flags |= RTN_RTINFO;
+	} else {
+		if (!found) {
+			if (add)
+				goto add;
+			printk(KERN_WARNING "add rtinfo to node - NLM_F_REPLACE specified, but no existing node found! bailing out\n");
+			return -ENOENT;
+		}
+		*ins = rt;
+		rt->rt6i_node = fn;
+		rt->dst.rt6_next = iter->dst.rt6_next;
+		atomic_inc(&rt->rt6i_ref);
+		inet6_rt_notify(RTM_NEWROUTE, rt, info);
+		rt6_release(iter);
+		if ((fn->fn_flags & RTN_RTINFO) == 0) {
+			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+			fn->fn_flags |= RTN_RTINFO;
+		}
 	}
 
 	return 0;
@@ -696,9 +753,25 @@
 {
 	struct fib6_node *fn, *pn = NULL;
 	int err = -ENOMEM;
+	int allow_create = 1;
+	int replace_required = 0;
+	if (NULL != info && NULL != info->nlh) {
+		if (!(info->nlh->nlmsg_flags&NLM_F_CREATE))
+			allow_create = 0;
+		if ((info->nlh->nlmsg_flags&NLM_F_REPLACE))
+			replace_required = 1;
+	}
+	if (!allow_create && !replace_required)
+		printk(KERN_WARNING "RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
 	fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
-			rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));
+		    rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst),
+		    allow_create, replace_required);
+
+	if (IS_ERR(fn)) {
+		err = PTR_ERR(fn);
+		fn = NULL;
+	}
 
 	if (fn == NULL)
 		goto out;
@@ -736,7 +809,8 @@
 
 			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
 					sizeof(struct in6_addr), rt->rt6i_src.plen,
-					offsetof(struct rt6_info, rt6i_src));
+					offsetof(struct rt6_info, rt6i_src),
+					allow_create, replace_required);
 
 			if (sn == NULL) {
 				/* If it is failed, discard just allocated
@@ -753,8 +827,13 @@
 		} else {
 			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
 					sizeof(struct in6_addr), rt->rt6i_src.plen,
-					offsetof(struct rt6_info, rt6i_src));
+					offsetof(struct rt6_info, rt6i_src),
+					allow_create, replace_required);
 
+			if (IS_ERR(sn)) {
+				err = PTR_ERR(sn);
+				sn = NULL;
+			}
 			if (sn == NULL)
 				goto st_failure;
 		}





-- 
Matti Vaittinen
+358 504863070
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Told a UDP joke the other night...
...but I'm not sure everyone got it...
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

^ permalink raw reply

* [PATCH RESUBMITTED net-next 1/2] IPv6 - support for NLM_F_* flags at IPv6 routing requests
From: Matti Vaittinen @ 2011-11-10 10:06 UTC (permalink / raw)
  To: ext David Miller; +Cc: netdev

The support for NLM_F_* flags at IPv6 routing requests.

If NLM_F_CREATE flag is not defined for RTM_NEWROUTE request,
warning is printed, but no error is returned. Instead new route is
added.

Exception is when NLM_F_REPLACE flag is given without NLM_F_CREATE, and
no matching route is found. In this case it should be safe to assume
that the request issuer is familiar with NLM_F_* flags, and does really
not want route to be created.

Specifying NLM_F_REPLACE flag will now make the kernel to search for
matching route, and replace it with new one. If no route is found and
NLM_F_CREATE is specified as well, then new route is created.

Also, specifying NLM_F_EXCL will yield returning of error if matching
route
is found.

Patch created against linux-3.2-rc1

Signed-off-by: Matti Vaittinen <Mazziesaccount@gmail.com>
--
diff -uNr Linux-3.2-rc1.orig/net/ipv6/route.c Linux-3.2-rc1.new/net/ipv6/route.c
--- Linux-3.2-rc1.orig/net/ipv6/route.c	2011-11-10 08:44:18.000000000 +0200
+++ Linux-3.2-rc1.new/net/ipv6/route.c	2011-11-10 08:46:15.000000000 +0200
@@ -1230,9 +1230,18 @@
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;

-	table = fib6_new_table(net, cfg->fc_table);
+	err = -ENOBUFS;
+	if (NULL != cfg->fc_nlinfo.nlh &&
+	    !(cfg->fc_nlinfo.nlh->nlmsg_flags&NLM_F_CREATE)) {
+		table = fib6_get_table(net, cfg->fc_table);
+		if (table == NULL) {
+			printk(KERN_WARNING "NLM_F_CREATE should be specified when creating new rt\n");
+			table = fib6_new_table(net, cfg->fc_table);
+		}
+	} else {
+		table = fib6_new_table(net, cfg->fc_table);
+	}
 	if (table == NULL) {
-		err = -ENOBUFS;
 		goto out;
 	}

-- 
Matti Vaittinen
+358 504863070
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Told a UDP joke the other night...
...but I'm not sure everyone got it...
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

^ permalink raw reply

* RE: [PATCH] Phonet: set the pipe handle using setsockopt
From: Hemant-vilas RAMDASI @ 2011-11-10  9:44 UTC (permalink / raw)
  To: Rémi Denis-Courmont
  Cc: netdev@vger.kernel.org, Dinesh Kumar SHARMA (STE)
In-Reply-To: <6033284.KrP4yu8ukf@hector>

Remi,

> -----Original Message-----
> From: Rémi Denis-Courmont [mailto:remi.denis-courmont@nokia.com]
> Sent: Wednesday, November 09, 2011 8:31 PM
> To: Hemant-vilas RAMDASI
> Cc: netdev@vger.kernel.org; Dinesh Kumar SHARMA (STE)
> Subject: Re: [PATCH] Phonet: set the pipe handle using setsockopt
> 
> Inline...
> 
> Le Mercredi 9 Novembre 2011 16:50:53 ext Hemant Vilas RAMDASI a écrit :

[...]

> 
> It looks like there is no use-case for init-enabled pipes then, right?
> How
> about dropping this extra bit of code and assuming connect()ed pipes
> are
> always init-disabled?

VT pipes can still be init-enabled.

Regards,
Hemant 

^ permalink raw reply

* Re: [PATCH] r8169: more driver shutdown WoL regression.
From: Stefan Becker @ 2011-11-10  9:40 UTC (permalink / raw)
  To: hayeswang; +Cc: Francois Romieu, netdev, David Miller
In-Reply-To: <DCB55CA56A5546B4B7350912D30EC679@realtek.com.tw>

On Thu, Nov 10, 2011 at 6:11 AM, hayeswang <hayeswang@realtek.com> wrote:
>
> I find that the magic packet which I send is the broadcast packet, and the one
> which you send is the unicast packet. That is, you could wake up the system by
> using broadcast magic packet for the previous chips without the patch.

Confirmed. If I use the "-b" option in ether-wake WoL works for my
system with plain 3.1 kernel, i.e. without the patch.

[    8.357467] r8169 0000:02:00.0: eth0: RTL8168c/8111c at 0xf37be000,
00:1f:d0:5a:22:77, XID 1c4000c0 IRQ 43

> However,
> if you prefer to unicast magic packet, this patch is necessary.

As "-b" is not the default for ether-wake it will come as a surprise
for the user that WoL is no longer working. Maybe we should reword the
patch subject to:

  r8169: driver shutdown unicast WoL regression

Regards,

          Stefan

^ permalink raw reply

* Re: [REPOST patch net-next V6] net: introduce ethernet teaming device
From: Eric Dumazet @ 2011-11-10  7:02 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
	ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
	jzupka, ivecera
In-Reply-To: <1320907983.16265.1.camel@edumazet-laptop>

Le jeudi 10 novembre 2011 à 07:53 +0100, Eric Dumazet a écrit :
> Le jeudi 10 novembre 2011 à 07:48 +0100, Jiri Pirko a écrit :
> 
> > dev_kfree_skb is called from module transmit functions to handle free of
> > skb when needed. But I see now that in case when
> > (likely(!list_empty(&team->port_list) && team->mode_ops.transmit)) == false
> > this would leak. I'm going to rewrite this bit somehow.
> 
> This is the leak I mentioned, I thought it was obvious :)
> 
> If you return NETDEV_TX_OK, then the skb must have been consumed.

I meant : consumed or queued in a device transmit queue...

^ permalink raw reply

* Re: [REPOST patch net-next V6] net: introduce ethernet teaming device
From: Eric Dumazet @ 2011-11-10  6:53 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
	ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
	jzupka, ivecera
In-Reply-To: <20111110064807.GB2058@minipsycho>

Le jeudi 10 novembre 2011 à 07:48 +0100, Jiri Pirko a écrit :

> dev_kfree_skb is called from module transmit functions to handle free of
> skb when needed. But I see now that in case when
> (likely(!list_empty(&team->port_list) && team->mode_ops.transmit)) == false
> this would leak. I'm going to rewrite this bit somehow.

This is the leak I mentioned, I thought it was obvious :)

If you return NETDEV_TX_OK, then the skb must have been consumed.

^ permalink raw reply

* Re: [REPOST patch net-next V6] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-11-10  6:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
	ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
	jzupka, ivecera
In-Reply-To: <1320882789.5825.10.camel@edumazet-laptop>

Thu, Nov 10, 2011 at 12:53:09AM CET, eric.dumazet@gmail.com wrote:
>Le jeudi 10 novembre 2011 à 00:12 +0100, Eric Dumazet a écrit :
>> Le mercredi 09 novembre 2011 à 23:13 +0100, Jiri Pirko a écrit :
>> > This patch introduces new network device called team. It supposes to be
>> > very fast, simple, userspace-driven alternative to existing bonding
>> > driver.
>> 
>> ...
>> 
>> > +/*
>> > + * note: already called with rcu_read_lock
>> > + */
>> > +static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
>> > +{
>> > +	struct team *team = netdev_priv(dev);
>> > +	bool tx_success = false;
>> > +	unsigned int len = skb->len;
>> > +
>> > +	/*
>> > +	 * Ensure transmit function is called only in case there is at least
>> > +	 * one port present.
>> > +	 */
>> > +	if (likely(!list_empty(&team->port_list) && team->mode_ops.transmit))
>> > +		tx_success = team->mode_ops.transmit(team, skb);
>
>Not clear why its so complex here.
>
>When you manipulate team->port_list and make it empty, why dont you set
>team->mode_ops.transmit to a helper function, freeing the skb and
>returning false.
>
>Also, instead of setting .transmit to NULL, you also could set it to
>same helper function.
>
>This way you could just do in fast path :
>
>	tx_succcess = team->mode_ops.transmit(team, skb);
>
>Avoiding two tests

This approach I was thinking of as well. I was not sure that it would be
so nice to change this function from port_add/port_del but I'm going to
look at this.

>
>> > +	if (tx_success) {
>> > +		struct team_pcpu_stats *pcpu_stats;
>> > +
>> > +		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
>> > +		u64_stats_update_begin(&pcpu_stats->syncp);
>> > +		pcpu_stats->tx_packets++;
>> > +		pcpu_stats->tx_bytes += len;
>> > +		u64_stats_update_end(&pcpu_stats->syncp);
>> > +	} else {
>> > +		this_cpu_inc(team->pcpu_stats->tx_dropped);
>> > +	}
>> > +
>> > +	return NETDEV_TX_OK;
>> > +}
>> > +
>> 
>
>
>

^ permalink raw reply

* Re: [REPOST patch net-next V6] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-11-10  6:48 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, davem, bhutchings, shemminger, fubar, andy, tgraf,
	ebiederm, mirqus, kaber, greearb, jesse, fbl, benjamin.poirier,
	jzupka, ivecera
In-Reply-To: <1320880355.5825.1.camel@edumazet-laptop>

Thu, Nov 10, 2011 at 12:12:35AM CET, eric.dumazet@gmail.com wrote:
>Le mercredi 09 novembre 2011 à 23:13 +0100, Jiri Pirko a écrit :
>> This patch introduces new network device called team. It supposes to be
>> very fast, simple, userspace-driven alternative to existing bonding
>> driver.
>
>...
>
>> +/*
>> + * note: already called with rcu_read_lock
>> + */
>> +static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +	struct team *team = netdev_priv(dev);
>> +	bool tx_success = false;
>> +	unsigned int len = skb->len;
>> +
>> +	/*
>> +	 * Ensure transmit function is called only in case there is at least
>> +	 * one port present.
>> +	 */
>> +	if (likely(!list_empty(&team->port_list) && team->mode_ops.transmit))
>> +		tx_success = team->mode_ops.transmit(team, skb);
>> +	if (tx_success) {
>> +		struct team_pcpu_stats *pcpu_stats;
>> +
>> +		pcpu_stats = this_cpu_ptr(team->pcpu_stats);
>> +		u64_stats_update_begin(&pcpu_stats->syncp);
>> +		pcpu_stats->tx_packets++;
>> +		pcpu_stats->tx_bytes += len;
>> +		u64_stats_update_end(&pcpu_stats->syncp);
>> +	} else {
>> +		this_cpu_inc(team->pcpu_stats->tx_dropped);
>> +	}
>> +
>> +	return NETDEV_TX_OK;
>> +}
>> +
>
>If skb is not transmitted, its leaked.

dev_kfree_skb is called from module transmit functions to handle free of
skb when needed. But I see now that in case when
(likely(!list_empty(&team->port_list) && team->mode_ops.transmit)) == false
this would leak. I'm going to rewrite this bit somehow.
>
>
>

^ permalink raw reply

* Re: [PATCH] libteam: fix function names to include 'bond'
From: Jiri Pirko @ 2011-11-10  6:41 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, mirqus, kaber, greearb, jesse, benjamin.poirier,
	jzupka
In-Reply-To: <20111109212308.4f89739e@asterix.rh>

Thu, Nov 10, 2011 at 12:23:08AM CET, fbl@redhat.com wrote:
>On Wed, 9 Nov 2011 23:04:25 +0100
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> 
>> Hi Flavio.
>> 
>> Thomas included these 2 functions in latest libnl upstream. Bond
>> versions wouldn't work because of "bond" type check.
>> 
>
>Yeah, my first option would be using git repo, but the one
>mentioned in the page seems to be off and I am not finding
>another one.  Do you have the repo url handy?

git://git.infradead.org/users/tgr/libnl.git

>
>Anyway, I've tried here with my patch applied and it didn't work.
>Every packet transmitted increased TX dropped in ifconfig team0
>output. The tcpdump attached to the slave doesn't show anything,
>the packets appear only when attached to team0.
>
>When I ping the team0 host from another host, both slave and
>team0 RX dropped increases at the same rate. tcpdump attached
>to the slave shows the packet coming.
>
>Well, maybe this is an effect of having used the wrong libnl
>version, though the team_monitor seems to be working just fine.
>(patch applied on top of net-next of today, commit e56c57d,
> followed the exact same steps as in HOWTO.BASICS)

Oh. I changed activebackup mode behaviour in V4 or V5 of kernel patch.
Now it's needed to adjust mac address of ports by hand. I'll refresh
HOWTO.BASICS shortly. 

>
>I am planning to upgrade the system to the new fedora and
>try again after that.  If I have time, I will follow up with
>fresh results.
>
>thanks,
>fbl
>
>> Jirka
>> 
>> Wed, Nov 09, 2011 at 07:20:46PM CET, fbl@redhat.com wrote:
>> >Signed-off-by: Flavio Leitner <fbl@redhat.com>
>> >---
>> >
>> > I found those while trying to test V6 patch using latest
>> > libteam (commit 5e9790816606a6dd4e7f6f32c0bb0c45e5d13b31)
>> > and libnl-3.2.2 (last stable).
>> > thanks,
>> > fbl
>> >
>> > lib/libteam.c |    4 ++--
>> > 1 files changed, 2 insertions(+), 2 deletions(-)
>> >
>> >diff --git a/lib/libteam.c b/lib/libteam.c
>> >index feb13b6..e7ae6b0 100644
>> >--- a/lib/libteam.c
>> >+++ b/lib/libteam.c
>> >@@ -1331,7 +1331,7 @@ int team_port_add(struct team_handle *th,
>> >uint32_t port_ifindex)
>> > {
>> > 	int err;
>> > 
>> >-	err = rtnl_link_enslave_ifindex(th->nl_cli.sock,
>> >th->ifindex,
>> >+	err = rtnl_link_bond_enslave_ifindex(th->nl_cli.sock,
>> >th->ifindex,
>> > 					port_ifindex);
>> > 	return -nl2syserr(err);
>> > }
>> >@@ -1350,6 +1350,6 @@ int team_port_remove(struct team_handle *th,
>> >uint32_t port_ifindex)
>> > {
>> > 	int err;
>> > 
>> >-	err = rtnl_link_release_ifindex(th->nl_cli.sock,
>> >port_ifindex);
>> >+	err = rtnl_link_bond_release_ifindex(th->nl_cli.sock,
>> >port_ifindex);
>> > 	return -nl2syserr(err);
>> > }
>> >-- 
>> >1.7.6
>> >
>

^ permalink raw reply

* Re: Bug? GRE tunnel periodically won't transmit some packets
From: Chris Siebenmann @ 2011-11-10  5:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Chris Siebenmann, netdev
In-Reply-To: <1320761153.3444.4.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

| So it appears the drop is in gre xmit because frame is bigger than
| mtu...
| 
| Maybe you receive some strange ICMP (ICMP_FRAG_NEEDED) from a buggy
| host ?
| 
| You could catch it with "tcpdump -s 1000 -i any icmp" maybe...

 The problem went away for several days and then came roaring back
just now. I couldn't see any outside ICMP messages like this while
the problem was happening. I did see ICMP messages, but they were
locally generated:

	IP 128.100.3.52 > 128.100.3.52: ICMP 128.100.3.51 unreachable - need to frag (mtu 478), length 556

(I verified that these were for the things that were stalling with
'tcpdump -vv -ee'.)

At the time that this was happening, I could see a lot of 'ip route show
table cache' entries like this:

	128.100.3.58 from 66.96.18.208 dev ppp0  src 66.96.18.208
	    cache  expires 286sec ipid 0xdeda mtu 552 rtt 44ms rttvar 30ms ssthresh 7 cwnd 9 iif lo

There were a bunch of other 'mtu 552' routes. Flushing the routing cache
(with 'ip route flush cache; ip route show table cache' to verify that
it had flushed) did not change the situation; the problem continued and
the 'mtu 552' routes came back as fast as I could check (it appeared to
be the moment that the routing cache was repopulated, there they were).

 In general the route cache appears to have strangely low MTUs listed
for the remote end of the tunnel (at least after the problem's
happened), eg:

	128.100.3.58 from 66.96.18.208 dev ppp0 
	    cache  expires 203sec ipid 0xdeda mtu 934 rtt 44ms rttvar 30ms ssthresh 7 cwnd 9
	128.100.3.58 from 66.96.18.208 dev ppp0  src 66.96.18.208 
	    cache  expires 203sec ipid 0xdeda mtu 934 rtt 44ms rttvar 30ms ssthresh 7 cwnd 9 iif lo

I would normally expect this to be 1492, the PPP link MTU.

 This does not seem to happen on the Fedora 14 kernel (the one where
the problem doesn't happen). There the route is listed as:

	128.100.3.58 from 66.96.18.208 dev ppp0 
	    cache  mtu 1492 advmss 1452 hoplimit 64
	128.100.3.58 from 66.96.18.208 dev ppp0 
	    cache  mtu 1492 advmss 1452 hoplimit 64

(to quote what I *think* are the relevant entries.)

 Since things may be pointing towards routing cache oddities, I should
mention that I have a somewhat peculiar policy based routing setup
involving this GRE tunnel. While it's been working fine for literally
years, it's possible that recent changes in the area changed something
so that peculiar things now happen; if you think it might be relevant, I
can provide a dump of the routing rules and explain the setup.

(Part of why this occurs to me now is that I know it's possible to have
a connection to 128.100.3.58 (the remote end of the tunnel) that runs
through the tunnel itself, and I've seen such routes appear in the 'ip
route show table cache' output.)

		- cks

^ permalink raw reply

* [PATCH 1/1] net: fec: convert to clk_prepare/clk_unprepare
From: Richard Zhao @ 2011-11-10  5:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, linux-arm-kernel, kernel, shawn.guo, eric.miao,
	Richard Zhao

Signed-off-by: Richard Zhao <richard.zhao@linaro.org>
---
 drivers/net/ethernet/freescale/fec.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c
index 1124ce0..92e3585 100644
--- a/drivers/net/ethernet/freescale/fec.c
+++ b/drivers/net/ethernet/freescale/fec.c
@@ -1588,6 +1588,7 @@ fec_probe(struct platform_device *pdev)
 		ret = PTR_ERR(fep->clk);
 		goto failed_clk;
 	}
+	clk_prepare(fep->clk);
 	clk_enable(fep->clk);
 
 	ret = fec_enet_init(ndev);
@@ -1612,6 +1613,7 @@ failed_register:
 failed_mii_init:
 failed_init:
 	clk_disable(fep->clk);
+	clk_unprepare(fep->clk);
 	clk_put(fep->clk);
 failed_clk:
 	for (i = 0; i < FEC_IRQ_NUM; i++) {
@@ -1639,6 +1641,7 @@ fec_drv_remove(struct platform_device *pdev)
 	fec_stop(ndev);
 	fec_enet_mii_remove(fep);
 	clk_disable(fep->clk);
+	clk_unprepare(fep->clk);
 	clk_put(fep->clk);
 	iounmap(fep->hwp);
 	unregister_netdev(ndev);
@@ -1665,6 +1668,7 @@ fec_suspend(struct device *dev)
 		netif_device_detach(ndev);
 	}
 	clk_disable(fep->clk);
+	clk_unprepare(fep->clk);
 
 	return 0;
 }
@@ -1675,6 +1679,7 @@ fec_resume(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct fec_enet_private *fep = netdev_priv(ndev);
 
+	clk_prepare(fep->clk);
 	clk_enable(fep->clk);
 	if (netif_running(ndev)) {
 		fec_restart(ndev, fep->full_duplex);
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH net-next] bridge: add NTF_USE support
From: Stephen Hemminger @ 2011-11-10  4:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

More changes to the recent code to support control of forwarding
database via netlink.
   * Support NTF_USE like neighbour table
   * Validate state bits from application
   * Only send notifications (and change bits) if new entry is
     different.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/bridge/br_fdb.c	2011-11-02 11:14:06.009407558 -0700
+++ b/net/bridge/br_fdb.c	2011-11-07 08:30:00.882705168 -0800
@@ -556,7 +556,7 @@ skip:
 	return skb->len;
 }
 
-/* Create new static fdb entry */
+/* Update (create or replace) forwarding database entry */
 static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
 			 __u16 state, __u16 flags)
 {
@@ -575,16 +575,21 @@ static int fdb_add_entry(struct net_brid
 	} else {
 		if (flags & NLM_F_EXCL)
 			return -EEXIST;
+	}
+
+	if (fdb_to_nud(fdb) != state) {
+		if (state & NUD_PERMANENT)
+			fdb->is_local = fdb->is_static = 1;
+		else if (state & NUD_NOARP) {
+			fdb->is_local = 0;
+			fdb->is_static = 1;
+		} else
+			fdb->is_local = fdb->is_static = 0;
 
-		if (flags & NLM_F_REPLACE)
-			fdb->updated = fdb->used = jiffies;
-		fdb->is_local = fdb->is_static = 0;
+		fdb->updated = fdb->used = jiffies;
+		fdb_notify(fdb, RTM_NEWNEIGH);
 	}
 
-	if (state & NUD_PERMANENT)
-		fdb->is_local = fdb->is_static = 1;
-	else if (state & NUD_NOARP)
-		fdb->is_static = 1;
 	return 0;
 }
 
@@ -627,6 +632,11 @@ int br_fdb_add(struct sk_buff *skb, stru
 		return -EINVAL;
 	}
 
+	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
+		pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+		return -EINVAL;
+	}
+
 	p = br_port_get_rtnl(dev);
 	if (p == NULL) {
 		pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
@@ -634,9 +644,15 @@ int br_fdb_add(struct sk_buff *skb, stru
 		return -EINVAL;
 	}
 
-	spin_lock_bh(&p->br->hash_lock);
-	err = fdb_add_entry(p, addr, ndm->ndm_state, nlh->nlmsg_flags);
-	spin_unlock_bh(&p->br->hash_lock);
+	if (ndm->ndm_flags & NTF_USE) {
+		rcu_read_lock();
+		br_fdb_update(p->br, p, addr);
+		rcu_read_unlock();
+	} else {
+		spin_lock_bh(&p->br->hash_lock);
+		err = fdb_add_entry(p, addr, ndm->ndm_state, nlh->nlmsg_flags);
+		spin_unlock_bh(&p->br->hash_lock);
+	}
 
 	return err;
 }

^ permalink raw reply

* RE: [PATCH] r8169: more driver shutdown WoL regression.
From: hayeswang @ 2011-11-10  4:11 UTC (permalink / raw)
  To: 'Francois Romieu'
  Cc: netdev, 'Stefan Becker', 'David Miller'
In-Reply-To: <20111109222556.GA8226@electric-eye.fr.zoreil.com>

Francois Romieu [mailto:romieu@fr.zoreil.com] 
[...]
> 
> I tested in the following order
> RTL_GIGA_MAC_VER_26 (ko)
> RTL_GIGA_MAC_VER_25 (ko)
> RTL_GIGA_MAC_VER_12 (ko)
> RTL_GIGA_MAC_VER_34 (ok)
> RTL_GIGA_MAC_VER_33 (ok)
> RTL_GIGA_MAC_VER_35 (ok)
> RTL_GIGA_MAC_VER_09 (ko)
> RTL_GIGA_MAC_VER_09 (ko)
> RTL_GIGA_MAC_VER_30 (ok)
> RTL_GIGA_MAC_VER_30 (ok)
> RTL_GIGA_MAC_VER_06 (ok)
> 
> Each test:
> - proceeded from a fresh boot
> - dmesg | grep XID; ip link show
>   -> check MAC address (devices were set through udev long beforehand)
> - ip link set dev xyz up
> - ethtool xyz
>   -> check link status up
> - ethtool peer
>   -> check link status up, 1000 Mbps or 100 Mbps
> - ethtool -s xyz wol g; ethtool xyz
> - telinit 0
> - ethtool peer
>   -> check link status up, 10 Mbps
> - ether-wake -i peer mac_addr (triple check the mac address here)
> 
> (stop, power off, unplug, plug, power on, repeat)
> 

I find that the magic packet which I send is the broadcast packet, and the one
which you send is the unicast packet. That is, you could wake up the system by
using broadcast magic packet for the previous chips without the patch. However,
if you prefer to unicast magic packet, this patch is necessary. Besides, no
matter broadcast or unicast magic packet, the patch is necessary for 8105,
8168e, and later chips. Further, it may be dangerous to enable both rx_enable
(ChipCmd bit 3) and RxConfig for 8168b for WOL, because the hw would try to
write the rx buffer.

Best Regards,
Hayes

^ permalink raw reply

* Re: [PATCH 1/2] include/net/cfg80211.h: Fix issue of make htmldocs
From: Randy Dunlap @ 2011-11-10  2:01 UTC (permalink / raw)
  To: Marcos Paulo de Souza; +Cc: johannes, davem, netdev
In-Reply-To: <1320871572-3430-1-git-send-email-marcos.mage@gmail.com>

On 11/09/2011 12:46 PM, Marcos Paulo de Souza wrote:
> Make documentation of member sta_modify_mask of struct
> station_parameters and sta_flags of struct station_info.
> 
> Signed-off-by: Marcos Paulo de Souza <marcos.mage@gmail.com>

Thanks, but Johannes has already posted a patch for this
(to linux-wireless).

> ---
>  include/net/cfg80211.h |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
> index 92cf1c2..bbf6bf7 100644
> --- a/include/net/cfg80211.h
> +++ b/include/net/cfg80211.h
> @@ -447,6 +447,7 @@ enum station_parameters_apply_mask {
>   *	(bitmask of BIT(NL80211_STA_FLAG_...))
>   * @sta_flags_set: station flags values
>   *	(bitmask of BIT(NL80211_STA_FLAG_...))
> + * @sta_modify_mask: apply new uAPSD parameters
>   * @listen_interval: listen interval or -1 for no change
>   * @aid: AID or zero for no change
>   * @plink_action: plink action to take
> @@ -606,6 +607,7 @@ struct sta_bss_parameters {
>   * @tx_failed: number of failed transmissions (retries exceeded, no ACK)
>   * @rx_dropped_misc:  Dropped for un-specified reason.
>   * @bss_param: current BSS parameters
> + * @sta_flags: Station flags mask/set
>   * @generation: generation number for nl80211 dumps.
>   *	This number should increase every time the list of stations
>   *	changes, i.e. when a station is added or removed, so that


-- 
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* Re: [PATCH] [RFC] net-netlink: fix tos/tclass for dual-stack ipv6 sockets
From: Maciej Żenczykowski @ 2011-11-10  1:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111109.153530.2283817573824031015.davem@davemloft.net>

2011/11/9 David Miller <davem@davemloft.net>:
> From: Maciej Żenczykowski <zenczykowski@gmail.com>
> Date: Mon,  7 Nov 2011 17:46:40 -0800
>
>> From: Maciej Żenczykowski <maze@google.com>
>>
>> Something along the following lines would be needed.
>>
>> Signed-off-by: Maciej Żenczykowski <maze@google.com>
>
> This is terrible, see my other email, inet->tos doesn't matter even
> for mapped ipv6 sockets.

Oh, yes, I definitely agree that this is terrible.
Can we just get rid of tclass and use inet->tos for ipv6 as well?

^ permalink raw reply

* Re: [PATCH] net-netlink: Add a new attribute to expose TCLASS values via netlink
From: David Miller @ 2011-11-10  1:53 UTC (permalink / raw)
  To: zenczykowski; +Cc: netdev, muralira, shemminger, eric.dumazet
In-Reply-To: <CAHo-OozNP2qi_RMjO7rS1=fyaU8w_3aZZLptqnYAkCAwhQDj_w@mail.gmail.com>

From: Maciej Żenczykowski <zenczykowski@gmail.com>
Date: Wed, 9 Nov 2011 17:50:27 -0800

> Yes, but an ipv6 socket is permitted to setsockopt SOL_IP in addition
> to SOL_IPV6.
> As such, one can simply setsockopt(ipv6_socket, SOL_IP, IP_TOS, value).

That's my whole point, this operation isn't currently possible, the
socket ops for ipv4 setsockopt() aren't hooked up to ipv6 mapped
sockets in the kernel right now.

^ permalink raw reply

* Re: [PATCH] net-netlink: Add a new attribute to expose TCLASS values via netlink
From: Maciej Żenczykowski @ 2011-11-10  1:50 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, muralira, shemminger, eric.dumazet
In-Reply-To: <20111109.153451.441383099995731023.davem@davemloft.net>

> I can't see how an ipv6 mapped socket can even set the inet->tos value.
>
> As far as I can see, only net/ipv4/ip_sockglue.c:ip_setsockopt() provides
> the interface to change inet->tos.

Yes, but an ipv6 socket is permitted to setsockopt SOL_IP in addition
to SOL_IPV6.
As such, one can simply setsockopt(ipv6_socket, SOL_IP, IP_TOS, value).
This is just as well, because inet->tos is what determines the ipv4 tos value
of any ipv4 packets generated by an ipv6 socket, ie. when connect(ed/ing)
to an ipv4 mapped ipv6 address.

> And ipv6 sockets, of any type, are provided no such vector by which to
> get at those interfaces.

> So inet->tos is always left at it's default value for ipv6 mapped sockets,
> and therefore I see no reason to report TCLASS vs. TOS separately.

> In fact, what I would suggest is to do something about the lack of
> ability to set inet->tos, and the best way to do that seems to be to
> simply propagate the npinfo->tclass setting into inet->tos.  Performaing
> any munging if necessary.

> I'm not applying this patch.

If people are in agreement that inet->tos vs ipv6->tclass being
separately settable is not desirable,
then I'm willing to go through and remove the tclass field (and send
out a patch to do that).

However, this is _QUITE DEFINITELY_ a user visible change in API and
application visible behaviour.

It is however an annoying corner case, so perhaps would be best fixed
by getting rid of it?
OTOH, I'm not sure if people aren't perhaps relying on the ability to
separately set ipv4 dscp vs ipv6 tclass due to some internal network
constraints...

Personally, I'd prefer the simplification of not having to deal with
IP_TOS vs IPV6_TCLASS discrepancies on ipv6 dual-stack sockets.

- Maciej

^ permalink raw reply

* Re: [GIT PULL nf-next] IPVS
From: Simon Horman @ 2011-11-10  1:22 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Krzysztof Wilczynski
In-Reply-To: <20111109143641.GA24174@1984>

On Wed, Nov 09, 2011 at 03:36:41PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Nov 09, 2011 at 09:58:07AM +0900, Simon Horman wrote:
> > Hi Pablo,
> > 
> > On Mon, Nov 07, 2011 at 09:29:56AM +0100, Pablo Neira Ayuso wrote:
> > > Hi Simon,
> > > 
> > > On Mon, Nov 07, 2011 at 12:07:01PM +0900, Simon Horman wrote:
> > > > Hi Pablo,
> > > > 
> > > > I am a little confused. The nf-next branch seems to have disappeared.
> > > > 
> > > > Could you consider pulling git://github.com/horms/ipvs-next.git master
> > > > to get the following changes that were in your nf-next branch.
> > > 
> > > I was late to get it into net-next. Since net-next became net after
> > > the 3.1 release, my moved those changes to net to get it into 3.2
> > > once Linus announced that the merge window was opened again.
> > > 
> > > > Or would
> > > > you like me to rebase the ipvs patches (9 or the 11 changes below) on
> > > > top of git://1984.lsi.us.es/net-next/.git master ?
> > > 
> > > They are already in net davem's tree, they will be included in the
> > > upcoming 3.2 release.
> > > 
> > > http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fdavem%2Fnet.git&a=search&h=HEAD&st=commit&s=Neira
> > 
> > Thanks, and sorry for missing that when I checked yesterday.
> > 
> > Could you suggest which tree and branch I should base the master branch of my
> > ipvs and ipvs-next trees on? Their purposes are to provide a reference for
> > people wishing to fix or enhance IPVS and a mechanism to send pull requests to
> > you. As of now I am using the master branch of your net tree for both.
> 
> The 1984.lsi.us.es trees are fine.
> 
> There are no branch yet because I have no patches queued for upstream
> so far. You can use master if you don't see any nf branch, OK?

Ok :)

^ permalink raw reply

* Re: dst->obsolete has become pointless
From: David Miller @ 2011-11-10  0:47 UTC (permalink / raw)
  To: eric.dumazet; +Cc: joe, steffen.klassert, netdev, timo.teras
In-Reply-To: <1320885223.5825.37.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 10 Nov 2011 01:33:43 +0100

> Now some information was migrated to inetpeer, I wonder if we could not
> allocate a private dst for a socket ?

Yes, and in fact we are well equipted to support this kind of notion
exactly because we can detect route invalidity with simply serial
number comparisons.

All the necessary mechanics are there.  Simply allocate the cloned
route with DST_NOCOUNT, do not put it into any tables, and copy over
the necessary information.

Even things like __sk_dst_check() will just work transparently.

There might be some gotchas wrt. stacked IPSEC routes, but I don't
anticipate any real serious problems.

^ permalink raw reply

* Re: dst->obsolete has become pointless
From: Eric Dumazet @ 2011-11-10  0:33 UTC (permalink / raw)
  To: David Miller; +Cc: joe, steffen.klassert, netdev, timo.teras
In-Reply-To: <20111109.192427.661860445333469011.davem@davemloft.net>

Le mercredi 09 novembre 2011 à 19:24 -0500, David Miller a écrit :

> We can only use it in this scenerio if it universally
> uses the same type.  This is due to the by-hand layout
> of the rest of the structure such that __refcnt ends
> up on a different cache line.
> --

tbench (or any tcp on loopback workload) is currently hitting a single
dst refcnt because a single dst is shared by all sockets...

Now some information was migrated to inetpeer, I wonder if we could not
allocate a private dst for a socket ?

Just an idea before the night...

^ permalink raw reply

* [PATCH net-next] bnx2x: reduce skb truesize by 50%
From: Eric Dumazet @ 2011-11-10  0:29 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, pstaszewski, netdev, Eilon Greenstein
In-Reply-To: <1320876183.3272.8.camel@edumazet-laptop>

Le mercredi 09 novembre 2011 à 23:03 +0100, Eric Dumazet a écrit :

> BTW, on my bnx2x adapter, even small UDP frames use more than PAGE_SIZE
> bytes :
> 
> skb->truesize=4352 len=26 (payload only)
> 

> I wonder if we shouldnt increase SK_MEM_QUANTUM a bit to avoid
> ping/pong...
> 
> -#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
> +#define SK_MEM_QUANTUM ((int)PAGE_SIZE * 2)
> 

Following patch also helps a lot, even with only two cpus (one handling
device interrupts, one running the application thread)

[PATCH net-next] bnx2x: reduce skb truesize by ~50%

bnx2x uses following formula to compute its rx_buf_sz :

dev->mtu + 2*L1_CACHE_BYTES + 14 + 8 + 8

Then core network adds NET_SKB_PAD and SKB_DATA_ALIGN(sizeof(struct
skb_shared_info))

Final allocated size for skb head on x86_64 (L1_CACHE_BYTES = 64,
MTU=1500) : 2112 bytes : SLUB/SLAB round this to 4096 bytes.

Since skb truesize is then bigger than SK_MEM_QUANTUM, we have lot of
false sharing because of mem_reclaim in UDP stack.

One possible way to half truesize is to lower the need by 64 bytes (2112
-> 2048 bytes)

This way, skb->truesize is lower than SK_MEM_QUANTUM and we get better
performance.

(760.000 pps on a rx UDP monothread benchmark, instead of 720.000 pps)


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index aec7212..ebbdc55 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1185,9 +1185,14 @@ struct bnx2x {
 #define ETH_MAX_PACKET_SIZE		1500
 #define ETH_MAX_JUMBO_PACKET_SIZE	9600
 
-	/* Max supported alignment is 256 (8 shift) */
-#define BNX2X_RX_ALIGN_SHIFT		((L1_CACHE_SHIFT < 8) ? \
-					 L1_CACHE_SHIFT : 8)
+/* Max supported alignment is 256 (8 shift)
+ * It should ideally be min(L1_CACHE_SHIFT, 8)
+ * Choosing 5 (32 bytes) permits to get skb heads of 2048 bytes
+ * instead of 4096 bytes.
+ * With SLUB/SLAB allocators, data will be cache line aligned anyway.
+ */
+#define BNX2X_RX_ALIGN_SHIFT		5
+
 	/* FW use 2 Cache lines Alignment for start packet and size  */
 #define BNX2X_FW_RX_ALIGN		(2 << BNX2X_RX_ALIGN_SHIFT)
 #define BNX2X_PXP_DRAM_ALIGN		(BNX2X_RX_ALIGN_SHIFT - 5)

^ permalink raw reply related

* Re: dst->obsolete has become pointless
From: David Miller @ 2011-11-10  0:24 UTC (permalink / raw)
  To: joe; +Cc: steffen.klassert, netdev, timo.teras
In-Reply-To: <1320882969.6923.29.camel@Joe-Laptop>

From: Joe Perches <joe@perches.com>
Date: Wed, 09 Nov 2011 15:56:09 -0800

> As far as I know, other than being large enough to
> store a 1 and 0, it's implementation defined.
> 
> Just like an unsigned short.
> 
> I _believe_ gcc uses unsigned char width for
> _Bool in all normal cases though.

We can only use it in this scenerio if it universally
uses the same type.  This is due to the by-hand layout
of the rest of the structure such that __refcnt ends
up on a different cache line.

^ permalink raw reply

* Re: [PATCH net-next 0/2] 802.1ad S-VLAN support
From: Ben Hutchings @ 2011-11-09 23:58 UTC (permalink / raw)
  To: David Lamparter; +Cc: netdev
In-Reply-To: <20111109153419.GJ1833899@jupiter.n2.diac24.net>

On Wed, 2011-11-09 at 16:34 +0100, David Lamparter wrote:
> On Tue, Nov 08, 2011 at 12:16:33AM +0000, Ben Hutchings wrote:
> > > Hmm. I think we need to cleanly separate MTU and MFS. MTU is used for
> > > upper layer stuff like setting TCP MSS, IP fragment size, etc.
> > >
> > > MFS is the actual ethernet thing, and it's quite independent from the
> > > MTU. Imagine the following example case:
> >
> > I was proposing to make a distinction between the 'untagged' MTU
> > (dev->mtu) that would continue to be used by layer 3 and the physical
> > MTU that would take into account the needs of any related VLAN devices.
> 
> Ah. I think we're saying the same thing with different words.
> 
> > > How about instead of propagating the MFS up, we provide an user knob to
> > > adjust the MFS (on physical devices)?
> >
> > I suppose that may be necessary - unfortunately.
> 
> Hm. Basically, the current ndo_change_mtu call is severely misnamed, it
> actually changes the MFS. MTU isn't even relevant for the driver.

Well, the Ethernet standards effectively specify an MTU of 1500
regardless of VLAN tags rather than specfying a single maximum frame
size or properly acknowledging jumbos.  And so there is hardware that
implements limits in terms of payload length, not frame length.

> With that premise, it boils down to creating new "MFS" attributes for
> userspace, and cleanly splitting L2/L3 values inside the kernel.
> ndo_change_mtu would become ndo_change_mfs; there'd be a MFS_CHANGED
> notifier call; "mtu" becomes an IP-level thing.
> 
> While MFS constrains MTU, I'd prefer to avoid "automatic" functions like
> raising MFS to make the MTU fit. (Or, worse, lowering MTU if MFS gets
> changed. I'd return error instead and have the user fix his config.)
[...]

For backward compatibility, setting the MTU on a physical device must
also raise its MFS if necessary.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: dst->obsolete has become pointless
From: Joe Perches @ 2011-11-09 23:56 UTC (permalink / raw)
  To: David Miller; +Cc: steffen.klassert, netdev, timo.teras
In-Reply-To: <20111109.142044.1307358041881082836.davem@davemloft.net>

On Wed, 2011-11-09 at 14:20 -0500, David Miller wrote:
> From: Joe Perches <joe@perches.com>
> Date: Wed, 09 Nov 2011 04:49:08 -0800
> 
> > On Tue, 2011-11-08 at 13:59 -0500, David Miller wrote:
> >> net: Kill pointless and misleading checks on dst->obsolete.
> > []
> >> Therefore rename it to dst->freed, and make it take on only the values
> >> "0" and "1".
> >> diff --git a/include/net/dst.h b/include/net/dst.h
> > []
> >> @@ -55,7 +55,7 @@ struct dst_entry {
> >>  #define DST_NOCOUNT		0x0020
> >>  
> >>  	short			error;
> >> -	short			obsolete;
> >> +	unsigned short		freed;
> > 
> > perhaps
> > 	bool freed;
> > 	bool __pad3;
> > just to mark the available space a bit more obviously.
> 
> Hmmm, what is a bool's defined type anyways?  It is a char on every
> architecture and ABI?

As far as I know, other than being large enough to
store a 1 and 0, it's implementation defined.

Just like an unsigned short.

I _believe_ gcc uses unsigned char width for
_Bool in all normal cases though.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox