Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 1/4] [IPROUTE2] Revert "Make ip utility veth driver aware"
From: Pavel Emelyanov @ 2007-09-12 12:55 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Stephen Hemminger, netdev, Patrick McHardy
In-Reply-To: <m14pi0cac5.fsf@ebiederm.dsl.xmission.com>

Eric W. Biederman wrote:
> Stephen it looks like you weren't cc'd on the latest version
> of the veth support.  So this patchset first reverts the old

He was. The latest version looks completely different from what
is reversed in this patch.

> version of the veth support you merged.  Then merges a tested
> version of the veth support.
> 
> This reverts commit 4ed390ce43d1ec7c881721f312260df901d8390d.
> 
> Conflicts:
> 
> 	ip/ip.c
> ---
>  ip/Makefile |    2 +-
>  ip/ip.c     |    4 +-
>  ip/veth.c   |  196 -----------------------------------------------------------
>  ip/veth.h   |   17 -----
>  4 files changed, 2 insertions(+), 217 deletions(-)
>  delete mode 100644 ip/veth.c
>  delete mode 100644 ip/veth.h
> 
> diff --git a/ip/Makefile b/ip/Makefile
> index 209c5c8..9a5bfe3 100644
> --- a/ip/Makefile
> +++ b/ip/Makefile
> @@ -1,7 +1,7 @@
>  IPOBJ=ip.o ipaddress.o iproute.o iprule.o \
>      rtm_map.o iptunnel.o ip6tunnel.o tunnel.o ipneigh.o ipntable.o iplink.o \
>      ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o \
> -    ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o veth.o
> +    ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o
>  
>  RTMONOBJ=rtmon.o
>  
> diff --git a/ip/ip.c b/ip/ip.c
> index 829fc64..4bdb83b 100644
> --- a/ip/ip.c
> +++ b/ip/ip.c
> @@ -27,7 +27,6 @@
>  #include "SNAPSHOT.h"
>  #include "utils.h"
>  #include "ip_common.h"
> -#include "veth.h"
>  
>  int preferred_family = AF_UNSPEC;
>  int show_stats = 0;
> @@ -48,7 +47,7 @@ static void usage(void)
>  "Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n"
>  "       ip [ -force ] [-batch filename\n"
>  "where  OBJECT := { link | addr | route | rule | neigh | ntable | tunnel |\n"
> -"                   maddr | mroute | monitor | xfrm | veth }\n"
> +"                   maddr | mroute | monitor | xfrm }\n"
>  "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
>  "                    -f[amily] { inet | inet6 | ipx | dnet | link } |\n"
>  "                    -o[neline] | -t[imestamp] }\n");
> @@ -78,7 +77,6 @@ static const struct cmd {
>  	{ "monitor",	do_ipmonitor },
>  	{ "xfrm",	do_xfrm },
>  	{ "mroute",	do_multiroute },
> -	{ "veth",	do_veth },
>  	{ "help",	do_help },
>  	{ 0 }
>  };
> diff --git a/ip/veth.c b/ip/veth.c
> deleted file mode 100644
> index d4eecc8..0000000
> --- a/ip/veth.c
> +++ /dev/null
> @@ -1,196 +0,0 @@
> -/*
> - * veth.c	       "ethernet tunnel"
> - *
> - *		This program is free software; you can redistribute it and/or
> - *		modify it under the terms of the GNU General Public License
> - *		as published by the Free Software Foundation; either version
> - *		2 of the License, or (at your option) any later version.
> - *
> - * Authors:	Pavel Emelianov, <xemul@openvz.org>
> - *
> - */
> -
> -#include <stdio.h>
> -#include <string.h>
> -#include <unistd.h>
> -#include <sys/types.h>
> -#include <sys/socket.h>
> -#include <linux/genetlink.h>
> -
> -#include "utils.h"
> -#include "veth.h"
> -
> -#define GENLMSG_DATA(glh)       ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
> -#define NLA_DATA(na)            ((void *)((char*)(na) + NLA_HDRLEN))
> -
> -static int do_veth_help(void)
> -{
> -	fprintf(stderr, "Usage: ip veth add DEVICE PEER_NAME\n");
> -	fprintf(stderr, "               del DEVICE\n");
> -	exit(-1);
> -}
> -
> -static int genl_ctrl_resolve_family(const char *family)
> -{
> -	struct rtnl_handle rth;
> -	struct nlmsghdr *nlh;
> -	struct genlmsghdr *ghdr;
> -	int ret = 0;
> -	struct {
> -		struct nlmsghdr         n;
> -		char                    buf[4096];
> -	} req;
> -
> -	memset(&req, 0, sizeof(req));
> -
> -	nlh = &req.n;
> -	nlh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
> -	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
> -	nlh->nlmsg_type = GENL_ID_CTRL;
> -
> -	ghdr = NLMSG_DATA(&req.n);
> -	ghdr->cmd = CTRL_CMD_GETFAMILY;
> -
> -	if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0) {
> -		fprintf(stderr, "Cannot open generic netlink socket\n");
> -		exit(1);
> -	}
> -
> -	addattr_l(nlh, 128, CTRL_ATTR_FAMILY_NAME, family, strlen(family) + 1);
> -
> -	if (rtnl_talk(&rth, nlh, 0, 0, nlh, NULL, NULL) < 0) {
> -		fprintf(stderr, "Error talking to the kernel\n");
> -		goto errout;
> -	}
> -
> -	{
> -		struct rtattr *tb[CTRL_ATTR_MAX + 1];
> -		struct genlmsghdr *ghdr = NLMSG_DATA(nlh);
> -		int len = nlh->nlmsg_len;
> -		struct rtattr *attrs;
> -
> -		if (nlh->nlmsg_type !=  GENL_ID_CTRL) {
> -			fprintf(stderr, "Not a controller message, nlmsg_len=%d "
> -				"nlmsg_type=0x%x\n", nlh->nlmsg_len, nlh->nlmsg_type);
> -			goto errout;
> -		}
> -
> -		if (ghdr->cmd != CTRL_CMD_NEWFAMILY) {
> -			fprintf(stderr, "Unkown controller command %d\n", ghdr->cmd);
> -			goto errout;
> -		}
> -
> -		len -= NLMSG_LENGTH(GENL_HDRLEN);
> -
> -		if (len < 0) {
> -			fprintf(stderr, "wrong controller message len %d\n", len);
> -			return -1;
> -		}
> -
> -		attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
> -		parse_rtattr(tb, CTRL_ATTR_MAX, attrs, len);
> -
> -		if (tb[CTRL_ATTR_FAMILY_ID] == NULL) {
> -			fprintf(stderr, "Missing family id TLV\n");
> -			goto errout;
> -		}
> -
> -		ret = *(__u16 *) RTA_DATA(tb[CTRL_ATTR_FAMILY_ID]);
> -	}
> -
> -errout:
> -	rtnl_close(&rth);
> -	return ret;
> -}
> -
> -static int do_veth_operate(char *dev, char *peer, int cmd)
> -{
> -	struct rtnl_handle rth;
> -	struct nlmsghdr *nlh;
> -	struct genlmsghdr *ghdr;
> -	struct nlattr *attr;
> -	struct  {
> -		struct nlmsghdr n;
> -		struct genlmsghdr h;
> -		char bug[1024];
> -	} req;
> -	int family, len;
> -	int err = 0;
> -
> -	family = genl_ctrl_resolve_family("veth");
> -	if (family == 0) {
> -		fprintf(stderr, "veth: Can't resolve family\n");
> -		exit(1);
> -	}
> -
> -	if (rtnl_open_byproto(&rth, 0, NETLINK_GENERIC) < 0)
> -		exit(1);
> -
> -	nlh = &req.n;
> -	nlh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
> -	nlh->nlmsg_flags = NLM_F_REQUEST;
> -	nlh->nlmsg_type = family;
> -	nlh->nlmsg_seq = 0;
> -
> -	ghdr = &req.h;
> -	ghdr->cmd = cmd;
> -
> -	attr = (struct nlattr *) GENLMSG_DATA(&req);
> -	len = strlen(dev);
> -	attr->nla_type = VETH_ATTR_DEVNAME;
> -	attr->nla_len = len + 1 + NLA_HDRLEN;
> -	memcpy(NLA_DATA(attr), dev, len);
> -	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
> -
> -	if (peer) {
> -		attr = (struct nlattr *)((char *)attr +
> -				NLMSG_ALIGN(attr->nla_len));
> -		len = strlen(peer);
> -		attr->nla_type = VETH_ATTR_PEERNAME;
> -		attr->nla_len = len + 1 + NLA_HDRLEN;
> -		memcpy(NLA_DATA(attr), peer, len);
> -		nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
> -	}
> -
> -	if (rtnl_send(&rth, (char *) &req, nlh->nlmsg_len) < 0) {
> -		err = -1;
> -		fprintf(stderr, "Error talking to the kernel (add)\n");
> -	}
> -
> -	rtnl_close(&rth);
> -	return err;
> -}
> -
> -static int do_veth_add(int argc, char **argv)
> -{
> -	if (argc < 2)
> -		return do_veth_help();
> -
> -	return do_veth_operate(argv[0], argv[1], VETH_CMD_ADD);
> -}
> -
> -static int do_veth_del(int argc, char **argv)
> -{
> -	char *name;
> -
> -	if (argc < 1)
> -		return do_veth_help();
> -
> -	return do_veth_operate(argv[0], NULL, VETH_CMD_DEL);
> -}
> -
> -int do_veth(int argc, char **argv)
> -{
> -	if (argc == 0)
> -		return do_veth_help();
> -
> -	if (strcmp(*argv, "add") == 0 || strcmp(*argv, "a") == 0)
> -		return do_veth_add(argc - 1, argv + 1);
> -	if (strcmp(*argv, "del") == 0 || strcmp(*argv, "d") == 0)
> -		return do_veth_del(argc - 1, argv + 1);
> -	if (strcmp(*argv, "help") == 0)
> -		return do_veth_help();
> -
> -	fprintf(stderr, "Command \"%s\" is unknown, try \"ip veth help\".\n", *argv);
> -	exit(-1);
> -}
> diff --git a/ip/veth.h b/ip/veth.h
> deleted file mode 100644
> index 4d7b357..0000000
> --- a/ip/veth.h
> +++ /dev/null
> @@ -1,17 +0,0 @@
> -int do_veth(int argc, char **argv);
> -
> -enum {
> -	VETH_CMD_UNSPEC, 
> -	VETH_CMD_ADD, 
> -	VETH_CMD_DEL,
> -
> -	VETH_CMD_MAX
> -};
> -
> -enum {
> -	VETH_ATTR_UNSPEC,
> -	VETH_ATTR_DEVNAME,
> -	VETH_ATTR_PEERNAME,
> -
> -	VETH_ATTR_MAX
> -};


^ permalink raw reply

* Re: [net-2.6.24][NETNS][patch 3/3] fix bad macro definition
From: David Miller @ 2007-09-12 12:58 UTC (permalink / raw)
  To: dlezcano; +Cc: ebiederm, containers, netdev, benjamin.thery
In-Reply-To: <20070912124429.288622995@mai.toulouse-stg.fr.ibm.com>

From: dlezcano@fr.ibm.com
Date: Wed, 12 Sep 2007 14:38:14 +0200

> From: Daniel Lezcano <dlezcano@fr.ibm.com>
> 
> The macro definition is bad. When calling next_net_device with 
> parameter name "dev", the resulting code is:
> 	  struct net_device *dev = dev and that leads to an unexpected
> behavior. Especially when llc_core is compiled in, the kernel panics
> at boot time.
> The patchset change macro definition with static inline functions as
> they were defined before.
> 
> Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>

Applied, thanks.

^ permalink raw reply

* Re: [net-2.6.24][NETNS][patch 1/3] fix export symbols
From: Daniel Lezcano @ 2007-09-12 12:57 UTC (permalink / raw)
  To: David Miller; +Cc: ebiederm, containers, netdev, markn, benjamin.thery
In-Reply-To: <20070912.055307.23027518.davem@davemloft.net>

David Miller wrote:
> From: dlezcano@fr.ibm.com
> Date: Wed, 12 Sep 2007 14:38:12 +0200
> 
>> From: Daniel Lezcano <dlezcano@fr.ibm.com>
>>
>> Add the appropriate EXPORT_SYMBOLS for proc_net_create,
>> proc_net_fops_create and proc_net_remove to fix errors when
>> compiling allmodconfig
>>
>> Signed-off-by: Mark Nelson <markn@au1.ibm.com>
>> Acked-by: Benjamin Thery <benjamin.thery@bull.net>
> 
> Applied to net-2.6.24, thanks.
> 
> Why aren't you signing off on these patches?  Please
> do so in the future.
> 
> Because "From: " usually means you are the patch author, and I can't
> tell who wrote these patches, you or these other people listed in the
> signoff area.
> 

Sorry for that, I will take care of that next time. Thanks.

^ permalink raw reply

* [PATCH 2/6] [IPROUTE2] Introduce iplink_parse() routine
From: Eric W. Biederman @ 2007-09-12 12:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m14pi0cac5.fsf@ebiederm.dsl.xmission.com>

From: Pavel Emelyanov <xemul@openvz.org>
Date: Thu, 19 Jul 2007 13:32:31 +0400

This routine parses CLI attributes, describing generic link
parameters such as name, address, etc.

This is mostly copy-pasted from iplink_modify().

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Patrick McHardy <kaber@trash.net>
---
 include/utils.h |    3 +
 ip/iplink.c     |  127 +++++++++++++++++++++++++++++++-----------------------
 2 files changed, 76 insertions(+), 54 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index a3fd335..3fd851d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -146,4 +146,7 @@ extern int cmdlineno;
 extern size_t getcmdline(char **line, size_t *len, FILE *in);
 extern int makeargs(char *line, char *argv[], int maxargs);
 
+struct iplink_req;
+int iplink_parse(int argc, char **argv, struct iplink_req *req,
+		char **name, char **type, char **link, char **dev);
 #endif /* __UTILS_H__ */
diff --git a/ip/iplink.c b/ip/iplink.c
index 4060845..64989b2 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -142,140 +142,159 @@ static int iplink_have_newlink(void)
 }
 #endif /* ! IPLINK_IOCTL_COMPAT */
 
-static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv)
+struct iplink_req {
+	struct nlmsghdr		n;
+	struct ifinfomsg	i;
+	char			buf[1024];
+};
+
+int iplink_parse(int argc, char **argv, struct iplink_req *req,
+		char **name, char **type, char **link, char **dev)
 {
+	int ret, len;
+	char abuf[32];
 	int qlen = -1;
 	int mtu = -1;
-	int len;
-	char abuf[32];
-	char *dev = NULL;
-	char *name = NULL;
-	char *link = NULL;
-	char *type = NULL;
-	struct link_util *lu = NULL;
-	struct {
-		struct nlmsghdr		n;
-		struct ifinfomsg	i;
-		char			buf[1024];
-	} req;
 
-	memset(&req, 0, sizeof(req));
-
-	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
-	req.n.nlmsg_type = cmd;
-	req.i.ifi_family = preferred_family;
+	ret = argc;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "up") == 0) {
-			req.i.ifi_change |= IFF_UP;
-			req.i.ifi_flags |= IFF_UP;
+			req->i.ifi_change |= IFF_UP;
+			req->i.ifi_flags |= IFF_UP;
 		} else if (strcmp(*argv, "down") == 0) {
-			req.i.ifi_change |= IFF_UP;
-			req.i.ifi_flags &= ~IFF_UP;
+			req->i.ifi_change |= IFF_UP;
+			req->i.ifi_flags &= ~IFF_UP;
 		} else if (strcmp(*argv, "name") == 0) {
 			NEXT_ARG();
-			name = *argv;
+			*name = *argv;
 		} else if (matches(*argv, "link") == 0) {
 			NEXT_ARG();
-			link = *argv;
+			*link = *argv;
 		} else if (matches(*argv, "address") == 0) {
 			NEXT_ARG();
 			len = ll_addr_a2n(abuf, sizeof(abuf), *argv);
-			addattr_l(&req.n, sizeof(req), IFLA_ADDRESS, abuf, len);
+			addattr_l(&req->n, sizeof(*req), IFLA_ADDRESS, abuf, len);
 		} else if (matches(*argv, "broadcast") == 0 ||
-			   strcmp(*argv, "brd") == 0) {
+				strcmp(*argv, "brd") == 0) {
 			NEXT_ARG();
 			len = ll_addr_a2n(abuf, sizeof(abuf), *argv);
-			addattr_l(&req.n, sizeof(req), IFLA_BROADCAST, abuf, len);
+			addattr_l(&req->n, sizeof(*req), IFLA_BROADCAST, abuf, len);
 		} else if (matches(*argv, "txqueuelen") == 0 ||
-			   strcmp(*argv, "qlen") == 0 ||
-			   matches(*argv, "txqlen") == 0) {
+				strcmp(*argv, "qlen") == 0 ||
+				matches(*argv, "txqlen") == 0) {
 			NEXT_ARG();
 			if (qlen != -1)
 				duparg("txqueuelen", *argv);
 			if (get_integer(&qlen,  *argv, 0))
 				invarg("Invalid \"txqueuelen\" value\n", *argv);
-			addattr_l(&req.n, sizeof(req), IFLA_TXQLEN, &qlen, 4);
+			addattr_l(&req->n, sizeof(*req), IFLA_TXQLEN, &qlen, 4);
 		} else if (strcmp(*argv, "mtu") == 0) {
 			NEXT_ARG();
 			if (mtu != -1)
 				duparg("mtu", *argv);
 			if (get_integer(&mtu, *argv, 0))
 				invarg("Invalid \"mtu\" value\n", *argv);
-			addattr_l(&req.n, sizeof(req), IFLA_MTU, &mtu, 4);
+			addattr_l(&req->n, sizeof(*req), IFLA_MTU, &mtu, 4);
 		} else if (strcmp(*argv, "multicast") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_MULTICAST;
+			req->i.ifi_change |= IFF_MULTICAST;
 			if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags |= IFF_MULTICAST;
+				req->i.ifi_flags |= IFF_MULTICAST;
 			} else if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags &= ~IFF_MULTICAST;
+				req->i.ifi_flags &= ~IFF_MULTICAST;
 			} else
 				return on_off("multicast");
 		} else if (strcmp(*argv, "allmulticast") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_ALLMULTI;
+			req->i.ifi_change |= IFF_ALLMULTI;
 			if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags |= IFF_ALLMULTI;
+				req->i.ifi_flags |= IFF_ALLMULTI;
 			} else if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags &= ~IFF_ALLMULTI;
+				req->i.ifi_flags &= ~IFF_ALLMULTI;
 			} else
 				return on_off("allmulticast");
 		} else if (strcmp(*argv, "promisc") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_PROMISC;
+			req->i.ifi_change |= IFF_PROMISC;
 			if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags |= IFF_PROMISC;
+				req->i.ifi_flags |= IFF_PROMISC;
 			} else if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags &= ~IFF_PROMISC;
+				req->i.ifi_flags &= ~IFF_PROMISC;
 			} else
 				return on_off("promisc");
 		} else if (strcmp(*argv, "trailers") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_NOTRAILERS;
+			req->i.ifi_change |= IFF_NOTRAILERS;
 			if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags |= IFF_NOTRAILERS;
+				req->i.ifi_flags |= IFF_NOTRAILERS;
 			} else if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags &= ~IFF_NOTRAILERS;
+				req->i.ifi_flags &= ~IFF_NOTRAILERS;
 			} else
 				return on_off("trailers");
 		} else if (strcmp(*argv, "arp") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_NOARP;
+			req->i.ifi_change |= IFF_NOARP;
 			if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags &= ~IFF_NOARP;
+				req->i.ifi_flags &= ~IFF_NOARP;
 			} else if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags |= IFF_NOARP;
+				req->i.ifi_flags |= IFF_NOARP;
 			} else
 				return on_off("noarp");
 #ifdef IFF_DYNAMIC
 		} else if (matches(*argv, "dynamic") == 0) {
 			NEXT_ARG();
-			req.i.ifi_change |= IFF_DYNAMIC;
+			req->i.ifi_change |= IFF_DYNAMIC;
 			if (strcmp(*argv, "on") == 0) {
-				req.i.ifi_flags |= IFF_DYNAMIC;
+				req->i.ifi_flags |= IFF_DYNAMIC;
 			} else if (strcmp(*argv, "off") == 0) {
-				req.i.ifi_flags &= ~IFF_DYNAMIC;
+				req->i.ifi_flags &= ~IFF_DYNAMIC;
 			} else
 				return on_off("dynamic");
 #endif
 		} else if (matches(*argv, "type") == 0) {
 			NEXT_ARG();
-			type = *argv;
+			*type = *argv;
 			argc--; argv++;
 			break;
 		} else {
-                        if (strcmp(*argv, "dev") == 0) {
+			if (strcmp(*argv, "dev") == 0) {
 				NEXT_ARG();
 			}
-			if (dev)
+			if (*dev)
 				duparg2("dev", *argv);
-			dev = *argv;
+			*dev = *argv;
 		}
 		argc--; argv++;
 	}
 
+	return ret - argc;
+}
+
+static int iplink_modify(int cmd, unsigned int flags, int argc, char **argv)
+{
+	int len;
+	char *dev = NULL;
+	char *name = NULL;
+	char *link = NULL;
+	char *type = NULL;
+	struct link_util *lu = NULL;
+	struct iplink_req req;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.i.ifi_family = preferred_family;
+
+	ret = iplink_parse(argc, argv, &req, &name, &type, &link, &dev);
+	if (ret < 0)
+		return ret;
+
+	argc -= ret;
+	argv += ret;
 	ll_init_map(&rth);
 
 	if (type) {
-- 
1.5.3.rc6.17.g1911


^ permalink raw reply related

* [PATCH 3/4] [IPROUTE2] Module for ip utility to support veth device
From: Eric W. Biederman @ 2007-09-12 12:59 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m1zlzsavla.fsf@ebiederm.dsl.xmission.com>

From: Pavel Emelyanov <xemul@openvz.org>
Date: Thu, 19 Jul 2007 13:33:56 +0400

The link_veth.so itself.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Patrick McHardy <kaber@trash.net>
---
 ip/Makefile    |    6 ++++-
 ip/link_veth.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ip/veth.h      |   12 ++++++++++
 3 files changed, 80 insertions(+), 1 deletions(-)
 create mode 100644 ip/link_veth.c
 create mode 100644 ip/veth.h

diff --git a/ip/Makefile b/ip/Makefile
index 9a5bfe3..b46bce3 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -8,8 +8,9 @@ RTMONOBJ=rtmon.o
 ALLOBJ=$(IPOBJ) $(RTMONOBJ)
 SCRIPTS=ifcfg rtpr routel routef
 TARGETS=ip rtmon
+LIBS=link_veth.so
 
-all: $(TARGETS) $(SCRIPTS)
+all: $(TARGETS) $(SCRIPTS) $(LIBS)
 
 ip: $(IPOBJ) $(LIBNETLINK) $(LIBUTIL)
 
@@ -24,3 +25,6 @@ clean:
 
 LDLIBS	+= -ldl
 LDFLAGS	+= -Wl,-export-dynamic
+
+%.so: %.c
+	$(CC) $(CFLAGS) -shared $< -o $@
diff --git a/ip/link_veth.c b/ip/link_veth.c
new file mode 100644
index 0000000..ded2cdd
--- /dev/null
+++ b/ip/link_veth.c
@@ -0,0 +1,63 @@
+/*
+ * link_veth.c	veth driver module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <string.h>
+
+#include "utils.h"
+#include "ip_common.h"
+#include "veth.h"
+
+#define	IFNAMSIZ	16
+
+static void usage(void)
+{
+	printf("Usage: ip link add ... type veth "
+			"[peer <peer-name>] [mac <mac>] [peer_mac <mac>]\n");
+}
+
+static int veth_parse_opt(struct link_util *lu, int argc, char **argv,
+		struct nlmsghdr *hdr)
+{
+	char *name, *type, *link, *dev;
+	int err, len;
+	struct rtattr * data;
+
+	if (strcmp(argv[0], "peer") != 0) {
+		usage();
+		return -1;
+	}
+
+	data = NLMSG_TAIL(hdr);
+	addattr_l(hdr, 1024, VETH_INFO_PEER, NULL, 0);
+
+	hdr->nlmsg_len += sizeof(struct ifinfomsg);
+
+	err = iplink_parse(argc - 1, argv + 1, (struct iplink_req *)hdr,
+			&name, &type, &link, &dev);
+	if (err < 0)
+		return err;
+
+	if (name) {
+		len = strlen(name) + 1;
+		if (len > IFNAMSIZ)
+			invarg("\"name\" too long\n", *argv);
+		addattr_l(hdr, 1024, IFLA_IFNAME, name, len);
+	}
+
+	data->rta_len = (void *)NLMSG_TAIL(hdr) - (void *)data;
+	return argc - 1 - err;
+}
+
+struct link_util veth_link_util = {
+	.id = "veth",
+	.parse_opt = veth_parse_opt,
+};
diff --git a/ip/veth.h b/ip/veth.h
new file mode 100644
index 0000000..aa2e6f9
--- /dev/null
+++ b/ip/veth.h
@@ -0,0 +1,12 @@
+#ifndef __NET_VETH_H__
+#define __NET_VETH_H__
+
+enum {
+	VETH_INFO_UNSPEC,
+	VETH_INFO_PEER,
+
+	__VETH_INFO_MAX
+#define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
+};
+
+#endif
-- 
1.5.3.rc6.17.g1911


^ permalink raw reply related

* [PATCH 4/4] [IPROUTE2] iproute2: link_veth support bug fixes.
From: Eric W. Biederman @ 2007-09-12 13:01 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m1veagavjr.fsf_-_@ebiederm.dsl.xmission.com>

From: Eric W. Biederman <ebiederm@xmission.com>
Date: Sat, 8 Sep 2007 10:17:43 -0600

This patch contains small compile and implementation
bug fixes for link_veth.c.

The compile fixes stop trying to build a shared object
when we can just as easily compile the code in, which makes
supporting non-i386 architectures easier.

The documentation is fixed to not document the previous version
of the veth support.

The code is fixed to initialize its pointers before calling
iplink_parse, and we now set name = dev if name is not
passed.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 ip/Makefile    |    8 +++-----
 ip/link_veth.c |   12 +++++++++---
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/ip/Makefile b/ip/Makefile
index b46bce3..a98e1f3 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -3,14 +3,15 @@ IPOBJ=ip.o ipaddress.o iproute.o iprule.o \
     ipmaddr.o ipmonitor.o ipmroute.o ipprefix.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o
 
+IPOBJ += link_veth.o
+
 RTMONOBJ=rtmon.o
 
 ALLOBJ=$(IPOBJ) $(RTMONOBJ)
 SCRIPTS=ifcfg rtpr routel routef
 TARGETS=ip rtmon
-LIBS=link_veth.so
 
-all: $(TARGETS) $(SCRIPTS) $(LIBS)
+all: $(TARGETS) $(SCRIPTS)
 
 ip: $(IPOBJ) $(LIBNETLINK) $(LIBUTIL)
 
@@ -25,6 +26,3 @@ clean:
 
 LDLIBS	+= -ldl
 LDFLAGS	+= -Wl,-export-dynamic
-
-%.so: %.c
-	$(CC) $(CFLAGS) -shared $< -o $@
diff --git a/ip/link_veth.c b/ip/link_veth.c
index ded2cdd..6f3931c 100644
--- a/ip/link_veth.c
+++ b/ip/link_veth.c
@@ -20,14 +20,16 @@
 
 static void usage(void)
 {
-	printf("Usage: ip link add ... type veth "
-			"[peer <peer-name>] [mac <mac>] [peer_mac <mac>]\n");
+	printf("Usage: ip link add ... type veth peer { ... }\n");
 }
 
 static int veth_parse_opt(struct link_util *lu, int argc, char **argv,
 		struct nlmsghdr *hdr)
 {
-	char *name, *type, *link, *dev;
+	char *dev = NULL;
+	char *name = NULL;
+	char *link = NULL;
+	char *type = NULL;
 	int err, len;
 	struct rtattr * data;
 
@@ -46,6 +48,10 @@ static int veth_parse_opt(struct link_util *lu, int argc, char **argv,
 	if (err < 0)
 		return err;
 
+	/* Allow "ip link add dev" and "ip link add name" */
+	if (!name)
+		name = dev;
+
 	if (name) {
 		len = strlen(name) + 1;
 		if (len > IFNAMSIZ)
-- 
1.5.3.rc6.17.g1911


^ permalink raw reply related

* [PATCH] [IPROUTE2] Basic documentation for dynamic link creation/destruction.
From: Eric W. Biederman @ 2007-09-12 13:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m1r6l4avhp.fsf_-_@ebiederm.dsl.xmission.com>


This updates the usage to indicate that we support link creation
and destruction in addition to just setting link parameters.

It's not really great documentation of the new netlink support
for link creation and removal, but it is a start.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 ip/iplink.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index 64989b2..541f3d6 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -38,7 +38,8 @@ static void usage(void) __attribute__((noreturn));
 
 void iplink_usage(void)
 {
-	fprintf(stderr, "Usage: ip link set DEVICE { up | down |\n");
+	fprintf(stderr, "Usage: ip link { set | add | replace | delete } DEVICE {\n");
+	fprintf(stderr, "			     up | down |\n");
 	fprintf(stderr, "	                     arp { on | off } |\n");
 	fprintf(stderr, "	                     dynamic { on | off } |\n");
 	fprintf(stderr, "	                     multicast { on | off } |\n");
@@ -48,7 +49,9 @@ void iplink_usage(void)
 	fprintf(stderr, "	                     txqueuelen PACKETS |\n");
 	fprintf(stderr, "	                     name NEWNAME |\n");
 	fprintf(stderr, "	                     address LLADDR | broadcast LLADDR |\n");
-	fprintf(stderr, "	                     mtu MTU }\n");
+	fprintf(stderr, "	                     mtu MTU | \n");
+	fprintf(stderr, "			     type TYPE [ TYPE specifc options]\n");
+	fprintf(stderr, "			     }\n");
 	fprintf(stderr, "       ip link show [ DEVICE ]\n");
 	exit(-1);
 }
-- 
1.5.3.rc6.17.g1911


^ permalink raw reply related

* [PATCH] [IPROUTE2] Add support for moving links between network namespaces
From: Eric W. Biederman @ 2007-09-12 13:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m1myvsavdn.fsf_-_@ebiederm.dsl.xmission.com>


This adds support for setting the IFLA_NET_NS_PID attribute
on links allowing them to be moved between network namespaces.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/if_link.h |    1 +
 ip/iplink.c             |    9 +++++++++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 23b3a8e..c948395 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -78,6 +78,7 @@ enum
 	IFLA_LINKMODE,
 	IFLA_LINKINFO,
 #define IFLA_LINKINFO IFLA_LINKINFO
+	IFLA_NET_NS_PID,
 	__IFLA_MAX
 };
 
diff --git a/ip/iplink.c b/ip/iplink.c
index 541f3d6..624c784 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -158,6 +158,7 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
 	char abuf[32];
 	int qlen = -1;
 	int mtu = -1;
+	pid_t netns_pid = -1;
 
 	ret = argc;
 
@@ -255,6 +256,14 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
 			} else
 				return on_off("dynamic");
 #endif
+		} else if (matches(*argv, "netnspid") == 0) {
+			NEXT_ARG();
+			if (netns_pid != -1)
+				duparg("netnspid", *argv);
+			if (get_integer(&netns_pid, *argv, 0))
+				invarg("Invalid \"netnspid\" value\n", *argv);
+			addattr_l(&req->n, sizeof(*req), IFLA_NET_NS_PID,
+				&netns_pid, sizeof(netns_pid));
 		} else if (matches(*argv, "type") == 0) {
 			NEXT_ARG();
 			*type = *argv;
-- 
1.5.3.rc6.17.g1911


^ permalink raw reply related

* Re: [PATCH 1/4] [IPROUTE2] Revert "Make ip utility veth driver aware"
From: Eric W. Biederman @ 2007-09-12 13:09 UTC (permalink / raw)
  To: Pavel Emelyanov; +Cc: Stephen Hemminger, netdev, Patrick McHardy
In-Reply-To: <46E7E1BF.6070108@openvz.org>

Pavel Emelyanov <xemul@openvz.org> writes:

> Eric W. Biederman wrote:
>> Stephen it looks like you weren't cc'd on the latest version
>> of the veth support.  So this patchset first reverts the old
>
> He was. The latest version looks completely different from what
> is reversed in this patch.

This is against the latest snapshot I could find.  My apologies
if I missed some of the communication.

Eric

^ permalink raw reply

* Re: [-mm patch] really unexport do_softirq
From: David Miller @ 2007-09-12 13:14 UTC (permalink / raw)
  To: bunk; +Cc: akpm, robert.olsson, linux-kernel, netdev
In-Reply-To: <20070909202540.GU3563@stusta.de>

From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 9 Sep 2007 22:25:40 +0200

> On Fri, Aug 31, 2007 at 09:58:22PM -0700, Andrew Morton wrote:
> >...
> > Changes since 2.6.23-rc3-mm1:
> >...
> >  git-net.patch
> >...
> >  git trees
> >...
> 
> This hydra had more than one head...
> 
> Signed-off-by: Adrian Bunk <bunk@kernel.org>

Applied, thanks.

^ permalink raw reply

* Re: [-mm patch] unexport raise_softirq_irqoff
From: David Miller @ 2007-09-12 13:15 UTC (permalink / raw)
  To: hch; +Cc: bunk, akpm, linux-kernel, netdev
In-Reply-To: <20070909204153.GB11764@infradead.org>

From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 9 Sep 2007 21:41:53 +0100

> On Sun, Sep 09, 2007 at 10:25:44PM +0200, Adrian Bunk wrote:
> > On Fri, Aug 31, 2007 at 09:58:22PM -0700, Andrew Morton wrote:
> > >...
> > > Changes since 2.6.23-rc3-mm1:
> > >...
> > >  git-net.patch
> > >...
> > >  git trees
> > >...
> > 
> > raise_softirq_irqoff no longer has any modular user.
> > 
> > Signed-off-by: Adrian Bunk <bunk@kernel.org>
> 
> This should probably go in through Dave's tree as it's removing this
> rather annoying user.

Yep, I've just tossed it into my tree.

Thanks.

^ permalink raw reply

* Re: [2.6 patch] make sctp_addto_param() static
From: David Miller @ 2007-09-12 13:17 UTC (permalink / raw)
  To: bunk
  Cc: yjwei, vladislav.yasevich, sri, linux-kernel, lksctp-developers,
	netdev
In-Reply-To: <20070909202550.GX3563@stusta.de>

From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 9 Sep 2007 22:25:50 +0200

> sctp_addto_param() can become static.
> 
> Signed-off-by: Adrian Bunk <bunk@kernel.org>

Applied, thanks.

^ permalink raw reply

* Re: [-mm patch] net/sctp/socket.c: make 3 variables static
From: David Miller @ 2007-09-12 13:18 UTC (permalink / raw)
  To: bunk; +Cc: akpm, vladislav.yasevich, sri, linux-kernel, lksctp-developers,
	netdev
In-Reply-To: <20070909202554.GY3563@stusta.de>

From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 9 Sep 2007 22:25:54 +0200

> This patch makes the following needlessly globalvariables static:
> - sctp_memory_pressure
> - sctp_memory_allocated
> - sctp_sockets_allocated
> 
> Signed-off-by: Adrian Bunk <bunk@kernel.org>

Applied, thanks.

^ permalink raw reply

* [PATCH] veth: Cleanly handle a missing peer_tb argument on creation.
From: Eric W. Biederman @ 2007-09-12 13:19 UTC (permalink / raw)
  To: Pavel Emelyanov, David Miller; +Cc: Patrick McHardy, netdev, Stephen Hemminger

I was getting strange kernel crashes when attempting to
create veth devices when I did not specify a peer argument
to /bin/ip.

So this patch defaults peer_tb to all zeros and doesn't attempt to
reuse the netlink attributes for the primary link to create the
secondary link and now I can't reproduce the failures.

Given that reusing some of the most interesting netlink attributes,
like a MAC address or a network device name, is generally the wrong
thing to do, this seems like the right approach.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/veth.c |   16 +++++++---------
 1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 9e6a746..d49bd2c 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -313,7 +313,7 @@ static int veth_newlink(struct net_device *dev,
 	struct net_device *peer;
 	struct veth_priv *priv;
 	char ifname[IFNAMSIZ];
-	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
+	struct nlattr *peer_tb[IFLA_MAX + 1];

 	/*
 	 * create and register peer first
@@ -322,6 +322,7 @@ static int veth_newlink(struct net_device *dev,
 	 * skip it since no info from it is useful yet
 	 */

+	memset(peer_tb, 0, sizeof(peer_tb));
 	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
 		struct nlattr *nla_peer;

@@ -336,21 +337,18 @@ static int veth_newlink(struct net_device *dev,
 		err = veth_validate(peer_tb, NULL);
 		if (err < 0)
 			return err;
+	}

-		tbp = peer_tb;
-	} else
-		tbp = tb;
-
-	if (tbp[IFLA_IFNAME])
-		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
+	if (peer_tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, peer_tb[IFLA_IFNAME], IFNAMSIZ);
 	else
 		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");

-	peer = rtnl_create_link(dev->nd_net, ifname, &veth_link_ops, tbp);
+	peer = rtnl_create_link(dev->nd_net, ifname, &veth_link_ops, peer_tb);
 	if (IS_ERR(peer))
 		return PTR_ERR(peer);

-	if (tbp[IFLA_ADDRESS] == NULL)
+	if (peer_tb[IFLA_ADDRESS] == NULL)
 		random_ether_addr(peer->dev_addr);

 	err = register_netdevice(peer);
-- 
1.5.3.rc6.17.g1911

^ permalink raw reply related

* Re: [-mm patch] make tcp_splice_data_recv() static
From: David Miller @ 2007-09-12 13:21 UTC (permalink / raw)
  To: bunk; +Cc: akpm, jens.axboe, linux-kernel, netdev
In-Reply-To: <20070909202558.GZ3563@stusta.de>

From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 9 Sep 2007 22:25:58 +0200

> On Fri, Aug 31, 2007 at 09:58:22PM -0700, Andrew Morton wrote:
> >...
> > Changes since 2.6.23-rc3-mm1:
> >...
> >  git-block.patch
> >...
> >  git trees
> >...
> 
> tcp_splice_data_recv() can become static.
> 
> Signed-off-by: Adrian Bunk <bunk@kernel.org>

I'll let Jens or similar pick this one up since it
obviously won't apply to my tree.


^ permalink raw reply

* Re: new NAPI interface broken for POWER architecture?
From: Christoph Raisch @ 2007-09-12 13:10 UTC (permalink / raw)
  To: David Miller
  Cc: Jan-Bernd Themann, netdev, ossthema, shemminger, Arnd Bergmann,
	Paul Mackerras, Michael Ellerman, linuxppc-dev
In-Reply-To: <20070912.055004.88490155.davem@davemloft.net>



David Miller <davem@davemloft.net> wrote on 12.09.2007 14:50:04:

> From: Jan-Bernd Themann <ossthema@de.ibm.com>
> Date: Fri, 7 Sep 2007 11:37:02 +0200
>
> > 2) On SMP systems: after netif_rx_complete has been called on CPU1
> >    (+interrupts enabled), netif_rx_schedule could be called on CPU2
> >    (irq handler) before net_rx_action on CPU1 has checked NAPI_STATE_SCHED.
> >    In that case the device would be added to poll lists of CPU1 and CPU2
> >    as net_rx_action would see NAPI_STATE_SCHED set.
> >    This must not happen. It will be caught when netif_rx_complete is
> >    called the second time (BUG() called)
> >
> > This would mean we have a problem on all SMP machines right now.
>
> This is not a correct statement.
>
> Only on your platform do network device interrupts get moved
> around, no other platform does this.
>
> Sparc64 doesn't, all interrupts stay in one location after
> the cpu is initially chosen.
>
> x86 and x86_64 specifically do not move around network
> device interrupts, even though other device types do
> get dynamic IRQ cpu distribution.
>
> That's why you are the only person seeing this problem.
>
> I agree that it should be fixed, but we should also fix the IRQ
> distribution scheme used on powerpc platforms which is totally
> broken in these cases.

This is definitely not something we can change in the HEA device driver
alone.
It could also affect any other networking cards on POWER (e1000,s2io...).

Paul, Michael, Arndt, what is your opinion here?

Gruss / Regards
Christoph Raisch


^ permalink raw reply

* dscc4.c tests for "#ifndef MODULE" even though it must be modular
From: Robert P. J. Day @ 2007-09-12 13:25 UTC (permalink / raw)
  To: netdev; +Cc: romieu

  from drivers/net/wan/dscc4.c:

=====
#ifndef MODULE
static int __init dscc4_setup(char *str)
{
        int *args[] = { &debug, &quartz, NULL }, **p = args;

        while (*p && (get_option(&str, *p) == 2))
                p++;
        return 1;
}

__setup("dscc4.setup=", dscc4_setup);
#endif
=====

  but from drivers/net/wan/Kconfig:

...
config DSCC4
        tristate "Etinc PCISYNC serial board support"
        depends on HDLC && PCI && m
...

  if i read this correctly, doesn't the depends on of "&& m" mean that
that Kconfig selection can be *at most* modular, so that that
preprocessor conditional can never be satisfied?  a quick test under
"make menuconfig" seems to confirm that.

  besides, the kernel parm being defined in that call to __setup()
really violates the spirit of defining kernel parms. :-)

rday
-- 
========================================================================
Robert P. J. Day
Linux Consulting, Training and Annoying Kernel Pedantry
Waterloo, Ontario, CANADA

http://crashcourse.ca
========================================================================

^ permalink raw reply

* Re: new NAPI interface broken for POWER architecture?
From: David Miller @ 2007-09-12 13:27 UTC (permalink / raw)
  To: RAISCH
  Cc: THEMANN, netdev, ossthema, shemminger, ARNDB, pmac, ellerman,
	linuxppc-dev
In-Reply-To: <OF2D6ED296.BDB04FBF-ONC1257354.00473703-C1257354.0048406C@de.ibm.com>

From: Christoph Raisch <RAISCH@de.ibm.com>
Date: Wed, 12 Sep 2007 15:10:08 +0200

> This is definitely not something we can change in the HEA device driver
> alone.

And it shouldn't be, x86 implements the policy in irq balance
daemon, powerpc should do it wherever it would be appropriate
there.

> Paul, Michael, Arndt, what is your opinion here?

I'm all ears too :)

^ permalink raw reply

* Re: [PATCH] veth: Cleanly handle a missing peer_tb argument on creation.
From: David Miller @ 2007-09-12 13:31 UTC (permalink / raw)
  To: ebiederm; +Cc: xemul, kaber, netdev, shemminger
In-Reply-To: <m17imwaumb.fsf@ebiederm.dsl.xmission.com>

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 12 Sep 2007 07:19:56 -0600

> 
> I was getting strange kernel crashes when attempting to
> create veth devices when I did not specify a peer argument
> to /bin/ip.
> 
> So this patch defaults peer_tb to all zeros and doesn't attempt to
> reuse the netlink attributes for the primary link to create the
> secondary link and now I can't reproduce the failures.
> 
> Given that some of the most interesting netlink attributes to specify
> like a mac address or a network device name seem are generally
> the wrong thing to do this seems like the right approach.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

This looks mostly fine, can someone else who knows
veth a bit review this as well?

^ permalink raw reply

* [PATCH] IPV4 : convert rt_check_expire() from softirq processing to workqueue
From: Eric Dumazet @ 2007-09-12 13:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org

On loaded/big hosts, rt_check_expire() is of little use, because it
generally breaks out of its main loop because of a jiffies change.

It can take a long time (read : timer invocations) to actually
scan the whole hash table, freeing unused entries.

Converting it to use a workqueue instead of softirq is a nice
move because we can allow rt_check_expire() to do the scan
it is supposed to do, without hogging the CPU.

This has an impact on the average number of entries in cache, 
reducing ram usage. Cache is more responsive to parameter
changes (/proc/sys/net/ipv4/route/gc_timeout and
/proc/sys/net/ipv4/route/gc_interval)

Note: Maybe the default value of gc_interval (60 seconds)
is too high, since this means we actually need 5 (300/60)
invocations to scan the whole table.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 396c631..006d605 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -81,6 +81,7 @@
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
+#include <linux/workqueue.h>
 #include <linux/skbuff.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
@@ -136,7 +137,8 @@ static unsigned long rt_deadline;
 #define RTprint(a...)	printk(KERN_DEBUG a)
 
 static struct timer_list rt_flush_timer;
-static struct timer_list rt_periodic_timer;
+static void rt_check_expire(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
 static struct timer_list rt_secret_timer;
 
 /*
@@ -572,20 +574,19 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
-/* This runs via a timer and thus is always in BH context. */
-static void rt_check_expire(unsigned long dummy)
+static void rt_check_expire(struct work_struct *work)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
-	unsigned long now = jiffies;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 
@@ -594,11 +595,11 @@ static void rt_check_expire(unsigned long dummy)
 
 		if (*rthp == 0)
 			continue;
-		spin_lock(rt_hash_lock_addr(i));
+		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
+				if (time_before_eq(jiffies, rth->u.dst.expires)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					continue;
@@ -613,14 +614,10 @@ static void rt_check_expire(unsigned long dummy)
 			*rthp = rth->u.dst.rt_next;
 			rt_free(rth);
 		}
-		spin_unlock(rt_hash_lock_addr(i));
-
-		/* Fallback loop breaker. */
-		if (time_after(jiffies, now))
-			break;
+		spin_unlock_bh(rt_hash_lock_addr(i));
 	}
 	rover = i;
-	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
 /* This can run from both BH and non-BH contexts, the latter
@@ -2993,17 +2990,14 @@ int __init ip_rt_init(void)
 
 	init_timer(&rt_flush_timer);
 	rt_flush_timer.function = rt_run_flush;
-	init_timer(&rt_periodic_timer);
-	rt_periodic_timer.function = rt_check_expire;
 	init_timer(&rt_secret_timer);
 	rt_secret_timer.function = rt_secret_rebuild;
 
 	/* All the timers, started at system startup tend
 	   to synchronize. Perturb it a bit.
 	 */
-	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
-					ip_rt_gc_interval;
-	add_timer(&rt_periodic_timer);
+	schedule_delayed_work(&expires_work,
+		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
 	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
 		ip_rt_secret_interval;

^ permalink raw reply related

* Re: [patch] sunrpc: make closing of old temporary sockets work (was: problems with lockd in 2.6.22.6)
From: J. Bruce Fields @ 2007-09-12 13:37 UTC (permalink / raw)
  To: Wolfgang Walter; +Cc: netdev, nfs, linux-kernel, trond.myklebust
In-Reply-To: <200709121407.11151.wolfgang.walter@studentenwerk.mhn.de>

On Wed, Sep 12, 2007 at 02:07:10PM +0200, Wolfgang Walter wrote:
> as already described old temporary sockets (client is gone) of lockd aren't
> closed after some time. So, with enough clients and some time gone, there
> are 80 open dangling sockets and you start getting messages of the form:
> 
> lockd: too many open TCP sockets, consider increasing the number of nfsd threads.

Thanks for working on this problem!

> If I understand the code then the intention was that the server closes
> temporary sockets after about 6 to 12 minutes:
> 
> 	a timer is started which calls svc_age_temp_sockets every 6 minutes.
> 
> 	svc_age_temp_sockets:
> 		if a socket is marked OLD it gets closed.
> 		sockets which are not marked as OLD are marked OLD
> 
> 	every time the sockets receives something OLD is cleared.
> 
> But svc_age_temp_sockets never closes any socket though because it only
> closes sockets with svsk->sk_inuse == 0. This seems to be a bug.
> 
> Here is a patch against 2.6.22.6 which changes the test to
> svsk->sk_inuse <= 0 which was probably meant. The patched kernel runs fine
> here. Unused sockets get closed (after 6 to 12 minutes)

So the fact that this changes the behavior means that sk_inuse is taking
on negative values.  This can't be right--how can something like
svc_sock_put() (which does an atomic_dec_and_test) work in that case?

I wish I had time today to figure out what's going on in this case.  But
from a quick look through svsock.c for sk_inuse, it looks odd; I'm suspicious
of anything without the stereotyped behavior--initializing to one,
atomic_inc()ing whenever someone takes a reference, and
atomic_dec_and_test()ing whenever someone drops it....

--b.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply

* Re: [PATCH] [IPROUTE2] Add support for moving links between network namespaces
From: Stephen Hemminger @ 2007-09-12 13:39 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: netdev, Pavel Emelyanov, Patrick McHardy
In-Reply-To: <m1ir6gava1.fsf_-_@ebiederm.dsl.xmission.com>

On Wed, 12 Sep 2007 07:05:42 -0600
ebiederm@xmission.com (Eric W. Biederman) wrote:

> 
> This adds support for setting the IFLA_NET_NS_PID attribute
> on links allowing them to be moved between network namespaces.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
> ---
>  include/linux/if_link.h |    1 +
>  ip/iplink.c             |    9 +++++++++
>  2 files changed, 10 insertions(+), 0 deletions(-)

Please don't mix header file updates with command changes.
As a first step, I always install standard kernel sanitized headers.

^ permalink raw reply

* Re: [PATCH 1/4] [IPROUTE2] Revert "Make ip utility veth driver aware"
From: Eric W. Biederman @ 2007-09-12 13:48 UTC (permalink / raw)
  To: Pavel Emelyanov; +Cc: Stephen Hemminger, netdev, Patrick McHardy
In-Reply-To: <m1ejh4av4j.fsf@ebiederm.dsl.xmission.com>

ebiederm@xmission.com (Eric W. Biederman) writes:

> Pavel Emelyanov <xemul@openvz.org> writes:
>
>> Eric W. Biederman wrote:
>>> Stephen it looks like you weren't cc'd on the latest version
>>> of the veth support.  So this patchset first reverts the old
>>
>> He was. The latest version looks completely different from what
>> is reversed in this patch.
>
> This is against the latest snapshot I could find.  My apologies
> if I missed some of the communication.

I was working against:
git://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git

And the last I could find of the conversation about veth support was
in the thread announcing iproute-2-2.6.23-rc3, and Stephen Hemminger
asking for the latest version of the veth support to be sent on
Sept 1st.

So it is quite possible this has been resolved in private email,
and nothing public has been updated yet.

I just don't have a copy of anything newer, and I don't know where else
I would look for something newer.  So since I'm starting to use veth
I sent the patches I had to make it work.

The last round of veth support for iproute2 I could find was sent
on the 19th of July and David Miller, Patrick McHardy, and netdev
were copied but Stephen Hemminger wasn't.  Which is where my
assertion that Stephen hadn't been sent the latest version came from.

If you guys have already sorted this out and I just can't find the
code I'm overjoyed.  Otherwise the patches I sent should be enough
to get things sorted out, if I have figured out the current state of
confusion.

Eric

^ permalink raw reply

* Re: RFC: possible NAPI improvements to reduce interrupt rates for low traffic rates
From: James Chapman @ 2007-09-12 13:50 UTC (permalink / raw)
  To: hadi, Bill Fink
  Cc: netdev, davem, jeff, mandeep.baines, ossthema, Stephen Hemminger
In-Reply-To: <1189599142.4326.38.camel@localhost>

jamal wrote:
> On Wed, 2007-12-09 at 03:04 -0400, Bill Fink wrote:
>> On Fri, 07 Sep 2007, jamal wrote:
> 
>>> I am going to be the devil's advocate[1]:
>> So let me be the angel's advocate.  :-)
> 
> I think this would make you God's advocate ;->
> (http://en.wikipedia.org/wiki/God%27s_advocate)
> 
>> I view his results much more favorably.  
> 
> The challenge is, under _low traffic_: bad bad CPU use.
> Thats what is at stake, correct?

By low traffic, I assume you mean a rate at which the NAPI driver 
doesn't stay in polled mode. The problem is that that rate is getting 
higher all the time, as interface and CPU speeds increase. This results 
in too many interrupts and NAPI thrashing in/out of polled mode very 
quickly.

> Lets bury the stats for a sec ...

Yes please. We need an analysis of what happens to cpu usage, latency, 
pps etc when various factors are changed, e.g. input pps, NAPI busy-idle 
delay etc. The main purpose of my RFC wasn't to push a patch into the 
kernel right now, it was to highlight the issue and to find out if 
others were already working on it. The feedback has been good so far. I 
just need to find some time to do some testing. :)

> People are bitching about NAPI abusing CPU, is the 
> answer to abuse more CPU than NAPI?;->

Jamal, do you have more details? Are people saying NAPI gets too much of 
the CPU pie because they profiled it? Are they complaining that system 
behavior degrades too much under certain network traffic conditions? 
Mouse cursor movement jittery? Real-time apps such as music/video 
players starved of CPU? Is it possible they blame NAPI because they see 
tangible effects on their system, not because measured CPU usage is 
high? I say this because my music/video player and mouse cursor behave 
_much_ better with my NAPI changes during general use, despite the 
increase in measured cpu load. Even ftp can make my system's mouse 
cursor jitter...

> The answer could be "I am not solving that problem anymore" - at least
> thats what James is saying;->

I'm investigating whether the symptoms I describe above can be reduced 
or eliminated without resorting to hardware interrupt mitigation. 
Specifically, I want to do more testing on the idle polling scheme which 
seems to improve system behavior in my tests. This will involve more 
than doing a flood ping or two. :)

>> Sometimes there
>> are tradeoffs to be made to be decided by the user based on what's most
>> important to that user and his specific workload.  And the suggested
>> ethtool option (defaulting to current behavior) would enable the user
>> to make that decision.
> 
> And the challenge is:
> What workload is willing to invest that much cpu for low traffic?
> Can you name one? One that may come close is database benchmarks for
> latency - but those folks wouldnt touch this with a mile-long pole if
> you told them their cpu use is going to get worse than what NAPI (that
> big bad CPU hog under low traffic) is giving them.

I agree with both of you. But we need more test results first to know 
whether it will be useful to offer NAPI idle polling as an _option_.

-- 
James Chapman
Katalix Systems Ltd
http://www.katalix.com
Catalysts for your Embedded Linux software development

^ permalink raw reply

* Re: RFC: possible NAPI improvements to reduce interrupt rates for low traffic rates
From: Stephen Hemminger @ 2007-09-12 14:02 UTC (permalink / raw)
  To: James Chapman
  Cc: hadi, Bill Fink, netdev, davem, jeff, mandeep.baines, ossthema
In-Reply-To: <46E7EE89.9060006@katalix.com>

On Wed, 12 Sep 2007 14:50:01 +0100
James Chapman <jchapman@katalix.com> wrote:

> jamal wrote:
> > On Wed, 2007-12-09 at 03:04 -0400, Bill Fink wrote:
> >> On Fri, 07 Sep 2007, jamal wrote:
> > 
> >>> I am going to be the devil's advocate[1]:
> >> So let me be the angel's advocate.  :-)
> > 
> > I think this would make you God's advocate ;->
> > (http://en.wikipedia.org/wiki/God%27s_advocate)
> > 
> >> I view his results much more favorably.  
> > 
> > The challenge is, under _low traffic_: bad bad CPU use.
> > Thats what is at stake, correct?
> 
> By low traffic, I assume you mean a rate at which the NAPI driver 
> doesn't stay in polled mode. The problem is that that rate is getting 
> higher all the time, as interface and CPU speeds increase. This results 
> in too many interrupts and NAPI thrashing in/out of polled mode very 
> quickly.

But if you compare this to a non-NAPI driver, the same softirq
overhead happens. The problem is that for many older devices, disabling IRQs
requires an expensive non-cached PCI access. Smarter, newer devices
all use MSI which is pure edge triggered and with proper register
usage, NAPI should be no worse than non-NAPI.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox