netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Virtual ethernet tunnel
@ 2007-06-06 15:11 Pavel Emelianov
  2007-06-06 15:17 ` [PATCH] Module for ip utility to support veth device Pavel Emelianov
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-06 15:11 UTC (permalink / raw)
  To: Patrick McHardy, Eric W. Biederman
  Cc: Linux Netdev List, Linux Containers, Kirill Korotaev

Veth stands for Virtual ETHernet. It is a simple tunnel driver
that works at the link layer and looks like a pair of ethernet
devices interconnected with each other.

Mainly it allows to communicate between network namespaces but
it can be used as is as well.

Eric recently sent a similar driver called etun. This
implementation uses another interface - the RTM_NRELINK
message introduced by Patric. The patch fits today netdev
tree with Patrick's patches.

The newlink callback is organized that way to make it easy
to create the peer device in the separate namespace when we
have them in kernel.

The patch for an ip utility is also provided.

Eric, since ethtool interface was from your patch, I add
your Signed-off-by line.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Pavel Emelianov <xemul@openvz.org>

---

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 7d57f4a..7e144be 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -119,6 +119,12 @@ config TUN
 
 	  If you don't know what to use this for, you don't need it.
 
+config VETH
+	tristate "Virtual ethernet device"
+	---help---
+	  The device is an ethernet tunnel. Devices are created in pairs. When
+	  one end receives the packet it appears on its pair and vice versa.
+
 config NET_SB1000
 	tristate "General Instruments Surfboard 1000"
 	depends on PNP
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index a77affa..4764119 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -185,6 +185,7 @@ obj-$(CONFIG_MACSONIC) += macsonic.o
 obj-$(CONFIG_MACMACE) += macmace.o
 obj-$(CONFIG_MAC89x0) += mac89x0.o
 obj-$(CONFIG_TUN) += tun.o
+obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_NET_NETX) += netx-eth.o
 obj-$(CONFIG_DL2K) += dl2k.o
 obj-$(CONFIG_R8169) += r8169.o
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
new file mode 100644
index 0000000..6746c91
--- /dev/null
+++ b/drivers/net/veth.c
@@ -0,0 +1,391 @@
+/*
+ *  drivers/net/veth.c
+ *
+ *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/veth.h>
+
+#define DRV_NAME	"veth"
+#define DRV_VERSION	"1.0"
+
+struct veth_priv {
+	struct net_device *peer;
+	struct net_device *dev;
+	struct list_head list;
+	struct net_device_stats stats;
+	unsigned ip_summed;
+};
+
+static LIST_HEAD(veth_list);
+
+/*
+ * ethtool interface
+ */
+
+static struct {
+	const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+	{ "peer_ifindex" },
+};
+
+static int veth_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	cmd->supported		= 0;
+	cmd->advertising	= 0;
+	cmd->speed		= SPEED_10000;
+	cmd->duplex		= DUPLEX_FULL;
+	cmd->port		= PORT_TP;
+	cmd->phy_address	= 0;
+	cmd->transceiver	= XCVR_INTERNAL;
+	cmd->autoneg		= AUTONEG_DISABLE;
+	cmd->maxtxpkt		= 0;
+	cmd->maxrxpkt		= 0;
+	return 0;
+}
+
+static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRV_NAME);
+	strcpy(info->version, DRV_VERSION);
+	strcpy(info->fw_version, "N/A");
+}
+
+static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	switch(stringset) {
+	case ETH_SS_STATS:
+		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
+		break;
+	}
+}
+
+static int veth_get_stats_count(struct net_device *dev)
+{
+	return ARRAY_SIZE(ethtool_stats_keys);
+}
+
+static void veth_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, u64 *data)
+{
+	struct veth_priv *priv;
+
+	priv = netdev_priv(dev);
+	data[0] = priv->peer->ifindex;
+}
+
+static u32 veth_get_rx_csum(struct net_device *dev)
+{
+	struct veth_priv *priv;
+
+	priv = netdev_priv(dev);
+	return priv->ip_summed == CHECKSUM_UNNECESSARY;
+}
+
+static int veth_set_rx_csum(struct net_device *dev, u32 data)
+{
+	struct veth_priv *priv;
+
+	priv = netdev_priv(dev);
+	priv->ip_summed = data ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+	return 0;
+}
+
+static u32 veth_get_tx_csum(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_NO_CSUM) != 0;
+}
+
+static int veth_set_tx_csum(struct net_device *dev, u32 data)
+{
+	if (data)
+		dev->features |= NETIF_F_NO_CSUM;
+	else
+		dev->features &= ~NETIF_F_NO_CSUM;
+	return 0;
+}
+
+static struct ethtool_ops veth_ethtool_ops = {
+	.get_settings		= veth_get_settings,
+	.get_drvinfo		= veth_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_rx_csum		= veth_get_rx_csum,
+	.set_rx_csum		= veth_set_rx_csum,
+	.get_tx_csum		= veth_get_tx_csum,
+	.set_tx_csum		= veth_set_tx_csum,
+	.get_sg			= ethtool_op_get_sg,
+	.set_sg			= ethtool_op_set_sg,
+	.get_strings		= veth_get_strings,
+	.get_stats_count	= veth_get_stats_count,
+	.get_ethtool_stats	= veth_get_ethtool_stats,
+	.get_perm_addr		= ethtool_op_get_perm_addr,
+};
+
+/*
+ * xmit
+ */
+
+static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device *rcv = NULL;
+	struct veth_priv *priv, *rcv_priv;
+	int length;
+
+	skb_orphan(skb);
+
+	priv = netdev_priv(dev);
+	rcv = priv->peer;
+	rcv_priv = netdev_priv(rcv);
+
+	if (!(rcv->flags & IFF_UP))
+		goto outf;
+
+	skb->dev = rcv;
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, rcv);
+	if (dev->features & NETIF_F_NO_CSUM)
+		skb->ip_summed = rcv_priv->ip_summed;
+
+	dst_release(skb->dst);
+	skb->dst = NULL;
+
+	secpath_reset(skb);
+	nf_reset(skb);
+
+	length = skb->len;
+
+	priv->stats.tx_bytes += length;
+	priv->stats.tx_packets++;
+
+	rcv_priv->stats.rx_bytes += length;
+	rcv_priv->stats.rx_packets++;
+
+	netif_rx(skb);
+	return 0;
+
+outf:
+	kfree_skb(skb);
+	priv->stats.tx_dropped++;
+	return 0;
+}
+
+/*
+ * general routines
+ */
+
+static struct net_device_stats *veth_get_stats(struct net_device *dev)
+{
+	struct veth_priv *priv;
+	
+	priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+static int veth_open(struct net_device *dev)
+{
+	struct veth_priv *priv;
+
+	priv = netdev_priv(dev);
+	if (priv->peer == NULL)
+		return -ENOTCONN;
+
+	if (priv->peer->flags & IFF_UP) {
+		netif_carrier_on(dev);
+		netif_carrier_on(priv->peer);
+	}
+	return 0;
+}
+
+static int veth_close(struct net_device *dev)
+{
+	struct veth_priv *priv;
+
+	if (netif_carrier_ok(dev)) {
+		priv = netdev_priv(dev);
+		netif_carrier_off(dev);
+		netif_carrier_off(priv->peer);
+	}
+	return 0;
+}
+
+/*
+ * netlink interface
+ */
+
+static void veth_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->hard_start_xmit = veth_xmit;
+	dev->get_stats = veth_get_stats;
+	dev->open = veth_open;
+	dev->stop = veth_close;
+	dev->destructor = free_netdev;
+	dev->ethtool_ops = &veth_ethtool_ops;
+	dev->features |= NETIF_F_LLTX;
+	netif_carrier_off(dev);
+}
+
+static int veth_newlink(struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	int err;
+	struct net_device *peer;
+	struct veth_priv *priv;
+	char ifname[IFNAMSIZ];
+
+	/*
+	 * setup the first device
+	 */
+
+	if (data != NULL && data[VETH_INFO_MAC] != NULL)
+		memcpy(dev->dev_addr,
+				nla_data(data[VETH_INFO_MAC]), ETH_ALEN);
+	else
+		random_ether_addr(dev->dev_addr);
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto err_register_dev;
+
+	/*
+	 * alloc and setup the second one
+	 *
+	 * this should be done in another namespace, but we
+	 * do not have them yet
+	 */
+
+	if (data != NULL && data[VETH_INFO_PEER] != NULL)
+		nla_strlcpy(ifname, data[VETH_INFO_PEER], IFNAMSIZ);
+	else
+		snprintf(ifname, IFNAMSIZ, "veth%%d");
+
+	err = -ENOMEM;
+	peer = alloc_netdev(sizeof(struct veth_priv), ifname, veth_setup);
+	if (peer == NULL)
+		goto err_alloc;
+
+	if (strchr(peer->name, '%')) {
+		err = dev_alloc_name(peer, peer->name);
+		if (err < 0)
+			goto err_name;
+	}
+
+	if (data != NULL && data[VETH_INFO_PEER_MAC] != NULL)
+		memcpy(peer->dev_addr,
+				nla_data(data[VETH_INFO_PEER_MAC]), ETH_ALEN);
+	else
+		random_ether_addr(peer->dev_addr);
+
+	/* this should be in sync with rtnl_newlink */
+	peer->mtu = dev->mtu;
+	peer->tx_queue_len = dev->tx_queue_len;
+	peer->weight = dev->weight;
+	peer->link_mode = dev->link_mode;
+	peer->rtnl_link_ops = dev->rtnl_link_ops;
+
+	if (peer->operstate != dev->operstate) {
+		write_lock_bh(&dev_base_lock);
+		peer->operstate = dev->operstate;
+		write_unlock_bh(&dev_base_lock);
+		netdev_state_change(peer);
+	}
+
+	err = register_netdevice(peer);
+	if (err < 0)
+		goto err_register_peer;
+
+	/*
+	 * tie the deviced together
+	 */
+
+	priv = netdev_priv(dev);
+	priv->dev = dev;
+	priv->peer = peer;
+	list_add(&priv->list, &veth_list);
+
+	priv = netdev_priv(peer);
+	priv->dev = peer;
+	priv->peer = dev;
+	INIT_LIST_HEAD(&priv->list);
+	return 0;
+
+err_register_peer:
+	/* nothing special to do */
+err_name:
+	free_netdev(peer);
+err_alloc:
+	unregister_netdevice(dev);
+err_register_dev:
+	return err;
+}
+
+static void veth_dellink(struct net_device *dev)
+{
+	struct veth_priv *priv;
+	struct net_device *peer;
+
+	priv = netdev_priv(dev);
+	peer = priv->peer;
+
+	if (!list_empty(&priv->list))
+		list_del(&priv->list);
+
+	priv = netdev_priv(peer);
+	if (!list_empty(&priv->list))
+		list_del(&priv->list);
+
+	unregister_netdevice(dev);
+	unregister_netdevice(peer);
+}
+
+static const struct nla_policy veth_policy[VETH_INFO_MAX] = {
+	[VETH_INFO_MAC]		= { .type = NLA_BINARY, .len = ETH_ALEN },
+	[VETH_INFO_PEER]	= { .type = NLA_STRING },
+	[VETH_INFO_PEER_MAC]	= { .type = NLA_BINARY, .len = ETH_ALEN },
+};
+
+static struct rtnl_link_ops veth_link_ops = {
+	.name		= "veth",
+	.priv_size	= sizeof(struct veth_priv),
+	.setup		= veth_setup,
+	.newlink	= veth_newlink,
+	.dellink	= veth_dellink,
+	.policy		= veth_policy,
+	.maxtype	= VETH_INFO_MAX,
+};
+
+/*
+ * init/fini
+ */
+
+static __init int veth_init(void)
+{
+	return rtnl_link_register(&veth_link_ops);
+}
+
+static __exit void veth_exit(void)
+{
+	struct veth_priv *priv, *next;
+
+	rtnl_lock();
+	__rtnl_link_unregister(&veth_link_ops);
+
+	list_for_each_entry_safe(priv, next, &veth_list, list)
+		veth_dellink(priv->dev);
+	rtnl_unlock();
+}
+
+module_init(veth_init);
+module_exit(veth_exit);
+
+MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
+MODULE_LICENSE("GPL v2");
diff --git a/include/net/veth.h b/include/net/veth.h
new file mode 100644
index 0000000..74c8e1e
--- /dev/null
+++ b/include/net/veth.h
@@ -0,0 +1,13 @@
+#ifndef __NET_VETH_H__
+#define __NET_VETH_H__
+
+enum {
+	VETH_INFO_UNSPEC,
+	VETH_INFO_MAC,
+	VETH_INFO_PEER,
+	VETH_INFO_PEER_MAC,
+
+	VETH_INFO_MAX
+};
+
+#endif

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH] Module for ip utility to support veth device
  2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
@ 2007-06-06 15:17 ` Pavel Emelianov
  2007-06-06 15:18   ` Patrick McHardy
  2007-06-06 15:28 ` [PATCH] Virtual ethernet tunnel Patrick McHardy
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-06 15:17 UTC (permalink / raw)
  To: Patrick McHardy, Eric W. Biederman
  Cc: Linux Netdev List, Linux Containers, Kirill Korotaev

The usage is
# ip link add [name] type veth [peer <name>] [mac <mac>] [peer_mac <mac>]

The Makefile is maybe not as beautiful as it could be. It
is to be discussed.

One thing I noticed during testing is the following. When launching
this with link_veth.so module and not specifying any module specific
parameters, the kernel refuses to accept the packet when parsing the
IFLA_LINKINFO. So the hunk for ip/iplink.c doesn't add an empty extra 
header if no extra data expected.

Signed-off-by: Pavel Emelianov <xemul@openvz.org>

---

diff --git a/ip/Makefile b/ip/Makefile
index 9a5bfe3..b46bce3 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -8,8 +8,9 @@ RTMONOBJ=rtmon.o
 ALLOBJ=$(IPOBJ) $(RTMONOBJ)
 SCRIPTS=ifcfg rtpr routel routef
 TARGETS=ip rtmon
+LIBS=link_veth.so
 
-all: $(TARGETS) $(SCRIPTS)
+all: $(TARGETS) $(SCRIPTS) $(LIBS)
 
 ip: $(IPOBJ) $(LIBNETLINK) $(LIBUTIL)
 
@@ -24,3 +25,6 @@ clean:
 
 LDLIBS	+= -ldl
 LDFLAGS	+= -Wl,-export-dynamic
+
+%.so: %.c
+	$(CC) $(CFLAGS) -shared $< -o $@
diff --git a/ip/iplink.c b/ip/iplink.c
index 5170419..6975990 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -287,7 +287,7 @@ static int iplink_modify(int cmd, unsign
 			 strlen(type));
 
 		lu = get_link_type(type);
-		if (lu) {
+		if (lu && argc) {
 			struct rtattr * data = NLMSG_TAIL(&req.n);
 			addattr_l(&req.n, sizeof(req), IFLA_INFO_DATA, NULL, 0);
 
diff --git a/ip/link_veth.c b/ip/link_veth.c
new file mode 100644
index 0000000..adfdef6
--- /dev/null
+++ b/ip/link_veth.c
@@ -0,0 +1,77 @@
+#include <string.h>
+
+#include "utils.h"
+#include "ip_common.h"
+#include "veth.h"
+
+#define ETH_ALEN	6
+
+static void usage(void)
+{
+	printf("Usage: ip link add ... "
+			"[peer <peer-name>] [mac <mac>] [peer_mac <mac>]\n");
+}
+
+static int veth_parse_opt(struct link_util *lu, int argc, char **argv,
+		struct nlmsghdr *hdr)
+{
+	__u8 mac[ETH_ALEN];
+
+	for (; argc != 0; argv++, argc--) {
+		if (strcmp(*argv, "peer") == 0) {
+			argv++;
+			argc--;
+			if (argc == 0) {
+				usage();
+				return -1;
+			}
+
+			addattr_l(hdr, 1024, VETH_INFO_PEER,
+					*argv, strlen(*argv));
+
+			continue;
+		}
+
+		if (strcmp(*argv, "mac") == 0) {
+			argv++;
+			argc--;
+			if (argc == 0) {
+				usage();
+				return -1;
+			}
+
+			if (hexstring_a2n(*argv, mac, sizeof(mac)) == NULL)
+				return -1;
+
+			addattr_l(hdr, 1024, VETH_INFO_MAC,
+					mac, ETH_ALEN);
+			continue;
+		}
+
+		if (strcmp(*argv, "peer_mac") == 0) {
+			argv++;
+			argc--;
+			if (argc == 0) {
+				usage();
+				return -1;
+			}
+
+			if (hexstring_a2n(*argv, mac, sizeof(mac)) == NULL)
+				return -1;
+
+			addattr_l(hdr, 1024, VETH_INFO_PEER_MAC,
+					mac, ETH_ALEN);
+			continue;
+		}
+
+		usage();
+		return -1;
+	}
+
+	return 0;
+}
+
+struct link_util veth_link_util = {
+	.id = "veth",
+	.parse_opt = veth_parse_opt,
+};
diff --git a/ip/veth.h b/ip/veth.h
new file mode 100644
index 0000000..74c8e1e
--- /dev/null
+++ b/ip/veth.h
@@ -0,0 +1,13 @@
+#ifndef __NET_VETH_H__
+#define __NET_VETH_H__
+
+enum {
+	VETH_INFO_UNSPEC,
+	VETH_INFO_MAC,
+	VETH_INFO_PEER,
+	VETH_INFO_PEER_MAC,
+
+	VETH_INFO_MAX
+};
+
+#endif

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH] Module for ip utility to support veth device
  2007-06-06 15:17 ` [PATCH] Module for ip utility to support veth device Pavel Emelianov
@ 2007-06-06 15:18   ` Patrick McHardy
  0 siblings, 0 replies; 26+ messages in thread
From: Patrick McHardy @ 2007-06-06 15:18 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:
> diff --git a/ip/iplink.c b/ip/iplink.c
> index 5170419..6975990 100644
> --- a/ip/iplink.c
> +++ b/ip/iplink.c
> @@ -287,7 +287,7 @@ static int iplink_modify(int cmd, unsign
>  			 strlen(type));
>  
>  		lu = get_link_type(type);
> -		if (lu) {
> +		if (lu && argc) {
>  			struct rtattr * data = NLMSG_TAIL(&req.n);
>  			addattr_l(&req.n, sizeof(req), IFLA_INFO_DATA, NULL, 0);


I've folded this part into my iproute patch, thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
  2007-06-06 15:17 ` [PATCH] Module for ip utility to support veth device Pavel Emelianov
@ 2007-06-06 15:28 ` Patrick McHardy
  2007-06-07  8:09   ` Pavel Emelianov
  2007-06-06 15:39 ` Patrick McHardy
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: Patrick McHardy @ 2007-06-06 15:28 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:
> Veth stands for Virtual ETHernet. It is a simple tunnel driver
> that works at the link layer and looks like a pair of ethernet
> devices interconnected with each other.
> 
> Mainly it allows to communicate between network namespaces but
> it can be used as is as well.
> 
> Eric recently sent a similar driver called etun. This
> implementation uses another interface - the RTM_NRELINK
> message introduced by Patric. The patch fits today netdev
> tree with Patrick's patches.
> 
> The newlink callback is organized that way to make it easy
> to create the peer device in the separate namespace when we
> have them in kernel.
> 

> +struct veth_priv {
> +	struct net_device *peer;
> +	struct net_device *dev;
> +	struct list_head list;
> +	struct net_device_stats stats;


You can use dev->stats instead.

> +static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct net_device *rcv = NULL;
> +	struct veth_priv *priv, *rcv_priv;
> +	int length;
> +
> +	skb_orphan(skb);
> +
> +	priv = netdev_priv(dev);
> +	rcv = priv->peer;
> +	rcv_priv = netdev_priv(rcv);
> +
> +	if (!(rcv->flags & IFF_UP))
> +		goto outf;
> +
> +	skb->dev = rcv;

eth_type_trans already sets skb->dev.

> +	skb->pkt_type = PACKET_HOST;
> +	skb->protocol = eth_type_trans(skb, rcv);
> +	if (dev->features & NETIF_F_NO_CSUM)
> +		skb->ip_summed = rcv_priv->ip_summed;
> +
> +	dst_release(skb->dst);
> +	skb->dst = NULL;
> +
> +	secpath_reset(skb);
> +	nf_reset(skb);


Is skb->mark supposed to survive communication between different
namespaces?

> +static const struct nla_policy veth_policy[VETH_INFO_MAX] = {
> +	[VETH_INFO_MAC]		= { .type = NLA_BINARY, .len = ETH_ALEN },
> +	[VETH_INFO_PEER]	= { .type = NLA_STRING },
> +	[VETH_INFO_PEER_MAC]	= { .type = NLA_BINARY, .len = ETH_ALEN },
> +};


The rtnl_link codes looks fine. I don't like the VETH_INFO_MAC attribute
very much though, we already have a generic device attribute for MAC
addresses. Of course that only allows you to supply one MAC address, so
I'm wondering what you think of allocating only a single device per
newlink operation and binding them in a seperate enslave operation?

> +enum {
> +	VETH_INFO_UNSPEC,
> +	VETH_INFO_MAC,
> +	VETH_INFO_PEER,
> +	VETH_INFO_PEER_MAC,
> +
> +	VETH_INFO_MAX
> +};

Please follow the

#define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)

convention here.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
  2007-06-06 15:17 ` [PATCH] Module for ip utility to support veth device Pavel Emelianov
  2007-06-06 15:28 ` [PATCH] Virtual ethernet tunnel Patrick McHardy
@ 2007-06-06 15:39 ` Patrick McHardy
  2007-06-06 16:17 ` Stephen Hemminger
  2007-06-06 19:47 ` David Miller
  4 siblings, 0 replies; 26+ messages in thread
From: Patrick McHardy @ 2007-06-06 15:39 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:

> +MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
> +MODULE_LICENSE("GPL v2");

This seems to be missing MODULE_ALIAS_RTNL_LINK("veth");

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
                   ` (2 preceding siblings ...)
  2007-06-06 15:39 ` Patrick McHardy
@ 2007-06-06 16:17 ` Stephen Hemminger
  2007-06-06 19:47 ` David Miller
  4 siblings, 0 replies; 26+ messages in thread
From: Stephen Hemminger @ 2007-06-06 16:17 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Patrick McHardy, Eric W. Biederman, Linux Netdev List,
	Linux Containers, Kirill Korotaev

On Wed, 06 Jun 2007 19:11:38 +0400
Pavel Emelianov <xemul@openvz.org> wrote:

> Veth stands for Virtual ETHernet. It is a simple tunnel driver
> that works at the link layer and looks like a pair of ethernet
> devices interconnected with each other.
> 
> Mainly it allows to communicate between network namespaces but
> it can be used as is as well.
> 
> Eric recently sent a similar driver called etun. This
> implementation uses another interface - the RTM_NRELINK
> message introduced by Patric. The patch fits today netdev
> tree with Patrick's patches.
> 
> The newlink callback is organized that way to make it easy
> to create the peer device in the separate namespace when we
> have them in kernel.
> 
> The patch for an ip utility is also provided.
> 
> Eric, since ethtool interface was from your patch, I add
> your Signed-off-by line.
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
> Signed-off-by: Pavel Emelianov <xemul@openvz.org>
> 
> ---
> 
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index 7d57f4a..7e144be 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -119,6 +119,12 @@ config TUN
>  
>  	  If you don't know what to use this for, you don't need it.
>  
> +config VETH
> +	tristate "Virtual ethernet device"
> +	---help---
> +	  The device is an ethernet tunnel. Devices are created in pairs. When
> +	  one end receives the packet it appears on its pair and vice versa.
> +
>  config NET_SB1000
>  	tristate "General Instruments Surfboard 1000"
>  	depends on PNP
> diff --git a/drivers/net/Makefile b/drivers/net/Makefile
> index a77affa..4764119 100644
> --- a/drivers/net/Makefile
> +++ b/drivers/net/Makefile
> @@ -185,6 +185,7 @@ obj-$(CONFIG_MACSONIC) += macsonic.o
>  obj-$(CONFIG_MACMACE) += macmace.o
>  obj-$(CONFIG_MAC89x0) += mac89x0.o
>  obj-$(CONFIG_TUN) += tun.o
> +obj-$(CONFIG_VETH) += veth.o
>  obj-$(CONFIG_NET_NETX) += netx-eth.o
>  obj-$(CONFIG_DL2K) += dl2k.o
>  obj-$(CONFIG_R8169) += r8169.o
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> new file mode 100644
> index 0000000..6746c91
> --- /dev/null
> +++ b/drivers/net/veth.c
> @@ -0,0 +1,391 @@
> +/*
> + *  drivers/net/veth.c
> + *
> + *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
> + *
> + */
> +
> +#include <linux/list.h>
> +#include <linux/netdevice.h>
> +#include <linux/ethtool.h>
> +#include <linux/etherdevice.h>
> +
> +#include <net/dst.h>
> +#include <net/xfrm.h>
> +#include <net/veth.h>
> +
> +#define DRV_NAME	"veth"
> +#define DRV_VERSION	"1.0"
> +
> +struct veth_priv {
> +	struct net_device *peer;
> +	struct net_device *dev;
> +	struct list_head list;
> +	struct net_device_stats stats;
> +	unsigned ip_summed;
> +};
> +
> +static LIST_HEAD(veth_list);
> +
> +/*
> + * ethtool interface
> + */
> +
> +static struct {
> +	const char string[ETH_GSTRING_LEN];
> +} ethtool_stats_keys[] = {
> +	{ "peer_ifindex" },
> +};

Seems like a good usage of sysfs attributes, rather than ethtool.

Then you can get rid of all the useless ethtool for what is
basically a virtual device.

> +/*
> + * xmit
> + */
> +
> +static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct net_device *rcv = NULL;
> +	struct veth_priv *priv, *rcv_priv;
> +	int length;
> +
> +	skb_orphan(skb);
> +
> +	priv = netdev_priv(dev);
> +	rcv = priv->peer;
> +	rcv_priv = netdev_priv(rcv);
> +
> +	if (!(rcv->flags & IFF_UP))
> +		goto outf;
> +
> +	skb->dev = rcv;
> +	skb->pkt_type = PACKET_HOST;
> +	skb->protocol = eth_type_trans(skb, rcv);
> +	if (dev->features & NETIF_F_NO_CSUM)
> +		skb->ip_summed = rcv_priv->ip_summed;
> +
> +	dst_release(skb->dst);
> +	skb->dst = NULL;
> +
> +	secpath_reset(skb);
> +	nf_reset(skb);
> +
> +	length = skb->len;
> +
> +	priv->stats.tx_bytes += length;
> +	priv->stats.tx_packets++;
> +
> +	rcv_priv->stats.rx_bytes += length;
> +	rcv_priv->stats.rx_packets++;

Per-cpu stats? This will cacheline thrash.

> +	netif_rx(skb);
> +	return 0;
> +
> +outf:
> +	kfree_skb(skb);
> +	priv->stats.tx_dropped++;
> +	return 0;
> +}



-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
                   ` (3 preceding siblings ...)
  2007-06-06 16:17 ` Stephen Hemminger
@ 2007-06-06 19:47 ` David Miller
  2007-06-06 20:38   ` [Devel] " Daniel Lezcano
  2007-06-07  8:14   ` Kirill Korotaev
  4 siblings, 2 replies; 26+ messages in thread
From: David Miller @ 2007-06-06 19:47 UTC (permalink / raw)
  To: xemul; +Cc: kaber, ebiederm, netdev, containers, dev

From: Pavel Emelianov <xemul@openvz.org>
Date: Wed, 06 Jun 2007 19:11:38 +0400

> Veth stands for Virtual ETHernet. It is a simple tunnel driver
> that works at the link layer and looks like a pair of ethernet
> devices interconnected with each other.

I would suggest choosing a different name.

'veth' is also the name of the virtualized ethernet device
found on IBM machines, driven by driver/net/ibmveth.[ch]


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Devel] Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 19:47 ` David Miller
@ 2007-06-06 20:38   ` Daniel Lezcano
  2007-06-06 20:49     ` David Miller
  2007-06-07  8:14   ` Kirill Korotaev
  1 sibling, 1 reply; 26+ messages in thread
From: Daniel Lezcano @ 2007-06-06 20:38 UTC (permalink / raw)
  To: David Miller; +Cc: xemul, containers, netdev, kaber, dev

David Miller wrote:
> From: Pavel Emelianov <xemul@openvz.org>
> Date: Wed, 06 Jun 2007 19:11:38 +0400
>
>   
>> Veth stands for Virtual ETHernet. It is a simple tunnel driver
>> that works at the link layer and looks like a pair of ethernet
>> devices interconnected with each other.
>>     
>
> I would suggest choosing a different name.
>
> 'veth' is also the name of the virtualized ethernet device
> found on IBM machines, driven by driver/net/ibmveth.[ch]
>   
Eric Biederman proposed the name "etun" which stands for "Ethernet TUNnel".

The goal of the pair device is to pass network packets between namespaces.
That reminds me the well known "pipe" which pass data between processes.

All network devices have a name describing what they do : "eth" for 
ethernet,
"dummy" for a device doing nothing, loopback, ...

Perhaps, a name like "epipe" or "npipe", which reflects what does the 
device, is more appropriate ?

-- Daniel



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Devel] Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 20:38   ` [Devel] " Daniel Lezcano
@ 2007-06-06 20:49     ` David Miller
  0 siblings, 0 replies; 26+ messages in thread
From: David Miller @ 2007-06-06 20:49 UTC (permalink / raw)
  To: dlezcano; +Cc: xemul, containers, netdev, kaber, dev

From: Daniel Lezcano <dlezcano@fr.ibm.com>
Date: Wed, 06 Jun 2007 22:38:11 +0200

> Perhaps, a name like "epipe" or "npipe", which reflects what does the 
> device, is more appropriate ?

'npipe' (Network PIPE) or 'epipe' (Ethernet PIPE) are fine with me.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 15:28 ` [PATCH] Virtual ethernet tunnel Patrick McHardy
@ 2007-06-07  8:09   ` Pavel Emelianov
  2007-06-07  9:29     ` Daniel Lezcano
  2007-06-11 11:39     ` Patrick McHardy
  0 siblings, 2 replies; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-07  8:09 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Patrick McHardy wrote:
> Pavel Emelianov wrote:
>> Veth stands for Virtual ETHernet. It is a simple tunnel driver
>> that works at the link layer and looks like a pair of ethernet
>> devices interconnected with each other.
>>
>> Mainly it allows to communicate between network namespaces but
>> it can be used as is as well.
>>
>> Eric recently sent a similar driver called etun. This
>> implementation uses another interface - the RTM_NRELINK
>> message introduced by Patric. The patch fits today netdev
>> tree with Patrick's patches.
>>
>> The newlink callback is organized that way to make it easy
>> to create the peer device in the separate namespace when we
>> have them in kernel.
>>
> 
>> +struct veth_priv {
>> +	struct net_device *peer;
>> +	struct net_device *dev;
>> +	struct list_head list;
>> +	struct net_device_stats stats;
> 
> 
> You can use dev->stats instead.

OK. Actually I planned to use percpu stats to reduce cacheline
trashing (Stephen has noticed it also). The reason I didn't do it
here is that the patch would look more complicated, but I wanted to
show and approve the netlink interface first.

>> +static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +	struct net_device *rcv = NULL;
>> +	struct veth_priv *priv, *rcv_priv;
>> +	int length;
>> +
>> +	skb_orphan(skb);
>> +
>> +	priv = netdev_priv(dev);
>> +	rcv = priv->peer;
>> +	rcv_priv = netdev_priv(rcv);
>> +
>> +	if (!(rcv->flags & IFF_UP))
>> +		goto outf;
>> +
>> +	skb->dev = rcv;
> 
> eth_type_trans already sets skb->dev.

Ok. Thanks.

>> +	skb->pkt_type = PACKET_HOST;
>> +	skb->protocol = eth_type_trans(skb, rcv);
>> +	if (dev->features & NETIF_F_NO_CSUM)
>> +		skb->ip_summed = rcv_priv->ip_summed;
>> +
>> +	dst_release(skb->dst);
>> +	skb->dst = NULL;
>> +
>> +	secpath_reset(skb);
>> +	nf_reset(skb);
> 
> 
> Is skb->mark supposed to survive communication between different
> namespaces?

I guess it must not. Thanks.

>> +static const struct nla_policy veth_policy[VETH_INFO_MAX] = {
>> +	[VETH_INFO_MAC]		= { .type = NLA_BINARY, .len = ETH_ALEN },
>> +	[VETH_INFO_PEER]	= { .type = NLA_STRING },
>> +	[VETH_INFO_PEER_MAC]	= { .type = NLA_BINARY, .len = ETH_ALEN },
>> +};
> 
> 
> The rtnl_link codes looks fine. I don't like the VETH_INFO_MAC attribute
> very much though, we already have a generic device attribute for MAC
> addresses. Of course that only allows you to supply one MAC address, so
> I'm wondering what you think of allocating only a single device per
> newlink operation and binding them in a seperate enslave operation?

I did this at the very first version, but Alexey showed me that this
would be wrong. Look. When we create the second device it must be in
the other namespace as it is useless to have them in one namespace.
But if we have the device in the other namespace the RTNL_NEWLINK 
message from kernel would come into this namespace thus confusing ip
utility in the init namespace. Creating the device in the init ns and
moving it into the new one is rather a complex task.

But with such approach the creation looks really logical. We send a 
packet to the kernel and have a single response about the new device 
appearance. At the same time we have a RTNL_NEWLINK message arrived at 
the destination namespace informing that a new device has appeared 
there as well.

>> +enum {
>> +	VETH_INFO_UNSPEC,
>> +	VETH_INFO_MAC,
>> +	VETH_INFO_PEER,
>> +	VETH_INFO_PEER_MAC,
>> +
>> +	VETH_INFO_MAX
>> +};
> 
> Please follow the
> 
> #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
> 
> convention here.

Could you please clarify this point. I saw the lines
enum {
	...
	RTNL_NEWLINK
#define RTNL_NEWLINK RTNL_NEWLINK
	...
}
and had my brains exploded imagining what this would mean :(

> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Devel] Re: [PATCH] Virtual ethernet tunnel
  2007-06-06 19:47 ` David Miller
  2007-06-06 20:38   ` [Devel] " Daniel Lezcano
@ 2007-06-07  8:14   ` Kirill Korotaev
  2007-06-07  9:07     ` David Miller
  1 sibling, 1 reply; 26+ messages in thread
From: Kirill Korotaev @ 2007-06-07  8:14 UTC (permalink / raw)
  To: David Miller; +Cc: xemul, containers, netdev, kaber, dev

David Miller wrote:
> From: Pavel Emelianov <xemul@openvz.org>
> Date: Wed, 06 Jun 2007 19:11:38 +0400
> 
> 
>>Veth stands for Virtual ETHernet. It is a simple tunnel driver
>>that works at the link layer and looks like a pair of ethernet
>>devices interconnected with each other.
> 
> 
> I would suggest choosing a different name.
> 
> 'veth' is also the name of the virtualized ethernet device
> found on IBM machines, driven by driver/net/ibmveth.[ch]

AFAICS, ibmveth.c registers ethX devices, while this driver registers
vethX by default, so there is no much conflict IMHO.

But if you still don't like it, I would suggest 'ethtun' and later
we will come with iptun for IP tunneling. Is it ok for you?

Thanks,
Kirill


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Devel] Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  8:14   ` Kirill Korotaev
@ 2007-06-07  9:07     ` David Miller
  2007-06-07  9:30       ` Benjamin Thery
  0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2007-06-07  9:07 UTC (permalink / raw)
  To: dev; +Cc: xemul, containers, netdev, kaber, dev

From: Kirill Korotaev <dev@sw.ru>
Date: Thu, 07 Jun 2007 12:14:29 +0400

> David Miller wrote:
> > From: Pavel Emelianov <xemul@openvz.org>
> > Date: Wed, 06 Jun 2007 19:11:38 +0400
> > 
> > 
> >>Veth stands for Virtual ETHernet. It is a simple tunnel driver
> >>that works at the link layer and looks like a pair of ethernet
> >>devices interconnected with each other.
> > 
> > 
> > I would suggest choosing a different name.
> > 
> > 'veth' is also the name of the virtualized ethernet device
> > found on IBM machines, driven by driver/net/ibmveth.[ch]
> 
> AFAICS, ibmveth.c registers ethX devices, while this driver registers
> vethX by default, so there is no much conflict IMHO.

If that's the case, veth is fine with me.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  8:09   ` Pavel Emelianov
@ 2007-06-07  9:29     ` Daniel Lezcano
  2007-06-07  9:51       ` Pavel Emelianov
  2007-06-11 11:39     ` Patrick McHardy
  1 sibling, 1 reply; 26+ messages in thread
From: Daniel Lezcano @ 2007-06-07  9:29 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Patrick McHardy, Linux Containers, Linux Netdev List,
	Eric W. Biederman, Kirill Korotaev

Pavel Emelianov wrote:
> Patrick McHardy wrote:
>   
>> Pavel Emelianov wrote:
>>     
>>> Veth stands for Virtual ETHernet. It is a simple tunnel driver
>>> that works at the link layer and looks like a pair of ethernet
>>> devices interconnected with each other.
>>>
>>> Mainly it allows to communicate between network namespaces but
>>> it can be used as is as well.
>>>
>>> Eric recently sent a similar driver called etun. This
>>> implementation uses another interface - the RTM_NRELINK
>>> message introduced by Patric. The patch fits today netdev
>>> tree with Patrick's patches.
>>>
>>> The newlink callback is organized that way to make it easy
>>> to create the peer device in the separate namespace when we
>>> have them in kernel.
>>>
>>>       
>>> +struct veth_priv {
>>> +	struct net_device *peer;
>>> +	struct net_device *dev;
>>> +	struct list_head list;
>>> +	struct net_device_stats stats;
>>>       
>> You can use dev->stats instead.
>>     
>
> OK. Actually I planned to use percpu stats to reduce cacheline
> trashing (Stephen has noticed it also). The reason I didn't do it
> here is that the patch would look more complicated, but I wanted to
> show and approve the netlink interface first.
>
>   
>>> +static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
>>> +{
>>> +	struct net_device *rcv = NULL;
>>> +	struct veth_priv *priv, *rcv_priv;
>>> +	int length;
>>> +
>>> +	skb_orphan(skb);
>>> +
>>> +	priv = netdev_priv(dev);
>>> +	rcv = priv->peer;
>>> +	rcv_priv = netdev_priv(rcv);
>>> +
>>> +	if (!(rcv->flags & IFF_UP))
>>> +		goto outf;
>>> +
>>> +	skb->dev = rcv;
>>>       
>> eth_type_trans already sets skb->dev.
>>     
>
> Ok. Thanks.
>
>   
>>> +	skb->pkt_type = PACKET_HOST;
>>> +	skb->protocol = eth_type_trans(skb, rcv);
>>> +	if (dev->features & NETIF_F_NO_CSUM)
>>> +		skb->ip_summed = rcv_priv->ip_summed;
>>> +
>>> +	dst_release(skb->dst);
>>> +	skb->dst = NULL;
>>> +
>>> +	secpath_reset(skb);
>>> +	nf_reset(skb);
>>>       
>> Is skb->mark supposed to survive communication between different
>> namespaces?
>>     
>
> I guess it must not. Thanks.
>
>   
>>> +static const struct nla_policy veth_policy[VETH_INFO_MAX] = {
>>> +	[VETH_INFO_MAC]		= { .type = NLA_BINARY, .len = ETH_ALEN },
>>> +	[VETH_INFO_PEER]	= { .type = NLA_STRING },
>>> +	[VETH_INFO_PEER_MAC]	= { .type = NLA_BINARY, .len = ETH_ALEN },
>>> +};
>>>       
>> The rtnl_link codes looks fine. I don't like the VETH_INFO_MAC attribute
>> very much though, we already have a generic device attribute for MAC
>> addresses. Of course that only allows you to supply one MAC address, so
>> I'm wondering what you think of allocating only a single device per
>> newlink operation and binding them in a seperate enslave operation?
>>     
>
> I did this at the very first version, but Alexey showed me that this
> would be wrong. Look. When we create the second device it must be in
> the other namespace as it is useless to have them in one namespace.
> But if we have the device in the other namespace the RTNL_NEWLINK 
> message from kernel would come into this namespace thus confusing ip
> utility in the init namespace. Creating the device in the init ns and
> moving it into the new one is rather a complex task.
>   
Pavel,

moving the netdevice to another namespace is not a complex task. Eric 
Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )

When the pair device is created, both extremeties are into the init 
namespace and you can choose to which namespace to move one extremity. 
When the network namespace dies, the netdev is moved back to the init 
namespace.
That facilitate network device management.

Concerning netlink events, this is automatically generated when the 
network device is moved through namespaces.

IMHO, we should have the network device movement between namespaces in 
order to be able to move a physical network device too (eg. you have 4 
NIC and you want to create 3 containers and assign 3 NIC to each of them)

> But with such approach the creation looks really logical. We send a 
> packet to the kernel and have a single response about the new device 
> appearance. At the same time we have a RTNL_NEWLINK message arrived at 
> the destination namespace informing that a new device has appeared 
> there as well.
>
>   
>>> +enum {
>>> +	VETH_INFO_UNSPEC,
>>> +	VETH_INFO_MAC,
>>> +	VETH_INFO_PEER,
>>> +	VETH_INFO_PEER_MAC,
>>> +
>>> +	VETH_INFO_MAX
>>> +};
>>>       
>> Please follow the
>>
>> #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
>>
>> convention here.
>>     
>
> Could you please clarify this point. I saw the lines
> enum {
> 	...
> 	RTNL_NEWLINK
> #define RTNL_NEWLINK RTNL_NEWLINK
> 	...
> }
> and had my brains exploded imagining what this would mean :(
>
>   
>> -
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>     
>
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
>   


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Devel] Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  9:07     ` David Miller
@ 2007-06-07  9:30       ` Benjamin Thery
  0 siblings, 0 replies; 26+ messages in thread
From: Benjamin Thery @ 2007-06-07  9:30 UTC (permalink / raw)
  To: David Miller; +Cc: dev, containers, netdev, dev, kaber, xemul

David Miller wrote:
> From: Kirill Korotaev <dev@sw.ru>
> Date: Thu, 07 Jun 2007 12:14:29 +0400
> 
>> David Miller wrote:
>>> From: Pavel Emelianov <xemul@openvz.org>
>>> Date: Wed, 06 Jun 2007 19:11:38 +0400
>>>
>>>
>>>> Veth stands for Virtual ETHernet. It is a simple tunnel driver
>>>> that works at the link layer and looks like a pair of ethernet
>>>> devices interconnected with each other.
>>>
>>> I would suggest choosing a different name.
>>>
>>> 'veth' is also the name of the virtualized ethernet device
>>> found on IBM machines, driven by driver/net/ibmveth.[ch]
>> AFAICS, ibmveth.c registers ethX devices, while this driver registers
>> vethX by default, so there is no much conflict IMHO.
> 
> If that's the case, veth is fine with me.

I like Daniel's proposals with the tunnel or pipe thing in the name.
I think it is more explicit about what the device really is.

I'm currently using etun, Eric Biederman's implementation. It will be 
nice to have this kind of device merged.

-- Benjamin

-- 
B e n j a m i n   T h e r y  - BULL/DT/Open Software R&D

    http://www.bull.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  9:29     ` Daniel Lezcano
@ 2007-06-07  9:51       ` Pavel Emelianov
  2007-06-07 14:05         ` Daniel Lezcano
  0 siblings, 1 reply; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-07  9:51 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Patrick McHardy, Linux Containers, Linux Netdev List,
	Eric W. Biederman, Kirill Korotaev

>> I did this at the very first version, but Alexey showed me that this
>> would be wrong. Look. When we create the second device it must be in
>> the other namespace as it is useless to have them in one namespace.
>> But if we have the device in the other namespace the RTNL_NEWLINK
>> message from kernel would come into this namespace thus confusing ip
>> utility in the init namespace. Creating the device in the init ns and
>> moving it into the new one is rather a complex task.
>>   
> Pavel,
> 
> moving the netdevice to another namespace is not a complex task. Eric
> Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )

By saying complex I didn't mean that this is difficult to implement,
but that it consists (must consist) of many stages. I.e. composite.
Making the device right in the namespace is liter.

> When the pair device is created, both extremeties are into the init
> namespace and you can choose to which namespace to move one extremity.

I do not mind that.

> When the network namespace dies, the netdev is moved back to the init
> namespace.
> That facilitate network device management.
> 
> Concerning netlink events, this is automatically generated when the
> network device is moved through namespaces.
> 
> IMHO, we should have the network device movement between namespaces in
> order to be able to move a physical network device too (eg. you have 4
> NIC and you want to create 3 containers and assign 3 NIC to each of them)

Agree. Moving the devices is a must-have functionality.

I do not mind making the pair in the init namespace and move the second
one into the desired namespace. But if we *always* will have two ends in
different namespaces what to complicate things for?

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  9:51       ` Pavel Emelianov
@ 2007-06-07 14:05         ` Daniel Lezcano
  2007-06-07 14:23           ` Kirill Korotaev
  2007-06-07 15:25           ` Pavel Emelianov
  0 siblings, 2 replies; 26+ messages in thread
From: Daniel Lezcano @ 2007-06-07 14:05 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Daniel Lezcano, Patrick McHardy, Linux Containers,
	Linux Netdev List, Eric W. Biederman, Kirill Korotaev

Pavel Emelianov wrote:
>>> I did this at the very first version, but Alexey showed me that this
>>> would be wrong. Look. When we create the second device it must be in
>>> the other namespace as it is useless to have them in one namespace.
>>> But if we have the device in the other namespace the RTNL_NEWLINK
>>> message from kernel would come into this namespace thus confusing ip
>>> utility in the init namespace. Creating the device in the init ns and
>>> moving it into the new one is rather a complex task.
>>>   
>>>       
>> Pavel,
>>
>> moving the netdevice to another namespace is not a complex task. Eric
>> Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )
>>     
>
> By saying complex I didn't mean that this is difficult to implement,
> but that it consists (must consist) of many stages. I.e. composite.
> Making the device right in the namespace is liter.
>
>   
>> When the pair device is created, both extremeties are into the init
>> namespace and you can choose to which namespace to move one extremity.
>>     
>
> I do not mind that.
>   
>> When the network namespace dies, the netdev is moved back to the init
>> namespace.
>> That facilitate network device management.
>>
>> Concerning netlink events, this is automatically generated when the
>> network device is moved through namespaces.
>>
>> IMHO, we should have the network device movement between namespaces in
>> order to be able to move a physical network device too (eg. you have 4
>> NIC and you want to create 3 containers and assign 3 NIC to each of them)
>>     
>
> Agree. Moving the devices is a must-have functionality.
>
> I do not mind making the pair in the init namespace and move the second
> one into the desired namespace. But if we *always* will have two ends in
> different namespaces what to complicate things for?
>   
Just to provide a netdev sufficiently generic to be used by people who 
don't want namespaces but just want to do some network testing, like Ben 
Greear does. He mentioned in a previous email, he will be happy to stop 
redirecting people to out of tree patch.

https://lists.linux-foundation.org/pipermail/containers/2007-April/004420.html

> Thanks,
> Pavel
>
>   


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07 14:05         ` Daniel Lezcano
@ 2007-06-07 14:23           ` Kirill Korotaev
  2007-06-07 14:42             ` Daniel Lezcano
  2007-06-07 15:25           ` Pavel Emelianov
  1 sibling, 1 reply; 26+ messages in thread
From: Kirill Korotaev @ 2007-06-07 14:23 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Pavel Emelianov, Kirill Korotaev, Linux Netdev List,
	Eric W. Biederman, Linux Containers, Patrick McHardy

Deniel,

Daniel Lezcano wrote:
> Pavel Emelianov wrote:
> 
>>>>I did this at the very first version, but Alexey showed me that this
>>>>would be wrong. Look. When we create the second device it must be in
>>>>the other namespace as it is useless to have them in one namespace.
>>>>But if we have the device in the other namespace the RTNL_NEWLINK
>>>>message from kernel would come into this namespace thus confusing ip
>>>>utility in the init namespace. Creating the device in the init ns and
>>>>moving it into the new one is rather a complex task.
>>>>  
>>>>      
>>>
>>>Pavel,
>>>
>>>moving the netdevice to another namespace is not a complex task. Eric
>>>Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )
>>>    
>>
>>By saying complex I didn't mean that this is difficult to implement,
>>but that it consists (must consist) of many stages. I.e. composite.
>>Making the device right in the namespace is liter.
>>
>>  
>>
>>>When the pair device is created, both extremeties are into the init
>>>namespace and you can choose to which namespace to move one extremity.
>>>    
>>
>>I do not mind that.
>>  
>>
>>>When the network namespace dies, the netdev is moved back to the init
>>>namespace.
>>>That facilitate network device management.
>>>
>>>Concerning netlink events, this is automatically generated when the
>>>network device is moved through namespaces.
>>>
>>>IMHO, we should have the network device movement between namespaces in
>>>order to be able to move a physical network device too (eg. you have 4
>>>NIC and you want to create 3 containers and assign 3 NIC to each of them)
>>>    
>>
>>Agree. Moving the devices is a must-have functionality.
>>
>>I do not mind making the pair in the init namespace and move the second
>>one into the desired namespace. But if we *always* will have two ends in
>>different namespaces what to complicate things for?
>>  
> 
> Just to provide a netdev sufficiently generic to be used by people who 
> don't want namespaces but just want to do some network testing, like Ben 
> Greear does. He mentioned in a previous email, he will be happy to stop 
> redirecting people to out of tree patch.
> 
> https://lists.linux-foundation.org/pipermail/containers/2007-April/004420.html

no one is against generic code and ability to create 2 interfaces in *one* namespace.
(Like we currently allow to do so in OpenVZ)

However, believe me, moving an interface is a *hard* operation. Much harder then netdev
register from the scratch.

Because it requires to take into account many things like:
- packets in flight which requires synchronize and is slow on big machines
- asynchronous sysfs entries registration/deregistration from
  rtln_unlock -> netdev_run_todo
- name/ifindex collisions
- shutdown/cleanup of addresses/routes/qdisc and other similar stuff

Thanks,
Kirill


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07 14:23           ` Kirill Korotaev
@ 2007-06-07 14:42             ` Daniel Lezcano
  2007-06-07 15:33               ` Pavel Emelianov
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Lezcano @ 2007-06-07 14:42 UTC (permalink / raw)
  To: Kirill Korotaev
  Cc: Daniel Lezcano, Pavel Emelianov, Kirill Korotaev,
	Linux Netdev List, Eric W. Biederman, Linux Containers,
	Patrick McHardy

Kirill Korotaev wrote:
> Deniel,
>
> Daniel Lezcano wrote:
>   
>> Pavel Emelianov wrote:
>>
>>     
>>>>> I did this at the very first version, but Alexey showed me that this
>>>>> would be wrong. Look. When we create the second device it must be in
>>>>> the other namespace as it is useless to have them in one namespace.
>>>>> But if we have the device in the other namespace the RTNL_NEWLINK
>>>>> message from kernel would come into this namespace thus confusing ip
>>>>> utility in the init namespace. Creating the device in the init ns and
>>>>> moving it into the new one is rather a complex task.
>>>>>  
>>>>>      
>>>>>           
>>>> Pavel,
>>>>
>>>> moving the netdevice to another namespace is not a complex task. Eric
>>>> Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )
>>>>    
>>>>         
>>> By saying complex I didn't mean that this is difficult to implement,
>>> but that it consists (must consist) of many stages. I.e. composite.
>>> Making the device right in the namespace is liter.
>>>
>>>  
>>>
>>>       
>>>> When the pair device is created, both extremeties are into the init
>>>> namespace and you can choose to which namespace to move one extremity.
>>>>    
>>>>         
>>> I do not mind that.
>>>  
>>>
>>>       
>>>> When the network namespace dies, the netdev is moved back to the init
>>>> namespace.
>>>> That facilitate network device management.
>>>>
>>>> Concerning netlink events, this is automatically generated when the
>>>> network device is moved through namespaces.
>>>>
>>>> IMHO, we should have the network device movement between namespaces in
>>>> order to be able to move a physical network device too (eg. you have 4
>>>> NIC and you want to create 3 containers and assign 3 NIC to each of them)
>>>>    
>>>>         
>>> Agree. Moving the devices is a must-have functionality.
>>>
>>> I do not mind making the pair in the init namespace and move the second
>>> one into the desired namespace. But if we *always* will have two ends in
>>> different namespaces what to complicate things for?
>>>  
>>>       
>> Just to provide a netdev sufficiently generic to be used by people who 
>> don't want namespaces but just want to do some network testing, like Ben 
>> Greear does. He mentioned in a previous email, he will be happy to stop 
>> redirecting people to out of tree patch.
>>
>> https://lists.linux-foundation.org/pipermail/containers/2007-April/004420.html
>>     
>
> no one is against generic code and ability to create 2 interfaces in *one* namespace.
> (Like we currently allow to do so in OpenVZ)
>
> However, believe me, moving an interface is a *hard* operation. Much harder then netdev
> register from the scratch.
>
> Because it requires to take into account many things like:
> - packets in flight which requires synchronize and is slow on big machines
> - asynchronous sysfs entries registration/deregistration from
>   rtln_unlock -> netdev_run_todo
> - name/ifindex collisions
> - shutdown/cleanup of addresses/routes/qdisc and other similar stuff
>
>   
All of what you are describing is already implemented in the Eric's 
patchset.

You can have a look at :

http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/

And more precisly:

for sysfs issues:
http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/0065-sysfs-Shadow-directory-support.patch

for network device movement:
http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/0096-net-Implment-network-device-movement-between-namesp.patch

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07 14:05         ` Daniel Lezcano
  2007-06-07 14:23           ` Kirill Korotaev
@ 2007-06-07 15:25           ` Pavel Emelianov
  2007-06-07 15:44             ` Daniel Lezcano
  1 sibling, 1 reply; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-07 15:25 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Patrick McHardy, Linux Containers, Linux Netdev List,
	Eric W. Biederman, Kirill Korotaev

Daniel Lezcano wrote:
> Pavel Emelianov wrote:
>>>> I did this at the very first version, but Alexey showed me that this
>>>> would be wrong. Look. When we create the second device it must be in
>>>> the other namespace as it is useless to have them in one namespace.
>>>> But if we have the device in the other namespace the RTNL_NEWLINK
>>>> message from kernel would come into this namespace thus confusing ip
>>>> utility in the init namespace. Creating the device in the init ns and
>>>> moving it into the new one is rather a complex task.
>>>>         
>>> Pavel,
>>>
>>> moving the netdevice to another namespace is not a complex task. Eric
>>> Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )
>>>     
>>
>> By saying complex I didn't mean that this is difficult to implement,
>> but that it consists (must consist) of many stages. I.e. composite.
>> Making the device right in the namespace is liter.
>>
>>  
>>> When the pair device is created, both extremeties are into the init
>>> namespace and you can choose to which namespace to move one extremity.
>>>     
>>
>> I do not mind that.
>>  
>>> When the network namespace dies, the netdev is moved back to the init
>>> namespace.
>>> That facilitate network device management.
>>>
>>> Concerning netlink events, this is automatically generated when the
>>> network device is moved through namespaces.
>>>
>>> IMHO, we should have the network device movement between namespaces in
>>> order to be able to move a physical network device too (eg. you have 4
>>> NIC and you want to create 3 containers and assign 3 NIC to each of
>>> them)
>>>     
>>
>> Agree. Moving the devices is a must-have functionality.
>>
>> I do not mind making the pair in the init namespace and move the second
>> one into the desired namespace. But if we *always* will have two ends in
>> different namespaces what to complicate things for?
>>   
> Just to provide a netdev sufficiently generic to be used by people who
> don't want namespaces but just want to do some network testing, like Ben
> Greear does. He mentioned in a previous email, he will be happy to stop
> redirecting people to out of tree patch.

This patch creates booth devices in the init namespace. That's what
you want, isn't it? When we have the namespaces we will be able to
create the pair with booth ends in the init namespace - just do not
specify the namespace id to create the 2nd end in and the driver will
leave it int the init one.

> https://lists.linux-foundation.org/pipermail/containers/2007-April/004420.html
> 
> 
>> Thanks,
>> Pavel
>>
>>   
> 
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07 14:42             ` Daniel Lezcano
@ 2007-06-07 15:33               ` Pavel Emelianov
  0 siblings, 0 replies; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-07 15:33 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Kirill Korotaev, Linux Netdev List, Eric W. Biederman,
	Linux Containers, Patrick McHardy

>>
>> no one is against generic code and ability to create 2 interfaces in
>> *one* namespace.
>> (Like we currently allow to do so in OpenVZ)
>>
>> However, believe me, moving an interface is a *hard* operation. Much
>> harder then netdev
>> register from the scratch.
>>
>> Because it requires to take into account many things like:
>> - packets in flight which requires synchronize and is slow on big
>> machines
>> - asynchronous sysfs entries registration/deregistration from
>>   rtln_unlock -> netdev_run_todo
>> - name/ifindex collisions
>> - shutdown/cleanup of addresses/routes/qdisc and other similar stuff
>>
>>   
> All of what you are describing is already implemented in the Eric's
> patchset.

Daniel. We *agree* that this task *is implementable*. We just want
to say that creating the device in the namespace is less comp...
(oh, well, forget this word) faster than creating and then moving.

> You can have a look at :
> 
> http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/
> 
> And more precisly:
> 
> for sysfs issues:
> http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/0065-sysfs-Shadow-directory-support.patch
> 
> 
> for network device movement:
> http://lxc.sourceforge.net/patches/2.6.20/2.6.20-netns1/broken_out/0096-net-Implment-network-device-movement-between-namesp.patch

Good job. Can you prove that this code is less buggy than the existing
register_netdevice() one? This requires testing, doesn't it? But on the
other hand we have the ability to create the device right in the namespace
using well known (and thus well debugged) code with minimal actions from
the kernel.

> 
> Thanks,
> Daniel
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07 15:25           ` Pavel Emelianov
@ 2007-06-07 15:44             ` Daniel Lezcano
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Lezcano @ 2007-06-07 15:44 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Daniel Lezcano, Patrick McHardy, Linux Containers,
	Linux Netdev List, Eric W. Biederman, Kirill Korotaev

Pavel Emelianov wrote:
> Daniel Lezcano wrote:
>> Pavel Emelianov wrote:
>>>>> I did this at the very first version, but Alexey showed me that this
>>>>> would be wrong. Look. When we create the second device it must be in
>>>>> the other namespace as it is useless to have them in one namespace.
>>>>> But if we have the device in the other namespace the RTNL_NEWLINK
>>>>> message from kernel would come into this namespace thus confusing ip
>>>>> utility in the init namespace. Creating the device in the init ns and
>>>>> moving it into the new one is rather a complex task.
>>>>>         
>>>> Pavel,
>>>>
>>>> moving the netdevice to another namespace is not a complex task. Eric
>>>> Biederman did it in its patchset ( cf.  http://lxc.sf.net/network )
>>>>     
>>> By saying complex I didn't mean that this is difficult to implement,
>>> but that it consists (must consist) of many stages. I.e. composite.
>>> Making the device right in the namespace is liter.
>>>
>>>  
>>>> When the pair device is created, both extremeties are into the init
>>>> namespace and you can choose to which namespace to move one extremity.
>>>>     
>>> I do not mind that.
>>>  
>>>> When the network namespace dies, the netdev is moved back to the init
>>>> namespace.
>>>> That facilitate network device management.
>>>>
>>>> Concerning netlink events, this is automatically generated when the
>>>> network device is moved through namespaces.
>>>>
>>>> IMHO, we should have the network device movement between namespaces in
>>>> order to be able to move a physical network device too (eg. you have 4
>>>> NIC and you want to create 3 containers and assign 3 NIC to each of
>>>> them)
>>>>     
>>> Agree. Moving the devices is a must-have functionality.
>>>
>>> I do not mind making the pair in the init namespace and move the second
>>> one into the desired namespace. But if we *always* will have two ends in
>>> different namespaces what to complicate things for?
>>>   
>> Just to provide a netdev sufficiently generic to be used by people who
>> don't want namespaces but just want to do some network testing, like Ben
>> Greear does. He mentioned in a previous email, he will be happy to stop
>> redirecting people to out of tree patch.
> 
> This patch creates booth devices in the init namespace. That's what
> you want, isn't it? When we have the namespaces we will be able to
> create the pair with booth ends in the init namespace - just do not
> specify the namespace id to create the 2nd end in and the driver will
> leave it int the init one.

Ok, fine.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-07  8:09   ` Pavel Emelianov
  2007-06-07  9:29     ` Daniel Lezcano
@ 2007-06-11 11:39     ` Patrick McHardy
  2007-06-13  9:24       ` Pavel Emelianov
  1 sibling, 1 reply; 26+ messages in thread
From: Patrick McHardy @ 2007-06-11 11:39 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:
> Patrick McHardy wrote:
> 

>>>+	skb->pkt_type = PACKET_HOST;
>>>+	skb->protocol = eth_type_trans(skb, rcv);
>>>+	if (dev->features & NETIF_F_NO_CSUM)
>>>+		skb->ip_summed = rcv_priv->ip_summed;
>>>+
>>>+	dst_release(skb->dst);
>>>+	skb->dst = NULL;
>>>+
>>>+	secpath_reset(skb);
>>>+	nf_reset(skb);
>>
>>
>>Is skb->mark supposed to survive communication between different
>>namespaces?
> 
> 
> I guess it must not. Thanks.


I guess there are a few others that should be cleared as well,
like the tc related members, secmark, ipvs_property, ...

>>The rtnl_link codes looks fine. I don't like the VETH_INFO_MAC attribute
>>very much though, we already have a generic device attribute for MAC
>>addresses. Of course that only allows you to supply one MAC address, so
>>I'm wondering what you think of allocating only a single device per
>>newlink operation and binding them in a seperate enslave operation?
> 
> 
> I did this at the very first version, but Alexey showed me that this
> would be wrong. Look. When we create the second device it must be in
> the other namespace as it is useless to have them in one namespace.
> But if we have the device in the other namespace the RTNL_NEWLINK 
> message from kernel would come into this namespace thus confusing ip
> utility in the init namespace. Creating the device in the init ns and
> moving it into the new one is rather a complex task.
> 
> But with such approach the creation looks really logical. We send a 
> packet to the kernel and have a single response about the new device 
> appearance. At the same time we have a RTNL_NEWLINK message arrived at 
> the destination namespace informing that a new device has appeared 
> there as well.


The question is how to proceed. I haven't read all mails yet, but it
seems there is some disagreement about whether to create all devices
in the same namespace and move them later or create them directly in
their target namespace. For now I guess it doesn't matter much, so
can everyone agree to adding a IFLA_PARTNER attribute that includes
a complete ifinfomsg and the attributes and you later decide how to
handle namespaces?

>>>+enum {
>>>+	VETH_INFO_UNSPEC,
>>>+	VETH_INFO_MAC,
>>>+	VETH_INFO_PEER,
>>>+	VETH_INFO_PEER_MAC,
>>>+
>>>+	VETH_INFO_MAX
>>>+};
>>
>>Please follow the
>>
>>#define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
>>
>>convention here.
> 
> 
> Could you please clarify this point. I saw the lines
> enum {
> 	...
> 	RTNL_NEWLINK
> #define RTNL_NEWLINK RTNL_NEWLINK
> 	...
> }
> and had my brains exploded imagining what this would mean :(


Thats just to make the new attributes visible as preprocessor
symbols so userspace can use them for #ifdefs. We usually use
it when adding new attributes/message types, but its not necessary
for the initial set of attributes if you already have some other
preprocessor-visisble symbol (like VETH_INFO_MAX) userspace can
use.

What I was refering to is this convention:

enum {
...
        __IFLA_MAX
};

#define IFLA_MAX (__IFLA_MAX - 1)

Which is used to make sure that IFLA_MAX is really the max and
not max + 1 and additionally people won't forget to update it.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-11 11:39     ` Patrick McHardy
@ 2007-06-13  9:24       ` Pavel Emelianov
  2007-06-13 11:12         ` Patrick McHardy
  0 siblings, 1 reply; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-13  9:24 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Patrick McHardy wrote:
> Pavel Emelianov wrote:
>> Patrick McHardy wrote:
>>
> 
>>>> +	skb->pkt_type = PACKET_HOST;
>>>> +	skb->protocol = eth_type_trans(skb, rcv);
>>>> +	if (dev->features & NETIF_F_NO_CSUM)
>>>> +		skb->ip_summed = rcv_priv->ip_summed;
>>>> +
>>>> +	dst_release(skb->dst);
>>>> +	skb->dst = NULL;
>>>> +
>>>> +	secpath_reset(skb);
>>>> +	nf_reset(skb);
>>>
>>> Is skb->mark supposed to survive communication between different
>>> namespaces?
>>
>> I guess it must not. Thanks.
> 
> 
> I guess there are a few others that should be cleared as well,
> like the tc related members, secmark, ipvs_property, ...

It seems like we are about to have some skb_reset_all() routine to
make the skb look like newborn.

>>> The rtnl_link codes looks fine. I don't like the VETH_INFO_MAC attribute
>>> very much though, we already have a generic device attribute for MAC
>>> addresses. Of course that only allows you to supply one MAC address, so
>>> I'm wondering what you think of allocating only a single device per
>>> newlink operation and binding them in a seperate enslave operation?
>>
>> I did this at the very first version, but Alexey showed me that this
>> would be wrong. Look. When we create the second device it must be in
>> the other namespace as it is useless to have them in one namespace.
>> But if we have the device in the other namespace the RTNL_NEWLINK 
>> message from kernel would come into this namespace thus confusing ip
>> utility in the init namespace. Creating the device in the init ns and
>> moving it into the new one is rather a complex task.
>>
>> But with such approach the creation looks really logical. We send a 
>> packet to the kernel and have a single response about the new device 
>> appearance. At the same time we have a RTNL_NEWLINK message arrived at 
>> the destination namespace informing that a new device has appeared 
>> there as well.
> 
> 
> The question is how to proceed. I haven't read all mails yet, but it
> seems there is some disagreement about whether to create all devices
> in the same namespace and move them later or create them directly in

The agreement was that we can make any of the above. We can create
booth devices in the init namespace and then move one of them into the
desired namespace, or we can explicitly specify which namespace to create
the pair in.

> their target namespace. For now I guess it doesn't matter much, so
> can everyone agree to adding a IFLA_PARTNER attribute that includes
> a complete ifinfomsg and the attributes and you later decide how to
> handle namespaces?
> 
>>>> +enum {
>>>> +	VETH_INFO_UNSPEC,
>>>> +	VETH_INFO_MAC,
>>>> +	VETH_INFO_PEER,
>>>> +	VETH_INFO_PEER_MAC,
>>>> +
>>>> +	VETH_INFO_MAX
>>>> +};
>>> Please follow the
>>>
>>> #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
>>>
>>> convention here.
>>
>> Could you please clarify this point. I saw the lines
>> enum {
>> 	...
>> 	RTNL_NEWLINK
>> #define RTNL_NEWLINK RTNL_NEWLINK
>> 	...
>> }
>> and had my brains exploded imagining what this would mean :(
> 
> 
> Thats just to make the new attributes visible as preprocessor
> symbols so userspace can use them for #ifdefs. We usually use
> it when adding new attributes/message types, but its not necessary
> for the initial set of attributes if you already have some other
> preprocessor-visisble symbol (like VETH_INFO_MAX) userspace can
> use.
> 
> What I was refering to is this convention:
> 
> enum {
> ...
>         __IFLA_MAX
> };
> 
> #define IFLA_MAX (__IFLA_MAX - 1)
> 
> Which is used to make sure that IFLA_MAX is really the max and
> not max + 1 and additionally people won't forget to update it.

OK thanks. This is already done in the v2.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-13  9:24       ` Pavel Emelianov
@ 2007-06-13 11:12         ` Patrick McHardy
  2007-06-13 16:02           ` Pavel Emelianov
  0 siblings, 1 reply; 26+ messages in thread
From: Patrick McHardy @ 2007-06-13 11:12 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:
> Patrick McHardy wrote:
> 
>>The question is how to proceed. I haven't read all mails yet, but it
>>seems there is some disagreement about whether to create all devices
>>in the same namespace and move them later or create them directly in
> 
> 
> The agreement was that we can make any of the above. We can create
> booth devices in the init namespace and then move one of them into the
> desired namespace, or we can explicitly specify which namespace to create
> the pair in.


I'm going to push my latest patches to Dave today, the easiest way is
probably is you just add whatever you need to the API afterwards.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-13 16:02           ` Pavel Emelianov
@ 2007-06-13 15:37             ` Patrick McHardy
  0 siblings, 0 replies; 26+ messages in thread
From: Patrick McHardy @ 2007-06-13 15:37 UTC (permalink / raw)
  To: Pavel Emelianov
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Pavel Emelianov wrote:
> Patrick McHardy wrote:
> 
>>I'm going to push my latest patches to Dave today, the easiest way is
>>probably is you just add whatever you need to the API afterwards.
>>
> 
> 
> OK. Dave didn't object against the driver. Hope he will accept it as well.
> 
> I have also found a BUG in your API. Look, when you declare the alias with
> the MODULE_ALIAS_RTNL_LINK in drivers you use strings as an argument. But 
> this macro stringifyes the argument itself which results in bad aliases.
> 
> Signed-off-by: Pavel Emelianov <xemul@openvz.org>
> 
> ---
> 
> diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
> index d744198..f627e1f 100644
> --- a/include/net/rtnetlink.h
> +++ b/include/net/rtnetlink.h
> @@ -77,6 +77,6 @@ extern void	__rtnl_link_unregister(struc
>  extern int	rtnl_link_register(struct rtnl_link_ops *ops);
>  extern void	rtnl_link_unregister(struct rtnl_link_ops *ops);
>  
> -#define MODULE_ALIAS_RTNL_LINK(name) MODULE_ALIAS("rtnl-link-" #name)
> +#define MODULE_ALIAS_RTNL_LINK(name) MODULE_ALIAS("rtnl-link-" name)


Thanks, I already fixed this during final round of testing :)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Virtual ethernet tunnel
  2007-06-13 11:12         ` Patrick McHardy
@ 2007-06-13 16:02           ` Pavel Emelianov
  2007-06-13 15:37             ` Patrick McHardy
  0 siblings, 1 reply; 26+ messages in thread
From: Pavel Emelianov @ 2007-06-13 16:02 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric W. Biederman, Linux Netdev List, Linux Containers,
	Kirill Korotaev

Patrick McHardy wrote:
> Pavel Emelianov wrote:
>> Patrick McHardy wrote:
>>
>>> The question is how to proceed. I haven't read all mails yet, but it
>>> seems there is some disagreement about whether to create all devices
>>> in the same namespace and move them later or create them directly in
>>
>> The agreement was that we can make any of the above. We can create
>> booth devices in the init namespace and then move one of them into the
>> desired namespace, or we can explicitly specify which namespace to create
>> the pair in.
> 
> 
> I'm going to push my latest patches to Dave today, the easiest way is
> probably is you just add whatever you need to the API afterwards.
> 
> 

OK. Dave didn't object against the driver. Hope he will accept it as well.

I have also found a BUG in your API. Look, when you declare the alias with
the MODULE_ALIAS_RTNL_LINK in drivers you use strings as an argument. But 
this macro stringifyes the argument itself which results in bad aliases.

Signed-off-by: Pavel Emelianov <xemul@openvz.org>

---

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index d744198..f627e1f 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -77,6 +77,6 @@ extern void	__rtnl_link_unregister(struc
 extern int	rtnl_link_register(struct rtnl_link_ops *ops);
 extern void	rtnl_link_unregister(struct rtnl_link_ops *ops);
 
-#define MODULE_ALIAS_RTNL_LINK(name) MODULE_ALIAS("rtnl-link-" #name)
+#define MODULE_ALIAS_RTNL_LINK(name) MODULE_ALIAS("rtnl-link-" name)
 
 #endif

^ permalink raw reply related	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-06-13 15:37 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-06-06 15:11 [PATCH] Virtual ethernet tunnel Pavel Emelianov
2007-06-06 15:17 ` [PATCH] Module for ip utility to support veth device Pavel Emelianov
2007-06-06 15:18   ` Patrick McHardy
2007-06-06 15:28 ` [PATCH] Virtual ethernet tunnel Patrick McHardy
2007-06-07  8:09   ` Pavel Emelianov
2007-06-07  9:29     ` Daniel Lezcano
2007-06-07  9:51       ` Pavel Emelianov
2007-06-07 14:05         ` Daniel Lezcano
2007-06-07 14:23           ` Kirill Korotaev
2007-06-07 14:42             ` Daniel Lezcano
2007-06-07 15:33               ` Pavel Emelianov
2007-06-07 15:25           ` Pavel Emelianov
2007-06-07 15:44             ` Daniel Lezcano
2007-06-11 11:39     ` Patrick McHardy
2007-06-13  9:24       ` Pavel Emelianov
2007-06-13 11:12         ` Patrick McHardy
2007-06-13 16:02           ` Pavel Emelianov
2007-06-13 15:37             ` Patrick McHardy
2007-06-06 15:39 ` Patrick McHardy
2007-06-06 16:17 ` Stephen Hemminger
2007-06-06 19:47 ` David Miller
2007-06-06 20:38   ` [Devel] " Daniel Lezcano
2007-06-06 20:49     ` David Miller
2007-06-07  8:14   ` Kirill Korotaev
2007-06-07  9:07     ` David Miller
2007-06-07  9:30       ` Benjamin Thery

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).