Netdev List

Netdev List
 help / color / mirror / Atom feed

* [patch net-next 2/4] net: allow to change carrier via sysfs
From: Jiri Pirko @ 2012-12-12 10:58 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl
In-Reply-To: <1355309887-1081-1-git-send-email-jiri@resnulli.us>

Make carrier writable

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 net/core/net-sysfs.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 334efd5..7eda40a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -126,6 +126,19 @@ static ssize_t show_broadcast(struct device *dev,
 	return -EINVAL;
 }
 
+static int change_carrier(struct net_device *net, unsigned long new_carrier)
+{
+	if (!netif_running(net))
+		return -EINVAL;
+	return dev_change_carrier(net, (bool) new_carrier);
+}
+
+static ssize_t store_carrier(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_carrier);
+}
+
 static ssize_t show_carrier(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
@@ -331,7 +344,7 @@ static struct device_attribute net_class_attributes[] = {
 	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
 	__ATTR(address, S_IRUGO, show_address, NULL),
 	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
-	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(carrier, S_IRUGO | S_IWUSR, show_carrier, store_carrier),
 	__ATTR(speed, S_IRUGO, show_speed, NULL),
 	__ATTR(duplex, S_IRUGO, show_duplex, NULL),
 	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
-- 
1.8.0

^ permalink raw reply related

* [patch net-next 1/4] net: add change_carrier netdev op
From: Jiri Pirko @ 2012-12-12 10:58 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl
In-Reply-To: <1355309887-1081-1-git-send-email-jiri@resnulli.us>

This allows a driver to register change_carrier callback which will be
called whenever user will like to change carrier state. This is useful
for devices like dummy, gre, team and so on.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 include/linux/netdevice.h |  7 +++++++
 net/core/dev.c            | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6a14d4..e1a5c16 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -891,6 +891,9 @@ struct netdev_fcoe_hbainfo {
  * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh)
  * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
  *			     struct net_device *dev)
+ *
+ * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
+ *	Called to update device carrier.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1008,6 +1011,8 @@ struct net_device_ops {
 	int			(*ndo_bridge_getlink)(struct sk_buff *skb,
 						      u32 pid, u32 seq,
 						      struct net_device *dev);
+	int			(*ndo_change_carrier)(struct net_device *dev,
+						      bool new_carrier);
 };
 
 /*
@@ -2191,6 +2196,8 @@ extern int		dev_set_mtu(struct net_device *, int);
 extern void		dev_set_group(struct net_device *, int);
 extern int		dev_set_mac_address(struct net_device *,
 					    struct sockaddr *);
+extern int		dev_change_carrier(struct net_device *,
+					   bool new_carrier);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
 					    struct netdev_queue *txq);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4783850..cc6426b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5025,6 +5025,25 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 }
 EXPORT_SYMBOL(dev_set_mac_address);
 
+/**
+ *	dev_change_carrier - Change device carrier
+ *	@dev: device
+ *	@new_carries: new value
+ *
+ *	Change device carrier
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_change_carrier)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_change_carrier(dev, new_carrier);
+}
+EXPORT_SYMBOL(dev_change_carrier);
+
 /*
  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
  */
-- 
1.8.0

^ permalink raw reply related

* [patch net-next 0/4] net: allow to change carrier from userspace
From: Jiri Pirko @ 2012-12-12 10:58 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, bhutchings, mirqus, shemminger, greearb, fbl

This is basically a repost of my previous patchset:
"[patch net-next-2.6 0/2] net: allow to change carrier via sysfs" from Aug 30

The way net-sysfs stores values changed and this patchset reflects it.
Also, I exposed carrier via rtnetlink iface.

So far, only dummy driver uses carrier change ndo. In very near future
team driver will use that as well.

Jiri Pirko (4):
  net: add change_carrier netdev op
  net: allow to change carrier via sysfs
  rtnl: expose carrier value with possibility to set it
  dummy: implement carrier change

 drivers/net/dummy.c          | 10 ++++++++++
 include/linux/netdevice.h    |  7 +++++++
 include/uapi/linux/if_link.h |  1 +
 net/core/dev.c               | 19 +++++++++++++++++++
 net/core/net-sysfs.c         | 15 ++++++++++++++-
 net/core/rtnetlink.c         | 10 ++++++++++
 6 files changed, 61 insertions(+), 1 deletion(-)

-- 
1.8.0

^ permalink raw reply

* Re: [PATCH net-next 2/2] net/mlx4_en: Add support for destination MAC in steering rules
From: Amir Vadai @ 2012-12-12 10:07 UTC (permalink / raw)
  To: Brian Haley; +Cc: David S. Miller, netdev, Or Gerlitz, Yan Burman
In-Reply-To: <50C75382.3040504@hp.com>

On 11/12/2012 17:38, Brian Haley wrote:
> On 12/11/2012 07:03 AM, Amir Vadai wrote:
>> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
>> @@ -619,7 +619,13 @@ static int mlx4_en_validate_flow(struct net_device *dev,
>>   	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
>>   		return -EINVAL;
>>
>> -	switch (cmd->fs.flow_type & ~FLOW_EXT) {
>> +	if (cmd->fs.flow_type & FLOW_MAC_EXT) {
>> +		/* dest mac mask must be ff:ff:ff:ff:ff:ff */
>> +		if (memcmp(cmd->fs.m_ext.h_dest, &full_mac, ETH_ALEN))
>> +			return -EINVAL;
>> +	}
>
> etherdevice.h has is_broadcast_ether_addr() and is_zero_ether_addr() if you want
> to get rid of full_mac and zero_mac in this function.
>
> -Brian
>

Right, will send a V1 with this fix.

Amir.

^ permalink raw reply

* [PATCH iproute2 1/3] ip: add support of netconf messages
From: Nicolas Dichtel @ 2012-12-12  9:51 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Nicolas Dichtel

Example of the output:
$ ip monitor netconf&
[1] 24901
$ echo 0 > /proc/sys/net/ipv6/conf/all/forwarding
ipv6 dev lo forwarding off
ipv6 dev eth0 forwarding off
ipv6 all forwarding off
$ echo 1 > /proc/sys/net/ipv4/conf/eth0/forwarding
ipv4 dev eth0 forwarding on

$ ip -6 netconf
ipv6 all forwarding on mc_forwarding 0
$ ip netconf show dev eth0
ipv4 dev eth0 forwarding on rp_filter off mc_forwarding 1

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/libnetlink.h      |   1 +
 include/linux/netconf.h   |  24 ++++++
 include/linux/rtnetlink.h |   9 +++
 ip/Makefile               |   2 +-
 ip/ip.c                   |   1 +
 ip/ip_common.h            |   3 +
 ip/ipmonitor.c            |  16 ++++
 ip/ipnetconf.c            | 183 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 238 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/netconf.h
 create mode 100644 ip/ipnetconf.c

diff --git a/include/libnetlink.h b/include/libnetlink.h
index 81649af..4a6b878 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -8,6 +8,7 @@
 #include <linux/if_link.h>
 #include <linux/if_addr.h>
 #include <linux/neighbour.h>
+#include <linux/netconf.h>
 
 struct rtnl_handle
 {
diff --git a/include/linux/netconf.h b/include/linux/netconf.h
new file mode 100644
index 0000000..64804a7
--- /dev/null
+++ b/include/linux/netconf.h
@@ -0,0 +1,24 @@
+#ifndef _UAPI_LINUX_NETCONF_H_
+#define _UAPI_LINUX_NETCONF_H_
+
+#include <linux/types.h>
+#include <linux/netlink.h>
+
+struct netconfmsg {
+	__u8	ncm_family;
+};
+
+enum {
+	NETCONFA_UNSPEC,
+	NETCONFA_IFINDEX,
+	NETCONFA_FORWARDING,
+	NETCONFA_RP_FILTER,
+	NETCONFA_MC_FORWARDING,
+	__NETCONFA_MAX
+};
+#define NETCONFA_MAX	(__NETCONFA_MAX - 1)
+
+#define NETCONFA_IFINDEX_ALL		-1
+#define NETCONFA_IFINDEX_DEFAULT	-2
+
+#endif /* _UAPI_LINUX_NETCONF_H_ */
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0e3e0c1..a30530e 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -120,6 +120,11 @@ enum {
 	RTM_SETDCB,
 #define RTM_SETDCB RTM_SETDCB
 
+	RTM_NEWNETCONF = 80,
+#define RTM_NEWNETCONF RTM_NEWNETCONF
+	RTM_GETNETCONF = 82,
+#define RTM_GETNETCONF RTM_GETNETCONF
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -585,6 +590,10 @@ enum rtnetlink_groups {
 #define RTNLGRP_PHONET_ROUTE	RTNLGRP_PHONET_ROUTE
 	RTNLGRP_DCB,
 #define RTNLGRP_DCB		RTNLGRP_DCB
+	RTNLGRP_IPV4_NETCONF,
+#define RTNLGRP_IPV4_NETCONF	RTNLGRP_IPV4_NETCONF
+	RTNLGRP_IPV6_NETCONF,
+#define RTNLGRP_IPV6_NETCONF	RTNLGRP_IPV6_NETCONF
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/ip/Makefile b/ip/Makefile
index 1676f0f..4bc33d7 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -4,7 +4,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
-    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o
+    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/ip.c b/ip/ip.c
index e0f7e60..632d271 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -85,6 +85,7 @@ static const struct cmd {
 	{ "mroute",	do_multiroute },
 	{ "mrule",	do_multirule },
 	{ "netns",	do_netns },
+	{ "netconf",	do_ipnetconf },
 	{ "help",	do_help },
 	{ 0 }
 };
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 2fd66b7..a394669 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -25,6 +25,8 @@ extern int print_prefix(const struct sockaddr_nl *who,
 			struct nlmsghdr *n, void *arg);
 extern int print_rule(const struct sockaddr_nl *who,
 		      struct nlmsghdr *n, void *arg);
+extern int print_netconf(const struct sockaddr_nl *who,
+			 struct nlmsghdr *n, void *arg);
 extern int do_ipaddr(int argc, char **argv);
 extern int do_ipaddrlabel(int argc, char **argv);
 extern int do_iproute(int argc, char **argv);
@@ -43,6 +45,7 @@ extern int do_netns(int argc, char **argv);
 extern int do_xfrm(int argc, char **argv);
 extern int do_ipl2tp(int argc, char **argv);
 extern int do_tcp_metrics(int argc, char **argv);
+extern int do_ipnetconf(int argc, char **argv);
 
 static inline int rtm_get_table(struct rtmsg *r, struct rtattr **tb)
 {
diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index 4b1d469..d87e58f 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -85,6 +85,12 @@ int accept_msg(const struct sockaddr_nl *who,
 		print_rule(who, n, arg);
 		return 0;
 	}
+	if (n->nlmsg_type == RTM_NEWNETCONF) {
+		if (prefix_banner)
+			fprintf(fp, "[NETCONF]");
+		print_netconf(who, n, arg);
+		return 0;
+	}
 	if (n->nlmsg_type == 15) {
 		char *tstr;
 		time_t secs = ((__u32*)NLMSG_DATA(n))[0];
@@ -118,6 +124,7 @@ int do_ipmonitor(int argc, char **argv)
 	int lroute=0;
 	int lprefix=0;
 	int lneigh=0;
+	int lnetconf=0;
 
 	rtnl_close(&rth);
 	ipaddr_reset_filter(1);
@@ -143,6 +150,9 @@ int do_ipmonitor(int argc, char **argv)
 		} else if (matches(*argv, "neigh") == 0) {
 			lneigh = 1;
 			groups = 0;
+		} else if (matches(*argv, "netconf") == 0) {
+			lnetconf = 1;
+			groups = 0;
 		} else if (strcmp(*argv, "all") == 0) {
 			groups = ~RTMGRP_TC;
 			prefix_banner=1;
@@ -176,6 +186,12 @@ int do_ipmonitor(int argc, char **argv)
 	if (lneigh) {
 		groups |= nl_mgrp(RTNLGRP_NEIGH);
 	}
+	if (lnetconf) {
+		if (!preferred_family || preferred_family == AF_INET)
+			groups |= nl_mgrp(RTNLGRP_IPV4_NETCONF);
+		if (!preferred_family || preferred_family == AF_INET6)
+			groups |= nl_mgrp(RTNLGRP_IPV6_NETCONF);
+	}
 	if (file) {
 		FILE *fp;
 		fp = fopen(file, "r");
diff --git a/ip/ipnetconf.c b/ip/ipnetconf.c
new file mode 100644
index 0000000..66d667b
--- /dev/null
+++ b/ip/ipnetconf.c
@@ -0,0 +1,183 @@
+/*
+ * ipnetconf.c		"ip netconf".
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nicolas Dichtel, <nicolas.dichtel@6wind.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+static struct
+{
+	int family;
+	int ifindex;
+} filter;
+
+static void usage(void) __attribute__((noreturn));
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip netconf show [ dev STRING ]\n");
+	exit(-1);
+}
+
+#define NETCONF_RTA(r)	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct netconfmsg))))
+
+int print_netconf(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct netconfmsg *ncm = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr *tb[NETCONFA_MAX+1];
+
+	if (n->nlmsg_type == NLMSG_ERROR)
+		return -1;
+	if (n->nlmsg_type != RTM_NEWNETCONF) {
+		fprintf(stderr, "Not RTM_NEWNETCONF: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return -1;
+	}
+	len -= NLMSG_SPACE(sizeof(*ncm));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	if (filter.family && filter.family != ncm->ncm_family)
+		return 0;
+
+	parse_rtattr(tb, NETCONFA_MAX, NETCONF_RTA(ncm),
+		     NLMSG_PAYLOAD(n, sizeof(*ncm)));
+
+	switch (ncm->ncm_family) {
+	case AF_INET:
+		fprintf(fp, "ipv4 ");
+		break;
+	case AF_INET6:
+		fprintf(fp, "ipv6 ");
+		break;
+	default:
+		fprintf(fp, "unknown ");
+		break;
+	}
+
+	if (tb[NETCONFA_IFINDEX]) {
+		int *ifindex = (int *)RTA_DATA(tb[NETCONFA_IFINDEX]);
+
+		switch (*ifindex) {
+		case NETCONFA_IFINDEX_ALL:
+			fprintf(fp, "all ");
+			break;
+		case NETCONFA_IFINDEX_DEFAULT:
+			fprintf(fp, "default ");
+			break;
+		default:
+			fprintf(fp, "dev %s ", ll_index_to_name(*ifindex));
+			break;
+		}
+	}
+
+	if (tb[NETCONFA_FORWARDING])
+		fprintf(fp, "forwarding %s ",
+			*(int *)RTA_DATA(tb[NETCONFA_FORWARDING])?"on":"off");
+	if (tb[NETCONFA_RP_FILTER]) {
+		int rp_filter = *(int *)RTA_DATA(tb[NETCONFA_RP_FILTER]);
+
+		if (rp_filter == 0)
+			fprintf(fp, "rp_filter off ");
+		else if (rp_filter == 1)
+			fprintf(fp, "rp_filter strict ");
+		else if (rp_filter == 2)
+			fprintf(fp, "rp_filter loose ");
+		else
+			fprintf(fp, "rp_filter unknown mode ");
+	}
+	if (tb[NETCONFA_MC_FORWARDING])
+		fprintf(fp, "mc_forwarding %d ",
+			*(int *)RTA_DATA(tb[NETCONFA_MC_FORWARDING]));
+
+	fprintf(fp, "\n");
+	fflush(fp);
+	return 0;
+}
+
+void ipnetconf_reset_filter(void)
+{
+	memset(&filter, 0, sizeof(filter));
+}
+
+int do_show(int argc, char **argv)
+{
+	struct {
+		struct nlmsghdr		n;
+		struct netconfmsg	ncm;
+		char			buf[1024];
+	} req;
+
+	ipnetconf_reset_filter();
+	filter.family = preferred_family;
+	if (filter.family == AF_UNSPEC)
+		filter.family = AF_INET;
+	filter.ifindex = NETCONFA_IFINDEX_ALL;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			filter.ifindex = ll_name_to_index(*argv);
+			if (filter.ifindex <= 0) {
+				fprintf(stderr, "Device \"%s\" does not exist.\n",
+					*argv);
+				return -1;
+			}
+		}
+		argv++; argc--;
+	}
+
+	ll_init_map(&rth);
+	memset(&req, 0, sizeof(req));
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct netconfmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+	req.n.nlmsg_type = RTM_GETNETCONF;
+	req.ncm.ncm_family = filter.family;
+	addattr_l(&req.n, sizeof(req), NETCONFA_IFINDEX, &filter.ifindex,
+		  sizeof(filter.ifindex));
+
+	rtnl_send(&rth, &req.n, req.n.nlmsg_len);
+	rtnl_listen(&rth, print_netconf, stdout);
+
+	return 0;
+}
+
+int do_ipnetconf(int argc, char **argv)
+{
+	if (argc > 0) {
+		if (matches(*argv, "show") == 0 ||
+		    matches(*argv, "lst") == 0 ||
+		    matches(*argv, "list") == 0)
+			return do_show(argc-1, argv+1);
+		if (matches(*argv, "help") == 0)
+			usage();
+	} else
+		return do_show(0, NULL);
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip netconf help\".\n", *argv);
+	exit(-1);
+}
-- 
1.8.0.1

^ permalink raw reply related

* [PATCH iproute2 2/3] ip: add support of 'ip link type ip6tnl'
From: Nicolas Dichtel @ 2012-12-12  9:51 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Nicolas Dichtel
In-Reply-To: <1355305907-7102-1-git-send-email-nicolas.dichtel@6wind.com>

This patch allows to manage ip6 tunnels via the interface ip link.
The syntax for parameters is the same that 'ip -6 tunnel'.

It also allows to display tunnels parameters with 'ip -details link' or
'ip -details monitor link'.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/linux/if_tunnel.h |  20 +++
 ip/Makefile               |   2 +-
 ip/iplink.c               |   3 +-
 ip/link_ip6tnl.c          | 344 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 367 insertions(+), 2 deletions(-)
 create mode 100644 ip/link_ip6tnl.c

diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 5db5942..aee73d0 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -37,6 +37,26 @@ struct ip_tunnel_parm {
 	struct iphdr		iph;
 };
 
+enum {
+	IFLA_IPTUN_UNSPEC,
+	IFLA_IPTUN_LINK,
+	IFLA_IPTUN_LOCAL,
+	IFLA_IPTUN_REMOTE,
+	IFLA_IPTUN_TTL,
+	IFLA_IPTUN_TOS,
+	IFLA_IPTUN_ENCAP_LIMIT,
+	IFLA_IPTUN_FLOWINFO,
+	IFLA_IPTUN_FLAGS,
+	IFLA_IPTUN_PROTO,
+	IFLA_IPTUN_PMTUDISC,
+	IFLA_IPTUN_6RD_PREFIX,
+	IFLA_IPTUN_6RD_RELAY_PREFIX,
+	IFLA_IPTUN_6RD_PREFIXLEN,
+	IFLA_IPTUN_6RD_RELAY_PREFIXLEN,
+	__IFLA_IPTUN_MAX,
+};
+#define IFLA_IPTUN_MAX	(__IFLA_IPTUN_MAX - 1)
+
 /* SIT-mode i_flags */
 #define	SIT_ISATAP	0x0001
 
diff --git a/ip/Makefile b/ip/Makefile
index 4bc33d7..abf54bf 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -4,7 +4,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
-    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o
+    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink.c b/ip/iplink.c
index 7451aa0..8aac9fc 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -83,7 +83,8 @@ void iplink_usage(void)
 
 	if (iplink_have_newlink()) {
 		fprintf(stderr, "\n");
-		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can | bridge | ipoib }\n");
+		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can |\n");
+		fprintf(stderr, "          bridge | ipoib | ip6tnl }\n");
 	}
 	exit(-1);
 }
diff --git a/ip/link_ip6tnl.c b/ip/link_ip6tnl.c
new file mode 100644
index 0000000..2947364
--- /dev/null
+++ b/ip/link_ip6tnl.c
@@ -0,0 +1,344 @@
+/*
+ * link_ip6tnl.c	ip6tnl driver module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ */
+
+#include <string.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/ip6_tunnel.h>
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+#include "tunnel.h"
+
+#define IP6_FLOWINFO_TCLASS	htonl(0x0FF00000)
+#define IP6_FLOWINFO_FLOWLABEL	htonl(0x000FFFFF)
+
+#define DEFAULT_TNL_HOP_LIMIT	(64)
+
+static void usage(void) __attribute__((noreturn));
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip link { add | set | change | replace | del } NAME\n");
+	fprintf(stderr, "          type ip6tnl [ remote ADDR ] [ local ADDR ]\n");
+	fprintf(stderr, "          [ dev PHYS_DEV ] [ encaplimit ELIM ]\n");
+	fprintf(stderr ,"          [ hoplimit HLIM ] [ tclass TCLASS ] [ flowlabel FLOWLABEL ]\n");
+	fprintf(stderr, "          [ dscp inherit ] [ fwmark inherit ]\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where: NAME      := STRING\n");
+	fprintf(stderr, "       ADDR      := IPV6_ADDRESS\n");
+	fprintf(stderr, "       ELIM      := { none | 0..255 }(default=%d)\n",
+		IPV6_DEFAULT_TNL_ENCAP_LIMIT);
+	fprintf(stderr, "       HLIM      := 0..255 (default=%d)\n",
+		DEFAULT_TNL_HOP_LIMIT);
+	fprintf(stderr, "       TCLASS    := { 0x0..0xff | inherit }\n");
+	fprintf(stderr, "       FLOWLABEL := { 0x0..0xfffff | inherit }\n");
+	exit(-1);
+}
+
+static int ip6tunnel_parse_opt(struct link_util *lu, int argc, char **argv,
+			       struct nlmsghdr *n)
+{
+	struct {
+		struct nlmsghdr n;
+		struct ifinfomsg i;
+		char buf[2048];
+	} req;
+	struct ifinfomsg *ifi = (struct ifinfomsg *)(n + 1);
+	struct rtattr *tb[IFLA_MAX + 1];
+	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
+	struct rtattr *iptuninfo[IFLA_IPTUN_MAX + 1];
+	int len;
+	struct in6_addr laddr;
+	struct in6_addr raddr;
+	__u8 hop_limit = DEFAULT_TNL_HOP_LIMIT;
+	__u8 encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
+	__u32 flowinfo = 0;
+	__u32 flags = 0;
+	__u32 link = 0;
+	__u8 proto = 0;
+
+	memset(&laddr, 0, sizeof(laddr));
+	memset(&raddr, 0, sizeof(raddr));
+
+	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi));
+		req.n.nlmsg_flags = NLM_F_REQUEST;
+		req.n.nlmsg_type = RTM_GETLINK;
+		req.i.ifi_family = preferred_family;
+		req.i.ifi_index = ifi->ifi_index;
+
+		if (rtnl_talk(&rth, &req.n, 0, 0, &req.n) < 0) {
+get_failed:
+			fprintf(stderr,
+				"Failed to get existing tunnel info.\n");
+			return -1;
+		}
+
+		len = req.n.nlmsg_len;
+		len -= NLMSG_LENGTH(sizeof(*ifi));
+		if (len < 0)
+			goto get_failed;
+
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+
+		if (!tb[IFLA_LINKINFO])
+			goto get_failed;
+
+		parse_rtattr_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
+
+		if (!linkinfo[IFLA_INFO_DATA])
+			goto get_failed;
+
+		parse_rtattr_nested(iptuninfo, IFLA_IPTUN_MAX,
+				    linkinfo[IFLA_INFO_DATA]);
+
+		if (iptuninfo[IFLA_IPTUN_LOCAL])
+			memcpy(&laddr, RTA_DATA(iptuninfo[IFLA_IPTUN_LOCAL]),
+			       sizeof(laddr));
+
+		if (iptuninfo[IFLA_IPTUN_REMOTE])
+			memcpy(&raddr, RTA_DATA(iptuninfo[IFLA_IPTUN_REMOTE]),
+			       sizeof(raddr));
+
+		if (iptuninfo[IFLA_IPTUN_TTL])
+			hop_limit = rta_getattr_u8(iptuninfo[IFLA_IPTUN_TTL]);
+
+		if (iptuninfo[IFLA_IPTUN_ENCAP_LIMIT])
+			encap_limit = rta_getattr_u8(iptuninfo[IFLA_IPTUN_ENCAP_LIMIT]);
+
+		if (iptuninfo[IFLA_IPTUN_FLOWINFO])
+			flowinfo = rta_getattr_u32(iptuninfo[IFLA_IPTUN_FLOWINFO]);
+
+		if (iptuninfo[IFLA_IPTUN_FLAGS])
+			flags = rta_getattr_u32(iptuninfo[IFLA_IPTUN_FLAGS]);
+
+		if (iptuninfo[IFLA_IPTUN_LINK])
+			link = rta_getattr_u32(iptuninfo[IFLA_IPTUN_LINK]);
+
+		if (iptuninfo[IFLA_IPTUN_PROTO])
+			proto = rta_getattr_u8(iptuninfo[IFLA_IPTUN_PROTO]);
+	}
+
+	while (argc > 0) {
+		if (matches(*argv, "mode") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "ipv6/ipv6") == 0 ||
+			    strcmp(*argv, "ip6ip6") == 0)
+				proto = IPPROTO_IPV6;
+			else if (strcmp(*argv, "ip/ipv6") == 0 ||
+				 strcmp(*argv, "ipv4/ipv6") == 0 ||
+				 strcmp(*argv, "ipip6") == 0 ||
+				 strcmp(*argv, "ip4ip6") == 0)
+				proto = IPPROTO_IPIP;
+			else if (strcmp(*argv, "any/ipv6") == 0 ||
+				 strcmp(*argv, "any") == 0)
+				proto = 0;
+			else
+				invarg("Cannot guess tunnel mode.", *argv);
+		} else if (strcmp(*argv, "remote") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			get_prefix(&addr, *argv, preferred_family);
+			if (addr.family == AF_UNSPEC)
+				invarg("\"remote\" address family is AF_UNSPEC", *argv);
+			memcpy(&raddr, addr.data, addr.bytelen);
+		} else if (strcmp(*argv, "local") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			get_prefix(&addr, *argv, preferred_family);
+			if (addr.family == AF_UNSPEC)
+				invarg("\"local\" address family is AF_UNSPEC", *argv);
+			memcpy(&laddr, addr.data, addr.bytelen);
+		} else if (matches(*argv, "dev") == 0) {
+			NEXT_ARG();
+			link = if_nametoindex(*argv);
+			if (link == 0)
+				invarg("\"dev\" is invalid", *argv);
+		} else if (strcmp(*argv, "hoplimit") == 0 ||
+			   strcmp(*argv, "ttl") == 0 ||
+			   strcmp(*argv, "hlim") == 0) {
+			__u8 uval;
+			NEXT_ARG();
+			if (get_u8(&uval, *argv, 0))
+				invarg("invalid HLIM", *argv);
+			hop_limit = uval;
+		} else if (matches(*argv, "encaplimit") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "none") == 0) {
+				flags |= IP6_TNL_F_IGN_ENCAP_LIMIT;
+			} else {
+				__u8 uval;
+				if (get_u8(&uval, *argv, 0) < -1)
+					invarg("invalid ELIM", *argv);
+				encap_limit = uval;
+				flags &= ~IP6_TNL_F_IGN_ENCAP_LIMIT;
+			}
+		} else if (strcmp(*argv, "tclass") == 0 ||
+			   strcmp(*argv, "tc") == 0 ||
+			   strcmp(*argv, "tos") == 0 ||
+			   matches(*argv, "dsfield") == 0) {
+			__u8 uval;
+			NEXT_ARG();
+			flowinfo &= ~IP6_FLOWINFO_TCLASS;
+			if (strcmp(*argv, "inherit") == 0)
+				flags |= IP6_TNL_F_USE_ORIG_TCLASS;
+			else {
+				if (get_u8(&uval, *argv, 16))
+					invarg("invalid TClass", *argv);
+				flowinfo |= htonl((__u32)uval << 20) & IP6_FLOWINFO_TCLASS;
+				flags &= ~IP6_TNL_F_USE_ORIG_TCLASS;
+			}
+		} else if (strcmp(*argv, "flowlabel") == 0 ||
+			   strcmp(*argv, "fl") == 0) {
+			__u32 uval;
+			NEXT_ARG();
+			flowinfo &= ~IP6_FLOWINFO_FLOWLABEL;
+			if (strcmp(*argv, "inherit") == 0)
+				flags |= IP6_TNL_F_USE_ORIG_FLOWLABEL;
+			else {
+				if (get_u32(&uval, *argv, 16))
+					invarg("invalid Flowlabel", *argv);
+				if (uval > 0xFFFFF)
+					invarg("invalid Flowlabel", *argv);
+				flowinfo |= htonl(uval) & IP6_FLOWINFO_FLOWLABEL;
+				flags &= ~IP6_TNL_F_USE_ORIG_FLOWLABEL;
+			}
+		} else if (strcmp(*argv, "dscp") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0)
+				invarg("not inherit", *argv);
+			flags |= IP6_TNL_F_RCV_DSCP_COPY;
+		} else if (strcmp(*argv, "fwmark") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0)
+				invarg("not inherit", *argv);
+			flags |= IP6_TNL_F_USE_ORIG_FWMARK;
+		} else
+			usage();
+		argc--, argv++;
+	}
+
+	addattr8(n, 1024, IFLA_IPTUN_PROTO, proto);
+	addattr_l(n, 1024, IFLA_IPTUN_LOCAL, &laddr, sizeof(laddr));
+	addattr_l(n, 1024, IFLA_IPTUN_REMOTE, &raddr, sizeof(raddr));
+	addattr8(n, 1024, IFLA_IPTUN_TTL, hop_limit);
+	addattr8(n, 1024, IFLA_IPTUN_ENCAP_LIMIT, encap_limit);
+	addattr32(n, 1024, IFLA_IPTUN_FLOWINFO, flowinfo);
+	addattr32(n, 1024, IFLA_IPTUN_FLAGS, flags);
+	addattr32(n, 1024, IFLA_IPTUN_LINK, link);
+
+	return 0;
+}
+
+static void ip6tunnel_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+	char s1[256];
+	char s2[64];
+	int flags = 0;
+	__u32 flowinfo = 0;
+
+	if (!tb)
+		return;
+
+	if (tb[IFLA_IPTUN_FLAGS])
+		flags = rta_getattr_u32(tb[IFLA_IPTUN_FLAGS]);
+
+	if (tb[IFLA_IPTUN_FLOWINFO])
+		flowinfo = rta_getattr_u32(tb[IFLA_IPTUN_FLOWINFO]);
+
+	if (tb[IFLA_IPTUN_PROTO]) {
+		switch (rta_getattr_u8(tb[IFLA_IPTUN_PROTO])) {
+		case IPPROTO_IPIP:
+			fprintf(f, "ipip6 ");
+			break;
+		case IPPROTO_IPV6:
+			fprintf(f, "ip6ip6 ");
+			break;
+		case 0:
+			fprintf(f, "any ");
+			break;
+		}
+	}
+
+	if (tb[IFLA_IPTUN_REMOTE]) {
+		fprintf(f, "remote %s ",
+			rt_addr_n2a(AF_INET6,
+				    RTA_PAYLOAD(tb[IFLA_IPTUN_REMOTE]),
+				    RTA_DATA(tb[IFLA_IPTUN_REMOTE]),
+				    s1, sizeof(s1)));
+	}
+
+	if (tb[IFLA_IPTUN_LOCAL]) {
+		fprintf(f, "local %s ",
+			rt_addr_n2a(AF_INET6,
+				    RTA_PAYLOAD(tb[IFLA_IPTUN_LOCAL]),
+				    RTA_DATA(tb[IFLA_IPTUN_LOCAL]),
+				    s1, sizeof(s1)));
+	}
+
+	if (tb[IFLA_IPTUN_LINK] && rta_getattr_u32(tb[IFLA_IPTUN_LINK])) {
+		unsigned link = rta_getattr_u32(tb[IFLA_IPTUN_LINK]);
+		const char *n = if_indextoname(link, s2);
+
+		if (n)
+			fprintf(f, "dev %s ", n);
+		else
+			fprintf(f, "dev %u ", link);
+	}
+
+	if (flags & IP6_TNL_F_IGN_ENCAP_LIMIT)
+		printf("encaplimit none ");
+	else if (tb[IFLA_IPTUN_ENCAP_LIMIT])
+		fprintf(f, "encaplimit %u ",
+			rta_getattr_u8(tb[IFLA_IPTUN_ENCAP_LIMIT]));
+
+	if (tb[IFLA_IPTUN_TTL])
+		fprintf(f, "hoplimit %u ", rta_getattr_u8(tb[IFLA_IPTUN_TTL]));
+
+	if (flags & IP6_TNL_F_USE_ORIG_TCLASS)
+		printf("tclass inherit ");
+	else if (tb[IFLA_IPTUN_FLOWINFO]) {
+		__u32 val = ntohl(flowinfo & IP6_FLOWINFO_TCLASS);
+
+		printf("tclass 0x%02x ", (__u8)(val >> 20));
+	}
+
+	if (flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+		printf("flowlabel inherit ");
+	else
+		printf("flowlabel 0x%05x ", ntohl(flowinfo & IP6_FLOWINFO_FLOWLABEL));
+
+	printf("(flowinfo 0x%08x) ", ntohl(flowinfo));
+
+	if (flags & IP6_TNL_F_RCV_DSCP_COPY)
+		printf("dscp inherit ");
+
+	if (flags & IP6_TNL_F_MIP6_DEV)
+		fprintf(f, "mip6 ");
+
+	if (flags & IP6_TNL_F_USE_ORIG_FWMARK)
+		fprintf(f, "fwmark inherit ");
+}
+
+struct link_util ip6tnl_link_util = {
+	.id = "ip6tnl",
+	.maxattr = IFLA_IPTUN_MAX,
+	.parse_opt = ip6tunnel_parse_opt,
+	.print_opt = ip6tunnel_print_opt,
+};
-- 
1.8.0.1

^ permalink raw reply related

* [PATCH iproute2 3/3] ip: add support of 'ip link type [ipip|sit]'
From: Nicolas Dichtel @ 2012-12-12  9:51 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Nicolas Dichtel
In-Reply-To: <1355305907-7102-1-git-send-email-nicolas.dichtel@6wind.com>

This patch allows to manage ip tunnels via the interface ip link.
The syntax for parameters is the same that 'ip tunnel'.

It also allows to display tunnels parameters with 'ip -details link' or
'ip -details monitor link'.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 ip/Makefile     |   3 +-
 ip/iplink.c     |   2 +-
 ip/link_iptnl.c | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 ip/link_iptnl.c

diff --git a/ip/Makefile b/ip/Makefile
index abf54bf..2b606d4 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -4,7 +4,8 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     ipxfrm.o xfrm_state.o xfrm_policy.o xfrm_monitor.o \
     iplink_vlan.o link_veth.o link_gre.o iplink_can.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o \
-    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o
+    iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
+    link_iptnl.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink.c b/ip/iplink.c
index 8aac9fc..d73c705 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -84,7 +84,7 @@ void iplink_usage(void)
 	if (iplink_have_newlink()) {
 		fprintf(stderr, "\n");
 		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can |\n");
-		fprintf(stderr, "          bridge | ipoib | ip6tnl }\n");
+		fprintf(stderr, "          bridge | ipoib | ip6tnl | ipip | sit }\n");
 	}
 	exit(-1);
 }
diff --git a/ip/link_iptnl.c b/ip/link_iptnl.c
new file mode 100644
index 0000000..238722d
--- /dev/null
+++ b/ip/link_iptnl.c
@@ -0,0 +1,340 @@
+/*
+ * link_iptnl.c	ipip and sit driver module
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Nicolas Dichtel <nicolas.dichtel@6wind.com>
+ *
+ */
+
+#include <string.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+#include "tunnel.h"
+
+static void usage(int sit) __attribute__((noreturn));
+static void usage(int sit)
+{
+	fprintf(stderr, "Usage: ip link { add | set | change | replace | del } NAME\n");
+	fprintf(stderr, "          type { ipip | sit } [ remote ADDR ] [ local ADDR ]\n");
+	fprintf(stderr, "          [ ttl TTL ] [ tos TOS ] [ [no]pmtudisc ] [ dev PHYS_DEV ]\n");
+	fprintf(stderr, "          [ 6rd-prefix ADDR ] [ 6rd-relay_prefix ADDR ] [ 6rd-reset ]\n");
+	if (sit)
+		fprintf(stderr, "          [ isatap ]\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where: NAME := STRING\n");
+	fprintf(stderr, "       ADDR := { IP_ADDRESS | any }\n");
+	fprintf(stderr, "       TOS  := { NUMBER | inherit }\n");
+	fprintf(stderr, "       TTL  := { 1..255 | inherit }\n");
+	exit(-1);
+}
+
+static int iptunnel_parse_opt(struct link_util *lu, int argc, char **argv,
+			      struct nlmsghdr *n)
+{
+	struct {
+		struct nlmsghdr n;
+		struct ifinfomsg i;
+		char buf[2048];
+	} req;
+	struct ifinfomsg *ifi = (struct ifinfomsg *)(n + 1);
+	struct rtattr *tb[IFLA_MAX + 1];
+	struct rtattr *linkinfo[IFLA_INFO_MAX+1];
+	struct rtattr *iptuninfo[IFLA_IPTUN_MAX + 1];
+	int len;
+	__u32 link = 0;
+	__u32 laddr = 0;
+	__u32 raddr = 0;
+	__u8 ttl = 0;
+	__u8 tos = 0;
+	__u8 pmtudisc = 1;
+	__u16 iflags = 0;
+	struct in6_addr ip6rdprefix;
+	__u16 ip6rdprefixlen = 0;
+	__u32 ip6rdrelayprefix = 0;
+	__u16 ip6rdrelayprefixlen = 0;
+
+	memset(&ip6rdprefix, 0, sizeof(ip6rdprefix));
+
+	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(*ifi));
+		req.n.nlmsg_flags = NLM_F_REQUEST;
+		req.n.nlmsg_type = RTM_GETLINK;
+		req.i.ifi_family = preferred_family;
+		req.i.ifi_index = ifi->ifi_index;
+
+		if (rtnl_talk(&rth, &req.n, 0, 0, &req.n) < 0) {
+get_failed:
+			fprintf(stderr,
+				"Failed to get existing tunnel info.\n");
+			return -1;
+		}
+
+		len = req.n.nlmsg_len;
+		len -= NLMSG_LENGTH(sizeof(*ifi));
+		if (len < 0)
+			goto get_failed;
+
+		parse_rtattr(tb, IFLA_MAX, IFLA_RTA(&req.i), len);
+
+		if (!tb[IFLA_LINKINFO])
+			goto get_failed;
+
+		parse_rtattr_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
+
+		if (!linkinfo[IFLA_INFO_DATA])
+			goto get_failed;
+
+		parse_rtattr_nested(iptuninfo, IFLA_IPTUN_MAX,
+				    linkinfo[IFLA_INFO_DATA]);
+
+		if (iptuninfo[IFLA_IPTUN_LOCAL])
+			laddr = rta_getattr_u32(iptuninfo[IFLA_IPTUN_LOCAL]);
+
+		if (iptuninfo[IFLA_IPTUN_REMOTE])
+			raddr = rta_getattr_u32(iptuninfo[IFLA_IPTUN_REMOTE]);
+
+		if (iptuninfo[IFLA_IPTUN_TTL])
+			ttl = rta_getattr_u8(iptuninfo[IFLA_IPTUN_TTL]);
+
+		if (iptuninfo[IFLA_IPTUN_TOS])
+			tos = rta_getattr_u8(iptuninfo[IFLA_IPTUN_TOS]);
+
+		if (iptuninfo[IFLA_IPTUN_PMTUDISC])
+			pmtudisc =
+				rta_getattr_u8(iptuninfo[IFLA_IPTUN_PMTUDISC]);
+
+		if (iptuninfo[IFLA_IPTUN_FLAGS])
+			iflags = rta_getattr_u16(iptuninfo[IFLA_IPTUN_FLAGS]);
+
+		if (iptuninfo[IFLA_IPTUN_LINK])
+			link = rta_getattr_u32(iptuninfo[IFLA_IPTUN_LINK]);
+
+		if (iptuninfo[IFLA_IPTUN_6RD_PREFIX])
+			memcpy(&ip6rdprefix,
+			       RTA_DATA(iptuninfo[IFLA_IPTUN_6RD_PREFIX]),
+			       sizeof(laddr));
+
+		if (iptuninfo[IFLA_IPTUN_6RD_PREFIXLEN])
+			ip6rdprefixlen =
+				rta_getattr_u16(iptuninfo[IFLA_IPTUN_6RD_PREFIXLEN]);
+
+		if (iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIX])
+			ip6rdrelayprefix =
+				rta_getattr_u32(iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIX]);
+
+		if (iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIXLEN])
+			ip6rdrelayprefixlen =
+				rta_getattr_u16(iptuninfo[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
+	}
+
+	while (argc > 0) {
+		if (strcmp(*argv, "remote") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "any"))
+				raddr = get_addr32(*argv);
+			else
+				raddr = 0;
+		} else if (strcmp(*argv, "local") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "any"))
+				laddr = get_addr32(*argv);
+			else
+				laddr = 0;
+		} else if (matches(*argv, "dev") == 0) {
+			NEXT_ARG();
+			link = if_nametoindex(*argv);
+			if (link == 0)
+				invarg("\"dev\" is invalid", *argv);
+		} else if (strcmp(*argv, "ttl") == 0 ||
+			   strcmp(*argv, "hoplimit") == 0) {
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0) {
+				if (get_u8(&ttl, *argv, 0))
+					invarg("invalid TTL\n", *argv);
+			} else
+				ttl = 0;
+		} else if (strcmp(*argv, "tos") == 0 ||
+			   strcmp(*argv, "tclass") == 0 ||
+			   matches(*argv, "dsfield") == 0) {
+			__u32 uval;
+			NEXT_ARG();
+			if (strcmp(*argv, "inherit") != 0) {
+				if (rtnl_dsfield_a2n(&uval, *argv))
+					invarg("bad TOS value", *argv);
+				tos = uval;
+			} else
+				tos = 1;
+		} else if (strcmp(*argv, "nopmtudisc") == 0) {
+			pmtudisc = 0;
+		} else if (strcmp(*argv, "pmtudisc") == 0) {
+			pmtudisc = 1;
+		} else if (strcmp(lu->id, "sit") == 0 &&
+			   strcmp(*argv, "isatap") == 0) {
+			iflags |= SIT_ISATAP;
+		} else if (strcmp(*argv, "6rd-prefix") == 0) {
+			inet_prefix prefix;
+			NEXT_ARG();
+			if (get_prefix(&prefix, *argv, AF_INET6))
+				invarg("invalid 6rd_prefix\n", *argv);
+			memcpy(&ip6rdprefix, prefix.data, 16);
+			ip6rdprefixlen = prefix.bitlen;
+		} else if (strcmp(*argv, "6rd-relay_prefix") == 0) {
+			inet_prefix prefix;
+			NEXT_ARG();
+			if (get_prefix(&prefix, *argv, AF_INET))
+				invarg("invalid 6rd-relay_prefix\n", *argv);
+			memcpy(&ip6rdrelayprefix, prefix.data, 4);
+			ip6rdrelayprefixlen = prefix.bitlen;
+		} else if (strcmp(*argv, "6rd-reset") == 0) {
+			inet_prefix prefix;
+			get_prefix(&prefix, "2002::", AF_INET6);
+			memcpy(&ip6rdprefix, prefix.data, 16);
+			ip6rdprefixlen = 16;
+			ip6rdrelayprefix = 0;
+			ip6rdrelayprefixlen = 0;
+		} else
+			usage(strcmp(lu->id, "sit") == 0);
+		argc--, argv++;
+	}
+
+	if (ttl && pmtudisc == 0) {
+		fprintf(stderr, "ttl != 0 and noptmudisc are incompatible\n");
+		exit(-1);
+	}
+
+	addattr32(n, 1024, IFLA_IPTUN_LINK, link);
+	addattr32(n, 1024, IFLA_IPTUN_LOCAL, laddr);
+	addattr32(n, 1024, IFLA_IPTUN_REMOTE, raddr);
+	addattr8(n, 1024, IFLA_IPTUN_TTL, ttl);
+	addattr8(n, 1024, IFLA_IPTUN_TOS, tos);
+	addattr8(n, 1024, IFLA_IPTUN_PMTUDISC, pmtudisc);
+	if (strcmp(lu->id, "sit") == 0) {
+		addattr16(n, 1024, IFLA_IPTUN_FLAGS, iflags);
+		if (ip6rdprefixlen) {
+			addattr_l(n, 1024, IFLA_IPTUN_6RD_PREFIX,
+				  &ip6rdprefix, sizeof(ip6rdprefix));
+			addattr16(n, 1024, IFLA_IPTUN_6RD_PREFIXLEN,
+				  ip6rdprefixlen);
+			addattr32(n, 1024, IFLA_IPTUN_6RD_RELAY_PREFIX,
+				  ip6rdrelayprefix);
+			addattr16(n, 1024, IFLA_IPTUN_6RD_RELAY_PREFIXLEN,
+				  ip6rdrelayprefixlen);
+		}
+	}
+
+	return 0;
+}
+
+static void iptunnel_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+	char s1[1024];
+	char s2[64];
+	const char *local = "any";
+	const char *remote = "any";
+
+	if (!tb)
+		return;
+
+	if (tb[IFLA_IPTUN_REMOTE]) {
+		unsigned addr = rta_getattr_u32(tb[IFLA_IPTUN_REMOTE]);
+
+		if (addr)
+			remote = format_host(AF_INET, 4, &addr, s1, sizeof(s1));
+	}
+
+	fprintf(f, "remote %s ", remote);
+
+	if (tb[IFLA_IPTUN_LOCAL]) {
+		unsigned addr = rta_getattr_u32(tb[IFLA_IPTUN_LOCAL]);
+
+		if (addr)
+			local = format_host(AF_INET, 4, &addr, s1, sizeof(s1));
+	}
+
+	fprintf(f, "local %s ", local);
+
+	if (tb[IFLA_IPTUN_LINK] && rta_getattr_u32(tb[IFLA_IPTUN_LINK])) {
+		unsigned link = rta_getattr_u32(tb[IFLA_IPTUN_LINK]);
+		const char *n = if_indextoname(link, s2);
+
+		if (n)
+			fprintf(f, "dev %s ", n);
+		else
+			fprintf(f, "dev %u ", link);
+	}
+
+	if (tb[IFLA_IPTUN_TTL] && rta_getattr_u8(tb[IFLA_IPTUN_TTL]))
+		fprintf(f, "ttl %d ", rta_getattr_u8(tb[IFLA_IPTUN_TTL]));
+	else
+		fprintf(f, "ttl inherit ");
+
+	if (tb[IFLA_IPTUN_TOS] && rta_getattr_u8(tb[IFLA_IPTUN_TOS])) {
+		int tos = rta_getattr_u8(tb[IFLA_IPTUN_TOS]);
+
+		fputs("tos ", f);
+		if (tos == 1)
+			fputs("inherit ", f);
+		else
+			fprintf(f, "0x%x ", tos);
+	}
+
+	if (tb[IFLA_IPTUN_PMTUDISC] && rta_getattr_u8(tb[IFLA_IPTUN_PMTUDISC]))
+		fprintf(f, "pmtudisc ");
+	else
+		fprintf(f, "nopmtudisc ");
+
+	if (tb[IFLA_IPTUN_FLAGS]) {
+	       __u16 iflags = rta_getattr_u16(tb[IFLA_IPTUN_FLAGS]);
+
+	      if (iflags & SIT_ISATAP)
+		      fprintf(f, "isatap ");
+	}
+
+	if (tb[IFLA_IPTUN_6RD_PREFIXLEN] &&
+	    *(__u16 *)RTA_DATA(tb[IFLA_IPTUN_6RD_PREFIXLEN])) {
+		__u16 prefixlen = rta_getattr_u16(tb[IFLA_IPTUN_6RD_PREFIXLEN]);
+		__u16 relayprefixlen =
+			rta_getattr_u16(tb[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
+		__u32 relayprefix =
+			rta_getattr_u32(tb[IFLA_IPTUN_6RD_RELAY_PREFIX]);
+
+		printf("6rd-prefix %s/%u ",
+		       inet_ntop(AF_INET6, RTA_DATA(tb[IFLA_IPTUN_6RD_PREFIX]),
+			         s1, sizeof(s1)),
+		       prefixlen);
+		if (relayprefix) {
+			printf("6rd-relay_prefix %s/%u ",
+			       format_host(AF_INET, 4, &relayprefix, s1,
+				           sizeof(s1)),
+			       relayprefixlen);
+		}
+	}
+}
+
+struct link_util ipip_link_util = {
+	.id = "ipip",
+	.maxattr = IFLA_IPTUN_MAX,
+	.parse_opt = iptunnel_parse_opt,
+	.print_opt = iptunnel_print_opt,
+};
+
+struct link_util sit_link_util = {
+	.id = "sit",
+	.maxattr = IFLA_IPTUN_MAX,
+	.parse_opt = iptunnel_parse_opt,
+	.print_opt = iptunnel_print_opt,
+};
-- 
1.8.0.1

^ permalink raw reply related

* Re: [PATCH] net: filter: return -EINVAL if BPF_S_ANC* operation is not supported
From: Daniel Borkmann @ 2012-12-12  9:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ani Sinha, Eric Dumazet
In-Reply-To: <1355304701-22228-1-git-send-email-dborkman@redhat.com>

On 12/12/2012 10:31 AM, Daniel Borkmann wrote:
> Currently, we return -EINVAL for malicious or wrong BPF filters.
> However, this is not done for BPF_S_ANC* operations, which makes it
> more difficult to detect if it's actually supported or not by the
> BPF machine. Therefore, we should also return -EINVAL if K is within
> the SKF_AD_OFF universe and the ancillary operation did not match.
>
> Cc: Ani Sinha <ani@aristanetworks.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Daniel Borkmann <dborkman@redhat.com>

(Sorry, this is intended for net-next.)

^ permalink raw reply

* [PATCH iproute2] ip: use rtnelink to manage mroute
From: Nicolas Dichtel @ 2012-12-12  9:32 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Nicolas Dichtel

mroute was using /proc/net/ip_mr_[vif|cache] to display mroute entries. Hence,
only RT_TABLE_DEFAULT was displayed and only IPv4.
With rtnetlink, it is possible to display all tables for IPv4 and IPv6. The output
format is kept. Also, like before the patch, statistics are displayed when user specify
the '-s' argument.

The patch also adds the support of 'ip monitor mroute', which is now possible.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/linux/rtnetlink.h |   7 ++
 ip/ip_common.h            |   3 +
 ip/ipmonitor.c            |  35 +++++-
 ip/ipmroute.c             | 296 ++++++++++++++++++++++++++++------------------
 4 files changed, 220 insertions(+), 121 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0e3e0c1..e0595dc 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -283,6 +283,7 @@ enum rtattr_type_t {
 	RTA_MP_ALGO, /* no longer used */
 	RTA_TABLE,
 	RTA_MARK,
+	RTA_MFC_STATS,
 	__RTA_MAX
 };
 
@@ -403,6 +404,12 @@ struct rta_session {
 	} u;
 };
 
+struct rta_mfc_stats {
+	__u64	mfcs_packets;
+	__u64	mfcs_bytes;
+	__u64	mfcs_wrong_if;
+};
+
 /****
  *		General form of address family dependent message.
  ****/
diff --git a/ip/ip_common.h b/ip/ip_common.h
index 2fd66b7..57653b5 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -16,11 +16,14 @@ extern int ipaddr_list_link(int argc, char **argv);
 extern int iproute_monitor(int argc, char **argv);
 extern void iplink_usage(void) __attribute__((noreturn));
 extern void iproute_reset_filter(void);
+extern void ipmroute_reset_filter(void);
 extern void ipaddr_reset_filter(int);
 extern void ipneigh_reset_filter(void);
 extern void ipntable_reset_filter(void);
 extern int print_route(const struct sockaddr_nl *who,
 		       struct nlmsghdr *n, void *arg);
+extern int print_mroute(const struct sockaddr_nl *who,
+			struct nlmsghdr *n, void *arg);
 extern int print_prefix(const struct sockaddr_nl *who,
 			struct nlmsghdr *n, void *arg);
 extern int print_rule(const struct sockaddr_nl *who,
diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index 4b1d469..39bfb8e 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c
@@ -43,10 +43,26 @@ int accept_msg(const struct sockaddr_nl *who,
 		print_timestamp(fp);
 
 	if (n->nlmsg_type == RTM_NEWROUTE || n->nlmsg_type == RTM_DELROUTE) {
-		if (prefix_banner)
-			fprintf(fp, "[ROUTE]");
-		print_route(who, n, arg);
-		return 0;
+		struct rtmsg *r = NLMSG_DATA(n);
+
+		if (n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)) < 0) {
+			fprintf(stderr, "BUG: wrong nlmsg len %d\n",
+				n->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
+			return -1;
+		}
+
+		if (r->rtm_family == RTNL_FAMILY_IPMR ||
+		    r->rtm_family == RTNL_FAMILY_IP6MR) {
+			if (prefix_banner)
+				fprintf(fp, "[MROUTE]");
+			print_mroute(who, n, arg);
+			return 0;
+		} else {
+			if (prefix_banner)
+				fprintf(fp, "[ROUTE]");
+			print_route(who, n, arg);
+			return 0;
+		}
 	}
 	if (n->nlmsg_type == RTM_NEWLINK || n->nlmsg_type == RTM_DELLINK) {
 		ll_remember_index(who, n, NULL);
@@ -116,12 +132,14 @@ int do_ipmonitor(int argc, char **argv)
 	int llink=0;
 	int laddr=0;
 	int lroute=0;
+	int lmroute=0;
 	int lprefix=0;
 	int lneigh=0;
 
 	rtnl_close(&rth);
 	ipaddr_reset_filter(1);
 	iproute_reset_filter();
+	ipmroute_reset_filter();
 	ipneigh_reset_filter();
 
 	while (argc > 0) {
@@ -137,6 +155,9 @@ int do_ipmonitor(int argc, char **argv)
 		} else if (matches(*argv, "route") == 0) {
 			lroute=1;
 			groups = 0;
+		} else if (matches(*argv, "mroute") == 0) {
+			lmroute=1;
+			groups = 0;
 		} else if (matches(*argv, "prefix") == 0) {
 			lprefix=1;
 			groups = 0;
@@ -169,6 +190,12 @@ int do_ipmonitor(int argc, char **argv)
 		if (!preferred_family || preferred_family == AF_INET6)
 			groups |= nl_mgrp(RTNLGRP_IPV6_ROUTE);
 	}
+	if (lmroute) {
+		if (!preferred_family || preferred_family == AF_INET)
+			groups |= nl_mgrp(RTNLGRP_IPV4_MROUTE);
+		if (!preferred_family || preferred_family == AF_INET6)
+			groups |= nl_mgrp(RTNLGRP_IPV6_MROUTE);
+	}
 	if (lprefix) {
 		if (!preferred_family || preferred_family == AF_INET6)
 			groups |= nl_mgrp(RTNLGRP_IPV6_PREFIX);
diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index 945727d..4c82c6e 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c
@@ -15,6 +15,7 @@
 #include <unistd.h>
 #include <syslog.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -26,167 +27,228 @@
 #include <linux/if_arp.h>
 #include <linux/sockios.h>
 
+#include <rt_names.h>
 #include "utils.h"
-
-char filter_dev[16];
-int  filter_family;
+#include "ip_common.h"
 
 static void usage(void) __attribute__((noreturn));
 
 static void usage(void)
 {
-	fprintf(stderr, "Usage: ip mroute show [ PREFIX ] [ from PREFIX ] [ iif DEVICE ]\n");
+	fprintf(stderr, "Usage: ip mroute show [ [ to ] PREFIX ] [ from PREFIX ] [ iif DEVICE ]\n");
 #if 0
 	fprintf(stderr, "Usage: ip mroute [ add | del ] DESTINATION from SOURCE [ iif DEVICE ] [ oif DEVICE ]\n");
 #endif
 	exit(-1);
 }
 
-static char *viftable[32];
-
 struct rtfilter
 {
+	int tb;
+	int af;
+	int iif;
 	inet_prefix mdst;
 	inet_prefix msrc;
 } filter;
 
-static void read_viftable(void)
+int print_mroute(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 {
-	char buf[256];
-	FILE *fp = fopen("/proc/net/ip_mr_vif", "r");
-
-	if (!fp)
-		return;
-
-	if (!fgets(buf, sizeof(buf), fp)) {
-		fclose(fp);
-		return;
+	FILE *fp = (FILE*)arg;
+	struct rtmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[RTA_MAX+1];
+	char abuf[256];
+	char obuf[256];
+	SPRINT_BUF(b1);
+	__u32 table;
+	int iif = 0;
+	int family;
+
+	if ((n->nlmsg_type != RTM_NEWROUTE &&
+	     n->nlmsg_type != RTM_DELROUTE) ||
+	    !(n->nlmsg_flags & NLM_F_MULTI)) {
+		fprintf(stderr, "Not a multicast route: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+		return 0;
 	}
-	while (fgets(buf, sizeof(buf), fp)) {
-		int vifi;
-		char dev[256];
-
-		if (sscanf(buf, "%d%s", &vifi, dev) < 2)
-			continue;
-
-		if (vifi<0 || vifi>31)
-			continue;
-
-		viftable[vifi] = strdup(dev);
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
 	}
-	fclose(fp);
-}
-
-static void read_mroute_list(FILE *ofp)
-{
-	char buf[256];
-	FILE *fp = fopen("/proc/net/ip_mr_cache", "r");
-
-	if (!fp)
-		return;
-
-	if (!fgets(buf, sizeof(buf), fp)) {
-		fclose(fp);
-		return;
+	if (r->rtm_type != RTN_MULTICAST) {
+		fprintf(stderr, "Not a multicast route (type: %s)\n",
+			rtnl_rtntype_n2a(r->rtm_type, b1, sizeof(b1)));
+		return 0;
 	}
 
-	while (fgets(buf, sizeof(buf), fp)) {
-		inet_prefix maddr, msrc;
-		unsigned pkts, b, w;
-		int vifi;
-		char oiflist[256];
-		char sbuf[256];
-		char mbuf[256];
-		char obuf[256];
-
-		oiflist[0] = 0;
-		if (sscanf(buf, "%x%x%d%u%u%u %[^\n]",
-			   maddr.data, msrc.data, &vifi,
-			   &pkts, &b, &w, oiflist) < 6)
-			continue;
-
-		if (vifi!=-1 && (vifi < 0 || vifi>31))
-			continue;
-
-		if (filter_dev[0] && (vifi<0 || strcmp(filter_dev, viftable[vifi])))
-			continue;
-		if (filter.mdst.family && inet_addr_match(&maddr, &filter.mdst, filter.mdst.bitlen))
-			continue;
-		if (filter.msrc.family && inet_addr_match(&msrc, &filter.msrc, filter.msrc.bitlen))
-			continue;
-
-		snprintf(obuf, sizeof(obuf), "(%s, %s)",
-			 format_host(AF_INET, 4, &msrc.data[0], sbuf, sizeof(sbuf)),
-			 format_host(AF_INET, 4, &maddr.data[0], mbuf, sizeof(mbuf)));
-
-		fprintf(ofp, "%-32s Iif: ", obuf);
-
-		if (vifi == -1)
-			fprintf(ofp, "unresolved ");
-		else
-			fprintf(ofp, "%-10s ", viftable[vifi]);
-
-		if (oiflist[0]) {
-			char *next = NULL;
-			char *p = oiflist;
-			int ovifi, ottl;
-
-			fprintf(ofp, "Oifs: ");
-
-			while (p) {
-				next = strchr(p, ' ');
-				if (next) {
-					*next = 0;
-					next++;
-				}
-				if (sscanf(p, "%d:%d", &ovifi, &ottl)<2) {
-					p = next;
-					continue;
-				}
-				p = next;
-
-				fprintf(ofp, "%s", viftable[ovifi]);
-				if (ottl>1)
-					fprintf(ofp, "(ttl %d) ", ovifi);
-				else
-					fprintf(ofp, " ");
+	parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len);
+	table = rtm_get_table(r, tb);
+
+	if (filter.tb > 0 && filter.tb != table)
+		return 0;
+
+	if (tb[RTA_IIF])
+		iif = *(int*)RTA_DATA(tb[RTA_IIF]);
+	if (filter.iif && filter.iif != iif)
+		return 0;
+
+	if (filter.af && filter.af != r->rtm_family)
+		return 0;
+
+	if (tb[RTA_DST] &&
+	    filter.mdst.bitlen > 0 &&
+	    inet_addr_match(RTA_DATA(tb[RTA_DST]), &filter.mdst, filter.mdst.bitlen))
+		return 0;
+
+	if (tb[RTA_SRC] &&
+	    filter.msrc.bitlen > 0 &&
+	    inet_addr_match(RTA_DATA(tb[RTA_SRC]), &filter.msrc, filter.msrc.bitlen))
+		return 0;
+
+	family = r->rtm_family == RTNL_FAMILY_IPMR ? AF_INET : AF_INET6;
+
+	if (n->nlmsg_type == RTM_DELROUTE)
+		fprintf(fp, "Deleted ");
+
+	if (tb[RTA_SRC])
+		len = snprintf(obuf, sizeof(obuf),
+			       "(%s, ", rt_addr_n2a(family,
+						    RTA_PAYLOAD(tb[RTA_SRC]),
+						    RTA_DATA(tb[RTA_SRC]),
+						    abuf, sizeof(abuf)));
+	else
+		len = sprintf(obuf, "(unknown, ");
+	if (tb[RTA_DST])
+		snprintf(obuf + len, sizeof(obuf) - len,
+			 "%s)", rt_addr_n2a(family, RTA_PAYLOAD(tb[RTA_DST]),
+					    RTA_DATA(tb[RTA_DST]),
+					    abuf, sizeof(abuf)));
+	else
+		snprintf(obuf + len, sizeof(obuf) - len, "unknown) ");
+
+	fprintf(fp, "%-32s Iif: ", obuf);
+	if (iif)
+		fprintf(fp, "%-10s ", ll_index_to_name(iif));
+	else
+		fprintf(fp, "unresolved ");
+
+	if (tb[RTA_MULTIPATH]) {
+		struct rtnexthop *nh = RTA_DATA(tb[RTA_MULTIPATH]);
+		int first = 1;
+
+		len = RTA_PAYLOAD(tb[RTA_MULTIPATH]);
+
+		for (;;) {
+			if (len < sizeof(*nh))
+				break;
+			if (nh->rtnh_len > len)
+				break;
+
+			if (first) {
+				fprintf(fp, "Oifs: ");
+				first = 0;
 			}
+			fprintf(fp, "%s", ll_index_to_name(nh->rtnh_ifindex));
+			if (nh->rtnh_hops > 1)
+				fprintf(fp, "(ttl %d) ", nh->rtnh_hops);
+			else
+				fprintf(fp, " ");
+			len -= NLMSG_ALIGN(nh->rtnh_len);
+			nh = RTNH_NEXT(nh);
 		}
-
-		if (show_stats && b) {
-			fprintf(ofp, "%s  %u packets, %u bytes", _SL_, pkts, b);
-			if (w)
-				fprintf(ofp, ", %u arrived on wrong iif.", w);
-		}
-		fprintf(ofp, "\n");
 	}
-	fclose(fp);
+	if (show_stats && tb[RTA_MFC_STATS]) {
+		struct rta_mfc_stats *mfcs = RTA_DATA(tb[RTA_MFC_STATS]);
+
+		fprintf(fp, "%s  %"PRIu64" packets, %"PRIu64" bytes", _SL_,
+			mfcs->mfcs_packets, mfcs->mfcs_bytes);
+		if (mfcs->mfcs_wrong_if)
+			fprintf(fp, ", %"PRIu64" arrived on wrong iif.",
+				mfcs->mfcs_wrong_if);
+	}
+	fprintf(fp, "\n");
+	fflush(fp);
+	return 0;
 }
 
+void ipmroute_reset_filter(void)
+{
+	memset(&filter, 0, sizeof(filter));
+	filter.mdst.bitlen = -1;
+	filter.msrc.bitlen = -1;
+}
 
 static int mroute_list(int argc, char **argv)
 {
+	char *id = NULL;
+	int family;
+
+	ipmroute_reset_filter();
+	if (preferred_family == AF_UNSPEC)
+		family = AF_INET;
+	else
+		family = AF_INET6;
+	if (family == AF_INET) {
+		filter.af = RTNL_FAMILY_IPMR;
+		filter.tb = RT_TABLE_DEFAULT;  /* for backward compatibility */
+	} else
+		filter.af = RTNL_FAMILY_IP6MR;
+
 	while (argc > 0) {
-		if (strcmp(*argv, "iif") == 0) {
+		if (matches(*argv, "table") == 0) {
+			__u32 tid;
 			NEXT_ARG();
-			strncpy(filter_dev, *argv, sizeof(filter_dev)-1);
+			if (rtnl_rttable_a2n(&tid, *argv)) {
+				if (strcmp(*argv, "all") == 0) {
+					filter.tb = 0;
+				} else if (strcmp(*argv, "help") == 0) {
+					usage();
+				} else {
+					invarg("table id value is invalid\n", *argv);
+				}
+			} else
+				filter.tb = tid;
+		} else if (strcmp(*argv, "iif") == 0) {
+			NEXT_ARG();
+			id = *argv;
 		} else if (matches(*argv, "from") == 0) {
 			NEXT_ARG();
-			get_prefix(&filter.msrc, *argv, AF_INET);
+			get_prefix(&filter.msrc, *argv, family);
 		} else {
 			if (strcmp(*argv, "to") == 0) {
 				NEXT_ARG();
 			}
 			if (matches(*argv, "help") == 0)
 				usage();
-			get_prefix(&filter.mdst, *argv, AF_INET);
+			get_prefix(&filter.mdst, *argv, family);
 		}
-		argv++; argc--;
+		argc--; argv++;
 	}
 
-	read_viftable();
-	read_mroute_list(stdout);
-	return 0;
+	ll_init_map(&rth);
+
+	if (id)  {
+		int idx;
+
+		if ((idx = ll_name_to_index(id)) == 0) {
+			fprintf(stderr, "Cannot find device \"%s\"\n", id);
+			return -1;
+		}
+		filter.iif = idx;
+	}
+
+	if (rtnl_wilddump_request(&rth, filter.af, RTM_GETROUTE) < 0) {
+		perror("Cannot send dump request");
+		return 1;
+	}
+
+	if (rtnl_dump_filter(&rth, print_mroute, stdout) < 0) {
+		fprintf(stderr, "Dump terminated\n");
+		exit(1);
+	}
+
+	exit(0);
 }
 
 int do_multiroute(int argc, char **argv)
-- 
1.8.0.1

^ permalink raw reply related

* [PATCH] net: filter: return -EINVAL if BPF_S_ANC* operation is not supported
From: Daniel Borkmann @ 2012-12-12  9:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Daniel Borkmann, Ani Sinha, Eric Dumazet

Currently, we return -EINVAL for malicious or wrong BPF filters.
However, this is not done for BPF_S_ANC* operations, which makes it
more difficult to detect if it's actually supported or not by the
BPF machine. Therefore, we should also return -EINVAL if K is within
the SKF_AD_OFF universe and the ancillary operation did not match.

Cc: Ani Sinha <ani@aristanetworks.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
 net/core/filter.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index c23543c..de9bed4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -531,7 +531,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
 		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
 	};
-	int pc;
+	int pc, anc_found;
 
 	if (flen == 0 || flen > BPF_MAXINSNS)
 		return -EINVAL;
@@ -592,8 +592,10 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 		case BPF_S_LD_W_ABS:
 		case BPF_S_LD_H_ABS:
 		case BPF_S_LD_B_ABS:
+			anc_found = 0;
 #define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
 				code = BPF_S_ANC_##CODE;	\
+				anc_found = 1;			\
 				break
 			switch (ftest->k) {
 			ANCILLARY(PROTOCOL);
@@ -610,6 +612,10 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG);
 			ANCILLARY(VLAN_TAG_PRESENT);
 			}
+
+			/* ancillary operation unkown or unsupported */
+			if (anc_found == 0 && ftest->k >= SKF_AD_OFF)
+				return -EINVAL;
 		}
 		ftest->code = code;
 	}
-- 
1.7.11.7

^ permalink raw reply related

* Re: [RFC PATCH v2 3/3] tun: fix LSM/SELinux labeling of tun/tap devices
From: Michael S. Tsirkin @ 2012-12-12  9:22 UTC (permalink / raw)
  To: Paul Moore; +Cc: netdev, linux-security-module, selinux, jasowang
In-Reply-To: <20121205202619.18626.98778.stgit@localhost>

On Wed, Dec 05, 2012 at 03:26:19PM -0500, Paul Moore wrote:
> This patch corrects some problems with LSM/SELinux that were introduced
> with the multiqueue patchset.  The problem stems from the fact that the
> multiqueue work changed the relationship between the tun device and its
> associated socket; before the socket persisted for the life of the
> device, however after the multiqueue changes the socket only persisted
> for the life of the userspace connection (fd open).  For non-persistent
> devices this is not an issue, but for persistent devices this can cause
> the tun device to lose its SELinux label.
> 
> We correct this problem by adding an opaque LSM security blob to the
> tun device struct which allows us to have the LSM security state, e.g.
> SELinux labeling information, persist for the lifetime of the tun
> device.  In the process we tweak the LSM hooks to work with this new
> approach to TUN device/socket labeling and introduce a new LSM hook,
> security_tun_dev_create_queue(), to approve requests to create a new
> TUN queue via TUNSETQUEUE.
> 
> The SELinux code has been adjusted to match the new LSM hooks, the
> other LSMs do not make use of the LSM TUN controls.  This patch makes
> use of the recently added "tun_socket:create_queue" permission to
> restrict access to the TUNSETQUEUE operation.  On older SELinux
> policies which do not define the "tun_socket:create_queue" permission
> the access control decision for TUNSETQUEUE will be handled according
> to the SELinux policy's unknown permission setting.
> 
> Signed-off-by: Paul Moore <pmoore@redhat.com>
> ---
>  drivers/net/tun.c                 |   26 +++++++++++++---
>  include/linux/security.h          |   59 +++++++++++++++++++++++++++++--------
>  security/capability.c             |   24 +++++++++++++--
>  security/security.c               |   28 ++++++++++++++----
>  security/selinux/hooks.c          |   50 ++++++++++++++++++++++++-------
>  security/selinux/include/objsec.h |    4 +++
>  6 files changed, 153 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 14a0454..fb8148b 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -182,6 +182,7 @@ struct tun_struct {
>  	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
>  	struct timer_list flow_gc_timer;
>  	unsigned long ageing_time;
> +	void *security;
>  };
>  
>  static inline u32 tun_hashfn(u32 rxhash)
> @@ -465,6 +466,10 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
>  	struct tun_file *tfile = file->private_data;
>  	int err;
>  
> +	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
> +	if (err < 0)
> +		goto out;
> +
>  	err = -EINVAL;
>  	if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held()))
>  		goto out;

This hook triggers with both set_queue and set_iff,
and it also seems to trigger when attaching to a
persistent device and when creating a new one. But I
believe we might want to be able to allow one but not the other.

For example:
	- we might want to allow qemu to do set_queue but not set_iff
	- we might want to configure presistent devices and
	  prevent a user from adding new ones

> @@ -1348,6 +1353,7 @@ static void tun_free_netdev(struct net_device *dev)
>  	struct tun_struct *tun = netdev_priv(dev);
>  
>  	tun_flow_uninit(tun);
> +	security_tun_dev_free_security(tun->security);
>  	free_netdev(dev);
>  }
>  
> @@ -1534,7 +1540,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  
>  		if (tun_not_capable(tun))
>  			return -EPERM;
> -		err = security_tun_dev_attach(tfile->socket.sk);
> +		err = security_tun_dev_open(tun->security);
>  		if (err < 0)
>  			return err;
>  
> @@ -1587,7 +1593,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  
>  		spin_lock_init(&tun->lock);
>  
> -		security_tun_dev_post_create(&tfile->sk);
> +		err = security_tun_dev_alloc_security(&tun->security);
> +		if (err < 0)
> +			goto err_free_dev;
>  
>  		tun_net_init(dev);
>  
> @@ -1767,12 +1775,18 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  
>  		tun = netdev_priv(dev);
>  		if (dev->netdev_ops != &tap_netdev_ops &&
> -			dev->netdev_ops != &tun_netdev_ops)
> +			dev->netdev_ops != &tun_netdev_ops) {
>  			ret = -EINVAL;
> -		else if (tun_not_capable(tun))
> +			goto unlock;
> +		}
> +		if (tun_not_capable(tun)) {
>  			ret = -EPERM;
> -		else
> -			ret = tun_attach(tun, file);
> +			goto unlock;
> +		}
> +		ret = security_tun_dev_create_queue(tun->security);
> +		if (ret < 0)
> +			goto unlock;
> +		ret = tun_attach(tun, file);
>  	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE)
>  		__tun_detach(tfile, false);
>  	else
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 05e88bd..8ea923b 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -983,17 +983,29 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
>   *	tells the LSM to decrement the number of secmark labeling rules loaded
>   * @req_classify_flow:
>   *	Sets the flow's sid to the openreq sid.
> + * @tun_dev_alloc_security:
> + *	This hook allows a module to allocate a security structure for a TUN
> + *	device.
> + *	@security pointer to a security structure pointer.
> + *	Returns a zero on success, negative values on failure.
> + * @tun_dev_free_security:
> + *	This hook allows a module to free the security structure for a TUN
> + *	device.
> + *	@security pointer to the TUN device's security structure
>   * @tun_dev_create:
>   *	Check permissions prior to creating a new TUN device.
> - * @tun_dev_post_create:
> - *	This hook allows a module to update or allocate a per-socket security
> - *	structure.
> - *	@sk contains the newly created sock structure.

I worry that removing a hook hurt users that use it in their
ecurity policy.

> + * @tun_dev_create_queue:
> + *	Check permissions prior to creating a new TUN device queue.
> + *	@security pointer to the TUN device's security structure.
>   * @tun_dev_attach:
> - *	Check permissions prior to attaching to a persistent TUN device.  This
> - *	hook can also be used by the module to update any security state
> + *	This hook can be used by the module to update any security state
>   *	associated with the TUN device's sock structure.
>   *	@sk contains the existing sock structure.
> + *	@security pointer to the TUN device's security structure.
> + * @tun_dev_open:
> + *	This hook can be used by the module to update any security state
> + *	associated with the TUN device's security structure.
> + *	@security pointer to the TUN devices's security structure.
>   *
>   * Security hooks for XFRM operations.
>   *
> @@ -1613,9 +1625,12 @@ struct security_operations {
>  	void (*secmark_refcount_inc) (void);
>  	void (*secmark_refcount_dec) (void);
>  	void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
> -	int (*tun_dev_create)(void);
> -	void (*tun_dev_post_create)(struct sock *sk);
> -	int (*tun_dev_attach)(struct sock *sk);
> +	int (*tun_dev_alloc_security) (void **security);
> +	void (*tun_dev_free_security) (void *security);
> +	int (*tun_dev_create) (void);
> +	int (*tun_dev_create_queue) (void *security);
> +	int (*tun_dev_attach) (struct sock *sk, void *security);
> +	int (*tun_dev_open) (void *security);
>  #endif	/* CONFIG_SECURITY_NETWORK */
>  
>  #ifdef CONFIG_SECURITY_NETWORK_XFRM
> @@ -2553,9 +2568,12 @@ void security_inet_conn_established(struct sock *sk,
>  int security_secmark_relabel_packet(u32 secid);
>  void security_secmark_refcount_inc(void);
>  void security_secmark_refcount_dec(void);
> +int security_tun_dev_alloc_security(void **security);
> +void security_tun_dev_free_security(void *security);
>  int security_tun_dev_create(void);
> -void security_tun_dev_post_create(struct sock *sk);
> -int security_tun_dev_attach(struct sock *sk);
> +int security_tun_dev_create_queue(void *security);
> +int security_tun_dev_attach(struct sock *sk, void *security);
> +int security_tun_dev_open(void *security);
>  
>  #else	/* CONFIG_SECURITY_NETWORK */
>  static inline int security_unix_stream_connect(struct sock *sock,
> @@ -2720,16 +2738,31 @@ static inline void security_secmark_refcount_dec(void)
>  {
>  }
>  
> +static inline int security_tun_dev_alloc_security(void **security)
> +{
> +	return 0;
> +}
> +
> +static inline void security_tun_dev_free_security(void *security)
> +{
> +}
> +
>  static inline int security_tun_dev_create(void)
>  {
>  	return 0;
>  }
>  
> -static inline void security_tun_dev_post_create(struct sock *sk)
> +static inline int security_tun_dev_create_queue(void *security)
> +{
> +	return 0;
> +}
> +
> +static inline int security_tun_dev_attach(struct sock *sk, void *security)
>  {
> +	return 0;
>  }
>  
> -static inline int security_tun_dev_attach(struct sock *sk)
> +static inline int security_tun_dev_open(void *security)
>  {
>  	return 0;
>  }
> diff --git a/security/capability.c b/security/capability.c
> index b14a30c..bf4cbf2 100644
> --- a/security/capability.c
> +++ b/security/capability.c
> @@ -704,16 +704,31 @@ static void cap_req_classify_flow(const struct request_sock *req,
>  {
>  }
>  
> +static int cap_tun_dev_alloc_security(void **security)
> +{
> +	return 0;
> +}
> +
> +static void cap_tun_dev_free_security(void *security)
> +{
> +}
> +
>  static int cap_tun_dev_create(void)
>  {
>  	return 0;
>  }
>  
> -static void cap_tun_dev_post_create(struct sock *sk)
> +static int cap_tun_dev_create_queue(void *security)
> +{
> +	return 0;
> +}
> +
> +static int cap_tun_dev_attach(struct sock *sk, void *security)
>  {
> +	return 0;
>  }
>  
> -static int cap_tun_dev_attach(struct sock *sk)
> +static int cap_tun_dev_open(void *security)
>  {
>  	return 0;
>  }
> @@ -1044,8 +1059,11 @@ void __init security_fixup_ops(struct security_operations *ops)
>  	set_to_cap_if_null(ops, secmark_refcount_inc);
>  	set_to_cap_if_null(ops, secmark_refcount_dec);
>  	set_to_cap_if_null(ops, req_classify_flow);
> +	set_to_cap_if_null(ops, tun_dev_alloc_security);
> +	set_to_cap_if_null(ops, tun_dev_free_security);
>  	set_to_cap_if_null(ops, tun_dev_create);
> -	set_to_cap_if_null(ops, tun_dev_post_create);
> +	set_to_cap_if_null(ops, tun_dev_create_queue);
> +	set_to_cap_if_null(ops, tun_dev_open);
>  	set_to_cap_if_null(ops, tun_dev_attach);
>  #endif	/* CONFIG_SECURITY_NETWORK */
>  #ifdef CONFIG_SECURITY_NETWORK_XFRM
> diff --git a/security/security.c b/security/security.c
> index 8dcd4ae..4d82654 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -1244,24 +1244,42 @@ void security_secmark_refcount_dec(void)
>  }
>  EXPORT_SYMBOL(security_secmark_refcount_dec);
>  
> +int security_tun_dev_alloc_security(void **security)
> +{
> +	return security_ops->tun_dev_alloc_security(security);
> +}
> +EXPORT_SYMBOL(security_tun_dev_alloc_security);
> +
> +void security_tun_dev_free_security(void *security)
> +{
> +	security_ops->tun_dev_free_security(security);
> +}
> +EXPORT_SYMBOL(security_tun_dev_free_security);
> +
>  int security_tun_dev_create(void)
>  {
>  	return security_ops->tun_dev_create();
>  }
>  EXPORT_SYMBOL(security_tun_dev_create);
>  
> -void security_tun_dev_post_create(struct sock *sk)
> +int security_tun_dev_create_queue(void *security)
>  {
> -	return security_ops->tun_dev_post_create(sk);
> +	return security_ops->tun_dev_create_queue(security);
>  }
> -EXPORT_SYMBOL(security_tun_dev_post_create);
> +EXPORT_SYMBOL(security_tun_dev_create_queue);
>  
> -int security_tun_dev_attach(struct sock *sk)
> +int security_tun_dev_attach(struct sock *sk, void *security)
>  {
> -	return security_ops->tun_dev_attach(sk);
> +	return security_ops->tun_dev_attach(sk, security);
>  }
>  EXPORT_SYMBOL(security_tun_dev_attach);
>  
> +int security_tun_dev_open(void *security)
> +{
> +	return security_ops->tun_dev_open(security);
> +}
> +EXPORT_SYMBOL(security_tun_dev_open);
> +
>  #endif	/* CONFIG_SECURITY_NETWORK */
>  
>  #ifdef CONFIG_SECURITY_NETWORK_XFRM
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 61a5336..f1efb08 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -4399,6 +4399,24 @@ static void selinux_req_classify_flow(const struct request_sock *req,
>  	fl->flowi_secid = req->secid;
>  }
>  
> +static int selinux_tun_dev_alloc_security(void **security)
> +{
> +	struct tun_security_struct *tunsec;
> +
> +	tunsec = kzalloc(sizeof(*tunsec), GFP_KERNEL);
> +	if (!tunsec)
> +		return -ENOMEM;
> +	tunsec->sid = current_sid();
> +
> +	*security = tunsec;
> +	return 0;
> +}
> +
> +static void selinux_tun_dev_free_security(void *security)
> +{
> +	kfree(security);
> +}
> +
>  static int selinux_tun_dev_create(void)
>  {
>  	u32 sid = current_sid();
> @@ -4414,8 +4432,17 @@ static int selinux_tun_dev_create(void)
>  			    NULL);
>  }
>  
> -static void selinux_tun_dev_post_create(struct sock *sk)
> +static int selinux_tun_dev_create_queue(void *security)
>  {
> +	struct tun_security_struct *tunsec = security;
> +
> +	return avc_has_perm(current_sid(), tunsec->sid, SECCLASS_TUN_SOCKET,
> +			    TUN_SOCKET__CREATE_QUEUE, NULL);
> +}
> +
> +static int selinux_tun_dev_attach(struct sock *sk, void *security)
> +{
> +	struct tun_security_struct *tunsec = security;
>  	struct sk_security_struct *sksec = sk->sk_security;
>  
>  	/* we don't currently perform any NetLabel based labeling here and it
> @@ -4425,20 +4452,19 @@ static void selinux_tun_dev_post_create(struct sock *sk)
>  	 * cause confusion to the TUN user that had no idea network labeling
>  	 * protocols were being used */
>  
> -	/* see the comments in selinux_tun_dev_create() about why we don't use
> -	 * the sockcreate SID here */
> -
> -	sksec->sid = current_sid();
> +	sksec->sid = tunsec->sid;
>  	sksec->sclass = SECCLASS_TUN_SOCKET;
> +
> +	return 0;
>  }
>  
> -static int selinux_tun_dev_attach(struct sock *sk)
> +static int selinux_tun_dev_open(void *security)
>  {
> -	struct sk_security_struct *sksec = sk->sk_security;
> +	struct tun_security_struct *tunsec = security;
>  	u32 sid = current_sid();
>  	int err;
>  
> -	err = avc_has_perm(sid, sksec->sid, SECCLASS_TUN_SOCKET,
> +	err = avc_has_perm(sid, tunsec->sid, SECCLASS_TUN_SOCKET,
>  			   TUN_SOCKET__RELABELFROM, NULL);
>  	if (err)
>  		return err;
> @@ -4446,8 +4472,7 @@ static int selinux_tun_dev_attach(struct sock *sk)
>  			   TUN_SOCKET__RELABELTO, NULL);
>  	if (err)
>  		return err;
> -
> -	sksec->sid = sid;
> +	tunsec->sid = sid;
>  
>  	return 0;
>  }
> @@ -5642,9 +5667,12 @@ static struct security_operations selinux_ops = {
>  	.secmark_refcount_inc =		selinux_secmark_refcount_inc,
>  	.secmark_refcount_dec =		selinux_secmark_refcount_dec,
>  	.req_classify_flow =		selinux_req_classify_flow,
> +	.tun_dev_alloc_security =	selinux_tun_dev_alloc_security,
> +	.tun_dev_free_security =	selinux_tun_dev_free_security,
>  	.tun_dev_create =		selinux_tun_dev_create,
> -	.tun_dev_post_create = 		selinux_tun_dev_post_create,
> +	.tun_dev_create_queue =		selinux_tun_dev_create_queue,
>  	.tun_dev_attach =		selinux_tun_dev_attach,
> +	.tun_dev_open =			selinux_tun_dev_open,
>  
>  #ifdef CONFIG_SECURITY_NETWORK_XFRM
>  	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
> diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
> index 26c7eee..aa47bca 100644
> --- a/security/selinux/include/objsec.h
> +++ b/security/selinux/include/objsec.h
> @@ -110,6 +110,10 @@ struct sk_security_struct {
>  	u16 sclass;			/* sock security class */
>  };
>  
> +struct tun_security_struct {
> +	u32 sid;			/* SID for the tun device sockets */
> +};
> +
>  struct key_security_struct {
>  	u32 sid;	/* SID of key */
>  };

^ permalink raw reply

* Re: [RFC PATCH v2 3/3] tun: fix LSM/SELinux labeling of tun/tap devices
From: Michael S. Tsirkin @ 2012-12-12  9:10 UTC (permalink / raw)
  To: Paul Moore; +Cc: netdev, linux-security-module, selinux, jasowang
In-Reply-To: <1963349.P9uq3yvlyR@sifl>

On Mon, Dec 10, 2012 at 05:43:49PM -0500, Paul Moore wrote:
> On Monday, December 10, 2012 07:50:35 PM Michael S. Tsirkin wrote:
> > On Mon, Dec 10, 2012 at 12:33:49PM -0500, Paul Moore wrote:
> > > On Monday, December 10, 2012 07:26:56 PM Michael S. Tsirkin wrote:
> > > > On Mon, Dec 10, 2012 at 12:04:35PM -0500, Paul Moore wrote:
> > > > > On Friday, December 07, 2012 02:25:16 PM Michael S. Tsirkin wrote:
> > > > > > On Thu, Dec 06, 2012 at 04:09:51PM -0500, Paul Moore wrote:
> > > > > > > On Thursday, December 06, 2012 10:57:16 PM Michael S. Tsirkin 
> wrote:
> > > > > > > > On Thu, Dec 06, 2012 at 11:56:45AM -0500, Paul Moore wrote:
> > > > > > > > > The SETQUEUE/tun_socket:create_queue permissions do not yet
> > > > > > > > > exist
> > > > > > > > > in any released SELinux policy as we are just now adding them
> > > > > > > > > with
> > > > > > > > > this patchset. With current policies loaded into a kernel with
> > > > > > > > > this patchset applied the SETQUEUE/tun_socket:create_queue
> > > > > > > > > permission would be treated according to the policy's unknown
> > > > > > > > > permission setting.
> > > > > > > > 
> > > > > > > > OK I think we need to rethink what we are doing here: what you
> > > > > > > > sent
> > > > > > > > addresses the problem as stated but I think we mis-stated it. 
> > > > > > > > Let
> > > > > > > > me try to restate the problem: it is not just selinux problem.
> > > > > > > > Let's
> > > > > > > > assume qemu wants to use tun, I (libvirt) don't want to run it
> > > > > > > > as
> > > > > > > > root.
> > > > > > > > 
> > > > > > > > 1. TUNSETIFF: I can open tun, attach an fd and pass it to qemu.
> > > > > > > > Now, qemu does not invoke TUNSETIFF so it can run without
> > > > > > > > kernel priveledges.
> > > > > > > 
> > > > > > > Correct me if I'm wrong, but I believe libvirt does this while
> > > > > > > running
> > > > > > > as root.  Assuming that is the case, why not simply
> > > > > > > setuid()/setgid()
> > > > > > > to the same credentials as the QEMU instance before creating the
> > > > > > > TUN
> > > > > > > device? You can always (re)configure the device afterwards while
> > > > > > > running as root/CAP_NET_ADMIN.
> > > > > > 
> > > > > > We want isolation between qemu instances.
> > > > > 
> > > > > Understood, I agree.
> > > > > 
> > > > > Achieving separation via SELinux is easily done, with libvirt/sVirt
> > > > > already doing this for us automatically in most cases; the only thing
> > > > > we
> > > > > will want to do is make sure the SELinux policy is aware of the new
> > > > > permission.
> > > > > 
> > > > > Achieving separation via DAC should also be easily done, simply run
> > > > > each
> > > > > QEMU instance with a separate UID and/or GID.
> > > > > 
> > > > > > Giving qemu right to open tun and SETIFF would give it rights
> > > > > > to access any tun device.
> > > > > 
> > > > > I'm quickly looked at tun_chr_open() again and I don't see any special
> > > > > rights/privileges required, the same for tun_chr_ioctl() and
> > > > > __tun_chr_ioctl().  Looking at tun_set_queue() I see we call
> > > > > tun_not_capable() which does a simple DAC check; it must have the same
> > > > > UID/GID or have CAP_NET_ADMIN.
> > > > > 
> > > > > I'm having a hard time seeing the problem you are describing; help me
> > > > > understand.
> > > > 
> > > > The issue is guest controls the number of queues in use.
> > > > So qemu would be required to be allowed to call tun_set_queue.
> > > > If we allow this we have a problem as one qemu will be
> > > > able to access any tun.
> > > 
> > > QEMU can call tun_set_queue() as long as it satisfies tun_not_capable(),
> > > which from a practical point of view means that the TUN device was
> > > created with the same UID/GID as the QEMU instance.  If you want TUN
> > > device separation between QEMU instances using DAC you need to run each
> > > QEMU instance with a different UID/GID (which you should be doing anyway
> > > if you want DAC enforced general separation).
> > > 
> > > I believe I've stated this point several times now and I don't feel you've
> > > addressed it properly.
> > 
> > Look at how it works at the moment:
> > a priveledged libvirt server calls tun_set_iff
> > and passes the fd to qemu which is not priveledged.
> > 
> > The result is isolation between qemu instances without
> > need to create uid per qemu instance.
> 
> Okay, good.  That is my understanding.
>  
> > How do we create multiple queues? It makes sense to
> > follow this model and pass in fds for individual queues.
> 
> Okay.
> 
> > However they need to be disabled initially
> > so libvirt can not do tun_set_queue for us.
> 
> Unrelated question: why do the queues need to be disabled initially?  Is this 
> to prevent traffic from being queued up?  Some other reason?  I'm just curious 
> as to the reason ...

Yes.
Basically because old guests only use a single queue.
If a guest comes along and declares multiqueue support
we can queue up traffic on new queues but if we
do this with a legacy guest it will not be able to
consume it.



> > can't utilize multiqueue.
> 
> I still don't understand why in the multiqueue case libvirt doesn't just 
> change it's effective UID/GID when creating the TUN device, or just use the 
> TUNSETOWNER/TUNSETGROUP commands. This would solve the problem you describe 
> above and - at least to me - seems like a better solution conceptually.
> 
> Help me understand why you believe that will not work.
> 
> Do you not want to give ownership of the TUN device to QEMU?  That would be 
> the only reason I can think of, but all of your comments that I can recall 
> have been about isolation between QEMU instances and not access control 
> between a QEMU instance and its assigned TUN device.

I think I might have confused things more than clarified them.
Let me comment on specific lines in patch that worry me
that will make it clear I hope.

> > My solution is an unpriveledged variant
> > of tun_set_queue that only enables/disables
> > a queue without attach/detach.
> 
> -- 
> paul moore
> security and virtualization @ redhat

^ permalink raw reply

* Re: [PATCH RFC 0/5] Containerize syslog
From: Glauber Costa @ 2012-12-12  8:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Serge Hallyn, Andrew Morton, Rui Xiang, netdev, containers
In-Reply-To: <87txrs30ur.fsf@xmission.com>

On 12/11/2012 10:22 PM, Eric W. Biederman wrote:
> Glauber Costa <glommer@parallels.com> writes:
> 
>> On 12/07/2012 10:05 PM, Eric W. Biederman wrote:
>>> Glauber Costa <glommer@parallels.com> writes:
>>>
>>>> I keep asking myself if it isn't the case of forwarding to a container
>>>> all messages printed in process context. That will obviously exclude all
>>>> messages resulting from kthreads - that will always be in the initial
>>>> namespace anyway, interrupts, etc. There is no harm, for instance, in
>>>> delivering the same message twice: one to the container, and the other
>>>> to the host system.
>>>
>>> Except that there is harm in double printing.  One of the better
>>> justifications for doing something with the kernel log is that it is
>>> possible to overflow the kernel log with operations performed
>>> exclusively in a container.
>>>
>> I don't agree with you here.
>>
>> If we are double printing, we are using up more memory, but we also have
>> an extra buffer anyway. The messages are print on behalf of the user,
>> but still, by the kernel.
>>
>> So one of the following will necessarily hold:
>>
>> 1) There is no way that the process can overflow the main log, and as a
>> consequence, the container log, that has less messages than it.
>>
>> 2) The process will overflow the main log. But since we are not printing
>> anything extra to the main log compared to the scenario in which the
>> process lives in the main namespace, this would already be a problem
>> independent of namespaces. And needs to be fixed.
> 
> Well mounts, brining network interfaces up and down, running packets
> through our own choice of firewall rules, possibly enabling debug
> messages on network interfaces has the potential to create messages we
> aren't seeing today.
> 

There are two kinds of messages: the ones that would be print anyway if
you were not running in an ns, which we have no reason to fear, and the
ones that only exist because we wrote the code for them, due to ns
support. They are no different from writing a new fs support, driver,
etc. Any new functionality can print new messages, and we have to be
sure they won't fill the log. We write that code, so it is on us to make
sure the messages are being print in a reasonable rate.

This should be true for all messages running in process context. It is
either that, or we have a bug and we relying on a specific clone flag to
protect us against the buffer overrun.


>> IOW, double printing should not print anything *extra* to the main log.
>> It just prints to the container log, and leaves a copy to the box admin
>> to see. I think it is very reasonable to imagine that the main admin
>> would like to see anything the kernel has to tell him about the box.
> 
> The only reason that I have seen for doing anything with printks is
> because we are generating messages that would not be generated in a
> non-container environment.  At which point double printing is scary
> because it allows a container user to flood the kernel log ring buffer
> and suppress interesting messages.
> 
>>> I do think the idea of process context printks going to the current
>>> container one worth playing with.
>>>
>>
>> It still leaves the problem of prinkts outside process context that
>> should go to a namespace open. But it is easy to extend this idea to do
>> both.
> 
> Hmm.  For printks from process context I think I can see a point where
> double printing makes sense, because that is a rather indiscriminate grab
> of printk messages.
> 
exactly. What I have in mind this whole time, is that if you are
printing a message of interest of the container admin, it is *very
likely* also of interest of the box admin, specially if it indicates
something going wrong. Maybe what goes and does not go to the main
buffer can be determined by the log level of each buffer. But still, I
think that just hiding them from the box admin may not exactly be what
we want.

Cheers

^ permalink raw reply

* RE: Gianfar driver issue
From: voncken @ 2012-12-12  8:46 UTC (permalink / raw)
  To: 'Claudiu Manoil'; +Cc: netdev
In-Reply-To: <50C744CE.7040109@freescale.com>

	Thanks claudiu, 

	I will reformat my patch and resend it.

	Regards.

Cedric Voncken 

-----Message d'origine-----
De : Claudiu Manoil [mailto:claudiu.manoil@freescale.com] 
Envoyé : mardi 11 décembre 2012 15:36
À : Cedric VONCKEN
Cc : netdev@vger.kernel.org
Objet : Re: Gianfar driver issue

On 12/11/2012 11:59 AM, Cedric VONCKEN wrote:
> 	Hi all,
>
> 	I think he have an issue in Gianfar driver.
>
> 	When the Netdev tx queue timeout occurred, the function
> gfar_timeout(..) is called. This function calls indirectly the
> gfar_init_mac(..) function.
>
> 	In this function, the rctrl register is set to a default value.
>
> 	If the Promiscuous is enable on the net dev ( flag IFF_PROMISC is 
> set), the gfar_init_function does not reactivate it.
>
> 	The Promiscuous mode is used for example when the netdev is bridged.
> 	
> 	I apply this patch to fix it.
>
> 	--- a/drivers/net/ethernet/freescale/gianfar.c.	2012-06-01
> 09:16:13.000000000 +0200
> 	+++ b/drivers/net/ethernet/freescale/gianfar.c	2012-12-11
> 10:38:23.000000000 +0100
> 	@@ -356,6 +356,11 @@
>   	/* Configure the coalescing support */
>   	gfar_configure_coalescing(priv, 0xFF, 0xFF);
>
> +	if (ndev->flags & IFF_PROMISC) {
> +		/* Set RCTRL to PROM */
> +		rctrl |= RCTRL_PROM;
> +	}
> +
>   	if (priv->rx_filer_enable) {
>   		rctrl |= RCTRL_FILREN;
>   		/* Program the RIR0 reg with the required distribution */

Hello,

I don't see any issue with this code change, and there are other drivers too
reconfiguring the promiscuous mode upon tx timeout.
A valid (formatted) patch needs to be sent however.

Thanks,
Claudiu

^ permalink raw reply

* Re: [BUG] Kernel recieves DNS reply, but doesn't deliver it to a waiting application
From: Andrew Savchenko @ 2012-12-12  8:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <20121023012759.ca7f91d6.bircoph@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 6266 bytes --]

Hello,

On Tue, 23 Oct 2012 01:27:59 +0400 Andrew Savchenko wrote:
> On Mon, 22 Oct 2012 08:48:09 +0200 Eric Dumazet wrote:
[...]
> > Some driver or protocol stack is messing with skb->truesize, as
> > your /proc/net/udp file contains anomalies :
> > 
> > $ cat /proc/net/udp
> >   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops
> > ...
> >   323: 074A070A:007B 00000000:0000 07 FFFDF700:00000000 00:00000000 00000000   123        0 254469 2 ffff88003d581880 0
> > ...
> >   323: 00FCA8C0:007B 00000000:0000 07 FFFFF900:00000000 00:00000000 00000000     0        0 5187 2 ffff880039993880 0
> > 
> > Its clearly not possible to get tx_queue = 0xFFFDF700 or 0xFFFFF900
> > 
> > So what drivers handle following IP addresses : 192.168.252.0 , 10.7.74.7  ?
> 
> 192.168.252.0 is handled by eth0 interface running on Realtek
> Semiconductor Co., Ltd. RTL-8139/8139C/8139C+ (10ec:8139) NIC.
> Kernel driver 8139too. This interface handles multiple subnetworks:
> 
> # ip addr show eth0
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN qlen 1000 
> link/ether 00:80:48:30:ca:f3 brd ff:ff:ff:ff:ff:ff
> inet 10.51.15.126/25 brd 10.51.15.127 scope global eth0
> inet 192.168.252.0/31 scope global eth0
> 
> 10.7.74.7 is an l2tp connection handled by ppp over l2tp:
> CONFIG_PPPOL2TP=y
> It is running on top of eth0 described above.
> 
> # ip addr show ppp0
> 65: ppp0: <POINTOPOINT,MULTICAST,NOARP,UP,LOWER_UP> mtu 1400 qdisc pfifo_fast state UNKNOWN qlen 3
> link/ppp 
> inet 10.7.74.7 peer 10.7.2.18/32 scope global ppp0

I updated kernel on this system to 3.7.0 and udp anomaly is still
present:

$ cat /proc/net/udp
  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             
    0: 00000000:06A5 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5326 2 ffff88003dbf0a80 0          
    8: 00000000:7EAD 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5157 2 ffff8800398c2000 0          
   89: 00000000:90FE 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5101 2 ffff88003dbd3500 0          
  160: 0100007F:2745 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4598 2 ffff88003d612700 0          
  184: 0100007F:035D 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4774 2 ffff88003d612a80 0          
  217: 00000000:857E 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5195 2 ffff8800398c2700 0          
  318: 00000000:A9E3 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4782 2 ffff88003d612e00 0          
  335: 7E0F330A:01F4 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5303 2 ffff8800398c2e00 0          
  348: 00000000:0801 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5186 2 ffff8800398c2380 0          
  387: 7E0F330A:DE28 1400320A:06A5 01 00000000:00000000 00:00000000 00000000     0        0 5332 4 ffff88003dbf0e00 0          
  400: 010013AC:0035 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4842 2 ffff88003d613880 0          
  400: 0100007F:0035 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4841 2 ffff88003d613500 0          
  414: 00000000:0043 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5273 2 ffff8800398c2a80 0          
  458: 00000000:006F 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4483 2 ffff88003d612000 0          
  459: 00000000:0270 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4507 2 ffff88003d612380 0          
  466: 00000000:0277 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4802 2 ffff88003d613180 0          
  470: 076A070A:007B 00000000:0000 07 FFFF4600:00000000 00:00000000 00000000   123        0 5552 2 ffff880039974380 0          
  470: 010213AC:007B 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4986 2 ffff88003dbd3180 0          
  470: 010013AC:007B 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4985 2 ffff88003dbd2e00 0          
  470: 00FCA8C0:007B 00000000:0000 07 FFFFFB00:00000000 00:00000000 00000000     0        0 4984 2 ffff88003dbd2a80 0          
  470: 7E0F330A:007B 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4983 2 ffff88003dbd2700 0          
  470: 0100007F:007B 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4982 2 ffff88003dbd2380 0          
  470: 00000000:007B 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 4975 2 ffff88003d613c00 0          
  484: FF0013AC:0089 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5316 2 ffff88003dbf0000 0          
  484: 010013AC:0089 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5315 2 ffff88003dbd3880 0          
  484: FF0213AC:0089 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5312 2 ffff8800398c3c00 0          
  484: 010213AC:0089 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5311 2 ffff8800398c3880 0          
  484: 00000000:0089 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5308 2 ffff8800398c3180 0          
  485: FF0013AC:008A 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5318 2 ffff88003dbf0700 0          
  485: 010013AC:008A 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5317 2 ffff88003dbf0380 0          
  485: FF0213AC:008A 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5314 2 ffff88003dbd3c00 0          
  485: 010213AC:008A 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5313 2 ffff88003dbd2000 0          
  485: 00000000:008A 00000000:0000 07 00000000:00000000 00:00000000 00000000     0        0 5309 2 ffff8800398c3500 0

The bug hasn't shown up yet, I'll need to wait for about a week to see
if it is reproducible.

Best regards,
Andrew Savchenko

[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* [PATCH net-next 2/2] bridge: add support of adding and deleting mdb entries
From: Cong Wang @ 2012-12-12  8:23 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf
In-Reply-To: <1355300590-2390-1-git-send-email-amwang@redhat.com>

From: Cong Wang <amwang@redhat.com>

This patch implents adding/deleting mdb entries via netlink.
Currently all entries are temp, we probably need a flag to distinguish
permanent entries too.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
 include/uapi/linux/if_bridge.h |    8 ++
 net/bridge/br_mdb.c            |  240 ++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_multicast.c      |   55 +++++-----
 net/bridge/br_private.h        |   23 ++++
 4 files changed, 297 insertions(+), 29 deletions(-)

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 9a0f6ff..afbb18a 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -157,6 +157,7 @@ enum {
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
 struct br_port_msg {
+	__u8  family;
 	__u32 ifindex;
 };
 
@@ -171,4 +172,11 @@ struct br_mdb_entry {
 	} addr;
 };
 
+enum {
+	MDBA_SET_ENTRY_UNSPEC,
+	MDBA_SET_ENTRY,
+	__MDBA_SET_ENTRY_MAX,
+};
+#define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index a8cfbf5..6f0a2ee 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -4,6 +4,7 @@
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
 #include <linux/skbuff.h>
+#include <linux/if_ether.h>
 #include <net/ip.h>
 #include <net/netlink.h>
 #if IS_ENABLED(CONFIG_IPV6)
@@ -235,7 +236,246 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 	__br_mdb_notify(dev, &entry, type);
 }
 
+static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
+{
+	if (entry->ifindex == 0)
+		return false;
+
+	if (entry->addr.proto == htons(ETH_P_IP)) {
+		if (!ipv4_is_multicast(entry->addr.u.ip4))
+			return false;
+		if (ipv4_is_local_multicast(entry->addr.u.ip4))
+			return false;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (entry->addr.proto == htons(ETH_P_IPV6)) {
+		if (!ipv6_is_transient_multicast(&entry->addr.u.ip6))
+			return false;
+#endif
+	} else
+		return false;
+
+	return true;
+}
+
+static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
+			struct net_device **pdev, struct br_mdb_entry **pentry)
+{
+	struct net *net = sock_net(skb->sk);
+	struct br_mdb_entry *entry;
+	struct br_port_msg *bpm;
+	struct nlattr *tb[MDBA_SET_ENTRY_MAX+1];
+	struct net_device *dev;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY, NULL);
+	if (err < 0)
+		return err;
+
+	bpm = nlmsg_data(nlh);
+	if (bpm->ifindex == 0) {
+		pr_info("PF_BRIDGE: br_mdb_parse() with invalid ifindex\n");
+		return -EINVAL;
+	}
+
+	dev = __dev_get_by_index(net, bpm->ifindex);
+	if (dev == NULL) {
+		pr_info("PF_BRIDGE: br_mdb_parse() with unknown ifindex\n");
+		return -ENODEV;
+	}
+
+	if (!(dev->priv_flags & IFF_EBRIDGE)) {
+		pr_info("PF_BRIDGE: br_mdb_parse() with non-bridge\n");
+		return -EOPNOTSUPP;
+	}
+
+	*pdev = dev;
+
+	if (!tb[MDBA_SET_ENTRY] ||
+	    nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
+		pr_info("PF_BRIDGE: br_mdb_parse() with invalid attr\n");
+		return -EINVAL;
+	}
+
+	entry = nla_data(tb[MDBA_SET_ENTRY]);
+	if (!is_valid_mdb_entry(entry)) {
+		pr_info("PF_BRIDGE: br_mdb_parse() with invalid entry\n");
+		return -EINVAL;
+	}
+
+	*pentry = entry;
+	return 0;
+}
+
+static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
+			    struct br_ip *group)
+{
+	struct net_bridge_mdb_entry *mp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+	struct net_bridge_mdb_htable *mdb;
+	int err;
+
+	mdb = mlock_dereference(br->mdb, br);
+	mp = br_mdb_ip_get(mdb, group);
+	if (!mp) {
+		mp = br_multicast_new_group(br, port, group);
+		err = PTR_ERR(mp);
+		if (IS_ERR(mp))
+			return err;
+	}
+
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (p->port == port)
+			return -EEXIST;
+		if ((unsigned long)p->port < (unsigned long)port)
+			break;
+	}
+
+	p = br_multicast_new_port_group(port, group, *pp);
+	if (unlikely(!p))
+		return -ENOMEM;
+	rcu_assign_pointer(*pp, p);
+
+	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
+	return 0;
+}
+
+static int __br_mdb_add(struct net *net, struct net_bridge *br,
+			struct br_mdb_entry *entry)
+{
+	struct br_ip ip;
+	struct net_device *dev;
+	struct net_bridge_port *p;
+	int ret;
+
+	if (!netif_running(br->dev) || br->multicast_disabled)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, entry->ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	p = br_port_get_rtnl(dev);
+	if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+		return -EINVAL;
+
+	ip.proto = entry->addr.proto;
+	if (ip.proto == htons(ETH_P_IP))
+		ip.u.ip4 = entry->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		ip.u.ip6 = entry->addr.u.ip6;
+#endif
+
+	spin_lock_bh(&br->multicast_lock);
+	ret = br_mdb_add_group(br, p, &ip);
+	spin_unlock_bh(&br->multicast_lock);
+	return ret;
+}
+
+static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct br_mdb_entry *entry;
+	struct net_device *dev;
+	struct net_bridge *br;
+	int err;
+
+	err = br_mdb_parse(skb, nlh, &dev, &entry);
+	if (err < 0)
+		return err;
+
+	br = netdev_priv(dev);
+
+	err = __br_mdb_add(net, br, entry);
+	if (!err)
+		__br_mdb_notify(dev, entry, RTM_NEWMDB);
+	return err;
+}
+
+static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+	struct br_ip ip;
+	int err = -EINVAL;
+
+	if (!netif_running(br->dev) || br->multicast_disabled)
+		return -EINVAL;
+
+	if (timer_pending(&br->multicast_querier_timer))
+		return -EBUSY;
+
+	ip.proto = entry->addr.proto;
+	if (ip.proto == htons(ETH_P_IP))
+		ip.u.ip4 = entry->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		ip.u.ip6 = entry->addr.u.ip6;
+#endif
+
+	spin_lock_bh(&br->multicast_lock);
+	mdb = mlock_dereference(br->mdb, br);
+
+	mp = br_mdb_ip_get(mdb, &ip);
+	if (!mp)
+		goto unlock;
+
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (!p->port || p->port->dev->ifindex != entry->ifindex)
+			continue;
+
+		if (p->port->state == BR_STATE_DISABLED)
+			goto unlock;
+
+		rcu_assign_pointer(*pp, p->next);
+		hlist_del_init(&p->mglist);
+		del_timer(&p->timer);
+		call_rcu_bh(&p->rcu, br_multicast_free_pg);
+		err = 0;
+
+		if (!mp->ports && !mp->mglist &&
+		    netif_running(br->dev))
+			mod_timer(&mp->timer, jiffies);
+		break;
+	}
+
+unlock:
+	spin_unlock_bh(&br->multicast_lock);
+	return err;
+}
+
+static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net_device *dev;
+	struct br_mdb_entry *entry;
+	struct net_bridge *br;
+	int err;
+
+	err = br_mdb_parse(skb, nlh, &dev, &entry);
+	if (err < 0)
+		return err;
+
+	br = netdev_priv(dev);
+
+	err = __br_mdb_del(br, entry);
+	if (!err)
+		__br_mdb_notify(dev, entry, RTM_DELMDB);
+	return err;
+}
+
 void br_mdb_init(void)
 {
 	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
+	rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL);
+	rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL);
 }
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d929586..977c3ee 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -27,27 +27,14 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
 #include <net/mld.h>
-#include <net/addrconf.h>
 #include <net/ip6_checksum.h>
 #endif
 
 #include "br_private.h"
 
-#define mlock_dereference(X, br) \
-	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
-
 static void br_multicast_start_querier(struct net_bridge *br);
 unsigned int br_mdb_rehash_seq;
 
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
-{
-	if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
-		return 1;
-	return 0;
-}
-#endif
-
 static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
 {
 	if (a->proto != b->proto)
@@ -104,8 +91,8 @@ static struct net_bridge_mdb_entry *__br_mdb_ip_get(
 	return NULL;
 }
 
-static struct net_bridge_mdb_entry *br_mdb_ip_get(
-	struct net_bridge_mdb_htable *mdb, struct br_ip *dst)
+struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb,
+					   struct br_ip *dst)
 {
 	if (!mdb)
 		return NULL;
@@ -208,7 +195,7 @@ static int br_mdb_copy(struct net_bridge_mdb_htable *new,
 	return maxlen > elasticity ? -EINVAL : 0;
 }
 
-static void br_multicast_free_pg(struct rcu_head *head)
+void br_multicast_free_pg(struct rcu_head *head)
 {
 	struct net_bridge_port_group *p =
 		container_of(head, struct net_bridge_port_group, rcu);
@@ -584,9 +571,8 @@ err:
 	return mp;
 }
 
-static struct net_bridge_mdb_entry *br_multicast_new_group(
-	struct net_bridge *br, struct net_bridge_port *port,
-	struct br_ip *group)
+struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
+	struct net_bridge_port *port, struct br_ip *group)
 {
 	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
@@ -633,6 +619,26 @@ out:
 	return mp;
 }
 
+struct net_bridge_port_group *br_multicast_new_port_group(
+			struct net_bridge_port *port,
+			struct br_ip *group,
+			struct net_bridge_port_group *next)
+{
+	struct net_bridge_port_group *p;
+
+	p = kzalloc(sizeof(*p), GFP_ATOMIC);
+	if (unlikely(!p))
+		return NULL;
+
+	p->addr = *group;
+	p->port = port;
+	p->next = next;
+	hlist_add_head(&p->mglist, &port->mglist);
+	setup_timer(&p->timer, br_multicast_port_group_expired,
+		    (unsigned long)p);
+	return p;
+}
+
 static int br_multicast_add_group(struct net_bridge *br,
 				  struct net_bridge_port *port,
 				  struct br_ip *group)
@@ -668,18 +674,9 @@ static int br_multicast_add_group(struct net_bridge *br,
 			break;
 	}
 
-	p = kzalloc(sizeof(*p), GFP_ATOMIC);
-	err = -ENOMEM;
+	p = br_multicast_new_port_group(port, group, *pp);
 	if (unlikely(!p))
 		goto err;
-
-	p->addr = *group;
-	p->port = port;
-	p->next = *pp;
-	hlist_add_head(&p->mglist, &port->mglist);
-	setup_timer(&p->timer, br_multicast_port_group_expired,
-		    (unsigned long)p);
-
 	rcu_assign_pointer(*pp, p);
 	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2807c76..f21a739 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -434,10 +434,33 @@ extern int br_multicast_set_port_router(struct net_bridge_port *p,
 extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+extern struct net_bridge_mdb_entry *br_mdb_ip_get(
+				struct net_bridge_mdb_htable *mdb,
+				struct br_ip *dst);
+extern struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
+				struct net_bridge_port *port, struct br_ip *group);
+extern void br_multicast_free_pg(struct rcu_head *head);
+extern struct net_bridge_port_group *br_multicast_new_port_group(
+				struct net_bridge_port *port,
+				struct br_ip *group,
+				struct net_bridge_port_group *next);
 extern void br_mdb_init(void);
 extern void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 			  struct br_ip *group, int type);
 
+#define mlock_dereference(X, br) \
+	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/addrconf.h>
+static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
+{
+	if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
+		return 1;
+	return 0;
+}
+#endif
+
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
 	return br->multicast_router == 2 ||
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH net-next 1/2] bridge: notify mdb changes via netlink
From: Cong Wang @ 2012-12-12  8:23 UTC (permalink / raw)
  To: netdev
  Cc: bridge, Cong Wang, Herbert Xu, Stephen Hemminger, David S. Miller,
	Thomas Graf

From: Cong Wang <amwang@redhat.com>

As Stephen mentioned, we need to monitor the mdb
changes in user-space, so add notifications via netlink too.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Cong Wang <amwang@redhat.com>
---
 include/uapi/linux/rtnetlink.h |    6 +++
 net/bridge/br_mdb.c            |   80 ++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_multicast.c      |    2 +
 net/bridge/br_private.h        |    2 +
 4 files changed, 90 insertions(+), 0 deletions(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 354a1e7..7a5eb19 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -125,6 +125,10 @@ enum {
 	RTM_GETNETCONF = 82,
 #define RTM_GETNETCONF RTM_GETNETCONF
 
+	RTM_NEWMDB = 84,
+#define RTM_NEWMDB RTM_NEWMDB
+	RTM_DELMDB = 85,
+#define RTM_DELMDB RTM_DELMDB
 	RTM_GETMDB = 86,
 #define RTM_GETMDB RTM_GETMDB
 
@@ -607,6 +611,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_NETCONF	RTNLGRP_IPV4_NETCONF
 	RTNLGRP_IPV6_NETCONF,
 #define RTNLGRP_IPV6_NETCONF	RTNLGRP_IPV6_NETCONF
+	RTNLGRP_MDB,
+#define RTNLGRP_MDB		RTNLGRP_MDB
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index ccc43a9..a8cfbf5 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -155,6 +155,86 @@ out:
 	return skb->len;
 }
 
+static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
+				   struct net_device *dev,
+				   struct br_mdb_entry *entry, u32 pid,
+				   u32 seq, int type, unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct br_port_msg *bpm;
+	struct nlattr *nest, *nest2;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	bpm = nlmsg_data(nlh);
+	bpm->family  = AF_BRIDGE;
+	bpm->ifindex = dev->ifindex;
+	nest = nla_nest_start(skb, MDBA_MDB);
+	if (nest == NULL)
+		goto cancel;
+	nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+	if (nest2 == NULL)
+		goto end;
+
+	if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry))
+		goto end;
+
+	nla_nest_end(skb, nest2);
+	nla_nest_end(skb, nest);
+	return nlmsg_end(skb, nlh);
+
+end:
+	nla_nest_end(skb, nest);
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t rtnl_mdb_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct br_port_msg))
+		+ nla_total_size(sizeof(struct br_mdb_entry));
+}
+
+static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
+			    int type)
+{
+	struct net *net = dev_net(dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
+	if (!skb)
+		goto errout;
+
+	err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+	return;
+errout:
+	rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+		   struct br_ip *group, int type)
+{
+	struct br_mdb_entry entry;
+
+	entry.ifindex = port->dev->ifindex;
+	entry.addr.proto = group->proto;
+	entry.addr.u.ip4 = group->u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+	entry.addr.u.ip6 = group->u.ip6;
+#endif
+	__br_mdb_notify(dev, &entry, type);
+}
+
 void br_mdb_init(void)
 {
 	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 847b98a1..d929586 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -681,6 +681,7 @@ static int br_multicast_add_group(struct net_bridge *br,
 		    (unsigned long)p);
 
 	rcu_assign_pointer(*pp, p);
+	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
 
 found:
 	mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -1240,6 +1241,7 @@ static void br_multicast_leave_group(struct net_bridge *br,
 			hlist_del_init(&p->mglist);
 			del_timer(&p->timer);
 			call_rcu_bh(&p->rcu, br_multicast_free_pg);
+			br_mdb_notify(br->dev, port, group, RTM_DELMDB);
 
 			if (!mp->ports && !mp->mglist &&
 			    netif_running(br->dev))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index f95b766..2807c76 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -435,6 +435,8 @@ extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
 extern void br_mdb_init(void);
+extern void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
+			  struct br_ip *group, int type);
 
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 2/2] iproute2: add support to monitor mdb entries too
From: Cong Wang @ 2012-12-12  8:23 UTC (permalink / raw)
  To: netdev; +Cc: bridge, Cong Wang, Stephen Hemminger, Thomas Graf
In-Reply-To: <1355300590-2390-1-git-send-email-amwang@redhat.com>

From: Cong Wang <amwang@redhat.com>

This patch implements `bridge monitor mdb`.

Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
 bridge/br_common.h        |    2 ++
 bridge/mdb.c              |    4 ++--
 bridge/monitor.c          |   14 ++++++++++++++
 include/linux/rtnetlink.h |    2 ++
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/bridge/br_common.h b/bridge/br_common.h
index 892fb76..10f6ce9 100644
--- a/bridge/br_common.h
+++ b/bridge/br_common.h
@@ -3,6 +3,8 @@ extern int print_linkinfo(const struct sockaddr_nl *who,
 			  void *arg);
 extern int print_fdb(const struct sockaddr_nl *who,
 		     struct nlmsghdr *n, void *arg);
+extern int print_mdb(const struct sockaddr_nl *who,
+		     struct nlmsghdr *n, void *arg);
 
 extern int do_fdb(int argc, char **argv);
 extern int do_mdb(int argc, char **argv);
diff --git a/bridge/mdb.c b/bridge/mdb.c
index 4d8a896..121ce9c 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -82,8 +82,8 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	int len = n->nlmsg_len;
 	struct rtattr * tb[MDBA_MAX+1];
 
-	if (n->nlmsg_type != RTM_GETMDB) {
-		fprintf(stderr, "Not RTM_GETMDB: %08x %08x %08x\n",
+	if (n->nlmsg_type != RTM_GETMDB && n->nlmsg_type != RTM_NEWMDB && n->nlmsg_type != RTM_DELMDB) {
+		fprintf(stderr, "Not RTM_GETMDB, RTM_NEWMDB or RTM_DELMDB: %08x %08x %08x\n",
 			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
 
 		return 0;
diff --git a/bridge/monitor.c b/bridge/monitor.c
index 2f60655..44e14d8 100644
--- a/bridge/monitor.c
+++ b/bridge/monitor.c
@@ -68,6 +68,12 @@ int accept_msg(const struct sockaddr_nl *who,
 			fprintf(fp, "[NEIGH]");
 		return print_fdb(who, n, arg);
 
+	case RTM_NEWMDB:
+	case RTM_DELMDB:
+		if (prefix_banner)
+			fprintf(fp, "[MDB]");
+		return print_mdb(who, n, arg);
+
 	case 15:
 		return show_mark(fp, n);
 
@@ -84,6 +90,7 @@ int do_monitor(int argc, char **argv)
 	unsigned groups = ~RTMGRP_TC;
 	int llink=0;
 	int lneigh=0;
+	int lmdb=0;
 
 	rtnl_close(&rth);
 
@@ -97,6 +104,9 @@ int do_monitor(int argc, char **argv)
 		} else if (matches(*argv, "fdb") == 0) {
 			lneigh = 1;
 			groups = 0;
+		} else if (matches(*argv, "mdb") == 0) {
+			lmdb = 1;
+			groups = 0;
 		} else if (strcmp(*argv, "all") == 0) {
 			groups = ~RTMGRP_TC;
 			prefix_banner=1;
@@ -116,6 +126,10 @@ int do_monitor(int argc, char **argv)
 		groups |= nl_mgrp(RTNLGRP_NEIGH);
 	}
 
+	if (lmdb) {
+		groups |= nl_mgrp(RTNLGRP_MDB);
+	}
+
 	if (file) {
 		FILE *fp;
 		fp = fopen(file, "r");
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3ea85dc..87452b4 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -609,6 +609,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_NETCONF	RTNLGRP_IPV4_NETCONF
 	RTNLGRP_IPV6_NETCONF,
 #define RTNLGRP_IPV6_NETCONF	RTNLGRP_IPV6_NETCONF
+	RTNLGRP_MDB,
+#define RTNLGRP_MDB		RTNLGRP_MDB
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 1/2] iproute2: implement add/del mdb entry
From: Cong Wang @ 2012-12-12  8:23 UTC (permalink / raw)
  To: netdev; +Cc: Thomas Graf, Stephen Hemminger, bridge, Cong Wang
In-Reply-To: <1355300590-2390-1-git-send-email-amwang@redhat.com>

From: Cong Wang <amwang@redhat.com>

This patch implements:

	bridge mdb { add | del } dev DEV port PORT grp GROUP

Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Cong Wang <amwang@redhat.com>

---
 bridge/mdb.c              |   76 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_bridge.h |    8 +++++
 include/linux/rtnetlink.h |    4 ++
 3 files changed, 88 insertions(+), 0 deletions(-)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index 390d7f6..4d8a896 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -28,6 +28,7 @@ int filter_index;
 
 static void usage(void)
 {
+	fprintf(stderr, "Usage: bridge mdb { add | del } dev DEV port PORT grp GROUP\n");
 	fprintf(stderr, "       bridge mdb {show} [ dev DEV ]\n");
 	exit(-1);
 }
@@ -153,11 +154,86 @@ static int mdb_show(int argc, char **argv)
 	return 0;
 }
 
+static int mdb_modify(int cmd, int flags, int argc, char **argv)
+{
+	struct {
+		struct nlmsghdr 	n;
+		struct br_port_msg	bpm;
+		char   			buf[1024];
+	} req;
+	struct br_mdb_entry entry;
+	char *d = NULL, *p = NULL, *grp = NULL;
+
+	memset(&req, 0, sizeof(req));
+	memset(&entry, 0, sizeof(entry));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct br_port_msg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.bpm.family = PF_BRIDGE;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			d = *argv;
+		} else if (strcmp(*argv, "grp") == 0) {
+			NEXT_ARG();
+			grp = *argv;
+		} else {
+			if (strcmp(*argv, "port") == 0) {
+				NEXT_ARG();
+				p = *argv;
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+		}
+		argc--; argv++;
+	}
+
+	if (d == NULL || grp == NULL || p == NULL) {
+		fprintf(stderr, "Device, group address and port name are required arguments.\n");
+		exit(-1);
+	}
+
+	req.bpm.ifindex = ll_name_to_index(d);
+	if (req.bpm.ifindex == 0) {
+		fprintf(stderr, "Cannot find device \"%s\"\n", d);
+		return -1;
+	}
+
+	entry.ifindex = ll_name_to_index(p);
+	if (entry.ifindex == 0) {
+		fprintf(stderr, "Cannot find device \"%s\"\n", p);
+		return -1;
+	}
+
+	if (!inet_pton(AF_INET, grp, &entry.addr.u.ip4)) {
+		if (!inet_pton(AF_INET6, grp, &entry.addr.u.ip6)) {
+			fprintf(stderr, "Invalid address \"%s\"\n", grp);
+			return -1;
+		} else
+			entry.addr.proto = htons(ETH_P_IPV6);
+	} else
+		entry.addr.proto = htons(ETH_P_IP);
+
+	addattr_l(&req.n, sizeof(req), MDBA_SET_ENTRY, &entry, sizeof(entry));
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL) < 0)
+		exit(2);
+
+	return 0;
+}
+
 int do_mdb(int argc, char **argv)
 {
 	ll_init_map(&rth);
 
 	if (argc > 0) {
+		if (matches(*argv, "add") == 0)
+			return mdb_modify(RTM_NEWMDB, NLM_F_CREATE|NLM_F_EXCL, argc-1, argv+1);
+		if (matches(*argv, "delete") == 0)
+			return mdb_modify(RTM_DELMDB, 0, argc-1, argv+1);
+
 		if (matches(*argv, "show") == 0 ||
 		    matches(*argv, "lst") == 0 ||
 		    matches(*argv, "list") == 0)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 151a8bb..b3b6a67 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -157,6 +157,7 @@ enum {
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
 struct br_port_msg {
+	__u8  family;
 	__u32 ifindex;
 };
 
@@ -171,4 +172,11 @@ struct br_mdb_entry {
 	} addr;
 };
 
+enum {
+	MDBA_SET_ENTRY_UNSPEC,
+	MDBA_SET_ENTRY,
+	__MDBA_SET_ENTRY_MAX,
+};
+#define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1)
+
 #endif /* _LINUX_IF_BRIDGE_H */
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index c82a159..3ea85dc 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -125,6 +125,10 @@ enum {
 	RTM_GETNETCONF = 82,
 #define RTM_GETNETCONF RTM_GETNETCONF
 
+	RTM_NEWMDB = 84,
+#define RTM_NEWMDB RTM_NEWMDB
+	RTM_DELMDB = 85,
+#define RTM_DELMDB RTM_DELMDB
 	RTM_GETMDB = 86,
 #define RTM_GETMDB RTM_GETMDB
 
-- 
1.7.7.6

^ permalink raw reply related

* Re: [PATCH net-next v5] bridge: export multicast database via netlink
From: Cong Wang @ 2012-12-12  7:59 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, bridge, Herbert Xu, Jesper Dangaard Brouer, Thomas Graf,
	David S. Miller
In-Reply-To: <20121211164856.37ce94fe@nehalam.linuxnetplumber.net>

On Tue, 2012-12-11 at 16:48 -0800, Stephen Hemminger wrote:    
> 
> Applied, but required some manual fixing. It required adding if_bridge.h
> to include/linux in iproute2 exported headers. Also patch still had some fuzz
> against current version.
> 

Thanks, Stephen!

I thought those headers are sync'ed with kernel headers automatically,
so we have to keep them up to date manually.

^ permalink raw reply

* Re: [PATCH net-next] pkt_sched: avoid requeues if possible
From: David Miller @ 2012-12-12  5:24 UTC (permalink / raw)
  To: erdnetdev; +Cc: netdev, jhs, john.r.fastabend
In-Reply-To: <1355277273.27891.166.camel@edumazet-glaptop>

From: Eric Dumazet <erdnetdev@gmail.com>
Date: Tue, 11 Dec 2012 17:54:33 -0800

> From: Eric Dumazet <edumazet@google.com>
> 
> With BQL being deployed, we can more likely have following behavior :
> 
> We dequeue a packet from qdisc in dequeue_skb(), then we realize target
> tx queue is in XOFF state in sch_direct_xmit(), and we have to hold the
> skb into gso_skb for later.
> 
> This shows in stats (tc -s qdisc dev eth0) as requeues.
> 
> Problem of these requeues is that high priority packets can not be
> dequeued as long as this (possibly low prio and big TSO packet) is not
> removed from gso_skb.
> 
> At 1Gbps speed, a full size TSO packet is 500 us of extra latency.
> 
> In some cases, we know that all packets dequeued from a qdisc are
> for a particular and known txq :
> 
> - If device is non multi queue
> - For all MQ/MQPRIO slave qdiscs
> 
> This patch introduces a new qdisc flag, TCQ_F_ONETXQUEUE to mark
> this capability, so that dequeue_skb() is allowed to dequeue a packet
> only if the associated txq is not stopped.
> 
> This indeed reduce latencies for high prio packets (or improve fairness
> with sfq/fq_codel), and almost remove qdisc 'requeues'.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied.

^ permalink raw reply

* Re: [PATCH] solos-pci: fix double-free of TX skb in DMA mode
From: David Miller @ 2012-12-12  5:24 UTC (permalink / raw)
  To: dwmw2; +Cc: netdev, nathan
In-Reply-To: <20121212.002345.2154152659215725592.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Wed, 12 Dec 2012 00:23:45 -0500 (EST)

> From: David Woodhouse <dwmw2@infradead.org>
> Date: Wed, 12 Dec 2012 00:57:14 +0000
> 
>> We weren't clearing card->tx_skb[port] when processing the TX done interrupt.
>> If there wasn't another skb ready to transmit immediately, this led to a
>> double-free because we'd free it *again* next time we did have a packet to
>> send.
>> 
>> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
>> Cc: stable@kernel.org
> 
> Acked-by: David S. Miller <davem@davemloft.net>

Sorry, fingers slipped, I meant "Applied" :-)

^ permalink raw reply

* Re: [PATCH] solos-pci: fix double-free of TX skb in DMA mode
From: David Miller @ 2012-12-12  5:23 UTC (permalink / raw)
  To: dwmw2; +Cc: netdev, nathan
In-Reply-To: <1355273834.23544.37.camel@shinybook.infradead.org>

From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 12 Dec 2012 00:57:14 +0000

> We weren't clearing card->tx_skb[port] when processing the TX done interrupt.
> If there wasn't another skb ready to transmit immediately, this led to a
> double-free because we'd free it *again* next time we did have a packet to
> send.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> Cc: stable@kernel.org

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH 4/5] net: sfc: fix return value check in efx_ptp_probe_channel().
From: David Miller @ 2012-12-12  5:15 UTC (permalink / raw)
  To: tipecaml
  Cc: linux-kernel, kernel-janitors, linux-net-drivers, bhutchings,
	netdev
In-Reply-To: <1355271894-5284-5-git-send-email-tipecaml@gmail.com>

From: Cyril Roelandt <tipecaml@gmail.com>
Date: Wed, 12 Dec 2012 01:24:53 +0100

> The ptp_clock_register() returns ERR_PTR() and never returns NULL. Replace the
> NULL check by a call to IS_ERR().
> 
> Signed-off-by: Cyril Roelandt <tipecaml@gmail.com>

I'll let Ben queue this up.

Probably he'll want to avoid potentially leaving an ERR_PTR
in ptp->phc_clock even if, with this fix, that would be
harmless.

^ permalink raw reply

* Re: [PATCH net-next rfc 2/2] tuntap: allow unpriveledge user to enable and disable queues
From: Jason Wang @ 2012-12-12  3:34 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: pmoore, netdev, linux-kernel, mprivozn
In-Reply-To: <20121211123012.GB15435@redhat.com>

On 12/11/2012 08:30 PM, Michael S. Tsirkin wrote:
> On Tue, Dec 11, 2012 at 07:03:47PM +0800, Jason Wang wrote:
>> Currently, when a file is attached to tuntap through TUNSETQUEUE, the uid/gid
>> and CAP_NET_ADMIN were checked, and we use this ioctl to create and destroy
>> queues. Sometimes, userspace such as qemu need to the ability to enable and
>> disable a specific queue without priveledge since guest operating system may
>> change the number of queues it want use.
>>
>> To support this kind of ability, this patch introduce a flag enabled which is
>> used to track whether the queue is enabled by userspace. And also restrict that
>> only one deivce could be used for a queue to attach. With this patch, the DAC
>> checking when adding queues through IFF_ATTACH_QUEUE is still done and after
>> this, IFF_DETACH_QUEUE/IFF_ATTACH_QUEUE  could be used to disable/enable this
>> queue.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>  drivers/net/tun.c |   81 +++++++++++++++++++++++++++++++++++++++++++++++-----
>>  1 files changed, 73 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index d593f56..43831a7 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -138,6 +138,7 @@ struct tun_file {
>>  	/* only used for fasnyc */
>>  	unsigned int flags;
>>  	u16 queue_index;
>> +	bool enabled;
>>  };
>>  
>>  struct tun_flow_entry {
>> @@ -345,9 +346,11 @@ unlock:
>>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
>>  {
>>  	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile;
>>  	struct tun_flow_entry *e;
>>  	u32 txq = 0;
>>  	u32 numqueues = 0;
>> +	int i;
>>  
>>  	rcu_read_lock();
>>  	numqueues = tun->numqueues;
>> @@ -366,6 +369,19 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
>>  			txq -= numqueues;
>>  	}
>>  
>> +	tfile = rcu_dereference(tun->tfiles[txq]);
>> +	if (unlikely(!tfile->enabled))
> This unlikely tag is suspicious. It should be perfectly
> legal to use less queues than created.

Ok. will remove this check.
>
>> +		/* tun_detach() should make sure there's at least one queue
>> +		 * could be used to do the tranmission.
>> +		 */
>> +		for (i = 0; i < numqueues; i++) {
>> +			tfile = rcu_dereference(tun->tfiles[i]);
>> +			if (tfile->enabled) {
>> +				txq = i;
>> +				break;
>> +			}
>> +		}
>> +
> Worst case this will do a linear scan over all queueus on each packet.
> Instead, I think we need a list of all queues and only install
> the active ones in the array.

Another method is using another variable e.g. active_queues to track how
many queues were enabled. And re-shuffle the pointers during
detaching/attaching to make sure [0, active_queues) to be enabled
queues, and [active_queues, num_queues) to be disabled queues. Then we
could avoid this issue.
>
>>  	rcu_read_unlock();
>>  	return txq;
>>  }
>> @@ -386,6 +402,36 @@ static void tun_set_real_num_queues(struct tun_struct *tun)
>>  	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
>>  }
>>  
>> +static int tun_enable(struct tun_file *tfile)
>> +{
>> +	if (tfile->enabled == true)
> simply if (tfile->enabled)

Right.
>> +		return -EINVAL;
> Actually it's better to have operations be
> idempotent. If it's enabled, enabling should
> be a NOP not an error.

Ok.
>> +
>> +	tfile->enabled = true;
>> +	return 0;
>> +}
>> +
>> +static int tun_disable(struct tun_file *tfile)
>> +{
>> +	struct tun_struct *tun = rcu_dereference_protected(tfile->tun,
>> +							   lockdep_rtnl_is_held());
>> +	u16 index = tfile->queue_index;
>> +
>> +	if (!tun)
>> +		return -EINVAL;
>> +
>> +	if (tun->numqueues == 1)
>> +		return -EINVAL;
> So if there's a single queue we can't disable it,
> but if there are > 1 we can disable them all.
> This seems arbitrary.
>

The question is whether we can allow the userspace to disable all queues
which looks useless to me. So I try to forbid this.
>> +
>> +	BUG_ON(index >= tun->numqueues);
>> +	tfile->enabled = false;
>> +
>> +	synchronize_net();
>> +	tun_flow_delete_by_queue(tun, index);
>> +
>> +	return 0;
>> +}
>> +
>>  static void __tun_detach(struct tun_file *tfile, bool clean)
>>  {
>>  	struct tun_file *ntfile;
>> @@ -446,6 +492,7 @@ static void tun_detach_all(struct net_device *dev)
>>  		BUG_ON(!tfile);
>>  		wake_up_all(&tfile->wq.wait);
>>  		rcu_assign_pointer(tfile->tun, NULL);
>> +		tfile->enabled = false;
>>  		--tun->numqueues;
>>  	}
>>  	BUG_ON(tun->numqueues != 0);
>> @@ -490,6 +537,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
>>  	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>  	sock_hold(&tfile->sk);
>>  	tun->numqueues++;
>> +	tfile->enabled = true;
>>  
>>  	tun_set_real_num_queues(tun);
>>  
>> @@ -672,6 +720,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	if (txq >= tun->numqueues)
>>  		goto drop;
>>  
>> +	/* Drop packet if the queue was not enabled */
>> +	if (!tfile->enabled)
>> +		goto drop;
>> +
>>  	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>  
>>  	BUG_ON(!tfile);
>> @@ -1010,6 +1062,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>  	bool zerocopy = false;
>>  	int err;
>>  
>> +	if (!tfile->enabled)
>> +		return -EINVAL;
>> +
>>  	if (!(tun->flags & TUN_NO_PI)) {
>>  		if ((len -= sizeof(pi)) > total_len)
>>  			return -EINVAL;
>> @@ -1199,6 +1254,9 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>  	struct tun_pi pi = { 0, skb->protocol };
>>  	ssize_t total = 0;
>>  
>> +	if (!tfile->enabled)
>> +		return -EINVAL;
>> +
>>  	if (!(tun->flags & TUN_NO_PI)) {
>>  		if ((len -= sizeof(pi)) < 0)
>>  			return -EINVAL;
>> @@ -1769,15 +1827,21 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>>  		if (dev->netdev_ops != &tap_netdev_ops &&
>>  			dev->netdev_ops != &tun_netdev_ops)
>>  			ret = -EINVAL;
>> -		else if (tun_not_capable(tun))
>> -			ret = -EPERM;
>> -		/* TUNSETIFF is needed to do permission checking */
>> -		else if (tun->numqueues == 0)
>> -			ret = -EPERM;
>> -		else
>> -			ret = tun_attach(tun, file);
>> +		else {
>> +			if (!rcu_dereference(tfile->tun)) {
> Should be rcu_dereference_protected.

True.
>
>> +				if (tun_not_capable(tun) ||
>> +				    tun->numqueues == 0)
>> +					ret = -EPERM;
>> +				else
>> +					ret = tun_attach(tun, file);
>> +			}
>> +			else {
>> +				/* FIXME: permission check? */
>> +				ret = tun_enable(tfile);
>> +			}
>> +		}
>>  	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE)
>> -		__tun_detach(tfile, false);
>> +		tun_disable(tfile);
>>  	else
>>  		ret = -EINVAL;
>>  
>> @@ -2085,6 +2149,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>  	tfile->socket.file = file;
>>  	tfile->socket.ops = &tun_socket_ops;
>>  
>> +	tfile->enabled = false;
>>  	sock_init_data(&tfile->socket, &tfile->sk);
>>  	sk_change_net(&tfile->sk, tfile->net);
>>  
>> -- 
>> 1.7.1
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox