* [PATCH 1/6] vxlan: Allow for VXLAN extensions to be implemented
2015-01-12 12:26 [PATCH 0/6 net-next v3] VXLAN Group Policy Extension Thomas Graf
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 12:26 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
` (4 subsequent siblings)
5 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: dev, netdev
The VXLAN receive code is currently conservative in what it accepts and
will reject any frame that uses any of the reserved VXLAN protocol fields.
The VXLAN draft specifies that "reserved fields MUST be set to zero on
transmit and ignored on receive.".
Retain the current conservative parsing behaviour by default but allow
these fields to be used by VXLAN extensions which are explicitly enabled
on the VXLAN socket or the VXLAN net_device.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- No change
v1->v2:
- No change
drivers/net/vxlan.c | 29 +++++++++++++++++++----------
include/net/vxlan.h | 32 +++++++++++++++++++++++++++++---
2 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2ab0922..4d52aa9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -65,7 +65,7 @@
#define VXLAN_VID_MASK (VXLAN_N_VID - 1)
#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
-#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
+#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags default value. */
/* UDP port for VXLAN traffic.
* The IANA assigned port is 4789, but the Linux default is 8472
@@ -1100,22 +1100,28 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (!pskb_may_pull(skb, VXLAN_HLEN))
goto error;
+ vs = rcu_dereference_sk_user_data(sk);
+ if (!vs)
+ goto drop;
+
/* Return packets with reserved bits set */
vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
- if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
- (vxh->vx_vni & htonl(0xff))) {
- netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
- ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
- goto error;
+
+ /* For backwards compatibility, only allow reserved fields to be
+ * used by VXLAN extensions if explicitly requested.
+ */
+ if (vs->exts) {
+ if (!vxh->vni_present)
+ goto error_invalid_header;
+ } else {
+ if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+ (vxh->vx_vni & htonl(0xff)))
+ goto error_invalid_header;
}
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
- vs = rcu_dereference_sk_user_data(sk);
- if (!vs)
- goto drop;
-
vs->rcv(vs, skb, vxh->vx_vni);
return 0;
@@ -1124,6 +1130,9 @@ drop:
kfree_skb(skb);
return 0;
+error_invalid_header:
+ netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
+ ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
error:
/* Return non vxlan pkt */
return 1;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 903461a..3e98d31 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -11,10 +11,35 @@
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
-/* VXLAN protocol header */
+/* VXLAN protocol header:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|I|R|R|R| Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * I = 1 VXLAN Network Identifier (VNI) present
+ */
struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
+ union {
+ struct {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+ __u8 reserved_flags1:3,
+ vni_present:1,
+ reserved_flags2:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 reserved_flags2:4,
+ vni_present:1,
+ reserved_flags1:3;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __u8 vx_reserved1;
+ __be16 vx_reserved2;
+ };
+ __be32 vx_flags;
+ };
+ __be32 vx_vni;
};
struct vxlan_sock;
@@ -25,6 +50,7 @@ struct vxlan_sock {
struct hlist_node hlist;
vxlan_rcv_t *rcv;
void *data;
+ u32 exts;
struct work_struct del_work;
struct socket *sock;
struct rcu_head rcu;
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 12:26 [PATCH 0/6 net-next v3] VXLAN Group Policy Extension Thomas Graf
2015-01-12 12:26 ` [PATCH 1/6] vxlan: Allow for VXLAN extensions to be implemented Thomas Graf
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 19:23 ` Jesse Gross
2015-01-12 12:26 ` [PATCH 3/6] vxlan: Only bind to sockets with correct extensions enabled Thomas Graf
` (3 subsequent siblings)
5 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: dev, netdev
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata are mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark; the
upper 16 bits are used for flags.
SELinux allows labels to be managed to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow security contexts to be mapped to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- Removed empty struct vxlan_gbp as spotted by Alexei
v1->v2:
- split GBP header definition into separate struct vxlanhdr_gbp as requested
by Alexei
drivers/net/vxlan.c | 161 ++++++++++++++++++++++++++++++------------
include/net/vxlan.h | 70 ++++++++++++++++--
include/uapi/linux/if_link.h | 8 +++
net/openvswitch/vport-vxlan.c | 9 ++-
4 files changed, 195 insertions(+), 53 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 4d52aa9..b148739 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -132,6 +132,7 @@ struct vxlan_dev {
__u8 tos; /* TOS override */
__u8 ttl;
u32 flags; /* VXLAN_F_* in vxlan.h */
+ u32 exts; /* Enabled extensions */
struct work_struct sock_work;
struct work_struct igmp_join;
@@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
continue;
vh2 = (struct vxlanhdr *)(p->data + off_vx);
- if (vh->vx_vni != vh2->vx_vni) {
+ if (vh->vx_flags != vh2->vx_flags ||
+ vh->vx_vni != vh2->vx_vni) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -1095,6 +1097,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
+ struct vxlan_metadata md = {0};
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1113,6 +1116,22 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (vs->exts) {
if (!vxh->vni_present)
goto error_invalid_header;
+
+ if (vxh->gbp_present) {
+ struct vxlanhdr_gbp *gbp;
+
+ if (!(vs->exts & VXLAN_EXT_GBP))
+ goto error_invalid_header;
+
+ gbp = (struct vxlanhdr_gbp *)vxh;
+ md.gbp = ntohs(gbp->policy_id);
+
+ if (gbp->dont_learn)
+ md.gbp |= VXLAN_GBP_DONT_LEARN;
+
+ if (gbp->policy_applied)
+ md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+ }
} else {
if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
(vxh->vx_vni & htonl(0xff)))
@@ -1122,7 +1141,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
- vs->rcv(vs, skb, vxh->vx_vni);
+ md.vni = vxh->vx_vni;
+ vs->rcv(vs, skb, &md);
return 0;
drop:
@@ -1138,8 +1158,8 @@ error:
return 1;
}
-static void vxlan_rcv(struct vxlan_sock *vs,
- struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct iphdr *oip = NULL;
struct ipv6hdr *oip6 = NULL;
@@ -1150,7 +1170,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
int err = 0;
union vxlan_addr *remote_ip;
- vni = ntohl(vx_vni) >> 8;
+ vni = ntohl(md->vni) >> 8;
/* Is this VNI defined? */
vxlan = vxlan_vs_find_vni(vs, vni);
if (!vxlan)
@@ -1184,6 +1204,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
goto drop;
skb_reset_network_header(skb);
+ skb->mark = md->gbp;
if (oip6)
err = IP6_ECN_decapsulate(oip6, skb);
@@ -1533,15 +1554,57 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
}
+static int vxlan_build_hdr(struct sk_buff *skb, struct vxlan_sock *vs,
+ int min_headroom, struct vxlan_metadata *md)
+{
+ struct vxlanhdr *vxh;
+ int err;
+
+ /* Need space for new headers (invalidates iph ptr) */
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (WARN_ON(!skb))
+ return -ENOMEM;
+
+ vxh = (struct vxlanhdr *)__skb_push(skb, sizeof(*vxh));
+ vxh->vx_flags = htonl(VXLAN_FLAGS);
+ vxh->vx_vni = md->vni;
+
+ if (vs->exts) {
+ if (vs->exts & VXLAN_EXT_GBP) {
+ struct vxlanhdr_gbp *gbp;
+
+ gbp = (struct vxlanhdr_gbp *)vxh;
+ vxh->gbp_present = 1;
+
+ if (md->gbp & VXLAN_GBP_DONT_LEARN)
+ gbp->dont_learn = 1;
+
+ if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
+ gbp->policy_applied = 1;
+
+ gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
+ }
+ }
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
struct dst_entry *dst, struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr, __u8 prio, __u8 ttl,
- __be16 src_port, __be16 dst_port, __be32 vni,
- bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
@@ -1558,24 +1621,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct ipv6hdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- goto err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb)) {
- err = -ENOMEM;
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
goto err;
- }
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
ttl, src_port, dst_port);
@@ -1589,9 +1637,9 @@ err:
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !vs->sock->sk->sk_no_check_tx;
@@ -1604,22 +1652,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct iphdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
return err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb))
- return -ENOMEM;
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
ttl, df, src_port, dst_port, xnet);
@@ -1679,6 +1714,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *old_iph;
struct flowi4 fl4;
union vxlan_addr *dst;
+ struct vxlan_metadata md;
__be16 src_port = 0, dst_port;
u32 vni;
__be16 df = 0;
@@ -1749,11 +1785,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
fl4.saddr, dst->sin.sin_addr.s_addr,
- tos, ttl, df, src_port, dst_port,
- htonl(vni << 8),
+ tos, ttl, df, src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
if (err < 0) {
/* skb is already freed. */
@@ -1806,10 +1843,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ttl = ttl ? : ip6_dst_hoplimit(ndst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
dev, &fl6.saddr, &fl6.daddr, 0, ttl,
- src_port, dst_port, htonl(vni << 8),
+ src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
#endif
}
@@ -2210,6 +2249,11 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
+ [IFLA_VXLAN_EXTENSION] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy vxlan_ext_policy[IFLA_VXLAN_EXT_MAX + 1] = {
+ [IFLA_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -2246,6 +2290,18 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
}
}
+ if (data[IFLA_VXLAN_EXTENSION]) {
+ int err;
+
+ err = nla_validate_nested(data[IFLA_VXLAN_EXTENSION],
+ IFLA_VXLAN_EXT_MAX, vxlan_ext_policy);
+ if (err < 0) {
+ pr_debug("invalid VXLAN extension configuration: %d\n",
+ err);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -2400,6 +2456,18 @@ static void vxlan_sock_work(struct work_struct *work)
dev_put(vxlan->dev);
}
+static void configure_vxlan_exts(struct vxlan_dev *vxlan, struct nlattr *attr)
+{
+ struct nlattr *exts[IFLA_VXLAN_EXT_MAX+1];
+
+ /* Validated in vxlan_validate() */
+ if (nla_parse_nested(exts, IFLA_VXLAN_EXT_MAX, attr, NULL) < 0)
+ BUG();
+
+ if (exts[IFLA_VXLAN_EXT_GBP])
+ vxlan->exts |= VXLAN_EXT_GBP;
+}
+
static int vxlan_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
@@ -2525,6 +2593,9 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
+ if (data[IFLA_VXLAN_EXTENSION])
+ configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
+
if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
vxlan->dst_port)) {
pr_info("duplicate VNI %u\n", vni);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3e98d31..66ec53c 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -11,13 +11,62 @@
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
+/*
+ * VXLAN Group Based Policy Extension:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * D = Don't Learn bit. When set, this bit indicates that the egress
+ * VTEP MUST NOT learn the source address of the encapsulated frame.
+ *
+ * A = Indicates that the group policy has already been applied to
+ * this packet. Policies MUST NOT be applied by devices when the
+ * A bit is set.
+ *
+ * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
+ */
+struct vxlanhdr_gbp {
+ __u8 vx_flags;
+#ifdef __LITTLE_ENDIAN_BITFIELD
+ __u8 reserved_flags1:3,
+ policy_applied:1,
+ reserved_flags2:2,
+ dont_learn:1,
+ reserved_flags3:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 reserved_flags1:1,
+ dont_learn:1,
+ reserved_flags2:2,
+ policy_applied:1,
+ reserved_flags3:3;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __be16 policy_id;
+ __be32 vx_vni;
+};
+
+/* skb->mark mapping
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+#define VXLAN_GBP_DONT_LEARN (BIT(6) << 16)
+#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16)
+#define VXLAN_GBP_ID_MASK (0xFFFF)
+
/* VXLAN protocol header:
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |R|R|R|R|I|R|R|R| Reserved |
+ * |G|R|R|R|I|R|R|R| Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | VXLAN Network Identifier (VNI) | Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
+ * G = 1 Group Policy (VXLAN-GBP)
* I = 1 VXLAN Network Identifier (VNI) present
*/
struct vxlanhdr {
@@ -26,9 +75,11 @@ struct vxlanhdr {
#ifdef __LITTLE_ENDIAN_BITFIELD
__u8 reserved_flags1:3,
vni_present:1,
- reserved_flags2:4;
+ reserved_flags2:3,
+ gbp_present:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 reserved_flags2:4,
+ __u8 gbp_present:1,
+ reserved_flags2:3,
vni_present:1,
reserved_flags1:3;
#else
@@ -42,8 +93,16 @@ struct vxlanhdr {
__be32 vx_vni;
};
+struct vxlan_metadata {
+ __be32 vni;
+ u32 gbp;
+};
+
struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
+ struct vxlan_metadata *md);
+
+#define VXLAN_EXT_GBP BIT(0)
/* per UDP socket information */
struct vxlan_sock {
@@ -78,7 +137,8 @@ void vxlan_sock_release(struct vxlan_sock *vs);
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet);
+ __be16 src_port, __be16 dst_port, struct vxlan_metadata *md,
+ bool xnet);
static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
netdev_features_t features)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f7d0d2d..9f07bf5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -370,10 +370,18 @@ enum {
IFLA_VXLAN_UDP_CSUM,
IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
+ IFLA_VXLAN_EXTENSION,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
+enum {
+ IFLA_VXLAN_EXT_UNSPEC,
+ IFLA_VXLAN_EXT_GBP,
+ __IFLA_VXLAN_EXT_MAX,
+};
+#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
+
struct ifla_vxlan_port_range {
__be16 low;
__be16 high;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index d7c46b3..dd68c97 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -59,7 +59,8 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
}
/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct ovs_tunnel_info tun_info;
struct vport *vport = vs->data;
@@ -68,7 +69,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
/* Save outer tunnel values */
iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(vx_vni) >> 8);
+ key = cpu_to_be64(ntohl(md->vni) >> 8);
ovs_flow_tun_info_init(&tun_info, iph,
udp_hdr(skb)->source, udp_hdr(skb)->dest,
key, TUNNEL_KEY, NULL, 0);
@@ -146,6 +147,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
struct vxlan_port *vxlan_port = vxlan_vport(vport);
__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
struct ovs_key_ipv4_tunnel *tun_key;
+ struct vxlan_metadata md;
struct rtable *rt;
struct flowi4 fl;
__be16 src_port;
@@ -178,12 +180,13 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
skb->ignore_df = 1;
src_port = udp_flow_src_port(net, skb, 0, 0, true);
+ md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
fl.saddr, tun_key->ipv4_dst,
tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
src_port, dst_port,
- htonl(be64_to_cpu(tun_key->tun_id) << 8),
+ &md,
false);
if (err < 0)
ip_rt_put(rt);
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 12:26 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
@ 2015-01-12 19:23 ` Jesse Gross
[not found] ` <CAEP_g=8TqGnftZa_scKODa2ra7gsV6ov_5J+Lbfq+4bFDZjiBw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 44+ messages in thread
From: Jesse Gross @ 2015-01-12 19:23 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 4d52aa9..b148739 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
> continue;
>
> vh2 = (struct vxlanhdr *)(p->data + off_vx);
> - if (vh->vx_vni != vh2->vx_vni) {
> + if (vh->vx_flags != vh2->vx_flags ||
> + vh->vx_vni != vh2->vx_vni) {
It's probably better to do a memcmp over the entire header. There's no
guarantee that new fields will be entirely represented by flags.
> diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
> index d7c46b3..dd68c97 100644
> --- a/net/openvswitch/vport-vxlan.c
> +++ b/net/openvswitch/vport-vxlan.c
> @@ -146,6 +147,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
> struct vxlan_port *vxlan_port = vxlan_vport(vport);
> __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
> struct ovs_key_ipv4_tunnel *tun_key;
> + struct vxlan_metadata md;
It might be a good idea to zero out 'md', even if not strictly required.
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 3/6] vxlan: Only bind to sockets with correct extensions enabled
2015-01-12 12:26 [PATCH 0/6 net-next v3] VXLAN Group Policy Extension Thomas Graf
2015-01-12 12:26 ` [PATCH 1/6] vxlan: Allow for VXLAN extensions to be implemented Thomas Graf
2015-01-12 12:26 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 12:26 ` [PATCH 4/6] openvswitch: Rename GENEVE_TUN_OPTS() to TUN_METADATA_OPTS() Thomas Graf
` (2 subsequent siblings)
5 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: dev, netdev
A VXLAN net_device looking for an appropriate socket may only consider
a socket which has a matching set of extensions enabled. If the
extensions don't match, return a conflict to have the caller create a
distinct socket with distinct port.
The OVS VXLAN port is kept unaware of extensions at this point.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- No change
v1->v2:
- Improved commit message, reported by Jesse
drivers/net/vxlan.c | 35 +++++++++++++++++++++--------------
include/net/vxlan.h | 2 +-
net/openvswitch/vport-vxlan.c | 2 +-
3 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index b148739..61e1112 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -271,14 +271,15 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
}
/* Find VXLAN socket based on network namespace, address family and UDP port */
-static struct vxlan_sock *vxlan_find_sock(struct net *net,
- sa_family_t family, __be16 port)
+static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
+ __be16 port, u32 exts)
{
struct vxlan_sock *vs;
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
if (inet_sk(vs->sock->sk)->inet_sport == port &&
- inet_sk(vs->sock->sk)->sk.sk_family == family)
+ inet_sk(vs->sock->sk)->sk.sk_family == family &&
+ vs->exts == exts)
return vs;
}
return NULL;
@@ -298,11 +299,12 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
- sa_family_t family, __be16 port)
+ sa_family_t family, __be16 port,
+ u32 exts)
{
struct vxlan_sock *vs;
- vs = vxlan_find_sock(net, family, port);
+ vs = vxlan_find_sock(net, family, port, exts);
if (!vs)
return NULL;
@@ -1776,7 +1778,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
ip_rt_put(rt);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
- dst->sa.sa_family, dst_port);
+ dst->sa.sa_family, dst_port,
+ vxlan->exts);
if (!dst_vxlan)
goto tx_error;
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
@@ -1835,7 +1838,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
dst_release(ndst);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
- dst->sa.sa_family, dst_port);
+ dst->sa.sa_family, dst_port,
+ vxlan->exts);
if (!dst_vxlan)
goto tx_error;
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
@@ -2005,7 +2009,7 @@ static int vxlan_init(struct net_device *dev)
spin_lock(&vn->sock_lock);
vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
- vxlan->dst_port);
+ vxlan->dst_port, vxlan->exts);
if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
/* If we have a socket with same port already, reuse it */
vxlan_vs_add_dev(vs, vxlan);
@@ -2359,7 +2363,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
vxlan_rcv_t *rcv, void *data,
- u32 flags)
+ u32 flags, u32 exts)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_sock *vs;
@@ -2387,6 +2391,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
atomic_set(&vs->refcnt, 1);
vs->rcv = rcv;
vs->data = data;
+ vs->exts = exts;
/* Initialize the vxlan udp offloads structure */
vs->udp_offloads.port = port;
@@ -2411,13 +2416,14 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
vxlan_rcv_t *rcv, void *data,
- bool no_share, u32 flags)
+ bool no_share, u32 flags,
+ u32 exts)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_sock *vs;
bool ipv6 = flags & VXLAN_F_IPV6;
- vs = vxlan_socket_create(net, port, rcv, data, flags);
+ vs = vxlan_socket_create(net, port, rcv, data, flags, exts);
if (!IS_ERR(vs))
return vs;
@@ -2425,7 +2431,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
return vs;
spin_lock(&vn->sock_lock);
- vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port);
+ vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, exts);
if (vs && ((vs->rcv != rcv) ||
!atomic_add_unless(&vs->refcnt, 1, 0)))
vs = ERR_PTR(-EBUSY);
@@ -2447,7 +2453,8 @@ static void vxlan_sock_work(struct work_struct *work)
__be16 port = vxlan->dst_port;
struct vxlan_sock *nvs;
- nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags);
+ nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags,
+ vxlan->exts);
spin_lock(&vn->sock_lock);
if (!IS_ERR(nvs))
vxlan_vs_add_dev(nvs, vxlan);
@@ -2597,7 +2604,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
- vxlan->dst_port)) {
+ vxlan->dst_port, vxlan->exts)) {
pr_info("duplicate VNI %u\n", vni);
return -EEXIST;
}
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 66ec53c..5ba49d5 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -130,7 +130,7 @@ struct vxlan_sock {
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
vxlan_rcv_t *rcv, void *data,
- bool no_share, u32 flags);
+ bool no_share, u32 flags, u32 exts);
void vxlan_sock_release(struct vxlan_sock *vs);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index dd68c97..266c595 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -128,7 +128,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
vxlan_port = vxlan_vport(vport);
strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0);
+ vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0, 0);
if (IS_ERR(vs)) {
ovs_vport_free(vport);
return (void *)vs;
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* [PATCH 4/6] openvswitch: Rename GENEVE_TUN_OPTS() to TUN_METADATA_OPTS()
2015-01-12 12:26 [PATCH 0/6 net-next v3] VXLAN Group Policy Extension Thomas Graf
` (2 preceding siblings ...)
2015-01-12 12:26 ` [PATCH 3/6] vxlan: Only bind to sockets with correct extensions enabled Thomas Graf
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 21:38 ` Jesse Gross
[not found] ` <cover.1421064100.git.tgraf-G/eBtMaohhA@public.gmane.org>
2015-01-12 12:26 ` [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension Thomas Graf
5 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: dev, netdev
Also factors out Geneve validation code into a new separate function
validate_and_copy_geneve_opts().
A subsequent patch will introduce VXLAN options. Rename the existing
GENEVE_TUN_OPTS() to reflect its extended purpose of carrying generic
tunnel metadata options.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- No change
v1->v2:
- Don't rename genev_tun_opt_from_nlattr() and keep it Geneve specific,
pointed out by Jesse.
- Factor out Geneve specific validation code into separate function as
requested by Jesse.
net/openvswitch/flow.c | 2 +-
net/openvswitch/flow.h | 14 ++++----
net/openvswitch/flow_netlink.c | 72 +++++++++++++++++++++++-------------------
3 files changed, 47 insertions(+), 41 deletions(-)
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index da2fae0..41f2dfd 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -691,7 +691,7 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
8)) - 1
> sizeof(key->tun_opts));
- memcpy(GENEVE_OPTS(key, tun_info->options_len),
+ memcpy(TUN_METADATA_OPTS(key, tun_info->options_len),
tun_info->options, tun_info->options_len);
key->tun_opts_len = tun_info->options_len;
} else {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a8b30f3..d3d0a40 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -53,7 +53,7 @@ struct ovs_key_ipv4_tunnel {
struct ovs_tunnel_info {
struct ovs_key_ipv4_tunnel tunnel;
- const struct geneve_opt *options;
+ const void *options;
u8 options_len;
};
@@ -61,10 +61,10 @@ struct ovs_tunnel_info {
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
*/
-#define GENEVE_OPTS(flow_key, opt_len) \
- ((struct geneve_opt *)((flow_key)->tun_opts + \
- FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \
- opt_len))
+#define TUN_METADATA_OFFSET(opt_len) \
+ (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len)
+#define TUN_METADATA_OPTS(flow_key, opt_len) \
+ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
__be32 saddr, __be32 daddr,
@@ -73,7 +73,7 @@ static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
__be16 tp_dst,
__be64 tun_id,
__be16 tun_flags,
- const struct geneve_opt *opts,
+ const void *opts,
u8 opts_len)
{
tun_info->tunnel.tun_id = tun_id;
@@ -105,7 +105,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
__be16 tp_dst,
__be64 tun_id,
__be16 tun_flags,
- const struct geneve_opt *opts,
+ const void *opts,
u8 opts_len)
{
__ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d1eecf7..8980d32 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -432,8 +432,7 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a,
SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
}
- opt_key_offset = (unsigned long)GENEVE_OPTS((struct sw_flow_key *)0,
- nla_len(a));
+ opt_key_offset = TUN_METADATA_OFFSET(nla_len(a));
SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a),
nla_len(a), is_mask);
return 0;
@@ -558,8 +557,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *output,
- const struct geneve_opt *tun_opts,
- int swkey_tun_opts_len)
+ const void *tun_opts, int swkey_tun_opts_len)
{
if (output->tun_flags & TUNNEL_KEY &&
nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
@@ -600,8 +598,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *output,
- const struct geneve_opt *tun_opts,
- int swkey_tun_opts_len)
+ const void *tun_opts, int swkey_tun_opts_len)
{
struct nlattr *nla;
int err;
@@ -1148,10 +1145,10 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
goto nla_put_failure;
if ((swkey->tun_key.ipv4_dst || is_mask)) {
- const struct geneve_opt *opts = NULL;
+ const void *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
- opts = GENEVE_OPTS(output, swkey->tun_opts_len);
+ opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts,
swkey->tun_opts_len))
@@ -1540,6 +1537,34 @@ void ovs_match_init(struct sw_flow_match *match,
}
}
+static int validate_and_copy_geneve_opts(struct sw_flow_key *key)
+{
+ struct geneve_opt *option;
+ int opts_len = key->tun_opts_len;
+ bool crit_opt = false;
+
+ option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len);
+ while (opts_len > 0) {
+ int len;
+
+ if (opts_len < sizeof(*option))
+ return -EINVAL;
+
+ len = sizeof(*option) + option->length * 4;
+ if (len > opts_len)
+ return -EINVAL;
+
+ crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
+
+ option = (struct geneve_opt *)((u8 *)option + len);
+ opts_len -= len;
+ };
+
+ key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+
+ return 0;
+}
+
static int validate_and_copy_set_tun(const struct nlattr *attr,
struct sw_flow_actions **sfa, bool log)
{
@@ -1555,28 +1580,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
return err;
if (key.tun_opts_len) {
- struct geneve_opt *option = GENEVE_OPTS(&key,
- key.tun_opts_len);
- int opts_len = key.tun_opts_len;
- bool crit_opt = false;
-
- while (opts_len > 0) {
- int len;
-
- if (opts_len < sizeof(*option))
- return -EINVAL;
-
- len = sizeof(*option) + option->length * 4;
- if (len > opts_len)
- return -EINVAL;
-
- crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE);
-
- option = (struct geneve_opt *)((u8 *)option + len);
- opts_len -= len;
- };
-
- key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+ err = validate_and_copy_geneve_opts(&key);
+ if (err < 0)
+ return err;
};
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
@@ -1597,9 +1603,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
* everything else will go away after flow setup. We can append
* it to tun_info and then point there.
*/
- memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len),
- key.tun_opts_len);
- tun_info->options = (struct geneve_opt *)(tun_info + 1);
+ memcpy((tun_info + 1),
+ TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len);
+ tun_info->options = (tun_info + 1);
} else {
tun_info->options = NULL;
}
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread* Re: [PATCH 4/6] openvswitch: Rename GENEVE_TUN_OPTS() to TUN_METADATA_OPTS()
2015-01-12 12:26 ` [PATCH 4/6] openvswitch: Rename GENEVE_TUN_OPTS() to TUN_METADATA_OPTS() Thomas Graf
@ 2015-01-12 21:38 ` Jesse Gross
2015-01-12 23:00 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Jesse Gross @ 2015-01-12 21:38 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index d1eecf7..8980d32 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> +static int validate_and_copy_geneve_opts(struct sw_flow_key *key)
> +{
This function doesn't actually do any copying, so maybe there is a
more descriptive name for it?
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 4/6] openvswitch: Rename GENEVE_TUN_OPTS() to TUN_METADATA_OPTS()
2015-01-12 21:38 ` Jesse Gross
@ 2015-01-12 23:00 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 23:00 UTC (permalink / raw)
To: Jesse Gross
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On 01/12/15 at 01:38pm, Jesse Gross wrote:
> On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> > index d1eecf7..8980d32 100644
> > --- a/net/openvswitch/flow_netlink.c
> > +++ b/net/openvswitch/flow_netlink.c
> > +static int validate_and_copy_geneve_opts(struct sw_flow_key *key)
> > +{
>
> This function doesn't actually do any copying, so maybe there is a
> more descriptive name for it?
Sure, will rename.
^ permalink raw reply [flat|nested] 44+ messages in thread
[parent not found: <cover.1421064100.git.tgraf-G/eBtMaohhA@public.gmane.org>]
* [PATCH 5/6] openvswitch: Allow for any level of nesting in flow attributes
[not found] ` <cover.1421064100.git.tgraf-G/eBtMaohhA@public.gmane.org>
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 19:41 ` Jesse Gross
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem-fT/PcQaiUtIeIZ0/mPfg9Q, jesse-l0M0P4e3n4LQT0dZR+AlfA,
stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
pshelar-l0M0P4e3n4LQT0dZR+AlfA, therbert-hpIqsD4AKlfQT0dZR+AlfA,
alexei.starovoitov-Re5JQEeQqe8AvxtiuMwx3w
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA
nlattr_set() is currently hardcoded to two levels of nesting. This change
introduces struct ovs_len_tbl to define minimal length requirements plus
next level nesting tables to traverse the key attributes to arbitrary depth.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- No change
v1->v2:
- New patch to allow nested Netlink attributes inside
OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS
net/openvswitch/flow_netlink.c | 106 ++++++++++++++++++++++-------------------
1 file changed, 56 insertions(+), 50 deletions(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 8980d32..457ccf3 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -50,6 +50,13 @@
#include "flow_netlink.h"
+struct ovs_len_tbl {
+ int len;
+ const struct ovs_len_tbl *next;
+};
+
+#define OVS_ATTR_NESTED -1
+
static void update_range(struct sw_flow_match *match,
size_t offset, size_t size, bool is_mask)
{
@@ -289,29 +296,44 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(28); /* OVS_KEY_ATTR_ND */
}
+static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
+ [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) },
+ [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) },
+ [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) },
+ [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 },
+ [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 },
+ [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) },
+ [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
+ [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
+ [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
+};
+
/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
-static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
- [OVS_KEY_ATTR_ENCAP] = -1,
- [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
- [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
- [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32),
- [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
- [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
- [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
- [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
- [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
- [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
- [OVS_KEY_ATTR_TCP_FLAGS] = sizeof(__be16),
- [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
- [OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
- [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
- [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
- [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
- [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
- [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32),
- [OVS_KEY_ATTR_DP_HASH] = sizeof(u32),
- [OVS_KEY_ATTR_TUNNEL] = -1,
- [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls),
+static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
+ [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED },
+ [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) },
+ [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) },
+ [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) },
+ [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) },
+ [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) },
+ [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) },
+ [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) },
+ [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) },
+ [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) },
+ [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) },
+ [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) },
+ [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) },
+ [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED,
+ .next = ovs_tunnel_key_lens, },
+ [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) },
};
static bool is_all_zero(const u8 *fp, size_t size)
@@ -352,8 +374,8 @@ static int __parse_flow_nlattrs(const struct nlattr *attr,
return -EINVAL;
}
- expected_len = ovs_key_lens[type];
- if (nla_len(nla) != expected_len && expected_len != -1) {
+ expected_len = ovs_key_lens[type].len;
+ if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) {
OVS_NLERR(log, "Key %d has unexpected len %d expected %d",
type, nla_len(nla), expected_len);
return -EINVAL;
@@ -451,30 +473,16 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
int type = nla_type(a);
int err;
- static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
- [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
- [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
- [OVS_TUNNEL_KEY_ATTR_TOS] = 1,
- [OVS_TUNNEL_KEY_ATTR_TTL] = 1,
- [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
- [OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
- [OVS_TUNNEL_KEY_ATTR_TP_SRC] = sizeof(u16),
- [OVS_TUNNEL_KEY_ATTR_TP_DST] = sizeof(u16),
- [OVS_TUNNEL_KEY_ATTR_OAM] = 0,
- [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1,
- };
-
if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
OVS_NLERR(log, "Tunnel attr %d out of range max %d",
type, OVS_TUNNEL_KEY_ATTR_MAX);
return -EINVAL;
}
- if (ovs_tunnel_key_lens[type] != nla_len(a) &&
- ovs_tunnel_key_lens[type] != -1) {
+ if (ovs_tunnel_key_lens[type].len != nla_len(a) &&
+ ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) {
OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d",
- type, nla_len(a), ovs_tunnel_key_lens[type]);
+ type, nla_len(a), ovs_tunnel_key_lens[type].len);
return -EINVAL;
}
@@ -912,18 +920,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
return 0;
}
-static void nlattr_set(struct nlattr *attr, u8 val, bool is_attr_mask_key)
+static void nlattr_set(struct nlattr *attr, u8 val,
+ const struct ovs_len_tbl *tbl)
{
struct nlattr *nla;
int rem;
/* The nlattr stream should already have been validated */
nla_for_each_nested(nla, attr, rem) {
- /* We assume that ovs_key_lens[type] == -1 means that type is a
- * nested attribute
- */
- if (is_attr_mask_key && ovs_key_lens[nla_type(nla)] == -1)
- nlattr_set(nla, val, false);
+ if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED)
+ nlattr_set(nla, val, tbl[nla_type(nla)].next);
else
memset(nla_data(nla), val, nla_len(nla));
}
@@ -931,7 +937,7 @@ static void nlattr_set(struct nlattr *attr, u8 val, bool is_attr_mask_key)
static void mask_set_nlattr(struct nlattr *attr, u8 val)
{
- nlattr_set(attr, val, true);
+ nlattr_set(attr, val, ovs_key_lens);
}
/**
@@ -1628,8 +1634,8 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
if (key_type > OVS_KEY_ATTR_MAX ||
- (ovs_key_lens[key_type] != nla_len(ovs_key) &&
- ovs_key_lens[key_type] != -1))
+ (ovs_key_lens[key_type].len != nla_len(ovs_key) &&
+ ovs_key_lens[key_type].len != OVS_ATTR_NESTED))
return -EINVAL;
switch (key_type) {
--
1.9.3
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply related [flat|nested] 44+ messages in thread* Re: [PATCH 5/6] openvswitch: Allow for any level of nesting in flow attributes
2015-01-12 12:26 ` [PATCH 5/6] openvswitch: Allow for any level of nesting in flow attributes Thomas Graf
@ 2015-01-12 19:41 ` Jesse Gross
2015-01-12 23:04 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Jesse Gross @ 2015-01-12 19:41 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 8980d32..457ccf3 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> +static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
> + [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) },
> + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) },
> + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) },
> + [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 },
> + [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 },
> + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 },
> + [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 },
> + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) },
> + [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
> + [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
> + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
> +};
Geneve isn't really nested - maybe we should break it out into a
separate name? OVS_ATTR_VARIABLE? We shouldn't really try to traverse
it as netlink attributes anyways.
> +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
> + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED },
This is not new but I think that encap isn't really handled correctly.
In theory, there could be multiple levels of nesting here (either
another encap or some other element) but there's no 'next' link.
However, I don't believe the situation arises today.
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 5/6] openvswitch: Allow for any level of nesting in flow attributes
2015-01-12 19:41 ` Jesse Gross
@ 2015-01-12 23:04 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 23:04 UTC (permalink / raw)
To: Jesse Gross
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On 01/12/15 at 11:41am, Jesse Gross wrote:
> On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
> > +};
>
> Geneve isn't really nested - maybe we should break it out into a
> separate name? OVS_ATTR_VARIABLE? We shouldn't really try to traverse
> it as netlink attributes anyways.
Agreed. I intentionally kept the behaviour intact in this series and was
about to fix this in a separate patch to not make this series any
more complex than it already is ;-)
> > +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
> > + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED },
>
> This is not new but I think that encap isn't really handled correctly.
> In theory, there could be multiple levels of nesting here (either
> another encap or some other element) but there's no 'next' link.
> However, I don't believe the situation arises today.
Right. The behaviour is identical to before the patch. I just replaced
the -1 with a name. Agreed that we should clean this up.
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension
2015-01-12 12:26 [PATCH 0/6 net-next v3] VXLAN Group Policy Extension Thomas Graf
` (4 preceding siblings ...)
[not found] ` <cover.1421064100.git.tgraf-G/eBtMaohhA@public.gmane.org>
@ 2015-01-12 12:26 ` Thomas Graf
2015-01-12 21:54 ` Jesse Gross
5 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-12 12:26 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: dev, netdev
Introduces support for the group policy extension to the VXLAN virtual
port. The extension is disabled by default and only enabled if the user
has provided the respective configuration.
ovs-vsctl add-port br0 vxlan0 -- \
set Interface vxlan0 type=vxlan options:exts=gbp
The configuration interface to enable the extension is based on a new
attribute OVS_VXLAN_EXT_GBP nested inside OVS_TUNNEL_ATTR_EXTENSION
which can carry additional extensions as needed in the future.
The group policy metadata is stored as a binary blob (struct ovs_vxlan_opts)
internally just like Geneve options but transported as nested Netlink
attributes to user space.
Renames the existing TUNNEL_OPTIONS_PRESENT to TUNNEL_GENEVE_OPT with the
binary value kept intact; a new flag TUNNEL_VXLAN_OPT is introduced.
The attributes OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and existing
OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS are implemented as mutually exclusive.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2->v3:
- No change
v1->v2:
- Addressed Jesse's request to transport VXLAN options as Netlink
attributes instead of a binary blob. Allows a partial transport of
VXLAN extensions. Internally, the datapath continues to use a binary
blob (defined in vport-vxlan.h) for performance reasons.
- Added new TUNNEL_GENEVE_OPT and TUNNEL_VXLAN_OPT flags to mark
tunnel option flavour
- Correctly report VXLAN options to user space
include/net/ip_tunnels.h | 5 +-
include/uapi/linux/openvswitch.h | 11 ++++
net/openvswitch/flow_netlink.c | 114 ++++++++++++++++++++++++++++++++++-----
net/openvswitch/vport-geneve.c | 2 +-
net/openvswitch/vport-vxlan.c | 81 +++++++++++++++++++++++++++-
net/openvswitch/vport-vxlan.h | 11 ++++
6 files changed, 207 insertions(+), 17 deletions(-)
create mode 100644 net/openvswitch/vport-vxlan.h
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 25a59eb..ce4db3c 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -97,7 +97,10 @@ struct ip_tunnel {
#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100)
#define TUNNEL_OAM __cpu_to_be16(0x0200)
#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400)
-#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800)
+#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800)
+#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000)
+
+#define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
struct tnl_ptk_info {
__be16 flags;
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 3a6dcaa..e474c95 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -248,11 +248,21 @@ enum ovs_vport_attr {
#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
+enum {
+ OVS_VXLAN_EXT_UNSPEC,
+ OVS_VXLAN_EXT_GBP, /* Flag or __u32 */
+ __OVS_VXLAN_EXT_MAX,
+};
+
+#define OVS_VXLAN_EXT_MAX (__OVS_VXLAN_EXT_MAX - 1)
+
+
/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels.
*/
enum {
OVS_TUNNEL_ATTR_UNSPEC,
OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */
+ OVS_TUNNEL_ATTR_EXTENSION,
__OVS_TUNNEL_ATTR_MAX
};
@@ -324,6 +334,7 @@ enum ovs_tunnel_key_attr {
OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */
OVS_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */
OVS_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */
+ OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested OVS_VXLAN_EXT_* */
__OVS_TUNNEL_KEY_ATTR_MAX
};
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 457ccf3..cea492b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -49,6 +49,7 @@
#include <net/mpls.h>
#include "flow_netlink.h"
+#include "vport-vxlan.h"
struct ovs_len_tbl {
int len;
@@ -268,6 +269,9 @@ size_t ovs_tun_key_attr_size(void)
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */
+ nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
+ /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with
+ * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
+ */
+ nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
+ nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */
}
@@ -308,6 +312,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
[OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) },
[OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 },
[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED },
+ [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED },
};
/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
@@ -460,6 +465,41 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a,
return 0;
}
+static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = {
+ [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 },
+};
+
+static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
+ struct sw_flow_match *match, bool is_mask,
+ bool log)
+{
+ struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
+ unsigned long opt_key_offset;
+ struct ovs_vxlan_opts opts;
+ int err;
+
+ BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+ err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy);
+ if (err < 0)
+ return err;
+
+ memset(&opts, 0, sizeof(opts));
+
+ if (tb[OVS_VXLAN_EXT_MAX])
+ opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_MAX]);
+
+ if (!is_mask)
+ SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
+ else
+ SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+
+ opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+ SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+ is_mask);
+ return 0;
+}
+
static int ipv4_tun_from_nlattr(const struct nlattr *attr,
struct sw_flow_match *match, bool is_mask,
bool log)
@@ -468,6 +508,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
int rem;
bool ttl = false;
__be16 tun_flags = 0;
+ int opts_type = 0;
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
@@ -527,11 +568,30 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_OAM;
break;
case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+ if (opts_type) {
+ OVS_NLERR(log, "Multiple metadata blocks provided");
+ return -EINVAL;
+ }
+
err = genev_tun_opt_from_nlattr(a, match, is_mask, log);
if (err)
return err;
- tun_flags |= TUNNEL_OPTIONS_PRESENT;
+ tun_flags |= TUNNEL_GENEVE_OPT;
+ opts_type = type;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ if (opts_type) {
+ OVS_NLERR(log, "Multiple metadata blocks provided");
+ return -EINVAL;
+ }
+
+ err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log);
+ if (err)
+ return err;
+
+ tun_flags |= TUNNEL_VXLAN_OPT;
+ opts_type = type;
break;
default:
OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d",
@@ -560,6 +620,23 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
}
}
+ return opts_type;
+}
+
+static int vxlan_opt_to_nlattr(struct sk_buff *skb,
+ const void *tun_opts, int swkey_tun_opts_len)
+{
+ const struct ovs_vxlan_opts *opts = tun_opts;
+ struct nlattr *nla;
+
+ nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
+ if (!nla)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0)
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, nla);
return 0;
}
@@ -596,10 +673,15 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
if ((output->tun_flags & TUNNEL_OAM) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
return -EMSGSIZE;
- if (tun_opts &&
- nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
- swkey_tun_opts_len, tun_opts))
- return -EMSGSIZE;
+ if (tun_opts) {
+ if (output->tun_flags & TUNNEL_GENEVE_OPT &&
+ nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
+ swkey_tun_opts_len, tun_opts))
+ return -EMSGSIZE;
+ else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
+ vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
+ return -EMSGSIZE;
+ }
return 0;
}
@@ -680,7 +762,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs,
}
if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
- is_mask, log))
+ is_mask, log) < 0)
return -EINVAL;
*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
}
@@ -1578,17 +1660,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct sw_flow_key key;
struct ovs_tunnel_info *tun_info;
struct nlattr *a;
- int err, start;
+ int err, start, opts_type;
ovs_match_init(&match, &key, NULL);
- err = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
- if (err)
- return err;
+ opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
+ if (opts_type < 0)
+ return opts_type;
if (key.tun_opts_len) {
- err = validate_and_copy_geneve_opts(&key);
- if (err < 0)
- return err;
+ switch (opts_type) {
+ case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+ err = validate_and_copy_geneve_opts(&key);
+ if (err < 0)
+ return err;
+ break;
+ case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ break;
+ }
};
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 484864d..902ee4f 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -90,7 +90,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
opts_len = geneveh->opt_len * 4;
- flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT |
+ flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
(udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
(geneveh->oam ? TUNNEL_OAM : 0) |
(geneveh->critical ? TUNNEL_CRIT_OPT : 0);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 266c595..dbd6c75 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -40,6 +40,7 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-vxlan.h"
/**
* struct vxlan_port - Keeps track of open UDP ports
@@ -49,6 +50,7 @@
struct vxlan_port {
struct vxlan_sock *vs;
char name[IFNAMSIZ];
+ u32 exts; /* VXLAN_EXT_* in <net/vxlan.h> */
};
static struct vport_ops ovs_vxlan_vport_ops;
@@ -63,16 +65,26 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
struct vxlan_metadata *md)
{
struct ovs_tunnel_info tun_info;
+ struct vxlan_port *vxlan_port;
struct vport *vport = vs->data;
struct iphdr *iph;
+ struct ovs_vxlan_opts opts = {
+ .gbp = md->gbp,
+ };
__be64 key;
+ __be16 flags;
+
+ flags = TUNNEL_KEY;
+ vxlan_port = vxlan_vport(vport);
+ if (vxlan_port->exts & VXLAN_EXT_GBP)
+ flags |= TUNNEL_VXLAN_OPT;
/* Save outer tunnel values */
iph = ip_hdr(skb);
key = cpu_to_be64(ntohl(md->vni) >> 8);
ovs_flow_tun_info_init(&tun_info, iph,
udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, TUNNEL_KEY, NULL, 0);
+ key, flags, &opts, sizeof(opts));
ovs_vport_receive(vport, skb, &tun_info);
}
@@ -84,6 +96,21 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
return -EMSGSIZE;
+
+ if (vxlan_port->exts) {
+ struct nlattr *exts;
+
+ exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
+ if (!exts)
+ return -EMSGSIZE;
+
+ if (vxlan_port->exts & VXLAN_EXT_GBP &&
+ nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
+ return -EMSGSIZE;
+
+ nla_nest_end(skb, exts);
+ }
+
return 0;
}
@@ -96,6 +123,31 @@ static void vxlan_tnl_destroy(struct vport *vport)
ovs_vport_deferred_free(vport);
}
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+ [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
+};
+
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+{
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
+ struct vxlan_port *vxlan_port;
+ int err;
+
+ if (nla_len(attr) < sizeof(struct nlattr))
+ return -EINVAL;
+
+ err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+ if (err < 0)
+ return err;
+
+ vxlan_port = vxlan_vport(vport);
+
+ if (exts[OVS_VXLAN_EXT_GBP])
+ vxlan_port->exts |= VXLAN_EXT_GBP;
+
+ return 0;
+}
+
static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
@@ -128,7 +180,17 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
vxlan_port = vxlan_vport(vport);
strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0, 0);
+ a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
+ if (a) {
+ err = vxlan_configure_exts(vport, a);
+ if (err) {
+ ovs_vport_free(vport);
+ goto error;
+ }
+ }
+
+ vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0,
+ vxlan_port->exts);
if (IS_ERR(vs)) {
ovs_vport_free(vport);
return (void *)vs;
@@ -141,6 +203,20 @@ error:
return ERR_PTR(err);
}
+static int vxlan_ext_gbp(struct sk_buff *skb)
+{
+ const struct ovs_tunnel_info *tun_info;
+ const struct ovs_vxlan_opts *opts;
+
+ tun_info = OVS_CB(skb)->egress_tun_info;
+ opts = tun_info->options;
+
+ if (tun_info->options_len >= sizeof(*opts))
+ return opts->gbp;
+ else
+ return 0;
+}
+
static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
{
struct net *net = ovs_dp_get_net(vport->dp);
@@ -181,6 +257,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
src_port = udp_flow_src_port(net, skb, 0, 0, true);
md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
+ md.gbp = vxlan_ext_gbp(skb);
err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
fl.saddr, tun_key->ipv4_dst,
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
new file mode 100644
index 0000000..4b08233e
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.h
@@ -0,0 +1,11 @@
+#ifndef VPORT_VXLAN_H
+#define VPORT_VXLAN_H 1
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+struct ovs_vxlan_opts {
+ __u32 gbp;
+};
+
+#endif
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* Re: [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension
2015-01-12 12:26 ` [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension Thomas Graf
@ 2015-01-12 21:54 ` Jesse Gross
2015-01-13 1:02 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Jesse Gross @ 2015-01-12 21:54 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 457ccf3..cea492b 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> +static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
> + struct sw_flow_match *match, bool is_mask,
> + bool log)
> +{
> + struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
> + unsigned long opt_key_offset;
> + struct ovs_vxlan_opts opts;
> + int err;
> +
> + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
> +
> + err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy);
> + if (err < 0)
> + return err;
> +
> + memset(&opts, 0, sizeof(opts));
> +
> + if (tb[OVS_VXLAN_EXT_MAX])
> + opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_MAX]);
Shouldn't this be OVS_VXLAN_EXT_GBP instead of OVS_VXLAN_EXT_MAX?
(They have the same value.)
> + if (!is_mask)
> + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
> + else
> + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
Have you thought carefully about how the masking model works as other
extensions are potentially added? This was a little tricky with Geneve
because I wanted to be able to match on both "no options present" as
well as wildcard all options. The other interesting thing is how you
serialize them back correctly to userspace, which was the genesis of
the TUNNEL_OPTIONS_PRESENT flag.
My guess is that this may basically work fine now that there is only
one extension present but it is important to think about how it might
work with multiple independent extensions in the future. (I haven't
thought about it, I'm just asking.)
> diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
> index 266c595..dbd6c75 100644
> --- a/net/openvswitch/vport-vxlan.c
> +++ b/net/openvswitch/vport-vxlan.c
> +static int vxlan_ext_gbp(struct sk_buff *skb)
> +{
> + const struct ovs_tunnel_info *tun_info;
> + const struct ovs_vxlan_opts *opts;
> +
> + tun_info = OVS_CB(skb)->egress_tun_info;
> + opts = tun_info->options;
> +
> + if (tun_info->options_len >= sizeof(*opts))
> + return opts->gbp;
> + else
> + return 0;
> +}
If you set Geneve options and output to a VXLAN port (or vice versa),
you will get garbage, right? Is there any way that we can sanity check
that?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension
2015-01-12 21:54 ` Jesse Gross
@ 2015-01-13 1:02 ` Thomas Graf
2015-01-13 22:15 ` Jesse Gross
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-13 1:02 UTC (permalink / raw)
To: Jesse Gross
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On 01/12/15 at 01:54pm, Jesse Gross wrote:
> On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > + if (tb[OVS_VXLAN_EXT_MAX])
> > + opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_MAX]);
>
> Shouldn't this be OVS_VXLAN_EXT_GBP instead of OVS_VXLAN_EXT_MAX?
> (They have the same value.)
Good catch, thanks!
> > + if (!is_mask)
> > + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
> > + else
> > + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
>
> Have you thought carefully about how the masking model work as other
> extensions are potentially added? This was a little tricky with Geneve
> because I wanted to be able to match on both "no options present" as
> well as wildcard all options. The other interesting thing is how you
> serialize them back correctly to userspace, which was the genesis of
> the TUNNEL_OPTIONS_PRESENT flag.
>
> My guess is that this may basically work fine now that there is only
> one extension present but it is important to think about how it might
> work with multiple independent extensions in the future. (I haven't
> thought about it, I'm just asking.)
I currently don't see a reason why adding another extension would be
a problem. It should work like Geneve options except that the order
of the options in the flow is given (struct vxlan_opts).
Matching on "no options present" is supported in the datapath
via the TUNNEL_VXLAN_OPT flag although there is no way in user space
to express this intent yet. I haven't come across a need to support it
yet.
Since the Netlink API is decoupled from the datapath flow
representation, all of this can be changed if needed without breaking
the Netlink ABI.
> If you set Geneve options and output to a VXLAN port (or vice versa),
> you will get garbage, right? Is there any way that we can sanity check
> that?
What about if we only apply tun_info->options on Geneve if
TUNNEL_GENEVE_OPT is set and vice versa?
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 6/6] openvswitch: Support VXLAN Group Policy extension
2015-01-13 1:02 ` Thomas Graf
@ 2015-01-13 22:15 ` Jesse Gross
[not found] ` <CAEP_g=9=am_n_aSjA8mxOaViUMEaJgfr8DpMG9GsbitJm8006w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 44+ messages in thread
From: Jesse Gross @ 2015-01-13 22:15 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Stephen Hemminger, Pravin Shelar, Tom Herbert,
Alexei Starovoitov, dev@openvswitch.org, netdev
On Mon, Jan 12, 2015 at 5:02 PM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/12/15 at 01:54pm, Jesse Gross wrote:
>> On Mon, Jan 12, 2015 at 4:26 AM, Thomas Graf <tgraf@suug.ch> wrote:
>> > + if (!is_mask)
>> > + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
>> > + else
>> > + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
>>
>> Have you thought carefully about how the masking model work as other
>> extensions are potentially added? This was a little tricky with Geneve
>> because I wanted to be able to match on both "no options present" as
>> well as wildcard all options. The other interesting thing is how you
>> serialize them back correctly to userspace, which was the genesis of
>> the TUNNEL_OPTIONS_PRESENT flag.
>>
>> My guess is that this may basically work fine now that there is only
>> one extension present but it is important to think about how it might
>> work with multiple independent extensions in the future. (I haven't
>> thought about it, I'm just asking.)
>
> I currently don't see a reason why adding another extension would be
> a problem. It should work like Geneve options except that the order
> of the options in the flow is given (struct vxlan_opts).
>
> Matching on "no options present" is supported in the datapath by
> via the TUNNEL_VXLAN_OPT flag although there is no way in user space
> to express this intent yet. I haven't come across a need to support it
> yet.
>
> Since the Netlink API is decoupled from the datapath flow
> representation, all of this can be changed if needed without breaking
> the Netlink ABI.
OK, it seems fine for now.
I agree that "not present" is probably less interesting for VXLAN than
Geneve given the fixed sized header. It would seem to only have
benefit in the event that the port configuration is decoupled from
flow processing in userspace.
>> If you set Geneve options and output to a VXLAN port (or vice versa),
>> you will get garbage, right? Is there any way that we can sanity check
>> that?
>
> What about if we only apply tun_info->options on Geneve if
> TUNNEL_GENEVE_OPT is set and vice versa?
That seems nice and simple to me.
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 2/6] vxlan: Group Policy extension
2015-01-08 22:47 [PATCH 0/6 net-next v2] VXLAN Group Policy Extension Thomas Graf
@ 2015-01-08 22:47 ` Thomas Graf
2015-01-09 17:37 ` Alexei Starovoitov
` (2 more replies)
0 siblings, 3 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-08 22:47 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov; +Cc: netdev, dev
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata are mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows labels to be managed to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow mapping security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v2:
- split GBP header definition into separate struct vxlanhdr_gbp as requested
by Alexei
drivers/net/vxlan.c | 161 ++++++++++++++++++++++++++++++------------
include/net/vxlan.h | 73 +++++++++++++++++--
include/uapi/linux/if_link.h | 8 +++
net/openvswitch/vport-vxlan.c | 9 ++-
4 files changed, 198 insertions(+), 53 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 4d52aa9..b148739 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -132,6 +132,7 @@ struct vxlan_dev {
__u8 tos; /* TOS override */
__u8 ttl;
u32 flags; /* VXLAN_F_* in vxlan.h */
+ u32 exts; /* Enabled extensions */
struct work_struct sock_work;
struct work_struct igmp_join;
@@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
continue;
vh2 = (struct vxlanhdr *)(p->data + off_vx);
- if (vh->vx_vni != vh2->vx_vni) {
+ if (vh->vx_flags != vh2->vx_flags ||
+ vh->vx_vni != vh2->vx_vni) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -1095,6 +1097,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
+ struct vxlan_metadata md = {0};
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1113,6 +1116,22 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (vs->exts) {
if (!vxh->vni_present)
goto error_invalid_header;
+
+ if (vxh->gbp_present) {
+ struct vxlanhdr_gbp *gbp;
+
+ if (!(vs->exts & VXLAN_EXT_GBP))
+ goto error_invalid_header;
+
+ gbp = (struct vxlanhdr_gbp *)vxh;
+ md.gbp = ntohs(gbp->policy_id);
+
+ if (gbp->dont_learn)
+ md.gbp |= VXLAN_GBP_DONT_LEARN;
+
+ if (gbp->policy_applied)
+ md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+ }
} else {
if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
(vxh->vx_vni & htonl(0xff)))
@@ -1122,7 +1141,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
- vs->rcv(vs, skb, vxh->vx_vni);
+ md.vni = vxh->vx_vni;
+ vs->rcv(vs, skb, &md);
return 0;
drop:
@@ -1138,8 +1158,8 @@ error:
return 1;
}
-static void vxlan_rcv(struct vxlan_sock *vs,
- struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct iphdr *oip = NULL;
struct ipv6hdr *oip6 = NULL;
@@ -1150,7 +1170,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
int err = 0;
union vxlan_addr *remote_ip;
- vni = ntohl(vx_vni) >> 8;
+ vni = ntohl(md->vni) >> 8;
/* Is this VNI defined? */
vxlan = vxlan_vs_find_vni(vs, vni);
if (!vxlan)
@@ -1184,6 +1204,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
goto drop;
skb_reset_network_header(skb);
+ skb->mark = md->gbp;
if (oip6)
err = IP6_ECN_decapsulate(oip6, skb);
@@ -1533,15 +1554,57 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
}
+static int vxlan_build_hdr(struct sk_buff *skb, struct vxlan_sock *vs,
+ int min_headroom, struct vxlan_metadata *md)
+{
+ struct vxlanhdr *vxh;
+ int err;
+
+ /* Need space for new headers (invalidates iph ptr) */
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (WARN_ON(!skb))
+ return -ENOMEM;
+
+ vxh = (struct vxlanhdr *)__skb_push(skb, sizeof(*vxh));
+ vxh->vx_flags = htonl(VXLAN_FLAGS);
+ vxh->vx_vni = md->vni;
+
+ if (vs->exts) {
+ if (vs->exts & VXLAN_EXT_GBP) {
+ struct vxlanhdr_gbp *gbp;
+
+ gbp = (struct vxlanhdr_gbp *)vxh;
+ vxh->gbp_present = 1;
+
+ if (md->gbp & VXLAN_GBP_DONT_LEARN)
+ gbp->dont_learn = 1;
+
+ if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
+ gbp->policy_applied = 1;
+
+ gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
+ }
+ }
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
struct dst_entry *dst, struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr, __u8 prio, __u8 ttl,
- __be16 src_port, __be16 dst_port, __be32 vni,
- bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
@@ -1558,24 +1621,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct ipv6hdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- goto err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb)) {
- err = -ENOMEM;
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
goto err;
- }
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
ttl, src_port, dst_port);
@@ -1589,9 +1637,9 @@ err:
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !vs->sock->sk->sk_no_check_tx;
@@ -1604,22 +1652,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct iphdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
return err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb))
- return -ENOMEM;
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
ttl, df, src_port, dst_port, xnet);
@@ -1679,6 +1714,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *old_iph;
struct flowi4 fl4;
union vxlan_addr *dst;
+ struct vxlan_metadata md;
__be16 src_port = 0, dst_port;
u32 vni;
__be16 df = 0;
@@ -1749,11 +1785,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
fl4.saddr, dst->sin.sin_addr.s_addr,
- tos, ttl, df, src_port, dst_port,
- htonl(vni << 8),
+ tos, ttl, df, src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
if (err < 0) {
/* skb is already freed. */
@@ -1806,10 +1843,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ttl = ttl ? : ip6_dst_hoplimit(ndst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
dev, &fl6.saddr, &fl6.daddr, 0, ttl,
- src_port, dst_port, htonl(vni << 8),
+ src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
#endif
}
@@ -2210,6 +2249,11 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
+ [IFLA_VXLAN_EXTENSION] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy vxlan_ext_policy[IFLA_VXLAN_EXT_MAX + 1] = {
+ [IFLA_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -2246,6 +2290,18 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
}
}
+ if (data[IFLA_VXLAN_EXTENSION]) {
+ int err;
+
+ err = nla_validate_nested(data[IFLA_VXLAN_EXTENSION],
+ IFLA_VXLAN_EXT_MAX, vxlan_ext_policy);
+ if (err < 0) {
+ pr_debug("invalid VXLAN extension configuration: %d\n",
+ err);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -2400,6 +2456,18 @@ static void vxlan_sock_work(struct work_struct *work)
dev_put(vxlan->dev);
}
+static void configure_vxlan_exts(struct vxlan_dev *vxlan, struct nlattr *attr)
+{
+ struct nlattr *exts[IFLA_VXLAN_EXT_MAX+1];
+
+ /* Validated in vxlan_validate() */
+ if (nla_parse_nested(exts, IFLA_VXLAN_EXT_MAX, attr, NULL) < 0)
+ BUG();
+
+ if (exts[IFLA_VXLAN_EXT_GBP])
+ vxlan->exts |= VXLAN_EXT_GBP;
+}
+
static int vxlan_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
@@ -2525,6 +2593,9 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
+ if (data[IFLA_VXLAN_EXTENSION])
+ configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
+
if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
vxlan->dst_port)) {
pr_info("duplicate VNI %u\n", vni);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3e98d31..af0526b 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -11,13 +11,65 @@
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
+/*
+ * VXLAN Group Based Policy Extension:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * D = Don't Learn bit. When set, this bit indicates that the egress
+ * VTEP MUST NOT learn the source address of the encapsulated frame.
+ *
+ * A = Indicates that the group policy has already been applied to
+ * this packet. Policies MUST NOT be applied by devices when the
+ * A bit is set.
+ *
+ * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
+ */
+struct vxlanhdr_gbp {
+ __u8 vx_flags;
+#ifdef __LITTLE_ENDIAN_BITFIELD
+ __u8 reserved_flags1:3,
+ policy_applied:1,
+ reserved_flags2:2,
+ dont_learn:1,
+ reserved_flags3:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 reserved_flags1:1,
+ dont_learn:1,
+ reserved_flags2:2,
+ policy_applied:1,
+ reserved_flags3:3;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __be16 policy_id;
+ __be32 vx_vni;
+};
+
+struct vxlan_gbp {
+} __packed;
+
+/* skb->mark mapping
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+#define VXLAN_GBP_DONT_LEARN (BIT(6) << 16)
+#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16)
+#define VXLAN_GBP_ID_MASK (0xFFFF)
+
/* VXLAN protocol header:
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |R|R|R|R|I|R|R|R| Reserved |
+ * |G|R|R|R|I|R|R|R| Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | VXLAN Network Identifier (VNI) | Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
+ * G = 1 Group Policy (VXLAN-GBP)
* I = 1 VXLAN Network Identifier (VNI) present
*/
struct vxlanhdr {
@@ -26,9 +78,11 @@ struct vxlanhdr {
#ifdef __LITTLE_ENDIAN_BITFIELD
__u8 reserved_flags1:3,
vni_present:1,
- reserved_flags2:4;
+ reserved_flags2:3,
+ gbp_present:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 reserved_flags2:4,
+ __u8 gbp_present:1,
+ reserved_flags2:3,
vni_present:1,
reserved_flags1:3;
#else
@@ -42,8 +96,16 @@ struct vxlanhdr {
__be32 vx_vni;
};
+struct vxlan_metadata {
+ __be32 vni;
+ u32 gbp;
+};
+
struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
+ struct vxlan_metadata *md);
+
+#define VXLAN_EXT_GBP BIT(0)
/* per UDP socket information */
struct vxlan_sock {
@@ -78,7 +140,8 @@ void vxlan_sock_release(struct vxlan_sock *vs);
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet);
+ __be16 src_port, __be16 dst_port, struct vxlan_metadata *md,
+ bool xnet);
static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
netdev_features_t features)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f7d0d2d..9f07bf5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -370,10 +370,18 @@ enum {
IFLA_VXLAN_UDP_CSUM,
IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
+ IFLA_VXLAN_EXTENSION,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
+enum {
+ IFLA_VXLAN_EXT_UNSPEC,
+ IFLA_VXLAN_EXT_GBP,
+ __IFLA_VXLAN_EXT_MAX,
+};
+#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
+
struct ifla_vxlan_port_range {
__be16 low;
__be16 high;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index d7c46b3..dd68c97 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -59,7 +59,8 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
}
/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct ovs_tunnel_info tun_info;
struct vport *vport = vs->data;
@@ -68,7 +69,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
/* Save outer tunnel values */
iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(vx_vni) >> 8);
+ key = cpu_to_be64(ntohl(md->vni) >> 8);
ovs_flow_tun_info_init(&tun_info, iph,
udp_hdr(skb)->source, udp_hdr(skb)->dest,
key, TUNNEL_KEY, NULL, 0);
@@ -146,6 +147,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
struct vxlan_port *vxlan_port = vxlan_vport(vport);
__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
struct ovs_key_ipv4_tunnel *tun_key;
+ struct vxlan_metadata md;
struct rtable *rt;
struct flowi4 fl;
__be16 src_port;
@@ -178,12 +180,13 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
skb->ignore_df = 1;
src_port = udp_flow_src_port(net, skb, 0, 0, true);
+ md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
fl.saddr, tun_key->ipv4_dst,
tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
src_port, dst_port,
- htonl(be64_to_cpu(tun_key->tun_id) << 8),
+ &md,
false);
if (err < 0)
ip_rt_put(rt);
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-08 22:47 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
@ 2015-01-09 17:37 ` Alexei Starovoitov
2015-01-09 22:10 ` Thomas Graf
2015-01-12 17:37 ` Nicolas Dichtel
2015-01-12 18:14 ` Tom Herbert
2 siblings, 1 reply; 44+ messages in thread
From: Alexei Starovoitov @ 2015-01-09 17:37 UTC (permalink / raw)
To: Thomas Graf
Cc: David S. Miller, Jesse Gross, Stephen Hemminger, Pravin Shelar,
Tom Herbert, netdev@vger.kernel.org, dev@openvswitch.org
On Thu, Jan 8, 2015 at 2:47 PM, Thomas Graf <tgraf@suug.ch> wrote:
> +
> +struct vxlan_gbp {
> +} __packed;
empty struct ? seems unused.
looks good to me otherwise.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-09 17:37 ` Alexei Starovoitov
@ 2015-01-09 22:10 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-09 22:10 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: David S. Miller, Jesse Gross, Stephen Hemminger, Pravin Shelar,
Tom Herbert, netdev@vger.kernel.org, dev@openvswitch.org
On 01/09/15 at 09:37am, Alexei Starovoitov wrote:
> On Thu, Jan 8, 2015 at 2:47 PM, Thomas Graf <tgraf@suug.ch> wrote:
> > +
> > +struct vxlan_gbp {
> > +} __packed;
>
> empty struct ? seems unused.
> looks good to me otherwise.
Poor leftover, must feel all lonely there. Thanks for the reviews.
Will wait a little bit longer for more feedback and send out v3.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-08 22:47 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
2015-01-09 17:37 ` Alexei Starovoitov
@ 2015-01-12 17:37 ` Nicolas Dichtel
2015-01-12 17:59 ` David Miller
2015-01-13 1:04 ` Thomas Graf
2015-01-12 18:14 ` Tom Herbert
2 siblings, 2 replies; 44+ messages in thread
From: Nicolas Dichtel @ 2015-01-12 17:37 UTC (permalink / raw)
To: Thomas Graf, davem, jesse, stephen, pshelar, therbert,
alexei.starovoitov
Cc: netdev, dev
Le 08/01/2015 23:47, Thomas Graf a écrit :
> Implements supports for the Group Policy VXLAN extension [0] to provide
> a lightweight and simple security label mechanism across network peers
> based on VXLAN. The security context and associated metadata is mapped
> to/from skb->mark. This allows further mapping to a SELinux context
> using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
> tc, etc.
>
> The group membership is defined by the lower 16 bits of skb->mark, the
> upper 16 bits are used for flags.
>
> SELinux allows to manage label to secure local resources. However,
> distributed applications require ACLs to implemented across hosts. This
> is typically achieved by matching on L2-L4 fields to identify the
> original sending host and process on the receiver. On top of that,
> netlabel and specifically CIPSO [1] allow to map security contexts to
> universal labels. However, netlabel and CIPSO are relatively complex.
> This patch provides a lightweight alternative for overlay network
> environments with a trusted underlay. No additional control protocol
> is required.
>
> Host 1: Host 2:
>
> Group A Group B Group B Group A
> +-----+ +-------------+ +-------+ +-----+
> | lxc | | SELinux CTX | | httpd | | VM |
> +--+--+ +--+----------+ +---+---+ +--+--+
> \---+---/ \----+---/
> | |
> +---+---+ +---+---+
> | vxlan | | vxlan |
> +---+---+ +---+---+
> +------------------------------+
>
> Backwards compatibility:
> A VXLAN-GBP socket can receive standard VXLAN frames and will assign
> the default group 0x0000 to such frames. A Linux VXLAN socket will
> drop VXLAN-GBP frames. The extension is therefore disabled by default
> and needs to be specifically enabled:
>
> ip link add [...] type vxlan [...] gbp
>
> In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
> must run on a separate port number.
>
> Examples:
> iptables:
> host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
> host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
>
> OVS:
> # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
> # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
>
> [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
> [1] http://lwn.net/Articles/204905/
>
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
> v2:
> - split GBP header definition into separate struct vxlanhdr_gbp as requested
> by Alexei
>
> drivers/net/vxlan.c | 161 ++++++++++++++++++++++++++++++------------
> include/net/vxlan.h | 73 +++++++++++++++++--
> include/uapi/linux/if_link.h | 8 +++
> net/openvswitch/vport-vxlan.c | 9 ++-
> 4 files changed, 198 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 4d52aa9..b148739 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -132,6 +132,7 @@ struct vxlan_dev {
> __u8 tos; /* TOS override */
> __u8 ttl;
> u32 flags; /* VXLAN_F_* in vxlan.h */
> + u32 exts; /* Enabled extensions */
>
> struct work_struct sock_work;
> struct work_struct igmp_join;
> @@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
> continue;
>
> vh2 = (struct vxlanhdr *)(p->data + off_vx);
> - if (vh->vx_vni != vh2->vx_vni) {
> + if (vh->vx_flags != vh2->vx_flags ||
> + vh->vx_vni != vh2->vx_vni) {
> NAPI_GRO_CB(p)->same_flow = 0;
> continue;
> }
> @@ -1095,6 +1097,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> {
> struct vxlan_sock *vs;
> struct vxlanhdr *vxh;
> + struct vxlan_metadata md = {0};
>
> /* Need Vxlan and inner Ethernet header to be present */
> if (!pskb_may_pull(skb, VXLAN_HLEN))
> @@ -1113,6 +1116,22 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> if (vs->exts) {
> if (!vxh->vni_present)
> goto error_invalid_header;
> +
> + if (vxh->gbp_present) {
> + struct vxlanhdr_gbp *gbp;
> +
> + if (!(vs->exts & VXLAN_EXT_GBP))
> + goto error_invalid_header;
> +
> + gbp = (struct vxlanhdr_gbp *)vxh;
> + md.gbp = ntohs(gbp->policy_id);
> +
> + if (gbp->dont_learn)
> + md.gbp |= VXLAN_GBP_DONT_LEARN;
> +
> + if (gbp->policy_applied)
> + md.gbp |= VXLAN_GBP_POLICY_APPLIED;
> + }
> } else {
> if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> (vxh->vx_vni & htonl(0xff)))
> @@ -1122,7 +1141,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> goto drop;
>
> - vs->rcv(vs, skb, vxh->vx_vni);
> + md.vni = vxh->vx_vni;
> + vs->rcv(vs, skb, &md);
> return 0;
>
> drop:
> @@ -1138,8 +1158,8 @@ error:
> return 1;
> }
>
> -static void vxlan_rcv(struct vxlan_sock *vs,
> - struct sk_buff *skb, __be32 vx_vni)
> +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> + struct vxlan_metadata *md)
> {
> struct iphdr *oip = NULL;
> struct ipv6hdr *oip6 = NULL;
> @@ -1150,7 +1170,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
> int err = 0;
> union vxlan_addr *remote_ip;
>
> - vni = ntohl(vx_vni) >> 8;
> + vni = ntohl(md->vni) >> 8;
> /* Is this VNI defined? */
> vxlan = vxlan_vs_find_vni(vs, vni);
> if (!vxlan)
> @@ -1184,6 +1204,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
> goto drop;
>
> skb_reset_network_header(skb);
> + skb->mark = md->gbp;
>
> if (oip6)
> err = IP6_ECN_decapsulate(oip6, skb);
> @@ -1533,15 +1554,57 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
> return false;
> }
>
> +static int vxlan_build_hdr(struct sk_buff *skb, struct vxlan_sock *vs,
> + int min_headroom, struct vxlan_metadata *md)
> +{
> + struct vxlanhdr *vxh;
> + int err;
> +
> + /* Need space for new headers (invalidates iph ptr) */
> + err = skb_cow_head(skb, min_headroom);
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + return err;
> + }
> +
> + skb = vlan_hwaccel_push_inside(skb);
> + if (WARN_ON(!skb))
> + return -ENOMEM;
> +
> + vxh = (struct vxlanhdr *)__skb_push(skb, sizeof(*vxh));
> + vxh->vx_flags = htonl(VXLAN_FLAGS);
> + vxh->vx_vni = md->vni;
> +
> + if (vs->exts) {
> + if (vs->exts & VXLAN_EXT_GBP) {
> + struct vxlanhdr_gbp *gbp;
> +
> + gbp = (struct vxlanhdr_gbp *)vxh;
> + vxh->gbp_present = 1;
> +
> + if (md->gbp & VXLAN_GBP_DONT_LEARN)
> + gbp->dont_learn = 1;
> +
> + if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
> + gbp->policy_applied = 1;
> +
> + gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
> + }
> + }
> +
> + skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +
> + return 0;
> +}
> +
> #if IS_ENABLED(CONFIG_IPV6)
> static int vxlan6_xmit_skb(struct vxlan_sock *vs,
> struct dst_entry *dst, struct sk_buff *skb,
> struct net_device *dev, struct in6_addr *saddr,
> struct in6_addr *daddr, __u8 prio, __u8 ttl,
> - __be16 src_port, __be16 dst_port, __be32 vni,
> - bool xnet)
> + __be16 src_port, __be16 dst_port,
> + struct vxlan_metadata *md, bool xnet)
> {
> - struct vxlanhdr *vxh;
> int min_headroom;
> int err;
> bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
> @@ -1558,24 +1621,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
> + VXLAN_HLEN + sizeof(struct ipv6hdr)
> + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
>
> - /* Need space for new headers (invalidates iph ptr) */
> - err = skb_cow_head(skb, min_headroom);
> - if (unlikely(err)) {
> - kfree_skb(skb);
> - goto err;
> - }
> -
> - skb = vlan_hwaccel_push_inside(skb);
> - if (WARN_ON(!skb)) {
> - err = -ENOMEM;
> + err = vxlan_build_hdr(skb, vs, min_headroom, md);
> + if (err)
> goto err;
> - }
> -
> - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> - vxh->vx_flags = htonl(VXLAN_FLAGS);
> - vxh->vx_vni = vni;
> -
> - skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>
> udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
> ttl, src_port, dst_port);
> @@ -1589,9 +1637,9 @@ err:
> int vxlan_xmit_skb(struct vxlan_sock *vs,
> struct rtable *rt, struct sk_buff *skb,
> __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> - __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
> + __be16 src_port, __be16 dst_port,
> + struct vxlan_metadata *md, bool xnet)
> {
> - struct vxlanhdr *vxh;
> int min_headroom;
> int err;
> bool udp_sum = !vs->sock->sk->sk_no_check_tx;
> @@ -1604,22 +1652,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
> + VXLAN_HLEN + sizeof(struct iphdr)
> + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
>
> - /* Need space for new headers (invalidates iph ptr) */
> - err = skb_cow_head(skb, min_headroom);
> - if (unlikely(err)) {
> - kfree_skb(skb);
> + err = vxlan_build_hdr(skb, vs, min_headroom, md);
> + if (err)
> return err;
> - }
> -
> - skb = vlan_hwaccel_push_inside(skb);
> - if (WARN_ON(!skb))
> - return -ENOMEM;
> -
> - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> - vxh->vx_flags = htonl(VXLAN_FLAGS);
> - vxh->vx_vni = vni;
> -
> - skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>
> return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
> ttl, df, src_port, dst_port, xnet);
> @@ -1679,6 +1714,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
> const struct iphdr *old_iph;
> struct flowi4 fl4;
> union vxlan_addr *dst;
> + struct vxlan_metadata md;
> __be16 src_port = 0, dst_port;
> u32 vni;
> __be16 df = 0;
> @@ -1749,11 +1785,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>
> tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
> ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
> + md.vni = htonl(vni << 8);
> + md.gbp = skb->mark;
>
> err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
> fl4.saddr, dst->sin.sin_addr.s_addr,
> - tos, ttl, df, src_port, dst_port,
> - htonl(vni << 8),
> + tos, ttl, df, src_port, dst_port, &md,
> !net_eq(vxlan->net, dev_net(vxlan->dev)));
> if (err < 0) {
> /* skb is already freed. */
> @@ -1806,10 +1843,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
> }
>
> ttl = ttl ? : ip6_dst_hoplimit(ndst);
> + md.vni = htonl(vni << 8);
> + md.gbp = skb->mark;
>
> err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
> dev, &fl6.saddr, &fl6.daddr, 0, ttl,
> - src_port, dst_port, htonl(vni << 8),
> + src_port, dst_port, &md,
> !net_eq(vxlan->net, dev_net(vxlan->dev)));
> #endif
> }
> @@ -2210,6 +2249,11 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
> [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
> [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
> [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
> + [IFLA_VXLAN_EXTENSION] = { .type = NLA_NESTED },
> +};
> +
> +static const struct nla_policy vxlan_ext_policy[IFLA_VXLAN_EXT_MAX + 1] = {
> + [IFLA_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
> };
>
> static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> @@ -2246,6 +2290,18 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> }
> }
>
> + if (data[IFLA_VXLAN_EXTENSION]) {
> + int err;
> +
> + err = nla_validate_nested(data[IFLA_VXLAN_EXTENSION],
> + IFLA_VXLAN_EXT_MAX, vxlan_ext_policy);
> + if (err < 0) {
> + pr_debug("invalid VXLAN extension configuration: %d\n",
> + err);
> + return -EINVAL;
> + }
> + }
> +
> return 0;
> }
>
> @@ -2400,6 +2456,18 @@ static void vxlan_sock_work(struct work_struct *work)
> dev_put(vxlan->dev);
> }
>
> +static void configure_vxlan_exts(struct vxlan_dev *vxlan, struct nlattr *attr)
> +{
> + struct nlattr *exts[IFLA_VXLAN_EXT_MAX+1];
> +
> + /* Validated in vxlan_validate() */
> + if (nla_parse_nested(exts, IFLA_VXLAN_EXT_MAX, attr, NULL) < 0)
> + BUG();
> +
> + if (exts[IFLA_VXLAN_EXT_GBP])
> + vxlan->exts |= VXLAN_EXT_GBP;
> +}
> +
> static int vxlan_newlink(struct net *net, struct net_device *dev,
> struct nlattr *tb[], struct nlattr *data[])
> {
> @@ -2525,6 +2593,9 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
> nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
> vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
>
> + if (data[IFLA_VXLAN_EXTENSION])
> + configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
> +
Can you also update vxlan_fill_info() so that these new attributes can be dumped
via netlink?
Thank you,
Nicolas
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 17:37 ` Nicolas Dichtel
@ 2015-01-12 17:59 ` David Miller
2015-01-13 8:29 ` Nicolas Dichtel
2015-01-13 1:04 ` Thomas Graf
1 sibling, 1 reply; 44+ messages in thread
From: David Miller @ 2015-01-12 17:59 UTC (permalink / raw)
To: nicolas.dichtel
Cc: tgraf, jesse, stephen, pshelar, therbert, alexei.starovoitov,
netdev, dev
Can you PLEASE, PLEASE, not quote an entire full patch just to add two
lines of commentary.
Quote _only_ the _RELEVANT_ portions of the email you are replying to.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 17:59 ` David Miller
@ 2015-01-13 8:29 ` Nicolas Dichtel
0 siblings, 0 replies; 44+ messages in thread
From: Nicolas Dichtel @ 2015-01-13 8:29 UTC (permalink / raw)
To: David Miller
Cc: tgraf, jesse, stephen, pshelar, therbert, alexei.starovoitov,
netdev, dev
Le 12/01/2015 18:59, David Miller a écrit :
>
> Can you PLEASE, PLEASE, not quote and entire full patch just to add two
> lines of commentary.
>
> Quote _only_ the _RELEVANT_ portions of the email you are replying to.
>
Will do, sorry for that.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 17:37 ` Nicolas Dichtel
2015-01-12 17:59 ` David Miller
@ 2015-01-13 1:04 ` Thomas Graf
1 sibling, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2015-01-13 1:04 UTC (permalink / raw)
To: Nicolas Dichtel
Cc: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov,
netdev, dev
On 01/12/15 at 06:37pm, Nicolas Dichtel wrote:
> >+ if (data[IFLA_VXLAN_EXTENSION])
> >+ configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
> >+
> Can you also update vxlan_fill_info() so that these new attributes can be
> dumped via netlink?
Sure, will do.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-08 22:47 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
2015-01-09 17:37 ` Alexei Starovoitov
2015-01-12 17:37 ` Nicolas Dichtel
@ 2015-01-12 18:14 ` Tom Herbert
2015-01-13 1:03 ` Thomas Graf
2 siblings, 1 reply; 44+ messages in thread
From: Tom Herbert @ 2015-01-12 18:14 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Alexei Starovoitov, Linux Netdev List, dev@openvswitch.org
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index f7d0d2d..9f07bf5 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -370,10 +370,18 @@ enum {
> IFLA_VXLAN_UDP_CSUM,
> IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> + IFLA_VXLAN_EXTENSION,
> __IFLA_VXLAN_MAX
> };
> #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
>
> +enum {
> + IFLA_VXLAN_EXT_UNSPEC,
> + IFLA_VXLAN_EXT_GBP,
> + __IFLA_VXLAN_EXT_MAX,
> +};
> +#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
> +
Creating a level of indirection for extensions seems overly
complicated to me. Why not just define IFLA_VXLAN_GBP as just another
enum above?
Tom
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-12 18:14 ` Tom Herbert
@ 2015-01-13 1:03 ` Thomas Graf
2015-01-13 2:28 ` Tom Herbert
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-13 1:03 UTC (permalink / raw)
To: Tom Herbert
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Alexei Starovoitov, Linux Netdev List, dev@openvswitch.org
On 01/12/15 at 10:14am, Tom Herbert wrote:
> > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> > index f7d0d2d..9f07bf5 100644
> > --- a/include/uapi/linux/if_link.h
> > +++ b/include/uapi/linux/if_link.h
> > @@ -370,10 +370,18 @@ enum {
> > IFLA_VXLAN_UDP_CSUM,
> > IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> > IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> > + IFLA_VXLAN_EXTENSION,
> > __IFLA_VXLAN_MAX
> > };
> > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
> >
> > +enum {
> > + IFLA_VXLAN_EXT_UNSPEC,
> > + IFLA_VXLAN_EXT_GBP,
> > + __IFLA_VXLAN_EXT_MAX,
> > +};
> > +#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
> > +
>
> Creating a level of indirection for extensions seems overly
> complicated to me. Why not just define IFLA_VXLAN_GBP as just another
> enum above?
I think it's cleaner to group them in a nested attribute.
It clearly separates the optional extensions from the base
attributes. RCO, GPE, GBP can all live in there.
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-13 1:03 ` Thomas Graf
@ 2015-01-13 2:28 ` Tom Herbert
2015-01-13 11:32 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Tom Herbert @ 2015-01-13 2:28 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Alexei Starovoitov, Linux Netdev List, dev@openvswitch.org
On Mon, Jan 12, 2015 at 5:03 PM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/12/15 at 10:14am, Tom Herbert wrote:
>> > diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
>> > index f7d0d2d..9f07bf5 100644
>> > --- a/include/uapi/linux/if_link.h
>> > +++ b/include/uapi/linux/if_link.h
>> > @@ -370,10 +370,18 @@ enum {
>> > IFLA_VXLAN_UDP_CSUM,
>> > IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
>> > IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
>> > + IFLA_VXLAN_EXTENSION,
>> > __IFLA_VXLAN_MAX
>> > };
>> > #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
>> >
>> > +enum {
>> > + IFLA_VXLAN_EXT_UNSPEC,
>> > + IFLA_VXLAN_EXT_GBP,
>> > + __IFLA_VXLAN_EXT_MAX,
>> > +};
>> > +#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
>> > +
>>
>> Creating a level of indirection for extensions seems overly
>> complicated to me. Why not just define IFLA_VXLAN_GBP as just another
>> enum above?
>
> I think it's cleaner to group them in a nested attribute.
> It clearly separates the optional extensions from the base
> attributes. RCO, GPE, GBP can all live in there.
This is inconsistent with similar things in GRE and GUE. For instance,
GRE keyid is set as its own attribute. It just seems like this is adding
more code to the driver than is necessary for the functionality
needed.
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-13 2:28 ` Tom Herbert
@ 2015-01-13 11:32 ` Thomas Graf
2015-01-13 16:16 ` Tom Herbert
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-13 11:32 UTC (permalink / raw)
To: Tom Herbert
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Alexei Starovoitov, Linux Netdev List, dev@openvswitch.org
On 01/12/15 at 06:28pm, Tom Herbert wrote:
> On Mon, Jan 12, 2015 at 5:03 PM, Thomas Graf <tgraf@suug.ch> wrote:
> >>
> >> Creating a level of indirection for extensions seems overly
> >> complicated to me. Why not just define IFLA_VXLAN_GBP as just another
> >> enum above?
> >
> > I think it's cleaner to group them in a nested attribute.
> > It clearly separates the optional extensions from the base
> > attributes. RCO, GPE, GBP can all live in there.
>
> This is inconsistent with similar things in GRE and GUE. For instance,
> GRE keyid is set as its own attribute. It just seems like this adding
> more code to the driver than is necessary for the functionality
> needed.
The major difference here is that we have to consider backwards
compatibility specifically for VXLAN. Your initial feedback on GPE
actually led me to how I implemented GBP.
I think the axioms we want to establish are as follows:
1. Extensions need to be explicitly enabled by the user. A previously
dropped frame should only be processed if the user explicitly asks
for it.
2. As a consequence: only share a VXLAN UDP port if the enabled
extensions match (vxlan_sock_add), e.g. user A might want RCO
but user B might be unaware. They cannot share the same UDP port.
The 2nd led me to introduce the 'exts' member to vxlan_sock so we can
compare it in vxlan_find_sock() and only share a UDP port if the
enabled extensions match.
Your patch currently implements (1) but not (2).
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-13 11:32 ` Thomas Graf
@ 2015-01-13 16:16 ` Tom Herbert
0 siblings, 0 replies; 44+ messages in thread
From: Tom Herbert @ 2015-01-13 16:16 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Alexei Starovoitov, Linux Netdev List, dev@openvswitch.org
On Tue, Jan 13, 2015 at 3:32 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 01/12/15 at 06:28pm, Tom Herbert wrote:
>> On Mon, Jan 12, 2015 at 5:03 PM, Thomas Graf <tgraf@suug.ch> wrote:
>> >>
>> >> Creating a level of indirection for extensions seems overly
>> >> complicated to me. Why not just define IFLA_VXLAN_GBP as just another
>> >> enum above?
>> >
>> > I think it's cleaner to group them in a nested attribute.
>> > It clearly separates the optional extensions from the base
>> > attributes. RCO, GPE, GBP can all live in there.
>>
>> This is inconsistent with similar things in GRE and GUE. For instance,
>> GRE keyid is set as its own attribute. It just seems like this adding
>> more code to the driver than is necessary for the functionality
>> needed.
>
> The major difference here is that we have to consider backwards
> compatibility specifically for VXLAN. Your initial feedback on GPE
> actually led me to how I implemented GBP.
>
> I think the axioms we want to establish are as follows:
> 1. Extensions need to be explicitly enabled by the user. A previously
> dropped frame should only be processed if the user explitly asks
> for it.
> 2. As a consequence: only share a VLXAN UDP port if the enabled
> extensions match (vxlan_sock_add), e.g. user A might want RCO
> but user B might be unaware. They cannot share the same UDP port.
>
> The 2nd lead me to introduce the 'exts' member to vxlan_sock so we can
> compare it in vxlan_find_sock() and only share a UDP port if the
> enabled extensions match.
>
RCO is represented in the socket in VXLAN flags (VXLAN_F_*). My patch
also adds a flags field to vxlan_sock which contains the VXLAN flags. For
shared port, I suspect all the receive features must match, including
receive checksum settings for instance, but we don't care about
transmit side. To facilitate this, I would suggest splitting flags
into o_flags and i_flags like ip_tunnel does, and then compare i_flags
in vxlan_find_sock.
Regardless of the internal implementation, I still don't see much
value in exposing these distinctions in netlink.
Tom
> Your patch currently implements (1) but not (2).
^ permalink raw reply [flat|nested] 44+ messages in thread
* [PATCH 2/6] vxlan: Group Policy extension
2015-01-07 2:05 [PATCH 0/6 net-next] VXLAN Group Policy Extension Thomas Graf
@ 2015-01-07 2:05 ` Thomas Graf
2015-01-07 16:05 ` Tom Herbert
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2015-01-07 2:05 UTC (permalink / raw)
To: davem, jesse, stephen, pshelar; +Cc: netdev, dev
Implements support for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows managing labels to secure local resources. However,
distributed applications require ACLs to be implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
$ iptables -I OUTPUT -p icmp -j MARK --set-mark 0x200
$ iptables -I INPUT -i br0 -m mark --mark 0x200 -j ACCEPT
OVS (patches provided separately):
in_port=1, actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
drivers/net/vxlan.c | 155 ++++++++++++++++++++++++++++++------------
include/net/vxlan.h | 80 ++++++++++++++++++++--
include/uapi/linux/if_link.h | 8 +++
net/openvswitch/vport-vxlan.c | 9 ++-
4 files changed, 197 insertions(+), 55 deletions(-)
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 4d52aa9..30b7b59 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -132,6 +132,7 @@ struct vxlan_dev {
__u8 tos; /* TOS override */
__u8 ttl;
u32 flags; /* VXLAN_F_* in vxlan.h */
+ u32 exts; /* Enabled extensions */
struct work_struct sock_work;
struct work_struct igmp_join;
@@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
continue;
vh2 = (struct vxlanhdr *)(p->data + off_vx);
- if (vh->vx_vni != vh2->vx_vni) {
+ if (vh->vx_flags != vh2->vx_flags ||
+ vh->vx_vni != vh2->vx_vni) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -1095,6 +1097,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
+ struct vxlan_metadata md = {0};
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1113,6 +1116,19 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (vs->exts) {
if (!vxh->vni_present)
goto error_invalid_header;
+
+ if (vxh->gbp_present) {
+ if (!(vs->exts & VXLAN_EXT_GBP))
+ goto error_invalid_header;
+
+ md.gbp = ntohs(vxh->gbp.policy_id);
+
+ if (vxh->gbp.dont_learn)
+ md.gbp |= VXLAN_GBP_DONT_LEARN;
+
+ if (vxh->gbp.policy_applied)
+ md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+ }
} else {
if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
(vxh->vx_vni & htonl(0xff)))
@@ -1122,7 +1138,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
- vs->rcv(vs, skb, vxh->vx_vni);
+ md.vni = vxh->vx_vni;
+ vs->rcv(vs, skb, &md);
return 0;
drop:
@@ -1138,8 +1155,8 @@ error:
return 1;
}
-static void vxlan_rcv(struct vxlan_sock *vs,
- struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct iphdr *oip = NULL;
struct ipv6hdr *oip6 = NULL;
@@ -1150,7 +1167,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
int err = 0;
union vxlan_addr *remote_ip;
- vni = ntohl(vx_vni) >> 8;
+ vni = ntohl(md->vni) >> 8;
/* Is this VNI defined? */
vxlan = vxlan_vs_find_vni(vs, vni);
if (!vxlan)
@@ -1184,6 +1201,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
goto drop;
skb_reset_network_header(skb);
+ skb->mark = md->gbp;
if (oip6)
err = IP6_ECN_decapsulate(oip6, skb);
@@ -1533,15 +1551,54 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
return false;
}
+static int vxlan_build_hdr(struct sk_buff *skb, struct vxlan_sock *vs,
+ int min_headroom, struct vxlan_metadata *md)
+{
+ struct vxlanhdr *vxh;
+ int err;
+
+ /* Need space for new headers (invalidates iph ptr) */
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (WARN_ON(!skb))
+ return -ENOMEM;
+
+ vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+ vxh->vx_flags = htonl(VXLAN_FLAGS);
+ vxh->vx_vni = md->vni;
+
+ if (vs->exts) {
+ if (vs->exts & VXLAN_EXT_GBP) {
+ vxh->gbp_present = 1;
+
+ if (md->gbp & VXLAN_GBP_DONT_LEARN)
+ vxh->gbp.dont_learn = 1;
+
+ if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
+ vxh->gbp.policy_applied = 1;
+
+ vxh->gbp.policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
+ }
+ }
+
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
struct dst_entry *dst, struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr, __u8 prio, __u8 ttl,
- __be16 src_port, __be16 dst_port, __be32 vni,
- bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
@@ -1558,24 +1615,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct ipv6hdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- goto err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb)) {
- err = -ENOMEM;
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
goto err;
- }
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
ttl, src_port, dst_port);
@@ -1589,9 +1631,9 @@ err:
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
+ __be16 src_port, __be16 dst_port,
+ struct vxlan_metadata *md, bool xnet)
{
- struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !vs->sock->sk->sk_no_check_tx;
@@ -1604,22 +1646,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
+ VXLAN_HLEN + sizeof(struct iphdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
- /* Need space for new headers (invalidates iph ptr) */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
+ err = vxlan_build_hdr(skb, vs, min_headroom, md);
+ if (err)
return err;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (WARN_ON(!skb))
- return -ENOMEM;
-
- vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
- vxh->vx_flags = htonl(VXLAN_FLAGS);
- vxh->vx_vni = vni;
-
- skb_set_inner_protocol(skb, htons(ETH_P_TEB));
return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
ttl, df, src_port, dst_port, xnet);
@@ -1679,6 +1708,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *old_iph;
struct flowi4 fl4;
union vxlan_addr *dst;
+ struct vxlan_metadata md;
__be16 src_port = 0, dst_port;
u32 vni;
__be16 df = 0;
@@ -1749,11 +1779,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
fl4.saddr, dst->sin.sin_addr.s_addr,
- tos, ttl, df, src_port, dst_port,
- htonl(vni << 8),
+ tos, ttl, df, src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
if (err < 0) {
/* skb is already freed. */
@@ -1806,10 +1837,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ttl = ttl ? : ip6_dst_hoplimit(ndst);
+ md.vni = htonl(vni << 8);
+ md.gbp = skb->mark;
err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
dev, &fl6.saddr, &fl6.daddr, 0, ttl,
- src_port, dst_port, htonl(vni << 8),
+ src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)));
#endif
}
@@ -2210,6 +2243,11 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
+ [IFLA_VXLAN_EXTENSION] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy vxlan_ext_policy[IFLA_VXLAN_EXT_MAX + 1] = {
+ [IFLA_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -2246,6 +2284,18 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
}
}
+ if (data[IFLA_VXLAN_EXTENSION]) {
+ int err;
+
+ err = nla_validate_nested(data[IFLA_VXLAN_EXTENSION],
+ IFLA_VXLAN_EXT_MAX, vxlan_ext_policy);
+ if (err < 0) {
+ pr_debug("invalid VXLAN extension configuration: %d\n",
+ err);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -2400,6 +2450,18 @@ static void vxlan_sock_work(struct work_struct *work)
dev_put(vxlan->dev);
}
+static void configure_vxlan_exts(struct vxlan_dev *vxlan, struct nlattr *attr)
+{
+ struct nlattr *exts[IFLA_VXLAN_EXT_MAX+1];
+
+ /* Validated in vxlan_validate() */
+ if (nla_parse_nested(exts, IFLA_VXLAN_EXT_MAX, attr, NULL) < 0)
+ BUG();
+
+ if (exts[IFLA_VXLAN_EXT_GBP])
+ vxlan->exts |= VXLAN_EXT_GBP;
+}
+
static int vxlan_newlink(struct net *net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
@@ -2525,6 +2587,9 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
+ if (data[IFLA_VXLAN_EXTENSION])
+ configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
+
if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
vxlan->dst_port)) {
pr_info("duplicate VNI %u\n", vni);
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 3e98d31..66000d0 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -11,13 +11,60 @@
#define VNI_HASH_BITS 10
#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
+/*
+ * VXLAN Group Based Policy Extension:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * D = Don't Learn bit. When set, this bit indicates that the egress
+ * VTEP MUST NOT learn the source address of the encapsulated frame.
+ *
+ * A = Indicates that the group policy has already been applied to
+ * this packet. Policies MUST NOT be applied by devices when the
+ * A bit is set.
+ *
+ * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
+ */
+struct vxlan_gbp {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+ __u8 reserved_flags1:3,
+ policy_applied:1,
+ reserved_flags2:2,
+ dont_learn:1,
+ reserved_flags3:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 reserved_flags1:1,
+ dont_learn:1,
+ reserved_flags2:2,
+ policy_applied:1,
+ reserved_flags3:3;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __be16 policy_id;
+} __packed;
+
+/* skb->mark mapping
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+#define VXLAN_GBP_DONT_LEARN (BIT(6) << 16)
+#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16)
+#define VXLAN_GBP_ID_MASK (0xFFFF)
+
/* VXLAN protocol header:
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |R|R|R|R|I|R|R|R| Reserved |
+ * |G|R|R|R|I|R|R|R| Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | VXLAN Network Identifier (VNI) | Reserved |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
+ * G = 1 Group Policy (VXLAN-GBP)
* I = 1 VXLAN Network Identifier (VNI) present
*/
struct vxlanhdr {
@@ -26,24 +73,42 @@ struct vxlanhdr {
#ifdef __LITTLE_ENDIAN_BITFIELD
__u8 reserved_flags1:3,
vni_present:1,
- reserved_flags2:4;
+ reserved_flags2:3,
+ gbp_present:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
- __u8 reserved_flags2:4,
+ __u8 gbp_present:1,
+ reserved_flags2:3,
vni_present:1,
reserved_flags1:3;
#else
#error "Please fix <asm/byteorder.h>"
#endif
- __u8 vx_reserved1;
- __be16 vx_reserved2;
+ union {
+ /* NOTE: Offset 0 will be 1 byte aligned, so
+ * all member structs must be marked packed.
+ */
+ struct vxlan_gbp gbp;
+ struct {
+ __u8 vx_reserved1;
+ __be16 vx_reserved2;
+ } __packed;
+ };
};
__be32 vx_flags;
};
__be32 vx_vni;
};
+struct vxlan_metadata {
+ __be32 vni;
+ u32 gbp;
+};
+
struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
+ struct vxlan_metadata *md);
+
+#define VXLAN_EXT_GBP BIT(0)
/* per UDP socket information */
struct vxlan_sock {
@@ -78,7 +143,8 @@ void vxlan_sock_release(struct vxlan_sock *vs);
int vxlan_xmit_skb(struct vxlan_sock *vs,
struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port, __be32 vni, bool xnet);
+ __be16 src_port, __be16 dst_port, struct vxlan_metadata *md,
+ bool xnet);
static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
netdev_features_t features)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index f7d0d2d..9f07bf5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -370,10 +370,18 @@ enum {
IFLA_VXLAN_UDP_CSUM,
IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
+ IFLA_VXLAN_EXTENSION,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
+enum {
+ IFLA_VXLAN_EXT_UNSPEC,
+ IFLA_VXLAN_EXT_GBP,
+ __IFLA_VXLAN_EXT_MAX,
+};
+#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
+
struct ifla_vxlan_port_range {
__be16 low;
__be16 high;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index d7c46b3..dd68c97 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -59,7 +59,8 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
}
/* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md)
{
struct ovs_tunnel_info tun_info;
struct vport *vport = vs->data;
@@ -68,7 +69,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
/* Save outer tunnel values */
iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(vx_vni) >> 8);
+ key = cpu_to_be64(ntohl(md->vni) >> 8);
ovs_flow_tun_info_init(&tun_info, iph,
udp_hdr(skb)->source, udp_hdr(skb)->dest,
key, TUNNEL_KEY, NULL, 0);
@@ -146,6 +147,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
struct vxlan_port *vxlan_port = vxlan_vport(vport);
__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
struct ovs_key_ipv4_tunnel *tun_key;
+ struct vxlan_metadata md;
struct rtable *rt;
struct flowi4 fl;
__be16 src_port;
@@ -178,12 +180,13 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
skb->ignore_df = 1;
src_port = udp_flow_src_port(net, skb, 0, 0, true);
+ md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
fl.saddr, tun_key->ipv4_dst,
tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
src_port, dst_port,
- htonl(be64_to_cpu(tun_key->tun_id) << 8),
+ &md,
false);
if (err < 0)
ip_rt_put(rt);
--
1.9.3
^ permalink raw reply related [flat|nested] 44+ messages in thread
* Re: [PATCH 2/6] vxlan: Group Policy extension
2015-01-07 2:05 ` [PATCH 2/6] vxlan: Group Policy extension Thomas Graf
@ 2015-01-07 16:05 ` Tom Herbert
[not found] ` <CA+mtBx_Jj-tUM1nbHd2fHb0-=QpK3tcQgA=smWmg=cB-fupdGg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
0 siblings, 1 reply; 44+ messages in thread
From: Tom Herbert @ 2015-01-07 16:05 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Jesse Gross, Stephen Hemminger, Pravin B Shelar,
Linux Netdev List, dev@openvswitch.org
On Tue, Jan 6, 2015 at 6:05 PM, Thomas Graf <tgraf@suug.ch> wrote:
> Implements support for the Group Policy VXLAN extension [0] to provide
> a lightweight and simple security label mechanism across network peers
> based on VXLAN. The security context and associated metadata is mapped
> to/from skb->mark. This allows further mapping to a SELinux context
> using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
> tc, etc.
>
> The group membership is defined by the lower 16 bits of skb->mark, the
> upper 16 bits are used for flags.
>
> SELinux allows managing labels to secure local resources. However,
> distributed applications require ACLs to be implemented across hosts. This
> is typically achieved by matching on L2-L4 fields to identify the
> original sending host and process on the receiver. On top of that,
> netlabel and specifically CIPSO [1] allow mapping security contexts to
> universal labels. However, netlabel and CIPSO are relatively complex.
> This patch provides a lightweight alternative for overlay network
> environments with a trusted underlay. No additional control protocol
> is required.
>
Associating a sixteen bit field with security is worrisome, especially
considering that VXLAN provides no verification for any header fields
and doesn't even advocate use of outer UDP checksum so the field is
susceptible to an undetected single bit flip. The concept of a
"trusted underlay" is weak justification and hardly universal, so the
only way to actually secure this is through IPsec (this is mentioned
in the VXLAN-GBP draft). But if we have the security state of IPsec
then why would we need this field anyway?
Could this same functionality be achieved if we just match the VNI to
a mark in iptables?
Tom
> Host 1: Host 2:
>
> Group A Group B Group B Group A
> +-----+ +-------------+ +-------+ +-----+
> | lxc | | SELinux CTX | | httpd | | VM |
> +--+--+ +--+----------+ +---+---+ +--+--+
> \---+---/ \----+---/
> | |
> +---+---+ +---+---+
> | vxlan | | vxlan |
> +---+---+ +---+---+
> +------------------------------+
>
> Backwards compatibility:
> A VXLAN-GBP socket can receive standard VXLAN frames and will assign
> the default group 0x0000 to such frames. A Linux VXLAN socket will
> drop VXLAN-GBP frames. The extension is therefore disabled by default
> and needs to be specifically enabled:
>
> ip link add [...] type vxlan [...] gbp
>
> In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
> must run on a separate port number.
>
> Examples:
> iptables:
> $ iptables -I OUTPUT -p icmp -j MARK --set-mark 0x200
> $ iptables -I INPUT -i br0 -m mark --mark 0x200 -j ACCEPT
>
> OVS (patches provided separately):
> in_port=1, actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL
>
> [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
> [1] http://lwn.net/Articles/204905/
>
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
> drivers/net/vxlan.c | 155 ++++++++++++++++++++++++++++++------------
> include/net/vxlan.h | 80 ++++++++++++++++++++--
> include/uapi/linux/if_link.h | 8 +++
> net/openvswitch/vport-vxlan.c | 9 ++-
> 4 files changed, 197 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 4d52aa9..30b7b59 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -132,6 +132,7 @@ struct vxlan_dev {
> __u8 tos; /* TOS override */
> __u8 ttl;
> u32 flags; /* VXLAN_F_* in vxlan.h */
> + u32 exts; /* Enabled extensions */
>
> struct work_struct sock_work;
> struct work_struct igmp_join;
> @@ -568,7 +569,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff
> continue;
>
> vh2 = (struct vxlanhdr *)(p->data + off_vx);
> - if (vh->vx_vni != vh2->vx_vni) {
> + if (vh->vx_flags != vh2->vx_flags ||
> + vh->vx_vni != vh2->vx_vni) {
> NAPI_GRO_CB(p)->same_flow = 0;
> continue;
> }
> @@ -1095,6 +1097,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> {
> struct vxlan_sock *vs;
> struct vxlanhdr *vxh;
> + struct vxlan_metadata md = {0};
>
> /* Need Vxlan and inner Ethernet header to be present */
> if (!pskb_may_pull(skb, VXLAN_HLEN))
> @@ -1113,6 +1116,19 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> if (vs->exts) {
> if (!vxh->vni_present)
> goto error_invalid_header;
> +
> + if (vxh->gbp_present) {
> + if (!(vs->exts & VXLAN_EXT_GBP))
> + goto error_invalid_header;
> +
> + md.gbp = ntohs(vxh->gbp.policy_id);
> +
> + if (vxh->gbp.dont_learn)
> + md.gbp |= VXLAN_GBP_DONT_LEARN;
> +
> + if (vxh->gbp.policy_applied)
> + md.gbp |= VXLAN_GBP_POLICY_APPLIED;
> + }
> } else {
> if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> (vxh->vx_vni & htonl(0xff)))
> @@ -1122,7 +1138,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
> if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
> goto drop;
>
> - vs->rcv(vs, skb, vxh->vx_vni);
> + md.vni = vxh->vx_vni;
> + vs->rcv(vs, skb, &md);
> return 0;
>
> drop:
> @@ -1138,8 +1155,8 @@ error:
> return 1;
> }
>
> -static void vxlan_rcv(struct vxlan_sock *vs,
> - struct sk_buff *skb, __be32 vx_vni)
> +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> + struct vxlan_metadata *md)
> {
> struct iphdr *oip = NULL;
> struct ipv6hdr *oip6 = NULL;
> @@ -1150,7 +1167,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
> int err = 0;
> union vxlan_addr *remote_ip;
>
> - vni = ntohl(vx_vni) >> 8;
> + vni = ntohl(md->vni) >> 8;
> /* Is this VNI defined? */
> vxlan = vxlan_vs_find_vni(vs, vni);
> if (!vxlan)
> @@ -1184,6 +1201,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
> goto drop;
>
> skb_reset_network_header(skb);
> + skb->mark = md->gbp;
>
> if (oip6)
> err = IP6_ECN_decapsulate(oip6, skb);
> @@ -1533,15 +1551,54 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
> return false;
> }
>
> +static int vxlan_build_hdr(struct sk_buff *skb, struct vxlan_sock *vs,
> + int min_headroom, struct vxlan_metadata *md)
> +{
> + struct vxlanhdr *vxh;
> + int err;
> +
> + /* Need space for new headers (invalidates iph ptr) */
> + err = skb_cow_head(skb, min_headroom);
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + return err;
> + }
> +
> + skb = vlan_hwaccel_push_inside(skb);
> + if (WARN_ON(!skb))
> + return -ENOMEM;
> +
> + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> + vxh->vx_flags = htonl(VXLAN_FLAGS);
> + vxh->vx_vni = md->vni;
> +
> + if (vs->exts) {
> + if (vs->exts & VXLAN_EXT_GBP) {
> + vxh->gbp_present = 1;
> +
> + if (md->gbp & VXLAN_GBP_DONT_LEARN)
> + vxh->gbp.dont_learn = 1;
> +
> + if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
> + vxh->gbp.policy_applied = 1;
> +
> + vxh->gbp.policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
> + }
> + }
> +
> + skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +
> + return 0;
> +}
> +
> #if IS_ENABLED(CONFIG_IPV6)
> static int vxlan6_xmit_skb(struct vxlan_sock *vs,
> struct dst_entry *dst, struct sk_buff *skb,
> struct net_device *dev, struct in6_addr *saddr,
> struct in6_addr *daddr, __u8 prio, __u8 ttl,
> - __be16 src_port, __be16 dst_port, __be32 vni,
> - bool xnet)
> + __be16 src_port, __be16 dst_port,
> + struct vxlan_metadata *md, bool xnet)
> {
> - struct vxlanhdr *vxh;
> int min_headroom;
> int err;
> bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
> @@ -1558,24 +1615,9 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs,
> + VXLAN_HLEN + sizeof(struct ipv6hdr)
> + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
>
> - /* Need space for new headers (invalidates iph ptr) */
> - err = skb_cow_head(skb, min_headroom);
> - if (unlikely(err)) {
> - kfree_skb(skb);
> - goto err;
> - }
> -
> - skb = vlan_hwaccel_push_inside(skb);
> - if (WARN_ON(!skb)) {
> - err = -ENOMEM;
> + err = vxlan_build_hdr(skb, vs, min_headroom, md);
> + if (err)
> goto err;
> - }
> -
> - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> - vxh->vx_flags = htonl(VXLAN_FLAGS);
> - vxh->vx_vni = vni;
> -
> - skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>
> udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
> ttl, src_port, dst_port);
> @@ -1589,9 +1631,9 @@ err:
> int vxlan_xmit_skb(struct vxlan_sock *vs,
> struct rtable *rt, struct sk_buff *skb,
> __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> - __be16 src_port, __be16 dst_port, __be32 vni, bool xnet)
> + __be16 src_port, __be16 dst_port,
> + struct vxlan_metadata *md, bool xnet)
> {
> - struct vxlanhdr *vxh;
> int min_headroom;
> int err;
> bool udp_sum = !vs->sock->sk->sk_no_check_tx;
> @@ -1604,22 +1646,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
> + VXLAN_HLEN + sizeof(struct iphdr)
> + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
>
> - /* Need space for new headers (invalidates iph ptr) */
> - err = skb_cow_head(skb, min_headroom);
> - if (unlikely(err)) {
> - kfree_skb(skb);
> + err = vxlan_build_hdr(skb, vs, min_headroom, md);
> + if (err)
> return err;
> - }
> -
> - skb = vlan_hwaccel_push_inside(skb);
> - if (WARN_ON(!skb))
> - return -ENOMEM;
> -
> - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
> - vxh->vx_flags = htonl(VXLAN_FLAGS);
> - vxh->vx_vni = vni;
> -
> - skb_set_inner_protocol(skb, htons(ETH_P_TEB));
>
> return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
> ttl, df, src_port, dst_port, xnet);
> @@ -1679,6 +1708,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
> const struct iphdr *old_iph;
> struct flowi4 fl4;
> union vxlan_addr *dst;
> + struct vxlan_metadata md;
> __be16 src_port = 0, dst_port;
> u32 vni;
> __be16 df = 0;
> @@ -1749,11 +1779,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>
> tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
> ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
> + md.vni = htonl(vni << 8);
> + md.gbp = skb->mark;
>
> err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
> fl4.saddr, dst->sin.sin_addr.s_addr,
> - tos, ttl, df, src_port, dst_port,
> - htonl(vni << 8),
> + tos, ttl, df, src_port, dst_port, &md,
> !net_eq(vxlan->net, dev_net(vxlan->dev)));
> if (err < 0) {
> /* skb is already freed. */
> @@ -1806,10 +1837,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
> }
>
> ttl = ttl ? : ip6_dst_hoplimit(ndst);
> + md.vni = htonl(vni << 8);
> + md.gbp = skb->mark;
>
> err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
> dev, &fl6.saddr, &fl6.daddr, 0, ttl,
> - src_port, dst_port, htonl(vni << 8),
> + src_port, dst_port, &md,
> !net_eq(vxlan->net, dev_net(vxlan->dev)));
> #endif
> }
> @@ -2210,6 +2243,11 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
> [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
> [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
> [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
> + [IFLA_VXLAN_EXTENSION] = { .type = NLA_NESTED },
> +};
> +
> +static const struct nla_policy vxlan_ext_policy[IFLA_VXLAN_EXT_MAX + 1] = {
> + [IFLA_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
> };
>
> static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> @@ -2246,6 +2284,18 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
> }
> }
>
> + if (data[IFLA_VXLAN_EXTENSION]) {
> + int err;
> +
> + err = nla_validate_nested(data[IFLA_VXLAN_EXTENSION],
> + IFLA_VXLAN_EXT_MAX, vxlan_ext_policy);
> + if (err < 0) {
> + pr_debug("invalid VXLAN extension configuration: %d\n",
> + err);
> + return -EINVAL;
> + }
> + }
> +
> return 0;
> }
>
> @@ -2400,6 +2450,18 @@ static void vxlan_sock_work(struct work_struct *work)
> dev_put(vxlan->dev);
> }
>
> +static void configure_vxlan_exts(struct vxlan_dev *vxlan, struct nlattr *attr)
> +{
> + struct nlattr *exts[IFLA_VXLAN_EXT_MAX+1];
> +
> + /* Validated in vxlan_validate() */
> + if (nla_parse_nested(exts, IFLA_VXLAN_EXT_MAX, attr, NULL) < 0)
> + BUG();
> +
> + if (exts[IFLA_VXLAN_EXT_GBP])
> + vxlan->exts |= VXLAN_EXT_GBP;
> +}
> +
> static int vxlan_newlink(struct net *net, struct net_device *dev,
> struct nlattr *tb[], struct nlattr *data[])
> {
> @@ -2525,6 +2587,9 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
> nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
> vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
>
> + if (data[IFLA_VXLAN_EXTENSION])
> + configure_vxlan_exts(vxlan, data[IFLA_VXLAN_EXTENSION]);
> +
> if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
> vxlan->dst_port)) {
> pr_info("duplicate VNI %u\n", vni);
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 3e98d31..66000d0 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -11,13 +11,60 @@
> #define VNI_HASH_BITS 10
> #define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
>
> +/*
> + * VXLAN Group Based Policy Extension:
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |1|-|-|-|1|-|-|-|R|D|R|R|A|R|R|R| Group Policy ID |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * | VXLAN Network Identifier (VNI) | Reserved |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + *
> + * D = Don't Learn bit. When set, this bit indicates that the egress
> + * VTEP MUST NOT learn the source address of the encapsulated frame.
> + *
> + * A = Indicates that the group policy has already been applied to
> + * this packet. Policies MUST NOT be applied by devices when the
> + * A bit is set.
> + *
> + * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
> + */
> +struct vxlan_gbp {
> +#ifdef __LITTLE_ENDIAN_BITFIELD
> + __u8 reserved_flags1:3,
> + policy_applied:1,
> + reserved_flags2:2,
> + dont_learn:1,
> + reserved_flags3:1;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> + __u8 reserved_flags1:1,
> + dont_learn:1,
> + reserved_flags2:2,
> + policy_applied:1,
> + reserved_flags3:3;
> +#else
> +#error "Please fix <asm/byteorder.h>"
> +#endif
> + __be16 policy_id;
> +} __packed;
> +
> +/* skb->mark mapping
> + *
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID |
> + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> + */
> +#define VXLAN_GBP_DONT_LEARN (BIT(6) << 16)
> +#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16)
> +#define VXLAN_GBP_ID_MASK (0xFFFF)
> +
> /* VXLAN protocol header:
> * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> - * |R|R|R|R|I|R|R|R| Reserved |
> + * |G|R|R|R|I|R|R|R| Reserved |
> * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> * | VXLAN Network Identifier (VNI) | Reserved |
> * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
> *
> + * G = 1 Group Policy (VXLAN-GBP)
> * I = 1 VXLAN Network Identifier (VNI) present
> */
> struct vxlanhdr {
> @@ -26,24 +73,42 @@ struct vxlanhdr {
> #ifdef __LITTLE_ENDIAN_BITFIELD
> __u8 reserved_flags1:3,
> vni_present:1,
> - reserved_flags2:4;
> + reserved_flags2:3,
> + gbp_present:1;
> #elif defined(__BIG_ENDIAN_BITFIELD)
> - __u8 reserved_flags2:4,
> + __u8 gbp_present:1,
> + reserved_flags2:3,
> vni_present:1,
> reserved_flags1:3;
> #else
> #error "Please fix <asm/byteorder.h>"
> #endif
> - __u8 vx_reserved1;
> - __be16 vx_reserved2;
> + union {
> + /* NOTE: Offset 0 will be 1 byte aligned, so
> + * all member structs must be marked packed.
> + */
> + struct vxlan_gbp gbp;
> + struct {
> + __u8 vx_reserved1;
> + __be16 vx_reserved2;
> + } __packed;
> + };
> };
> __be32 vx_flags;
> };
> __be32 vx_vni;
> };
>
> +struct vxlan_metadata {
> + __be32 vni;
> + u32 gbp;
> +};
> +
> struct vxlan_sock;
> -typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key);
> +typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
> + struct vxlan_metadata *md);
> +
> +#define VXLAN_EXT_GBP BIT(0)
>
> /* per UDP socket information */
> struct vxlan_sock {
> @@ -78,7 +143,8 @@ void vxlan_sock_release(struct vxlan_sock *vs);
> int vxlan_xmit_skb(struct vxlan_sock *vs,
> struct rtable *rt, struct sk_buff *skb,
> __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
> - __be16 src_port, __be16 dst_port, __be32 vni, bool xnet);
> + __be16 src_port, __be16 dst_port, struct vxlan_metadata *md,
> + bool xnet);
>
> static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
> netdev_features_t features)
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index f7d0d2d..9f07bf5 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -370,10 +370,18 @@ enum {
> IFLA_VXLAN_UDP_CSUM,
> IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
> IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
> + IFLA_VXLAN_EXTENSION,
> __IFLA_VXLAN_MAX
> };
> #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
>
> +enum {
> + IFLA_VXLAN_EXT_UNSPEC,
> + IFLA_VXLAN_EXT_GBP,
> + __IFLA_VXLAN_EXT_MAX,
> +};
> +#define IFLA_VXLAN_EXT_MAX (__IFLA_VXLAN_EXT_MAX - 1)
> +
> struct ifla_vxlan_port_range {
> __be16 low;
> __be16 high;
> diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
> index d7c46b3..dd68c97 100644
> --- a/net/openvswitch/vport-vxlan.c
> +++ b/net/openvswitch/vport-vxlan.c
> @@ -59,7 +59,8 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
> }
>
> /* Called with rcu_read_lock and BH disabled. */
> -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
> +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
> + struct vxlan_metadata *md)
> {
> struct ovs_tunnel_info tun_info;
> struct vport *vport = vs->data;
> @@ -68,7 +69,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
>
> /* Save outer tunnel values */
> iph = ip_hdr(skb);
> - key = cpu_to_be64(ntohl(vx_vni) >> 8);
> + key = cpu_to_be64(ntohl(md->vni) >> 8);
> ovs_flow_tun_info_init(&tun_info, iph,
> udp_hdr(skb)->source, udp_hdr(skb)->dest,
> key, TUNNEL_KEY, NULL, 0);
> @@ -146,6 +147,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
> struct vxlan_port *vxlan_port = vxlan_vport(vport);
> __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
> struct ovs_key_ipv4_tunnel *tun_key;
> + struct vxlan_metadata md;
> struct rtable *rt;
> struct flowi4 fl;
> __be16 src_port;
> @@ -178,12 +180,13 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
> skb->ignore_df = 1;
>
> src_port = udp_flow_src_port(net, skb, 0, 0, true);
> + md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
>
> err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
> fl.saddr, tun_key->ipv4_dst,
> tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
> src_port, dst_port,
> - htonl(be64_to_cpu(tun_key->tun_id) << 8),
> + &md,
> false);
> if (err < 0)
> ip_rt_put(rt);
> --
> 1.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 44+ messages in thread