* [PATCH nf-next v10 5/8] openvswitch: Find existing conntrack entry after upcall.
2016-03-10 18:54 [PATCH nf-next v10 0/8] openvswitch: NAT support Jarno Rajahalme
` (2 preceding siblings ...)
2016-03-10 18:54 ` [PATCH nf-next v10 4/8] openvswitch: Update the CT state key only after nf_conntrack_in() Jarno Rajahalme
@ 2016-03-10 18:54 ` Jarno Rajahalme
2016-03-10 18:54 ` [PATCH nf-next v10 7/8] openvswitch: Delay conntrack helper call for new connections Jarno Rajahalme
` (2 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Jarno Rajahalme @ 2016-03-10 18:54 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, dev, jarno
Add a new function ovs_ct_find_existing() to find an existing
conntrack entry for which this packet was already applied to. This is
only to be called when there is evidence that the packet was already
tracked and committed, but we lost the ct reference due to an
userspace upcall.
ovs_ct_find_existing() is called from skb_nfct_cached(), which can now
hide the fact that the ct reference may have been lost due to an
upcall. This allows ovs_ct_commit() to be simplified.
This patch is needed by later "openvswitch: Interface with NAT" patch,
as we need to be able to pass the packet through NAT using the
original ct reference also after the reference is lost after an
upcall.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
---
net/openvswitch/conntrack.c | 103 ++++++++++++++++++++++++++++++++++++++------
1 file changed, 90 insertions(+), 13 deletions(-)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index a487bb3..ae36fe2 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -356,14 +356,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
return __nf_ct_expect_find(net, zone, &tuple);
}
+/* This replicates logic from nf_conntrack_core.c that is not exported. */
+static enum ip_conntrack_info
+ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
+{
+ const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ return IP_CT_ESTABLISHED_REPLY;
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ return IP_CT_ESTABLISHED;
+ if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+ return IP_CT_RELATED;
+ return IP_CT_NEW;
+}
+
+/* Find an existing connection which this packet belongs to without
+ * re-attributing statistics or modifying the connection state. This allows an
+ * skb->nfct lost due to an upcall to be recovered during actions execution.
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * On success, populates skb->nfct and skb->nfctinfo, and returns the
+ * connection. Returns NULL if there is no existing entry.
+ */
+static struct nf_conn *
+ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
+ u8 l3num, struct sk_buff *skb)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ unsigned int dataoff;
+ u8 protonum;
+
+ l3proto = __nf_ct_l3proto_find(l3num);
+ if (!l3proto) {
+ pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
+ return NULL;
+ }
+ if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+ &protonum) <= 0) {
+ pr_debug("ovs_ct_find_existing: Can't get protonum\n");
+ return NULL;
+ }
+ l4proto = __nf_ct_l4proto_find(l3num, protonum);
+ if (!l4proto) {
+ pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
+ return NULL;
+ }
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+ protonum, net, &tuple, l3proto, l4proto)) {
+ pr_debug("ovs_ct_find_existing: Can't get tuple\n");
+ return NULL;
+ }
+
+ /* look for tuple match */
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return NULL; /* Not found. */
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ ctinfo = ovs_ct_get_info(h);
+ if (ctinfo == IP_CT_NEW) {
+ /* This should not happen. */
+ WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = ctinfo;
+ return ct;
+}
+
/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
-static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
- const struct ovs_conntrack_info *info)
+static bool skb_nfct_cached(struct net *net,
+ const struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
+ /* If no ct, check if we have evidence that an existing conntrack entry
+ * might be found for this skb. This happens when we lose a skb->nfct
+ * due to an upcall. If the connection was not confirmed, it is not
+ * cached and needs to be run through conntrack again.
+ */
+ if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
+ !(key->ct.state & OVS_CS_F_INVALID) &&
+ key->ct.zone == info->zone.id)
+ ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
if (!ct)
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -396,7 +483,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* actually run the packet through conntrack twice unless it's for a
* different zone.
*/
- if (!skb_nfct_cached(net, skb, info)) {
+ if (!skb_nfct_cached(net, key, info, skb)) {
struct nf_conn *tmpl = info->ct;
/* Associate skb with specified zone. */
@@ -459,18 +546,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
- u8 state;
int err;
- state = key->ct.state;
- if (key->ct.zone == info->zone.id &&
- ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
- /* Previous lookup has shown that this connection is already
- * tracked and committed. Skip committing.
- */
- return 0;
- }
-
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
--
2.1.4
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH nf-next v10 8/8] openvswitch: Interface with NAT.
2016-03-10 18:54 [PATCH nf-next v10 0/8] openvswitch: NAT support Jarno Rajahalme
` (4 preceding siblings ...)
2016-03-10 18:54 ` [PATCH nf-next v10 7/8] openvswitch: Delay conntrack helper call for new connections Jarno Rajahalme
@ 2016-03-10 18:54 ` Jarno Rajahalme
[not found] ` <1457636063-92746-9-git-send-email-jarno-LZ6Gd1LRuIk@public.gmane.org>
2016-03-15 0:01 ` [PATCH nf-next v10 0/8] openvswitch: NAT support Pablo Neira Ayuso
6 siblings, 1 reply; 14+ messages in thread
From: Jarno Rajahalme @ 2016-03-10 18:54 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, dev, jarno
Extend OVS conntrack interface to cover NAT. New nested
OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action.
A bare OVS_CT_ATTR_NAT only mangles existing and expected connections.
If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.
The corresponding OVS userspace patch series includes test cases (in
tests/system-traffic.at) that also serve as example uses.
This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
---
include/uapi/linux/openvswitch.h | 49 ++++
net/openvswitch/Kconfig | 3 +-
net/openvswitch/conntrack.c | 524 +++++++++++++++++++++++++++++++++++++--
net/openvswitch/conntrack.h | 3 +-
4 files changed, 551 insertions(+), 28 deletions(-)
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a27222d..616d047 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
#define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */
#define OVS_CS_F_INVALID 0x10 /* Could not track connection. */
#define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was
+ * mangled by NAT.
+ */
+#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port
+ * was mangled by NAT.
+ */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
/**
* enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
* mask. For each bit set in the mask, the corresponding bit in the value is
* copied to the connection tracking label field in the connection.
* @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
*/
enum ovs_ct_attr {
OVS_CT_ATTR_UNSPEC,
@@ -641,12 +651,51 @@ enum ovs_ct_attr {
OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */
OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of
related connections. */
+ OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */
__OVS_CT_ATTR_MAX
};
#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified. Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified. As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections. The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+ OVS_NAT_ATTR_UNSPEC,
+ OVS_NAT_ATTR_SRC,
+ OVS_NAT_ATTR_DST,
+ OVS_NAT_ATTR_IP_MIN,
+ OVS_NAT_ATTR_IP_MAX,
+ OVS_NAT_ATTR_PROTO_MIN,
+ OVS_NAT_ATTR_PROTO_MAX,
+ OVS_NAT_ATTR_PERSISTENT,
+ OVS_NAT_ATTR_PROTO_HASH,
+ OVS_NAT_ATTR_PROTO_RANDOM,
+ __OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
+/**
* enum ovs_action_attr - Action types.
*
* @OVS_ACTION_ATTR_OUTPUT: Output packet to port.
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d..234a733 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
tristate "Open vSwitch"
depends on INET
depends on !NF_CONNTRACK || \
- (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+ (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+ (!NF_NAT || NF_NAT)))
select LIBCRC32C
select MPLS
select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f718b72..dc5eb29 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
#include <linux/module.h>
#include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
#include <net/ip.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"
struct ovs_ct_len_tbl {
- size_t maxlen;
- size_t minlen;
+ int maxlen;
+ int minlen;
};
/* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
struct ovs_key_ct_labels mask;
};
+enum ovs_ct_nat {
+ OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */
+ OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+ OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
/* Conntrack action context for execution. */
struct ovs_conntrack_info {
struct nf_conntrack_helper *helper;
struct nf_conntrack_zone zone;
struct nf_conn *ct;
u8 commit : 1;
+ u8 nat : 3; /* enum ovs_ct_nat */
u16 family;
struct md_mark mark;
struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
+#endif
};
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
ovs_ct_get_labels(ct, &key->ct.labels);
}
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action. If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
*/
static void ovs_ct_update_key(const struct sk_buff *skb,
const struct ovs_conntrack_info *info,
- struct sw_flow_key *key, bool post_ct)
+ struct sw_flow_key *key, bool post_ct,
+ bool keep_nat_flags)
{
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
@@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
*/
if (ct->master)
state |= OVS_CS_F_RELATED;
+ if (keep_nat_flags) {
+ state |= key->ct.state & OVS_CS_F_NAT_MASK;
+ } else {
+ if (ct->status & IPS_SRC_NAT)
+ state |= OVS_CS_F_SRC_NAT;
+ if (ct->status & IPS_DST_NAT)
+ state |= OVS_CS_F_DST_NAT;
+ }
zone = nf_ct_zone(ct);
} else if (post_ct) {
state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
*/
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
- ovs_ct_update_key(skb, NULL, key, false);
+ ovs_ct_update_key(skb, NULL, key, false, false);
}
int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
enum ip_conntrack_info ctinfo;
unsigned int protoff;
struct nf_conn *ct;
+ int err;
ct = nf_ct_get(skb, &ctinfo);
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
return NF_DROP;
}
- return helper->help(skb, protoff, ct, ctinfo);
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
}
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net,
return true;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype)
+{
+ int hooknum, nh_off, err = NF_ACCEPT;
+
+ nh_off = skb_network_offset(skb);
+ skb_pull(skb, nh_off);
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (skb->protocol == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto push;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto push;
+ }
+#endif
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto push;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto push;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+ skb_push(skb, nh_off);
+
+ return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+ const struct sk_buff *skb,
+ enum nf_nat_manip_type maniptype)
+{
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ __be16 src;
+
+ key->ct.state |= OVS_CS_F_SRC_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.src = ip_hdr(skb)->saddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+ sizeof(key->ipv6.addr.src));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ src = udp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_TCP)
+ src = tcp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ src = sctp_hdr(skb)->source;
+ else
+ return;
+
+ key->tp.src = src;
+ } else {
+ __be16 dst;
+
+ key->ct.state |= OVS_CS_F_DST_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+ sizeof(key->ipv6.addr.dst));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ dst = udp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_TCP)
+ dst = tcp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ dst = sctp_hdr(skb)->dest;
+ else
+ return;
+
+ key->tp.dst = dst;
+ }
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ enum nf_nat_manip_type maniptype;
+ int err;
+
+ if (nf_ct_is_untracked(ct)) {
+ /* A NAT action may only be performed on tracked packets. */
+ return NF_ACCEPT;
+ }
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_ACCEPT; /* Can't NAT. */
+
+ /* Determine NAT type.
+ * Check if the NAT type can be deduced from the tracked connection.
+ * Make sure expected traffic is NATted only when committing.
+ */
+ if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+ ct->status & IPS_NAT_MASK &&
+ (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (info->nat & OVS_CT_SRC_NAT) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (info->nat & OVS_CT_DST_NAT) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT; /* Connection is not NATed. */
+ }
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+ /* Mark NAT done if successful and update the flow key. */
+ if (err == NF_ACCEPT)
+ ovs_nat_update_key(key, skb, maniptype);
+
+ return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ return NF_ACCEPT;
+}
+#endif
+
/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
* not done already. Update key with new CT state after passing the packet
* through conntrack.
@@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
if (err != NF_ACCEPT)
return -ENOENT;
- ovs_ct_update_key(skb, info, key, true);
+ /* Clear CT state NAT flags to mark that we have not yet done
+ * NAT after the nf_conntrack_in() call. We can actually clear
+ * the whole state, as it will be re-initialized below.
+ */
+ key->ct.state = 0;
+
+ /* Update the key, but keep the NAT flags. */
+ ovs_ct_update_key(skb, info, key, true, true);
}
- /* Call the helper only if:
- * - nf_conntrack_in() was executed above ("!cached") for a confirmed
- * connection, or
- * - When committing an unconfirmed connection.
- */
ct = nf_ct_get(skb, &ctinfo);
- if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
- ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
- WARN_ONCE(1, "helper rejected packet");
- return -EINVAL;
+ if (ct) {
+ /* Packets starting a new connection must be NATted before the
+ * helper, so that the helper knows about the NAT. We enforce
+ * this by delaying both NAT and helper calls for unconfirmed
+ * connections until the committing CT action. For later
+ * packets NAT and Helper may be called in either order.
+ *
+ * NAT will be done only if the CT action has NAT, and only
+ * once per packet (per zone), as guarded by the NAT bits in
+ * the key->ct.state.
+ */
+ if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+ (nf_ct_is_confirmed(ct) || info->commit) &&
+ ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+ return -EINVAL;
+ }
+
+ /* Call the helper only if:
+ * - nf_conntrack_in() was executed above ("!cached") for a
+ * confirmed connection, or
+ * - When committing an unconfirmed connection.
+ */
+ if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+ ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+ return -EINVAL;
+ }
}
return 0;
@@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
if (exp) {
u8 state;
+ /* NOTE: New connections are NATted and Helped only when
+ * committed, so we are not calling into NAT here.
+ */
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else {
- int err;
-
- err = __ovs_ct_lookup(net, key, info, skb);
- if (err)
- return err;
- }
+ } else
+ return __ovs_ct_lookup(net, key, info, skb);
return 0;
}
@@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
return 0;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+ struct ovs_conntrack_info *info, bool log)
+{
+ struct nlattr *a;
+ int rem;
+ bool have_ip_max = false;
+ bool have_proto_max = false;
+ bool ip_vers = (info->family == NFPROTO_IPV6);
+
+ nla_for_each_nested(a, attr, rem) {
+ static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+ [OVS_NAT_ATTR_SRC] = {0, 0},
+ [OVS_NAT_ATTR_DST] = {0, 0},
+ [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+ sizeof(struct in6_addr)},
+ [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+ sizeof(struct in6_addr)},
+ [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+ [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+ [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+ [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+ [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+ };
+ int type = nla_type(a);
+
+ if (type > OVS_NAT_ATTR_MAX) {
+ OVS_NLERR(log,
+ "Unknown NAT attribute (type=%d, max=%d).\n",
+ type, OVS_NAT_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+ OVS_NLERR(log,
+ "NAT attribute type %d has unexpected length (%d != %d).\n",
+ type, nla_len(a),
+ ovs_nat_attr_lens[type][ip_vers]);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_NAT_ATTR_SRC:
+ case OVS_NAT_ATTR_DST:
+ if (info->nat) {
+ OVS_NLERR(log,
+ "Only one type of NAT may be specified.\n"
+ );
+ return -ERANGE;
+ }
+ info->nat |= OVS_CT_NAT;
+ info->nat |= ((type == OVS_NAT_ATTR_SRC)
+ ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+ break;
+
+ case OVS_NAT_ATTR_IP_MIN:
+ nla_memcpy(&info->range.min_addr, a, nla_len(a));
+ info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+ break;
+
+ case OVS_NAT_ATTR_IP_MAX:
+ have_ip_max = true;
+ nla_memcpy(&info->range.max_addr, a,
+ sizeof(info->range.max_addr));
+ info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_MIN:
+ info->range.min_proto.all = htons(nla_get_u16(a));
+ info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_MAX:
+ have_proto_max = true;
+ info->range.max_proto.all = htons(nla_get_u16(a));
+ info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ break;
+
+ case OVS_NAT_ATTR_PERSISTENT:
+ info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_HASH:
+ info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_RANDOM:
+ info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+ break;
+
+ default:
+ OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0) {
+ OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+ if (!info->nat) {
+ /* Do not allow flags if no type is given. */
+ if (info->range.flags) {
+ OVS_NLERR(log,
+ "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+ );
+ return -EINVAL;
+ }
+ info->nat = OVS_CT_NAT; /* NAT existing connections. */
+ } else if (!info->commit) {
+ OVS_NLERR(log,
+ "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+ );
+ return -EINVAL;
+ }
+ /* Allow missing IP_MAX. */
+ if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+ memcpy(&info->range.max_addr, &info->range.min_addr,
+ sizeof(info->range.max_addr));
+ }
+ /* Allow missing PROTO_MAX. */
+ if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+ !have_proto_max) {
+ info->range.max_proto.all = info->range.min_proto.all;
+ }
+ return 0;
+}
+#endif
+
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
[OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
@@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
.maxlen = sizeof(struct md_labels) },
[OVS_CT_ATTR_HELPER] = { .minlen = 1,
- .maxlen = NF_CT_HELPER_NAME_LEN }
+ .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+ /* NAT length is checked when parsing the nested attributes. */
+ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
+#endif
};
static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
return -EINVAL;
}
break;
+#ifdef CONFIG_NF_NAT_NEEDED
+ case OVS_CT_ATTR_NAT: {
+ int err = parse_nat(a, info, log);
+
+ if (err)
+ return err;
+ break;
+ }
+#endif
default:
OVS_NLERR(log, "Unknown conntrack attr (%d)",
type);
@@ -816,6 +1217,74 @@ err_free_ct:
return err;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+ if (!start)
+ return false;
+
+ if (info->nat & OVS_CT_SRC_NAT) {
+ if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+ return false;
+ } else if (info->nat & OVS_CT_DST_NAT) {
+ if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+ return false;
+ } else {
+ goto out;
+ }
+
+ if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+ if (info->family == NFPROTO_IPV4) {
+ if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+ info->range.min_addr.ip) ||
+ (info->range.max_addr.ip
+ != info->range.min_addr.ip &&
+ (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+ info->range.max_addr.ip))))
+ return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+ } else if (info->family == NFPROTO_IPV6) {
+ if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+ &info->range.min_addr.in6) ||
+ (memcmp(&info->range.max_addr.in6,
+ &info->range.min_addr.in6,
+ sizeof(info->range.max_addr.in6)) &&
+ (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+ &info->range.max_addr.in6))))
+ return false;
+#endif
+ } else {
+ return false;
+ }
+ }
+ if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+ (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+ ntohs(info->range.min_proto.all)) ||
+ (info->range.max_proto.all != info->range.min_proto.all &&
+ nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+ ntohs(info->range.max_proto.all)))))
+ return false;
+
+ if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+ return false;
+ if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+ return false;
+ if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+ return false;
+out:
+ nla_nest_end(skb, start);
+
+ return true;
+}
+#endif
+
int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
struct sk_buff *skb)
{
@@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
ct_info->helper->name))
return -EMSGSIZE;
}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+ return -EMSGSIZE;
+#endif
nla_nest_end(skb, start);
return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f4..8f6230b 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
- OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+ OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+ OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
#else
#include <linux/errno.h>
--
2.1.4
^ permalink raw reply related [flat|nested] 14+ messages in thread