* [PATCH 07/18] netfilter: ctnetlink: remove dead NAT code
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
The NAT range to nlattr conversion callbacks and helpers are entirely
dead code and are also useless since there are no NAT ranges in conntrack
context; ranges are only used for initially selecting a tuple. The final NAT
information is contained in the selected tuples of the conntrack entry.
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/net/netfilter/nf_nat_protocol.h | 5 -----
net/ipv4/netfilter/nf_nat_proto_common.c | 14 +-------------
net/ipv4/netfilter/nf_nat_proto_dccp.c | 1 -
net/ipv4/netfilter/nf_nat_proto_gre.c | 1 -
net/ipv4/netfilter/nf_nat_proto_icmp.c | 1 -
net/ipv4/netfilter/nf_nat_proto_sctp.c | 1 -
net/ipv4/netfilter/nf_nat_proto_tcp.c | 1 -
net/ipv4/netfilter/nf_nat_proto_udp.c | 1 -
net/ipv4/netfilter/nf_nat_proto_udplite.c | 1 -
9 files changed, 1 insertions(+), 25 deletions(-)
diff --git a/include/net/netfilter/nf_nat_protocol.h b/include/net/netfilter/nf_nat_protocol.h
index eaad0ac..7b0b511 100644
--- a/include/net/netfilter/nf_nat_protocol.h
+++ b/include/net/netfilter/nf_nat_protocol.h
@@ -32,9 +32,6 @@ struct nf_nat_protocol {
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct);
- int (*range_to_nlattr)(struct sk_buff *skb,
- const struct nf_nat_ipv4_range *range);
-
int (*nlattr_to_range)(struct nlattr *tb[],
struct nf_nat_ipv4_range *range);
};
@@ -64,8 +61,6 @@ extern void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conn *ct,
u_int16_t *rover);
-extern int nf_nat_proto_range_to_nlattr(struct sk_buff *skb,
- const struct nf_nat_ipv4_range *range);
extern int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
struct nf_nat_ipv4_range *range);
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 47fff91..9993bc9 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -96,18 +96,6 @@ void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
-int nf_nat_proto_range_to_nlattr(struct sk_buff *skb,
- const struct nf_nat_ipv4_range *range)
-{
- NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all);
- NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all);
- return 0;
-
-nla_put_failure:
- return -1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
-
int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
struct nf_nat_ipv4_range *range)
{
@@ -122,5 +110,5 @@ int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
}
return 0;
}
-EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr);
+EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 466d63d..3f67138 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -84,7 +84,6 @@ static const struct nf_nat_protocol nf_nat_protocol_dccp = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = dccp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 35cd158..46ba0b9 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -123,7 +123,6 @@ static const struct nf_nat_protocol gre = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = gre_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 036c009..b351728 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -78,7 +78,6 @@ const struct nf_nat_protocol nf_nat_protocol_icmp = {
.in_range = icmp_in_range,
.unique_tuple = icmp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 50283ab..3cce9b6 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -74,7 +74,6 @@ static const struct nf_nat_protocol nf_nat_protocol_sctp = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = sctp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index e0e2ba8..9fb4b4e 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -86,7 +86,6 @@ const struct nf_nat_protocol nf_nat_protocol_tcp = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = tcp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index bde94cd..9883336 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -77,7 +77,6 @@ const struct nf_nat_protocol nf_nat_protocol_udp = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = udp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 58e9a3a..d24d10a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -76,7 +76,6 @@ static const struct nf_nat_protocol nf_nat_protocol_udplite = {
.in_range = nf_nat_proto_in_range,
.unique_tuple = udplite_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nlattr = nf_nat_proto_range_to_nlattr,
.nlattr_to_range = nf_nat_proto_nlattr_to_range,
#endif
};
--
1.7.1
^ permalink raw reply related
* [PATCH 05/18] netfilter: nf_nat: remove obsolete code from nf_nat_icmp_reply_translation()
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
The inner tuple that is extracted from the packet is unused. The code also
doesn't have any useful side-effects, such as verifying that the packet
contains enough data to extract the inner tuple, since conntrack already does
the same, so remove it.
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/nf_nat_core.c | 14 +-------------
1 files changed, 1 insertions(+), 13 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 5e1bd85..acdd002 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -30,7 +30,6 @@
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
static DEFINE_SPINLOCK(nf_nat_lock);
@@ -414,8 +413,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
struct icmphdr icmp;
struct iphdr ip;
} *inside;
- const struct nf_conntrack_l4proto *l4proto;
- struct nf_conntrack_tuple inner, target;
+ struct nf_conntrack_tuple target;
int hdrlen = ip_hdrlen(skb);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned long statusbit;
@@ -463,16 +461,6 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
"dir %s\n", skb, manip,
dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
- /* rcu_read_lock()ed by nf_hook_slow */
- l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
-
- if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
- (hdrlen +
- sizeof(struct icmphdr) + inside->ip.ihl * 4),
- (u_int16_t)AF_INET, inside->ip.protocol,
- &inner, l3proto, l4proto))
- return 0;
-
/* Change inner back to look like incoming packet. We do the
opposite manip on this hook to normal, because it might not
pass all hooks (locally-generated ICMP). Consider incoming
--
1.7.1
^ permalink raw reply related
* [PATCH 08/18] netfilter: conntrack: restrict NAT helper invocation to IPv4
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
The NAT helpers currently only handle IPv4 packets correctly. Restrict
invocation of the helpers to IPv4 in preparation for IPv6 NAT.
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/nf_conntrack_amanda.c | 3 +-
net/netfilter/nf_conntrack_ftp.c | 3 +-
net/netfilter/nf_conntrack_h323_main.c | 41 ++++++++++++++++++++++---------
net/netfilter/nf_conntrack_irc.c | 3 +-
net/netfilter/nf_conntrack_sip.c | 18 +++++++++----
net/netfilter/nf_conntrack_tftp.c | 3 +-
6 files changed, 49 insertions(+), 22 deletions(-)
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index 13fd2c5..49880c8 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -155,7 +155,8 @@ static int amanda_help(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
- if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
+ if (nf_nat_amanda && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_amanda(skb, ctinfo, off - dataoff,
len, exp);
else if (nf_ct_expect_related(exp) != 0)
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 6f5801e..9e738eb 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -487,7 +487,8 @@ static int help(struct sk_buff *skb,
/* Now, NAT might want to mangle the packet, and register the
* (possibly changed) expectation itself. */
nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
- if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
+ if (nf_nat_ftp && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
matchoff, matchlen, exp);
else {
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index f03c2d4..200c7a4 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -296,6 +296,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
@@ -354,6 +355,7 @@ static int expect_t120(struct sk_buff *skb,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr,
@@ -689,6 +691,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr,
@@ -813,6 +816,7 @@ static int expect_callforwarding(struct sk_buff *skb,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
(nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff,
@@ -854,7 +858,8 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
set_h225_addr = rcu_dereference(set_h225_addr_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {
@@ -870,7 +875,8 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+ (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {
@@ -1280,7 +1286,8 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */
+ if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) { /* Need NAT */
ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp) == 0) {
@@ -1308,7 +1315,8 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: GRQ\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) /* NATed */
return set_ras_addr(skb, ct, ctinfo, data,
&grq->rasAddress, 1);
return 0;
@@ -1376,7 +1384,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
return -1;
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) {
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
ret = set_ras_addr(skb, ct, ctinfo, data,
rrq->rasAddress.item,
rrq->rasAddress.count);
@@ -1407,7 +1416,8 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: RCF\n");
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
ret = set_sig_addr(skb, ct, ctinfo, data,
rcf->callSignalAddress.item,
rcf->callSignalAddress.count);
@@ -1455,7 +1465,8 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: URQ\n");
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
ret = set_sig_addr(skb, ct, ctinfo, data,
urq->callSignalAddress.item,
urq->callSignalAddress.count);
@@ -1493,6 +1504,7 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
set_h225_addr && ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
return set_h225_addr(skb, data, 0,
@@ -1505,7 +1517,8 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
return set_h225_addr(skb, data, 0,
&arq->srcCallSignalAddress,
@@ -1537,7 +1550,8 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
/* Answering ACF */
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK)
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
return set_sig_addr(skb, ct, ctinfo, data,
&acf->destCallSignalAddress, 1);
return 0;
@@ -1573,7 +1587,8 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: LRQ\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK)
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
return set_ras_addr(skb, ct, ctinfo, data,
&lrq->replyAddress, 1);
return 0;
@@ -1630,7 +1645,8 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_ras: IRR\n");
set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && ct->status & IPS_NAT_MASK) {
+ if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
ret = set_ras_addr(skb, ct, ctinfo, data,
&irr->rasAddress, 1);
if (ret < 0)
@@ -1638,7 +1654,8 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
}
set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && ct->status & IPS_NAT_MASK) {
+ if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
ret = set_sig_addr(skb, ct, ctinfo, data,
irr->callSignalAddress.item,
irr->callSignalAddress.count);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 4f9390b..17ad9a4 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -206,7 +206,8 @@ static int help(struct sk_buff *skb, unsigned int protoff,
IPPROTO_TCP, NULL, &port);
nf_nat_irc = rcu_dereference(nf_nat_irc_hook);
- if (nf_nat_irc && ct->status & IPS_NAT_MASK)
+ if (nf_nat_irc && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_irc(skb, ctinfo,
addr_beg_p - ib_ptr,
addr_end_p - addr_beg_p,
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 93faf6a..6294f02 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -928,7 +928,8 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
IPPROTO_UDP, NULL, &rtcp_port);
nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook);
- if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp)
+ if (nf_nat_sdp_media && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK && !direct_rtp)
ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen,
rtp_exp, rtcp_exp,
mediaoff, medialen, daddr);
@@ -1051,7 +1052,8 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
return ret;
/* Update media connection address if present */
- if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) {
+ if (maddr_len && nf_nat_sdp_addr &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 && ct->status & IPS_NAT_MASK) {
ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen,
mediaoff, c_hdr, SDP_HDR_MEDIA,
&rtp_addr);
@@ -1063,7 +1065,8 @@ static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
/* Update session connection and owner addresses */
nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook);
- if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK)
+ if (nf_nat_sdp_session && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff,
&rtp_addr);
@@ -1222,7 +1225,8 @@ static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook);
- if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK)
+ if (nf_nat_sip_expect && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp,
matchoff, matchlen);
else {
@@ -1400,7 +1404,8 @@ static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,
else
ret = process_sip_response(skb, dataoff, dptr, datalen);
- if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ if (ret == NF_ACCEPT && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
nf_nat_sip = rcu_dereference(nf_nat_sip_hook);
if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen))
ret = NF_DROP;
@@ -1481,7 +1486,8 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
datalen = datalen + diff - msglen;
}
- if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+ if (ret == NF_ACCEPT && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK) {
nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook);
if (nf_nat_sip_seq_adjust)
nf_nat_sip_seq_adjust(skb, tdiff);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 75466fd..abc2a5e 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -72,7 +72,8 @@ static int tftp_help(struct sk_buff *skb,
nf_ct_dump_tuple(&exp->tuple);
nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
- if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
+ if (nf_nat_tftp && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ ct->status & IPS_NAT_MASK)
ret = nf_nat_tftp(skb, ctinfo, exp);
else if (nf_ct_expect_related(exp) != 0)
ret = NF_DROP;
--
1.7.1
^ permalink raw reply related
* [PATCH 11/18] netfilter: ipv6: expand skb head in ip6_route_me_harder after oif change
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Expand the skb headroom if the oif changed due to rerouting, similar to
how IPv4 packets are handled.
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv6/netfilter.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index db31561..429089c 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,6 +15,7 @@ int ip6_route_me_harder(struct sk_buff *skb)
{
struct net *net = dev_net(skb_dst(skb)->dev);
const struct ipv6hdr *iph = ipv6_hdr(skb);
+ unsigned int hh_len;
struct dst_entry *dst;
struct flowi6 fl6 = {
.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
@@ -47,6 +48,13 @@ int ip6_route_me_harder(struct sk_buff *skb)
}
#endif
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
+ 0, GFP_ATOMIC))
+ return -1;
+
return 0;
}
EXPORT_SYMBOL(ip6_route_me_harder);
--
1.7.1
^ permalink raw reply related
* [PATCH 12/18] net: core: add function for incremental IPv6 pseudo header checksum updates
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Add inet_proto_csum_replace16 for incrementally updating IPv6 pseudo header
checksums for IPv6 NAT.
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/net/checksum.h | 3 +++
net/core/utils.c | 20 ++++++++++++++++++++
2 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/include/net/checksum.h b/include/net/checksum.h
index ba55d8b..600d1d7 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -109,6 +109,9 @@ static inline void csum_replace2(__sum16 *sum, __be16 from, __be16 to)
struct sk_buff;
extern void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, int pseudohdr);
+extern void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr);
static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
__be16 from, __be16 to,
diff --git a/net/core/utils.c b/net/core/utils.c
index 386e263..b09a73a 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -298,6 +298,26 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
+ const __be32 *from, const __be32 *to,
+ int pseudohdr)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], ~from[2], ~from[3],
+ to[0], to[1], to[2], to[3],
+ };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ *sum = csum_fold(csum_partial(diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial(diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ *sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace16);
+
int mac_pton(const char *s, u8 *mac)
{
int i;
--
1.7.1
^ permalink raw reply related
* [PATCH 16/18] netfilter: ip6tables: add NETMAP target
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv6/netfilter/Kconfig | 11 ++++
net/ipv6/netfilter/Makefile | 1 +
net/ipv6/netfilter/ip6t_NETMAP.c | 94 ++++++++++++++++++++++++++++++++++++++
3 files changed, 106 insertions(+), 0 deletions(-)
create mode 100644 net/ipv6/netfilter/ip6t_NETMAP.c
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 54c2e78..b14becc 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -50,6 +50,17 @@ config IP6_NF_TARGET_MASQUERADE
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_NETMAP
+ tristate "NETMAP target support"
+ depends on NF_NAT_IPV6
+ depends on NETFILTER_ADVANCED
+ help
+ NETMAP is an implementation of static 1:1 NAT mapping of network
+ addresses. It maps the network address part, while keeping the host
+ address part intact.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_TARGET_REDIRECT
tristate "REDIRECT target support"
depends on NF_NAT_IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 902c59b..107ab83 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
# targets
obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
+obj-$(CONFIG_IP6_NF_TARGET_NETMAP) += ip6t_NETMAP.o
obj-$(CONFIG_IP6_NF_TARGET_REDIRECT) += ip6t_REDIRECT.o
obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
diff --git a/net/ipv6/netfilter/ip6t_NETMAP.c b/net/ipv6/netfilter/ip6t_NETMAP.c
new file mode 100644
index 0000000..4f3bf36
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NETMAP.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Svenning Soerensen's IPv4 NETMAP target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+
+static unsigned int
+netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ struct nf_nat_range newrange;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ union nf_inet_addr new_addr, netmask;
+ unsigned int i;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++)
+ netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
+ range->max_addr.ip6[i]);
+
+ if (par->hooknum == NF_INET_PRE_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_OUT)
+ new_addr.in6 = ipv6_hdr(skb)->daddr;
+ else
+ new_addr.in6 = ipv6_hdr(skb)->saddr;
+
+ for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) {
+ new_addr.ip6[i] &= ~netmask.ip6[i];
+ new_addr.ip6[i] |= range->min_addr.ip6[i] &
+ netmask.ip6[i];
+ }
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = new_addr;
+ newrange.max_addr = new_addr;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+}
+
+static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
+ return -EINVAL;
+ return 0;
+}
+
+static struct xt_target netmap_tg6_reg __read_mostly = {
+ .name = "NETMAP",
+ .family = NFPROTO_IPV6,
+ .target = netmap_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .checkentry = netmap_tg6_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init netmap_tg6_init(void)
+{
+ return xt_register_target(&netmap_tg6_reg);
+}
+
+static void netmap_tg6_exit(void)
+{
+ xt_unregister_target(&netmap_tg6_reg);
+}
+
+module_init(netmap_tg6_init);
+module_exit(netmap_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv6 subnets");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
--
1.7.1
^ permalink raw reply related
* [PATCH 13/18] netfilter: ipv6: add IPv6 NAT support
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/netfilter/nfnetlink_conntrack.h | 2 +
include/net/netfilter/nf_nat_l3proto.h | 5 +
include/net/netfilter/nf_nat_l4proto.h | 1 +
include/net/netns/ipv6.h | 1 +
net/core/secure_seq.c | 1 +
net/ipv6/netfilter/Kconfig | 12 +
net/ipv6/netfilter/Makefile | 4 +
net/ipv6/netfilter/ip6table_nat.c | 319 ++++++++++++++++++++++++
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 14 +
net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 285 +++++++++++++++++++++
net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 87 +++++++
net/netfilter/nf_nat_core.c | 2 +
net/netfilter/xt_nat.c | 2 +
13 files changed, 735 insertions(+), 0 deletions(-)
create mode 100644 net/ipv6/netfilter/ip6table_nat.c
create mode 100644 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
create mode 100644 net/ipv6/netfilter/nf_nat_proto_icmpv6.c
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 5f0bed8..2c5c304 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -143,6 +143,8 @@ enum ctattr_nat {
CTA_NAT_V4_MAXIP,
#define CTA_NAT_MAXIP CTA_NAT_V4_MAXIP
CTA_NAT_PROTO,
+ CTA_NAT_V6_MINIP,
+ CTA_NAT_V6_MAXIP,
__CTA_NAT_MAX
};
#define CTA_NAT_MAX (__CTA_NAT_MAX - 1)
diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h
index 3df4a67..57d9e27 100644
--- a/include/net/netfilter/nf_nat_l3proto.h
+++ b/include/net/netfilter/nf_nat_l3proto.h
@@ -43,5 +43,10 @@ extern int nf_nat_icmp_reply_translation(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum);
+extern int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ unsigned int hdrlen);
#endif /* _NF_NAT_L3PROTO_H */
diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 1f0a4f0..24feb68 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -51,6 +51,7 @@ extern const struct nf_nat_l4proto *__nf_nat_l4proto_find(u8 l3proto, u8 l4proto
extern const struct nf_nat_l4proto nf_nat_l4proto_tcp;
extern const struct nf_nat_l4proto nf_nat_l4proto_udp;
extern const struct nf_nat_l4proto nf_nat_l4proto_icmp;
+extern const struct nf_nat_l4proto nf_nat_l4proto_icmpv6;
extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
extern bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 81abfcb..049d3a9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -39,6 +39,7 @@ struct netns_ipv6 {
#ifdef CONFIG_SECURITY
struct xt_table *ip6table_security;
#endif
+ struct xt_table *ip6table_nat;
#endif
struct rt6_info *ip6_null_entry;
struct rt6_statistics *rt6_stats;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 025233d..04d93ad 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -74,6 +74,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
return hash[0];
}
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4484648..ef15839 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -25,6 +25,18 @@ config NF_CONNTRACK_IPV6
To compile it as a module, choose M here. If unsure, say N.
+config NF_NAT_IPV6
+ tristate "IPv6 NAT"
+ depends on NF_CONNTRACK_IPV6
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ help
+ The IPv6 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. It is controlled by
+ the `nat' table in ip6tables, see the man page for ip6tables(8).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_QUEUE
tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
depends on INET && IPV6 && NETFILTER
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index abfee91..c709dae 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
+obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o
# objects for l3 independent conntrack
nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
@@ -16,6 +17,9 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
# l3 independent conntrack
obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
+nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
+
# defrag
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
new file mode 100644
index 0000000..d05c437
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT
+ * funded by Astaro.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+
+static const struct xt_table nf_nat_ipv6_table = {
+ .name = "nat",
+ .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ .af = NFPROTO_IPV6,
+};
+
+static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+ /* Null binding: no address is forced; let proto decide mapping for
+ * per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
+ */
+ struct nf_nat_range range;
+
+ range.flags = 0;
+ pr_debug("Allocating NULL binding for %p (%pI6)\n", ct,
+ HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip6 :
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip6);
+
+ return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+ unsigned int ret;
+
+ ret = ip6t_do_table(skb, hooknum, in, out, net->ipv6.ip6table_nat);
+ if (ret == NF_ACCEPT) {
+ if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+ ret = alloc_null_binding(ct, hooknum);
+ }
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_fn(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+ int hdrlen;
+ u8 nexthdr;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibility to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ /* Don't try to NAT if this packet is not conntracked */
+ if (nf_ct_is_untracked(ct))
+ return NF_ACCEPT;
+
+ nat = nfct_nat(ct);
+ if (!nat) {
+ /* NAT module was loaded late. */
+ if (nf_ct_is_confirmed(ct))
+ return NF_ACCEPT;
+ nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+ if (nat == NULL) {
+ pr_debug("failed to add NAT extension\n");
+ return NF_ACCEPT;
+ }
+ }
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+ hooknum, hdrlen))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ unsigned int ret;
+
+ ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+ if (ret != NF_ACCEPT)
+ return ret;
+ } else
+ pr_debug("Already setup manip %s for ct %p\n",
+ maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ break;
+
+ default:
+ /* ESTABLISHED */
+ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+ ctinfo == IP_CT_ESTABLISHED_REPLY);
+ }
+
+ return nf_nat_packet(ct, ctinfo, hooknum, skb);
+}
+
+static unsigned int
+nf_nat_ipv6_in(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int ret;
+ struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+ skb_dst_drop(skb);
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_out(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+#endif
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct ipv6hdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3) ||
+ (ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all))
+ if (nf_xfrm_me_harder(skb, AF_INET6) < 0)
+ ret = NF_DROP;
+ }
+#endif
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_local_fn(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+
+ /* root is playing with raw sockets. */
+ if (skb->len < sizeof(struct ipv6hdr))
+ return NF_ACCEPT;
+
+ ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN &&
+ (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+ &ct->tuplehash[!dir].tuple.src.u3)) {
+ if (ip6_route_me_harder(skb))
+ ret = NF_DROP;
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all)
+ if (nf_xfrm_me_harder(skb, AF_INET6))
+ ret = NF_DROP;
+#endif
+ }
+ return ret;
+}
+
+static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_in,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_out,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_local_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_fn,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+static int __net_init ip6table_nat_net_init(struct net *net)
+{
+ struct ip6t_replace *repl;
+
+ repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+ kfree(repl);
+ if (IS_ERR(net->ipv6.ip6table_nat))
+ return PTR_ERR(net->ipv6.ip6table_nat);
+ return 0;
+}
+
+static void __net_exit ip6table_nat_net_exit(struct net *net)
+{
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+}
+
+static struct pernet_operations ip6table_nat_net_ops = {
+ .init = ip6table_nat_net_init,
+ .exit = ip6table_nat_net_exit,
+};
+
+static int __init ip6table_nat_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&ip6table_nat_net_ops);
+ if (err < 0)
+ goto err1;
+
+ err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+ if (err < 0)
+ goto err2;
+ return 0;
+
+err2:
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+err1:
+ return err;
+}
+
+static void __exit ip6table_nat_exit(void)
+{
+ nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+}
+
+module_init(ip6table_nat_init);
+module_exit(ip6table_nat_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 4111050..e6cc7ac 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -28,6 +28,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_log.h>
@@ -184,6 +185,19 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
"nf_ct_%s: dropping packet", helper->name);
return ret;
}
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ typeof(nf_nat_seq_adjust_hook) seq_adjust;
+
+ seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
+ if (!seq_adjust ||
+ !seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
out:
/* We've seen it coming out the other side: confirm it */
return nf_conntrack_confirm(skb);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
new file mode 100644
index 0000000..3603af5
--- /dev/null
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Development of IPv6 NAT funded by Astaro.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/secure_seq.h>
+#include <net/checksum.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
+
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi6 *fl6 = &fl->u.ip6;
+
+ if (ct->status & statusbit) {
+ fl6->daddr = t->dst.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl6->saddr = t->src.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_DCCP ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_sport = t->src.u.all;
+ }
+}
+#endif
+
+static int nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
+ const struct nf_nat_range *range)
+{
+ return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
+ ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
+}
+
+static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t,
+ __be16 dport)
+{
+ return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport);
+}
+
+static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_nat_l4proto *l4proto,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct ipv6hdr *ipv6h;
+ int hdroff;
+ u8 nexthdr;
+
+ if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h)))
+ return false;
+
+ ipv6h = (void *)skb->data + iphdroff;
+ nexthdr = ipv6h->nexthdr;
+ hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr);
+ if (hdroff < 0)
+ return false; // FIXME: indicate success?
+
+ if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff,
+ target, maniptype))
+ return false;
+
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ipv6_addr_copy(&ipv6h->saddr, &target->src.u3.in6);
+ else
+ ipv6_addr_copy(&ipv6h->daddr, &target->dst.u3.in6);
+
+ return true;
+}
+
+static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff);
+ const struct in6_addr *oldip, *newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = &ipv6h->saddr;
+ newip = &t->src.u3.in6;
+ } else {
+ oldip = &ipv6h->daddr;
+ newip = &t->dst.u3.in6;
+ }
+ inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
+ newip->s6_addr32, 1);
+}
+
+static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt6i_flags & RTF_LOCAL) && // FIXME
+ (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ (data - (void *)skb->data);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto, 0);
+ } else {
+ *check = 0;
+ *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto,
+ csum_partial(data, datalen,
+ 0));
+ if (proto == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
+static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range *range)
+{
+ if (tb[CTA_NAT_V6_MINIP]) {
+ nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
+ sizeof(struct in6_addr));
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V6_MAXIP])
+ nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
+ sizeof(struct in6_addr));
+ else
+ range->max_addr = range->min_addr;
+
+ return 0;
+}
+
+static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
+ .l3proto = NFPROTO_IPV6,
+ .secure_port = nf_nat_ipv6_secure_port,
+ .in_range = nf_nat_ipv6_in_range,
+ .manip_pkt = nf_nat_ipv6_manip_pkt,
+ .csum_update = nf_nat_ipv6_csum_update,
+ .csum_recalc = nf_nat_ipv6_csum_recalc,
+ .nlattr_to_range = nf_nat_ipv6_nlattr_to_range,
+#ifdef CONFIG_XFRM
+ .decode_session = nf_nat_ipv6_decode_session,
+#endif
+};
+
+int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ unsigned int hdrlen)
+{
+ struct {
+ struct icmp6hdr icmp6;
+ struct ipv6hdr ip6;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ const struct nf_nat_l4proto *l4proto;
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+
+ if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp6.icmp6_type == NDISC_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr);
+ if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
+ l4proto, &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp6.icmp6_cksum = 0;
+ inside->icmp6.icmp6_cksum =
+ csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ skb->len - hdrlen, IPPROTO_ICMPV6,
+ csum_partial(&inside->icmp6,
+ skb->len - hdrlen, 0));
+ }
+
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6);
+ if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
+
+static int __init nf_nat_l3proto_ipv6_init(void)
+{
+ int err;
+
+ err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+ if (err < 0)
+ goto err1;
+ err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6);
+ if (err < 0)
+ goto err2;
+ return err;
+
+err2:
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+err1:
+ return err;
+}
+
+static void __exit nf_nat_l3proto_ipv6_exit(void)
+{
+ nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6);
+ nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-" __stringify(AF_INET6));
+
+module_init(nf_nat_l3proto_ipv6_init);
+module_exit(nf_nat_l3proto_ipv6_exit);
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
new file mode 100644
index 0000000..c83549b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/icmpv6.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/netfilter/nf_nat_l4proto.h>
+
+static bool
+icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype,
+ const union nf_conntrack_man_proto *min,
+ const union nf_conntrack_man_proto *max)
+{
+ return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
+ ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
+}
+
+static void
+icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype,
+ const struct nf_conn *ct)
+{
+ static u16 id;
+ unsigned int range_size;
+ unsigned int i;
+
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xffff;
+
+ for (i = 0; ; ++id) {
+ tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
+ (id % range_size));
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
+ }
+}
+
+static bool
+icmpv6_manip_pkt(struct sk_buff *skb,
+ const struct nf_nat_l3proto *l3proto,
+ unsigned int iphdroff, unsigned int hdroff,
+ const struct nf_conntrack_tuple *tuple,
+ enum nf_nat_manip_type maniptype)
+{
+ struct icmp6hdr *hdr;
+
+ if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ return false;
+
+ hdr = (struct icmp6hdr *)(skb->data + hdroff);
+ l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
+ tuple, maniptype);
+ inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
+ hdr->icmp6_identifier,
+ tuple->src.u.icmp.id, 0);
+ hdr->icmp6_identifier = tuple->src.u.icmp.id;
+ return true;
+}
+
+const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = {
+ .l4proto = IPPROTO_ICMPV6,
+ .manip_pkt = icmpv6_manip_pkt,
+ .in_range = icmpv6_in_range,
+ .unique_tuple = icmpv6_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+ .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
+#endif
+};
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 0cdad5c..18bc701 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -595,6 +595,8 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr,
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
[CTA_NAT_V4_MINIP] = { .type = NLA_U32 },
[CTA_NAT_V4_MAXIP] = { .type = NLA_U32 },
+ [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) },
+ [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) },
[CTA_NAT_PROTO] = { .type = NLA_NESTED },
};
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index 8b94bb8..3a01caf 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -180,4 +180,6 @@ module_exit(xt_nat_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_SNAT");
+MODULE_ALIAS("ip6t_SNAT");
MODULE_ALIAS("ipt_DNAT");
+MODULE_ALIAS("ip6t_DNAT");
--
1.7.1
^ permalink raw reply related
* [PATCH 14/18] netfilter: ip6tables: add MASQUERADE target
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/net/addrconf.h | 2 +-
net/ipv4/netfilter/ipt_MASQUERADE.c | 3 +-
net/ipv6/addrconf.c | 2 +-
net/ipv6/netfilter/Kconfig | 13 +++
net/ipv6/netfilter/Makefile | 1 +
net/ipv6/netfilter/ip6t_MASQUERADE.c | 135 ++++++++++++++++++++++++++++++++++
6 files changed, 153 insertions(+), 3 deletions(-)
create mode 100644 net/ipv6/netfilter/ip6t_MASQUERADE.c
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index cbc6bb0..291be6c 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -77,7 +77,7 @@ extern struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
int strict);
extern int ipv6_dev_get_saddr(struct net *net,
- struct net_device *dev,
+ const struct net_device *dev,
const struct in6_addr *daddr,
unsigned int srcprefs,
struct in6_addr *saddr);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 1b96b50..9ab96d9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -98,7 +98,8 @@ device_cmp(struct nf_conn *i, void *ifindex)
if (!nat)
return 0;
-
+ if (nf_ct_l3num(i) != NFPROTO_IPV4)
+ return 0;
return nat->masq_index == (int)(long)ifindex;
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cf88df8..619af79 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1107,7 +1107,7 @@ out:
return ret;
}
-int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
+int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
const struct in6_addr *daddr, unsigned int prefs,
struct in6_addr *saddr)
{
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ef15839..7215c7f 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -37,6 +37,19 @@ config NF_NAT_IPV6
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ depends on NF_NAT_IPV6
+ default m if NETFILTER_ADVANCED=n
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_QUEUE
tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
depends on INET && IPV6 && NETFILTER
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c709dae..4a04c4b 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -35,4 +35,5 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
# targets
obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
+obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
new file mode 100644
index 0000000..60e9053
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on Rusty Russell's IPv4 MASQUERADE target. Development of IPv6
+ * NAT funded by Astaro.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+
+static unsigned int
+masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+ enum ip_conntrack_info ctinfo;
+ struct in6_addr src;
+ struct nf_conn *ct;
+ struct nf_nat_range newrange;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY));
+
+ if (ipv6_dev_get_saddr(dev_net(par->out), par->out,
+ &ipv6_hdr(skb)->daddr, 0, &src) < 0)
+ return NF_DROP;
+
+ nfct_nat(ct)->masq_index = par->out->ifindex;
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.in6 = src;
+ newrange.max_addr.in6 = src;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+
+static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+ return 0;
+}
+
+static int device_cmp(struct nf_conn *ct, void *ifindex)
+{
+ const struct nf_conn_nat *nat = nfct_nat(ct);
+
+ if (!nat)
+ return 0;
+ if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+ return 0;
+ return nat->masq_index == (int)(long)ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ const struct net_device *dev = ptr;
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN)
+ nf_ct_iterate_cleanup(net, device_cmp,
+ (void *)(long)dev->ifindex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = ptr;
+
+ return masq_device_event(this, event, ifa->idev->dev);
+}
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+static struct xt_target masquerade_tg6_reg __read_mostly = {
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV6,
+ .checkentry = masquerade_tg6_checkentry,
+ .target = masquerade_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .me = THIS_MODULE,
+};
+
+static int __init masquerade_tg6_init(void)
+{
+ int err;
+
+ err = xt_register_target(&masquerade_tg6_reg);
+ if (err == 0) {
+ register_netdevice_notifier(&masq_dev_notifier);
+ register_inet6addr_notifier(&masq_inet_notifier);
+ }
+
+ return err;
+}
+static void __exit masquerade_tg6_exit(void)
+{
+ unregister_inet6addr_notifier(&masq_inet_notifier);
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ xt_unregister_target(&masquerade_tg6_reg);
+}
+
+module_init(masquerade_tg6_init);
+module_exit(masquerade_tg6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: automatic address SNAT");
--
1.7.1
^ permalink raw reply related
* [PATCH 18/18] netfilter: nf_nat: support IPv6 in amanda NAT helper
From: kaber @ 2011-11-24 16:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber, ulrich.weber
In-Reply-To: <1322153850-10533-1-git-send-email-kaber@trash.net>
From: Patrick McHardy <kaber@trash.net>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/Kconfig | 5 -----
net/ipv4/netfilter/Makefile | 1 -
net/netfilter/Kconfig | 5 +++++
net/netfilter/Makefile | 1 +
net/netfilter/nf_conntrack_amanda.c | 3 +--
net/{ipv4 => }/netfilter/nf_nat_amanda.c | 0
6 files changed, 7 insertions(+), 8 deletions(-)
rename net/{ipv4 => }/netfilter/nf_nat_amanda.c (100%)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1b043a5..1c6917a 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -229,11 +229,6 @@ config NF_NAT_TFTP
depends on NF_CONNTRACK && NF_NAT_IPV4
default NF_NAT_IPV4 && NF_CONNTRACK_TFTP
-config NF_NAT_AMANDA
- tristate
- depends on NF_CONNTRACK && NF_NAT_IPV4
- default NF_NAT_IPV4 && NF_CONNTRACK_AMANDA
-
config NF_NAT_PPTP
tristate
depends on NF_CONNTRACK && NF_NAT_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0fa8715..b365cf2 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
# NAT helpers (nf_conntrack)
-obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1782da9..353b160 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -321,6 +321,11 @@ config NF_NAT_PROTO_SCTP
depends on NF_NAT && NF_CT_PROTO_SCTP
select LIBCRC32C
+config NF_NAT_AMANDA
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default NF_NAT && NF_CONNTRACK_AMANDA
+
config NF_NAT_FTP
tristate
depends on NF_CONNTRACK && NF_NAT
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a747580..a28e44d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
# NAT helpers
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
# transparent proxy support
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index dab3913..909c54d 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -156,8 +156,7 @@ static int amanda_help(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
- if (nf_nat_amanda && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- ct->status & IPS_NAT_MASK)
+ if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
ret = nf_nat_amanda(skb, ctinfo, protoff,
off - dataoff, len, exp);
else if (nf_ct_expect_related(exp) != 0)
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
similarity index 100%
rename from net/ipv4/netfilter/nf_nat_amanda.c
rename to net/netfilter/nf_nat_amanda.c
--
1.7.1
^ permalink raw reply related
* [PATCH net-next 2/2] netem: add cell concept to simulate special MAC behavior
From: Hagen Paul Pfeifer @ 2011-11-24 17:39 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Hagen Paul Pfeifer
In-Reply-To: <1322156378-23257-1-git-send-email-hagen@jauu.net>
This extension can be used to simulate special link layer
characteristics. "Simulate" because the packet data is not modified; only
the basis of the delay calculation changes, so that a packet is delayed
according to its original size plus artificial cell information.
packet_overhead can be used to simulate a link layer header compression
scheme (e.g. set packet_overhead to -20) or with a positive
packet_overhead value an additional MAC header can be simulated. It is
also possible to "replace" the 14 byte Ethernet header with something
else.
cell_size and cell_overhead can be used to simulate cell-based link layer
schemes, such as some TDMA schemes. Another application area is MAC
schemes that use link layer fragmentation with a (small) header per fragment.
Cell size is the maximum amount of data bytes within one cell. Cell
overhead is an additional variable to change the per-cell-overhead (e.g.
5 byte header per fragment).
Example (5 kbit/s, 20 byte per packet overhead, cellsize 100 byte, per
cell overhead 5 byte):
tc qdisc add dev eth0 root netem ratelatency 5kbit 20 100 5
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
---
include/linux/pkt_sched.h | 3 +++
net/sched/sch_netem.c | 30 +++++++++++++++++++++++++++---
2 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index cf826d3..5ad3858 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -498,6 +498,9 @@ struct tc_netem_corrupt {
struct tc_netem_ratelatency {
__u32 ratelatency; /* byte/s */
+ __s32 packet_overhead;
+ __u32 cell_size;
+ __s32 cell_overhead;
};
enum {
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 3ae1cdd..40ad634 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -80,6 +80,9 @@ struct netem_sched_data {
u32 reorder;
u32 corrupt;
u32 ratelatency;
+ s32 packet_overhead;
+ u32 cell_size;
+ s32 cell_overhead;
struct crndstate {
u32 last;
@@ -299,9 +302,24 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
-static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
+static psched_time_t packet_len_2_sched_time(unsigned int len,
+ struct netem_sched_data *q)
{
- return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate);
+ len += q->packet_overhead;
+
+ if (q->cell_size) {
+ u32 carry = len % q->cell_size;
+ len += carry;
+
+ if (q->cell_overhead) {
+ u32 cells = len / q->cell_size;
+ if (carry)
+ cells += 1;
+ len += cells * q->cell_overhead;
+ }
+ }
+
+ return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / q->ratelatency);
}
/*
@@ -381,7 +399,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (q->ratelatency) {
struct sk_buff_head *list = &q->qdisc->q;
- delay += packet_len_2_sched_time(skb->len, q->ratelatency);
+ delay += packet_len_2_sched_time(skb->len, q);
if (!skb_queue_empty(list)) {
/*
@@ -565,6 +583,9 @@ static void get_ratelatency(struct Qdisc *sch, const struct nlattr *attr)
const struct tc_netem_ratelatency *r = nla_data(attr);
q->ratelatency = r->ratelatency;
+ q->packet_overhead = r->packet_overhead;
+ q->cell_size = r->cell_size;
+ q->cell_overhead = r->cell_overhead;
}
static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
@@ -906,6 +927,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
ratelatency.ratelatency = q->ratelatency;
+ ratelatency.packet_overhead = q->packet_overhead;
+ ratelatency.cell_size = q->cell_size;
+ ratelatency.cell_overhead = q->cell_overhead;
NLA_PUT(skb, TCA_NETEM_RATELATENCY, sizeof(ratelatency), &ratelatency);
if (dump_loss_model(q, skb) != 0)
--
1.7.7
^ permalink raw reply related
* [PATCH net-next 1/2] netem: rate-latency extension
From: Hagen Paul Pfeifer @ 2011-11-24 17:39 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Hagen Paul Pfeifer
Currently netem is not able to emulate channel bandwidth. Only a static
delay (and optional random jitter) can be configured.
To emulate the channel rate, the token bucket filter (sch_tbf) can be used. But
TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot
be 0. Also, the idea behind TBF is that the credit (tokens in the bucket) fills
while no packet is transmitted, so that there is always a "positive" credit for
new packets. In real life this behavior contradicts the law of nature that
nothing can travel faster than the speed of light. E.g.: on an emulated
1000 byte/s link a small IPv4/TCP SYN packet of ~50 bytes requires ~0.05
seconds - not 0 seconds.
Netem is an excellent place to implement a rate limiting feature: static
delay is already implemented, tfifo already has time information and the
user can skip TBF configuration completely.
This patch implements a rate-latency feature which can be configured via
tc, e.g.:
tc qdisc add dev eth0 root netem ratelatency 10kbit
To emulate a link of 5000byte/s and add an additional static delay of 10ms:
tc qdisc add dev eth0 root netem delay 10ms ratelatency 5KBps
Note: similar to TBF, the rate-latency extension is bound to the kernel timing
system. Depending on the architecture's timer granularity, higher rates (e.g.
10mbit/s and higher) tend to produce transmission bursts. Also note that
further queues live in network adaptors; see ethtool(8).
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
---
include/linux/pkt_sched.h | 5 +++++
net/sched/sch_netem.c | 40 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index c533670..cf826d3 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -465,6 +465,7 @@ enum {
TCA_NETEM_REORDER,
TCA_NETEM_CORRUPT,
TCA_NETEM_LOSS,
+ TCA_NETEM_RATELATENCY,
__TCA_NETEM_MAX,
};
@@ -495,6 +496,10 @@ struct tc_netem_corrupt {
__u32 correlation;
};
+struct tc_netem_ratelatency {
+ __u32 ratelatency; /* byte/s */
+};
+
enum {
NETEM_LOSS_UNSPEC,
NETEM_LOSS_GI, /* General Intuitive - 4 state model */
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eb3b9a8..3ae1cdd 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -79,6 +79,7 @@ struct netem_sched_data {
u32 duplicate;
u32 reorder;
u32 corrupt;
+ u32 ratelatency;
struct crndstate {
u32 last;
@@ -298,6 +299,11 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
+static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
+{
+ return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate);
+}
+
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -371,6 +377,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
&q->delay_cor, q->delay_dist);
now = psched_get_time();
+
+ if (q->ratelatency) {
+ struct sk_buff_head *list = &q->qdisc->q;
+
+ delay += packet_len_2_sched_time(skb->len, q->ratelatency);
+
+ if (!skb_queue_empty(list)) {
+ /*
+ * Last packet in queue is reference point (now).
+ * First packet in queue is already in flight,
+ * calculate this time bonus and substract
+ * from delay.
+ */
+ delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
+ now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
+ }
+ }
+
cb->time_to_send = now + delay;
++q->counter;
ret = qdisc_enqueue(skb, q->qdisc);
@@ -535,6 +559,14 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
init_crandom(&q->corrupt_cor, r->correlation);
}
+static void get_ratelatency(struct Qdisc *sch, const struct nlattr *attr)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ const struct tc_netem_ratelatency *r = nla_data(attr);
+
+ q->ratelatency = r->ratelatency;
+}
+
static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -594,6 +626,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
+ [TCA_NETEM_RATELATENCY] = { .len = sizeof(struct tc_netem_ratelatency) },
[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
};
@@ -666,6 +699,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt)
if (tb[TCA_NETEM_CORRUPT])
get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
+ if (tb[TCA_NETEM_RATELATENCY])
+ get_ratelatency(sch, tb[TCA_NETEM_RATELATENCY]);
+
q->loss_model = CLG_RANDOM;
if (tb[TCA_NETEM_LOSS])
ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
@@ -846,6 +882,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
struct tc_netem_corrupt corrupt;
+ struct tc_netem_ratelatency ratelatency;
qopt.latency = q->latency;
qopt.jitter = q->jitter;
@@ -868,6 +905,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
corrupt.correlation = q->corrupt_cor.rho;
NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+ ratelatency.ratelatency = q->ratelatency;
+ NLA_PUT(skb, TCA_NETEM_RATELATENCY, sizeof(ratelatency), &ratelatency);
+
if (dump_loss_model(q, skb) != 0)
goto nla_put_failure;
--
1.7.7
^ permalink raw reply related
* [PATCH iproute2 1/2] utils: add s32 parser
From: Hagen Paul Pfeifer @ 2011-11-24 17:40 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Hagen Paul Pfeifer
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
---
include/utils.h | 1 +
lib/utils.c | 14 ++++++++++++++
2 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/include/utils.h b/include/utils.h
index 47f8e07..496db68 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -85,6 +85,7 @@ extern int get_time_rtt(unsigned *val, const char *arg, int *raw);
#define get_short get_s16
extern int get_u64(__u64 *val, const char *arg, int base);
extern int get_u32(__u32 *val, const char *arg, int base);
+extern int get_s32(__s32 *val, const char *arg, int base);
extern int get_u16(__u16 *val, const char *arg, int base);
extern int get_s16(__s16 *val, const char *arg, int base);
extern int get_u8(__u8 *val, const char *arg, int base);
diff --git a/lib/utils.c b/lib/utils.c
index efaf377..6788dd9 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -198,6 +198,20 @@ int get_u8(__u8 *val, const char *arg, int base)
return 0;
}
+int get_s32(__s32 *val, const char *arg, int base)
+{
+ long res;
+ char *ptr;
+
+ if (!arg || !*arg)
+ return -1;
+ res = strtoul(arg, &ptr, base);
+ if (!ptr || ptr == arg || *ptr || res > INT32_MAX || res < INT32_MIN)
+ return -1;
+ *val = res;
+ return 0;
+}
+
int get_s16(__s16 *val, const char *arg, int base)
{
long res;
--
1.7.7
^ permalink raw reply related
* [PATCH iproute2 2/2] tc: netem ratelatency and cell extension
From: Hagen Paul Pfeifer @ 2011-11-24 17:40 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Hagen Paul Pfeifer
In-Reply-To: <1322156415-23331-1-git-send-email-hagen@jauu.net>
This patch adds ratelatency as well as cell support. Rate latency is
configured with the ratelatency option. Three optional arguments control the
cell knobs: packet-overhead, cell-size, cell-overhead. To rate-limit the eth0
root queue to 5kbit/s, with a 20 byte packet overhead, 100 byte cell
size and a 5 byte per cell overhead:
tc qdisc add dev eth0 root netem ratelatency 5kbit 20 100 5
Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
---
include/linux/pkt_sched.h | 8 ++++++
tc/q_netem.c | 53 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 60 insertions(+), 1 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index c533670..76b26a2 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -465,6 +465,7 @@ enum {
TCA_NETEM_REORDER,
TCA_NETEM_CORRUPT,
TCA_NETEM_LOSS,
+ TCA_NETEM_RATELATENCY,
__TCA_NETEM_MAX,
};
@@ -495,6 +496,13 @@ struct tc_netem_corrupt {
__u32 correlation;
};
+struct tc_netem_ratelatency {
+ __u32 ratelatency; /* byte/s */
+ __s32 packet_overhead;
+ __u32 cell_size;
+ __s32 cell_overhead;
+};
+
enum {
NETEM_LOSS_UNSPEC,
NETEM_LOSS_GI, /* General Intuitive - 4 state model */
diff --git a/tc/q_netem.c b/tc/q_netem.c
index 6dc40bd..d6a3266 100644
--- a/tc/q_netem.c
+++ b/tc/q_netem.c
@@ -34,7 +34,8 @@ static void explain(void)
" [ drop PERCENT [CORRELATION]] \n" \
" [ corrupt PERCENT [CORRELATION]] \n" \
" [ duplicate PERCENT [CORRELATION]]\n" \
-" [ reorder PRECENT [CORRELATION] [ gap DISTANCE ]]\n");
+" [ reorder PRECENT [CORRELATION] [ gap DISTANCE ]]\n" \
+" [ ratelatency RATE [PACKETOVERHEAD] [CELLSIZE] [CELLOVERHEAD]]\n");
}
static void explain1(const char *arg)
@@ -131,6 +132,7 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv,
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
struct tc_netem_corrupt corrupt;
+ struct tc_netem_ratelatency ratelatency;
__s16 *dist_data = NULL;
int present[__TCA_NETEM_MAX];
@@ -139,6 +141,7 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv,
memset(&cor, 0, sizeof(cor));
memset(&reorder, 0, sizeof(reorder));
memset(&corrupt, 0, sizeof(corrupt));
+ memset(&ratelatency, 0, sizeof(ratelatency));
memset(present, 0, sizeof(present));
while (argc > 0) {
@@ -244,6 +247,34 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv,
free(dist_data);
return -1;
}
+ } else if (matches(*argv, "ratelatency") == 0) {
+ ++present[TCA_NETEM_RATELATENCY];
+ NEXT_ARG();
+ if (get_rate(&ratelatency.ratelatency, *argv)) {
+ explain1("ratelatency");
+ return -1;
+ }
+ if (NEXT_IS_NUMBER()) {
+ NEXT_ARG();
+ if (get_s32(&ratelatency.packet_overhead, *argv, 0)) {
+ explain1("ratelatency");
+ return -1;
+ }
+ }
+ if (NEXT_IS_NUMBER()) {
+ NEXT_ARG();
+ if (get_u32(&ratelatency.cell_size, *argv, 0)) {
+ explain1("ratelatency");
+ return -1;
+ }
+ }
+ if (NEXT_IS_NUMBER()) {
+ NEXT_ARG();
+ if (get_s32(&ratelatency.cell_overhead, *argv, 0)) {
+ explain1("ratelatency");
+ return -1;
+ }
+ }
} else if (strcmp(*argv, "help") == 0) {
explain();
return -1;
@@ -290,6 +321,10 @@ static int netem_parse_opt(struct qdisc_util *qu, int argc, char **argv,
addattr_l(n, 1024, TCA_NETEM_CORRUPT, &corrupt, sizeof(corrupt)) < 0)
return -1;
+ if (present[TCA_NETEM_RATELATENCY] &&
+ addattr_l(n, 1024, TCA_NETEM_RATELATENCY, &ratelatency, sizeof(ratelatency)) < 0)
+ return -1;
+
if (dist_data) {
if (addattr_l(n, MAX_DIST * sizeof(dist_data[0]),
TCA_NETEM_DELAY_DIST,
@@ -306,6 +341,7 @@ static int netem_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
const struct tc_netem_corr *cor = NULL;
const struct tc_netem_reorder *reorder = NULL;
const struct tc_netem_corrupt *corrupt = NULL;
+ const struct tc_netem_ratelatency *ratelatency = NULL;
struct tc_netem_qopt qopt;
int len = RTA_PAYLOAD(opt) - sizeof(qopt);
SPRINT_BUF(b1);
@@ -339,6 +375,11 @@ static int netem_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
return -1;
corrupt = RTA_DATA(tb[TCA_NETEM_CORRUPT]);
}
+ if (tb[TCA_NETEM_RATELATENCY]) {
+ if (RTA_PAYLOAD(tb[TCA_NETEM_RATELATENCY]) < sizeof(*ratelatency))
+ return -1;
+ ratelatency = RTA_DATA(tb[TCA_NETEM_RATELATENCY]);
+ }
}
fprintf(f, "limit %d", qopt.limit);
@@ -382,6 +423,16 @@ static int netem_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
sprint_percent(corrupt->correlation, b1));
}
+ if (ratelatency && ratelatency->ratelatency) {
+ fprintf(f, " ratelatency %s", sprint_rate(ratelatency->ratelatency, b1));
+ if (ratelatency->packet_overhead)
+ fprintf(f, " packetoverhead %d", ratelatency->packet_overhead);
+ if (ratelatency->cell_size)
+ fprintf(f, " cellsize %u", ratelatency->cell_size);
+ if (ratelatency->cell_overhead)
+ fprintf(f, " celloverhead %d", ratelatency->cell_overhead);
+ }
+
if (qopt.gap)
fprintf(f, " gap %lu", (unsigned long)qopt.gap);
--
1.7.7
^ permalink raw reply related
* Re: [PATCH] net: fsl_pq_mdio: fix oops when using uninitialized mutex
From: Fleming Andy-AFLEMING @ 2011-11-24 17:46 UTC (permalink / raw)
To: Kumar Gala
Cc: Andy Fleming, Baruch Siach, netdev@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org
In-Reply-To: <2465EDAB-01F8-4D5B-9950-E5F180E8A435@kernel.crashing.org>
Yes, I sent a patch. Then he sent another patch which breaks things differently. I have not yet submitted my fix. My fix is to revert his patch, and then modify your updated device trees to automatically set the tbi to something.
On Nov 24, 2011, at 1:51, "Kumar Gala" <galak@kernel.crashing.org> wrote:
>
> On Nov 9, 2011, at 2:10 PM, Andy Fleming wrote:
>
>>> Fix this by moving the of_mdiobus_register() call earlier.
>>>
>>> Cc: Andy Fleming <afleming@freescale.com>
>>> Signed-off-by: Baruch Siach <baruch@tkos.co.il>
>>> ---
>>> drivers/net/ethernet/freescale/fsl_pq_mdio.c | 14 +++++++-------
>>> 1 files changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
>>> index 52f4e8a..e17fd2f 100644
>>> --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c
>>> +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c
>>> @@ -385,6 +385,13 @@ static int fsl_pq_mdio_probe(struct platform_device *ofdev)
>>> tbiaddr = *prop;
>>> }
>>>
>>> + err = of_mdiobus_register(new_bus, np);
>>> + if (err) {
>>> + printk (KERN_ERR "%s: Cannot register as MDIO bus\n",
>>> + new_bus->name);
>>> + goto err_free_irqs;
>>> + }
>>> +
>>
>>
>> This fix totally breaks the point of setting tbipa beforehand.
>> mdiobus_register will cause the bus to be scanned, and if any of the
>> PHYs are at the default address for tbipa, they won't be found. I have
>> a different fix which I will (re)submit today.
>
> What happened here, did you send a patch?
>
> - k
^ permalink raw reply
* [PATCH net-next] bonding: Remove obsolete source file 'bond_ipv6.c'
From: Ben Hutchings @ 2011-11-24 18:16 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Jay Vosburgh, Andy Gospodarek
This file is now unused and should have been removed by commit
7c89943236750537d26421d9bbb6f6575e2d1e1b ("bonding, ipv4, ipv6, vlan:
Handle NETDEV_BONDING_FAILOVER like NETDEV_NOTIFY_PEERS").
Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
drivers/net/bonding/bond_ipv6.c | 225 ---------------------------------------
1 files changed, 0 insertions(+), 225 deletions(-)
delete mode 100644 drivers/net/bonding/bond_ipv6.c
diff --git a/drivers/net/bonding/bond_ipv6.c b/drivers/net/bonding/bond_ipv6.c
deleted file mode 100644
index 027a0ee..0000000
--- a/drivers/net/bonding/bond_ipv6.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright(c) 2008 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * The full GNU General Public License is included in this distribution in the
- * file called LICENSE.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/types.h>
-#include <linux/if_vlan.h>
-#include <net/ipv6.h>
-#include <net/ndisc.h>
-#include <net/addrconf.h>
-#include <net/netns/generic.h>
-#include "bonding.h"
-
-/*
- * Assign bond->master_ipv6 to the next IPv6 address in the list, or
- * zero it out if there are none.
- */
-static void bond_glean_dev_ipv6(struct net_device *dev, struct in6_addr *addr)
-{
- struct inet6_dev *idev;
-
- if (!dev)
- return;
-
- idev = in6_dev_get(dev);
- if (!idev)
- return;
-
- read_lock_bh(&idev->lock);
- if (!list_empty(&idev->addr_list)) {
- struct inet6_ifaddr *ifa
- = list_first_entry(&idev->addr_list,
- struct inet6_ifaddr, if_list);
- ipv6_addr_copy(addr, &ifa->addr);
- } else
- ipv6_addr_set(addr, 0, 0, 0, 0);
-
- read_unlock_bh(&idev->lock);
-
- in6_dev_put(idev);
-}
-
-static void bond_na_send(struct net_device *slave_dev,
- struct in6_addr *daddr,
- int router,
- unsigned short vlan_id)
-{
- struct in6_addr mcaddr;
- struct icmp6hdr icmp6h = {
- .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
- };
- struct sk_buff *skb;
-
- icmp6h.icmp6_router = router;
- icmp6h.icmp6_solicited = 0;
- icmp6h.icmp6_override = 1;
-
- addrconf_addr_solict_mult(daddr, &mcaddr);
-
- pr_debug("ipv6 na on slave %s: dest %pI6, src %pI6\n",
- slave_dev->name, &mcaddr, daddr);
-
- skb = ndisc_build_skb(slave_dev, &mcaddr, daddr, &icmp6h, daddr,
- ND_OPT_TARGET_LL_ADDR);
-
- if (!skb) {
- pr_err("NA packet allocation failed\n");
- return;
- }
-
- if (vlan_id) {
- /* The Ethernet header is not present yet, so it is
- * too early to insert a VLAN tag. Force use of an
- * out-of-line tag here and let dev_hard_start_xmit()
- * insert it if the slave hardware can't.
- */
- skb = __vlan_hwaccel_put_tag(skb, vlan_id);
- if (!skb) {
- pr_err("failed to insert VLAN tag\n");
- return;
- }
- }
-
- ndisc_send_skb(skb, slave_dev, NULL, &mcaddr, daddr, &icmp6h);
-}
-
-/*
- * Kick out an unsolicited Neighbor Advertisement for an IPv6 address on
- * the bonding master. This will help the switch learn our address
- * if in active-backup mode.
- *
- * Caller must hold curr_slave_lock for read or better
- */
-void bond_send_unsolicited_na(struct bonding *bond)
-{
- struct slave *slave = bond->curr_active_slave;
- struct vlan_entry *vlan;
- struct inet6_dev *idev;
- int is_router;
-
- pr_debug("%s: bond %s slave %s\n", bond->dev->name,
- __func__, slave ? slave->dev->name : "NULL");
-
- if (!slave || !bond->send_unsol_na ||
- test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
- return;
-
- bond->send_unsol_na--;
-
- idev = in6_dev_get(bond->dev);
- if (!idev)
- return;
-
- is_router = !!idev->cnf.forwarding;
-
- in6_dev_put(idev);
-
- if (!ipv6_addr_any(&bond->master_ipv6))
- bond_na_send(slave->dev, &bond->master_ipv6, is_router, 0);
-
- list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
- if (!ipv6_addr_any(&vlan->vlan_ipv6)) {
- bond_na_send(slave->dev, &vlan->vlan_ipv6, is_router,
- vlan->vlan_id);
- }
- }
-}
-
-/*
- * bond_inet6addr_event: handle inet6addr notifier chain events.
- *
- * We keep track of device IPv6 addresses primarily to use as source
- * addresses in NS probes.
- *
- * We track one IPv6 for the main device (if it has one).
- */
-static int bond_inet6addr_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct inet6_ifaddr *ifa = ptr;
- struct net_device *vlan_dev, *event_dev = ifa->idev->dev;
- struct bonding *bond;
- struct vlan_entry *vlan;
- struct bond_net *bn = net_generic(dev_net(event_dev), bond_net_id);
-
- list_for_each_entry(bond, &bn->dev_list, bond_list) {
- if (bond->dev == event_dev) {
- switch (event) {
- case NETDEV_UP:
- if (ipv6_addr_any(&bond->master_ipv6))
- ipv6_addr_copy(&bond->master_ipv6,
- &ifa->addr);
- return NOTIFY_OK;
- case NETDEV_DOWN:
- if (ipv6_addr_equal(&bond->master_ipv6,
- &ifa->addr))
- bond_glean_dev_ipv6(bond->dev,
- &bond->master_ipv6);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
- }
-
- list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
- rcu_read_lock();
- vlan_dev = __vlan_find_dev_deep(bond->dev,
- vlan->vlan_id);
- rcu_read_unlock();
- if (vlan_dev == event_dev) {
- switch (event) {
- case NETDEV_UP:
- if (ipv6_addr_any(&vlan->vlan_ipv6))
- ipv6_addr_copy(&vlan->vlan_ipv6,
- &ifa->addr);
- return NOTIFY_OK;
- case NETDEV_DOWN:
- if (ipv6_addr_equal(&vlan->vlan_ipv6,
- &ifa->addr))
- bond_glean_dev_ipv6(vlan_dev,
- &vlan->vlan_ipv6);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
- }
- }
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block bond_inet6addr_notifier = {
- .notifier_call = bond_inet6addr_event,
-};
-
-void bond_register_ipv6_notifier(void)
-{
- register_inet6addr_notifier(&bond_inet6addr_notifier);
-}
-
-void bond_unregister_ipv6_notifier(void)
-{
- unregister_inet6addr_notifier(&bond_inet6addr_notifier);
-}
-
--
1.7.4.4
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply related
* Re: [PATCH 1/2] ax25: integer overflows in ax25_setsockopt()
From: Xi Wang @ 2011-11-24 19:09 UTC (permalink / raw)
To: Ralf Baechle
Cc: linux-kernel, Joerg Reuter, David Miller, linux-hams, netdev,
Thomas Osterried
In-Reply-To: <20111123170930.GA7260@linux-mips.org>
Thanks a lot for your comments! Look forward to your patch.
- xi
^ permalink raw reply
* Re: [PATCH 1/1] Adding examples in ip man page on basic use of the ip command.
From: Ben Hutchings @ 2011-11-24 19:12 UTC (permalink / raw)
To: Alex Juncu; +Cc: netdev, shemminger
In-Reply-To: <4ECE31F6.4070002@ixiacom.com>
On Thu, 2011-11-24 at 14:00 +0200, Alex Juncu wrote:
> ---
> man/man8/ip.8 | 43 +++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 43 insertions(+), 0 deletions(-)
>
> diff --git a/man/man8/ip.8 b/man/man8/ip.8
> index a20eca7..99c8953 100644
> --- a/man/man8/ip.8
> +++ b/man/man8/ip.8
> @@ -2855,6 +2855,49 @@ can be
Although it's customary to put EXAMPLES near the bottom of the page, I
suspect that the same users who are intimidated by the long usage
description are unlikely ever to get this far. But I may be wrong;
maybe people do specifically look for this section.
Alternately, it may be more helpful to reorder the SYNOPSIS to emphasise
the most commonly used commands.
Ben.
> .SS ip xfrm monitor - state monitoring for xfrm objects
> The xfrm objects to monitor can be optionally specified.
>
> +.SH EXAMPLES
> +.LP
> +To turn on an interface:
> +.RS
> +.nf
> +.B ip link set up dev eth0
> +.fi
> +.RE
> +.LP
> +To add an IPv4 address to an interface:
> +.RS
> +.nf
> +.B ip address add 141.85.37.42/24 dev eth0
> +.fi
> +.RE
> +.LP
> +To add a default route in the IPv4 routing table using a next hop IP
> address:
> +.RS
> +.nf
> +.B ip route add default via 141.85.37.1
> +.fi
> +.RE
> +.LP
> +Equivalent ways of listing address information for all the interfaces:
> +.RS
> +.nf
> +.B ip addr show
> +.fi
> +.RE
> +.LP
> +.RS
> +.nf
> +.B ip a s
> +.fi
> +.RE
> +.LP
> +.RS
> +.nf
> +.B ip a
> +.fi
> +.RE
> +.LP
> +
> .SH HISTORY
> .B ip
> was written by Alexey N. Kuznetsov and added in Linux 2.2.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: [PATCH] natsemi: make cable length magic configurable
From: Ben Hutchings @ 2011-11-24 19:25 UTC (permalink / raw)
To: Jean Delvare; +Cc: netdev, Tim Hockin, Olaf Kirch
In-Reply-To: <201111241443.59191.jdelvare@suse.de>
On Thu, 2011-11-24 at 14:43 +0100, Jean Delvare wrote:
> From: Olaf Kirch <okir@suse.de>
>
> We had a customer report concerning problems with a Natsemi DP83815-D
> and long cables. With 100m cables, the network would be essentially dead,
> not a single packet would get through either way. We had to apply the
> patch below to make it work.
>
> The patch adds a module parameter named "no_cable_magic" that does
> two things:
>
> - Unconditionally set the DSPCFG register to the
> fixed value. Without this change, the chip apparently
> never completes autonegotiation in the tested configuration.
>
> This has been an unconditional assignment for a long time,
> until this was changed in 2.6.11 (there's an interesting
> explanation in the ChangeLog, subject is
> "[PATCH] natsemi long cable fix", bk commit is
> 5871b81bf2b5cf188deab0d414dce104fcb69ca6, git commit in
> tglx/history tree is c0d51c67f9c398279a95c5a7df387f2d9a586c98.
>
> - Skip the bit banging in {,un}do_cable_magic. It seems that
> if we write the DSPCFG register as above, a rev D chip will report
> all cables as "short cables", which do_cable_magic detects, and
> trying to be helpful it will "fix" the attenuation coefficient.
>
> I admit the use of a module parameter is ugly, but I didn't find a sane
> way to fix this - especially since the magic registers we're changing
> are undocumented.
[...]
This could be implemented as an ethtool 'private flag'. However, the
ethtool utility currently does not provide an interface to them.
Perhaps you could implement both the private flag and the module
parameter for now, and then drop the module parameter some time after
the utility has been updated.
You would need to:
1. Number the flags starting from 0. Well, that was easy.
2. Implement {get,set}_priv_flags() operations to access all flags as
a bitmask.
3. Expose the flag names as string set ETH_SS_PRIV_FLAGS accessed by
get_sset_count() and get_strings() operations.
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Open vSwitch Design
From: Jesse Gross @ 2011-11-24 20:10 UTC (permalink / raw)
To: netdev, dev
Cc: David Miller, Stephen Hemminger, Chris Wright, Herbert Xu,
Eric Dumazet, John Fastabend, Justin Pettit, jhs
I realized that since Open vSwitch is so userspace-centric some of the
design considerations might not be apparent from the kernel code
alone. I did a poor job of explaining the larger picture, which has
led to some misconceptions, so I thought it would be helpful if I
gave a short overview.
One of the driving goals was to push as much logic as possible to
userspace, so the kernel portion is less than 6000 lines and has four
components:
* Switching infrastructure: As the name implies, Open vSwitch is
intended to be a network switch, focused on
virtualization/OpenFlow/software defined networking. This means that
what we are modeling is not actually a collection of flows but a
switch which contains a group of related ports, a software virtual
device, etc. The switch model is used in a variety of places, such as
to measure traffic that actually flows through it in order to
implement monitoring and sampling protocols.
* Flow lookup: Although used to implement OpenFlow, the kernel flow
table does not actually directly contain OpenFlow flows. This is
because OpenFlow tables can contain wildcards, multiple pipeline
stages, etc. and we did not want to push that complexity into the
kernel fast path (nor tie it to a specific version of OpenFlow).
Instead an exact match flow table is populated on-demand from
userspace based on the more complex rules stored there. Although it
might seem limiting, this design has allowed significant new
functionality to be added without modifications to the kernel or
performance impact.
* Packet execution: Once a flow is matched it can be output,
enqueued to a particular qdisc, etc. Some of these operations are
specific to Open vSwitch, such as sampling, whereas for others we leverage
existing infrastructure (including tc for QoS) by simply marking the
packet for further processing.
* Userspace interfaces: One of the difficulties of having a
specialized, exact match flow lookup engine is maintaining
compatibility across differing kernel/userspace versions. This
compatibility shows up heavily in the userspace interfaces and is
achieved by passing the kernel's version of the flow along with packet
information. This allows userspace to install appropriate flows even
if its interpretation of a packet differs from the kernel's without
version checks or maintaining multiple implementations of the flow
extraction code in the kernel.
It's obviously possible to put this code anywhere, whether it is an
independent module, in the bridge, or tc. Regardless, however, it's
largely new code that is geared towards this particular model so it
seems better not to add to the complexity of existing components if at
all possible.
^ permalink raw reply
* [PATCH] at91_ether: use gpio_is_valid for phy IRQ line
From: Nicolas Ferre @ 2011-11-24 21:21 UTC (permalink / raw)
To: jamie, netdev
Cc: plagnioj, sfr, linux-next, linux-kernel, linux-arm-kernel,
Nicolas Ferre
In-Reply-To: <1322128098-17724-1-git-send-email-jamie@jamieiles.com>
Use the generic gpiolib gpio_is_valid() function to test
if the phy IRQ line GPIO is actually provided.
For non-connected or non-existing phy IRQ lines, the -EINVAL
value is used for the phy_irq_pin field of struct at91_eth_data.
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
drivers/net/ethernet/cadence/at91_ether.c | 23 +++++++++++++----------
1 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/cadence/at91_ether.c b/drivers/net/ethernet/cadence/at91_ether.c
index 56624d3..a1c4143 100644
--- a/drivers/net/ethernet/cadence/at91_ether.c
+++ b/drivers/net/ethernet/cadence/at91_ether.c
@@ -255,8 +255,7 @@ static void enable_phyirq(struct net_device *dev)
unsigned int dsintr, irq_number;
int status;
- irq_number = lp->board_data.phy_irq_pin;
- if (!irq_number) {
+ if (!gpio_is_valid(lp->board_data.phy_irq_pin)) {
/*
* PHY doesn't have an IRQ pin (RTL8201, DP83847, AC101L),
* or board does not have it connected.
@@ -265,6 +264,7 @@ static void enable_phyirq(struct net_device *dev)
return;
}
+ irq_number = lp->board_data.phy_irq_pin;
status = request_irq(irq_number, at91ether_phy_interrupt, 0, dev->name, dev);
if (status) {
printk(KERN_ERR "at91_ether: PHY IRQ %d request failed - status %d!\n", irq_number, status);
@@ -319,8 +319,7 @@ static void disable_phyirq(struct net_device *dev)
unsigned int dsintr;
unsigned int irq_number;
- irq_number = lp->board_data.phy_irq_pin;
- if (!irq_number) {
+ if (!gpio_is_valid(lp->board_data.phy_irq_pin)) {
del_timer_sync(&lp->check_timer);
return;
}
@@ -365,6 +364,7 @@ static void disable_phyirq(struct net_device *dev)
disable_mdi();
spin_unlock_irq(&lp->lock);
+ irq_number = lp->board_data.phy_irq_pin;
free_irq(irq_number, dev); /* Free interrupt handler */
}
@@ -1077,7 +1077,7 @@ static int __init at91ether_setup(unsigned long phy_type, unsigned short phy_add
netif_carrier_off(dev); /* will be enabled in open() */
/* If board has no PHY IRQ, use a timer to poll the PHY */
- if (!lp->board_data.phy_irq_pin) {
+ if (!gpio_is_valid(lp->board_data.phy_irq_pin)) {
init_timer(&lp->check_timer);
lp->check_timer.data = (unsigned long)dev;
lp->check_timer.function = at91ether_check_link;
@@ -1169,7 +1169,8 @@ static int __devexit at91ether_remove(struct platform_device *pdev)
struct net_device *dev = platform_get_drvdata(pdev);
struct at91_private *lp = netdev_priv(dev);
- if (lp->board_data.phy_irq_pin >= 32)
+ if (gpio_is_valid(lp->board_data.phy_irq_pin) &&
+ lp->board_data.phy_irq_pin >= 32)
gpio_free(lp->board_data.phy_irq_pin);
unregister_netdev(dev);
@@ -1188,11 +1189,12 @@ static int at91ether_suspend(struct platform_device *pdev, pm_message_t mesg)
{
struct net_device *net_dev = platform_get_drvdata(pdev);
struct at91_private *lp = netdev_priv(net_dev);
- int phy_irq = lp->board_data.phy_irq_pin;
if (netif_running(net_dev)) {
- if (phy_irq)
+ if (gpio_is_valid(lp->board_data.phy_irq_pin)) {
+ int phy_irq = lp->board_data.phy_irq_pin;
disable_irq(phy_irq);
+ }
netif_stop_queue(net_dev);
netif_device_detach(net_dev);
@@ -1206,7 +1208,6 @@ static int at91ether_resume(struct platform_device *pdev)
{
struct net_device *net_dev = platform_get_drvdata(pdev);
struct at91_private *lp = netdev_priv(net_dev);
- int phy_irq = lp->board_data.phy_irq_pin;
if (netif_running(net_dev)) {
clk_enable(lp->ether_clk);
@@ -1214,8 +1215,10 @@ static int at91ether_resume(struct platform_device *pdev)
netif_device_attach(net_dev);
netif_start_queue(net_dev);
- if (phy_irq)
+ if (gpio_is_valid(lp->board_data.phy_irq_pin)) {
+ int phy_irq = lp->board_data.phy_irq_pin;
enable_irq(phy_irq);
+ }
}
return 0;
}
--
1.7.5.4
^ permalink raw reply related
* Re: Finding a hidden bound TCP socket
From: richard -rw- weinberger @ 2011-11-24 21:31 UTC (permalink / raw)
To: Rick Jones; +Cc: David Miller, gdfuego, linux-kernel, netdev
In-Reply-To: <4ECD67C7.3010702@hp.com>
On Wed, Nov 23, 2011 at 10:38 PM, Rick Jones <rick.jones2@hp.com> wrote:
> On 11/23/2011 01:01 PM, David Miller wrote:
>>
>> From: "G. D. Fuego"<gdfuego@gmail.com>
>> Date: Wed, 23 Nov 2011 15:27:33 -0500
>>
>>> Any comments? The behavior seems broken. At the very least its very
>>> inconsistent with other Unixes.
>>
>> Until the socket has a full final tuple it is bound to, there is no
>> reason to list it.
>>
>> No UNIX lists a socket which is partially bound and hasn't either
>> performed a listen() or a connect().
>
> Well.... I took the .c file mentioned previously, and compiled it on a
> Solaris 10 8/11 instance. The 25-odd sockets it created *were* listed in
> the output of netstat -an -- local address as *.<portnum> remote address as
> *.* and a state of "BOUND."
>
> A FreeBSD (rev 8 IIRC) netstat -an seems to display them in a state of
> "CLOSED." I didn't check HP-UX 11i v3 or AIX 6.
>
IRIX (6.5) shows them as "CLOSED".
--
Thanks,
//richard
^ permalink raw reply
* Re: Finding a hidden bound TCP socket
From: Eric Dumazet @ 2011-11-24 21:53 UTC (permalink / raw)
To: richard -rw- weinberger
Cc: Rick Jones, David Miller, gdfuego, linux-kernel, netdev
In-Reply-To: <CAFLxGvzAV9ODxo6Bs=ow1J0JRwHYNFmkBqrmN33D7Ci=YvNKNg@mail.gmail.com>
Le jeudi 24 novembre 2011 à 22:31 +0100, richard -rw- weinberger a
écrit :
> IRIX (6.5) shows them as "CLOSED".
>
As I said, patches are welcome.
^ permalink raw reply
* Re: [PATCH net-next 1/2] netem: rate-latency extension
From: Eric Dumazet @ 2011-11-24 22:14 UTC (permalink / raw)
To: Hagen Paul Pfeifer; +Cc: netdev, Stephen Hemminger
In-Reply-To: <1322156378-23257-1-git-send-email-hagen@jauu.net>
Le jeudi 24 novembre 2011 à 18:39 +0100, Hagen Paul Pfeifer a écrit :
> Currently netem is not in the ability to emulate channel bandwidth. Only static
> delay (and optional random jitter) can be configured.
>
> To emulate the channel rate the token bucket filter (sch_tbf) can be used. But
> TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot
> be 0. Also the idea behind TBF is that the credit (token in buckets) fills if
> no packet is transmitted. So that there is always a "positive" credit for new
> packets. In real life this behavior contradicts the law of nature where
> nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s
> link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0
> seconds.
>
> Netem is an excellent place to implement a rate limiting feature: static
> delay is already implemented, tfifo already has time information and the
> user can skip TBF configuration completely.
>
> This patch implement rate latency feature which can be configured via
> tc. e.g:
>
> tc qdisc add dev eth0 root netem ratelatency 10kbit
>
> To emulate a link of 5000byte/s and add an additional static delay of 10ms:
>
> tc qdisc add dev eth0 root netem delay 10ms ratelatency 5KBps
>
> Note: similar to TBF the rate-latency extension is bounded to the kernel timing
> system. Depending on the architecture timer granularity, higher rates (e.g.
> 10mbit/s and higher) tend to transmission bursts. Also note: further queues
> living in network adaptors; see ethtool(8).
>
> Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
> ---
> include/linux/pkt_sched.h | 5 +++++
> net/sched/sch_netem.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 45 insertions(+), 0 deletions(-)
I like this patch, this is a useful extension.
My only question is why you chose ratelatency instead of rate?
We want to emulate a real link, and yes, a 1000-byte packet must be
delayed _before_ we deliver it to the device, but that's a detail of how
netem works.
The usual word we use to describe a 1Mbps link is "1Mbps rate" ;)
^ permalink raw reply
* Re: [PATCH] at91_ether: use gpio_is_valid for phy IRQ line
From: Jamie Iles @ 2011-11-24 22:28 UTC (permalink / raw)
To: Nicolas Ferre
Cc: jamie, netdev, plagnioj, sfr, linux-next, linux-kernel,
linux-arm-kernel
In-Reply-To: <1322169674-4109-1-git-send-email-nicolas.ferre@atmel.com>
Hi Nicolas,
On Thu, Nov 24, 2011 at 10:21:14PM +0100, Nicolas Ferre wrote:
> Use the generic gpiolib gpio_is_valid() function to test
> if the phy IRQ line GPIO is actually provided.
>
> For non-connected or non-existing phy IRQ lines, -EINVAL
> value is used for phy_irq_pin field of struct at91_eth_data.
>
> Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
> ---
> drivers/net/ethernet/cadence/at91_ether.c | 23 +++++++++++++----------
> 1 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/at91_ether.c b/drivers/net/ethernet/cadence/at91_ether.c
> index 56624d3..a1c4143 100644
> --- a/drivers/net/ethernet/cadence/at91_ether.c
> +++ b/drivers/net/ethernet/cadence/at91_ether.c
> @@ -255,8 +255,7 @@ static void enable_phyirq(struct net_device *dev)
> unsigned int dsintr, irq_number;
> int status;
>
> - irq_number = lp->board_data.phy_irq_pin;
> - if (!irq_number) {
> + if (!gpio_is_valid(lp->board_data.phy_irq_pin)) {
> /*
> * PHY doesn't have an IRQ pin (RTL8201, DP83847, AC101L),
> * or board does not have it connected.
> @@ -265,6 +264,7 @@ static void enable_phyirq(struct net_device *dev)
> return;
> }
>
> + irq_number = lp->board_data.phy_irq_pin;
Does this need to be:
irq_number = gpio_to_irq(lp->board_data.phy_irq_pin);
and the same for the other occurrences? Otherwise this looks like the
right thing to me.
Jamie
^ permalink raw reply
* Re: Open vSwitch Design
From: jamal @ 2011-11-24 22:30 UTC (permalink / raw)
To: Jesse Gross
Cc: dev-yBygre7rU0TnMu66kgdUjQ, Chris Wright, Herbert Xu,
Eric Dumazet, netdev, John Fastabend, Stephen Hemminger,
David Miller
In-Reply-To: <CAEP_g=_2L1xFWtDXh_6YyXz1Mt9TR3zvjLzix+SpO6yzeOLsSQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
Jesse,
I am going to try and respond to your comments below.
On Thu, 2011-11-24 at 12:10 -0800, Jesse Gross wrote:
>
> * Switching infrastructure: As the name implies, Open vSwitch is
> intended to be a network switch, focused on
> virtualization/OpenFlow/software defined networking. This means that
> what we are modeling is not actually a collection of flows but a
> switch which contains a group of related ports, a software virtual
> device, etc. The switch model is used in a variety of places, such as
> to measure traffic that actually flows through it in order to
> implement monitoring and sampling protocols.
Can you explain why you couldn't use the current bridge code (likely with
some mods)? I can see you want to isolate the VMs via the virtual ports,
maybe even VLANs on the virtual ports - the current bridge code should
be able to handle that.
> * Flow lookup: Although used to implement OpenFlow, the kernel flow
> table does not actually directly contain OpenFlow flows. This is
> because OpenFlow tables can contain wildcards, multiple pipeline
> stages, etc. and we did not want to push that complexity into the
> kernel fast path (nor tie it to a specific version of OpenFlow).
> Instead an exact match flow table is populated on-demand from
> userspace based on the more complex rules stored there. Although it
> might seem limiting, this design has allowed significant new
> functionality to be added without modifications to the kernel or
> performance impact.
This can be achieved easily with zero changes to the kernel code.
You need to have default filters that redirect flows to user space
when you fail to match.
> * Packet execution: Once a flow is matched it can be output,
> enqueued to a particular qdisc, etc. Some of these operations are
> specific to Open vSwitch, such as sampling, whereas others we leverage
> existing infrastructure (including tc for QoS) by simply marking the
> packet for further processing.
The tc classifier-action-qdisc infrastructure handles this.
The sampler needs a new action defined.
> * Userspace interfaces: One of the difficulties of having a
> specialized, exact match flow lookup engine is maintaining
> compatibility across differing kernel/userspace versions. This
> compatibility shows up heavily in the userspace interfaces and is
> achieved by passing the kernel's version of the flow along with packet
> information. This allows userspace to install appropriate flows even
> if its interpretation of a packet differs from the kernel's without
> version checks or maintaining multiple implementations of the flow
> extraction code in the kernel.
I didn't quite follow - are we talking about backward/forward
compatibility?
> It's obviously possible to put this code anywhere, whether it is an
> independent module, in the bridge, or tc. Regardless, however, it's
> largely new code that is geared towards this particular model so it
> seems better not to add to the complexity of existing components if at
> all possible.
I am still not seeing why this could not be done with the
infrastructure that exists. Granted, the user space brains - that's where
everything else resides - but you are not pushing that, I think.
cheers,
jamal
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox