* [PATCH 26/72] IPVS: sip persistence engine
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Add the SIP callid as a key for persistence.
This allows multiple connections from the same IP address to be
differentiated on the basis of the callid.
When used in conjunction with the persistence mask, it allows connections
from different IP addresses to be aggregated on the basis of the callid.
It is envisaged that a persistence mask of 0.0.0.0 will be a useful
setting. That is, ignore the source IP address when checking for
persistence.
It is envisaged that this option will be used in conjunction with
one-packet scheduling.
This only works with UDP and cannot be made to work with TCP
within the current framework.
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/Kconfig | 7 ++
net/netfilter/ipvs/Makefile | 3 +
net/netfilter/ipvs/ip_vs_pe_sip.c | 167 +++++++++++++++++++++++++++++++++++++
3 files changed, 177 insertions(+), 0 deletions(-)
create mode 100644 net/netfilter/ipvs/ip_vs_pe_sip.c
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index af3c9f4..a22dac2 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -256,4 +256,11 @@ config IP_VS_NFCT
connection state to be exported to the Netfilter framework
for filtering purposes.
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index 4a87bf3..34ee602 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -35,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 0000000..a0539f1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,167 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+/*
+ * Copy a SIP Call-ID (which is not NUL-terminated in the packet) into the
+ * debug buffer so that it can be printed with "%s".
+ *
+ * @buf:        debug buffer the copy is appended to
+ * @buf_len:    total size of @buf in bytes
+ * @callid:     first byte of the Call-ID
+ * @callid_len: number of bytes at @callid
+ * @idx:        in/out offset of the first unused byte of @buf
+ *
+ * At most 64 bytes are copied, fewer if @buf is nearly full. On return
+ * *@idx has been advanced past the copy and its terminating '\0'.
+ *
+ * Returns a pointer to the NUL-terminated copy inside @buf, or "" if
+ * @buf has no room left for even the terminator.
+ */
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	char *start;
+	size_t room, len;
+
+	/* Without this guard "buf_len - *idx - 1" would wrap around */
+	if (*idx < 0 || (size_t)*idx >= buf_len)
+		return "";
+
+	/* Keep one byte for the terminating '\0' */
+	room = buf_len - *idx - 1;
+
+	len = callid_len;
+	if (len > 64)
+		len = 64;
+	if (len > room)
+		len = room;
+
+	start = buf + *idx;
+	memcpy(start, callid, len);
+	start[len] = '\0';
+	*idx += len + 1;
+
+	/* Return the start of the copy: "buf + *idx - len" would point one
+	 * byte past it and silently drop the first character. */
+	return start;
+}
+
+/*
+ * Format a Call-ID of @len bytes for use as a "%s" argument of a debug
+ * message. Expands to a call of ip_vs_dbg_callid() on ip_vs_dbg_buf and
+ * ip_vs_dbg_idx, which must be in scope at the expansion site
+ * (presumably provided by IP_VS_DBG_BUF - confirm in net/ip_vs.h).
+ */
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+
+/*
+ * Locate the value of the SIP Call-ID header in a message.
+ *
+ * @dptr:     start of the buffer to search
+ * @dataoff:  offset into @dptr at which the search starts
+ * @datalen:  length of the buffer at @dptr
+ * @matchoff: set to the offset of the Call-ID value relative to @dptr
+ * @matchlen: set to the length of the Call-ID value
+ *
+ * Returns 0 if a usable Call-ID was found, in which case *@matchoff and
+ * *@matchlen are valid. Returns -EINVAL if there is no Call-ID header or
+ * the value is empty, longer than IP_VS_PEDATA_MAXLEN, or not followed by
+ * a line terminator; the output parameters must not be used then.
+ */
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	/* Find callid */
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		/* No Call-ID header at all: this must be reported as a
+		 * failure, otherwise the caller would go on to use
+		 * *matchoff and *matchlen, which were never written. */
+		if (!ret)
+			return -EINVAL;
+		/* NOTE(review): a negative return is treated as "retry
+		 * further on"; this relies on ct_sip_get_header() having
+		 * updated *matchoff in that case - confirm. */
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless */
+	if (!*matchlen)
+		return -EINVAL;
+
+	/* Too large is useless */
+	if (*matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+/*
+ * Persistence engine fill_param hook: use the SIP Call-ID of a UDP packet
+ * as the persistence key.
+ *
+ * @p:   connection parameters; on success p->pe_data points to a freshly
+ *       allocated copy of the Call-ID and p->pe_data_len is its length
+ * @skb: packet to inspect; it is linearized so that the payload can be
+ *       scanned as one contiguous buffer
+ *
+ * Returns 0 on success, -EINVAL if the packet is not UDP, carries no
+ * payload or has no usable Call-ID, and a negative errno if memory
+ * allocation fails. p->pe_data is left untouched on every failure path.
+ */
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen;
+	unsigned int matchoff = 0, matchlen = 0;
+	const char *dptr;
+	int ret;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	/* The payload is read through skb->data all the way up to
+	 * skb->len, so the whole packet has to be in the linear area. */
+	ret = skb_linearize(skb);
+	if (ret < 0)
+		return ret;
+
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	/* dptr already points at the SIP payload and datalen is counted
+	 * from there, so the search has to start at offset 0; passing
+	 * dataoff again would skip the first dataoff bytes of the payload. */
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	/* Never create a zero-length key */
+	if (!matchlen)
+		return -EINVAL;
+
+	p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	memcpy(p->pe_data, dptr + matchoff, matchlen);
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+/*
+ * Persistence engine ct_match hook: decide whether connection template
+ * @ct matches the lookup parameters @p.
+ *
+ * A template matches when address family, client address, virtual
+ * address, virtual port and protocol are equal, @ct is flagged as a
+ * template, and both carry the same persistence data (the Call-ID).
+ * The outcome is also reported at debug level 9.
+ */
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+			       struct ip_vs_conn *ct)
+{
+	bool hit = false;
+
+	if (ct->af != p->af)
+		goto out;
+	if (!ip_vs_addr_equal(p->af, p->caddr, &ct->caddr))
+		goto out;
+	/* protocol should only be IPPROTO_IP if d_addr is a fwmark */
+	if (!ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+			      p->vaddr, &ct->vaddr))
+		goto out;
+	if (ct->vport != p->vport)
+		goto out;
+	if (!(ct->flags & IP_VS_CONN_F_TEMPLATE))
+		goto out;
+	if (ct->protocol != p->protocol)
+		goto out;
+	if (!ct->pe_data || ct->pe_data_len != p->pe_data_len)
+		goto out;
+	if (memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+		goto out;
+
+	hit = true;
+out:
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      hit ? "hit" : "not hit");
+
+	return hit;
+}
+
+/*
+ * Persistence engine hashkey_raw hook: hash the persistence data (the
+ * Call-ID) of @p with jhash(), seeded with @initval. @inverse is not
+ * used, the key does not depend on the lookup direction.
+ */
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	u32 key = jhash(p->pe_data, p->pe_data_len, initval);
+
+	return key;
+}
+
+/*
+ * Persistence engine show_pe_data hook: copy the raw persistence data
+ * (the Call-ID) of @cp into @buf and return the number of bytes copied.
+ * No '\0' is appended; @buf must be able to hold cp->pe_data_len bytes.
+ */
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	const void *callid = cp->pe_data;
+
+	memcpy(buf, callid, cp->pe_data_len);
+
+	return cp->pe_data_len;
+}
+
+/*
+ * Descriptor of the "sip" persistence engine: ties the engine name to the
+ * callbacks defined in this file. It is registered at module load and
+ * unregistered at module unload.
+ */
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+/* Module init: register the "sip" persistence engine with IPVS and hand
+ * the result of the registration back to the module loader. */
+static int __init ip_vs_sip_init(void)
+{
+	int err = register_ip_vs_pe(&ip_vs_sip_pe);
+
+	return err;
+}
+
+/* Module exit: unregister the "sip" persistence engine from IPVS. */
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+/* Hook the init/cleanup functions into module load/unload. */
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
--
1.7.1
^ permalink raw reply related
* [PATCH 25/72] IPVS: Fallback if persistence engine fails
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Fall back to normal persistence handling if the persistence
engine fails to recognise a packet.
This way, at least the packet will go somewhere.
It is envisaged that iptables could be used to block such packets
if this is not desired, although nf_conntrack_sip would
likely need to be enhanced first.
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_conn.c | 6 +++---
net/netfilter/ipvs/ip_vs_core.c | 10 ++++------
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 4adedef..1d1a529 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -154,7 +154,7 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
const union nf_inet_addr *addr;
__be16 port;
- if (p->pe && p->pe->hashkey_raw)
+ if (p->pe_data && p->pe->hashkey_raw)
return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
ip_vs_conn_tab_mask;
@@ -353,7 +353,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->pe && p->pe->ct_match) {
+ if (p->pe_data && p->pe->ct_match) {
if (p->pe->ct_match(p, cp))
goto out;
continue;
@@ -956,7 +956,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->dest->svc->pe &&
+ if (cp->dest && cp->pe_data &&
cp->dest->svc->pe->show_pe_data) {
pe_data[0] = ' ';
len = strlen(cp->dest->svc->pe->name);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index ab98893..e5fef7a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -176,7 +176,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
return pp->state_transition(cp, direction, skb, pp);
}
-static inline int
+static inline void
ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
struct sk_buff *skb, int protocol,
const union nf_inet_addr *caddr, __be16 cport,
@@ -186,8 +186,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
p->pe = svc->pe;
if (p->pe && p->pe->fill_param)
- return p->pe->fill_param(p, skb);
- return 0;
+ p->pe->fill_param(p, skb);
}
/*
@@ -268,9 +267,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
- vaddr, vport, &param))
- return NULL;
+ ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param);
}
/* Check if a template already exists */
--
1.7.1
^ permalink raw reply related
* [PATCH 28/72] netfilter: ipt_LOG: add bufferisation to call printk() once
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Eric Dumazet <eric.dumazet@gmail.com>
ipt_LOG & ip6t_LOG use a lot of calls to printk() and use a lock in the hope
that several CPUs won't mix their output in syslog.
printk() being very expensive [1], it's better to call it once, on a
prebuilt and complete line. Also, with mixed IPv4 and IPv6 traffic,
separate IPv4/IPv6 locks don't avoid garbage.
I used an allocation of a 1024 bytes structure, sort of seq_printf() but
with a fixed size limit.
Use a static buffer if dynamic allocation failed.
Emit a one-time alert if the buffer size happens to be too short.
[1]: printk() has various features like printk_delay()...
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/ipt_LOG.c | 145 +++++++++++++++++++-------------------
net/ipv6/netfilter/ip6t_LOG.c | 157 +++++++++++++++++++++--------------------
2 files changed, 152 insertions(+), 150 deletions(-)
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17..72ffc8f 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_LOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Important fields:
* TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- printk("SRC=%pI4 DST=%pI4 ",
+ sb_add(m, "SRC=%pI4 DST=%pI4 ",
&ih->saddr, &ih->daddr);
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */
if (ntohs(ih->frag_off) & IP_CE)
- printk("CE ");
+ sb_add(m, "CE ");
if (ntohs(ih->frag_off) & IP_DF)
- printk("DF ");
+ sb_add(m, "DF ");
if (ntohs(ih->frag_off) & IP_MF)
- printk("MF ");
+ sb_add(m, "MF ");
/* Max length: 11 "FRAG:65535 " */
if (ntohs(ih->frag_off) & IP_OFFSET)
- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+ sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
if ((logflags & IPT_LOG_IPOPT) &&
ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
op = skb_header_pointer(skb, iphoff+sizeof(_iph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ sb_add(m, "PROTO=TCP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ sb_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IPT_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ sb_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3F " */
- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ sb_add(m, "CWR ");
if (th->ece)
- printk("ECE ");
+ sb_add(m, "ECE ");
if (th->urg)
- printk("URG ");
+ sb_add(m, "URG ");
if (th->ack)
- printk("ACK ");
+ sb_add(m, "ACK ");
if (th->psh)
- printk("PSH ");
+ sb_add(m, "PSH ");
if (th->rst)
- printk("RST ");
+ sb_add(m, "RST ");
if (th->syn)
- printk("SYN ");
+ sb_add(m, "SYN ");
if (th->fin)
- printk("FIN ");
+ sb_add(m, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IPT_LOG_TCPOPT) &&
th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
iphoff+ih->ihl*4+sizeof(_tcph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
break;
}
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (ih->protocol == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ sb_add(m, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ sb_add(m, "PROTO=UDPLITE ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
[ICMP_ADDRESSREPLY] = 12 };
/* Max length: 11 "PROTO=ICMP " */
- printk("PROTO=ICMP ");
+ sb_add(m, "PROTO=ICMP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_icmph), &_icmph);
if (ich == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+ sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
if (ich->type <= NR_ICMP_TYPES &&
required_len[ich->type] &&
skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMP_ECHOREPLY:
case ICMP_ECHO:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ sb_add(m, "ID=%u SEQ=%u ",
ntohs(ich->un.echo.id),
ntohs(ich->un.echo.sequence));
break;
case ICMP_PARAMETERPROB:
/* Max length: 14 "PARAMETER=255 " */
- printk("PARAMETER=%u ",
+ sb_add(m, "PARAMETER=%u ",
ntohl(ich->un.gateway) >> 24);
break;
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
- printk("GATEWAY=%pI4 ", &ich->un.gateway);
+ sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
/* Fall through */
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
- printk("[");
- dump_packet(info, skb,
+ sb_add(m, "[");
+ dump_packet(m, info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
- printk("] ");
+ sb_add(m, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ich->type == ICMP_DEST_UNREACH &&
ich->code == ICMP_FRAG_NEEDED)
- printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+ sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
}
break;
}
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
break;
/* Max length: 9 "PROTO=AH " */
- printk("PROTO=AH ");
+ sb_add(m, "PROTO=AH ");
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_ahdr), &_ahdr);
if (ah == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
break;
}
case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 10 "PROTO=ESP " */
- printk("PROTO=ESP ");
+ sb_add(m, "PROTO=ESP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_esph), &_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(eh->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
break;
}
/* Max length: 10 "PROTO 255 " */
default:
- printk("PROTO=%u ", ih->protocol);
+ sb_add(m, "PROTO=%u ", ih->protocol);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ sb_add(m, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!iphoff && skb->mark)
- printk("MARK=0x%x ", skb->mark);
+ sb_add(m, "MARK=0x%x ", skb->mark);
/* Proto Max log string length */
/* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
switch (dev->type) {
case ARPHRD_ETHER:
- printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
}
fallback:
- printk("MAC=");
+ sb_add(m, "MAC=");
if (dev->hard_header_len &&
skb->mac_header != skb->network_header) {
const unsigned char *p = skb_mac_header(skb);
unsigned int i;
- printk("%02x", *p++);
+ sb_add(m, "%02x", *p++);
for (i = 1; i < dev->hard_header_len; i++, p++)
- printk(":%02x", *p);
+ sb_add(m, ":%02x", *p);
}
- printk(" ");
+ sb_add(m, " ");
}
static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
+ struct sbuff *m = sb_open();
+
if (!loginfo)
loginfo = &default_loginfo;
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
prefix,
in ? in->name : "",
out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
physindev = skb->nf_bridge->physindev;
if (physindev && in != physindev)
- printk("PHYSIN=%s ", physindev->name);
+ sb_add(m, "PHYSIN=%s ", physindev->name);
physoutdev = skb->nf_bridge->physoutdev;
if (physoutdev && out != physoutdev)
- printk("PHYSOUT=%s ", physoutdev->name);
+ sb_add(m, "PHYSOUT=%s ", physoutdev->name);
}
#endif
/* MAC logging for input path only. */
if (in && !out)
- dump_mac_header(loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
+
+ dump_packet(m, loginfo, skb, 0);
- dump_packet(loginfo, skb, 0);
- printk("\n");
- spin_unlock_bh(&log_lock);
+ sb_close(m);
}
static unsigned int
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0a07ae7..09c8889 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -23,6 +23,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
#include <net/route.h>
#include <linux/netfilter_ipv6/ip6t_LOG.h>
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
{
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
- printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+ sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
(ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
if (hp == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 48 "OPT (...) " */
if (logflags & IP6T_LOG_IPOPT)
- printk("OPT ( ");
+ sb_add(m, "OPT ( ");
switch (currenthdr) {
case IPPROTO_FRAGMENT: {
struct frag_hdr _fhdr;
const struct frag_hdr *fh;
- printk("FRAG:");
+ sb_add(m, "FRAG:");
fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
&_fhdr);
if (fh == NULL) {
- printk("TRUNCATED ");
+ sb_add(m, "TRUNCATED ");
return;
}
/* Max length: 6 "65535 " */
- printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+ sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
/* Max length: 11 "INCOMPLETE " */
if (fh->frag_off & htons(0x0001))
- printk("INCOMPLETE ");
+ sb_add(m, "INCOMPLETE ");
- printk("ID:%08x ", ntohl(fh->identification));
+ sb_add(m, "ID:%08x ", ntohl(fh->identification));
if (ntohs(fh->frag_off) & 0xFFF8)
fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
case IPPROTO_HOPOPTS:
if (fragment) {
if (logflags & IP6T_LOG_IPOPT)
- printk(")");
+ sb_add(m, ")");
return;
}
hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_auth_hdr *ah;
/* Max length: 3 "AH " */
- printk("AH ");
+ sb_add(m, "AH ");
if (fragment) {
- printk(")");
+ sb_add(m, ")");
return;
}
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
* Max length: 26 "INCOMPLETE [65535
* bytes] )"
*/
- printk("INCOMPLETE [%u bytes] )",
+ sb_add(m, "INCOMPLETE [%u bytes] )",
skb->len - ptr);
return;
}
/* Length: 15 "SPI=0xF1234567 */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
}
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 4 "ESP " */
- printk("ESP ");
+ sb_add(m, "ESP ");
if (fragment) {
- printk(")");
+ sb_add(m, ")");
return;
}
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, ptr, sizeof(_esph),
&_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] )",
+ sb_add(m, "INCOMPLETE [%u bytes] )",
skb->len - ptr);
return;
}
/* Length: 16 "SPI=0xF1234567 )" */
- printk("SPI=0x%x )", ntohl(eh->spi) );
+ sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
}
return;
default:
/* Max length: 20 "Unknown Ext Hdr 255" */
- printk("Unknown Ext Hdr %u", currenthdr);
+ sb_add(m, "Unknown Ext Hdr %u", currenthdr);
return;
}
if (logflags & IP6T_LOG_IPOPT)
- printk(") ");
+ sb_add(m, ") ");
currenthdr = hp->nexthdr;
ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ sb_add(m, "PROTO=TCP ");
if (fragment)
break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ sb_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IP6T_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ sb_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3C " */
- printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ sb_add(m, "CWR ");
if (th->ece)
- printk("ECE ");
+ sb_add(m, "ECE ");
if (th->urg)
- printk("URG ");
+ sb_add(m, "URG ");
if (th->ack)
- printk("ACK ");
+ sb_add(m, "ACK ");
if (th->psh)
- printk("PSH ");
+ sb_add(m, "PSH ");
if (th->rst)
- printk("RST ");
+ sb_add(m, "RST ");
if (th->syn)
- printk("SYN ");
+ sb_add(m, "SYN ");
if (th->fin)
- printk("FIN ");
+ sb_add(m, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IP6T_LOG_TCPOPT) &&
th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
ptr + sizeof(struct tcphdr),
optsize, _opt);
if (op == NULL) {
- printk("OPT (TRUNCATED)");
+ sb_add(m, "OPT (TRUNCATED)");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i =0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
break;
}
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (currenthdr == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ sb_add(m, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ sb_add(m, "PROTO=UDPLITE ");
if (fragment)
break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct icmp6hdr *ic;
/* Max length: 13 "PROTO=ICMPv6 " */
- printk("PROTO=ICMPv6 ");
+ sb_add(m, "PROTO=ICMPv6 ");
if (fragment)
break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
if (ic == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+ sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
switch (ic->icmp6_type) {
case ICMPV6_ECHO_REQUEST:
case ICMPV6_ECHO_REPLY:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ sb_add(m, "ID=%u SEQ=%u ",
ntohs(ic->icmp6_identifier),
ntohs(ic->icmp6_sequence));
break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMPV6_PARAMPROB:
/* Max length: 17 "POINTER=ffffffff " */
- printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+ sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
/* Fall through */
case ICMPV6_DEST_UNREACH:
case ICMPV6_PKT_TOOBIG:
case ICMPV6_TIME_EXCEED:
/* Max length: 3+maxlen */
if (recurse) {
- printk("[");
- dump_packet(info, skb, ptr + sizeof(_icmp6h),
- 0);
- printk("] ");
+ sb_add(m, "[");
+ dump_packet(m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ sb_add(m, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
- printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+ sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
}
break;
}
/* Max length: 10 "PROTO=255 " */
default:
- printk("PROTO=%u ", currenthdr);
+ sb_add(m, "PROTO=%u ", currenthdr);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ sb_add(m, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!recurse && skb->mark)
- printk("MARK=0x%x ", skb->mark);
+ sb_add(m, "MARK=0x%x ", skb->mark);
}
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
switch (dev->type) {
case ARPHRD_ETHER:
- printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
return;
@@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
}
fallback:
- printk("MAC=");
+ sb_add(m, "MAC=");
if (dev->hard_header_len &&
skb->mac_header != skb->network_header) {
const unsigned char *p = skb_mac_header(skb);
@@ -408,19 +408,19 @@ fallback:
p = NULL;
if (p != NULL) {
- printk("%02x", *p++);
+ sb_add(m, "%02x", *p++);
for (i = 1; i < len; i++)
- printk(":%02x", p[i]);
+ sb_add(m, ":%02x", p[i]);
}
- printk(" ");
+ sb_add(m, " ");
if (dev->type == ARPHRD_SIT) {
const struct iphdr *iph =
(struct iphdr *)skb_mac_header(skb);
- printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+ sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
}
} else
- printk(" ");
+ sb_add(m, " ");
}
static struct nf_loginfo default_loginfo = {
@@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
+ struct sbuff *m = sb_open();
+
if (!loginfo)
loginfo = &default_loginfo;
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
+ sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
+ in ? in->name : "",
+ out ? out->name : "");
/* MAC logging for input path only. */
if (in && !out)
- dump_mac_header(loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
+
+ dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
- dump_packet(loginfo, skb, skb_network_offset(skb), 1);
- printk("\n");
- spin_unlock_bh(&log_lock);
+ sb_close(m);
}
static unsigned int
--
1.7.1
^ permalink raw reply related
* [PATCH 21/72] IPVS: Add struct ip_vs_pe
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
include/linux/ip_vs.h | 2 +
include/net/ip_vs.h | 28 +++++++++++++++-
net/netfilter/ipvs/ip_vs_conn.c | 67 ++++++++++++++++++++++++++++++++------
net/netfilter/ipvs/ip_vs_core.c | 36 +++++++++++++++++----
net/netfilter/ipvs/ip_vs_sync.c | 17 +++++++++-
5 files changed, 129 insertions(+), 21 deletions(-)
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index df77286..0a9c44d 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -99,8 +99,10 @@
0)
#define IP_VS_SCHEDNAME_MAXLEN 16
+#define IP_VS_PENAME_MAXLEN 16
#define IP_VS_IFNAME_MAXLEN 16
+#define IP_VS_PEDATA_MAXLEN 255
/*
* The struct ip_vs_service_user and struct ip_vs_dest_user are
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d4da774..b6b309d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -364,6 +364,10 @@ struct ip_vs_conn_param {
__be16 vport;
__u16 protocol;
u16 af;
+
+ const struct ip_vs_pe *pe;
+ char *pe_data;
+ __u8 pe_data_len;
};
/*
@@ -416,6 +420,9 @@ struct ip_vs_conn {
void *app_data; /* Application private data */
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
+
+ char *pe_data;
+ __u8 pe_data_len;
};
@@ -486,6 +493,9 @@ struct ip_vs_service {
struct ip_vs_scheduler *scheduler; /* bound scheduler object */
rwlock_t sched_lock; /* lock sched_data */
void *sched_data; /* scheduler application data */
+
+ /* alternate persistence engine */
+ struct ip_vs_pe *pe;
};
@@ -549,6 +559,20 @@ struct ip_vs_scheduler {
const struct sk_buff *skb);
};
+/* The persistence engine object */
+struct ip_vs_pe {
+ struct list_head n_list; /* d-linked list head */
+ char *name; /* scheduler name */
+ atomic_t refcnt; /* reference counter */
+ struct module *module; /* THIS_MODULE/NULL */
+
+ /* get the connection template, if any */
+ int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
+ bool (*ct_match)(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct);
+ u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
+ bool inverse);
+};
/*
* The application module object (a.k.a. app incarnation)
@@ -648,6 +672,8 @@ static inline void ip_vs_conn_fill_param(int af, int protocol,
p->cport = cport;
p->vaddr = vaddr;
p->vport = vport;
+ p->pe = NULL;
+ p->pe_data = NULL;
}
struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
@@ -803,7 +829,7 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
extern struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb);
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb);
extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index deeb906..06da21e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
& ip_vs_conn_tab_mask;
}
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+ bool inverse)
+{
+ const union nf_inet_addr *addr;
+ __be16 port;
+
+ if (p->pe && p->pe->hashkey_raw)
+ return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+ ip_vs_conn_tab_mask;
+
+ if (likely(!inverse)) {
+ addr = p->caddr;
+ port = p->cport;
+ } else {
+ addr = p->vaddr;
+ port = p->vport;
+ }
+
+ return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+ NULL, 0, &p);
+
+ if (cp->dest && cp->dest->svc->pe) {
+ p.pe = cp->dest->svc->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
/*
* Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
return 0;
/* Hash by protocol, client address and port */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
int ret;
/* unhash it and decrease its reference counter */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -227,7 +263,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
@@ -312,11 +348,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->pe && p->pe->ct_match) {
+ if (p->pe->ct_match(p, cp))
+ goto out;
+ continue;
+ }
+
if (cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
/* protocol should only be IPPROTO_IP if
@@ -325,15 +367,14 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
p->af, p->vaddr, &cp->vaddr) &&
p->cport == cp->cport && p->vport == cp->vport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
+ p->protocol == cp->protocol)
goto out;
- }
}
cp = NULL;
out:
+ if (cp)
+ atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
@@ -357,7 +398,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
/*
* Check for "full" addressed entries
*/
- hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport);
+ hash = ip_vs_conn_hashkey_param(p, true);
ct_read_lock(hash);
@@ -722,6 +763,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->flags & IP_VS_CONN_F_NFCT)
ip_vs_conn_drop_conntrack(cp);
+ kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
@@ -782,6 +824,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ }
spin_lock_init(&cp->lock);
/*
@@ -832,7 +878,6 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
return cp;
}
-
/*
* /proc/net/ip_vs_conn entries
*/
@@ -848,7 +893,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ return cp;
}
}
ct_read_unlock_bh(idx);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 87602a6..ab98893 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -176,6 +176,19 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
return pp->state_transition(cp, direction, skb, pp);
}
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+ struct sk_buff *skb, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ p->pe = svc->pe;
+ if (p->pe && p->pe->fill_param)
+ return p->pe->fill_param(p, skb);
+ return 0;
+}
/*
* IPVS persistent scheduling function
@@ -186,7 +199,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
__be16 ports[2])
{
struct ip_vs_conn *cp = NULL;
@@ -255,8 +268,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
vaddr = &fwmark;
}
}
- ip_vs_conn_fill_param(svc->af, protocol, &snet, 0,
- vaddr, vport, ¶m);
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, ¶m))
+ return NULL;
}
/* Check if a template already exists */
@@ -268,22 +282,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
dest = svc->scheduler->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
return NULL;
}
if (ports[1] == svc->port && svc->port != FTPPORT)
dport = dest->port;
- /* Create a template */
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
ct = ip_vs_conn_new(¶m, &dest->addr, dport,
IP_VS_CONN_F_TEMPLATE, dest);
- if (ct == NULL)
+ if (ct == NULL) {
+ kfree(param.pe_data);
return NULL;
+ }
ct->timeout = svc->timeout;
- } else
+ } else {
/* set destination with the found template */
dest = ct->dest;
+ kfree(param.pe_data);
+ }
dport = ports[1];
if (dport == svc->port && dest->port)
@@ -322,7 +344,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* Protocols supported: TCP, UDP
*/
struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index f68631f..ab85aed 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
ip_vs_sync_conn(cp->control);
}
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ /* XXX: Need to take into account persistence engine */
+ ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+ return 0;
+}
/*
* Process received multicast message and create the corresponding
@@ -372,11 +382,14 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
}
{
- ip_vs_conn_fill_param(AF_INET, s->protocol,
+ if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
(union nf_inet_addr *)&s->caddr,
s->cport,
(union nf_inet_addr *)&s->vaddr,
- s->vport, ¶m);
+ s->vport, ¶m)) {
+ pr_err("ip_vs_conn_fill_param_sync failed");
+ return;
+ }
if (!(flags & IP_VS_CONN_F_TEMPLATE))
cp = ip_vs_conn_in_get(¶m);
else
--
1.7.1
^ permalink raw reply related
* [PATCH 19/72] IPVS: Allow null argument to ip_vs_scheduler_put()
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
This simplifies caller logic slightly.
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_ctl.c | 13 ++++---------
net/netfilter/ipvs/ip_vs_sched.c | 2 +-
2 files changed, 5 insertions(+), 10 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e4ec8f3..f7afcfe 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1144,7 +1144,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
if (sched == NULL) {
pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
ret = -ENOENT;
- goto out_mod_dec;
+ goto out_err;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1204,7 +1204,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
- out_err:
+ out_err:
if (svc != NULL) {
if (svc->scheduler)
ip_vs_unbind_scheduler(svc);
@@ -1217,7 +1217,6 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
ip_vs_scheduler_put(sched);
- out_mod_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1300,10 +1299,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
#ifdef CONFIG_IP_VS_IPV6
out:
#endif
-
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
-
+ ip_vs_scheduler_put(old_sched);
return ret;
}
@@ -1327,8 +1323,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/* Unbind scheduler */
old_sched = svc->scheduler;
ip_vs_unbind_scheduler(svc);
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
+ ip_vs_scheduler_put(old_sched);
/* Unbind app inc */
if (svc->inc) {
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 727e45b..9f94e32 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -159,7 +159,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler->module)
+ if (scheduler && scheduler->module)
module_put(scheduler->module);
}
--
1.7.1
^ permalink raw reply related
* [PATCH 22/72] IPVS: Add persistence engine data to /proc/net/ip_vs_conn
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
This shouldn't break compatibility with userspace as the new data
is at the end of the line.
I have confirmed that this doesn't break ipvsadm, the main (only?)
user-space user of this data.
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
include/net/ip_vs.h | 1 +
net/netfilter/ipvs/ip_vs_conn.c | 25 ++++++++++++++++++++-----
2 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b6b309d..974daf5 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -572,6 +572,7 @@ struct ip_vs_pe {
struct ip_vs_conn *ct);
u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
bool inverse);
+ int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
};
/*
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 06da21e..4adedef 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -950,30 +950,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq,
- "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+
+ if (cp->dest && cp->dest->svc->pe &&
+ cp->dest->svc->pe->show_pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->dest->svc->pe->name);
+ memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->dest->svc->pe->show_pe_data(cp,
+ pe_data + len);
+ }
+ pe_data[len] = '\0';
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
- seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%pI6 %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
&cp->daddr.in6, ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X"
- " %08X %04X %-11s %7lu\n",
+ " %08X %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
ntohl(cp->daddr.ip), ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
}
return 0;
}
--
1.7.1
^ permalink raw reply related
* [PATCH 16/72] netfilter: nf_conntrack_sip: Add callid parser
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
include/linux/netfilter/nf_conntrack_sip.h | 1 +
net/netfilter/nf_conntrack_sip.c | 39 ++++++++++++++++++++++++++++
2 files changed, 40 insertions(+), 0 deletions(-)
diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h
index ff8cfbc..0ce91d5 100644
--- a/include/linux/netfilter/nf_conntrack_sip.h
+++ b/include/linux/netfilter/nf_conntrack_sip.h
@@ -89,6 +89,7 @@ enum sip_header_types {
SIP_HDR_VIA_TCP,
SIP_HDR_EXPIRES,
SIP_HDR_CONTENT_LENGTH,
+ SIP_HDR_CALL_ID,
};
enum sdp_header_types {
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 2fd1ea2..715ce54 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
return len;
}
+static int iswordc(const char c)
+{
+ if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+ (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+ c == '{' || c == '}' || c == '~')
+ return 1;
+ return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+ int len = 0;
+ while (dptr < limit && iswordc(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len, domain_len;
+
+ len = word_len(dptr, limit);
+ dptr += len;
+ if (!len || dptr == limit || *dptr != '@')
+ return len;
+ dptr++;
+ len++;
+
+ domain_len = word_len(dptr, limit);
+ if (!domain_len)
+ return 0;
+ return len + domain_len;
+}
+
/* get media type + port length */
static int media_len(const struct nf_conn *ct, const char *dptr,
const char *limit, int *shift)
@@ -299,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
[SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
[SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
[SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
+ [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
};
static const char *sip_follow_continuation(const char *dptr, const char *limit)
--
1.7.1
^ permalink raw reply related
* [PATCH 14/72] netfilter: ctnetlink: add support for user-space expectation helpers
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Pablo Neira Ayuso <pablo@netfilter.org>
This patch adds the basic infrastructure to support user-space
expectation helpers via ctnetlink and the netfilter queuing
infrastructure NFQUEUE. Basically, this patch:
* adds NF_CT_EXPECT_USERSPACE flag to identify user-space
created expectations. I have also added a sanity check in
__nf_ct_expect_check() to prevent kernel-space helpers from
creating an expectation if the master conntrack has no
helper assigned.
* adds some branches to check if the master conntrack helper
exists, otherwise we skip the code that refers to kernel-space
helper such as the local expectation list and the expectation
policy.
* allows setting the timeout for user-space expectations with
no helper assigned.
* adds a list of expectations created from user-space that depends
on ctnetlink (if this module is removed, they are deleted).
* includes USERSPACE in the /proc output for expectations
that have been created by a user-space helper.
This patch also modifies ctnetlink to skip including the helper
name in the Netlink messages if no kernel-space helper is set
(since user-space expectations have no kernel-space helper
assigned).
You can access an example user-space FTP conntrack helper at:
http://people.netfilter.org/pablo/userspace-conntrack-helpers/nf-ftp-helper-userspace-POC.tar.bz
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/netfilter/nf_conntrack_common.h | 1 +
include/net/netfilter/nf_conntrack_expect.h | 1 +
net/netfilter/nf_conntrack_expect.c | 62 +++++++++++++++++++------
net/netfilter/nf_conntrack_netlink.c | 46 ++++++++++++------
4 files changed, 79 insertions(+), 31 deletions(-)
diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index fdc50ca..23a1a08 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -103,6 +103,7 @@ enum ip_conntrack_expect_events {
/* expectation flags */
#define NF_CT_EXPECT_PERMANENT 0x1
#define NF_CT_EXPECT_INACTIVE 0x2
+#define NF_CT_EXPECT_USERSPACE 0x4
#ifdef __KERNEL__
struct ip_conntrack_stat {
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 96bb42a..416b838 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -85,6 +85,7 @@ nf_ct_find_expectation(struct net *net, u16 zone,
void nf_ct_unlink_expect(struct nf_conntrack_expect *exp);
void nf_ct_remove_expectations(struct nf_conn *ct);
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp);
+void nf_ct_remove_userspace_expectations(void);
/* Allocate space for an expectation: this is mandatory before calling
nf_ct_expect_related. You will have to call put afterwards. */
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29cc..b30a1f2 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,20 +38,23 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
- NF_CT_ASSERT(master_help);
NF_CT_ASSERT(!timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
net->ct.expect_count--;
hlist_del(&exp->lnode);
- master_help->expecting[exp->class]--;
+ if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+ master_help->expecting[exp->class]--;
+
nf_ct_expect_put(exp);
NF_CT_STAT_INC(net, expect_delete);
@@ -320,16 +323,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
atomic_inc(&exp->use);
- hlist_add_head(&exp->lnode, &master_help->expectations);
- master_help->expecting[exp->class]++;
+ if (master_help) {
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
+ } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+ hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
- p = &master_help->helper->expect_policy[exp->class];
- exp->timeout.expires = jiffies + p->timeout * HZ;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[exp->class];
+ exp->timeout.expires = jiffies + p->timeout * HZ;
+ }
add_timer(&exp->timeout);
atomic_inc(&exp->use);
@@ -380,7 +388,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
unsigned int h;
int ret = 1;
- if (!master_help->helper) {
+ /* Don't allow expectations created from kernel-space with no helper */
+ if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+ (!master_help || (master_help && !master_help->helper))) {
ret = -ESHUTDOWN;
goto out;
}
@@ -398,13 +408,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
}
/* Will be over limit? */
- p = &master_help->helper->expect_policy[expect->class];
- if (p->max_expected &&
- master_help->expecting[expect->class] >= p->max_expected) {
- evict_oldest_expect(master, expect);
- if (master_help->expecting[expect->class] >= p->max_expected) {
- ret = -EMFILE;
- goto out;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[expect->class];
+ if (p->max_expected &&
+ master_help->expecting[expect->class] >= p->max_expected) {
+ evict_oldest_expect(master, expect);
+ if (master_help->expecting[expect->class]
+ >= p->max_expected) {
+ ret = -EMFILE;
+ goto out;
+ }
}
}
@@ -439,6 +452,21 @@ out:
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+void nf_ct_remove_userspace_expectations(void)
+{
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *n, *next;
+
+ hlist_for_each_entry_safe(exp, n, next,
+ &nf_ct_userspace_expect_list, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
#ifdef CONFIG_PROC_FS
struct ct_expect_iter_state {
struct seq_net_private p;
@@ -529,8 +557,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "PERMANENT");
delim = ",";
}
- if (expect->flags & NF_CT_EXPECT_INACTIVE)
+ if (expect->flags & NF_CT_EXPECT_INACTIVE) {
seq_printf(s, "%sINACTIVE", delim);
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_USERSPACE)
+ seq_printf(s, "%sUSERSPACE", delim);
helper = rcu_dereference(nfct_help(expect->master)->helper);
if (helper) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 0804e0e..b4077be 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1560,8 +1560,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
- struct nf_conntrack_helper *helper;
long timeout = (exp->timeout.expires - jiffies) / HZ;
+ struct nf_conn_help *help;
if (timeout < 0)
timeout = 0;
@@ -1578,9 +1578,14 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
- helper = rcu_dereference(nfct_help(master)->helper);
- if (helper)
- NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ help = nfct_help(master);
+ if (help) {
+ struct nf_conntrack_helper *helper;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ }
return 0;
@@ -1921,24 +1926,32 @@ ctnetlink_create_expect(struct net *net, u16 zone,
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- help = nfct_help(ct);
-
- if (!help || !help->helper) {
- /* such conntrack hasn't got any helper, abort */
- err = -EOPNOTSUPP;
- goto out;
- }
-
exp = nf_ct_expect_alloc(ct);
if (!exp) {
err = -ENOMEM;
goto out;
}
+ help = nfct_help(ct);
+ if (!help) {
+ if (!cda[CTA_EXPECT_TIMEOUT]) {
+ err = -EINVAL;
+ goto out;
+ }
+ exp->timeout.expires =
+ jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
- if (cda[CTA_EXPECT_FLAGS])
- exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
- else
- exp->flags = 0;
+ exp->flags = NF_CT_EXPECT_USERSPACE;
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags |=
+ ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ }
+ } else {
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+ } else
+ exp->flags = 0;
+ }
exp->class = 0;
exp->expectfn = NULL;
@@ -2109,6 +2122,7 @@ static void __exit ctnetlink_exit(void)
{
pr_info("ctnetlink: unregistering from nfnetlink.\n");
+ nf_ct_remove_userspace_expectations();
#ifdef CONFIG_NF_CONNTRACK_EVENTS
nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
nf_conntrack_unregister_notifier(&ctnl_notifier);
--
1.7.1
^ permalink raw reply related
* [PATCH 15/72] netfilter: nf_conntrack_sip: Allow ct_sip_get_header() to be called with a null ct argument
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Signed-off-by: Simon Horman <horms@verge.net.au>
Acked-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/nf_conntrack_sip.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d8922..2fd1ea2 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -152,6 +152,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
const char *end;
int ret = 0;
+ if (!ct)
+ return 0;
+
memset(addr, 0, sizeof(*addr));
switch (nf_ct_l3num(ct)) {
case AF_INET:
--
1.7.1
^ permalink raw reply related
* [PATCH 10/72] ipvs: changes related to service usecnt
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Julian Anastasov <ja@ssi.bg>
Change the usage of svc usecnt during command execution:
- we check if svc is registered but we do not need to hold usecnt
reference while under __ip_vs_mutex, only the packet handling needs
it during scheduling
- change __ip_vs_service_get to __ip_vs_service_find and
__ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller
will increase svc->usecnt
- put common code that calls update_service in __ip_vs_update_dest
- put common code in ip_vs_unlink_service() and use it to unregister
the service
- add comment that svc should not be accessed after ip_vs_del_service
anymore
- all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0
- Properly log the app ports
As a result, some problems are fixed:
- possible use-after-free of svc in ip_vs_genl_set_cmd after
ip_vs_del_service because our usecnt reference does not guarantee that
svc is not freed on refcnt==0, eg. when no dests are moved to trash
- possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service
when the service is not freed now, for example, when some
destinations are moved into trash and svc->refcnt remains above 0.
It is harmless because svc is not in hash anymore.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/ipvs/ip_vs_app.c | 6 +-
net/netfilter/ipvs/ip_vs_ctl.c | 250 ++++++++++++++++------------------------
2 files changed, 102 insertions(+), 154 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index e76f87f..a475ede 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
goto out;
list_add(&inc->a_list, &app->incs_list);
- IP_VS_DBG(9, "%s application %s:%u registered\n",
- pp->name, inc->name, inc->port);
+ IP_VS_DBG(9, "%s App %s:%u registered\n",
+ pp->name, inc->name, ntohs(inc->port));
return 0;
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
pp->unregister_app(inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
- pp->name, inc->name, inc->port);
+ pp->name, inc->name, ntohs(inc->port));
list_del(&inc->a_list);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e637cd0..e4ec8f3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -405,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
* Get service by {proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
__be16 vport)
{
unsigned hash;
@@ -420,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
&& (svc->port == vport)
&& (svc->protocol == protocol)) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -433,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
@@ -444,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -463,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+ if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
@@ -480,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
@@ -488,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(af, protocol, vaddr, 0);
}
out:
+ if (svc)
+ atomic_inc(&svc->usecnt);
read_unlock(&__ip_vs_svc_lock);
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -510,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
dest->svc = svc;
}
-static inline void
+static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = dest->svc;
dest->svc = NULL;
- if (atomic_dec_and_test(&svc->refcnt))
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
}
@@ -762,8 +767,8 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
* Update a destination in the given service
*/
static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
- struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
{
int conn_flags;
@@ -818,6 +823,25 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
+
+ if (add)
+ ip_vs_new_estimator(&dest->stats);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /* Wait until all other svc users go away */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+ if (add) {
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ }
+
+ /* call the update_service, because server weight may be changed */
+ if (svc->scheduler->update_service)
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -865,13 +889,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
- atomic_set(&dest->refcnt, 0);
+ atomic_set(&dest->refcnt, 1);
INIT_LIST_HEAD(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
- __ip_vs_update_dest(svc, dest, udest);
- ip_vs_new_estimator(&dest->stats);
+ __ip_vs_update_dest(svc, dest, udest, 1);
*dest_p = dest;
@@ -931,65 +954,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- __ip_vs_update_dest(svc, dest, udest);
-
/*
* Get the destination from the trash
*/
list_del(&dest->n_list);
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 1);
+ ret = 0;
+ } else {
/*
- * Wait until all other svc users go away.
+ * Allocate and initialize the dest structure
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
- }
-
- /*
- * Allocate and initialize the dest structure
- */
- ret = ip_vs_new_dest(svc, udest, &dest);
- if (ret) {
- return ret;
+ ret = ip_vs_new_dest(svc, udest, &dest);
}
-
- /*
- * Add the dest entry into the list
- */
- atomic_inc(&dest->refcnt);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
LeaveFunction(2);
- return 0;
+ return ret;
}
@@ -1028,19 +1008,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ENOENT;
}
- __ip_vs_update_dest(svc, dest, udest);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 0);
LeaveFunction(2);
return 0;
@@ -1067,6 +1035,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
* the destination into the trash.
*/
if (atomic_dec_and_test(&dest->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
ip_vs_dst_reset(dest);
/* simply decrease svc->refcnt here, let the caller check
and release the service if nobody refers to it.
@@ -1133,7 +1105,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Unlink dest from the service
@@ -1190,7 +1162,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 1);
+ atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1284,7 +1256,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Set the flags and timeout value
@@ -1383,21 +1355,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0)
+ if (atomic_read(&svc->refcnt) == 0) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
- * Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
{
- if (svc == NULL)
- return -EEXIST;
-
/*
* Unhash it from the service table
*/
@@ -1408,11 +1382,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Wait until all the svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+ ip_vs_unlink_service(svc);
return 0;
}
@@ -1431,14 +1415,7 @@ static int ip_vs_flush(void)
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -1448,14 +1425,7 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -2168,15 +2138,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ &usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
ret = -ESRCH;
- goto out_drop_service;
+ goto out_unlock;
}
switch (cmd) {
@@ -2210,10 +2180,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = -EINVAL;
}
-out_drop_service:
- if (svc)
- ip_vs_service_put(svc);
-
out_unlock:
mutex_unlock(&__ip_vs_mutex);
out_dec:
@@ -2306,10 +2272,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
- get->port);
+ svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ get->port);
if (svc) {
int count = 0;
@@ -2337,7 +2303,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
count++;
}
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
return ret;
@@ -2458,15 +2423,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(AF_INET, entry->protocol,
+ &addr, entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
ret = -EFAULT;
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
}
@@ -2733,10 +2697,12 @@ nla_put_failure:
}
static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
- struct nlattr *nla, int full_entry)
+ struct nlattr *nla, int full_entry,
+ struct ip_vs_service **ret_svc)
{
struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+ struct ip_vs_service *svc;
/* Parse mandatory identifying service fields first */
if (nla == NULL ||
@@ -2772,12 +2738,18 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
usvc->fwmark = 0;
}
+ if (usvc->fwmark)
+ svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ else
+ svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ &usvc->addr, usvc->port);
+ *ret_svc = svc;
+
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_sched, *nla_flags, *nla_timeout,
*nla_netmask;
struct ip_vs_flags flags;
- struct ip_vs_service *svc;
nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
@@ -2790,16 +2762,8 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
nla_memcpy(&flags, nla_flags, sizeof(flags));
/* prefill flags from service if it already exists */
- if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
- else
- svc = __ip_vs_service_get(usvc->af, usvc->protocol,
- &usvc->addr, usvc->port);
- if (svc) {
+ if (svc)
usvc->flags = svc->flags;
- ip_vs_service_put(svc);
- } else
- usvc->flags = 0;
/* set new flags from userland */
usvc->flags = (usvc->flags & ~flags.mask) |
@@ -2815,17 +2779,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0);
- if (ret)
- return ERR_PTR(ret);
-
- if (usvc.fwmark)
- return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
- else
- return __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ return ret ? ERR_PTR(ret) : svc;
}
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2916,7 +2874,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
nla_put_failure:
cb->args[0] = idx;
- ip_vs_service_put(svc);
out_err:
mutex_unlock(&__ip_vs_mutex);
@@ -3129,17 +3086,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
ret = ip_vs_genl_parse_service(&usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
- need_full_svc);
+ need_full_svc, &svc);
if (ret)
goto out;
- /* Lookup the exact service by <protocol, addr, port> or fwmark */
- if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
- else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
/* Unless we're adding a new service, the service must already exist */
if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
ret = -ESRCH;
@@ -3173,6 +3123,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
break;
case IPVS_CMD_DEL_SERVICE:
ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
break;
case IPVS_CMD_NEW_DEST:
ret = ip_vs_add_dest(svc, &udest);
@@ -3191,8 +3142,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
out:
- if (svc)
- ip_vs_service_put(svc);
mutex_unlock(&__ip_vs_mutex);
return ret;
@@ -3238,7 +3187,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
goto out_err;
} else if (svc) {
ret = ip_vs_genl_fill_service(msg, svc);
- ip_vs_service_put(svc);
if (ret)
goto nla_put_failure;
} else {
--
1.7.1
^ permalink raw reply related
* [PATCH 13/72] netfilter: ctnetlink: allow to specify the expectation flags
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Pablo Neira Ayuso <pablo@netfilter.org>
With this patch, you can specify the expectation flags for user-space
created expectations.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/netfilter/nf_conntrack_common.h | 4 ++++
include/linux/netfilter/nfnetlink_conntrack.h | 1 +
include/net/netfilter/nf_conntrack_expect.h | 3 ---
net/netfilter/nf_conntrack_netlink.c | 8 +++++++-
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 1afd18c..fdc50ca 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -100,6 +100,10 @@ enum ip_conntrack_expect_events {
IPEXP_NEW, /* new expectation */
};
+/* expectation flags */
+#define NF_CT_EXPECT_PERMANENT 0x1
+#define NF_CT_EXPECT_INACTIVE 0x2
+
#ifdef __KERNEL__
struct ip_conntrack_stat {
unsigned int searched;
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 9ed534c..455f0ce 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -161,6 +161,7 @@ enum ctattr_expect {
CTA_EXPECT_ID,
CTA_EXPECT_HELP_NAME,
CTA_EXPECT_ZONE,
+ CTA_EXPECT_FLAGS,
__CTA_EXPECT_MAX
};
#define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1)
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 11e8150..96bb42a 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -67,9 +67,6 @@ struct nf_conntrack_expect_policy {
#define NF_CT_EXPECT_CLASS_DEFAULT 0
-#define NF_CT_EXPECT_PERMANENT 0x1
-#define NF_CT_EXPECT_INACTIVE 0x2
-
int nf_conntrack_expect_init(struct net *net);
void nf_conntrack_expect_fini(struct net *net);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 37533a3..0804e0e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1577,6 +1577,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
+ NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
helper = rcu_dereference(nfct_help(master)->helper);
if (helper)
NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
@@ -1734,6 +1735,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_ID] = { .type = NLA_U32 },
[CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING },
[CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
};
static int
@@ -1933,9 +1935,13 @@ ctnetlink_create_expect(struct net *net, u16 zone,
goto out;
}
+ if (cda[CTA_EXPECT_FLAGS])
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ else
+ exp->flags = 0;
+
exp->class = 0;
exp->expectfn = NULL;
- exp->flags = 0;
exp->master = ct;
exp->helper = NULL;
memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
--
1.7.1
^ permalink raw reply related
* [PATCH 09/72] netfilter: save the hash of the tuple in the original direction for latter use
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
Since we don't change the tuple in the original direction, we can save its
hash in ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev for
__nf_conntrack_confirm() to use.
__hash_conntrack() is split into two steps: hash_conntrack_raw() is used
to get the raw hash, and __hash_bucket() is used to get the bucket id.
In the SYN-flood case, early_drop() doesn't need to recompute the hash.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/nf_conntrack_core.c | 112 +++++++++++++++++++++++++-----------
1 files changed, 78 insertions(+), 34 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4c0ad9b..1eacf8d 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -67,29 +67,40 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
{
unsigned int n;
- u_int32_t h;
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- h = jhash2((u32 *)tuple, n,
- zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ (((__force __u16)tuple->dst.u.all << 16) |
+ tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+ return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+ return __hash_bucket(hash, net->ct.htable_size);
+}
- return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
+{
+ return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
}
static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size,
- nf_conntrack_hash_rnd);
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
}
bool
@@ -291,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
* OR
* - Caller must lock nf_conntrack_lock before calling this function
*/
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int bucket = hash_bucket(hash, net);
/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
*/
local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
if (nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
NF_CT_STAT_INC(net, found);
@@ -318,7 +329,7 @@ begin:
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
- if (get_nulls_value(n) != hash) {
+ if (get_nulls_value(n) != bucket) {
NF_CT_STAT_INC(net, search_restart);
goto begin;
}
@@ -326,19 +337,27 @@ begin:
return NULL;
}
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return ____nf_conntrack_find(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);
/* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
rcu_read_lock();
begin:
- h = __nf_conntrack_find(net, zone, tuple);
+ h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (unlikely(nf_ct_is_dying(ct) ||
@@ -356,6 +375,14 @@ begin:
return h;
}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -408,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
- hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ repl_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* We're not in hash table, and we refuse to set up related
connections for unconfirmed conns. But packet copies and
@@ -566,10 +596,11 @@ static noinline int early_drop(struct net *net, unsigned int hash)
return dropped;
}
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_tuple *repl,
- gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp, u32 hash)
{
struct nf_conn *ct;
@@ -585,6 +616,9 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
get_random_bytes(&rand, sizeof(rand));
} while (!rand);
cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+ /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+ hash = hash_conntrack_raw(orig, zone);
}
/* We don't want any race condition at early drop stage */
@@ -592,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
- unsigned int hash = hash_conntrack(net, zone, orig);
- if (!early_drop(net, hash)) {
+ if (!early_drop(net, hash_bucket(hash, net))) {
atomic_dec(&net->ct.count);
if (net_ratelimit())
printk(KERN_WARNING
@@ -623,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
- ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+ /* save hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
@@ -650,6 +684,14 @@ out_free:
return ERR_PTR(-ENOMEM);
#endif
}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
+{
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
@@ -671,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
@@ -685,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}
- ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ hash);
if (IS_ERR(ct)) {
pr_debug("Can't allocate conntrack.\n");
return (struct nf_conntrack_tuple_hash *)ct;
@@ -762,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
@@ -771,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
}
/* look for tuple match */
- h = nf_conntrack_find_get(net, zone, &tuple);
+ hash = hash_conntrack_raw(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
- skb, dataoff);
+ skb, dataoff, hash);
if (!h)
return NULL;
if (IS_ERR(h))
@@ -1314,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize,
- nf_conntrack_hash_rnd);
+ hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
--
1.7.1
^ permalink raw reply related
* [PATCH 08/72] ipvs: make rerouting optional with snat_reroute
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Julian Anastasov <ja@ssi.bg>
Add new sysctl flag "snat_reroute". Recent kernels use
ip_route_me_harder() to route LVS-NAT responses properly by
VIP when there are multiple paths to the client. But setups
that do not have alternative default routes can skip this
routing lookup by using snat_reroute=0.
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/net/ip_vs.h | 1 +
net/netfilter/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++--------
net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++
3 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e8ec523..3915a4f 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -801,6 +801,7 @@ extern int sysctl_ip_vs_expire_quiescent_template;
extern int sysctl_ip_vs_sync_threshold[2];
extern int sysctl_ip_vs_nat_icmp_send;
extern int sysctl_ip_vs_conntrack;
+extern int sysctl_ip_vs_snat_reroute;
extern struct ip_vs_stats ip_vs_stats;
extern const struct ctl_path net_vs_ctl_path[];
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 7fbc80d..06c388b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -929,20 +929,31 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
ip_send_check(ip_hdr(skb));
}
+ /*
+ * nf_iterate does not expect change in the skb->dst->dev.
+ * It looks like it is not fatal to enable this code for hooks
+ * where our handlers are at the end of the chain list and
+ * when all next handlers use skb->dst->dev and not outdev.
+ * It will definitely route properly the inout NAT traffic
+ * when multiple paths are used.
+ */
+
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
* if it came from this machine itself. So re-compute
* the routing information.
*/
+ if (sysctl_ip_vs_snat_reroute) {
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6) {
- if (ip6_route_me_harder(skb) != 0)
- goto drop;
- } else
+ if (af == AF_INET6) {
+ if (ip6_route_me_harder(skb) != 0)
+ goto drop;
+ } else
#endif
- if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
- goto drop;
+ if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ goto drop;
+ }
IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
@@ -991,8 +1002,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related, verdict = ip_vs_out_icmp_v6(skb, &related);
- if (related)
+ if (related) {
+ if (sysctl_ip_vs_snat_reroute &&
+ NF_ACCEPT == verdict &&
+ ip6_route_me_harder(skb))
+ verdict = NF_DROP;
return verdict;
+ }
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
} else
@@ -1000,8 +1016,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_out_icmp(skb, &related);
- if (related)
+ if (related) {
+ if (sysctl_ip_vs_snat_reroute &&
+ NF_ACCEPT == verdict &&
+ ip_route_me_harder(skb, RTN_LOCAL))
+ verdict = NF_DROP;
return verdict;
+ }
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d2d842f..e637cd0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -91,6 +91,7 @@ int sysctl_ip_vs_nat_icmp_send = 0;
#ifdef CONFIG_IP_VS_NFCT
int sysctl_ip_vs_conntrack;
#endif
+int sysctl_ip_vs_snat_reroute = 1;
#ifdef CONFIG_IP_VS_DEBUG
@@ -1599,6 +1600,13 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+ {
+ .procname = "snat_reroute",
+ .data = &sysctl_ip_vs_snat_reroute,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#if 0
{
.procname = "timeout_established",
--
1.7.1
^ permalink raw reply related
* [PATCH 04/72] netfilter: nf_nat: no IP_NAT_RANGE_MAP_IPS flags when alloc_null_binding()
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
In alloc_null_binding(), the absence of IP_NAT_RANGE_MAP_IPS in flags means no IP address
translation is needed. It isn't necessary to specify the address explicitly.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/nf_nat_rule.c | 17 ++++++++---------
1 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319..21c3042 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- Use reply in case it's already been mangled (eg local packet).
*/
- __be32 ip
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
- : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
- struct nf_nat_range range
- = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
- pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+ struct nf_nat_range range;
+
+ range.flags = 0;
+ pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+ HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
--
1.7.1
^ permalink raw reply related
* [PATCH 05/72] netfilter: nf_conntrack: fix the hash random initializing race
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
nf_conntrack_alloc() isn't called with nf_conntrack_lock locked, so the hash
random initialization code may be executed more than once on different
CPUs.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/nf_conntrack_core.c | 19 +++++++++++++------
1 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb..4c0ad9b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
u16 zone, unsigned int size, unsigned int rnd)
@@ -574,10 +573,18 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd_initted)) {
- get_random_bytes(&nf_conntrack_hash_rnd,
- sizeof(nf_conntrack_hash_rnd));
- nf_conntrack_hash_rnd_initted = 1;
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ unsigned int rand;
+
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
}
/* We don't want any race condition at early drop stage */
--
1.7.1
^ permalink raw reply related
* [PATCH 06/72] ipvs: extend connection flags to 32 bits
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Julian Anastasov <ja@ssi.bg>
- the sync protocol supports 16 bits only, so bits 0..15 should be
used only for flags that should go to backup server, bits 16 and
above should be allocated for flags not sent to backup.
- use IP_VS_CONN_F_DEST_MASK as mask of connection flags in
destination that can be changed by user space
- allow IP_VS_CONN_F_ONE_PACKET to be set in destination
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/ip_vs.h | 8 ++++++++
include/net/ip_vs.h | 2 +-
net/netfilter/ipvs/ip_vs_conn.c | 16 ++++++++++------
net/netfilter/ipvs/ip_vs_core.c | 11 ++++++-----
net/netfilter/ipvs/ip_vs_ctl.c | 5 +++--
5 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 9708de2..003d75f 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -70,6 +70,7 @@
/*
* IPVS Connection Flags
+ * Only flags 0..15 are sent to backup server
*/
#define IP_VS_CONN_F_FWD_MASK 0x0007 /* mask for the fwd methods */
#define IP_VS_CONN_F_MASQ 0x0000 /* masquerading/NAT */
@@ -88,6 +89,13 @@
#define IP_VS_CONN_F_TEMPLATE 0x1000 /* template, not connection */
#define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */
+/* Flags that are not sent to backup server start from bit 16 */
+
+/* Connection flags from destination that can be changed by user space */
+#define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \
+ IP_VS_CONN_F_ONE_PACKET | \
+ 0)
+
#define IP_VS_SCHEDNAME_MAXLEN 16
#define IP_VS_IFNAME_MAXLEN 16
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index f976885..62698a9 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -366,6 +366,7 @@ struct ip_vs_conn {
union nf_inet_addr caddr; /* client address */
union nf_inet_addr vaddr; /* virtual address */
union nf_inet_addr daddr; /* destination address */
+ volatile __u32 flags; /* status flags */
__be16 cport;
__be16 vport;
__be16 dport;
@@ -378,7 +379,6 @@ struct ip_vs_conn {
/* Flags and state transition */
spinlock_t lock; /* lock for state transition */
- volatile __u16 flags; /* status flags */
volatile __u16 state; /* state info */
volatile __u16 old_state; /* old state, to be used for
* state transition triggerd
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b71c69a..9fe1da7 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -505,6 +505,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
+ unsigned int conn_flags;
+
/* if dest is NULL, then return directly */
if (!dest)
return;
@@ -512,16 +514,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
/* Increase the refcnt counter of the dest */
atomic_inc(&dest->refcnt);
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
/* Bind with the destination and its corresponding transmitter */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+ if (cp->flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- cp->flags |= atomic_read(&dest->conn_flags) &
- (~IP_VS_CONN_F_INACTIVE);
- else
- cp->flags |= atomic_read(&dest->conn_flags);
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ cp->flags |= conn_flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0c043b6..319991d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -194,7 +194,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
__be16 dport; /* destination port to forward */
- __be16 flags;
+ unsigned int flags;
union nf_inet_addr snet; /* source network of the client,
after masking */
@@ -382,7 +382,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
- __be16 _ports[2], *pptr, flags;
+ __be16 _ports[2], *pptr;
+ unsigned int flags;
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -473,9 +474,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
- __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
- iph.protocol == IPPROTO_UDP)?
- IP_VS_CONN_F_ONE_PACKET : 0;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph.protocol == IPPROTO_UDP)?
+ IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
ip_vs_service_put(svc);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ca8ec8c..7bd41d2 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -765,7 +765,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
- conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
/* check if local node and update the flags */
#ifdef CONFIG_IP_VS_IPV6
@@ -782,7 +783,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
}
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
- if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
--
1.7.1
^ permalink raw reply related
* [PATCH 02/72] netfilter: use NFPROTO_IPV4 instead of AF_INET
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
The field family of xt_target should be NFPROTO_IPV4, though
NFPROTO_IPV4 and AF_INET are the same.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/xt_TPROXY.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index c61294d..21bb2af 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -76,7 +76,7 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
static struct xt_target tproxy_tg_reg __read_mostly = {
.name = "TPROXY",
- .family = AF_INET,
+ .family = NFPROTO_IPV4,
.table = "mangle",
.target = tproxy_tg,
.targetsize = sizeof(struct xt_tproxy_target_info),
--
1.7.1
^ permalink raw reply related
* [PATCH 03/72] netfilter: nf_nat_core: don't check if the tuple is used if there is no other choice
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
Eliminate nf_nat_used_tuple() to save some CPU cycles when there is no
other choice.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/nf_nat_core.c | 16 +++++++++++-----
1 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d..2c084b3 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -262,11 +262,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
/* Only bother mapping if it's not already in range and unique */
- if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
- (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
- proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
- !nf_nat_used_tuple(tuple, ct))
- goto out;
+ if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+ if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+ if (proto->in_range(tuple, maniptype, &range->min,
+ &range->max) &&
+ (range->min.all == range->max.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, ct);
--
1.7.1
^ permalink raw reply related
* [PATCH 01/72] netfilter: nf_nat: add nf_nat_csum()
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
Add a static function nf_nat_csum() to replace the duplicate code in
nf_nat_mangle_udp_packet() and __nf_nat_mangle_tcp_packet().
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/nf_nat_helper.c | 76 +++++++++++++++---------------------
1 files changed, 31 insertions(+), 45 deletions(-)
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b5..31427fb 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+ int datalen, __sum16 *check, int oldlen)
+{
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ skb->dev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ iph->ihl * 4;
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol,
+ csum_partial(data, datalen,
+ 0));
+ if (iph->protocol == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
/* Generic function for mangling variable-length address changes inside
* NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
* command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len, bool adjust)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct tcphdr *tcph;
int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
match_offset, match_len, rep_buffer, rep_len);
datalen = skb->len - iph->ihl*4;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct tcphdr, check);
- tcph->check = ~tcp_v4_check(datalen,
- iph->saddr, iph->daddr, 0);
- } else {
- tcph->check = 0;
- tcph->check = tcp_v4_check(datalen,
- iph->saddr, iph->daddr,
- csum_partial(tcph,
- datalen, 0));
- }
- } else
- inet_proto_csum_replace2(&tcph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
if (adjust && rep_len != match_len)
nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct udphdr *udph;
int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
return 1;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct udphdr, check);
- udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- 0);
- } else {
- udph->check = 0;
- udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- csum_partial(udph,
- datalen, 0));
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
- } else
- inet_proto_csum_replace2(&udph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
return 1;
}
--
1.7.1
^ permalink raw reply related
* [PATCH 00/72] netfilter: netfilter update for 2.6.37
From: kaber @ 2010-10-21 15:18 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
Hi Dave,
the following patches contain my netfilter update for 2.6.37, containing:
- xtables and ebtables cleanups from Jan
- NAT cleanups and optimizations from Changli
- a fix for a race condition in netfilter hash random initialization and
caching of the original hash value from the lookup for hash insertion
from Changli
- smaller ctnetlink fixes and support for user-space connection tracking
helpers from Pablo
- ctnetlink expectation deletion event notifications from Pablo
- a patch from Eric to make the LOG targets build the entire log string
in a buffer before printing it to save some overhead and avoid having
the log message intermixed with unrelated printks
- a TPROXY fix for dealing with TIME_WAIT sockets from Balazs
- a TPROXY fix for locking problems in __inet_inherit_port() from Balazs
- IPv6 TPROXY support from Balazs and Krisztian
- IPVS persistence engine core and SIP persistence engine from Simon
- IPVS IPv6 tunnel mode from Hans Schillstrom
- various IPVS fixes and optimizations from Julian
- IPVS support for local servers and clients from Julian
Please pull from:
git://git.kernel.org:/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master
Thanks!
^ permalink raw reply
* [PATCH 71/72] netfilter: ebtables: replace EBT_MATCH_ITERATE macro
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
include/linux/netfilter_bridge/ebtables.h | 9 +++++
net/bridge/netfilter/ebtables.c | 47 ++++++++++++++++++++--------
2 files changed, 42 insertions(+), 14 deletions(-)
diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index af0b721..1c33b9e 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -263,6 +263,14 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
((pos)->bitmask == 0 ? sizeof(struct ebt_entries) : \
(pos)->next_offset)))
+#define ebt_ematch_foreach(pos, entry) \
+ for ((pos) = (struct ebt_entry_match *)(entry)->elems; \
+ (pos) < (struct ebt_entry_match *)((char *)(entry) + \
+ (entry)->watchers_offset); \
+ (pos) = (struct ebt_entry_match *)((char *)((pos)->data) + \
+ (pos)->match_size))
+
+#ifndef __KERNEL__
#define EBT_MATCH_ITERATE(e, fn, args...) \
({ \
unsigned int __i; \
@@ -285,6 +293,7 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
} \
__ret; \
})
+#endif
#define EBT_WATCHER_ITERATE(e, fn, args...) \
({ \
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index ef4ca1b..1960c68 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -190,6 +190,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
const char *base;
const struct ebt_table_info *private;
struct xt_action_param acpar;
+ struct ebt_entry_match *ematch;
acpar.family = NFPROTO_BRIDGE;
acpar.in = in;
@@ -216,8 +217,9 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
if (ebt_basic_match(point, eth_hdr(skb), in, out))
goto letscontinue;
- if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
- goto letscontinue;
+ ebt_ematch_foreach(ematch, point)
+ if (ebt_do_match(ematch, skb, &acpar) != 0)
+ goto letscontinue;
if (acpar.hotdrop) {
read_unlock_bh(&table->lock);
return NF_DROP;
@@ -621,6 +623,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
{
struct xt_tgdtor_param par;
struct ebt_entry_target *t;
+ struct ebt_entry_match *ematch;
if (e->bitmask == 0)
return 0;
@@ -628,7 +631,9 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
if (cnt && (*cnt)-- == 0)
return 1;
EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
- EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
+ ebt_ematch_foreach(ematch, e)
+ if (ebt_cleanup_match(ematch, net, NULL) != 0)
+ break;
t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
par.net = net;
@@ -654,6 +659,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
int ret;
struct xt_mtchk_param mtpar;
struct xt_tgchk_param tgpar;
+ struct ebt_entry_match *ematch;
/* don't mess with the struct ebt_entries */
if (e->bitmask == 0)
@@ -700,9 +706,11 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
mtpar.entryinfo = tgpar.entryinfo = e;
mtpar.hook_mask = tgpar.hook_mask = hookmask;
mtpar.family = tgpar.family = NFPROTO_BRIDGE;
- ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
- if (ret != 0)
- goto cleanup_matches;
+ ebt_ematch_foreach(ematch, e) {
+ ret = ebt_check_match(ematch, &mtpar, &i);
+ if (ret != 0)
+ goto cleanup_matches;
+ }
j = 0;
ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
if (ret != 0)
@@ -748,7 +756,9 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
cleanup_watchers:
EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j);
cleanup_matches:
- EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i);
+ ebt_ematch_foreach(ematch, e)
+ if (ebt_cleanup_match(ematch, net, &i) != 0)
+ break;
return ret;
}
@@ -1361,6 +1371,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
int ret;
char __user *hlp;
const struct ebt_entry_target *t;
+ struct ebt_entry_match *ematch;
if (e->bitmask == 0)
return 0;
@@ -1368,9 +1379,11 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
hlp = ubase + (((char *)e + e->target_offset) - base);
t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
- ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
- if (ret != 0)
- return ret;
+ ebt_ematch_foreach(ematch, e) {
+ ret = ebt_make_matchname(ematch, base, ubase);
+ if (ret != 0)
+ return ret;
+ }
ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
if (ret != 0)
return ret;
@@ -1663,6 +1676,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
struct ebt_entry __user *ce;
u32 watchers_offset, target_offset, next_offset;
compat_uint_t origsize;
+ struct ebt_entry_match *ematch;
int ret;
if (e->bitmask == 0) {
@@ -1686,9 +1700,11 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
origsize = *size;
*dstptr += sizeof(*ce);
- ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size);
- if (ret)
- return ret;
+ ebt_ematch_foreach(ematch, e) {
+ ret = compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
watchers_offset = e->watchers_offset - (origsize - *size);
ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size);
@@ -1733,6 +1749,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
{
const struct ebt_entry_target *t;
unsigned int entry_offset;
+ struct ebt_entry_match *ematch;
int off, ret, i;
if (e->bitmask == 0)
@@ -1741,7 +1758,9 @@ static int compat_calc_entry(const struct ebt_entry *e,
off = 0;
entry_offset = (void *)e - base;
- EBT_MATCH_ITERATE(e, compat_calc_match, &off);
+ ebt_ematch_foreach(ematch, e)
+ if (compat_calc_match(ematch, &off) != 0)
+ break;
EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
--
1.7.1
^ permalink raw reply related
* [PATCH 70/72] netfilter: ebtables: replace EBT_ENTRY_ITERATE macro
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Jan Engelhardt <jengelh@medozas.de>
The macro is replaced by a list.h-like foreach loop. This makes the code more inspectable.
This is similar to v2.6.33-rc8-1212-g72b2b1d.
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
include/linux/netfilter_bridge/ebtables.h | 14 ++++-
net/bridge/netfilter/ebtables.c | 93 +++++++++++++++++++----------
2 files changed, 74 insertions(+), 33 deletions(-)
diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index cbbb883..af0b721 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -254,8 +254,15 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
#endif /* __KERNEL__ */
-/* blatently stolen from ip_tables.h
+/* blatantly stolen from ip_tables.h
* fn returns 0 to continue iteration */
+#define ebt_entry_foreach(pos, ehead, esize) \
+ for ((pos) = (struct ebt_entry *)(ehead); \
+ (pos) < (struct ebt_entry *)((char *)(ehead) + (esize)); \
+ (pos) = (struct ebt_entry *)((char *)(pos) + \
+ ((pos)->bitmask == 0 ? sizeof(struct ebt_entries) : \
+ (pos)->next_offset)))
+
#define EBT_MATCH_ITERATE(e, fn, args...) \
({ \
unsigned int __i; \
@@ -302,6 +309,7 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
__ret; \
})
+#ifndef __KERNEL__
#define EBT_ENTRY_ITERATE(entries, size, fn, args...) \
({ \
unsigned int __i; \
@@ -324,5 +332,7 @@ extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
} \
__ret; \
})
+#endif /* __KERNEL__ */
+
+#endif /* __LINUX_BRIDGE_EFF_H */
-#endif
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index bcc102e..ef4ca1b 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -834,6 +834,7 @@ static int translate_table(struct net *net, const char *name,
unsigned int i, j, k, udc_cnt;
int ret;
struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */
+ struct ebt_entry *entry;
i = 0;
while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i])
@@ -864,12 +865,12 @@ static int translate_table(struct net *net, const char *name,
k = 0; /* holds the total nr. of entries, should equal
newinfo->nentries afterwards */
udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */
- ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_check_entry_size_and_hooks, newinfo,
- &i, &j, &k, &udc_cnt);
-
- if (ret != 0)
- return ret;
+ ebt_entry_foreach(entry, newinfo->entries, newinfo->entries_size) {
+ ret = ebt_check_entry_size_and_hooks(entry, newinfo,
+ &i, &j, &k, &udc_cnt);
+ if (ret != 0)
+ return ret;
+ }
if (i != j) {
BUGPRINT("nentries does not equal the nr of entries in the "
@@ -906,8 +907,10 @@ static int translate_table(struct net *net, const char *name,
if (!cl_s)
return -ENOMEM;
i = 0; /* the i'th udc */
- EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_get_udc_positions, newinfo, &i, cl_s);
+ ebt_entry_foreach(entry, newinfo->entries,
+ newinfo->entries_size)
+ if (ebt_get_udc_positions(entry, newinfo, &i, cl_s) < 0)
+ break;
/* sanity check */
if (i != udc_cnt) {
BUGPRINT("i != udc_cnt\n");
@@ -937,12 +940,18 @@ static int translate_table(struct net *net, const char *name,
/* used to know what we need to clean up if something goes wrong */
i = 0;
- ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt);
- if (ret != 0) {
- EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_cleanup_entry, net, &i);
+ ret = 0;
+ ebt_entry_foreach(entry, newinfo->entries, newinfo->entries_size) {
+ ret = ebt_check_entry(entry, net, newinfo, name, &i,
+ cl_s, udc_cnt);
+ if (ret != 0)
+ break;
}
+ if (ret != 0)
+ ebt_entry_foreach(entry, newinfo->entries,
+ newinfo->entries_size)
+ if (ebt_cleanup_entry(entry, net, &i) != 0)
+ break;
vfree(cl_s);
return ret;
}
@@ -978,6 +987,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
/* used to be able to unlock earlier */
struct ebt_table_info *table;
struct ebt_table *t;
+ struct ebt_entry *entry;
/* the user wants counters back
the check on the size is done later, when we have the lock */
@@ -1044,8 +1054,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
ret = 0;
/* decrease module count and free resources */
- EBT_ENTRY_ITERATE(table->entries, table->entries_size,
- ebt_cleanup_entry, net, NULL);
+ ebt_entry_foreach(entry, table->entries, table->entries_size)
+ if (ebt_cleanup_entry(entry, net, NULL) != 0)
+ break;
vfree(table->entries);
if (table->chainstack) {
@@ -1061,8 +1072,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
free_unlock:
mutex_unlock(&ebt_mutex);
free_iterate:
- EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
- ebt_cleanup_entry, net, NULL);
+ ebt_entry_foreach(entry, newinfo->entries, newinfo->entries_size)
+ if (ebt_cleanup_entry(entry, net, NULL) != 0)
+ break;
free_counterstmp:
vfree(counterstmp);
/* can be initialized in translate_table() */
@@ -1234,6 +1246,7 @@ out:
void ebt_unregister_table(struct net *net, struct ebt_table *table)
{
+ struct ebt_entry *entry;
int i;
if (!table) {
@@ -1243,8 +1256,10 @@ void ebt_unregister_table(struct net *net, struct ebt_table *table)
mutex_lock(&ebt_mutex);
list_del(&table->list);
mutex_unlock(&ebt_mutex);
- EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
- ebt_cleanup_entry, net, NULL);
+ ebt_entry_foreach(entry, table->private->entries,
+ table->private->entries_size)
+ if (ebt_cleanup_entry(entry, net, NULL) != 0)
+ break;
if (table->private->nentries)
module_put(table->me);
vfree(table->private->entries);
@@ -1403,6 +1418,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
struct ebt_replace tmp;
const struct ebt_counter *oldcounters;
unsigned int entries_size, nentries;
+ struct ebt_entry *entry;
int ret;
char *entries;
@@ -1445,8 +1461,12 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
return -EFAULT;
}
/* set the match/watcher/target names right */
- return EBT_ENTRY_ITERATE(entries, entries_size,
- ebt_make_names, entries, tmp.entries);
+ ebt_entry_foreach(entry, entries, entries_size) {
+ ret = ebt_make_names(entry, entries, tmp.entries);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
}
static int do_ebt_set_ctl(struct sock *sk,
@@ -1755,11 +1775,16 @@ static int compat_table_info(const struct ebt_table_info *info,
{
unsigned int size = info->entries_size;
const void *entries = info->entries;
+ struct ebt_entry *entry;
+ int ret;
newinfo->entries_size = size;
-
- return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
- entries, newinfo);
+ ebt_entry_foreach(entry, entries, size) {
+ ret = compat_calc_entry(entry, info, entries, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
}
static int compat_copy_everything_to_user(struct ebt_table *t,
@@ -1768,6 +1793,7 @@ static int compat_copy_everything_to_user(struct ebt_table *t,
struct compat_ebt_replace repl, tmp;
struct ebt_counter *oldcounters;
struct ebt_table_info tinfo;
+ struct ebt_entry *entry;
int ret;
void __user *pos;
@@ -1814,8 +1840,12 @@ static int compat_copy_everything_to_user(struct ebt_table *t,
return ret;
pos = compat_ptr(tmp.entries);
- return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size,
- compat_copy_entry_to_user, &pos, &tmp.entries_size);
+ ebt_entry_foreach(entry, tinfo.entries, tinfo.entries_size) {
+ ret = compat_copy_entry_to_user(entry, &pos, &tmp.entries_size);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
}
struct ebt_entries_buf_state {
@@ -2141,13 +2171,14 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user,
struct ebt_entries_buf_state *state)
{
unsigned int size_remaining = size_user;
+ struct ebt_entry *entry;
int ret;
- ret = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data,
- &size_remaining, state);
- if (ret < 0)
- return ret;
-
+ ebt_entry_foreach(entry, data, size_user) {
+ ret = size_entry_mwt(entry, data, &size_remaining, state);
+ if (ret != 0)
+ return ret;
+ }
WARN_ON(size_remaining);
return state->buf_kern_offset;
}
--
1.7.1
^ permalink raw reply related
* [PATCH 67/72] tproxy: use the interface primary IP address as a default value for --on-ip
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Balazs Scheidler <bazsi@balabit.hu>
The REDIRECT target and the older TProxy versions used the primary address
of the incoming interface as the default value of the --on-ip parameter.
This was unintentionally changed during the initial TProxy submission and
caused confusion among users.
Since IPv6 has no notion of a primary address, we just select the first
address on the list that is neither tentative nor deprecated: this way the
socket lookup finds wildcard-bound sockets properly, and we cannot really do
better without the user telling us the IPv6 address of the proxy.
This is implemented for both IPv4 and IPv6.
Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/xt_TPROXY.c | 202 +++++++++++++++++++++++++++++----------------
1 files changed, 132 insertions(+), 70 deletions(-)
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index d5f97e2..19c482c 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -16,15 +16,41 @@
#include <net/checksum.h>
#include <net/udp.h>
#include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
#include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ for_primary_ifa(indev) {
+ laddr = ifa->ifa_local;
+ break;
+ } endfor_ifa(indev);
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
/**
* tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
@@ -75,60 +101,6 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
return sk;
}
-/**
- * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb: The skb being processed.
- * @tproto: Transport protocol.
- * @thoff: Transport protocol header offset.
- * @par: Iptables target parameters.
- * @sk: The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
- const struct xt_action_param *par,
- struct sock *sk)
-{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct tcphdr _hdr, *hp;
- const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
- hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- inet_twsk_put(inet_twsk(sk));
- return NULL;
- }
-
- if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
- /* SYN to a TIME_WAIT socket, we'd rather redirect it
- * to a listener socket if there's one */
- struct sock *sk2;
-
- sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
- &iph->saddr,
- !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
- hp->source,
- tgi->lport ? tgi->lport : hp->dest,
- skb->dev, NFT_LOOKUP_LISTENER);
- if (sk2) {
- inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
- inet_twsk_put(inet_twsk(sk));
- sk = sk2;
- }
- }
-
- return sk;
-}
-
static unsigned int
tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
@@ -150,6 +122,10 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
hp->source, hp->dest,
skb->dev, NFT_LOOKUP_ESTABLISHED);
+ laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ if (!lport)
+ lport = hp->dest;
+
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
@@ -158,8 +134,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
- iph->saddr, laddr ? laddr : iph->daddr,
- hp->source, lport ? lport : hp->dest,
+ iph->saddr, laddr,
+ hp->source, lport,
skb->dev, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
@@ -174,9 +150,9 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(laddr), ntohs(lport), skb->mark);
+ pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
return NF_DROP;
}
@@ -197,6 +173,88 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ rcu_read_lock();
+ indev = __in6_dev_get(skb->dev);
+ if (indev)
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @tproto: Transport protocol.
+ * @thoff: Transport protocol header offset.
+ * @par: Iptables target parameters.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ const struct xt_action_param *par,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
static unsigned int
tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -204,6 +262,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
struct udphdr _hdr, *hp;
struct sock *sk;
+ const struct in6_addr *laddr;
+ __be16 lport;
int thoff;
int tproto;
@@ -228,6 +288,9 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
hp->source, hp->dest,
par->in, NFT_LOOKUP_ESTABLISHED);
+ laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ lport = tgi->lport ? tgi->lport : hp->dest;
+
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
@@ -236,10 +299,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
- &iph->saddr,
- !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
- hp->source,
- tgi->lport ? tgi->lport : hp->dest,
+ &iph->saddr, laddr,
+ hp->source, lport,
par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
@@ -249,14 +310,15 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->dest),
- &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+ tproto, &iph->saddr, ntohs(hp->source),
+ laddr, ntohs(lport), skb->mark);
return NF_ACCEPT;
}
pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->dest),
- &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+ tproto, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+
return NF_DROP;
}
--
1.7.1
^ permalink raw reply related
* [PATCH 65/72] tproxy: added IPv6 support to the TPROXY target
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Balazs Scheidler <bazsi@balabit.hu>
This requires a new target revision (revision 1, using
struct xt_tproxy_target_info_v1), as the old target structure was
IPv4-specific.
Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/netfilter/xt_TPROXY.h | 13 ++-
net/netfilter/xt_TPROXY.c | 262 ++++++++++++++++++++++++++++++-----
2 files changed, 235 insertions(+), 40 deletions(-)
diff --git a/include/linux/netfilter/xt_TPROXY.h b/include/linux/netfilter/xt_TPROXY.h
index 152e8f9..3f3d693 100644
--- a/include/linux/netfilter/xt_TPROXY.h
+++ b/include/linux/netfilter/xt_TPROXY.h
@@ -1,5 +1,5 @@
-#ifndef _XT_TPROXY_H_target
-#define _XT_TPROXY_H_target
+#ifndef _XT_TPROXY_H
+#define _XT_TPROXY_H
/* TPROXY target is capable of marking the packet to perform
* redirection. We can get rid of that whenever we get support for
@@ -11,4 +11,11 @@ struct xt_tproxy_target_info {
__be16 lport;
};
-#endif /* _XT_TPROXY_H_target */
+struct xt_tproxy_target_info_v1 {
+ u_int32_t mark_mask;
+ u_int32_t mark_value;
+ union nf_inet_addr laddr;
+ __be16 lport;
+};
+
+#endif /* _XT_TPROXY_H */
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e0b6900..d5f97e2 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
/*
* Transparent proxy support for Linux/iptables
*
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
* Author: Balazs Scheidler, Krisztian Kovacs
*
* This program is free software; you can redistribute it and/or modify
@@ -19,15 +19,18 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/xt_TPROXY.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_tproxy_core.h>
/**
- * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
* @skb: The skb being processed.
- * @par: Iptables target parameters.
+ * @laddr: IPv4 address to redirect to or zero.
+ * @lport: TCP port to redirect to or zero.
* @sk: The TIME_WAIT TCP socket found by the lookup.
*
* We have to handle SYN packets arriving to TIME_WAIT sockets
@@ -35,16 +38,16 @@
* redirect the new connection to the proxy if there's a listener
* socket present.
*
- * tproxy_handle_time_wait() consumes the socket reference passed in.
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
*
* Returns the listener socket if there's one, the TIME_WAIT socket if
* no such listener is found, or NULL if the TCP header is incomplete.
*/
static struct sock *
-tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk)
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ struct sock *sk)
{
const struct iphdr *iph = ip_hdr(skb);
- const struct xt_tproxy_target_info *tgi = par->targinfo;
struct tcphdr _hdr, *hp;
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
@@ -59,13 +62,64 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
struct sock *sk2;
sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
- iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
- hp->source, tgi->lport ? tgi->lport : hp->dest,
- par->in, NFT_LOOKUP_LISTENER);
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @tproto: Transport protocol.
+ * @thoff: Transport protocol header offset.
+ * @par: Iptables target parameters.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ const struct xt_action_param *par,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
if (sk2) {
- /* yeah, there's one, let's kill the TIME_WAIT
- * socket and redirect to the listener
- */
inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
inet_twsk_put(inet_twsk(sk));
sk = sk2;
@@ -76,10 +130,10 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
}
static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
- const struct xt_tproxy_target_info *tgi = par->targinfo;
struct udphdr _hdr, *hp;
struct sock *sk;
@@ -87,18 +141,105 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (hp == NULL)
return NF_DROP;
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
- par->in, NFT_LOOKUP_ESTABLISHED);
+ skb->dev, NFT_LOOKUP_ESTABLISHED);
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
- sk = tproxy_handle_time_wait(skb, par, sk);
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
else if (!sk)
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
- iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
- hp->source, tgi->lport ? tgi->lport : hp->dest,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ /* This should be in a separate target, but we don't do multiple
+ targets on the same rule yet */
+ skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+ pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->daddr, ntohs(hp->dest),
+ &laddr, ntohs(lport), skb->mark);
+ return NF_ACCEPT;
+ }
+
+ pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n",
+ iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
+ ntohl(laddr), ntohs(lport), skb->mark);
+ return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ struct udphdr _hdr, *hp;
+ struct sock *sk;
+ int thoff;
+ int tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ par->in, NFT_LOOKUP_ESTABLISHED);
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ else if (!sk)
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
@@ -107,19 +248,33 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
- pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->dest),
+ &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->dest),
+ &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
return NF_DROP;
}
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_ip6 *i = par->entryinfo;
+
+ if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+ && !(i->flags & IP6T_INV_PROTO))
+ return 0;
+
+ pr_info("Can be used only in combination with "
+ "either -p tcp or -p udp\n");
+ return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
@@ -132,31 +287,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
-static struct xt_target tproxy_tg_reg __read_mostly = {
- .name = "TPROXY",
- .family = NFPROTO_IPV4,
- .table = "mangle",
- .target = tproxy_tg,
- .targetsize = sizeof(struct xt_tproxy_target_info),
- .checkentry = tproxy_tg_check,
- .hooks = 1 << NF_INET_PRE_ROUTING,
- .me = THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v0,
+ .revision = 0,
+ .targetsize = sizeof(struct xt_tproxy_target_info),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tproxy_tg6_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg6_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#endif
+
};
static int __init tproxy_tg_init(void)
{
nf_defrag_ipv4_enable();
- return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ nf_defrag_ipv6_enable();
+#endif
+
+ return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
static void __exit tproxy_tg_exit(void)
{
- xt_unregister_target(&tproxy_tg_reg);
+ xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
module_init(tproxy_tg_init);
module_exit(tproxy_tg_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
--
1.7.1
^ permalink raw reply related
* [PATCH 63/72] tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
From: kaber @ 2010-10-21 15:19 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1287674399-31455-1-git-send-email-kaber@trash.net>
From: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv6/af_inet6.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 56b9bf2..4869797 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!ipv6_chk_addr(net, &addr->sin6_addr,
+ if (!inet->transparent &&
+ !ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
goto out_unlock;
--
1.7.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox