From: Stephen Hemminger <shemminger@vyatta.com>
To: Eric Dumazet <dada1@cosmosbay.com>,
Patrick McHardy <kaber@trash.net>,
David Miller <davem@davemloft.net>
Cc: Rick Jones <rick.jones2@hp.com>,
netdev@vger.kernel.org, netfilter-devel@vger.kernel.org
Subject: Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking
Date: Thu, 19 Feb 2009 14:03:03 -0800 [thread overview]
Message-ID: <20090219140303.4329f860@extreme> (raw)
In-Reply-To: <499C1894.7060400@cosmosbay.com>
TCP connection tracking suffers of huge contention on a global rwlock,
used to protect tcp conntracking state.
As each tcp conntrack state have no relations between each others, we
can switch to fine grained lock, using a spinlock per "struct ip_ct_tcp"
tcp_print_conntrack() dont need to lock anything to read ct->proto.tcp.state,
so speedup /proc/net/ip_conntrack as well.
In this version the lock is placed in a 4 byte whole in the nf_conntrack
structure. This means no size change, and same method can later be used
for UDP, SCTP, DCCP conntrack.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Reported-by: Rick Jones <rick.jones2@hp.com>
---
include/linux/skbuff.h | 1
include/net/netfilter/nf_conntrack_helper.h | 2 -
include/net/netfilter/nf_conntrack_l4proto.h | 3 --
net/netfilter/nf_conntrack_core.c | 1
net/netfilter/nf_conntrack_netlink.c | 6 ++--
net/netfilter/nf_conntrack_proto_dccp.c | 2 -
net/netfilter/nf_conntrack_proto_sctp.c | 2 -
net/netfilter/nf_conntrack_proto_tcp.c | 37 ++++++++++++---------------
8 files changed, 26 insertions(+), 28 deletions(-)
--- a/include/net/netfilter/nf_conntrack_helper.h 2009-02-19 13:45:26.103408544 -0800
+++ b/include/net/netfilter/nf_conntrack_helper.h 2009-02-19 13:45:56.136167400 -0800
@@ -34,7 +34,7 @@ struct nf_conntrack_helper
void (*destroy)(struct nf_conn *ct);
- int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct);
+ int (*to_nlattr)(struct sk_buff *skb, struct nf_conn *ct);
unsigned int expect_class_max;
};
--- a/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-19 13:45:26.103408544 -0800
+++ b/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-19 13:45:56.136167400 -0800
@@ -62,8 +62,7 @@ struct nf_conntrack_l4proto
int (*print_conntrack)(struct seq_file *s, const struct nf_conn *);
/* convert protoinfo to nfnetink attributes */
- int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct);
+ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct);
/* convert nfnetlink attributes to protoinfo */
int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct);
--- a/net/netfilter/nf_conntrack_core.c 2009-02-19 13:42:48.316883082 -0800
+++ b/net/netfilter/nf_conntrack_core.c 2009-02-19 13:58:59.952707711 -0800
@@ -499,6 +499,7 @@ struct nf_conn *nf_conntrack_alloc(struc
return ERR_PTR(-ENOMEM);
}
+ spin_lock_init(&ct->ct_general.lock);
atomic_set(&ct->ct_general.use, 1);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
--- a/net/netfilter/nf_conntrack_netlink.c 2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_netlink.c 2009-02-19 13:45:56.136167400 -0800
@@ -143,7 +143,7 @@ nla_put_failure:
}
static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -168,7 +168,7 @@ nla_put_failure:
}
static inline int
-ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_helpinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nlattr *nest_helper;
const struct nf_conn_help *help = nfct_help(ct);
@@ -350,7 +350,7 @@ nla_put_failure:
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
int event, int nowait,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
--- a/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-19 13:45:56.136167400 -0800
@@ -612,7 +612,7 @@ static int dccp_print_conntrack(struct s
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlattr *nest_parms;
--- a/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-19 13:45:56.136167400 -0800
@@ -469,7 +469,7 @@ static bool sctp_new(struct nf_conn *ct,
#include <linux/netfilter/nfnetlink_conntrack.h>
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlattr *nest_parms;
--- a/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-19 13:59:58.025139232 -0800
@@ -26,9 +26,6 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_log.h>
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
/* "Be conservative in what you do,
be liberal in what you accept from others."
If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct se
{
enum tcp_conntrack state;
- read_lock_bh(&tcp_lock);
state = ct->proto.tcp.state;
- read_unlock_bh(&tcp_lock);
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
@@ -705,14 +700,15 @@ void nf_conntrack_tcp_update(const struc
end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(&ct->ct_general.lock);
/*
* We have to worry for the ack in the reply packet only...
*/
if (after(end, ct->proto.tcp.seen[dir].td_end))
ct->proto.tcp.seen[dir].td_end = end;
ct->proto.tcp.last_end = end;
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
+
pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -821,7 +817,7 @@ static int tcp_packet(struct nf_conn *ct
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(&ct->ct_general.lock);
old_state = ct->proto.tcp.state;
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
@@ -851,7 +847,7 @@ static int tcp_packet(struct nf_conn *ct
&& ct->proto.tcp.last_index == TCP_RST_SET)) {
/* Attempt to reopen a closed/aborted connection.
* Delete this connection and look up again. */
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
/* Only repeat if we can actually remove the timer.
* Destruction may already be in progress in process
@@ -887,7 +883,7 @@ static int tcp_packet(struct nf_conn *ct
* that the client cannot but retransmit its SYN and
* thus initiate a clean new session.
*/
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: killing out of sync session ");
@@ -900,7 +896,7 @@ static int tcp_packet(struct nf_conn *ct
ct->proto.tcp.last_end =
segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid packet ignored ");
@@ -909,7 +905,7 @@ static int tcp_packet(struct nf_conn *ct
/* Invalid packet */
pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
dir, get_conntrack_index(th), old_state);
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
@@ -940,7 +936,7 @@ static int tcp_packet(struct nf_conn *ct
if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
skb, dataoff, th, pf)) {
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
return -NF_ACCEPT;
}
in_window:
@@ -969,7 +965,7 @@ static int tcp_packet(struct nf_conn *ct
timeout = nf_ct_tcp_timeout_unacknowledged;
else
timeout = tcp_timeouts[new_state];
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
if (new_state != old_state)
@@ -1022,6 +1018,7 @@ static bool tcp_new(struct nf_conn *ct,
pr_debug("nf_ct_tcp: invalid new deleting.\n");
return false;
}
+ spin_lock_init(&ct->ct_general.lock);
if (new_state == TCP_CONNTRACK_SYN_SENT) {
/* SYN packet */
@@ -1087,12 +1084,12 @@ static bool tcp_new(struct nf_conn *ct,
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
- read_lock_bh(&tcp_lock);
+ spin_lock_bh(&ct->ct_general.lock);
nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
@@ -1112,14 +1109,14 @@ static int tcp_to_nlattr(struct sk_buff
tmp.flags = ct->proto.tcp.seen[1].flags;
NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp);
- read_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
- read_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
return -1;
}
@@ -1150,7 +1147,7 @@ static int nlattr_to_tcp(struct nlattr *
nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
return -EINVAL;
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(&ct->ct_general.lock);
if (tb[CTA_PROTOINFO_TCP_STATE])
ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
@@ -1177,7 +1174,7 @@ static int nlattr_to_tcp(struct nlattr *
ct->proto.tcp.seen[1].td_scale =
nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
}
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(&ct->ct_general.lock);
return 0;
}
--- a/include/linux/skbuff.h 2009-02-19 13:53:46.575411267 -0800
+++ b/include/linux/skbuff.h 2009-02-19 13:53:57.414478437 -0800
@@ -97,6 +97,7 @@ struct pipe_inode_info;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack {
atomic_t use;
+ spinlock_t lock;
};
#endif
next prev parent reply other threads:[~2009-02-19 22:03 UTC|newest]
Thread overview: 83+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-02-18 5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger
2009-02-18 5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
2009-02-18 10:02 ` Patrick McHardy
2009-02-19 19:47 ` [PATCH] " Stephen Hemminger
2009-02-19 23:46 ` Eric Dumazet
2009-02-19 23:56 ` Rick Jones
2009-02-20 1:03 ` Stephen Hemminger
2009-02-20 1:18 ` Rick Jones
2009-02-20 9:42 ` Patrick McHardy
2009-02-20 22:57 ` Rick Jones
2009-02-21 0:35 ` Rick Jones
2009-02-20 9:37 ` Patrick McHardy
2009-02-20 18:10 ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
2009-02-20 18:33 ` Jan Engelhardt
2009-02-28 1:54 ` Jan Engelhardt
2009-02-28 6:56 ` Eric Dumazet
2009-02-28 8:22 ` Jan Engelhardt
2009-02-24 14:31 ` Patrick McHardy
2009-02-27 14:02 ` [PATCH] iptables: lock free counters Eric Dumazet
2009-02-27 16:08 ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
2009-02-27 16:34 ` Paul E. McKenney
2009-03-02 10:55 ` [PATCH] iptables: lock free counters Patrick McHardy
2009-03-02 17:47 ` Eric Dumazet
2009-03-02 21:56 ` Patrick McHardy
2009-03-02 22:02 ` Stephen Hemminger
2009-03-02 22:07 ` Patrick McHardy
2009-03-02 22:17 ` Paul E. McKenney
2009-03-02 22:27 ` Eric Dumazet
2009-02-18 5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger
2009-02-18 9:20 ` Ingo Molnar
2009-02-18 9:30 ` David Miller
2009-02-18 11:01 ` Ingo Molnar
2009-02-18 11:39 ` Jarek Poplawski
2009-02-18 12:37 ` Ingo Molnar
2009-02-18 12:33 ` Patrick McHardy
2009-02-18 21:39 ` David Miller
2009-02-18 21:51 ` Ingo Molnar
2009-02-18 22:04 ` David Miller
2009-02-18 22:42 ` Peter Zijlstra
2009-02-18 22:47 ` David Miller
2009-02-18 22:56 ` Stephen Hemminger
2009-02-18 10:07 ` Patrick McHardy
2009-02-18 12:05 ` [patch] timers: add mod_timer_pending() Ingo Molnar
2009-02-18 12:33 ` Patrick McHardy
2009-02-18 12:50 ` Ingo Molnar
2009-02-18 12:54 ` Patrick McHardy
2009-02-18 13:47 ` Ingo Molnar
2009-02-18 17:00 ` Oleg Nesterov
2009-02-18 18:23 ` Ingo Molnar
2009-02-18 18:58 ` Oleg Nesterov
2009-02-18 19:24 ` Ingo Molnar
2009-02-18 10:29 ` [RFT 2/4] Add mod_timer_noact Patrick McHardy
2009-02-18 5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger
2009-02-18 9:54 ` Patrick McHardy
2009-02-18 11:05 ` Jarek Poplawski
2009-02-18 11:08 ` Patrick McHardy
2009-02-18 14:01 ` Eric Dumazet
2009-02-18 14:04 ` Patrick McHardy
2009-02-18 14:22 ` Eric Dumazet
2009-02-18 14:27 ` Patrick McHardy
2009-02-18 5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger
2009-02-18 9:56 ` Patrick McHardy
2009-02-18 14:17 ` Eric Dumazet
2009-02-19 22:03 ` Stephen Hemminger [this message]
2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet
2009-03-29 0:48 ` Stephen Hemminger
2009-03-30 19:57 ` Eric Dumazet
2009-03-30 20:05 ` Stephen Hemminger
2009-04-06 12:07 ` Patrick McHardy
2009-04-06 12:32 ` Jan Engelhardt
2009-04-06 17:25 ` Stephen Hemminger
2009-03-30 18:57 ` Rick Jones
2009-03-30 19:20 ` Eric Dumazet
2009-03-30 19:38 ` Jesper Dangaard Brouer
2009-03-30 19:54 ` Eric Dumazet
2009-03-30 20:34 ` Jesper Dangaard Brouer
2009-03-30 20:41 ` Eric Dumazet
2009-03-30 21:25 ` Jesper Dangaard Brouer
2009-03-30 22:44 ` Rick Jones
2009-02-18 21:55 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller
2009-02-18 23:23 ` Patrick McHardy
2009-02-18 23:35 ` Stephen Hemminger
2009-02-18 8:30 ` [RFT 0/4] Netfilter/iptables performance improvements Eric Dumazet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090219140303.4329f860@extreme \
--to=shemminger@vyatta.com \
--cc=dada1@cosmosbay.com \
--cc=davem@davemloft.net \
--cc=kaber@trash.net \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
--cc=rick.jones2@hp.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).