All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stephen Hemminger <shemminger@vyatta.com>
To: Eric Dumazet <dada1@cosmosbay.com>,
	Patrick McHardy <kaber@trash.net>,
	David Miller <davem@davemloft.net>
Cc: Rick Jones <rick.jones2@hp.com>,
	netdev@vger.kernel.org, netfilter-devel@vger.kernel.org
Subject: Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking
Date: Thu, 19 Feb 2009 14:03:03 -0800	[thread overview]
Message-ID: <20090219140303.4329f860@extreme> (raw)
In-Reply-To: <499C1894.7060400@cosmosbay.com>

TCP connection tracking suffers of huge contention on a global rwlock,
used to protect tcp conntracking state.
As each tcp conntrack state have no relations between each others, we
can switch to fine grained lock, using a spinlock per "struct ip_ct_tcp"

tcp_print_conntrack() dont need to lock anything to read ct->proto.tcp.state,
so speedup /proc/net/ip_conntrack as well.

In this version the lock is placed in a 4 byte whole in the nf_conntrack
structure. This means no size change, and same method can later be used
for UDP, SCTP, DCCP conntrack.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Reported-by: Rick Jones <rick.jones2@hp.com>

---
 include/linux/skbuff.h                       |    1 
 include/net/netfilter/nf_conntrack_helper.h  |    2 -
 include/net/netfilter/nf_conntrack_l4proto.h |    3 --
 net/netfilter/nf_conntrack_core.c            |    1 
 net/netfilter/nf_conntrack_netlink.c         |    6 ++--
 net/netfilter/nf_conntrack_proto_dccp.c      |    2 -
 net/netfilter/nf_conntrack_proto_sctp.c      |    2 -
 net/netfilter/nf_conntrack_proto_tcp.c       |   37 ++++++++++++---------------
 8 files changed, 26 insertions(+), 28 deletions(-)

--- a/include/net/netfilter/nf_conntrack_helper.h	2009-02-19 13:45:26.103408544 -0800
+++ b/include/net/netfilter/nf_conntrack_helper.h	2009-02-19 13:45:56.136167400 -0800
@@ -34,7 +34,7 @@ struct nf_conntrack_helper
 
 	void (*destroy)(struct nf_conn *ct);
 
-	int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct);
+	int (*to_nlattr)(struct sk_buff *skb, struct nf_conn *ct);
 	unsigned int expect_class_max;
 };
 
--- a/include/net/netfilter/nf_conntrack_l4proto.h	2009-02-19 13:45:26.103408544 -0800
+++ b/include/net/netfilter/nf_conntrack_l4proto.h	2009-02-19 13:45:56.136167400 -0800
@@ -62,8 +62,7 @@ struct nf_conntrack_l4proto
 	int (*print_conntrack)(struct seq_file *s, const struct nf_conn *);
 
 	/* convert protoinfo to nfnetink attributes */
-	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
-			 const struct nf_conn *ct);
+	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct);
 
 	/* convert nfnetlink attributes to protoinfo */
 	int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct);
--- a/net/netfilter/nf_conntrack_core.c	2009-02-19 13:42:48.316883082 -0800
+++ b/net/netfilter/nf_conntrack_core.c	2009-02-19 13:58:59.952707711 -0800
@@ -499,6 +499,7 @@ struct nf_conn *nf_conntrack_alloc(struc
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock_init(&ct->ct_general.lock);
 	atomic_set(&ct->ct_general.use, 1);
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
--- a/net/netfilter/nf_conntrack_netlink.c	2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_netlink.c	2009-02-19 13:45:56.136167400 -0800
@@ -143,7 +143,7 @@ nla_put_failure:
 }
 
 static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
 {
 	struct nf_conntrack_l4proto *l4proto;
 	struct nlattr *nest_proto;
@@ -168,7 +168,7 @@ nla_put_failure:
 }
 
 static inline int
-ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_helpinfo(struct sk_buff *skb, struct nf_conn *ct)
 {
 	struct nlattr *nest_helper;
 	const struct nf_conn_help *help = nfct_help(ct);
@@ -350,7 +350,7 @@ nla_put_failure:
 static int
 ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 		    int event, int nowait,
-		    const struct nf_conn *ct)
+		    struct nf_conn *ct)
 {
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
--- a/net/netfilter/nf_conntrack_proto_dccp.c	2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_dccp.c	2009-02-19 13:45:56.136167400 -0800
@@ -612,7 +612,7 @@ static int dccp_print_conntrack(struct s
 
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			  const struct nf_conn *ct)
+			  struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 
--- a/net/netfilter/nf_conntrack_proto_sctp.c	2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_sctp.c	2009-02-19 13:45:56.136167400 -0800
@@ -469,7 +469,7 @@ static bool sctp_new(struct nf_conn *ct,
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
 static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			  const struct nf_conn *ct)
+			  struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 
--- a/net/netfilter/nf_conntrack_proto_tcp.c	2009-02-19 13:45:26.103408544 -0800
+++ b/net/netfilter/nf_conntrack_proto_tcp.c	2009-02-19 13:59:58.025139232 -0800
@@ -26,9 +26,6 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_log.h>
 
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
 /* "Be conservative in what you do,
     be liberal in what you accept from others."
     If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct se
 {
 	enum tcp_conntrack state;
 
-	read_lock_bh(&tcp_lock);
 	state = ct->proto.tcp.state;
-	read_unlock_bh(&tcp_lock);
 
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
@@ -705,14 +700,15 @@ void nf_conntrack_tcp_update(const struc
 
 	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->ct_general.lock);
 	/*
 	 * We have to worry for the ack in the reply packet only...
 	 */
 	if (after(end, ct->proto.tcp.seen[dir].td_end))
 		ct->proto.tcp.seen[dir].td_end = end;
 	ct->proto.tcp.last_end = end;
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->ct_general.lock);
+
 	pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -821,7 +817,7 @@ static int tcp_packet(struct nf_conn *ct
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->ct_general.lock);
 	old_state = ct->proto.tcp.state;
 	dir = CTINFO2DIR(ctinfo);
 	index = get_conntrack_index(th);
@@ -851,7 +847,7 @@ static int tcp_packet(struct nf_conn *ct
 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
 			/* Attempt to reopen a closed/aborted connection.
 			 * Delete this connection and look up again. */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(&ct->ct_general.lock);
 
 			/* Only repeat if we can actually remove the timer.
 			 * Destruction may already be in progress in process
@@ -887,7 +883,7 @@ static int tcp_packet(struct nf_conn *ct
 			 * that the client cannot but retransmit its SYN and
 			 * thus initiate a clean new session.
 			 */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(&ct->ct_general.lock);
 			if (LOG_INVALID(net, IPPROTO_TCP))
 				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 					  "nf_ct_tcp: killing out of sync session ");
@@ -900,7 +896,7 @@ static int tcp_packet(struct nf_conn *ct
 		ct->proto.tcp.last_end =
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
 
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->ct_general.lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid packet ignored ");
@@ -909,7 +905,7 @@ static int tcp_packet(struct nf_conn *ct
 		/* Invalid packet */
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->ct_general.lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
@@ -940,7 +936,7 @@ static int tcp_packet(struct nf_conn *ct
 
 	if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
 			   skb, dataoff, th, pf)) {
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(&ct->ct_general.lock);
 		return -NF_ACCEPT;
 	}
      in_window:
@@ -969,7 +965,7 @@ static int tcp_packet(struct nf_conn *ct
 		timeout = nf_ct_tcp_timeout_unacknowledged;
 	else
 		timeout = tcp_timeouts[new_state];
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->ct_general.lock);
 
 	nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
 	if (new_state != old_state)
@@ -1022,6 +1018,7 @@ static bool tcp_new(struct nf_conn *ct, 
 		pr_debug("nf_ct_tcp: invalid new deleting.\n");
 		return false;
 	}
+	spin_lock_init(&ct->ct_general.lock);
 
 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
 		/* SYN packet */
@@ -1087,12 +1084,12 @@ static bool tcp_new(struct nf_conn *ct, 
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			 const struct nf_conn *ct)
+			 struct nf_conn *ct)
 {
 	struct nlattr *nest_parms;
 	struct nf_ct_tcp_flags tmp = {};
 
-	read_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->ct_general.lock);
 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
@@ -1112,14 +1109,14 @@ static int tcp_to_nlattr(struct sk_buff 
 	tmp.flags = ct->proto.tcp.seen[1].flags;
 	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
 		sizeof(struct nf_ct_tcp_flags), &tmp);
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->ct_general.lock);
 
 	nla_nest_end(skb, nest_parms);
 
 	return 0;
 
 nla_put_failure:
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->ct_general.lock);
 	return -1;
 }
 
@@ -1150,7 +1147,7 @@ static int nlattr_to_tcp(struct nlattr *
 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
 		return -EINVAL;
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(&ct->ct_general.lock);
 	if (tb[CTA_PROTOINFO_TCP_STATE])
 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
 
@@ -1177,7 +1174,7 @@ static int nlattr_to_tcp(struct nlattr *
 		ct->proto.tcp.seen[1].td_scale =
 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
 	}
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(&ct->ct_general.lock);
 
 	return 0;
 }
--- a/include/linux/skbuff.h	2009-02-19 13:53:46.575411267 -0800
+++ b/include/linux/skbuff.h	2009-02-19 13:53:57.414478437 -0800
@@ -97,6 +97,7 @@ struct pipe_inode_info;
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
 	atomic_t use;
+	spinlock_t lock;
 };
 #endif
 

  reply	other threads:[~2009-02-19 22:03 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-02-18  5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger
2009-02-18  5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
2009-02-18 10:02   ` Patrick McHardy
2009-02-19 19:47   ` [PATCH] " Stephen Hemminger
2009-02-19 23:46     ` Eric Dumazet
2009-02-19 23:56       ` Rick Jones
2009-02-20  1:03         ` Stephen Hemminger
2009-02-20  1:18           ` Rick Jones
2009-02-20  9:42             ` Patrick McHardy
2009-02-20 22:57               ` Rick Jones
2009-02-21  0:35                 ` Rick Jones
2009-02-20  9:37       ` Patrick McHardy
2009-02-20 18:10       ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
2009-02-20 18:33         ` Jan Engelhardt
2009-02-28  1:54           ` Jan Engelhardt
2009-02-28  6:56             ` Eric Dumazet
2009-02-28  8:22               ` Jan Engelhardt
2009-02-24 14:31         ` Patrick McHardy
2009-02-27 14:02       ` [PATCH] iptables: lock free counters Eric Dumazet
2009-02-27 16:08         ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
2009-02-27 16:08           ` Eric Dumazet
2009-02-27 16:34           ` Paul E. McKenney
2009-03-02 10:55         ` [PATCH] iptables: lock free counters Patrick McHardy
2009-03-02 17:47           ` Eric Dumazet
2009-03-02 21:56             ` Patrick McHardy
2009-03-02 22:02               ` Stephen Hemminger
2009-03-02 22:07                 ` Patrick McHardy
2009-03-02 22:17                   ` Paul E. McKenney
2009-03-02 22:27                 ` Eric Dumazet
2009-02-18  5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger
2009-02-18  9:20   ` Ingo Molnar
2009-02-18  9:30     ` David Miller
2009-02-18 11:01       ` Ingo Molnar
2009-02-18 11:39         ` Jarek Poplawski
2009-02-18 12:37           ` Ingo Molnar
2009-02-18 12:33         ` Patrick McHardy
2009-02-18 21:39         ` David Miller
2009-02-18 21:51           ` Ingo Molnar
2009-02-18 22:04             ` David Miller
2009-02-18 22:42               ` Peter Zijlstra
2009-02-18 22:47                 ` David Miller
2009-02-18 22:56                   ` Stephen Hemminger
2009-02-18 10:07     ` Patrick McHardy
2009-02-18 12:05       ` [patch] timers: add mod_timer_pending() Ingo Molnar
2009-02-18 12:33         ` Patrick McHardy
2009-02-18 12:50           ` Ingo Molnar
2009-02-18 12:54             ` Patrick McHardy
2009-02-18 13:47               ` Ingo Molnar
2009-02-18 17:00         ` Oleg Nesterov
2009-02-18 18:23           ` Ingo Molnar
2009-02-18 18:58             ` Oleg Nesterov
2009-02-18 19:24               ` Ingo Molnar
2009-02-18 10:29   ` [RFT 2/4] Add mod_timer_noact Patrick McHardy
2009-02-18  5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger
2009-02-18  9:54   ` Patrick McHardy
2009-02-18 11:05   ` Jarek Poplawski
2009-02-18 11:08     ` Patrick McHardy
2009-02-18 14:01   ` Eric Dumazet
2009-02-18 14:04     ` Patrick McHardy
2009-02-18 14:22       ` Eric Dumazet
2009-02-18 14:27         ` Patrick McHardy
2009-02-18  5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger
2009-02-18  9:56   ` Patrick McHardy
2009-02-18 14:17     ` Eric Dumazet
2009-02-19 22:03       ` Stephen Hemminger [this message]
2009-03-28 16:55       ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet
2009-03-29  0:48         ` Stephen Hemminger
2009-03-30 19:57           ` Eric Dumazet
2009-03-30 20:05             ` Stephen Hemminger
2009-04-06 12:07               ` Patrick McHardy
2009-04-06 12:32                 ` Jan Engelhardt
2009-04-06 17:25                   ` Stephen Hemminger
2009-03-30 18:57         ` Rick Jones
2009-03-30 19:20           ` Eric Dumazet
2009-03-30 19:38           ` Jesper Dangaard Brouer
2009-03-30 19:54             ` Eric Dumazet
2009-03-30 20:34               ` Jesper Dangaard Brouer
2009-03-30 20:41                 ` Eric Dumazet
2009-03-30 21:25                   ` Jesper Dangaard Brouer
2009-03-30 22:44                   ` Rick Jones
2009-02-18 21:55     ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller
2009-02-18 23:23       ` Patrick McHardy
2009-02-18 23:35         ` Stephen Hemminger
2009-02-18  8:30 ` [RFT 0/4] Netfilter/iptables performance improvements Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090219140303.4329f860@extreme \
    --to=shemminger@vyatta.com \
    --cc=dada1@cosmosbay.com \
    --cc=davem@davemloft.net \
    --cc=kaber@trash.net \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=rick.jones2@hp.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.