[PATCH] aggressive early_drop and reserved conntrack entries

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] aggressive early_drop and reserved conntrack entries
@ 2004-12-09  8:34 Jozsef Kadlecsik
  2004-12-09  8:52 ` Patrick Schaaf
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-09  8:34 UTC (permalink / raw)
  To: netfilter-devel, Grzegorz Piotr Jaskiewicz

Hi,

The included patch addresses the following issues:

- When the conntrack table is full, we search only in a single hash
  bucket. We are in trouble anyway, so let's search harder for
  droppable entries: the patch extends the search to at most the third of
  all the buckets.
- If the conntrack table is full, the remote management of the machine
  becomes a little bit complicated :-). The patch adds the ability to
  reserve a given number of entries to be used for management connections.
  The following proc entries are added to /proc/sys/net/ipv4/netfilter:

  ip_conntrack_reserved		the number of reserved connections

  ip_conntrack_reserved_mark	mark value to match

  ip_conntrack_reserved_mask	mask to use at matching

  Example:

  # Let's reserve 3 conntrack entries
  echo 3 > /proc/sys/net/ipv4/netfilter/ip_conntrack_reserved
  # Set mark value; mask is left the default 0xffffffff
  echo 1 > /proc/sys/net/ipv4/netfilter/ip_conntrack_reserved_mark
  # Mark connection-initiating packets in raw table to use the reserved
  # entries when the table is full, i.e. ip_conntrack_count >=
  # ip_conntrack_max - ip_conntrack_reserved
  iptables -t raw -A PREROUTING -s <management station> -d <firewall> \
	  	  -p tcp --dport 22 -m mark --mark 1 -j ACCEPT

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/include/linux/sysctl.h linux-2.6.9-early_drop/include/linux/sysctl.h
--- linux-2.6.9-orig/include/linux/sysctl.h	2004-10-18 23:54:31.000000000 +0200
+++ linux-2.6.9-early_drop/include/linux/sysctl.h	2004-12-09 05:27:02.000000000 +0100
@@ -426,6 +426,9 @@
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25,
  	NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26,
 	NET_IPV4_NF_CONNTRACK_COUNT=27,
+	NET_IPV4_NF_CONNTRACK_RESERVED=28,
+	NET_IPV4_NF_CONNTRACK_RESERVED_MARK=29,
+	NET_IPV4_NF_CONNTRACK_RESERVED_MASK=30,
 };

 /* /proc/sys/net/ipv6 */
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c	2004-10-18 23:53:05.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c	2004-12-09 05:42:19.000000000 +0100
@@ -76,6 +76,11 @@
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;

+/* Number of reserved conntrack entries; mark value; mask */
+int ip_ct_reserved = 0;
+unsigned long ip_ct_reserved_mark = 0;
+unsigned long ip_ct_reserved_mask = 0xffffffff;
+
 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

 inline void
@@ -468,16 +473,32 @@
 	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
 }

-static int early_drop(struct list_head *chain)
+static int early_drop(struct sk_buff *skb, unsigned int hash)
 {
-	/* Traverse backwards: gives us oldest, which is roughly LRU */
-	struct ip_conntrack_tuple_hash *h;
-	int dropped = 0;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	int dropped = 1;
+	unsigned int i, bucket;
+
+	if (ip_ct_reserved == 0
+	    || atomic_read(&ip_conntrack_count) >= ip_conntrack_max)
+		goto not_reserved;

+	/* Let through reserved connections */
+	if ((skb->nfmark & ip_ct_reserved_mask) == ip_ct_reserved_mark)
+		return dropped;
+
+    not_reserved:
+    	dropped = 0;
 	READ_LOCK(&ip_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+	/* Try hard but not fully: magic number is three */
+	for (i = 0; h == NULL && i < ip_conntrack_htable_size/3; i++) {
+		bucket = (i + hash) % ip_conntrack_htable_size;
+		/* Traverse backwards: gives us oldest, which is roughly LRU */
+		h = LIST_FIND_B(&ip_conntrack_hash[bucket], unreplied,
+				struct ip_conntrack_tuple_hash *);
+		if (h)
+			atomic_inc(&h->ctrack->ct_general.use);
+	}
 	READ_UNLOCK(&ip_conntrack_lock);

 	if (!h)
@@ -525,9 +546,10 @@
 	hash = hash_conntrack(tuple);

 	if (ip_conntrack_max
-	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
-		/* Try dropping from this hash chain. */
-		if (!early_drop(&ip_conntrack_hash[hash])) {
+	    && atomic_read(&ip_conntrack_count)
+	       >= (ip_conntrack_max - ip_ct_reserved)) {
+		/* Try dropping starting from this hash chain. */
+		if (!early_drop(skb, hash)) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "ip_conntrack: table full, dropping"
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_standalone.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-10-18 23:54:07.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-12-09 05:47:04.000000000 +0100
@@ -481,6 +481,14 @@
 /* From ip_conntrack_core.c */
 extern int ip_conntrack_max;
 extern unsigned int ip_conntrack_htable_size;
+extern int ip_ct_reserved;
+extern unsigned long ip_ct_reserved_mask;
+extern unsigned long ip_ct_reserved_mark;
+
+/* Static boundaries */
+static int reserved_min = 0;
+static int conntrack_max = INT_MAX;
+

 /* From ip_conntrack_proto_tcp.c */
 extern unsigned long ip_ct_tcp_timeout_syn_sent;
@@ -519,7 +527,9 @@
 		.data		= &ip_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &ip_ct_reserved,
+		.extra2		= &conntrack_max,
 	},
 	{
 		.ctl_name	= NET_IPV4_NF_CONNTRACK_COUNT,
@@ -676,6 +686,32 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_RESERVED,
+		.procname	= "ip_conntrack_reserved",
+		.data		= &ip_ct_reserved,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &reserved_min,
+		.extra2		= &ip_conntrack_max,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_RESERVED_MARK,
+		.procname	= "ip_conntrack_reserved_mark",
+		.data		= &ip_ct_reserved_mark,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_RESERVED_MASK,
+		.procname	= "ip_conntrack_reserved_mask",
+		.data		= &ip_ct_reserved_mask,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+	},
 	{ .ctl_name = 0 }
 };

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09  8:34 [PATCH] aggressive early_drop and reserved conntrack entries Jozsef Kadlecsik
@ 2004-12-09  8:52 ` Patrick Schaaf
  2004-12-09 10:34   ` Jozsef Kadlecsik
  2004-12-09 12:25 ` Grzegorz Piotr Jaskiewicz
  2004-12-16 12:31 ` Harald Welte
  2 siblings, 1 reply; 15+ messages in thread
From: Patrick Schaaf @ 2004-12-09  8:52 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel, Grzegorz Piotr Jaskiewicz

Hi,

> The included patch addresses the following issues:
> 
> - When the conntrack table is full, we search only in a single hash
>   bucket. We are in trouble anyway, so let's search harder for
>   droppable entries: the patch extends the search to at most the third of
>   all the buckets.

Hmm. It's correct that we are in trouble anyway, but will it help burning
much more CPU to get out of trouble?

Looking for alternatives, I note that early_drop will only consider
unreplied connections for reaping. In a normal setup, only a small
number of connections will be unreplied, AND each connection will
make at most one transition from unreplied to assured.

This suggests, to me, that we keep unreplied connections on a new,
additional list. They are put there at the HEAD upon creation,
they are removed from the list when they make their transition
to assured. And early_drop becomes a simple, O(1) operation:
reap the connection which is at the TAIL of this new list.

Of course, it's a tradeoff between burning (lots of) CPU when under
pressure, vs. two list operations per connection for each and every
connection.

best regards
  Patrick

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09  8:52 ` Patrick Schaaf
@ 2004-12-09 10:34   ` Jozsef Kadlecsik
  2004-12-09 11:29     ` Patrick Schaaf
  2004-12-10 22:27     ` Jozsef Kadlecsik
  0 siblings, 2 replies; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-09 10:34 UTC (permalink / raw)
  To: Patrick Schaaf; +Cc: netfilter-devel, Grzegorz Piotr Jaskiewicz

Hi,

On Thu, 9 Dec 2004, Patrick Schaaf wrote:

> > - When the conntrack table is full, we search only in a single hash
> >   bucket. We are in trouble anyway, so let's search harder for
> >   droppable entries: the patch extends the search to at most the third of
> >   all the buckets.
>
> Hmm. It's correct that we are in trouble anyway, but will it help burning
> much more CPU to get out of trouble?

How could we lessen the trouble we are in? By refusing to add the new
connection to the table after failing to find an unreplied connection
in one bucket, or searching more with the price of spinning the CPU a
little further?

> Looking for alternatives, I note that early_drop will only consider
> unreplied connections for reaping. In a normal setup, only a small
> number of connections will be unreplied, AND each connection will
> make at most one transition from unreplied to assured.
>
> This suggest, to me, that we keep unreplied connections on a new,
> additional list. They are put there at the HEAD upon creation,
> they are removed form the list when they make their transition
> to assured. And early_drop becomes a simple, O(1) operation:
> reap the connection which is at the TAIL of this new list.
>
> Of course, it's a tradeoff between burning (lots of) CPU when under
> pressure, vs. two list operations per connection for each and every
> connection.

Plus another struct list_head element in struct ip_conntrack.

But I like it! Hmm, expect some new code soon...

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09 10:34   ` Jozsef Kadlecsik
@ 2004-12-09 11:29     ` Patrick Schaaf
  2004-12-10 22:27     ` Jozsef Kadlecsik
  1 sibling, 0 replies; 15+ messages in thread
From: Patrick Schaaf @ 2004-12-09 11:29 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

Hi Jozsef,

> > > - When the conntrack table is full, we search only in a single hash
> > >   bucket. We are in trouble anyway, so let's search harder for
> > >   droppable entries: the patch extends the search to at most the third of
> > >   all the buckets.
> >
> > Hmm. It's correct that we are in trouble anyway, but will it help burning
> > much more CPU to get out of trouble?
> 
> How could we lessen the trouble we are in? By refusing to add the new
> connection to the table after failing to find an unreplied connection
> in one bucket, or searching more with the price of spinning the CPU a
> little further?

Well, the way I see it, the primary task, under pressure, is still to
run ASSURED connections as good as possible. Burning more CPU in
early_drop for each new potential connection (at possibly high rate,
when under a real DoS attempt), will take significant time from routing
ASSURED connection's packets.

best regards
  Patrick

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09  8:34 [PATCH] aggressive early_drop and reserved conntrack entries Jozsef Kadlecsik
  2004-12-09  8:52 ` Patrick Schaaf
@ 2004-12-09 12:25 ` Grzegorz Piotr Jaskiewicz
  2004-12-09 13:21   ` Jozsef Kadlecsik
  2004-12-16 12:31 ` Harald Welte
  2 siblings, 1 reply; 15+ messages in thread
From: Grzegorz Piotr Jaskiewicz @ 2004-12-09 12:25 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel

Jozsef Kadlecsik wrote:
> Hi,
> 
> The included patch addresses the following issues:
> 
> - When the conntrack table is full, we search only in a single hash
>   bucket. We are in trouble anyway, so let's search harder for
>   droppable entries: the patch extends the search to at most the third of
>   all the buckets.
> - If the conntrack table is full, the remote management of the machine
>   becomes a little bit complicated :-). The patch adds the ability to
>   reserve a given number of entries to be used for management connections.
>   The following proc entries are added to /proc/sys/net/ipv4/netfilter:

That is a good idea, but aside from that I think that we need some kind of
'garbage collector' that is going to remove the oldest connections from
the hash to make space for new ones. This sounds a bit more
complicated, I know; maybe someone has a better idea about it.
But to be honest letting someone to manage computer remotely is the one 
thing, and letting system to solve the problem on its own is another.
Now that you can get in, tell me what you can do ?
You can resize hash table size for instance, but so can netfilter on its 
own in case hash is filled to the brim.
So there are 2 ideas: either let it resize the hash table by some value (but
that would have its maximum too), or forget the oldest connections
and spare memory for new ones.

-- 
GJ

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09 12:25 ` Grzegorz Piotr Jaskiewicz
@ 2004-12-09 13:21   ` Jozsef Kadlecsik
  0 siblings, 0 replies; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-09 13:21 UTC (permalink / raw)
  To: Grzegorz Piotr Jaskiewicz; +Cc: netfilter-devel

On Thu, 9 Dec 2004, Grzegorz Piotr Jaskiewicz wrote:

> That is a good idea, but asside that I think that we need some kind of
> 'grabage collector' that is going to remove the oldests connections from
> the hash to make space for those new.

We have already got such garbage collection, which is simple, clean and
natural: timeout.

> This sounds a bit more
> complicated, I know, maybe someone has a better idea about it.
> But to be honest letting someone to manage computer remotely is the one
> thing, and letting system to solve the problem on its own is another.
> Now that you can get in, tell me what you can do ?
> You can resize hash table size for instance, but so can netfilter on its
> own in case hash is filled to the brim.

I don't really believe in resizing in the case of conntrack. The admin
knows how much memory is available in the machine and should better load
in the conntrack module with the most appropriate hash size value.
Resizing *is* expensive and even more so when the machine is just under an
attack.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09 10:34   ` Jozsef Kadlecsik
  2004-12-09 11:29     ` Patrick Schaaf
@ 2004-12-10 22:27     ` Jozsef Kadlecsik
  2004-12-11 13:34       ` Martin Josefsson
  1 sibling, 1 reply; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-10 22:27 UTC (permalink / raw)
  To: Patrick Schaaf; +Cc: netfilter-devel, Grzegorz Piotr Jaskiewicz

[-- Attachment #1: Type: TEXT/PLAIN, Size: 949 bytes --]

On Thu, 9 Dec 2004, Jozsef Kadlecsik wrote:

> On Thu, 9 Dec 2004, Patrick Schaaf wrote:
>
> > This suggest, to me, that we keep unreplied connections on a new,
> > additional list. They are put there at the HEAD upon creation,
> > they are removed form the list when they make their transition
> > to assured. And early_drop becomes a simple, O(1) operation:
> > reap the connection which is at the TAIL of this new list.
>
> But I like it! Hmm, expect some new code soon...

Attached (;-) is the new patch, which implements the list of unassured
connections. (Reserving conntracks is dropped completely as unnecessary.)
I tested it slightly and it seems to work fine. What do you think about it?

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

[-- Attachment #2: early_drop.patch3 --]
[-- Type: TEXT/PLAIN, Size: 9549 bytes --]

diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.9-early_drop/include/linux/netfilter_ipv4/ip_conntrack.h
--- linux-2.6.9-orig/include/linux/netfilter_ipv4/ip_conntrack.h	2004-10-18 23:55:21.000000000 +0200
+++ linux-2.6.9-early_drop/include/linux/netfilter_ipv4/ip_conntrack.h	2004-12-09 16:08:29.000000000 +0100
@@ -196,6 +196,9 @@
 	/* Helper, if any. */
 	struct ip_conntrack_helper *helper;
 
+	/* List of unassured connections */
+	struct list_head unassured;
+
 	/* Storage reserved for other modules: */
 	union ip_conntrack_proto proto;
 
@@ -260,7 +263,8 @@
 extern void ip_ct_refresh_acct(struct ip_conntrack *ct,
 			       enum ip_conntrack_info ctinfo,
 			       const struct sk_buff *skb,
-			       unsigned long extra_jiffies);
+			       unsigned long extra_jiffies,
+			       int set_assured);
 
 /* These are for NAT.  Icky. */
 /* Update TCP window tracking data when NAT mangles the packet */
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c	2004-10-18 23:53:05.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c	2004-12-10 21:32:46.000000000 +0100
@@ -66,6 +66,7 @@
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
+static LIST_HEAD(unassured_list);
 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
@@ -313,6 +314,10 @@
 		}
 		kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
 	}
+	/* Delete from the list of unassured conntracks */
+	if (!test_bit(IPS_ASSURED_BIT, &ct->status)
+	    && test_bit(IPS_CONFIRMED_BIT, &ct->status))
+		list_del(&ct->unassured);
 	WRITE_UNLOCK(&ip_conntrack_lock);
 
 	if (master)
@@ -436,6 +441,7 @@
 		add_timer(&ct->timeout);
 		atomic_inc(&ct->ct_general.use);
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
+		list_add_tail(&ct->unassured, &unassured_list);
 		WRITE_UNLOCK(&ip_conntrack_lock);
 		CONNTRACK_STAT_INC(insert);
 		return NF_ACCEPT;
@@ -461,34 +467,36 @@
 	return h != NULL;
 }
 
-/* There's a small race here where we may free a just-assured
-   connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+static int early_drop(void)
 {
-	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
-}
-
-static int early_drop(struct list_head *chain)
-{
-	/* Traverse backwards: gives us oldest, which is roughly LRU */
-	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct = NULL;
+	struct list_head *entry;
 	int dropped = 0;
 
+    	/* There's a small race here where we may free a just-assured
+	   connection.  Too bad: we're in trouble anyway. */
 	READ_LOCK(&ip_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+	__list_for_each(entry, &unassured_list) {
+		ct = list_entry(entry,
+			        struct ip_conntrack, unassured);
+		atomic_inc(&ct->ct_general.use);
+		break;
+	}
 	READ_UNLOCK(&ip_conntrack_lock);
 
-	if (!h)
+	if (!ct)
 		return dropped;
 
-	if (del_timer(&h->ctrack->timeout)) {
-		death_by_timeout((unsigned long)h->ctrack);
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
 		dropped = 1;
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "ip_conntrack: table full, dropping"
+			       " unassured connection.\n");
 		CONNTRACK_STAT_INC(early_drop);
 	}
-	ip_conntrack_put(h->ctrack);
+	ip_conntrack_put(ct);
 	return dropped;
 }
 
@@ -526,8 +534,8 @@
 
 	if (ip_conntrack_max
 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
-		/* Try dropping from this hash chain. */
-		if (!early_drop(&ip_conntrack_hash[hash])) {
+		/* Try to drop unassured connection. */
+		if (!early_drop()) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "ip_conntrack: table full, dropping"
@@ -564,6 +572,7 @@
 	conntrack->timeout.function = death_by_timeout;
 
 	INIT_LIST_HEAD(&conntrack->sibling_list);
+	INIT_LIST_HEAD(&conntrack->unassured);
 
 	WRITE_LOCK(&ip_conntrack_lock);
 	/* Need finding and deleting of expected ONLY if we win race */
@@ -1091,7 +1100,8 @@
 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
 		        enum ip_conntrack_info ctinfo,
 			const struct sk_buff *skb,
-			unsigned long extra_jiffies)
+			unsigned long extra_jiffies,
+			int set_assured)
 {
 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
@@ -1107,6 +1117,10 @@
 			add_timer(&ct->timeout);
 		}
 		ct_add_counters(ct, ctinfo, skb);
+		if (set_assured) {
+			set_bit(IPS_ASSURED_BIT, &ct->status);
+			list_del(&ct->unassured);
+		}
 		WRITE_UNLOCK(&ip_conntrack_lock);
 	}
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_generic.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c	2004-10-18 23:53:46.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_generic.c	2004-12-09 16:01:30.000000000 +0100
@@ -52,7 +52,8 @@
 		  const struct sk_buff *skb,
 		  enum ip_conntrack_info ctinfo)
 {
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout,
+			   CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY);
 	return NF_ACCEPT;
 }
 
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c	2004-10-18 23:53:06.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_icmp.c	2004-12-09 15:34:32.000000000 +0100
@@ -102,7 +102,7 @@
 			ct->timeout.function((unsigned long)ct);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
-		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
+		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout, 0);
 	}
 
 	return NF_ACCEPT;
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c	2004-10-18 23:55:07.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_sctp.c	2004-12-09 15:37:52.000000000 +0100
@@ -319,6 +319,7 @@
 	sctp_chunkhdr_t _sch, *sch;
 	u_int32_t offset, count;
 	char map[256 / sizeof (char)] = {0};
+	int set_assured = 0;
 
 	DEBUGP(__FUNCTION__);
 	DEBUGP("\n");
@@ -408,14 +409,14 @@
 		WRITE_UNLOCK(&sctp_lock);
 	}
 
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
-
 	if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
 		&& CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
 		&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
 		DEBUGP("Setting assured bit\n");
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		set_assured = 1;
 	}
+	ip_ct_refresh_acct(conntrack, ctinfo, skb,
+			   *sctp_timeouts[newconntrack], set_assured);
 
 	return NF_ACCEPT;
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c	2004-10-18 23:55:29.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_tcp.c	2004-12-09 15:36:36.000000000 +0100
@@ -834,6 +834,7 @@
 	struct tcphdr *th, _tcph;
 	unsigned long timeout;
 	unsigned int index;
+	int set_assured = 0;
 	
 	th = skb_header_pointer(skb, iph->ihl * 4,
 				sizeof(_tcph), &_tcph);
@@ -962,9 +963,9 @@
 		/* Set ASSURED if we see see valid ack in ESTABLISHED 
 		   after SYN_RECV or a valid answer for a picked up 
 		   connection. */
-			set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		set_assured = 1;
 	}
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout, set_assured);
 
 	return NF_ACCEPT;
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_udp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c	2004-10-18 23:53:05.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_udp.c	2004-12-09 15:35:16.000000000 +0100
@@ -70,12 +70,12 @@
 	/* If we've seen traffic both ways, this is some kind of UDP
 	   stream.  Extend timeout. */
 	if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
-		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
-				   ip_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
+				   ip_ct_udp_timeout_stream, 1);
 	} else
-		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
+		ip_ct_refresh_acct(conntrack, ctinfo, skb,
+				   ip_ct_udp_timeout, 0);
 
 	return NF_ACCEPT;
 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-10 22:27     ` Jozsef Kadlecsik
@ 2004-12-11 13:34       ` Martin Josefsson
  2004-12-11 13:39         ` Martin Josefsson
  2004-12-11 16:56         ` Jozsef Kadlecsik
  0 siblings, 2 replies; 15+ messages in thread
From: Martin Josefsson @ 2004-12-11 13:34 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

[-- Attachment #1: Type: text/plain, Size: 3837 bytes --]

On Fri, 2004-12-10 at 23:27, Jozsef Kadlecsik wrote:
> On Thu, 9 Dec 2004, Jozsef Kadlecsik wrote:
> 
> > On Thu, 9 Dec 2004, Patrick Schaaf wrote:
> >
> > > This suggest, to me, that we keep unreplied connections on a new,
> > > additional list. They are put there at the HEAD upon creation,
> > > they are removed form the list when they make their transition
> > > to assured. And early_drop becomes a simple, O(1) operation:
> > > reap the connection which is at the TAIL of this new list.
> >
> > But I like it! Hmm, expect some new code soon...
> 
> Attached (;-) is the new patch, which implements the list of unassured
> connections. (Reserving conntracks is dropped completely as unnecessary.)
> I tested it slighgtly and seems to work fine. What do you think about it?

I've been thinking about this as well, but mostly I've been thinking
about how to get it to scale when we go for more finegrained locking.
The locking is going to be nasty.

The patch looks pretty good, just a few things...

@@ -461,34 +467,36 @@
 	return h != NULL;
 }
 
-/* There's a small race here where we may free a just-assured
-   connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+static int early_drop(void)
 {
-	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
-}
-
-static int early_drop(struct list_head *chain)
-{
-	/* Traverse backwards: gives us oldest, which is roughly LRU */
-	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct = NULL;
+	struct list_head *entry;
 	int dropped = 0;
 
+    	/* There's a small race here where we may free a just-assured
+	   connection.  Too bad: we're in trouble anyway. */
 	READ_LOCK(&ip_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+	__list_for_each(entry, &unassured_list) {
+		ct = list_entry(entry,
+			        struct ip_conntrack, unassured);
+		atomic_inc(&ct->ct_general.use);
+		break;
+	}
 	READ_UNLOCK(&ip_conntrack_lock);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Why not loop and kill multiple entries each time? Saves some locking and
cache. But in order to do that in a good way we need a counter of how
many entries we have in the unassured list. But we don't want to kill
too many each time, otherwise almost no real connections will get through.

-	if (!h)
+	if (!ct)
 		return dropped;
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
I think we are pretty much guaranteed to have a valid ct here, otherwise
we would oops above :)
 
-	if (del_timer(&h->ctrack->timeout)) {
-		death_by_timeout((unsigned long)h->ctrack);
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
 		dropped = 1;
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "ip_conntrack: table full, dropping"
+			       " unassured connection.\n");
 		CONNTRACK_STAT_INC(early_drop);
 	}
-	ip_conntrack_put(h->ctrack);
+	ip_conntrack_put(ct);
 	return dropped;
 }
 
@@ -1091,7 +1100,8 @@
 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
 		        enum ip_conntrack_info ctinfo,
 			const struct sk_buff *skb,
-			unsigned long extra_jiffies)
+			unsigned long extra_jiffies,
+			int set_assured)
 {
 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
@@ -1107,6 +1117,10 @@
 			add_timer(&ct->timeout);
 		}
 		ct_add_counters(ct, ctinfo, skb);
+		if (set_assured) {
+			set_bit(IPS_ASSURED_BIT, &ct->status);
+			list_del(&ct->unassured);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
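Huh? No check to see whether we are already assured or not?
Not needed for icmp, tcp or sctp, but udp and the generic handler do need it
(they pass set_assured on every reply packet, so list_del would be called
again on an entry that has already been removed from the list).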

-- 
/Martin

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-11 13:34       ` Martin Josefsson
@ 2004-12-11 13:39         ` Martin Josefsson
  2004-12-11 16:56         ` Jozsef Kadlecsik
  1 sibling, 0 replies; 15+ messages in thread
From: Martin Josefsson @ 2004-12-11 13:39 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

[-- Attachment #1: Type: text/plain, Size: 399 bytes --]

On Sat, 2004-12-11 at 14:34, Martin Josefsson wrote:

Argh

> -	if (!h)
> +	if (!ct)
>  		return dropped;
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> I think we are pretty much guranteed to have a valid ct here, otherwise
> we would oops above :)

Forget that :) Didn't think about the case where we don't have any
unassured connections.

-- 
/Martin

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-11 13:34       ` Martin Josefsson
  2004-12-11 13:39         ` Martin Josefsson
@ 2004-12-11 16:56         ` Jozsef Kadlecsik
  2004-12-12 11:40           ` Henrik Nordstrom
  2004-12-13 12:14           ` Jozsef Kadlecsik
  1 sibling, 2 replies; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-11 16:56 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

Hi Martin,

On Sat, 11 Dec 2004, Martin Josefsson wrote:

> > Attached (;-) is the new patch, which implements the list of unassured
> > connections. (Reserving conntracks is dropped completely as unnecessary.)
> > I tested it slightly and it seems to work fine. What do you think about it?
>
> I've been thinking about this as well, but mostly I've been thinking
> about how to get it to scale when we go for more finegrained locking.
> The locking is going to be nasty.

It's already a little bit messy. I strongly believe, we must revise
reference counting in order to make locking more straightforward (and to
be able to introduce say per bucket locking at all). The
conntrack_arefcount patch tries to step ahead in that direction.

> Why not loop and kill multiple entries each time? Saves some locking and
> cache. But in order to do that in a good way we need a counter of how
> many entries we have in the unassured list. But we don't want to kill
> too many each time, then almost no real connections will get through.

An atomic counter could help: this is just a proof of concept code :-)

> @@ -1107,6 +1117,10 @@
>  			add_timer(&ct->timeout);
>  		}
>  		ct_add_counters(ct, ctinfo, skb);
> +		if (set_assured) {
> +			set_bit(IPS_ASSURED_BIT, &ct->status);
> +			list_del(&ct->unassured);
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Huh? No check to see if we already are assured or not?
> Not needed for icmp, tcp or sctp but udp and the generic handler does.

Oops, absolutely correct, I tested the code with TCP only. Fortunately
it's easy to fix :-)

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-11 16:56         ` Jozsef Kadlecsik
@ 2004-12-12 11:40           ` Henrik Nordstrom
  2004-12-13 21:52             ` Jozsef Kadlecsik
  2004-12-13 12:14           ` Jozsef Kadlecsik
  1 sibling, 1 reply; 15+ messages in thread
From: Henrik Nordstrom @ 2004-12-12 11:40 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Netfilter-devel, Martin Josefsson, Patrick Schaaf,
	Grzegorz Piotr Jaskiewicz

On Sat, 11 Dec 2004, Jozsef Kadlecsik wrote:

> It's already a little bit messy. I strongly believe, we must revise
> reference counting in order to make locking more straightforward (and to
> be able to introduce say per bucket locking at all). The
> conntrack_arefcount patch tries to step ahead in that direction.

The patch adds yet another entity needing locking: The list of unassured 
connections. This list requires two write updates per new connection 
(append, delete).

With it being a linked list, fine-grained locking becomes a bit hard, and
to have FIFO semantics there are not many other choices. But fortunately
the operations needing this list locked are very short in time, so it should
be fine with a global lock on the unassured list, in addition to the
(fine-grained) conntrack locking, provided the unassured lock is acquired
last (atomic list insert, delete operations).

Note: RCU does not help us here.

Regards
Henrik

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-11 16:56         ` Jozsef Kadlecsik
  2004-12-12 11:40           ` Henrik Nordstrom
@ 2004-12-13 12:14           ` Jozsef Kadlecsik
  2004-12-13 13:25             ` Martin Josefsson
  1 sibling, 1 reply; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-13 12:14 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

[-- Attachment #1: Type: TEXT/PLAIN, Size: 993 bytes --]

On Sat, 11 Dec 2004, Jozsef Kadlecsik wrote:

> On Sat, 11 Dec 2004, Martin Josefsson wrote:
> > @@ -1107,6 +1117,10 @@
> >  			add_timer(&ct->timeout);
> >  		}
> >  		ct_add_counters(ct, ctinfo, skb);
> > +		if (set_assured) {
> > +			set_bit(IPS_ASSURED_BIT, &ct->status);
> > +			list_del(&ct->unassured);
> > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > Huh? No check to see if we already are assured or not?
> > Not needed for icmp, tcp or sctp but udp and the generic handler does.
>
> Oops, absolutely correct, I tested the code with TCP only. Fortunately
> it's easy to fix :-)

Attached is the patch which takes into account that the assured bit might
already be set (and thus we are no longer on the unassured list).

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

[-- Attachment #2: early_drop.patch4 --]
[-- Type: TEXT/PLAIN, Size: 9601 bytes --]

diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.9-early_drop/include/linux/netfilter_ipv4/ip_conntrack.h
--- linux-2.6.9-orig/include/linux/netfilter_ipv4/ip_conntrack.h	2004-10-18 23:55:21.000000000 +0200
+++ linux-2.6.9-early_drop/include/linux/netfilter_ipv4/ip_conntrack.h	2004-12-09 16:08:29.000000000 +0100
@@ -196,6 +196,9 @@
 	/* Helper, if any. */
 	struct ip_conntrack_helper *helper;
 
+	/* List of unassured connections */
+	struct list_head unassured;
+
 	/* Storage reserved for other modules: */
 	union ip_conntrack_proto proto;
 
@@ -260,7 +263,8 @@
 extern void ip_ct_refresh_acct(struct ip_conntrack *ct,
 			       enum ip_conntrack_info ctinfo,
 			       const struct sk_buff *skb,
-			       unsigned long extra_jiffies);
+			       unsigned long extra_jiffies,
+			       int set_assured);
 
 /* These are for NAT.  Icky. */
 /* Update TCP window tracking data when NAT mangles the packet */
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_core.c	2004-10-18 23:53:05.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_core.c	2004-12-12 13:20:33.000000000 +0100
@@ -66,6 +66,7 @@
 
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(ip_conntrack_expect_list);
+static LIST_HEAD(unassured_list);
 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
 static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
@@ -313,6 +314,10 @@
 		}
 		kmem_cache_free(ip_conntrack_expect_cachep, ct->master);
 	}
+	/* Delete from the list of unassured conntracks */
+	if (!test_bit(IPS_ASSURED_BIT, &ct->status)
+	    && test_bit(IPS_CONFIRMED_BIT, &ct->status))
+		list_del(&ct->unassured);
 	WRITE_UNLOCK(&ip_conntrack_lock);
 
 	if (master)
@@ -436,6 +441,7 @@
 		add_timer(&ct->timeout);
 		atomic_inc(&ct->ct_general.use);
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
+		list_add_tail(&ct->unassured, &unassured_list);
 		WRITE_UNLOCK(&ip_conntrack_lock);
 		CONNTRACK_STAT_INC(insert);
 		return NF_ACCEPT;
@@ -461,34 +467,36 @@
 	return h != NULL;
 }
 
-/* There's a small race here where we may free a just-assured
-   connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+static int early_drop(void)
 {
-	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
-}
-
-static int early_drop(struct list_head *chain)
-{
-	/* Traverse backwards: gives us oldest, which is roughly LRU */
-	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct = NULL;
+	struct list_head *entry;
 	int dropped = 0;
 
+    	/* There's a small race here where we may free a just-assured
+	   connection.  Too bad: we're in trouble anyway. */
 	READ_LOCK(&ip_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
+	__list_for_each(entry, &unassured_list) {
+		ct = list_entry(entry,
+			        struct ip_conntrack, unassured);
+		atomic_inc(&ct->ct_general.use);
+		break;
+	}
 	READ_UNLOCK(&ip_conntrack_lock);
 
-	if (!h)
+	if (!ct)
 		return dropped;
 
-	if (del_timer(&h->ctrack->timeout)) {
-		death_by_timeout((unsigned long)h->ctrack);
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
 		dropped = 1;
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "ip_conntrack: table full, dropping"
+			       " unassured connection.\n");
 		CONNTRACK_STAT_INC(early_drop);
 	}
-	ip_conntrack_put(h->ctrack);
+	ip_conntrack_put(ct);
 	return dropped;
 }
 
@@ -526,8 +534,8 @@
 
 	if (ip_conntrack_max
 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
-		/* Try dropping from this hash chain. */
-		if (!early_drop(&ip_conntrack_hash[hash])) {
+		/* Try to drop unassured connection. */
+		if (!early_drop()) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "ip_conntrack: table full, dropping"
@@ -564,6 +572,7 @@
 	conntrack->timeout.function = death_by_timeout;
 
 	INIT_LIST_HEAD(&conntrack->sibling_list);
+	INIT_LIST_HEAD(&conntrack->unassured);
 
 	WRITE_LOCK(&ip_conntrack_lock);
 	/* Need finding and deleting of expected ONLY if we win race */
@@ -1091,7 +1100,8 @@
 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
 		        enum ip_conntrack_info ctinfo,
 			const struct sk_buff *skb,
-			unsigned long extra_jiffies)
+			unsigned long extra_jiffies,
+			int set_assured)
 {
 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
@@ -1107,6 +1117,11 @@
 			add_timer(&ct->timeout);
 		}
 		ct_add_counters(ct, ctinfo, skb);
+		if (set_assured 
+		    && !test_bit(IPS_ASSURED_BIT, &ct->status)) {
+			set_bit(IPS_ASSURED_BIT, &ct->status);
+			list_del(&ct->unassured);
+		}
 		WRITE_UNLOCK(&ip_conntrack_lock);
 	}
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_generic.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c	2004-10-18 23:53:46.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_generic.c	2004-12-09 16:01:30.000000000 +0100
@@ -52,7 +52,8 @@
 		  const struct sk_buff *skb,
 		  enum ip_conntrack_info ctinfo)
 {
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout,
+			   CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY);
 	return NF_ACCEPT;
 }
 
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c	2004-10-18 23:53:06.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_icmp.c	2004-12-09 15:34:32.000000000 +0100
@@ -102,7 +102,7 @@
 			ct->timeout.function((unsigned long)ct);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
-		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
+		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout, 0);
 	}
 
 	return NF_ACCEPT;
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c	2004-10-18 23:55:07.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_sctp.c	2004-12-09 15:37:52.000000000 +0100
@@ -319,6 +319,7 @@
 	sctp_chunkhdr_t _sch, *sch;
 	u_int32_t offset, count;
 	char map[256 / sizeof (char)] = {0};
+	int set_assured = 0;
 
 	DEBUGP(__FUNCTION__);
 	DEBUGP("\n");
@@ -408,14 +409,14 @@
 		WRITE_UNLOCK(&sctp_lock);
 	}
 
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
-
 	if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
 		&& CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
 		&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
 		DEBUGP("Setting assured bit\n");
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		set_assured = 1;
 	}
+	ip_ct_refresh_acct(conntrack, ctinfo, skb,
+			   *sctp_timeouts[newconntrack], set_assured);
 
 	return NF_ACCEPT;
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c	2004-10-18 23:55:29.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_tcp.c	2004-12-09 15:36:36.000000000 +0100
@@ -834,6 +834,7 @@
 	struct tcphdr *th, _tcph;
 	unsigned long timeout;
 	unsigned int index;
+	int set_assured = 0;
 	
 	th = skb_header_pointer(skb, iph->ihl * 4,
 				sizeof(_tcph), &_tcph);
@@ -962,9 +963,9 @@
 		/* Set ASSURED if we see see valid ack in ESTABLISHED 
 		   after SYN_RECV or a valid answer for a picked up 
 		   connection. */
-			set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		set_assured = 1;
 	}
-	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout, set_assured);
 
 	return NF_ACCEPT;
 }
diff -urN --exclude-from=/usr/src/diff.exclude linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_udp.c
--- linux-2.6.9-orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c	2004-10-18 23:53:05.000000000 +0200
+++ linux-2.6.9-early_drop/net/ipv4/netfilter/ip_conntrack_proto_udp.c	2004-12-09 15:35:16.000000000 +0100
@@ -70,12 +70,12 @@
 	/* If we've seen traffic both ways, this is some kind of UDP
 	   stream.  Extend timeout. */
 	if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
-		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
-				   ip_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
+				   ip_ct_udp_timeout_stream, 1);
 	} else
-		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
+		ip_ct_refresh_acct(conntrack, ctinfo, skb,
+				   ip_ct_udp_timeout, 0);
 
 	return NF_ACCEPT;
 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-13 12:14           ` Jozsef Kadlecsik
@ 2004-12-13 13:25             ` Martin Josefsson
  0 siblings, 0 replies; 15+ messages in thread
From: Martin Josefsson @ 2004-12-13 13:25 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Netfilter-devel, Patrick Schaaf, Grzegorz Piotr Jaskiewicz

On Mon, 13 Dec 2004, Jozsef Kadlecsik wrote:

> > > Huh? No check to see if we already are assured or not?
> > > Not needed for icmp, tcp or sctp but udp and the generic handler does.
> >
> > Oops, absolutely correct, I tested the code with TCP only. Fortunately
> > it's easy to fix :-)
>
> Attached is the patch which takes into account that the assured bit might
> already be set (and thus we are no longer on the unassured list).

I'll take this for a testdrive during a DoS on a testmachine here later
this evening. I doubt it makes any difference compared to the old one when
you have virtually no assured entries, just a truckload of unassured ones.
So it's a bit tricky to test in a lab, I need to set up that
sniffermachine on the real internet feed so I get a proper conntrack
table to test with.

/Martin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-12 11:40           ` Henrik Nordstrom
@ 2004-12-13 21:52             ` Jozsef Kadlecsik
  0 siblings, 0 replies; 15+ messages in thread
From: Jozsef Kadlecsik @ 2004-12-13 21:52 UTC (permalink / raw)
  To: Henrik Nordstrom
  Cc: Netfilter-devel, Martin Josefsson, Patrick Schaaf,
	Grzegorz Piotr Jaskiewicz

On Sun, 12 Dec 2004, Henrik Nordstrom wrote:

> On Sat, 11 Dec 2004, Jozsef Kadlecsik wrote:
>
> > It's already a little bit messy. I strongly believe, we must revise
> > reference counting in order to make locking more straightforward (and to
> > be able to introduce say per bucket locking at all). The
> > conntrack_arefcount patch tries to step ahead in that direction.
>
> The patch adds yet another entity needing locking: The list of unassured
> connections. This list requires two write updates per new connection
> (append, delete).

I just wanted to note that fine grained locking needs some preparation.

> With it being a linked list, fine-grained locking becomes a bit hard, and
> to have FIFO semantics there are not many other choices. But fortunately
> the operations needing this list locked are very short in time, so it should
> be fine with a global lock on the unassured list, in addition to the
> (fine-grained) conntrack locking, provided the unassured lock is acquired
> last (atomic list insert, delete operations).

With a little tweaking we could create a fast path for the new conntrack
entries:  in __ip_conntrack_confirm do not add them to the unassured_list
as long as we have, say, more than 5 percent of the conntrack entries free.
Of course when assuring/deleting a conntrack, we should still lock, check
and delete, unlock, but that is not the critical path.

> Note: RCU does not help us here.

Yes, no way to use RCU.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] aggressive early_drop and reserved conntrack entries
  2004-12-09  8:34 [PATCH] aggressive early_drop and reserved conntrack entries Jozsef Kadlecsik
  2004-12-09  8:52 ` Patrick Schaaf
  2004-12-09 12:25 ` Grzegorz Piotr Jaskiewicz
@ 2004-12-16 12:31 ` Harald Welte
  2 siblings, 0 replies; 15+ messages in thread
From: Harald Welte @ 2004-12-16 12:31 UTC (permalink / raw)
  To: Jozsef Kadlecsik; +Cc: netfilter-devel, Grzegorz Piotr Jaskiewicz

[-- Attachment #1: Type: text/plain, Size: 1655 bytes --]

On Thu, Dec 09, 2004 at 09:34:34AM +0100, Jozsef Kadlecsik wrote:
> Hi,
> 
> The included patch addresses the following issues:
> 
> - When the conntrack table is full, we search only in a single hash
>   bucket. We are in trouble anyway, so let's search harder for
>   droppable entries: the patch extends the search to at most the third of
>   all the buckets.

I don't think it's a good idea to increase cpu and especially cache
usage in such a (likely DoS) case.

> - If the conntrack table is full, the remote management of the machine
>   becomes a little bit complicated :-). 

That's why you put in a special rule into 'raw' to bypass conntrack for
administrative connections.  I think this is the best way to address the
problem, especially since we're talking about local sockets... and the
local tcp stack should behave just as conntrack itself and reject all
packets that don't match an existing connection.

So from my point of view, even the extra new list is not needed.

We really should think of decreasing complexity of conntrack, not
increasing it more.

And also, all of this new experimentation should definitely be done on
top of nf_conntrack.  I already dislike the amount of changes still
going into ip_conntrack at this time :(

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2004-12-16 12:31 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-12-09  8:34 [PATCH] aggressive early_drop and reserved conntrack entries Jozsef Kadlecsik
2004-12-09  8:52 ` Patrick Schaaf
2004-12-09 10:34   ` Jozsef Kadlecsik
2004-12-09 11:29     ` Patrick Schaaf
2004-12-10 22:27     ` Jozsef Kadlecsik
2004-12-11 13:34       ` Martin Josefsson
2004-12-11 13:39         ` Martin Josefsson
2004-12-11 16:56         ` Jozsef Kadlecsik
2004-12-12 11:40           ` Henrik Nordstrom
2004-12-13 21:52             ` Jozsef Kadlecsik
2004-12-13 12:14           ` Jozsef Kadlecsik
2004-12-13 13:25             ` Martin Josefsson
2004-12-09 12:25 ` Grzegorz Piotr Jaskiewicz
2004-12-09 13:21   ` Jozsef Kadlecsik
2004-12-16 12:31 ` Harald Welte

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.