Re: [PATCH] use 32bit counters for connection-based accounting

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Harald Welte <laforge@netfilter.org>
To: Patrick Schaaf <bof@bof.de>
Cc: netfilter-devel@lists.netfilter.org,
	"David S. Miller" <davem@davemloft.net>,
	kaber@trash.net
Subject: Re: [PATCH] use 32bit counters for connection-based accounting
Date: Mon, 10 Oct 2005 23:36:57 +0200	[thread overview]
Message-ID: <20051010213657.GI5627@rama> (raw)
In-Reply-To: <20051010163844.GG9644@oknodo.bof.de>


[-- Attachment #1.1: Type: text/plain, Size: 1488 bytes --]

On Mon, Oct 10, 2005 at 06:38:45PM +0200, Patrick Schaaf wrote:
> > > I have one (ignorant) question left: when a certain byte counter
> > > for some direction/flow overflows, is the packet counter forced
> > > to overflow at the same time? Same question for when the packet
> > > counter overflows - does the associated byte counter wrap, too?
> > > (I intuit such would be desired behaviour. Wouldn't it?)
> > 
> > no, the counters will overflow independently - and we have no indication
> > that such an event occurred.
> 
> Could this be discussed a bit? My rationale is that sometimes
> all I'm interested in, is a quick evaluation of average packet
> sizes. With synchronized byte and packet counters, this is
> one divide away. When synchronization cannot be assumed, several
> samples MUST be taken, and differences calculated. Damn.

I'm fixing this up now, see the attached preliminary, untested and
not-yet-fully-finished patch.  As it seems something like this will
still have to go into 2.6.14, even though I feel really sorry about the
necessity of yet another patch in this area.

-- 
- Harald Welte <laforge@netfilter.org>                 http://netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #1.2: counters.diff --]
[-- Type: text/plain, Size: 8695 bytes --]

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -27,6 +27,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_CONNTRACK_EXP_UPDATE	NFNLGRP_CONNTRACK_EXP_UPDATE
 	NFNLGRP_CONNTRACK_EXP_DESTROY,
 #define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
+	NFNLGRP_CONNTRACK_CTR_OVERFLOW,
+#define NFNLGRP_CONNTRACK_CTR_OVERFLOW	NFNLGRP_CONNTRACK_CTR_OVERFLOW
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -69,6 +69,10 @@ enum ip_conntrack_status {
 	/* Connection is dying (removed from lists), can not be unset. */
 	IPS_DYING_BIT = 9,
 	IPS_DYING = (1 << IPS_DYING_BIT),
+
+	/* Per connection counters have overflowed at least once */
+	IPS_COUNTER_OVERFLOW_BIT = 10,
+	IPS_COUNTER_OVERFLOW = (1 << IPS_COUNTER_OVERFLOW_BIT),
 };
 
 /* Connection tracking event bits */
@@ -119,10 +123,25 @@ enum ip_conntrack_events
 	IPCT_NATINFO = (1 << IPCT_NATINFO_BIT),
 
 	/* Counter highest bit has been set */
-	IPCT_COUNTER_FILLING_BIT = 11,
-	IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT),
+	IPCT_ACCT_OFLOW_PKTS_ORIG_BIT = 11,
+	IPCT_ACCT_OFLOW_PKTS_ORIG = (1 << IPCT_ACCT_OFLOW_PKTS_ORIG_BIT),
+#define IPCT_ACCT_OFLOW_PKTS IPCT_ACCT_OFLOW_PKTS_ORIG
+
+	IPCT_ACCT_OFLOW_PKTS_RPLY_BIT = 12,
+	IPCT_ACCT_OFLOW_PKTS_RPLY = (1 << IPCT_ACCT_OFLOW_PKTS_RPLY_BIT),
+
+	/* Counter highest bit has been set */
+	IPCT_ACCT_OFLOW_BYTES_ORIG_BIT = 13,
+	IPCT_ACCT_OFLOW_BYTES_ORIG = (1 << IPCT_ACCT_OFLOW_BYTES_ORIG_BIT),
+#define IPCT_ACCT_OFLOW_BYTES IPCT_ACCT_OFLOW_BYTES_ORIG
+
+	IPCT_ACCT_OFLOW_BYTES_RPLY_BIT = 14,
+	IPCT_ACCT_OFLOW_BYTES_RPLY = (1 << IPCT_ACCT_OFLOW_BYTES_RPLY_BIT),
 };
 
+#define IPCT_ACCT_OFLOW_BITS (IPCT_ACCT_OFLOW_BYTES_RPLY|IPCT_ACCT_OFLOW_BYTES_ORIG \
+			      IPCT_ACCT_OFLOW_PKTS_RPLY|IPCT_ACCT_OFLOW_PKTS_ORIG)
+
 enum ip_conntrack_expect_events {
 	IPEXP_NEW_BIT = 0,
 	IPEXP_NEW = (1 << IPEXP_NEW_BIT),
@@ -194,11 +213,21 @@ do {									\
 #define IP_NF_ASSERT(x)
 #endif
 
+#ifdef CONFIG_IP_NF_CT_ACCT
+#define CTR_HIGHEST_BIT (1 << ((sizeof(ct_ctr_t)*8)-1))
+#ifdef CONFIG_IP_NF_CT_ACCT64
+typedef ct_ctr_t u_int64_t;
+#define ct_ctr_to_be(x) cpu_to_be64(x)
+#else
+typedef ct_ctr_t u_int32_t;
+#define ct_ctr_to_be(x) htonl(x)
+#endif /* CONFIG_IP_NF_CT_ACCT64 */
 struct ip_conntrack_counter
 {
-	u_int32_t packets;
-	u_int32_t bytes;
+	ct_ctr_t packets;
+	ct_ctr_t bytes;
 };
+#endif /* CONFIG_IP_NF_CT_ACCT */
 
 struct ip_conntrack_helper;
 
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -32,6 +32,21 @@ config IP_NF_CT_ACCT
 
 	  If unsure, say `N'.
 
+config IP_NF_CT_ACCT64
+	bool "Use 64bit counters for flow accounting"
+	depends on IP_NF_CT_ACCT
+	help
+	  The conntrack counters are 32bit by default, which makes them
+	  overflow at more than 4GB of traffic in one direction of a
+	  connection.  This is fine if you have a userspace accounting daemon
+	  that receives the COUNTER_FILLING event message.
+	  
+	  However, if you want to use the counters inside the kernel (e.g.
+	  by the connbytes match), you should select this option to increase
+	  their range to 64bits.
+
+	  If unsure, say `N'.
+
 config IP_NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
 	depends on IP_NF_CONNTRACK
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1112,6 +1112,35 @@ void ip_conntrack_helper_unregister(stru
 	synchronize_net();
 }
 
+static inline int __ip_ct_acct(struct ip_conntrack *ct,
+				enum ip_conntrack_info ctinfo,
+				const struct sk_buff *skb)
+{
+	unsigned int ret = 0;
+#ifdef CONFIG_IP_NF_CT_ACCT
+	unsigned int dir = CTINFO2DIR(ctinfo);
+	ct_ctr_t packets, bytes;
+
+	/* The idea is to just reset the highest bit and continue counting.
+	 * Userspace will then have to add that 'highest bit' to their
+	 * accounting system.  */
+	ct->counters[dir].packets++;
+	if (unlikely(ct->counters[dir].packets > CTR_HIGHEST_BIT)) {
+		set_bit(IPS_COUNTER_OVERFLOW, &ct->status);
+		ct->counters[dir].packets &= ~CTR_HIGHEST_BIT;
+		ret |= IPCT_ACCT_OFLOW_PKTS + dir;
+	}
+
+	ct->counters[dir].bytes += ntohs(skb->nh.iph->tot_len);
+	if (unlikely(ct->counters[dir].bytes > CTR_HIGHEST_BIT)) {
+		set_bit(IPS_COUNTER_OVERFLOW, &ct->status);
+		ct->counters[dir].bytes &= ~CTR_HIGHEST_BIT;
+		ret |= IPCT_ACCT_OFLOW_BYTES + dir;
+	}
+#endif
+	return ret;
+}
+
 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
 void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
 		        enum ip_conntrack_info ctinfo,
@@ -1139,16 +1168,8 @@ void __ip_ct_refresh_acct(struct ip_conn
 		}
 	}
 
-#ifdef CONFIG_IP_NF_CT_ACCT
-	if (do_acct) {
-		ct->counters[CTINFO2DIR(ctinfo)].packets++;
-		ct->counters[CTINFO2DIR(ctinfo)].bytes += 
-						ntohs(skb->nh.iph->tot_len);
-		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
-		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
-			event |= IPCT_COUNTER_FILLING;
-	}
-#endif
+	if (do_acct)
+		event |= __ip_ct_acct(ct, ctinfo, skb);
 
 	write_unlock_bh(&ip_conntrack_lock);
 
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -168,20 +168,28 @@ nfattr_failure:
 	return -ENOMEM;
 }
 
-#ifdef CONFIG_IP_NF_CT_ACCT
 static inline int
 ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
-			enum ip_conntrack_dir dir)
+			enum ip_conntrack_dir dir, unsigned int events)
 {
+#ifdef CONFIG_IP_NF_CT_ACCT
 	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
 	struct nfattr *nest_count = NFA_NEST(skb, type);
-	u_int64_t tmp;
+	ct_ctr_t tmp;
 
-	tmp = htonl(ct->counters[dir].packets);
-	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+	if (unlikely(events & (IPCT_ACCT_OFLOW_PKTS + dir)))
+		tmp = CT_CTR_HIGHEST_BIT;
+	else
+		tmp = ct->counters[dir].packets;
+	tmp = ct_ctr_to_be(tmp);
+	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(ct_ctr_t), &tmp);
 
-	tmp = htonl(ct->counters[dir].bytes);
-	NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+	if (unlikely(events & (IPCT_ACCT_OFLOW_BYTES + dir)))
+		tmp = CT_CTR_HIGHEST_BIT;
+	else
+		tmp = ct->counters[dir].bytes;
+	tmp = ct_ctr_to_be(tmp);
+	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(ct_ctr_t), &tmp);
 
 	NFA_NEST_END(skb, nest_count);
 
@@ -189,10 +197,10 @@ ctnetlink_dump_counters(struct sk_buff *
 
 nfattr_failure:
 	return -ENOMEM;
-}
 #else
-#define ctnetlink_dump_counters(a, b, c) (0)
+	return 0;
 #endif
+}
 
 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
 static inline int
@@ -268,8 +276,8 @@ ctnetlink_fill_info(struct sk_buff *skb,
 
 	if (ctnetlink_dump_status(skb, ct) < 0 ||
 	    ctnetlink_dump_timeout(skb, ct) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, event) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, event) < 0 ||
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -325,6 +333,14 @@ static int ctnetlink_conntrack_event(str
 		group = NFNLGRP_CONNTRACK_UPDATE;
 		goto alloc_skb;
 	} 
+	/* FIXME: what if we have a status update _and_ an overflow event
+	 * at the same time? */
+	if (events & IPCT_ACCT_OFLOW_BITS) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_CTR_OVERFLOW;
+		goto alloc_skb;
+	}
+
 	
 	return NOTIFY_DONE;
 
@@ -370,8 +386,8 @@ alloc_skb:
 	    && ctnetlink_dump_helpinfo(skb, ct) < 0)
 		goto nfattr_failure;
 
-	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, events) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, events) < 0)
 		goto nfattr_failure;
 
 	nlh->nlmsg_len = skb->tail - b;

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

next prev parent reply	other threads:[~2005-10-10 21:36 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-10-07 10:54 [PATCH] use 32bit counters for connection-based accounting Harald Welte
2005-10-07 20:26 ` David S. Miller
2005-10-07 22:04   ` Harald Welte
2005-10-10  4:45     ` David S. Miller
2005-10-10  5:16     ` Patrick Schaaf
2005-10-10  8:30       ` Harald Welte
2005-10-10  8:59         ` Patrick Schaaf
2005-10-10  9:47           ` Harald Welte
2005-10-10 16:38             ` Patrick Schaaf
2005-10-10 21:36               ` Harald Welte [this message]
2005-10-10 14:00           ` Amin Azez

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20051010213657.GI5627@rama \
    --to=laforge@netfilter.org \
    --cc=bof@bof.de \
    --cc=davem@davemloft.net \
    --cc=kaber@trash.net \
    --cc=netfilter-devel@lists.netfilter.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.