[NETFILTER] ip_conntrack: configurable counter width, per-direction overflow events

 * Add CONFIG_IP_NF_CT_ACCT64, which switches ct_ctr_t (the type of the
   per-connection packet/byte counters) from 32 to 64 bits.
 * Replace IPCT_COUNTER_FILLING with four events, one per counter and
   direction: IPCT_ACCT_OFLOW_{PKTS,BYTES}_{ORIG,RPLY}.  Each *_RPLY bit is
   the matching *_ORIG bit shifted left by one, so the code selects the
   event for a direction with "IPCT_ACCT_OFLOW_xxx << dir".
 * __ip_ct_acct() does the accounting.  When the highest bit of a counter
   becomes set it clears that bit, sets IPS_COUNTER_OVERFLOW_BIT in
   ct->status and returns the matching event bit.
 * ctnetlink sends these events to the new NFNLGRP_CONNTRACK_CTR_OVERFLOW
   group.  For a counter that overflowed, the attribute carries
   CTR_HIGHEST_BIT (the amount userspace has to add) instead of the
   counter value.
 * ctnetlink_fill_info() is not event driven, so it passes an empty event
   mask to ctnetlink_dump_counters().

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -27,6 +27,8 @@ enum nfnetlink_groups {
 #define NFNLGRP_CONNTRACK_EXP_UPDATE	NFNLGRP_CONNTRACK_EXP_UPDATE
 	NFNLGRP_CONNTRACK_EXP_DESTROY,
 #define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
+	NFNLGRP_CONNTRACK_CTR_OVERFLOW,
+#define NFNLGRP_CONNTRACK_CTR_OVERFLOW	NFNLGRP_CONNTRACK_CTR_OVERFLOW
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -69,6 +69,10 @@ enum ip_conntrack_status {
 	/* Connection is dying (removed from lists), can not be unset. */
 	IPS_DYING_BIT = 9,
 	IPS_DYING = (1 << IPS_DYING_BIT),
+
+	/* Per connection counters have overflowed at least once */
+	IPS_COUNTER_OVERFLOW_BIT = 10,
+	IPS_COUNTER_OVERFLOW = (1 << IPS_COUNTER_OVERFLOW_BIT),
 };
 
 /* Connection tracking event bits */
@@ -119,10 +123,25 @@ enum ip_conntrack_events
 	IPCT_NATINFO = (1 << IPCT_NATINFO_BIT),
 
 	/* Counter highest bit has been set */
-	IPCT_COUNTER_FILLING_BIT = 11,
-	IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT),
+	IPCT_ACCT_OFLOW_PKTS_ORIG_BIT = 11,
+	IPCT_ACCT_OFLOW_PKTS_ORIG = (1 << IPCT_ACCT_OFLOW_PKTS_ORIG_BIT),
+#define IPCT_ACCT_OFLOW_PKTS IPCT_ACCT_OFLOW_PKTS_ORIG
+
+	IPCT_ACCT_OFLOW_PKTS_RPLY_BIT = 12,
+	IPCT_ACCT_OFLOW_PKTS_RPLY = (1 << IPCT_ACCT_OFLOW_PKTS_RPLY_BIT),
+
+	/* Counter highest bit has been set */
+	IPCT_ACCT_OFLOW_BYTES_ORIG_BIT = 13,
+	IPCT_ACCT_OFLOW_BYTES_ORIG = (1 << IPCT_ACCT_OFLOW_BYTES_ORIG_BIT),
+#define IPCT_ACCT_OFLOW_BYTES IPCT_ACCT_OFLOW_BYTES_ORIG
+
+	IPCT_ACCT_OFLOW_BYTES_RPLY_BIT = 14,
+	IPCT_ACCT_OFLOW_BYTES_RPLY = (1 << IPCT_ACCT_OFLOW_BYTES_RPLY_BIT),
 };
 
+#define IPCT_ACCT_OFLOW_BITS (IPCT_ACCT_OFLOW_BYTES_RPLY|IPCT_ACCT_OFLOW_BYTES_ORIG| \
+			      IPCT_ACCT_OFLOW_PKTS_RPLY|IPCT_ACCT_OFLOW_PKTS_ORIG)
+
 enum ip_conntrack_expect_events {
 	IPEXP_NEW_BIT = 0,
 	IPEXP_NEW = (1 << IPEXP_NEW_BIT),
@@ -194,11 +213,21 @@ do {									\
 #define IP_NF_ASSERT(x)
 #endif
 
+#ifdef CONFIG_IP_NF_CT_ACCT
+#define CTR_HIGHEST_BIT ((ct_ctr_t)1 << ((sizeof(ct_ctr_t)*8)-1))
+#ifdef CONFIG_IP_NF_CT_ACCT64
+typedef u_int64_t ct_ctr_t;
+#define ct_ctr_to_be(x)	cpu_to_be64(x)
+#else
+typedef u_int32_t ct_ctr_t;
+#define ct_ctr_to_be(x)	htonl(x)
+#endif /* CONFIG_IP_NF_CT_ACCT64 */
 struct ip_conntrack_counter
 {
-	u_int32_t packets;
-	u_int32_t bytes;
+	ct_ctr_t packets;
+	ct_ctr_t bytes;
 };
+#endif /* CONFIG_IP_NF_CT_ACCT */
 
 struct ip_conntrack_helper;
 
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -32,6 +32,21 @@ config IP_NF_CT_ACCT
 
 	  If unsure, say `N'.
 
+config IP_NF_CT_ACCT64
+	bool "Use 64bit counters for flow accounting"
+	depends on IP_NF_CT_ACCT
+	help
+	  The conntrack counters are 32bit by default, which makes them
+	  overflow at more than 4GB of traffic in one direction of a
+	  connection.  This is fine if you have a userspace accounting daemon
+	  that receives the counter overflow event messages.
+
+	  However, if you want to use the counters inside the kernel (e.g.
+	  by the connbytes match), you should select this option to increase
+	  their range to 64bits.
+
+	  If unsure, say `N'.
+
 config IP_NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
 	depends on IP_NF_CONNTRACK
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1112,6 +1112,34 @@ void ip_conntrack_helper_unregister(stru
 	synchronize_net();
 }
 
+static inline int __ip_ct_acct(struct ip_conntrack *ct,
+			       enum ip_conntrack_info ctinfo,
+			       const struct sk_buff *skb)
+{
+	unsigned int ret = 0;
+#ifdef CONFIG_IP_NF_CT_ACCT
+	unsigned int dir = CTINFO2DIR(ctinfo);
+
+	/* When the highest bit of a counter becomes set, clear it and keep
+	 * counting.  Userspace is told via the returned overflow event and
+	 * has to add that 'highest bit' to its own accounting. */
+	ct->counters[dir].packets++;
+	if (unlikely(ct->counters[dir].packets & CTR_HIGHEST_BIT)) {
+		set_bit(IPS_COUNTER_OVERFLOW_BIT, &ct->status);
+		ct->counters[dir].packets &= ~CTR_HIGHEST_BIT;
+		ret |= IPCT_ACCT_OFLOW_PKTS << dir;
+	}
+
+	ct->counters[dir].bytes += ntohs(skb->nh.iph->tot_len);
+	if (unlikely(ct->counters[dir].bytes & CTR_HIGHEST_BIT)) {
+		set_bit(IPS_COUNTER_OVERFLOW_BIT, &ct->status);
+		ct->counters[dir].bytes &= ~CTR_HIGHEST_BIT;
+		ret |= IPCT_ACCT_OFLOW_BYTES << dir;
+	}
+#endif
+	return ret;
+}
+
 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 			enum ip_conntrack_info ctinfo,
@@ -1139,16 +1167,8 @@ void __ip_ct_refresh_acct(struct ip_conn
 		}
 	}
 
-#ifdef CONFIG_IP_NF_CT_ACCT
-	if (do_acct) {
-		ct->counters[CTINFO2DIR(ctinfo)].packets++;
-		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
-						ntohs(skb->nh.iph->tot_len);
-		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
-		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
-			event |= IPCT_COUNTER_FILLING;
-	}
-#endif
+	if (do_acct)
+		event |= __ip_ct_acct(ct, ctinfo, skb);
 
 	write_unlock_bh(&ip_conntrack_lock);
 
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -168,20 +168,28 @@ nfattr_failure:
 	return -ENOMEM;
 }
 
-#ifdef CONFIG_IP_NF_CT_ACCT
 static inline int
 ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
-			enum ip_conntrack_dir dir)
+			enum ip_conntrack_dir dir, unsigned int events)
 {
+#ifdef CONFIG_IP_NF_CT_ACCT
 	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
 	struct nfattr *nest_count = NFA_NEST(skb, type);
-	u_int64_t tmp;
+	ct_ctr_t tmp;
 
-	tmp = htonl(ct->counters[dir].packets);
-	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
+	if (unlikely(events & (IPCT_ACCT_OFLOW_PKTS << dir)))
+		tmp = CTR_HIGHEST_BIT;
+	else
+		tmp = ct->counters[dir].packets;
+	tmp = ct_ctr_to_be(tmp);
+	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(ct_ctr_t), &tmp);
 
-	tmp = htonl(ct->counters[dir].bytes);
-	NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
+	if (unlikely(events & (IPCT_ACCT_OFLOW_BYTES << dir)))
+		tmp = CTR_HIGHEST_BIT;
+	else
+		tmp = ct->counters[dir].bytes;
+	tmp = ct_ctr_to_be(tmp);
+	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(ct_ctr_t), &tmp);
 
 	NFA_NEST_END(skb, nest_count);
 
@@ -189,10 +197,10 @@ ctnetlink_dump_counters(struct sk_buff *
 
 nfattr_failure:
 	return -ENOMEM;
-}
 #else
-#define ctnetlink_dump_counters(a, b, c) (0)
+	return 0;
 #endif
+}
 
 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
 static inline int
@@ -268,8 +276,8 @@ ctnetlink_fill_info(struct sk_buff *skb,
 
 	if (ctnetlink_dump_status(skb, ct) < 0 ||
 	    ctnetlink_dump_timeout(skb, ct) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, 0) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, 0) < 0 ||
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -325,6 +333,14 @@ static int ctnetlink_conntrack_event(str
 		group = NFNLGRP_CONNTRACK_UPDATE;
 		goto alloc_skb;
 	}
+	/* FIXME: what if we have a status update _and_ an overflow event
+	 * at the same time? */
+	if (events & IPCT_ACCT_OFLOW_BITS) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_CTR_OVERFLOW;
+		goto alloc_skb;
+	}
+
 
 	return NOTIFY_DONE;
 
@@ -370,8 +386,8 @@ alloc_skb:
 	    && ctnetlink_dump_helpinfo(skb, ct) < 0)
 		goto nfattr_failure;
 
-	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
-	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, events) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, events) < 0)
 		goto nfattr_failure;
 
 	nlh->nlmsg_len = skb->tail - b;