public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Pablo Neira Ayuso <pablo@netfilter.org>
To: Ahmed Zaki <anzaki@gmail.com>
Cc: netfilter-devel@vger.kernel.org, andrew@lunn.ch,
	olteanv@gmail.com, fw@strlen.de, kuba@kernel.org,
	pabeni@redhat.com, edumazet@google.com, coreteam@netfilter.org,
	netdev@vger.kernel.org
Subject: Re: [PATCH nf-next v2 1/2] netfilter: flowtable: update netdev stats with HW_OFFLOAD flows
Date: Tue, 24 Mar 2026 22:28:23 +0100	[thread overview]
Message-ID: <acMB9xSTEmwly8QK@chamomile> (raw)
In-Reply-To: <20260324204016.2089193-2-anzaki@gmail.com>

On Tue, Mar 24, 2026 at 02:40:15PM -0600, Ahmed Zaki wrote:
> Some drivers (notably DSA) delegate the nft flowtable HW_OFFLOAD flows
> to a parent driver. While the parent driver is able to report the
> offloaded traffic stats directly from the HW, the delegating driver
> does not report the stats. This fails SNMP-based monitoring tools that
> rely on netdev stats to report the network traffic.
> 
> Add a new struct pcpu_sw_netstats "fstats" to net_device that gets
> allocated only if the new flag "flow_offload_via_parent" is set by the
> driver. The new stats are lazily allocated by the nft flow offloading
> code when the first flow is offloaded. The stats are updated periodically
> in flow_offload_work_stats() and also once in flow_offload_work_del()
> before the flow is deleted. For this, flow_offload_work_del() had to
> be moved below flow_offload_tuple_stats().

Hm, I think v1 was a simpler approach... except that you unnecessarily
modified a lot of callsites as Jakub pointed out. I don't see why you
need this new callback for netdev_ops.

> Signed-off-by: Ahmed Zaki <anzaki@gmail.com>
> ---
>  include/linux/netdevice.h             | 45 ++++++++++++
>  net/core/dev.c                        |  8 +++
>  net/netfilter/nf_flow_table_offload.c | 98 +++++++++++++++++++++++++--
>  3 files changed, 145 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 67e25f6d15a4..647758f78213 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1840,6 +1840,11 @@ enum netdev_reg_state {
>   *	@stats:		Statistics struct, which was left as a legacy, use
>   *			rtnl_link_stats64 instead
>   *
> + *	@fstats:	HW offloaded flow statistics: RX/TX packets,
> + *			RX/TX bytes. Lazily allocated by the flow offload
> + *			path on the first offloaded flow for devices that
> + *			set @flow_offload_via_parent. Freed by free_netdev().
> + *
>   *	@core_stats:	core networking counters,
>   *			do not use this in drivers
>   *	@carrier_up_count:	Number of times the carrier has been up
> @@ -2048,6 +2053,12 @@ enum netdev_reg_state {
>   *	@change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
>   *	@netns_immutable: interface can't change network namespaces
>   *	@fcoe_mtu:	device supports maximum FCoE MTU, 2158 bytes
> + *	@flow_offload_via_parent: device delegates nft flowtable hardware
> + *				  offload to a parent/conduit device (e.g. DSA
> + *				  user ports delegate to their conduit MAC).
> + *				  The parent's HW counts the offloaded traffic
> + *				  but this device's sw netstats path does not.
> + *				  @fstats is allocated to fill that gap.
>   *
>   *	@net_notifier_list:	List of per-net netdev notifier block
>   *				that follow this device when it is moved
> @@ -2233,6 +2244,7 @@ struct net_device {
>  
>  	struct net_device_stats	stats; /* not used by modern drivers */
>  
> +	struct pcpu_sw_netstats __percpu *fstats;
>  	struct net_device_core_stats __percpu *core_stats;
>  
>  	/* Stats to monitor link on/off, flapping */
> @@ -2463,6 +2475,7 @@ struct net_device {
>  	unsigned long		change_proto_down:1;
>  	unsigned long		netns_immutable:1;
>  	unsigned long		fcoe_mtu:1;
> +	unsigned long		flow_offload_via_parent:1;
>  
>  	struct list_head	net_notifier_list;
>  
> @@ -2992,6 +3005,38 @@ struct pcpu_lstats {
>  
>  void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);
>  
> +static inline void dev_fstats_rx_add(struct net_device *dev,
> +				     unsigned int packets,
> +				     unsigned int len)
> +{
> +	struct pcpu_sw_netstats *fstats;
> +
> +	if (!dev->fstats)
> +		return;
> +
> +	fstats = this_cpu_ptr(dev->fstats);
> +	u64_stats_update_begin(&fstats->syncp);
> +	u64_stats_add(&fstats->rx_bytes, len);
> +	u64_stats_add(&fstats->rx_packets, packets);
> +	u64_stats_update_end(&fstats->syncp);
> +}
> +
> +static inline void dev_fstats_tx_add(struct net_device *dev,
> +				     unsigned int packets,
> +				     unsigned int len)
> +{
> +	struct pcpu_sw_netstats *fstats;
> +
> +	if (!dev->fstats)
> +		return;
> +
> +	fstats = this_cpu_ptr(dev->fstats);
> +	u64_stats_update_begin(&fstats->syncp);
> +	u64_stats_add(&fstats->tx_bytes, len);
> +	u64_stats_add(&fstats->tx_packets, packets);
> +	u64_stats_update_end(&fstats->syncp);
> +}
> +
>  static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
>  {
>  	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index f48dc299e4b2..07fb315ad42c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -11865,6 +11865,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
>  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
>  	const struct net_device_core_stats __percpu *p;
> +	const struct pcpu_sw_netstats __percpu *fstats;
>  
>  	/*
>  	 * IPv{4,6} and udp tunnels share common stat helpers and use
> @@ -11893,6 +11894,11 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
>  		netdev_stats_to_stats64(storage, &dev->stats);
>  	}
>  
> +	/* This READ_ONCE() pairs with cmpxchg in flow_offload_fstats_ensure() */
> +	fstats = READ_ONCE(dev->fstats);
> +	if (fstats)
> +		dev_fetch_sw_netstats(storage, fstats);
> +
>  	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
>  	p = READ_ONCE(dev->core_stats);
>  	if (p) {
> @@ -12212,6 +12218,8 @@ void free_netdev(struct net_device *dev)
>  	free_percpu(dev->pcpu_refcnt);
>  	dev->pcpu_refcnt = NULL;
>  #endif
> +	free_percpu(dev->fstats);
> +	dev->fstats = NULL;
>  	free_percpu(dev->core_stats);
>  	dev->core_stats = NULL;
>  	free_percpu(dev->xdp_bulkq);
> diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
> index b2e4fb6fa011..fc1e67a79904 100644
> --- a/net/netfilter/nf_flow_table_offload.c
> +++ b/net/netfilter/nf_flow_table_offload.c
> @@ -925,13 +925,80 @@ static void flow_offload_work_add(struct flow_offload_work *offload)
>  	nf_flow_offload_destroy(flow_rule);
>  }
>  
> -static void flow_offload_work_del(struct flow_offload_work *offload)
> +static bool flow_offload_fstats_ensure(struct net_device *dev)
>  {
> -	clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
> -	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
> -	if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
> -		flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
> -	set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
> +	struct pcpu_sw_netstats __percpu *p;
> +
> +	if (!dev->flow_offload_via_parent)
> +		return false;
> +
> +	/* Pairs with cmpxchg() below. */
> +	if (likely(READ_ONCE(dev->fstats)))
> +		return true;
> +
> +	p = __netdev_alloc_pcpu_stats(struct pcpu_sw_netstats, GFP_ATOMIC);
> +	if (!p)
> +		return false;
> +
> +	if (cmpxchg(&dev->fstats, NULL, p))
> +		free_percpu(p);	/* lost the race, discard and use winner's */
> +
> +	return true;
> +}
> +
> +static u32 flow_offload_egress_ifidx(const struct flow_offload_tuple *tuple)
> +{
> +	switch (tuple->xmit_type) {
> +	case FLOW_OFFLOAD_XMIT_NEIGH:
> +		return tuple->ifidx;
> +	case FLOW_OFFLOAD_XMIT_DIRECT:
> +		return tuple->out.ifidx;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static void flow_offload_netdev_update(struct flow_offload_work *offload,
> +				       struct flow_stats *stats)
> +{
> +	const struct flow_offload_tuple *tuple;
> +	struct net_device *indev, *outdev;
> +	struct net *net;
> +
> +	rcu_read_lock();
> +	net = read_pnet(&offload->flowtable->net);
> +	if (stats[FLOW_OFFLOAD_DIR_ORIGINAL].pkts) {
> +		tuple = &offload->flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple;
> +		indev = dev_get_by_index_rcu(net, tuple->iifidx);
> +		if (indev && flow_offload_fstats_ensure(indev))
> +			dev_fstats_rx_add(indev,
> +					  stats[FLOW_OFFLOAD_DIR_ORIGINAL].pkts,
> +					  stats[FLOW_OFFLOAD_DIR_ORIGINAL].bytes);
> +
> +		outdev = dev_get_by_index_rcu(net,
> +					      flow_offload_egress_ifidx(tuple));
> +		if (outdev && flow_offload_fstats_ensure(outdev))
> +			dev_fstats_tx_add(outdev,
> +					  stats[FLOW_OFFLOAD_DIR_ORIGINAL].pkts,
> +					  stats[FLOW_OFFLOAD_DIR_ORIGINAL].bytes);
> +	}
> +
> +	if (stats[FLOW_OFFLOAD_DIR_REPLY].pkts) {
> +		tuple = &offload->flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple;
> +		indev = dev_get_by_index_rcu(net, tuple->iifidx);
> +		if (indev && flow_offload_fstats_ensure(indev))
> +			dev_fstats_rx_add(indev,
> +					  stats[FLOW_OFFLOAD_DIR_REPLY].pkts,
> +					  stats[FLOW_OFFLOAD_DIR_REPLY].bytes);
> +
> +		outdev = dev_get_by_index_rcu(net,
> +					      flow_offload_egress_ifidx(tuple));
> +		if (outdev && flow_offload_fstats_ensure(outdev))
> +			dev_fstats_tx_add(outdev,
> +					  stats[FLOW_OFFLOAD_DIR_REPLY].pkts,
> +					  stats[FLOW_OFFLOAD_DIR_REPLY].bytes);
> +	}
> +	rcu_read_unlock();
>  }
>  
>  static void flow_offload_tuple_stats(struct flow_offload_work *offload,
> @@ -968,6 +1035,25 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
>  				       FLOW_OFFLOAD_DIR_REPLY,
>  				       stats[1].pkts, stats[1].bytes);
>  	}
> +
> +	flow_offload_netdev_update(offload, stats);
> +}
> +
> +static void flow_offload_work_del(struct flow_offload_work *offload)
> +{
> +	struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {};
> +
> +	flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
> +	if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
> +		flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY,
> +					 &stats[1]);
> +	flow_offload_netdev_update(offload, stats);
> +
> +	clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
> +	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
> +	if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
> +		flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
> +	set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
>  }
>  
>  static void flow_offload_work_handler(struct work_struct *work)
> -- 
> 2.43.0
> 

  reply	other threads:[~2026-03-24 21:28 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-24 20:40 [PATCH nf-next v2 0/2] Update (DSA) netdev stats with offloaded flows Ahmed Zaki
2026-03-24 20:40 ` [PATCH nf-next v2 1/2] netfilter: flowtable: update netdev stats with HW_OFFLOAD flows Ahmed Zaki
2026-03-24 21:28   ` Pablo Neira Ayuso [this message]
2026-03-24 23:27     ` Ahmed Zaki
2026-03-24 20:40 ` [PATCH nf-next v2 2/2] net: dsa: update net_device stats with HW offloaded flows stats Ahmed Zaki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=acMB9xSTEmwly8QK@chamomile \
    --to=pablo@netfilter.org \
    --cc=andrew@lunn.ch \
    --cc=anzaki@gmail.com \
    --cc=coreteam@netfilter.org \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=olteanv@gmail.com \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox