All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: Tom Herbert <therbert@google.com>
Cc: David Miller <davem@davemloft.net>,
	Linux Netdev List <netdev@vger.kernel.org>,
	Andi Kleen <andi@firstfloor.org>
Subject: Re: [PATCH v4 1/1] rps: core implementation
Date: Sat, 21 Nov 2009 09:07:59 +0100	[thread overview]
Message-ID: <4B079FDF.9040809@gmail.com> (raw)
In-Reply-To: <65634d660911201528k5a07135el471b65fff9dd7c9d@mail.gmail.com>

Tom Herbert a écrit :
> Version 4 of RPS patch.  I think this addresses most of the comments
> on the previous versions.  Thanks for all the insightful comments and
> patience!
> 
> Changes from previous version:
> 
> - Added rxhash to kernel-doc for struct sk_buff
> - Removed /** on comments not for kernel-doc
> - Change get_token declaration to kernel style
> - Added struct dev_rps_maps.  Each netdevice now has a pointer to this
> structure which contains the array of per NAPI rps maps, the number of
> this maps.  rcu is used to protect pointer
> - In store_rps_cpus a new map set is allocated each call.
> - Removed dev_isalive check and other locks since rps struct in
> netdevice are protected by rcu
> - Removed NET_RPS_SOFTIRQ and call net_rps_action from net_rx_action instead
> - Queue to remote backlog queues only in NAPI case.  This means
> rps_remote_softirq_cpus does not need to be accessed with interrupts
> disabled and __smp_call_function_single will not be called with
> interrupts disabled
> - Limit the number of entries in an rps map to min(256, num_possible_cpus())
> - Removed unnecessary irq_local_disable
> - Renamed skb_tx_hashrnd to just hashrnd and use that for the rps hash as well
> - Make index u16 in index=skb_get_rx_queue() and don't check index<0 now
> - Replace spin_lock_irqsave with simpler spin_lock_irq in process_backlog

Excellent !

>   *	The DEVICE structure.
>   *	Actually, this whole structure is a big mistake.  It mixes I/O
>   *	data with strictly "high-level" data, and it has to know about
> @@ -861,6 +884,9 @@ struct net_device {
> 
>  	struct netdev_queue	rx_queue;
> 
> +	struct dev_rps_maps	*dev_rps_maps;	/* Per-NAPI maps for
> +						   receive packet steeing */
> +
>  	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
> 
>  	/* Number of TX queues allocated at alloc_netdev_mq() time  */
> @@ -1276,6 +1302,9 @@ struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct sk_buff_head	input_pkt_queue;
>  	struct list_head	poll_list;
> +
> +	struct call_single_data	csd;

This is problematic. softnet_data used to be only used by local cpu.
With RPS, other cpus are going to access csd, input_pkt_queue, backlog
and dirty cache lines.

Maybe we should split sofnet_data in two cache lines, one private,
one chared, and 
DEFINE_PER_CPU(struct softnet_data, softnet_data);
-> 
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

Also you need to change comment at start of struct softnet_data,
since it is wrong after RPS.

/*
 * Incoming packets are placed on per-cpu queues so that
 * no locking is needed.
 */

> +
>  	struct sk_buff		*completion_queue;
> 
>  	struct napi_struct	backlog;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 63f4742..f188301 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -267,6 +267,7 @@ typedef unsigned char *sk_buff_data_t;
>   *	@mac_header: Link layer header
>   *	@_skb_dst: destination entry
>   *	@sp: the security path, used for xfrm
> + *	@rxhash: the packet hash computed on receive
>   *	@cb: Control buffer. Free for use by every layer. Put private vars here
>   *	@len: Length of actual data
>   *	@data_len: Data length
> @@ -323,6 +324,8 @@ struct sk_buff {
>  #ifdef CONFIG_XFRM
>  	struct	sec_path	*sp;
>  #endif
> +	__u32			rxhash;
> +
>  	/*
>  	 * This is the control buffer. It is free to use for every
>  	 * layer. Please put your private variables there. If you
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 9977288..f2c199c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1834,7 +1834,7 @@ out_kfree_skb:
>  	return rc;
>  }
> 
> -static u32 skb_tx_hashrnd;
> +static u32 hashrnd;
adding a __read_mostly here could help :)

> 
>  u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
>  {
> @@ -1852,7 +1852,7 @@ u16 skb_tx_hash(const struct net_device *dev,
> const struct sk_buff *skb)
>  	else
>  		hash = skb->protocol;
> 
> -	hash = jhash_1word(hash, skb_tx_hashrnd);
> +	hash = jhash_1word(hash, hashrnd);
> 
>  	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
>  }
> @@ -2073,6 +2073,146 @@ int weight_p __read_mostly = 64;            /*
> old backlog weight */
> 
>  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
> 
> +/*
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu = -1;
> +	struct dev_rps_maps *drmap;
> +	struct rps_map *map = NULL;
> +	u16 index;
> +

> +	rcu_read_lock();
If called from netif_receive_skb(), we already are in a rcu protected section,
but this could be a cleanup patch, because many other parts in stack could
be changed as well.


> +/*
> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> + * queue (may be a remote CPU queue).
> + */
> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> +{
> +	struct softnet_data *queue;
> +	unsigned long flags;
> +
> +	queue = &per_cpu(softnet_data, cpu);
> +


> +	local_irq_save(flags);
> +	__get_cpu_var(netdev_rx_stat).total++;
> +
> +	spin_lock(&queue->input_pkt_queue.lock);

could reduce to :
	percpu_add(netdev_rx_stat.total, 1);
	spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);

> +	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
> +		if (queue->input_pkt_queue.qlen) {
> +enqueue:
> +			__skb_queue_tail(&queue->input_pkt_queue, skb);
> +			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> +			    flags);
> +			return NET_RX_SUCCESS;
> +		}
> +
> +		/* Schedule NAPI for backlog device */

> +		if (napi_schedule_prep(&queue->backlog)) {

  Is it legal (or wanter at all) to call napi_schedule_prep() on remote cpu backlog ?

> +			if (cpu != smp_processor_id()) {
> +				cpu_set(cpu,
> +				    get_cpu_var(rps_remote_softirq_cpus));
> +				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
> +			} else
> +				__napi_schedule(&queue->backlog);

Why not the more easy :
	if (cpu != smp_processor_id())
		cpu_set(cpu, get_cpu_var(rps_remote_softirq_cpus));
	else
		napi_schedule(&queue->backlog);

This way we dont touch to remote backlog (and this backlog could stay in the private
cache line of remote cpu)

> +		}
> +		goto enqueue;
> +	}
> +

> +	spin_unlock(&queue->input_pkt_queue.lock);
> +
> +	__get_cpu_var(netdev_rx_stat).dropped++;
> +	local_irq_restore(flags);

-> 
	spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
	percpu_add(netdev_rx_stat.dropped, 1);


+/*
+ * net_rps_action sends any pending IPI's for rps.  This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(void)
+{
+       int cpu;
+
+       /* Send pending IPI's to kick RPS processing on remote cpus. */
+       for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+               struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+               __smp_call_function_single(cpu, &queue->csd, 0);
+       }
+       cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
+}


net_rps_action() might be not very descriptive name and bit expensive...

Did you tried smp_call_function_many() ?
(I suspect you did, but found it was not that optimized ?)

CC Andi to get feedback from him :)

static void net_rps_remcpus_fire(void)
{
	smp_call_function_many(__get_cpu_var(rps_remote_softirq_cpus),
			       trigger_softirq, NULL, 0);
}

Of course you would have to use following code as well :
(eg ignore void *data argument)

static void trigger_softirq(void *data)
{
/* kind of :
 * __napi_schedule(__get_cpu_var(softnet_data).backlog);
 * without local_irq_save(flags);/local_irq_restore(flags);
 */
	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

Thanks Herbert


  parent reply	other threads:[~2009-11-21  8:08 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-11-20 23:28 [PATCH v4 1/1] rps: core implementation Tom Herbert
2009-11-20 23:39 ` David Miller
2009-11-20 23:50   ` Tom Herbert
2009-11-21  0:05     ` David Miller
2009-11-21  0:12       ` Tom Herbert
2009-11-21  0:40         ` Jarek Poplawski
2009-11-20 23:40 ` Stephen Hemminger
2009-11-20 23:53   ` Tom Herbert
2009-11-20 23:56   ` David Miller
2009-12-17 21:04   ` Tom Herbert
2010-01-06  1:32     ` Tom Herbert
2010-01-06  5:54       ` Eric Dumazet
2010-01-06  7:56         ` Tom Herbert
2010-01-06 18:38         ` Eric Dumazet
2010-01-06 21:10           ` [BUG net-next-2.6] Had to revert bonding: allow arp_ip_targets on separate vlans to use arp validation Eric Dumazet
2010-01-06 21:28             ` Jay Vosburgh
2010-01-06 21:34               ` Eric Dumazet
2010-01-06 21:38             ` David Miller
2010-01-06 21:45               ` Andy Gospodarek
2010-01-06 22:56             ` [PATCH net-next-2.6] fix " Andy Gospodarek
2010-01-06 23:53               ` Jay Vosburgh
2010-01-07  8:37                 ` Eric Dumazet
2010-01-07  8:41                   ` David Miller
2010-01-06 22:54           ` [PATCH v4 1/1] rps: core implementation Tom Herbert
2010-01-07  9:15             ` Eric Dumazet
2010-01-07 17:42               ` rps: some comments Eric Dumazet
2010-01-08  0:07                 ` Tom Herbert
2010-01-08  6:27                   ` Eric Dumazet
2010-01-11  6:25               ` [PATCH v4 1/1] rps: core implementation Tom Herbert
2010-01-11  9:00                 ` Eric Dumazet
2010-01-14  4:40                   ` David Miller
2009-11-20 23:42 ` Stephen Hemminger
2009-11-21  0:04   ` Tom Herbert
2009-11-21  8:07 ` Eric Dumazet [this message]
2009-11-21  9:03   ` Tom Herbert
2009-11-21  9:31     ` Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4B079FDF.9040809@gmail.com \
    --to=eric.dumazet@gmail.com \
    --cc=andi@firstfloor.org \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=therbert@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.