* [PATCH v2] Receive Packet Steering
@ 2009-05-04 4:03 Tom Herbert
2009-05-04 5:30 ` Eric Dumazet
` (5 more replies)
0 siblings, 6 replies; 23+ messages in thread
From: Tom Herbert @ 2009-05-04 4:03 UTC (permalink / raw)
To: netdev, David Miller
This is an update of the receive packet steering patch (RPS) based on received
comments (thanks for all the comments). Improvements are:
1) Removed config option for the feature.
2) Made scheduling of backlog NAPI devices between CPUs lockless and much
simpler.
3) Added a new softirq to defer sending IPIs for coalescing.
4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
hash to index.
5) If no cpu is found for packet steering, then netif_receive_skb processes
the packet inline as before without queueing. In particular, if RPS is not
configured on a device, the receive path is unchanged from the current one for
NAPI devices (one additional conditional).
Tom
Signed-off-by: Tom Herbert <therbert@google.com>
---
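Usage sketch (not part of the patch): the soft_rps_cpus sysfs attribute added
in net-sysfs.c below takes a hex cpumask, so steering can be turned on from
userspace with a single write. The device name (eth0) and the mask value here
are only illustrative; this is just the programmatic equivalent of echoing a
mask into the file.
/* Illustrative only: enable RPS for eth0 on CPUs 0-3 via the new attribute. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void)
{
	const char *path = "/sys/class/net/eth0/soft_rps_cpus";
	const char *mask = "f\n";		/* hex cpumask: CPUs 0-3 */
	int fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0)
		perror("write");
	close(fd);
	return 0;
}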
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91658d0..32ec04b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -260,6 +260,7 @@ enum
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
+ NET_RPS_SOFTIRQ,
BLOCK_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be3ebd7..f94ee96 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -758,6 +758,8 @@ struct net_device
void *ax25_ptr; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
assign before registering */
+ u16 *rps_map;
+ int rps_map_len;
/*
* Cache line mostly used on receive path (including eth_type_trans())
@@ -1170,6 +1172,9 @@ struct softnet_data
struct Qdisc *output_queue;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
+
+ struct call_single_data csd;
+
struct sk_buff *completion_queue;
struct napi_struct backlog;
diff --git a/net/core/dev.c b/net/core/dev.c
index 052dd47..3107544 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1906,6 +1906,142 @@ int weight_p __read_mostly = 64; /* old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+static u32 simple_hashrnd;
+static int simple_hashrnd_initialized;
+
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2, ports;
+ struct ipv6hdr *ip6;
+ struct iphdr *ip;
+ u32 hash, ihl;
+ u8 ip_proto;
+ int cpu;
+
+ if (!dev->rps_map_len)
+ return -1;
+
+ if (unlikely(!simple_hashrnd_initialized)) {
+ get_random_bytes(&simple_hashrnd, 4);
+ simple_hashrnd_initialized = 1;
+ }
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(*ip)))
+ return -1;
+
+ ip = (struct iphdr *) skb->data;
+ ip_proto = ip->protocol;
+ addr1 = ip->saddr;
+ addr2 = ip->daddr;
+ ihl = ip->ihl;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ return -1;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
+ ihl = (40 >> 2);
+ break;
+ default:
+ return -1;
+ }
+ ports = 0;
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ if (pskb_may_pull(skb, (ihl * 4) + 4))
+ ports = *((u32 *) (skb->data + (ihl * 4)));
+ break;
+
+ default:
+ break;
+ }
+
+ hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
+
+ cpu = skb->dev->rps_map[((u64) hash * dev->rps_map_len) >> 32];
+ return cpu_online(cpu) ? cpu : -1;
+}
+
+static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+ struct softnet_data *queue = data;
+ __napi_schedule(&queue->backlog);
+}
+
+/**
+ * net_rps_action is called from NET_RPS_SOFTIRQ to do IPIs to schedule RX
+ * softirq on remote CPUs. Called in a separate softirq to allow for
+ * coalescing.
+ */
+static void net_rps_action(struct softirq_action *h)
+{
+ int cpu;
+
+ local_irq_disable();
+
+ for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+ __smp_call_function_single(cpu, &queue->csd);
+ }
+ cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
+
+ local_irq_enable();
+}
+
+/**
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+ struct softnet_data *queue;
+ unsigned long flags;
+
+ queue = &per_cpu(softnet_data, cpu);
+ spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
+
+ __get_cpu_var(netdev_rx_stat).total++;
+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+ if (queue->input_pkt_queue.qlen) {
+enqueue:
+ __skb_queue_tail(&queue->input_pkt_queue, skb);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ return NET_RX_SUCCESS;
+ }
+
+ /* Schedule NAPI for backlog device */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != smp_processor_id()) {
+ cpu_set(cpu,
+ get_cpu_var(rps_remote_softirq_cpus));
+ __raise_softirq_irqoff(NET_RPS_SOFTIRQ);
+ } else
+ __napi_schedule(&queue->backlog);
+ }
+ goto enqueue;
+ }
+
+ __get_cpu_var(netdev_rx_stat).dropped++;
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
+
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
/**
* netif_rx - post buffer to the network code
@@ -1924,8 +2060,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
int netif_rx(struct sk_buff *skb)
{
- struct softnet_data *queue;
- unsigned long flags;
+ int cpu;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -1934,31 +2069,11 @@ int netif_rx(struct sk_buff *skb)
if (!skb->tstamp.tv64)
net_timestamp(skb);
- /*
- * The code is rearranged so that the path is the most
- * short when CPU is congested, but is still operating.
- */
- local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
+ cpu = get_rps_cpu(skb->dev, skb);
+ if (cpu < 0)
+ cpu = smp_processor_id();
- __get_cpu_var(netdev_rx_stat).total++;
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
-enqueue:
- __skb_queue_tail(&queue->input_pkt_queue, skb);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- napi_schedule(&queue->backlog);
- goto enqueue;
- }
-
- __get_cpu_var(netdev_rx_stat).dropped++;
- local_irq_restore(flags);
-
- kfree_skb(skb);
- return NET_RX_DROP;
+ return enqueue_to_backlog(skb, cpu);
}
int netif_rx_ni(struct sk_buff *skb)
@@ -2192,10 +2307,10 @@ void netif_nit_deliver(struct sk_buff *skb)
}
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
- * netif_receive_skb() is the main receive data processing function.
+ * __netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
@@ -2206,7 +2321,7 @@ void netif_nit_deliver(struct sk_buff *skb)
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2305,6 +2420,16 @@ out:
return ret;
}
+int netif_receive_skb(struct sk_buff *skb)
+{
+ int cpu = get_rps_cpu(skb->dev, skb);
+
+ if (cpu < 0)
+ return __netif_receive_skb(skb);
+ else
+ return enqueue_to_backlog(skb, cpu);
+}
+
/* Network device is going away, flush any packets still pending */
static void flush_backlog(void *arg)
{
@@ -2347,7 +2472,7 @@ static int napi_gro_complete(struct sk_buff *skb)
out:
skb_shinfo(skb)->gso_size = 0;
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
}
void napi_gro_flush(struct napi_struct *napi)
@@ -2484,7 +2609,7 @@ int napi_skb_finish(int ret, struct sk_buff *skb)
switch (ret) {
case GRO_NORMAL:
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
case GRO_DROP:
err = NET_RX_DROP;
@@ -2585,7 +2710,7 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
skb->protocol = eth_type_trans(skb, napi->dev);
if (ret == GRO_NORMAL)
- return netif_receive_skb(skb);
+ return __netif_receive_skb(skb);
skb_gro_pull(skb, -ETH_HLEN);
break;
@@ -2619,19 +2744,26 @@ static int process_backlog(struct napi_struct *napi, int quota)
int work = 0;
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
+ unsigned long flags;
napi->weight = weight_p;
do {
struct sk_buff *skb;
- local_irq_disable();
+ spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
- local_irq_enable();
- napi_complete(napi);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
+ napi_gro_flush(napi);
+ spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
+ if (skb_queue_empty(&queue->input_pkt_queue))
+ __napi_complete(napi);
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+ flags);
goto out;
}
- local_irq_enable();
+ spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
napi_gro_receive(napi, skb);
} while (++work < quota && jiffies == start_time);
@@ -4804,6 +4936,8 @@ void free_netdev(struct net_device *dev)
kfree(dev->_tx);
+ kfree(dev->rps_map);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
@@ -5239,6 +5373,10 @@ static int __init net_dev_init(void)
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+ queue->csd.func = trigger_softirq;
+ queue->csd.info = queue;
+ queue->csd.flags = 0;
+
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
queue->backlog.gro_list = NULL;
@@ -5264,6 +5402,7 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+ open_softirq(NET_RPS_SOFTIRQ, net_rps_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2da59a0..073a407 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -211,6 +211,63 @@ static ssize_t store_tx_queue_len(struct device *dev,
return netdev_store(dev, attr, buf, len, change_tx_queue_len);
}
+static ssize_t store_rps_cpus(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct net_device *net = to_net_dev(dev);
+ cpumask_t mask;
+ int err, cpu;
+ int i = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
+ if (err)
+ return err;
+
+ rtnl_lock();
+ if (dev_isalive(net)) {
+ if (!net->rps_map) {
+ net->rps_map = kzalloc(sizeof(u16) *
+ num_possible_cpus(), GFP_KERNEL);
+ if (!net->rps_map)
+ return -ENOMEM;
+ }
+ cpus_and(mask, mask, cpu_online_map);
+ for_each_cpu_mask(cpu, mask)
+ net->rps_map[i++] = cpu;
+ net->rps_map_len = i;
+ }
+ rtnl_unlock();
+
+ return len;
+}
+
+static ssize_t show_rps_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *net = to_net_dev(dev);
+ size_t len;
+ cpumask_t mask;
+ int i;
+
+ read_lock(&dev_base_lock);
+ if (dev_isalive(net))
+ for (i = 0; i < net->rps_map_len; i++)
+ cpu_set(net->rps_map[i], mask);
+ read_unlock(&dev_base_lock);
+
+ len = cpumask_scnprintf(buf, PAGE_SIZE, &mask);
+ if (PAGE_SIZE - len < 2)
+ return -EINVAL;
+
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+
static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -263,6 +320,8 @@ static struct device_attribute net_class_attributes[] = {
__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
store_tx_queue_len),
+ __ATTR(soft_rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus,
+ store_rps_cpus),
{}
};
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
@ 2009-05-04 5:30 ` Eric Dumazet
2009-05-04 6:10 ` Eric Dumazet
` (2 more replies)
2009-05-04 7:08 ` Eric Dumazet
` (4 subsequent siblings)
5 siblings, 3 replies; 23+ messages in thread
From: Eric Dumazet @ 2009-05-04 5:30 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Tom Herbert wrote:
> This is an update of the receive packet steering patch (RPS) based on received
> comments (thanks for all the comments). Improvements are:
>
> 1) Removed config option for the feature.
> 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
> simpler.
> 3) Added new softirq to do defer sending IPIs for coalescing.
> 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
> hash to index.
> 5) If no cpu is found for packet steering, then netif_receive_skb processes
> packet inline as before without queueing. In paritcular if RPS is not
> configured on a device the receive path is unchanged from current for
> NAPI devices (one additional conditional).
>
> Tom
Seems cool, but I found two errors this morning before my coffee ;)
Is it a working patch or an RFC?
It's also not clear from the ChangeLog how this is working, and even
after reading your patch, it's not yet very clear. Please provide
more documentation, on every submission.
What about latencies? I really do think that if the cpu handling the
device is lightly loaded, it should handle the packet itself, without
giving it to another cpu and incurring many cache line bounces.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 91658d0..32ec04b 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -260,6 +260,7 @@ enum
> TIMER_SOFTIRQ,
> NET_TX_SOFTIRQ,
> NET_RX_SOFTIRQ,
> + NET_RPS_SOFTIRQ,
> BLOCK_SOFTIRQ,
> TASKLET_SOFTIRQ,
> SCHED_SOFTIRQ,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index be3ebd7..f94ee96 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -758,6 +758,8 @@ struct net_device
> void *ax25_ptr; /* AX.25 specific data */
> struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
> assign before registering */
> + u16 *rps_map;
> + int rps_map_len;
>
> /*
> * Cache line mostly used on receive path (including eth_type_trans())
> @@ -1170,6 +1172,9 @@ struct softnet_data
> struct Qdisc *output_queue;
> struct sk_buff_head input_pkt_queue;
> struct list_head poll_list;
> +
> + struct call_single_data csd;
> +
> struct sk_buff *completion_queue;
>
> struct napi_struct backlog;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 052dd47..3107544 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1906,6 +1906,142 @@ int weight_p __read_mostly = 64; /*
> old backlog weight */
>
> DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>
> +static u32 simple_hashrnd;
> +static int simple_hashrnd_initialized;
> +
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> + u32 addr1, addr2, ports;
> + struct ipv6hdr *ip6;
> + struct iphdr *ip;
> + u32 hash, ihl;
> + u8 ip_proto;
> + int cpu;
> +
> + if (!dev->rps_map_len)
> + return -1;
> +
> + if (unlikely(!simple_hashrnd_initialized)) {
> + get_random_bytes(&simple_hashrnd, 4);
> + simple_hashrnd_initialized = 1;
> + }
> +
> + switch (skb->protocol) {
> + case __constant_htons(ETH_P_IP):
> + if (!pskb_may_pull(skb, sizeof(*ip)))
> + return -1;
> +
> + ip = (struct iphdr *) skb->data;
> + ip_proto = ip->protocol;
> + addr1 = ip->saddr;
> + addr2 = ip->daddr;
> + ihl = ip->ihl;
> + break;
> + case __constant_htons(ETH_P_IPV6):
> + if (!pskb_may_pull(skb, sizeof(*ip6)))
> + return -1;
> +
> + ip6 = (struct ipv6hdr *) skb->data;
> + ip_proto = ip6->nexthdr;
> + addr1 = ip6->saddr.s6_addr32[3];
> + addr2 = ip6->daddr.s6_addr32[3];
> + ihl = (40 >> 2);
> + break;
> + default:
> + return -1;
> + }
> + ports = 0;
> + switch (ip_proto) {
> + case IPPROTO_TCP:
> + case IPPROTO_UDP:
> + case IPPROTO_DCCP:
> + case IPPROTO_ESP:
> + case IPPROTO_AH:
> + case IPPROTO_SCTP:
> + case IPPROTO_UDPLITE:
> + if (pskb_may_pull(skb, (ihl * 4) + 4))
> + ports = *((u32 *) (skb->data + (ihl * 4)));
> + break;
> +
> + default:
> + break;
> + }
> +
> + hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
> +
> + cpu = skb->dev->rps_map[((u64) hash * dev->rps_map_len) >> 32];
> + return cpu_online(cpu) ? cpu : -1;
> +}
> +
> +static DEFINE_PER_CPU(cpumask_t, rps_remote_softirq_cpus);
> +
> +/* Called from hardirq (IPI) context */
> +static void trigger_softirq(void *data)
> +{
> + struct softnet_data *queue = data;
> + __napi_schedule(&queue->backlog);
> +}
> +
> +/**
> + * net_rps_action is called from NET_RPS_SOFTIRQ to do IPIs to schedule RX
> + * softirq on remote CPUs. Called in a separate softirq to allow for
> + * coalescing.
> + */
> +static void net_rps_action(struct softirq_action *h)
> +{
> + int cpu;
> +
> + local_irq_disable();
> +
> + for_each_cpu_mask_nr(cpu, __get_cpu_var(rps_remote_softirq_cpus)) {
> + struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> + __smp_call_function_single(cpu, &queue->csd);
> + }
> + cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
> +
> + local_irq_enable();
> +}
> +
> +/**
> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> + * queue (may be a remote CPU queue).
> + */
> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> +{
> + struct softnet_data *queue;
> + unsigned long flags;
> +
> + queue = &per_cpu(softnet_data, cpu);
> + spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
I wonder... isn't it going to really hurt with cache line ping-pongs?
> +
> + __get_cpu_var(netdev_rx_stat).total++;
> + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
> + if (queue->input_pkt_queue.qlen) {
> +enqueue:
> + __skb_queue_tail(&queue->input_pkt_queue, skb);
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> + flags);
> + return NET_RX_SUCCESS;
> + }
> +
> + /* Schedule NAPI for backlog device */
> + if (napi_schedule_prep(&queue->backlog)) {
> + if (cpu != smp_processor_id()) {
> + cpu_set(cpu,
> + get_cpu_var(rps_remote_softirq_cpus));
get_cpu_var() increases preempt_count (preempt_disable), where is the opposite decrease ?
> + __raise_softirq_irqoff(NET_RPS_SOFTIRQ);
> + } else
> + __napi_schedule(&queue->backlog);
> + }
> + goto enqueue;
> + }
> +
> + __get_cpu_var(netdev_rx_stat).dropped++;
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
> +
> + kfree_skb(skb);
> + return NET_RX_DROP;
> +}
>
> /**
> * netif_rx - post buffer to the network code
> @@ -1924,8 +2060,7 @@ DEFINE_PER_CPU(struct netif_rx_stats,
> netdev_rx_stat) = { 0, };
>
> int netif_rx(struct sk_buff *skb)
> {
> - struct softnet_data *queue;
> - unsigned long flags;
> + int cpu;
>
> /* if netpoll wants it, pretend we never saw it */
> if (netpoll_rx(skb))
> @@ -1934,31 +2069,11 @@ int netif_rx(struct sk_buff *skb)
> if (!skb->tstamp.tv64)
> net_timestamp(skb);
>
> - /*
> - * The code is rearranged so that the path is the most
> - * short when CPU is congested, but is still operating.
> - */
> - local_irq_save(flags);
> - queue = &__get_cpu_var(softnet_data);
> + cpu = get_rps_cpu(skb->dev, skb);
> + if (cpu < 0)
> + cpu = smp_processor_id();
>
> - __get_cpu_var(netdev_rx_stat).total++;
> - if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
> - if (queue->input_pkt_queue.qlen) {
> -enqueue:
> - __skb_queue_tail(&queue->input_pkt_queue, skb);
> - local_irq_restore(flags);
> - return NET_RX_SUCCESS;
> - }
> -
> - napi_schedule(&queue->backlog);
> - goto enqueue;
> - }
> -
> - __get_cpu_var(netdev_rx_stat).dropped++;
> - local_irq_restore(flags);
> -
> - kfree_skb(skb);
> - return NET_RX_DROP;
> + return enqueue_to_backlog(skb, cpu);
> }
>
> int netif_rx_ni(struct sk_buff *skb)
> @@ -2192,10 +2307,10 @@ void netif_nit_deliver(struct sk_buff *skb)
> }
>
> /**
> - * netif_receive_skb - process receive buffer from network
> + * __netif_receive_skb - process receive buffer from network
> * @skb: buffer to process
> *
> - * netif_receive_skb() is the main receive data processing function.
> + * __netif_receive_skb() is the main receive data processing function.
> * It always succeeds. The buffer may be dropped during processing
> * for congestion control or by the protocol layers.
> *
> @@ -2206,7 +2321,7 @@ void netif_nit_deliver(struct sk_buff *skb)
> * NET_RX_SUCCESS: no congestion
> * NET_RX_DROP: packet was dropped
> */
> -int netif_receive_skb(struct sk_buff *skb)
> +int __netif_receive_skb(struct sk_buff *skb)
> {
> struct packet_type *ptype, *pt_prev;
> struct net_device *orig_dev;
> @@ -2305,6 +2420,16 @@ out:
> return ret;
> }
>
> +int netif_receive_skb(struct sk_buff *skb)
> +{
> + int cpu = get_rps_cpu(skb->dev, skb);
> +
> + if (cpu < 0)
> + return __netif_receive_skb(skb);
> + else
> + return enqueue_to_backlog(skb, cpu);
> +}
> +
> /* Network device is going away, flush any packets still pending */
> static void flush_backlog(void *arg)
> {
> @@ -2347,7 +2472,7 @@ static int napi_gro_complete(struct sk_buff *skb)
>
> out:
> skb_shinfo(skb)->gso_size = 0;
> - return netif_receive_skb(skb);
> + return __netif_receive_skb(skb);
> }
>
> void napi_gro_flush(struct napi_struct *napi)
> @@ -2484,7 +2609,7 @@ int napi_skb_finish(int ret, struct sk_buff *skb)
>
> switch (ret) {
> case GRO_NORMAL:
> - return netif_receive_skb(skb);
> + return __netif_receive_skb(skb);
>
> case GRO_DROP:
> err = NET_RX_DROP;
> @@ -2585,7 +2710,7 @@ int napi_frags_finish(struct napi_struct *napi,
> struct sk_buff *skb, int ret)
> skb->protocol = eth_type_trans(skb, napi->dev);
>
> if (ret == GRO_NORMAL)
> - return netif_receive_skb(skb);
> + return __netif_receive_skb(skb);
>
> skb_gro_pull(skb, -ETH_HLEN);
> break;
> @@ -2619,19 +2744,26 @@ static int process_backlog(struct napi_struct
> *napi, int quota)
> int work = 0;
> struct softnet_data *queue = &__get_cpu_var(softnet_data);
> unsigned long start_time = jiffies;
> + unsigned long flags;
>
> napi->weight = weight_p;
> do {
> struct sk_buff *skb;
>
> - local_irq_disable();
> + spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
> skb = __skb_dequeue(&queue->input_pkt_queue);
> if (!skb) {
> - local_irq_enable();
> - napi_complete(napi);
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> + flags);
> + napi_gro_flush(napi);
> + spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
> + if (skb_queue_empty(&queue->input_pkt_queue))
> + __napi_complete(napi);
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
> + flags);
> goto out;
> }
> - local_irq_enable();
> + spin_unlock_irqrestore(&queue->input_pkt_queue.lock, flags);
>
> napi_gro_receive(napi, skb);
> } while (++work < quota && jiffies == start_time);
> @@ -4804,6 +4936,8 @@ void free_netdev(struct net_device *dev)
>
> kfree(dev->_tx);
>
> + kfree(dev->rps_map);
> +
> list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
> netif_napi_del(p);
>
> @@ -5239,6 +5373,10 @@ static int __init net_dev_init(void)
> queue->completion_queue = NULL;
> INIT_LIST_HEAD(&queue->poll_list);
>
> + queue->csd.func = trigger_softirq;
> + queue->csd.info = queue;
> + queue->csd.flags = 0;
> +
> queue->backlog.poll = process_backlog;
> queue->backlog.weight = weight_p;
> queue->backlog.gro_list = NULL;
> @@ -5264,6 +5402,7 @@ static int __init net_dev_init(void)
>
> open_softirq(NET_TX_SOFTIRQ, net_tx_action);
> open_softirq(NET_RX_SOFTIRQ, net_rx_action);
> + open_softirq(NET_RPS_SOFTIRQ, net_rps_action);
>
> hotcpu_notifier(dev_cpu_callback, 0);
> dst_init();
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 2da59a0..073a407 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -211,6 +211,63 @@ static ssize_t store_tx_queue_len(struct device *dev,
> return netdev_store(dev, attr, buf, len, change_tx_queue_len);
> }
>
> +static ssize_t store_rps_cpus(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct net_device *net = to_net_dev(dev);
> + cpumask_t mask;
> + int err, cpu;
> + int i = 0;
> +
> + if (!capable(CAP_NET_ADMIN))
> + return -EPERM;
> +
> + err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
> + if (err)
> + return err;
> +
> + rtnl_lock();
> + if (dev_isalive(net)) {
> + if (!net->rps_map) {
> + net->rps_map = kzalloc(sizeof(u16) *
> + num_possible_cpus(), GFP_KERNEL);
num_possible_cpus() is not the max index of a cpu, but the number of possible cpus.
It can be, for example, 2, with cpu0 at index 0 and the second cpu at index 511.
So I believe you want nr_cpu_ids here
(num_possible_cpus() <= nr_cpu_ids), not necessarily equal.
> + if (!net->rps_map)
> + return -ENOMEM;
> + }
> + cpus_and(mask, mask, cpu_online_map);
> + for_each_cpu_mask(cpu, mask)
> + net->rps_map[i++] = cpu;
> + net->rps_map_len = i;
> + }
> + rtnl_unlock();
> +
> + return len;
> +}
> +
> +static ssize_t show_rps_cpus(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct net_device *net = to_net_dev(dev);
> + size_t len;
> + cpumask_t mask;
> + int i;
> +
> + read_lock(&dev_base_lock);
> + if (dev_isalive(net))
> + for (i = 0; i < net->rps_map_len; i++)
> + cpu_set(net->rps_map[i], mask);
> + read_unlock(&dev_base_lock);
> +
> + len = cpumask_scnprintf(buf, PAGE_SIZE, &mask);
> + if (PAGE_SIZE - len < 2)
> + return -EINVAL;
> +
> + len += sprintf(buf + len, "\n");
> + return len;
> +}
> +
> static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
> const char *buf, size_t len)
> {
> @@ -263,6 +320,8 @@ static struct device_attribute net_class_attributes[] = {
> __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
> __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
> store_tx_queue_len),
> + __ATTR(soft_rps_cpus, S_IRUGO | S_IWUSR, show_rps_cpus,
> + store_rps_cpus),
> {}
> };
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 5:30 ` Eric Dumazet
@ 2009-05-04 6:10 ` Eric Dumazet
2009-05-04 7:13 ` Eric Dumazet
2009-05-12 17:28 ` Tom Herbert
2 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2009-05-04 6:10 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Eric Dumazet wrote:
> Tom Herbert wrote:
>> This is an update of the receive packet steering patch (RPS) based on received
>> comments (thanks for all the comments). Improvements are:
>>
>> 1) Removed config option for the feature.
>> 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
>> simpler.
>> 3) Added new softirq to do defer sending IPIs for coalescing.
>> 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
>> hash to index.
>> 5) If no cpu is found for packet steering, then netif_receive_skb processes
>> packet inline as before without queueing. In paritcular if RPS is not
>> configured on a device the receive path is unchanged from current for
>> NAPI devices (one additional conditional).
>>
>> Tom
>
another error :
>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>> index 2da59a0..073a407 100644
>> --- a/net/core/net-sysfs.c
>> +++ b/net/core/net-sysfs.c
>> @@ -211,6 +211,63 @@ static ssize_t store_tx_queue_len(struct device *dev,
>> return netdev_store(dev, attr, buf, len, change_tx_queue_len);
>> }
>>
>> +static ssize_t store_rps_cpus(struct device *dev,
>> + struct device_attribute *attr,
>> + const char *buf, size_t len)
>> +{
>> + struct net_device *net = to_net_dev(dev);
>> + cpumask_t mask;
>> + int err, cpu;
>> + int i = 0;
>> +
>> + if (!capable(CAP_NET_ADMIN))
>> + return -EPERM;
>> +
>> + err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
>> + if (err)
>> + return err;
>> +
>> + rtnl_lock();
>> + if (dev_isalive(net)) {
>> + if (!net->rps_map) {
>> + net->rps_map = kzalloc(sizeof(u16) *
>> + num_possible_cpus(), GFP_KERNEL);
>
> num_possible_cpus() is not the max index of a cpu, but the number of possible cpus.
> it can be for example 2, but cpu0 being index 0, and second cpu at index 511
>
> So I believe you want nr_cpu_ids here
>
> (num_possible_cpus() <= nr_cpu_ids), not necessarly equal.
>
>
>> + if (!net->rps_map)
lacks an rtnl_unlock() here, before the return
>> + return -ENOMEM;
>> + }
>> + cpus_and(mask, mask, cpu_online_map);
>> + for_each_cpu_mask(cpu, mask)
>> + net->rps_map[i++] = cpu;
>> + net->rps_map_len = i;
>> + }
>> + rtnl_unlock();
>> +
>> + return len;
>> +}
>> +
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
2009-05-04 5:30 ` Eric Dumazet
@ 2009-05-04 7:08 ` Eric Dumazet
2009-05-04 7:59 ` Andi Kleen
` (3 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2009-05-04 7:08 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Tom Herbert wrote:
> This is an update of the receive packet steering patch (RPS) based on received
> comments (thanks for all the comments). Improvements are:
>
> 1) Removed config option for the feature.
> 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
> simpler.
> 3) Added new softirq to do defer sending IPIs for coalescing.
> 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
> hash to index.
> 5) If no cpu is found for packet steering, then netif_receive_skb processes
> packet inline as before without queueing. In paritcular if RPS is not
> configured on a device the receive path is unchanged from current for
> NAPI devices (one additional conditional).
>
Thinking again about this Tom, I also think something might/should be done
for the TX completion path. As spotted by earlier patches in recent weeks,
TX completion on TCP/UDP can do a lot of stuff, including wakeups (which
can be really expensive) and touching many socket cache lines.
Each cpu can call device transmit on its own, but the TX completion path is taken
by one cpu only, and this can use all of that cpu's cycles.
(It would make sense for skb freeing to be done by the same cpu that did the alloc,
especially on NUMA setups.)
As this TX completion path should use the same hash as the one selected for RX (as
done in your patch), maybe this is something you could add, if we know the
current cpu is saturated by softirq handling.
One comment about simple_hashrnd check/initialization in get_rps_cpu()
+ if (unlikely(!simple_hashrnd_initialized)) {
+ get_random_bytes(&simple_hashrnd, 4);
+ simple_hashrnd_initialized = 1;
+ }
+
You could move it to rps_map_len/rps_map initialization in store_rps_cpus()
to save a few cycles per packet.
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 5:30 ` Eric Dumazet
2009-05-04 6:10 ` Eric Dumazet
@ 2009-05-04 7:13 ` Eric Dumazet
2009-05-12 17:28 ` Tom Herbert
2 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2009-05-04 7:13 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Eric Dumazet wrote:
> Tom Herbert wrote:
>> +static ssize_t store_rps_cpus(struct device *dev,
>> + struct device_attribute *attr,
>> + const char *buf, size_t len)
>> +{
>> + struct net_device *net = to_net_dev(dev);
>> + cpumask_t mask;
>> + int err, cpu;
>> + int i = 0;
>> +
>> + if (!capable(CAP_NET_ADMIN))
>> + return -EPERM;
>> +
>> + err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
>> + if (err)
>> + return err;
>> +
>> + rtnl_lock();
>> + if (dev_isalive(net)) {
>> + if (!net->rps_map) {
>> + net->rps_map = kzalloc(sizeof(u16) *
>> + num_possible_cpus(), GFP_KERNEL);
>
> num_possible_cpus() is not the max index of a cpu, but the number of possible cpus.
> it can be for example 2, but cpu0 being index 0, and second cpu at index 511
>
> So I believe you want nr_cpu_ids here
>
> (num_possible_cpus() <= nr_cpu_ids), not necessarly equal.
>
>
>> + if (!net->rps_map)
>> + return -ENOMEM;
>> + }
>> + cpus_and(mask, mask, cpu_online_map);
>> + for_each_cpu_mask(cpu, mask)
>> + net->rps_map[i++] = cpu;
>> + net->rps_map_len = i;
>> + }
>> + rtnl_unlock();
>> +
>> + return len;
I misread the patch and my previous remark was wrong: as you put into rps_map
all possible cpu indexes, there is no hole in the array, so its size
is num_possible_cpus() indeed. Sorry for the false alarm.
Now I wonder how this can handle cpu hotplug; I see nothing in your patch to transfer
packet ownership or rebuild rps_map[] in case a cpu is offlined...
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
2009-05-04 5:30 ` Eric Dumazet
2009-05-04 7:08 ` Eric Dumazet
@ 2009-05-04 7:59 ` Andi Kleen
2009-05-04 18:22 ` Jarek Poplawski
` (2 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2009-05-04 7:59 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Tom Herbert <therbert@google.com> writes:
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 052dd47..3107544 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1906,6 +1906,142 @@ int weight_p __read_mostly = 64; /*
> old backlog weight */
>
> DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>
> +static u32 simple_hashrnd;
This should be __read_mostly
> +static int simple_hashrnd_initialized;
Also I suspect you can just use 0 as uninitialized
and avoid one variable. If the RNG reports 0 you get
worst case another initialization.
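A tiny userspace sketch of that, purely illustrative (rand() standing in for
get_random_bytes()): a zero seed doubles as the "not yet seeded" marker, so
the separate flag goes away.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
static uint32_t simple_hashrnd;			/* 0 == not yet seeded */
static uint32_t get_hashrnd(void)
{
	if (simple_hashrnd == 0)	/* a 0 result just means we seed again later */
		simple_hashrnd = (uint32_t)rand();
	return simple_hashrnd;
}
int main(void)
{
	srand(1);
	printf("seed = %#x\n", (unsigned)get_hashrnd());
	printf("seed = %#x\n", (unsigned)get_hashrnd());	/* cached, no reseed */
	return 0;
}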
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> + u32 addr1, addr2, ports;
> + struct ipv6hdr *ip6;
> + struct iphdr *ip;
> + u32 hash, ihl;
> + u8 ip_proto;
> + int cpu;
> +
> + if (!dev->rps_map_len)
> + return -1;
> +
> + if (unlikely(!simple_hashrnd_initialized)) {
> + get_random_bytes(&simple_hashrnd, 4);
> + simple_hashrnd_initialized = 1;
> + }
The usual problem with this is that if the kernel gets a packet sufficiently
early the random state will always be the default state, which is not
very random.
So either you use a timeout to reinit regularly (like other subsystems
do) or just use a fixed value for reproducibility. I suspect the
latter would be nicer for benchmarkers.
+ case __constant_htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(*ip6)))
+ return -1;
+
+ ip6 = (struct ipv6hdr *) skb->data;
+ ip_proto = ip6->nexthdr;
+ addr1 = ip6->saddr.s6_addr32[3];
+ addr2 = ip6->daddr.s6_addr32[3];
Wouldn't it be better to hash in the entire ipv6 address in this case?
> +
> + hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
> +
> + cpu = skb->dev->rps_map[((u64) hash * dev->rps_map_len) >> 32];
For 32bit systems it would be nice to avoid the u64 cast. gcc doesn't
generate very good code for that.
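For reference, a self-contained userspace sketch of the mapping in question
(arbitrary hash values, not kernel code): keeping the upper 32 bits of
hash * map_len picks an index in [0, map_len) from the hash's high-order
bits, which is how the patch avoids a modulo at the price of the 32x32->64
multiply mentioned above.
#include <stdint.h>
#include <stdio.h>
static unsigned int scale_index(uint32_t hash, unsigned int map_len)
{
	/* high half of hash * map_len == floor((hash / 2^32) * map_len) */
	return (unsigned int)(((uint64_t)hash * map_len) >> 32);
}
int main(void)
{
	uint32_t hashes[] = { 0x00000000u, 0x40000000u, 0xdeadbeefu, 0xffffffffu };
	unsigned int i, map_len = 6;
	for (i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash %#010x -> index %u (hash %% map_len gives %u)\n",
		       (unsigned)hashes[i], scale_index(hashes[i], map_len),
		       (unsigned)(hashes[i] % map_len));
	return 0;
}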
> + return cpu_online(cpu) ? cpu : -1;
I suspect this is still racy with cpu hotunplug.
> + cpus_clear(__get_cpu_var(rps_remote_softirq_cpus));
> +
> + local_irq_enable();
> +}
> +
> +/**
> + * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> + * queue (may be a remote CPU queue).
> + */
> +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> +{
> + struct softnet_data *queue;
> + unsigned long flags;
> +
> + queue = &per_cpu(softnet_data, cpu);
Are you sure preemption is disabled here? Otherwise this must be
one line below (can be tested by enabling preempt & preempt debug)
> +
> + if (!capable(CAP_NET_ADMIN))
> + return -EPERM;
> +
> + err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
> + if (err)
> + return err;
> +
> + rtnl_lock();
> + if (dev_isalive(net)) {
> + if (!net->rps_map) {
> + net->rps_map = kzalloc(sizeof(u16) *
> + num_possible_cpus(), GFP_KERNEL);
> + if (!net->rps_map)
> + return -ENOMEM;
You don't unlock rtnl_lock here.
-Andi
--
ak@linux.intel.com -- Speaking for myself only.
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
` (2 preceding siblings ...)
2009-05-04 7:59 ` Andi Kleen
@ 2009-05-04 18:22 ` Jarek Poplawski
2009-05-04 20:43 ` Jarek Poplawski
2009-06-10 8:23 ` David Miller
2009-07-13 17:49 ` David Miller
5 siblings, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2009-05-04 18:22 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Tom Herbert wrote, On 05/04/2009 06:03 AM:
> This is an update of the receive packet steering patch (RPS) based on received
> comments (thanks for all the comments). Improvements are:
...
> +/* Called from hardirq (IPI) context */
> +static void trigger_softirq(void *data)
> +{
> + struct softnet_data *queue = data;
> + __napi_schedule(&queue->backlog);
> +}
Isn't full napi_schedule() needed here?
Jarek P.
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 18:22 ` Jarek Poplawski
@ 2009-05-04 20:43 ` Jarek Poplawski
0 siblings, 0 replies; 23+ messages in thread
From: Jarek Poplawski @ 2009-05-04 20:43 UTC (permalink / raw)
To: Tom Herbert; +Cc: netdev, David Miller
Jarek Poplawski wrote, On 05/04/2009 08:22 PM:
> Tom Herbert wrote, On 05/04/2009 06:03 AM:
>
>> This is an update of the receive packet steering patch (RPS) based on received
>> comments (thanks for all the comments). Improvements are:
>
> ...
>
>> +/* Called from hardirq (IPI) context */
>> +static void trigger_softirq(void *data)
>> +{
>> + struct softnet_data *queue = data;
>> + __napi_schedule(&queue->backlog);
>> +}
>
>
> Isn't full napi_schedule() needed here?
Hmm... It seems to be OK after all.
Jarek P.
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 5:30 ` Eric Dumazet
2009-05-04 6:10 ` Eric Dumazet
2009-05-04 7:13 ` Eric Dumazet
@ 2009-05-12 17:28 ` Tom Herbert
2 siblings, 0 replies; 23+ messages in thread
From: Tom Herbert @ 2009-05-12 17:28 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, David Miller
On Sun, May 3, 2009 at 10:30 PM, Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> Tom Herbert wrote:
> > This is an update of the receive packet steering patch (RPS) based on received
> > comments (thanks for all the comments). Improvements are:
> >
> > 1) Removed config option for the feature.
> > 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
> > simpler.
> > 3) Added new softirq to do defer sending IPIs for coalescing.
> > 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
> > hash to index.
> > 5) If no cpu is found for packet steering, then netif_receive_skb processes
> > packet inline as before without queueing. In paritcular if RPS is not
> > configured on a device the receive path is unchanged from current for
> > NAPI devices (one additional conditional).
> >
> > Tom
>
> Seems cool, but I found two errors this morning before my cofee ;)
>
> Is it a working patch or an RFC ?
>
The patch mostly works. It's based on code from an earlier kernel that
we've been running for more than a year.
> Its also not clear from ChangeLog how this is working, and even
> after reading your patch, its not yet very clear. Please provide
> more documentation, on every submission.
>
Okay.
> What about latencies ? I really do think that if cpu handling
> device is lightly loaded, it should handle packet itself, without
> giving it to another cpu, incurring many cache lines bounces.
>
While it's true that this scheme adds overhead for processing a single
packet at a time, we've found that by setting the per device CPU mask
to CPUs sharing the same L2/L3 cache we can reduce that overhead
substantially to the point that even for a small number of active
connections (around ten in our setup) the benefits of parallelizing
the path overcome the extra overhead resulting in lower average
latency. So this would increase latency for doing a single ping, but
even for a moderately loaded server we see latency improvements.
> > +static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
> > +{
> > + struct softnet_data *queue;
> > + unsigned long flags;
> > +
> > + queue = &per_cpu(softnet_data, cpu);
> > + spin_lock_irqsave(&queue->input_pkt_queue.lock, flags);
>
> I wonder... isnt it going to really hurt with cache line ping pongs ?
>
I suppose it is possible, although we haven't seen this pop up in
profiling. Coalescing packets before doing the IPI might be
alleviating that.
> > + /* Schedule NAPI for backlog device */
> > + if (napi_schedule_prep(&queue->backlog)) {
> > + if (cpu != smp_processor_id()) {
> > + cpu_set(cpu,
> > + get_cpu_var(rps_remote_softirq_cpus));
>
> get_cpu_var() increases preempt_count (preempt_disable), where is the opposite decrease ?
>
Right, should be __get_cpu_var.
Tom
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
` (3 preceding siblings ...)
2009-05-04 18:22 ` Jarek Poplawski
@ 2009-06-10 8:23 ` David Miller
2009-06-15 5:54 ` Tom Herbert
[not found] ` <65634d660906142252y6f7fc021l844b172995c10044@mail.gmail.com>
2009-07-13 17:49 ` David Miller
5 siblings, 2 replies; 23+ messages in thread
From: David Miller @ 2009-06-10 8:23 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Sun, 3 May 2009 21:03:01 -0700
> This is an update of the receive packet steering patch (RPS) based on received
> comments (thanks for all the comments). Improvements are:
>
> 1) Removed config option for the feature.
> 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
> simpler.
> 3) Added new softirq to do defer sending IPIs for coalescing.
> 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
> hash to index.
> 5) If no cpu is found for packet steering, then netif_receive_skb processes
> packet inline as before without queueing. In paritcular if RPS is not
> configured on a device the receive path is unchanged from current for
> NAPI devices (one additional conditional).
>
> Signed-off-by: Tom Herbert <therbert@google.com>
Just to keep this topic alive, I want to mention two things:
1) Just the other day support for the IXGBE "Flow Director" was
added to net-next-2.6, it basically does flow steering in
hardware. It remembers where the last TX for a flow was
made, and steers RX traffic there.
It's essentially a HW implementation of what we're proposing
here to do in software.
2) I'm steadily still trying to get struct sk_buff to the point
where we can replace the list handling implementation with a
standard "struct list_head" and thus union that with a
"struct call_single_data" so we can use remote cpu soft-irqs
for software packet flow steering.
* Re: [PATCH v2] Receive Packet Steering
2009-06-10 8:23 ` David Miller
@ 2009-06-15 5:54 ` Tom Herbert
[not found] ` <65634d660906142252y6f7fc021l844b172995c10044@mail.gmail.com>
1 sibling, 0 replies; 23+ messages in thread
From: Tom Herbert @ 2009-06-15 5:54 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Wed, Jun 10, 2009 at 1:23 AM, David Miller<davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Sun, 3 May 2009 21:03:01 -0700
>
>> This is an update of the receive packet steering patch (RPS) based on received
>> comments (thanks for all the comments). Improvements are:
>>
>> 1) Removed config option for the feature.
>> 2) Made scheduling of backlog NAPI devices between CPUs lockless and much
>> simpler.
>> 3) Added new softirq to do defer sending IPIs for coalescing.
>> 4) Imported hash from simple_rx_hash. Eliminates modulo operation to convert
>> hash to index.
>> 5) If no cpu is found for packet steering, then netif_receive_skb processes
>> packet inline as before without queueing. In paritcular if RPS is not
>> configured on a device the receive path is unchanged from current for
>> NAPI devices (one additional conditional).
>>
>> Signed-off-by: Tom Herbert <therbert@google.com>
>
> Just to keep this topic alive, I want to mention two things:
>
> 1) Just the other day support for the IXGBE "Flow Director" was
> added to net-next-2.6, it basically does flow steering in
> hardware. It remembers where the last TX for a flow was
> made, and steers RX traffic there.
>
> It's essentially a HW implementation of what we're proposing
> here to do in software.
>
That's very cool. Does it preserve in order delivery?
> 2) I'm steadily still trying to get struct sk_buff to the point
> where we can replace the list handling implementation with a
> standard "struct list_head" and thus union that with a
> "struct call_single_data" so we can use remote cpu soft-irqs
> for software packet flow steering.
>
I took another look at that and I have to wonder if it might be overly
complicated somehow. Seems like this use of the call_single_data
structures would essentially be creating another type of skbuff list
besides sk_buff_head (but without qlen, which I think may still be
important). I'm not sure that there's any less locking in that method
either. What is the advantage over using a shared skbuff queue and
doing a single IPI to schedule the backlog device on the remote
CPU?
* Re: [PATCH v2] Receive Packet Steering
[not found] ` <65634d660906142252y6f7fc021l844b172995c10044@mail.gmail.com>
@ 2009-06-15 9:02 ` David Miller
2009-06-15 16:39 ` Tom Herbert
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2009-06-15 9:02 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Jun 2009 22:52:13 -0700
>> Just to keep this topic alive, I want to mention two things:
>>
>> 1) Just the other day support for the IXGBE "Flow Director" was
>> added to net-next-2.6, it basically does flow steering in
>> hardware. It remembers where the last TX for a flow was
>> made, and steers RX traffic there.
>>
>
> That's very cool. Is this able to preserve in order delivery?
I don't know how the hardware works to this level of detail,
sorry. But yes, that's a very important issue.
> What is the advantage over using a shared skbuff queue and making
> doing a single IPI to schedule the backlog device on the remote CPU?
No locking. Queue is only ever accessed by the local cpu.
* Re: [PATCH v2] Receive Packet Steering
2009-06-15 9:02 ` David Miller
@ 2009-06-15 16:39 ` Tom Herbert
2009-06-15 23:18 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Tom Herbert @ 2009-06-15 16:39 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Mon, Jun 15, 2009 at 2:02 AM, David Miller<davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Sun, 14 Jun 2009 22:52:13 -0700
>
>>> Just to keep this topic alive, I want to mention two things:
>>>
>>> 1) Just the other day support for the IXGBE "Flow Director" was
>>> added to net-next-2.6, it basically does flow steering in
>>> hardware. It remembers where the last TX for a flow was
>>> made, and steers RX traffic there.
>>>
>>
>> That's very cool. Is this able to preserve in order delivery?
>
> I don't know how the hardware works to this level of detail,
> sorry. But yet that's a very important issue.
>
>> What is the advantage over using a shared skbuff queue and making
>> doing a single IPI to schedule the backlog device on the remote CPU?
>
> No locking. Queue is only ever accessed by the local cpu.
>
There's no lock to put the call_single_data structures onto remote
CPU's list? Looking at generic_exec_single...
* Re: [PATCH v2] Receive Packet Steering
2009-06-15 16:39 ` Tom Herbert
@ 2009-06-15 23:18 ` David Miller
0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2009-06-15 23:18 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Mon, 15 Jun 2009 09:39:20 -0700
> There's no lock to put the call_single_data structures onto remote
> CPU's list? Looking at generic_exec_single...
True, but hopefully that's cheaper than locking the remote
queue.
And also the latency should be much lower for actually
processing the packet.
* Re: [PATCH v2] Receive Packet Steering
2009-05-04 4:03 [PATCH v2] Receive Packet Steering Tom Herbert
` (4 preceding siblings ...)
2009-06-10 8:23 ` David Miller
@ 2009-07-13 17:49 ` David Miller
2009-07-13 22:04 ` Tom Herbert
5 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2009-07-13 17:49 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Sun, 3 May 2009 21:03:01 -0700
> @@ -758,6 +758,8 @@ struct net_device
> void *ax25_ptr; /* AX.25 specific data */
> struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
> assign before registering */
> + u16 *rps_map;
> + int rps_map_len;
>
> /*
> * Cache line mostly used on receive path (including eth_type_trans())
So essentially this table is a user-defined (via sysfs) group of cpus
among which to distribute incoming traffic for a device, right?
Why not take this to its logical end point, which is to monitor
transmits using a tiny flow lookup table, and map receives of the same
flow to the same cpu?
You can even "cheat" and not store the whole flow key in the small
lookup table, just use the resulting hash value as the key. Also,
if "best effort" is considered OK you can even do away with hash
chaining as well, further decreasing the space cost of the table.
If your goal is to steer traffic to the cpu on which the receiving
application is operating, that seems to me to be the only way to
reliably and consistently hit that target.
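To make that concrete, a rough userspace sketch of such a table (every name
and size below is invented for illustration): the slot is keyed purely by the
hash, there is no chaining, and a colliding flow simply overwrites the entry,
which is the "best effort" case described above.
#include <stdint.h>
#include <stdio.h>
#define FLOW_TABLE_SIZE 256			/* small power of two */
struct flow_entry {
	uint32_t hash;				/* hash of the last flow seen on TX */
	int cpu;				/* cpu that transmitted it */
};
static struct flow_entry flow_table[FLOW_TABLE_SIZE];
static void record_tx(uint32_t hash, int cpu)
{
	struct flow_entry *e = &flow_table[hash & (FLOW_TABLE_SIZE - 1)];
	e->hash = hash;				/* collisions just overwrite */
	e->cpu = cpu;
}
static int steer_rx(uint32_t hash, int default_cpu)
{
	struct flow_entry *e = &flow_table[hash & (FLOW_TABLE_SIZE - 1)];
	return (e->hash == hash) ? e->cpu : default_cpu;
}
int main(void)
{
	record_tx(0xdeadbeef, 3);		/* flow last transmitted from CPU 3 */
	printf("same flow on rx -> cpu %d\n", steer_rx(0xdeadbeef, 0));
	printf("unknown flow on rx -> cpu %d\n", steer_rx(0x12345678, 0));
	return 0;
}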
* Re: [PATCH v2] Receive Packet Steering
2009-07-13 17:49 ` David Miller
@ 2009-07-13 22:04 ` Tom Herbert
2009-07-14 19:33 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Tom Herbert @ 2009-07-13 22:04 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Mon, Jul 13, 2009 at 10:49 AM, David Miller <davem@davemloft.net> wrote:
>
> From: Tom Herbert <therbert@google.com>
> Date: Sun, 3 May 2009 21:03:01 -0700
>
> > @@ -758,6 +758,8 @@ struct net_device
> > void *ax25_ptr; /* AX.25 specific data */
> > struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
> > assign before registering */
> > + u16 *rps_map;
> > + int rps_map_len;
> >
> > /*
> > * Cache line mostly used on receive path (including eth_type_trans())
>
> So essentially this table is a user defined (via sysctl) group of cpus
> among which to distribute incoming traffic for a device, right?
Yes. It's a simple static table.
>
> Why not take this to it's logical end point, which is to monitor
> transmits using a tiny flow lookup table, and map receives of the same
> flow to the same cpu?
Is it better to use transmits, or to monitor where recvmsg was called?
>
> You can even "cheat" and not store the whole flow key in the small
> lookup table, just use the resulting hash value as the key. Also,
> if "best effort" is considered OK you can even do away with hash
> chaining as well, further decreasing the space cost of the table.
Right. In fact, just using the hash as the key is what you want when the
device provides the hash (i.e. Toeplitz). The caveat is that we
need to prevent OOO packets when threads migrate; I think I have a
reasonable solution for that following your earlier suggestion.
I hope to have a new patch soon for steering packets to application
CPU and using device hash, thanks for bugging me on it!
>
> If your goal is to steer traffic to the cpu on which the receiving
> application is operating, that seems to me to be the only way to
> reliably and consistently hit that target.
* Re: [PATCH v2] Receive Packet Steering
2009-07-13 22:04 ` Tom Herbert
@ 2009-07-14 19:33 ` David Miller
2009-07-14 23:28 ` Tom Herbert
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2009-07-14 19:33 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Mon, 13 Jul 2009 15:04:51 -0700
> On Mon, Jul 13, 2009 at 10:49 AM, David Miller <davem@davemloft.net> wrote:
>>
>> Why not take this to it's logical end point, which is to monitor
>> transmits using a tiny flow lookup table, and map receives of the same
>> flow to the same cpu?
>
> Is it better do use transmits, or monitor where recvmsg was called?
First of all, using recvmsg would be on the more stateful side, and
we try to avoid that. All of the logic should work if we had to
put it purely inside of dev_hard_xmit() and netif_receive_skb(),
well below where the stack itself ever gets involved.
Second, Intel has committed the "monitor the TXs to device where to
steer RX" logic into silicon with their Flow Director stuff. What
does that tell you about it's effectiveness? :-)
It seems the best heuristic to me, especially since it satisfies
many of the design constraints.
> Right. In fact, just using the hash as the key is what you want when
> device provides the hash (i.e. Toeplitz). The caveat is that we
> should to prevent OOO packets when threads migrate, I think I have a
> reasonable solution for that following your earlier suggestion.
But as you found, compared to jhash, Toeplitz is expensive to compute
and you would need to do this if implemented by monitoring transmits.
And furthermore, from the device perspective:
1) some (such as NIU) provide different hash values, not Toeplitz
2) it is expensive to extract this value (must enable different,
larger, RX descriptor modes, thus more DMA traffic, etc.)
I really don't think Toeplitz is a net win, to be honest.
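For anyone who hasn't looked at it, a self-contained sketch of the bit-serial
Toeplitz computation (key and tuple bytes below are arbitrary, nothing like
the real RSS defaults): one conditional XOR plus a key-window shift per input
bit, versus a handful of word-wide mixes for jhash.
#include <stdint.h>
#include <stdio.h>
#include <string.h>
static uint32_t toeplitz_hash(const uint8_t *key, const uint8_t *data, size_t len)
{
	uint32_t result = 0;
	/* 32-bit window into the key, advanced one bit per input bit */
	uint32_t window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
			  ((uint32_t)key[2] << 8) | (uint32_t)key[3];
	size_t i;
	int b;
	for (i = 0; i < len; i++) {
		for (b = 0; b < 8; b++) {
			if (data[i] & (0x80 >> b))
				result ^= window;
			window <<= 1;
			if (key[i + 4] & (0x80 >> b))
				window |= 1;
		}
	}
	return result;
}
int main(void)
{
	uint8_t key[40];			/* needs at least len + 4 bytes */
	/* saddr, daddr, sport, dport of an imaginary IPv4/TCP flow */
	uint8_t tuple[12] = { 10, 0, 0, 1, 10, 0, 0, 2, 0x1f, 0x90, 0xc0, 0x00 };
	memset(key, 0x6d, sizeof(key));		/* arbitrary key bytes */
	printf("toeplitz = 0x%08x\n", (unsigned)toeplitz_hash(key, tuple, sizeof(tuple)));
	return 0;
}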
> I hope to have a new patch soon for steering packets to application
> CPU and using device hash, thanks for bugging me on it!
I had no choice, as I'm giving a presentation on this stuff tomorrow
night here in NYC :-)
* Re: [PATCH v2] Receive Packet Steering
2009-07-14 19:33 ` David Miller
@ 2009-07-14 23:28 ` Tom Herbert
2009-07-17 2:48 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Tom Herbert @ 2009-07-14 23:28 UTC (permalink / raw)
To: David Miller; +Cc: netdev
>
> > Right. In fact, just using the hash as the key is what you want when
> > device provides the hash (i.e. Toeplitz). The caveat is that we
> > should to prevent OOO packets when threads migrate, I think I have a
> > reasonable solution for that following your earlier suggestion.
>
> But as you found, compared to jhash, Toeplitz is expensive to compute
> and you would need to do this if implemented by monitoring transmits.
Well I'd rather not have to compute any hash on transmit. Toeplitz
could be populated in another field of skb from a saved value in the
socket struct.
>
> And furthermore, from the device perspective:
>
> 1) some (such as NIU) provide different hash values, not Toeplitz
>
> 2) it is expensive to extract this value (must enable different,
> larger, RX descriptor modes, thus more DMA traffic, etc.)
>
> I really don't think Toeplitz is a net win, to be honest.
Using the Toeplitz hash in steering lookup has given us about 10% more
maximum pps (2 different NICs), and we haven't really noticed negative
effects because of the extra descriptor overhead-- so I'm not going to
give up on it too easily! A device provided hash could probably be
used as an opaque value-- used as hash key into steering table on RX,
saved in socket structure in RX, and then sent with skb on transmit
during which it is used to update the steering table. At the socket
layer, we would just need to collect and cache values in socket
structures.
>
> > I hope to have a new patch soon for steering packets to application
> > CPU and using device hash, thanks for bugging me on it!
>
> I had no choice, as I'm giving a presentation on this stuff tomorrow
> night here in NYC :-)
Cool, do you happen to have slides.... :-)
* Re: [PATCH v2] Receive Packet Steering
2009-07-14 23:28 ` Tom Herbert
@ 2009-07-17 2:48 ` David Miller
2009-07-17 18:05 ` Tom Herbert
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2009-07-17 2:48 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Tue, 14 Jul 2009 16:28:01 -0700
> Using the Toeplitz hash in steering lookup has given us about 10% more
> maximum pps (2 different NICs), and we haven't really noticed negative
> effects because of the extra descriptor overhead-- so I'm not going to
> give up on it too easily!
Do you have any idea why? Does Toeplitz distribute better?
If so, that could be merely because either:
1) Our modulus avoidance scheme somehow decreases the distribution
features of the hash
2) The way we feed data into the hash has a similar effect
It's worth checking out.
>> I had no choice, as I'm giving a presentation on this stuff tomorrow
>> night here in NYC :-)
>
> Cool, do you happen to have slides.... :-)
I'll post them soon.
* Re: [PATCH v2] Receive Packet Steering
2009-07-17 2:48 ` David Miller
@ 2009-07-17 18:05 ` Tom Herbert
2009-07-17 18:08 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Tom Herbert @ 2009-07-17 18:05 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Thu, Jul 16, 2009 at 7:48 PM, David Miller <davem@davemloft.net> wrote:
>
> From: Tom Herbert <therbert@google.com>
> Date: Tue, 14 Jul 2009 16:28:01 -0700
>
> > Using the Toeplitz hash in steering lookup has given us about 10% more
> > maximum pps (2 different NICs), and we haven't really noticed negative
> > effects because of the extra descriptor overhead-- so I'm not going to
> > give up on it too easily!
>
> Do you have any idea why? Does Toeplitz distribute better?
> If so, that could be merely because either:
>
> 1) Our modulus avoidance scheme somehow decreases the distribution
> features of the hash
>
> 2) The way we feed data into the hash has a similar effect
>
> It's worth checking out.
The advantage is that Toeplitz, or any reasonable device provided
hash, allows packet steering to be done without taking any cache
misses on the packet itself. In particular, this helps with a NIC
that just provides the Toeplitz hash (without multiQ).
* Re: [PATCH v2] Receive Packet Steering
2009-07-17 18:05 ` Tom Herbert
@ 2009-07-17 18:08 ` David Miller
2009-07-17 19:59 ` Tom Herbert
0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2009-07-17 18:08 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Fri, 17 Jul 2009 11:05:58 -0700
> The advantage is that Toeplitz, or any reasonable device provided
> hash, allows packet steering to be done without taking any cache
> misses on the packet itself. In particular, this helps with a NIC
> that just provides Toeplitz hash (without multiQ),
Good point.
Depending upon the cache line size, however, we might have
at least the IP header in the cpu cache at this point since
eth_type_trans() had to pull in the entire ethernet header
underneath.
I totally misunderstood you, I thought you meant that with a
pure SW hash implementation, Toeplitz did better than jhash.
* Re: [PATCH v2] Receive Packet Steering
2009-07-17 18:08 ` David Miller
@ 2009-07-17 19:59 ` Tom Herbert
2009-07-18 3:54 ` David Miller
0 siblings, 1 reply; 23+ messages in thread
From: Tom Herbert @ 2009-07-17 19:59 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Fri, Jul 17, 2009 at 11:08 AM, David Miller<davem@davemloft.net> wrote:
> From: Tom Herbert <therbert@google.com>
> Date: Fri, 17 Jul 2009 11:05:58 -0700
>
>> The advantage is that Toeplitz, or any reasonable device provided
>> hash, allows packet steering to be done without taking any cache
>> misses on the packet itself. In particular, this helps with a NIC
>> that just provides Toeplitz hash (without multiQ),
>
> Good point.
>
> Depending upon the cache line size, however, we might have
> at least the IP header in the cpu cache at this point since
> eth_type_trans() had to pull in the entire ethernet header
> underneath.
>
eth_type_trans does not need to even be called assuming that the
packet type information can be inferred from the RX descriptor. We
are doing steering completely based on contents of the RX descriptor.
* Re: [PATCH v2] Receive Packet Steering
2009-07-17 19:59 ` Tom Herbert
@ 2009-07-18 3:54 ` David Miller
0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2009-07-18 3:54 UTC (permalink / raw)
To: therbert; +Cc: netdev
From: Tom Herbert <therbert@google.com>
Date: Fri, 17 Jul 2009 12:59:31 -0700
> eth_type_trans does not need to even be called assuming that the
> packet type information can be inferred from the RX descriptor.
Great assumption :-)
We investigated doing that in the past but it's racy and can't be
made to work in all situations.