* [PATCH net-next 1/2] sfc: Support setting rss_cpus to 'cores', 'packages' or 'hyperthreads'
2016-05-04 16:56 [PATCH net-next 0/2] sfc: RSS enhancements Edward Cree
@ 2016-05-04 17:01 ` Edward Cree
2016-05-06 19:38 ` David Miller
2016-05-04 17:02 ` [PATCH net-next 2/2] sfc: allocate rx pages on the same node as the interrupt Edward Cree
1 sibling, 1 reply; 6+ messages in thread
From: Edward Cree @ 2016-05-04 17:01 UTC (permalink / raw)
To: linux-net-drivers, netdev, David Miller
These settings autoconfigure the number of RSS channels to match the number of
CPUs present.
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
drivers/net/ethernet/sfc/efx.c | 143 ++++++++++++++++++++++++++++++++---------
1 file changed, 113 insertions(+), 30 deletions(-)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0705ec86..e6fdf35 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -22,6 +22,7 @@
#include <linux/gfp.h>
#include <linux/aer.h>
#include <linux/interrupt.h>
+#include <xen/xen.h>
#include "net_driver.h"
#include "efx.h"
#include "nic.h"
@@ -161,16 +162,19 @@ static unsigned int tx_irq_mod_usec = 150;
*/
static unsigned int interrupt_mode;
-/* This is the requested number of CPUs to use for Receive-Side Scaling (RSS),
- * i.e. the number of CPUs among which we may distribute simultaneous
- * interrupt handling.
+/* This is the requested number of CPUs to use for Receive-Side Scaling
+ * (RSS), i.e. the number of CPUs among which we may distribute
+ * simultaneous interrupt handling. Or alternatively it may be set to
+ * "packages", "cores" or "hyperthreads" to get one receive channel per
+ * package, core or hyperthread. The default is "cores".
*
- * Cards without MSI-X will only target one CPU via legacy or MSI interrupt.
- * The default (0) means to assign an interrupt to each core.
+ * Systems without MSI-X will only target one CPU via legacy or MSI
+ * interrupt.
*/
-static unsigned int rss_cpus;
-module_param(rss_cpus, uint, 0444);
-MODULE_PARM_DESC(rss_cpus, "Number of CPUs to use for Receive-Side Scaling");
+static char *rss_cpus;
+module_param(rss_cpus, charp, 0444);
+MODULE_PARM_DESC(rss_cpus, "Number of CPUs to use for Receive-Side Scaling, "
+ "or 'packages', 'cores' or 'hyperthreads'");
static bool phy_flash_cfg;
module_param(phy_flash_cfg, bool, 0644);
@@ -1324,51 +1328,130 @@ void efx_set_default_rx_indir_table(struct efx_nic *efx)
ethtool_rxfh_indir_default(i, efx->rss_spread);
}
-static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
+/* Count the number of unique packages in the given cpumask */
+static unsigned int efx_num_packages(const cpumask_t *in)
{
- cpumask_var_t thread_mask;
+ cpumask_var_t core_mask;
+ unsigned int count;
+ int cpu, cpu2;
+
+ if (unlikely(!zalloc_cpumask_var(&core_mask, GFP_KERNEL))) {
+ pr_warn("sfc: RSS disabled due to allocation failure\n");
+ return 1;
+ }
+
+ count = 0;
+ for_each_cpu(cpu, in) {
+ if (!cpumask_test_cpu(cpu, core_mask)) {
+ ++count;
+
+ /* Treat each NUMA node as a separate package */
+ for_each_cpu(cpu2, topology_core_cpumask(cpu)) {
+ if (cpu_to_node(cpu) == cpu_to_node(cpu2))
+ cpumask_set_cpu(cpu2, core_mask);
+ }
+ }
+ }
+
+ free_cpumask_var(core_mask);
+
+ return count;
+}
+
+/* Count the number of unique cores in the given cpumask */
+static unsigned int efx_num_cores(const cpumask_t *in)
+{
+ cpumask_var_t core_mask;
unsigned int count;
int cpu;
- if (rss_cpus) {
- count = rss_cpus;
- } else {
- if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) {
- netif_warn(efx, probe, efx->net_dev,
- "RSS disabled due to allocation failure\n");
- return 1;
+ if (unlikely(!zalloc_cpumask_var(&core_mask, GFP_KERNEL))) {
+ pr_warn("sfc: RSS disabled due to allocation failure\n");
+ return 1;
+ }
+
+ count = 0;
+ for_each_cpu(cpu, in) {
+ if (!cpumask_test_cpu(cpu, core_mask)) {
+ ++count;
+ cpumask_or(core_mask, core_mask,
+ topology_sibling_cpumask(cpu));
}
+ }
- count = 0;
- for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, thread_mask)) {
- ++count;
- cpumask_or(thread_mask, thread_mask,
- topology_sibling_cpumask(cpu));
- }
+ free_cpumask_var(core_mask);
+ return count;
+}
+
+static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
+{
+ struct net_device *net_dev = efx->net_dev;
+ unsigned int n_rxq;
+
+ if (rss_cpus == NULL) {
+ /* default to 'cores' */
+ goto cores;
+ } else if (strcmp(rss_cpus, "packages") == 0) {
+ if (xen_domain()) {
+ netif_warn(efx, drv, net_dev,
+ "Unable to determine CPU topology on Xen reliably. Using 4 rss channels.\n");
+ n_rxq = 4;
+ } else {
+ netif_dbg(efx, drv, net_dev,
+ "using efx_num_packages()\n");
+ n_rxq = efx_num_packages(cpu_online_mask);
+ /* Create two RSS queues even with a single package */
+ if (n_rxq == 1)
+ n_rxq = 2;
}
+ } else if (strcmp(rss_cpus, "cores") == 0) {
+cores:
+ if (xen_domain()) {
+ netif_warn(efx, drv, net_dev,
+ "Unable to determine CPU topology on Xen reliably. Assuming hyperthreading enabled.\n");
+ n_rxq = max_t(int, 1, num_online_cpus() / 2);
+ } else {
+ netif_dbg(efx, drv, net_dev,
+ "using efx_num_cores()\n");
+ n_rxq = efx_num_cores(cpu_online_mask);
+ }
+ } else if (strcmp(rss_cpus, "hyperthreads") == 0) {
+ n_rxq = num_online_cpus();
+ } else if (sscanf(rss_cpus, "%u", &n_rxq) == 1 && n_rxq > 0) {
+ /* nothing to do */
+ } else {
+ netif_err(efx, drv, net_dev,
+ "Bad value for module parameter rss_cpus='%s'\n",
+ rss_cpus);
+ /* default to 'cores' */
+ goto cores;
+ }
- free_cpumask_var(thread_mask);
+ if (n_rxq > EFX_MAX_RX_QUEUES) {
+ netif_warn(efx, drv, net_dev,
+ "Reducing number of rss channels from %u to %u.\n",
+ n_rxq, EFX_MAX_RX_QUEUES);
+ n_rxq = EFX_MAX_RX_QUEUES;
}
+#ifdef CONFIG_SFC_SRIOV
/* If RSS is requested for the PF *and* VFs then we can't write RSS
* table entries that are inaccessible to VFs
*/
-#ifdef CONFIG_SFC_SRIOV
if (efx->type->sriov_wanted) {
if (efx->type->sriov_wanted(efx) && efx_vf_size(efx) > 1 &&
- count > efx_vf_size(efx)) {
+ n_rxq > efx_vf_size(efx)) {
netif_warn(efx, probe, efx->net_dev,
"Reducing number of RSS channels from %u to %u for "
"VF support. Increase vf-msix-limit to use more "
"channels on the PF.\n",
- count, efx_vf_size(efx));
- count = efx_vf_size(efx);
+ n_rxq, efx_vf_size(efx));
+ n_rxq = efx_vf_size(efx);
}
}
#endif
- return count;
+ return n_rxq;
}
/* Probe the number and type of interrupts we are able to obtain, and
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH net-next 2/2] sfc: allocate rx pages on the same node as the interrupt
2016-05-04 16:56 [PATCH net-next 0/2] sfc: RSS enhancements Edward Cree
2016-05-04 17:01 ` [PATCH net-next 1/2] sfc: Support setting rss_cpus to 'cores', 'packages' or 'hyperthreads' Edward Cree
@ 2016-05-04 17:02 ` Edward Cree
1 sibling, 0 replies; 6+ messages in thread
From: Edward Cree @ 2016-05-04 17:02 UTC (permalink / raw)
To: linux-net-drivers, netdev, David Miller; +Cc: Daniel Pieczko
From: Daniel Pieczko <dpieczko@solarflare.com>
When the interrupt servicing a channel is on a NUMA node that is
not local to the device, performance is improved by allocating
rx pages on the node local to the interrupt (remote to the device).
The performance-optimal case, where interrupts and applications
are pinned to CPUs on the same node as the device, is not altered
by this change.
This change gave a 1% improvement in transaction rate using Nginx
with all interrupts and Nginx threads on the node remote to the
device. It also gave a small reduction in round-trip latency,
again with the interrupt and application on a different node to
the device.
Allocating rx pages based on the channel->irq_mem_node value is only
valid for the initial driver-load interrupt affinities; if an
interrupt is moved later, the wrong node may be used for the
allocation.
Signed-off-by: Bert Kenward <bkenward@solarflare.com>
Signed-off-by: Edward Cree <ecree@solarflare.com>
---
drivers/net/ethernet/sfc/efx.c | 38 +++++++++++++++++++++++++++++++++++
drivers/net/ethernet/sfc/net_driver.h | 3 +++
drivers/net/ethernet/sfc/rx.c | 18 ++++++++++++++---
3 files changed, 56 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index e6fdf35..f7e6ce5 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -449,6 +449,7 @@ efx_alloc_channel(struct efx_nic *efx, int i, struct efx_channel *old_channel)
channel->efx = efx;
channel->channel = i;
channel->type = &efx_default_channel_type;
+ channel->irq_mem_node = NUMA_NO_NODE;
for (j = 0; j < EFX_TXQ_TYPES; j++) {
tx_queue = &channel->tx_queue[j];
@@ -1569,6 +1570,38 @@ static int efx_probe_interrupts(struct efx_nic *efx)
return 0;
}
+#ifndef CONFIG_SMP
+static void efx_set_interrupt_affinity(struct efx_nic *efx __always_unused)
+{
+}
+
+static void efx_clear_interrupt_affinity(struct efx_nic *efx __always_unused)
+{
+}
+#else
+static void efx_set_interrupt_affinity(struct efx_nic *efx)
+{
+ struct efx_channel *channel;
+ unsigned int cpu;
+
+ efx_for_each_channel(channel, efx) {
+ cpu = cpumask_local_spread(channel->channel,
+ pcibus_to_node(efx->pci_dev->bus));
+
+ irq_set_affinity_hint(channel->irq, cpumask_of(cpu));
+ channel->irq_mem_node = cpu_to_mem(cpu);
+ }
+}
+
+static void efx_clear_interrupt_affinity(struct efx_nic *efx)
+{
+ struct efx_channel *channel;
+
+ efx_for_each_channel(channel, efx)
+ irq_set_affinity_hint(channel->irq, NULL);
+}
+#endif /* CONFIG_SMP */
+
static int efx_soft_enable_interrupts(struct efx_nic *efx)
{
struct efx_channel *channel, *end_channel;
@@ -3017,6 +3050,7 @@ static void efx_pci_remove_main(struct efx_nic *efx)
cancel_work_sync(&efx->reset_work);
efx_disable_interrupts(efx);
+ efx_clear_interrupt_affinity(efx);
efx_nic_fini_interrupt(efx);
efx_fini_port(efx);
efx->type->fini(efx);
@@ -3166,6 +3200,9 @@ static int efx_pci_probe_main(struct efx_nic *efx)
rc = efx_nic_init_interrupt(efx);
if (rc)
goto fail5;
+
+ efx_set_interrupt_affinity(efx);
+
rc = efx_enable_interrupts(efx);
if (rc)
goto fail6;
@@ -3173,6 +3210,7 @@ static int efx_pci_probe_main(struct efx_nic *efx)
return 0;
fail6:
+ efx_clear_interrupt_affinity(efx);
efx_nic_fini_interrupt(efx);
fail5:
efx_fini_port(efx);
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 38c4223..28c3673 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -423,6 +423,7 @@ enum efx_sync_events_state {
* @sync_events_state: Current state of sync events on this channel
* @sync_timestamp_major: Major part of the last ptp sync event
* @sync_timestamp_minor: Minor part of the last ptp sync event
+ * @irq_mem_node: Memory NUMA node of interrupt
*/
struct efx_channel {
struct efx_nic *efx;
@@ -468,6 +469,8 @@ struct efx_channel {
enum efx_sync_events_state sync_events_state;
u32 sync_timestamp_major;
u32 sync_timestamp_minor;
+
+ int irq_mem_node;
};
#ifdef CONFIG_NET_RX_BUSY_POLL
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 8956995..eed0c3b 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -163,9 +163,21 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
do {
page = efx_reuse_page(rx_queue);
if (page == NULL) {
- page = alloc_pages(__GFP_COLD | __GFP_COMP |
- (atomic ? GFP_ATOMIC : GFP_KERNEL),
- efx->rx_buffer_order);
+ /* GFP_ATOMIC may fail because of various reasons,
+ * and we re-schedule rx_fill from non-atomic
+ * context in such a case. So, use __GFP_NOWARN
+ * in the atomic case.
+ */
+ struct efx_channel *channel;
+
+ channel = efx_rx_queue_channel(rx_queue);
+ page = alloc_pages_node(channel->irq_mem_node,
+ __GFP_COMP |
+ (atomic ?
+ (GFP_ATOMIC | __GFP_NOWARN)
+ : GFP_KERNEL),
+ efx->rx_buffer_order);
+
if (unlikely(page == NULL))
return -ENOMEM;
dma_addr =
^ permalink raw reply related [flat|nested] 6+ messages in thread