Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH V2 net 08/20] net/ena: add hardware hints capability to the driver
From: Matt Wilson @ 2016-12-05  4:31 UTC (permalink / raw)
  To: Netanel Belgazal
  Cc: linux-kernel, davem, netdev, dwmw, zorik, alex, saeed, msw,
	aliguori, nafea
In-Reply-To: <1480857578-5065-9-git-send-email-netanel@annapurnalabs.com>

On Sun, Dec 04, 2016 at 03:19:26PM +0200, Netanel Belgazal wrote:
> The ENA device can update the ena driver about the desire timeouts.
> The hardware hints are transmitted as Asynchronous event to the driver.

This is really a new feature, not a bugfix - correct? If it is a new
feature, submit it separately. If the built-in defaults need to be
changed, submit that as a bugfix.

--msw

> In case the device does not support this capability, the driver
> will use its own defines.
> 
> Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
> ---
>  drivers/net/ethernet/amazon/ena/ena_admin_defs.h | 31 +++++++++
>  drivers/net/ethernet/amazon/ena/ena_com.c        | 41 ++++++++---
>  drivers/net/ethernet/amazon/ena/ena_com.h        |  5 ++
>  drivers/net/ethernet/amazon/ena/ena_ethtool.c    |  1 -
>  drivers/net/ethernet/amazon/ena/ena_netdev.c     | 86 +++++++++++++++++++-----
>  drivers/net/ethernet/amazon/ena/ena_netdev.h     | 19 +++++-
>  drivers/net/ethernet/amazon/ena/ena_regs_defs.h  |  2 +
>  7 files changed, 157 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
> index 6d70bf5..35ae511 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
> +++ b/drivers/net/ethernet/amazon/ena/ena_admin_defs.h
> @@ -70,6 +70,8 @@ enum ena_admin_aq_feature_id {
>  
>  	ENA_ADMIN_MAX_QUEUES_NUM		= 2,
>  
> +	ENA_ADMIN_HW_HINTS			= 3,
> +
>  	ENA_ADMIN_RSS_HASH_FUNCTION		= 10,
>  
>  	ENA_ADMIN_STATELESS_OFFLOAD_CONFIG	= 11,
> @@ -749,6 +751,31 @@ struct ena_admin_feature_rss_ind_table {
>  	struct ena_admin_rss_ind_table_entry inline_entry;
>  };
>  
> +/* When hint value is 0, driver should use it's own predefined value */
> +struct ena_admin_ena_hw_hints {
> +	/* value in ms */
> +	u16 mmio_read_timeout;
> +
> +	/* value in ms */
> +	u16 driver_watchdog_timeout;
> +
> +	/* Per packet tx completion timeout. value in ms */
> +	u16 missing_tx_completion_timeout;
> +
> +	u16 missed_tx_completion_count_threshold_to_reset;
> +
> +	/* value in ms */
> +	u16 admin_completion_tx_timeout;
> +
> +	u16 netdev_wd_timeout;
> +
> +	u16 max_tx_sgl_size;
> +
> +	u16 max_rx_sgl_size;
> +
> +	u16 reserved[8];
> +};
> +
>  struct ena_admin_get_feat_cmd {
>  	struct ena_admin_aq_common_desc aq_common_descriptor;
>  
> @@ -782,6 +809,8 @@ struct ena_admin_get_feat_resp {
>  		struct ena_admin_feature_rss_ind_table ind_table;
>  
>  		struct ena_admin_feature_intr_moder_desc intr_moderation;
> +
> +		struct ena_admin_ena_hw_hints hw_hints;
>  	} u;
>  };
>  
> @@ -857,6 +886,8 @@ enum ena_admin_aenq_notification_syndrom {
>  	ENA_ADMIN_SUSPEND	= 0,
>  
>  	ENA_ADMIN_RESUME	= 1,
> +
> +	ENA_ADMIN_UPDATE_HINTS	= 2,
>  };
>  
>  struct ena_admin_aenq_entry {
> diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
> index 46aad3a..f1e4f04 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_com.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_com.c
> @@ -508,15 +508,13 @@ static int ena_com_comp_status_to_errno(u8 comp_status)
>  static int ena_com_wait_and_process_admin_cq_polling(struct ena_comp_ctx *comp_ctx,
>  						     struct ena_com_admin_queue *admin_queue)
>  {
> -	unsigned long flags;
> -	u32 start_time;
> +	unsigned long flags, timeout;
>  	int ret;
>  
> -	start_time = ((u32)jiffies_to_usecs(jiffies));
> +	timeout = jiffies + usecs_to_jiffies(admin_queue->completion_timeout);
>  
>  	while (comp_ctx->status == ENA_CMD_SUBMITTED) {
> -		if ((((u32)jiffies_to_usecs(jiffies)) - start_time) >
> -		    ADMIN_CMD_TIMEOUT_US) {
> +		if (time_is_before_jiffies(timeout)) {
>  			pr_err("Wait for completion (polling) timeout\n");
>  			/* ENA didn't have any completion */
>  			spin_lock_irqsave(&admin_queue->q_lock, flags);
> @@ -560,7 +558,8 @@ static int ena_com_wait_and_process_admin_cq_interrupts(struct ena_comp_ctx *com
>  	int ret;
>  
>  	wait_for_completion_timeout(&comp_ctx->wait_event,
> -				    usecs_to_jiffies(ADMIN_CMD_TIMEOUT_US));
> +				    usecs_to_jiffies(
> +					    admin_queue->completion_timeout));
>  
>  	/* In case the command wasn't completed find out the root cause.
>  	 * There might be 2 kinds of errors
> @@ -600,12 +599,14 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
>  	struct ena_com_mmio_read *mmio_read = &ena_dev->mmio_read;
>  	volatile struct ena_admin_ena_mmio_req_read_less_resp *read_resp =
>  		mmio_read->read_resp;
> -	u32 mmio_read_reg, ret;
> +	u32 mmio_read_reg, timeout, ret;
>  	unsigned long flags;
>  	int i;
>  
>  	might_sleep();
>  
> +	timeout = mmio_read->reg_read_to ? : ENA_REG_READ_TIMEOUT;
> +
>  	/* If readless is disabled, perform regular read */
>  	if (!mmio_read->readless_supported)
>  		return readl(ena_dev->reg_bar + offset);
> @@ -626,14 +627,14 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
>  
>  	writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF);
>  
> -	for (i = 0; i < ENA_REG_READ_TIMEOUT; i++) {
> +	for (i = 0; i < timeout; i++) {
>  		if (read_resp->req_id == mmio_read->seq_num)
>  			break;
>  
>  		udelay(1);
>  	}
>  
> -	if (unlikely(i == ENA_REG_READ_TIMEOUT)) {
> +	if (unlikely(i == timeout)) {
>  		pr_err("reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n",
>  		       mmio_read->seq_num, offset, read_resp->req_id,
>  		       read_resp->reg_off);
> @@ -1717,6 +1718,20 @@ int ena_com_get_dev_attr_feat(struct ena_com_dev *ena_dev,
>  	memcpy(&get_feat_ctx->offload, &get_resp.u.offload,
>  	       sizeof(get_resp.u.offload));
>  
> +	/* Driver hints isn't mandatory admin command. So in case the
> +	 * command isn't supported set driver hints to 0
> +	 */
> +	rc = ena_com_get_feature(ena_dev, &get_resp, ENA_ADMIN_HW_HINTS);
> +
> +	if (!rc)
> +		memcpy(&get_feat_ctx->hw_hints, &get_resp.u.hw_hints,
> +		       sizeof(get_resp.u.hw_hints));
> +	else if (rc == -EPERM)
> +		memset(&get_feat_ctx->hw_hints, 0x0,
> +		       sizeof(get_feat_ctx->hw_hints));
> +	else
> +		return rc;
> +
>  	return 0;
>  }
>  
> @@ -1842,6 +1857,14 @@ int ena_com_dev_reset(struct ena_com_dev *ena_dev)
>  		return rc;
>  	}
>  
> +	timeout = (cap & ENA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
> +		ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
> +	if (timeout)
> +		/* the resolution of timeout reg is 100ms */
> +		ena_dev->admin_queue.completion_timeout = timeout * 100000;
> +	else
> +		ena_dev->admin_queue.completion_timeout = ADMIN_CMD_TIMEOUT_US;
> +
>  	return 0;
>  }
>  
> diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h
> index 509d7b8..6883ee5 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_com.h
> +++ b/drivers/net/ethernet/amazon/ena/ena_com.h
> @@ -96,6 +96,8 @@
>  #define ENA_INTR_MODER_LEVEL_STRIDE			2
>  #define ENA_INTR_BYTE_COUNT_NOT_SUPPORTED		0xFFFFFF
>  
> +#define ENA_HW_HINTS_NO_TIMEOUT				0xFFFF
> +
>  enum ena_intr_moder_level {
>  	ENA_INTR_MODER_LOWEST = 0,
>  	ENA_INTR_MODER_LOW,
> @@ -232,6 +234,7 @@ struct ena_com_admin_queue {
>  	void *q_dmadev;
>  	spinlock_t q_lock; /* spinlock for the admin queue */
>  	struct ena_comp_ctx *comp_ctx;
> +	u32 completion_timeout;
>  	u16 q_depth;
>  	struct ena_com_admin_cq cq;
>  	struct ena_com_admin_sq sq;
> @@ -266,6 +269,7 @@ struct ena_com_aenq {
>  struct ena_com_mmio_read {
>  	struct ena_admin_ena_mmio_req_read_less_resp *read_resp;
>  	dma_addr_t read_resp_dma_addr;
> +	u32 reg_read_to; /* in us */
>  	u16 seq_num;
>  	bool readless_supported;
>  	/* spin lock to ensure a single outstanding read */
> @@ -335,6 +339,7 @@ struct ena_com_dev_get_features_ctx {
>  	struct ena_admin_device_attr_feature_desc dev_attr;
>  	struct ena_admin_feature_aenq_desc aenq;
>  	struct ena_admin_feature_offload_desc offload;
> +	struct ena_admin_ena_hw_hints hw_hints;
>  };
>  
>  struct ena_com_create_io_ctx {
> diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
> index 67b2338f..a1fbfc2 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
> @@ -80,7 +80,6 @@ static const struct ena_stats ena_stats_tx_strings[] = {
>  	ENA_STAT_TX_ENTRY(tx_poll),
>  	ENA_STAT_TX_ENTRY(doorbells),
>  	ENA_STAT_TX_ENTRY(prepare_ctx_err),
> -	ENA_STAT_TX_ENTRY(missing_tx_comp),
>  	ENA_STAT_TX_ENTRY(bad_req_id),
>  };
>  
> diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> index 962ffb5..0b718ee 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> @@ -1981,6 +1981,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	tx_info->tx_descs = nb_hw_desc;
>  	tx_info->last_jiffies = jiffies;
> +	tx_info->print_once = 0;
>  
>  	tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
>  		tx_ring->ring_size);
> @@ -2554,33 +2555,34 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
>  	if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags))
>  		return;
>  
> +	if (adapter->missing_tx_completion_to == ENA_HW_HINTS_NO_TIMEOUT)
> +		return;
> +
>  	budget = ENA_MONITORED_TX_QUEUES;
>  
>  	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
>  		tx_ring = &adapter->tx_ring[i];
>  
> +		missed_tx = 0;
> +
>  		for (j = 0; j < tx_ring->ring_size; j++) {
>  			tx_buf = &tx_ring->tx_buffer_info[j];
>  			last_jiffies = tx_buf->last_jiffies;
> -			if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
> -				netif_notice(adapter, tx_err, adapter->netdev,
> -					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
> -					     tx_ring->qid, j);
> +			if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + adapter->missing_tx_completion_to))) {
> +				if (!tx_buf->print_once)
> +					netif_notice(adapter, tx_err, adapter->netdev,
> +						     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
> +						     tx_ring->qid, j);
>  
> -				u64_stats_update_begin(&tx_ring->syncp);
> -				missed_tx = tx_ring->tx_stats.missing_tx_comp++;
> -				u64_stats_update_end(&tx_ring->syncp);
> +				tx_buf->print_once = 1;
> +				missed_tx++;
>  
> -				/* Clear last jiffies so the lost buffer won't
> -				 * be counted twice.
> -				 */
> -				tx_buf->last_jiffies = 0;
> -
> -				if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
> +				if (unlikely(missed_tx > adapter->missing_tx_completion_threshold)) {
>  					netif_err(adapter, tx_err, adapter->netdev,
>  						  "The number of lost tx completion is above the threshold (%d > %d). Reset the device\n",
> -						  missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
> +						  missed_tx, adapter->missing_tx_completion_threshold);
>  					set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
> +					return;
>  				}
>  			}
>  		}
> @@ -2601,8 +2603,11 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter)
>  	if (!adapter->wd_state)
>  		return;
>  
> -	keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies
> -					   + ENA_DEVICE_KALIVE_TIMEOUT);
> +	if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT)
> +		return;
> +
> +	keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies +
> +					   adapter->keep_alive_timeout);
>  	if (unlikely(time_is_before_jiffies(keep_alive_expired))) {
>  		netif_err(adapter, drv, adapter->netdev,
>  			  "Keep alive watchdog timeout.\n");
> @@ -2625,6 +2630,44 @@ static void check_for_admin_com_state(struct ena_adapter *adapter)
>  	}
>  }
>  
> +static void ena_update_hints(struct ena_adapter *adapter,
> +			     struct ena_admin_ena_hw_hints *hints)
> +{
> +	struct net_device *netdev = adapter->netdev;
> +
> +	if (hints->admin_completion_tx_timeout)
> +		adapter->ena_dev->admin_queue.completion_timeout =
> +			hints->admin_completion_tx_timeout * 1000;
> +
> +	if (hints->mmio_read_timeout)
> +		/* convert to usec */
> +		adapter->ena_dev->mmio_read.reg_read_to =
> +			hints->mmio_read_timeout * 1000;
> +
> +	if (hints->missed_tx_completion_count_threshold_to_reset)
> +		adapter->missing_tx_completion_threshold =
> +			hints->missed_tx_completion_count_threshold_to_reset;
> +
> +	if (hints->missing_tx_completion_timeout) {
> +		if (hints->missing_tx_completion_timeout == ENA_HW_HINTS_NO_TIMEOUT)
> +			adapter->missing_tx_completion_to = ENA_HW_HINTS_NO_TIMEOUT;
> +		else
> +			adapter->missing_tx_completion_to =
> +				msecs_to_jiffies(hints->missing_tx_completion_timeout);
> +	}
> +
> +	if (hints->netdev_wd_timeout)
> +		netdev->watchdog_timeo = msecs_to_jiffies(hints->netdev_wd_timeout);
> +
> +	if (hints->driver_watchdog_timeout) {
> +		if (hints->driver_watchdog_timeout == ENA_HW_HINTS_NO_TIMEOUT)
> +			adapter->keep_alive_timeout = ENA_HW_HINTS_NO_TIMEOUT;
> +		else
> +			adapter->keep_alive_timeout =
> +				msecs_to_jiffies(hints->driver_watchdog_timeout);
> +	}
> +}
> +
>  static void ena_update_host_info(struct ena_admin_host_info *host_info,
>  				 struct net_device *netdev)
>  {
> @@ -3036,6 +3079,11 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  	INIT_WORK(&adapter->reset_task, ena_fw_reset_device);
>  
>  	adapter->last_keep_alive_jiffies = jiffies;
> +	adapter->keep_alive_timeout = ENA_DEVICE_KALIVE_TIMEOUT;
> +	adapter->missing_tx_completion_to = TX_TIMEOUT;
> +	adapter->missing_tx_completion_threshold = MAX_NUM_OF_TIMEOUTED_PACKETS;
> +
> +	ena_update_hints(adapter, &get_feat_ctx.hw_hints);
>  
>  	init_timer(&adapter->timer_service);
>  	adapter->timer_service.expires = round_jiffies(jiffies + HZ);
> @@ -3256,6 +3304,7 @@ static void ena_notification(void *adapter_data,
>  			     struct ena_admin_aenq_entry *aenq_e)
>  {
>  	struct ena_adapter *adapter = (struct ena_adapter *)adapter_data;
> +	struct ena_admin_ena_hw_hints *hints;
>  
>  	WARN(aenq_e->aenq_common_desc.group != ENA_ADMIN_NOTIFICATION,
>  	     "Invalid group(%x) expected %x\n",
> @@ -3273,6 +3322,11 @@ static void ena_notification(void *adapter_data,
>  	case ENA_ADMIN_RESUME:
>  		queue_work(ena_wq, &adapter->resume_io_task);
>  		break;
> +	case ENA_ADMIN_UPDATE_HINTS:
> +		hints = (struct ena_admin_ena_hw_hints *)
> +			(&aenq_e->inline_data_w4);
> +		ena_update_hints(adapter, hints);
> +		break;
>  	default:
>  		netif_err(adapter, drv, adapter->netdev,
>  			  "Invalid aenq notification link state %d\n",
> diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
> index f0ddc11..2897fab 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
> +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
> @@ -146,7 +146,18 @@ struct ena_tx_buffer {
>  	u32 tx_descs;
>  	/* num of buffers used by this skb */
>  	u32 num_of_bufs;
> -	/* Save the last jiffies to detect missing tx packets */
> +
> +	/* Used for detect missing tx packets to limit the number of prints */
> +	u32 print_once;
> +	/* Save the last jiffies to detect missing tx packets
> +	 *
> +	 * sets to non zero value on ena_start_xmit and set to zero on
> +	 * napi and timer_Service_routine.
> +	 *
> +	 * while this value is not protected by lock,
> +	 * a given packet is not expected to be handled by ena_start_xmit
> +	 * and by napi/timer_service at the same time.
> +	 */
>  	unsigned long last_jiffies;
>  	struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
>  } ____cacheline_aligned;
> @@ -170,7 +181,6 @@ struct ena_stats_tx {
>  	u64 napi_comp;
>  	u64 tx_poll;
>  	u64 doorbells;
> -	u64 missing_tx_comp;
>  	u64 bad_req_id;
>  };
>  
> @@ -270,6 +280,8 @@ struct ena_adapter {
>  	struct msix_entry *msix_entries;
>  	int msix_vecs;
>  
> +	u32 missing_tx_completion_threshold;
> +
>  	u32 tx_usecs, rx_usecs; /* interrupt moderation */
>  	u32 tx_frames, rx_frames; /* interrupt moderation */
>  
> @@ -283,6 +295,9 @@ struct ena_adapter {
>  
>  	u8 mac_addr[ETH_ALEN];
>  
> +	unsigned long keep_alive_timeout;
> +	unsigned long missing_tx_completion_to;
> +
>  	char name[ENA_NAME_MAX_LEN];
>  
>  	unsigned long flags;
> diff --git a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
> index 26097a2..c3891c5 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
> +++ b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
> @@ -78,6 +78,8 @@
>  #define ENA_REGS_CAPS_RESET_TIMEOUT_MASK		0x3e
>  #define ENA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT		8
>  #define ENA_REGS_CAPS_DMA_ADDR_WIDTH_MASK		0xff00
> +#define ENA_REGS_CAPS_ADMIN_CMD_TO_SHIFT		16
> +#define ENA_REGS_CAPS_ADMIN_CMD_TO_MASK		0xf0000
>  
>  /* aq_caps register */
>  #define ENA_REGS_AQ_CAPS_AQ_DEPTH_MASK		0xffff

^ permalink raw reply

* Re: [PATCH V2 net 13/20] net/ena: change driver's default timeouts
From: Matt Wilson @ 2016-12-05  4:35 UTC (permalink / raw)
  To: Netanel Belgazal
  Cc: linux-kernel, davem, netdev, dwmw, zorik, alex, saeed, msw,
	aliguori, nafea
In-Reply-To: <1480857578-5065-14-git-send-email-netanel@annapurnalabs.com>

On Sun, Dec 04, 2016 at 03:19:31PM +0200, Netanel Belgazal wrote:

... because? (they turned out to be too aggressive, I believe.)

> Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
> ---
>  drivers/net/ethernet/amazon/ena/ena_com.c    | 4 ++--
>  drivers/net/ethernet/amazon/ena/ena_netdev.h | 7 ++++---
>  2 files changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
> index 4147d6e..a550c8a 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_com.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_com.c
> @@ -36,9 +36,9 @@
>  /*****************************************************************************/
>  
>  /* Timeout in micro-sec */
> -#define ADMIN_CMD_TIMEOUT_US (1000000)
> +#define ADMIN_CMD_TIMEOUT_US (3000000)
>  
> -#define ENA_ASYNC_QUEUE_DEPTH 4
> +#define ENA_ASYNC_QUEUE_DEPTH 16

Why is this changed at the same time?

>  #define ENA_ADMIN_QUEUE_DEPTH 32
>  
>  #define MIN_ENA_VER (((ENA_COMMON_SPEC_VERSION_MAJOR) << \
> diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
> index c081fd3..ed42e07 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
> +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
> @@ -39,6 +39,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/netdevice.h>
>  #include <linux/skbuff.h>
> +#include <linux/u64_stats_sync.h>

This change seems unrelated.

>  #include "ena_com.h"
>  #include "ena_eth_com.h"
> @@ -100,7 +101,7 @@
>  /* Number of queues to check for missing queues per timer service */
>  #define ENA_MONITORED_TX_QUEUES	4
>  /* Max timeout packets before device reset */
> -#define MAX_NUM_OF_TIMEOUTED_PACKETS 32
> +#define MAX_NUM_OF_TIMEOUTED_PACKETS 128
>  
>  #define ENA_TX_RING_IDX_NEXT(idx, ring_size) (((idx) + 1) & ((ring_size) - 1))
>  
> @@ -116,9 +117,9 @@
>  #define ENA_IO_IRQ_IDX(q)		(ENA_IO_IRQ_FIRST_IDX + (q))
>  
>  /* ENA device should send keep alive msg every 1 sec.
> - * We wait for 3 sec just to be on the safe side.
> + * We wait for 6 sec just to be on the safe side.
>   */
> -#define ENA_DEVICE_KALIVE_TIMEOUT	(3 * HZ)
> +#define ENA_DEVICE_KALIVE_TIMEOUT	(6 * HZ)
>  
>  #define ENA_MMIO_DISABLE_REG_READ	BIT(0)
>  

^ permalink raw reply

* Re: [PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket
From: Isaac Boukris @ 2016-12-05  5:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev
In-Reply-To: <1480721075.18162.389.camel@edumazet-glaptop3.roam.corp.google.com>

On Sat, Dec 3, 2016 at 1:24 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2016-12-02 at 15:18 -0800, Stephen Hemminger wrote:
>>                                     name[i] = '@';
>> >
>> > ss.c: In function 'unix_show_sock':
>> > ss.c:3128:4: error: 'for' loop initial declarations are only allowed in C99 mode
>> > ss.c:3128:4: note: use option -std=c99 or -std=gnu99 to compile your code
>> > make[1]: *** [ss.o] Error 1
>> >
>> >
>> >
>>
>> Thanks, fixed by patch from Simon
>
> Right, thanks !


Thanks for notifying me. Sorry for the bug and thanks for the fix!

^ permalink raw reply

* Re: [PATCH V2 net 10/20] net/ena: remove redundant logic in napi callback for busy poll mode
From: Eric Dumazet @ 2016-12-05  5:45 UTC (permalink / raw)
  To: Netanel Belgazal
  Cc: linux-kernel, davem, netdev, dwmw, zorik, alex, saeed, msw,
	aliguori, nafea
In-Reply-To: <1480857578-5065-11-git-send-email-netanel@annapurnalabs.com>

On Sun, 2016-12-04 at 15:19 +0200, Netanel Belgazal wrote:
> sk_busy_loop can call the napi callback few million times a sec.
> For each call there is unmask interrupt.
> We want to reduce the number of unmasks.
> 
> Add an atomic variable that will tell the napi handler if
> it was called from irq context or not.
> Unmask the interrupt only from irq context.
> 
> A schenario where the driver left with missed unmask isn't feasible.
> when ena_intr_msix_io is called the driver have 2 options:
> 1)Before napi completes and call napi_complete_done
> 2)After calling napi_complete_done
> 
> In the former case the napi will unmask the interrupt as needed.
> In the latter case napi_complete_done will remove napi from the schedule
> list so napi will be rescheduled (by ena_intr_msix_io) and interrupt
> will be unmasked as desire in the 2nd napi call.
> 
> Signed-off-by: Netanel Belgazal <netanel@annapurnalabs.com>
> ---


This looks very complicated to me.

I guess you missed the recent patches that happened on net-next ?

2e713283751f494596655d9125c168aeb913f71d net/mlx4_en: use napi_complete_done() return value
364b6055738b4c752c30ccaaf25c624e69d76195 net: busy-poll: return busypolling status to drivers
21cb84c48ca0619181106f0f44f3802a989de024 net: busy-poll: remove need_resched() from sk_can_busy_loop()
217f6974368188fd8bd7804bf5a036aa5762c5e4 net: busy-poll: allow preemption in sk_busy_loop()

napi_complete_done() return code can be used by a driver,
no need to add yet another atomic operation in fast path.

Anyway, this looks wrong :

@@ -1186,6 +1201,7 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data)
 {
        struct ena_napi *ena_napi = data;
 
+       atomic_set(&ena_napi->unmask_interrupt, 1);
        napi_schedule(&ena_napi->napi);
 
You probably wanted :

if (napi_schedule_prep(n)) {
	atomic_set(&ena_napi->unmask_interrupt, 1);
	__napi_schedule(n);
}



Please rework this napi poll using core infrastructure.

busypoll logic should be centralized, not reimplemented in different ways in a driver.

Thanks.

^ permalink raw reply

* [PATCH 01/28] bnxt_en: Add bnxt_set_max_func_irqs().
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Michael Chan, David Miller,
	netdev-u79uwXL29TY76Z2rM5mHXA, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

From: Michael Chan <michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

By refactoring existing code into this new function.  The new function
will be used in subsequent patches.

Cc: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: <netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>
Signed-off-by: Michael Chan <michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
Signed-off-by: Selvin Xavier <selvin.xavier-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 +
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index e8ab5fd..6cdfe3e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,16 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
 	return 0;
 }
 
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
+{
+	if (BNXT_PF(bp))
+		bp->pf.max_irqs = max_irqs;
+#if defined(CONFIG_BNXT_SRIOV)
+	else
+		bp->vf.max_irqs = max_irqs;
+#endif
+}
+
 static int bnxt_setup_msix(struct bnxt *bp)
 {
 	struct msix_entry *msix_ent;
@@ -6949,12 +6959,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	bnxt_set_tpa_flags(bp);
 	bnxt_set_ring_params(bp);
-	if (BNXT_PF(bp))
-		bp->pf.max_irqs = max_irqs;
-#if defined(CONFIG_BNXT_SRIOV)
-	else
-		bp->vf.max_irqs = max_irqs;
-#endif
+	bnxt_set_max_func_irqs(bp, max_irqs);
 	bnxt_set_dflt_rings(bp);
 
 	/* Default RSS hash cfg. */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index b4abc1b..8327d0d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1235,6 +1235,7 @@ int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
 int bnxt_hwrm_func_qcaps(struct bnxt *);
+void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
 int bnxt_hwrm_set_pause(struct bnxt *);
-- 
2.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 06/28] bnxt_en: Refactor the driver registration function with firmware.
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Michael Chan, David Miller,
	netdev-u79uwXL29TY76Z2rM5mHXA, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

From: Michael Chan <michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>

The driver register function with firmware consists of passing version
information and registering for async events.  To support the RDMA driver,
the async events that we need to register may change.  Separate the
driver register function into 2 parts so that we can just update the
async events for the RDMA driver.

Cc: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: <netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>
Signed-off-by: Michael Chan <michael.chan-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
Signed-off-by: Selvin Xavier <selvin.xavier-dY08KVG/lbpWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 34 ++++++++++++++++++++++++++-----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  2 ++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7218d65..c26735ea 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3117,27 +3117,46 @@ int hwrm_send_message_silent(struct bnxt *bp, void *msg, u32 msg_len,
 	return rc;
 }
 
-static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+				     int bmap_size)
 {
 	struct hwrm_func_drv_rgtr_input req = {0};
-	int i;
 	DECLARE_BITMAP(async_events_bmap, 256);
 	u32 *events = (u32 *)async_events_bmap;
+	int i;
 
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
 
 	req.enables =
-		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
-			    FUNC_DRV_RGTR_REQ_ENABLES_VER |
-			    FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
+		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD);
 
 	memset(async_events_bmap, 0, sizeof(async_events_bmap));
 	for (i = 0; i < ARRAY_SIZE(bnxt_async_events_arr); i++)
 		__set_bit(bnxt_async_events_arr[i], async_events_bmap);
 
+	if (bmap && bmap_size) {
+		for (i = 0; i < bmap_size; i++) {
+			if (test_bit(i, bmap))
+				__set_bit(i, async_events_bmap);
+		}
+	}
+
 	for (i = 0; i < 8; i++)
 		req.async_event_fwd[i] |= cpu_to_le32(events[i]);
 
+	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+}
+
+static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
+{
+	struct hwrm_func_drv_rgtr_input req = {0};
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_DRV_RGTR, -1, -1);
+
+	req.enables =
+		cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE |
+			    FUNC_DRV_RGTR_REQ_ENABLES_VER);
+
 	req.os_type = cpu_to_le16(FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX);
 	req.ver_maj = DRV_VER_MAJ;
 	req.ver_min = DRV_VER_MIN;
@@ -3146,6 +3165,7 @@ static int bnxt_hwrm_func_drv_rgtr(struct bnxt *bp)
 	if (BNXT_PF(bp)) {
 		DECLARE_BITMAP(vf_req_snif_bmap, 256);
 		u32 *data = (u32 *)vf_req_snif_bmap;
+		int i;
 
 		memset(vf_req_snif_bmap, 0, sizeof(vf_req_snif_bmap));
 		for (i = 0; i < ARRAY_SIZE(bnxt_vf_req_snif); i++)
@@ -7023,6 +7043,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	rc = bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+	if (rc)
+		goto init_err;
+
 	/* Get the MAX capabilities for this function */
 	rc = bnxt_hwrm_func_qcaps(bp);
 	if (rc) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index d796836..eec2415 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1240,6 +1240,8 @@ void bnxt_hwrm_cmd_hdr_init(struct bnxt *, void *, u16, u16, u16);
 int _hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
+int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
+				     int bmap_size);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
-- 
2.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 02/28] bnxt_en: Enable MSIX early in bnxt_init_one().
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford; +Cc: linux-rdma, Michael Chan, David Miller, netdev, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>

To better support the new RDMA driver, we need to move pci_enable_msix()
from bnxt_open() to bnxt_init_one().  This way, MSIX vectors are available
to the RDMA driver whether the network device is up or down.

Part of the existing bnxt_setup_int_mode() function is now refactored into
a new bnxt_init_int_mode().  bnxt_init_int_mode() is called during
bnxt_init_one() to enable MSIX.  The remaining logic in
bnxt_setup_int_mode() to map the IRQs to the completion rings is called
during bnxt_open().

Cc: David Miller <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 183 +++++++++++++++++++-----------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |   1 +
 2 files changed, 115 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6cdfe3e..9178bf8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4743,6 +4743,80 @@ static int bnxt_trim_rings(struct bnxt *bp, int *rx, int *tx, int max,
 	return 0;
 }
 
+static void bnxt_setup_msix(struct bnxt *bp)
+{
+	const int len = sizeof(bp->irq_tbl[0].name);
+	struct net_device *dev = bp->dev;
+	int tcs, i;
+
+	tcs = netdev_get_num_tc(dev);
+	if (tcs > 1) {
+		bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
+		if (bp->tx_nr_rings_per_tc == 0) {
+			netdev_reset_tc(dev);
+			bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+		} else {
+			int i, off, count;
+
+			bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
+			for (i = 0; i < tcs; i++) {
+				count = bp->tx_nr_rings_per_tc;
+				off = i * count;
+				netdev_set_tc_queue(dev, i, count, off);
+			}
+		}
+	}
+
+	for (i = 0; i < bp->cp_nr_rings; i++) {
+		char *attr;
+
+		if (bp->flags & BNXT_FLAG_SHARED_RINGS)
+			attr = "TxRx";
+		else if (i < bp->rx_nr_rings)
+			attr = "rx";
+		else
+			attr = "tx";
+
+		snprintf(bp->irq_tbl[i].name, len, "%s-%s-%d", dev->name, attr,
+			 i);
+		bp->irq_tbl[i].handler = bnxt_msix;
+	}
+}
+
+static void bnxt_setup_inta(struct bnxt *bp)
+{
+	const int len = sizeof(bp->irq_tbl[0].name);
+
+	if (netdev_get_num_tc(bp->dev))
+		netdev_reset_tc(bp->dev);
+
+	snprintf(bp->irq_tbl[0].name, len, "%s-%s-%d", bp->dev->name, "TxRx",
+		 0);
+	bp->irq_tbl[0].handler = bnxt_inta;
+}
+
+static int bnxt_setup_int_mode(struct bnxt *bp)
+{
+	int rc;
+
+	if (bp->flags & BNXT_FLAG_USING_MSIX)
+		bnxt_setup_msix(bp);
+	else
+		bnxt_setup_inta(bp);
+
+	rc = bnxt_set_real_num_queues(bp);
+	return rc;
+}
+
+static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
+{
+	if (BNXT_PF(bp))
+		return bp->pf.max_irqs;
+#if defined(CONFIG_BNXT_SRIOV)
+	return bp->vf.max_irqs;
+#endif
+}
+
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
 {
 	if (BNXT_PF(bp))
@@ -4753,16 +4827,12 @@ void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max_irqs)
 #endif
 }
 
-static int bnxt_setup_msix(struct bnxt *bp)
+static int bnxt_init_msix(struct bnxt *bp)
 {
-	struct msix_entry *msix_ent;
-	struct net_device *dev = bp->dev;
 	int i, total_vecs, rc = 0, min = 1;
-	const int len = sizeof(bp->irq_tbl[0].name);
-
-	bp->flags &= ~BNXT_FLAG_USING_MSIX;
-	total_vecs = bp->cp_nr_rings;
+	struct msix_entry *msix_ent;
 
+	total_vecs = bnxt_get_max_func_irqs(bp);
 	msix_ent = kcalloc(total_vecs, sizeof(struct msix_entry), GFP_KERNEL);
 	if (!msix_ent)
 		return -ENOMEM;
@@ -4783,8 +4853,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
 
 	bp->irq_tbl = kcalloc(total_vecs, sizeof(struct bnxt_irq), GFP_KERNEL);
 	if (bp->irq_tbl) {
-		int tcs;
+		for (i = 0; i < total_vecs; i++)
+			bp->irq_tbl[i].vector = msix_ent[i].vector;
 
+		bp->total_irqs = total_vecs;
 		/* Trim rings based upon num of vectors allocated */
 		rc = bnxt_trim_rings(bp, &bp->rx_nr_rings, &bp->tx_nr_rings,
 				     total_vecs, min == 1);
@@ -4792,43 +4864,10 @@ static int bnxt_setup_msix(struct bnxt *bp)
 			goto msix_setup_exit;
 
 		bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
-		tcs = netdev_get_num_tc(dev);
-		if (tcs > 1) {
-			bp->tx_nr_rings_per_tc = bp->tx_nr_rings / tcs;
-			if (bp->tx_nr_rings_per_tc == 0) {
-				netdev_reset_tc(dev);
-				bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
-			} else {
-				int i, off, count;
+		bp->cp_nr_rings = (min == 1) ?
+				  max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) :
+				  bp->tx_nr_rings + bp->rx_nr_rings;
 
-				bp->tx_nr_rings = bp->tx_nr_rings_per_tc * tcs;
-				for (i = 0; i < tcs; i++) {
-					count = bp->tx_nr_rings_per_tc;
-					off = i * count;
-					netdev_set_tc_queue(dev, i, count, off);
-				}
-			}
-		}
-		bp->cp_nr_rings = total_vecs;
-
-		for (i = 0; i < bp->cp_nr_rings; i++) {
-			char *attr;
-
-			bp->irq_tbl[i].vector = msix_ent[i].vector;
-			if (bp->flags & BNXT_FLAG_SHARED_RINGS)
-				attr = "TxRx";
-			else if (i < bp->rx_nr_rings)
-				attr = "rx";
-			else
-				attr = "tx";
-
-			snprintf(bp->irq_tbl[i].name, len,
-				 "%s-%s-%d", dev->name, attr, i);
-			bp->irq_tbl[i].handler = bnxt_msix;
-		}
-		rc = bnxt_set_real_num_queues(bp);
-		if (rc)
-			goto msix_setup_exit;
 	} else {
 		rc = -ENOMEM;
 		goto msix_setup_exit;
@@ -4838,52 +4877,54 @@ static int bnxt_setup_msix(struct bnxt *bp)
 	return 0;
 
 msix_setup_exit:
-	netdev_err(bp->dev, "bnxt_setup_msix err: %x\n", rc);
+	netdev_err(bp->dev, "bnxt_init_msix err: %x\n", rc);
+	kfree(bp->irq_tbl);
+	bp->irq_tbl = NULL;
 	pci_disable_msix(bp->pdev);
 	kfree(msix_ent);
 	return rc;
 }
 
-static int bnxt_setup_inta(struct bnxt *bp)
+static int bnxt_init_inta(struct bnxt *bp)
 {
-	int rc;
-	const int len = sizeof(bp->irq_tbl[0].name);
-
-	if (netdev_get_num_tc(bp->dev))
-		netdev_reset_tc(bp->dev);
-
 	bp->irq_tbl = kcalloc(1, sizeof(struct bnxt_irq), GFP_KERNEL);
-	if (!bp->irq_tbl) {
-		rc = -ENOMEM;
-		return rc;
-	}
+	if (!bp->irq_tbl)
+		return -ENOMEM;
+
+	bp->total_irqs = 1;
 	bp->rx_nr_rings = 1;
 	bp->tx_nr_rings = 1;
 	bp->cp_nr_rings = 1;
 	bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
 	bp->flags |= BNXT_FLAG_SHARED_RINGS;
 	bp->irq_tbl[0].vector = bp->pdev->irq;
-	snprintf(bp->irq_tbl[0].name, len,
-		 "%s-%s-%d", bp->dev->name, "TxRx", 0);
-	bp->irq_tbl[0].handler = bnxt_inta;
-	rc = bnxt_set_real_num_queues(bp);
-	return rc;
+	return 0;
 }
 
-static int bnxt_setup_int_mode(struct bnxt *bp)
+static int bnxt_init_int_mode(struct bnxt *bp)
 {
 	int rc = 0;
 
 	if (bp->flags & BNXT_FLAG_MSIX_CAP)
-		rc = bnxt_setup_msix(bp);
+		rc = bnxt_init_msix(bp);
 
 	if (!(bp->flags & BNXT_FLAG_USING_MSIX) && BNXT_PF(bp)) {
 		/* fallback to INTA */
-		rc = bnxt_setup_inta(bp);
+		rc = bnxt_init_inta(bp);
 	}
 	return rc;
 }
 
+static void bnxt_clear_int_mode(struct bnxt *bp)
+{
+	if (bp->flags & BNXT_FLAG_USING_MSIX)
+		pci_disable_msix(bp->pdev);
+
+	kfree(bp->irq_tbl);
+	bp->irq_tbl = NULL;
+	bp->flags &= ~BNXT_FLAG_USING_MSIX;
+}
+
 static void bnxt_free_irq(struct bnxt *bp)
 {
 	struct bnxt_irq *irq;
@@ -4902,10 +4943,6 @@ static void bnxt_free_irq(struct bnxt *bp)
 			free_irq(irq->vector, bp->bnapi[i]);
 		irq->requested = 0;
 	}
-	if (bp->flags & BNXT_FLAG_USING_MSIX)
-		pci_disable_msix(bp->pdev);
-	kfree(bp->irq_tbl);
-	bp->irq_tbl = NULL;
 }
 
 static int bnxt_request_irq(struct bnxt *bp)
@@ -6695,6 +6732,7 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	cancel_work_sync(&bp->sp_task);
 	bp->sp_event = 0;
 
+	bnxt_clear_int_mode(bp);
 	bnxt_hwrm_func_drv_unrgtr(bp);
 	bnxt_free_hwrm_resources(bp);
 	bnxt_dcb_free(bp);
@@ -6990,10 +7028,14 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
-	rc = register_netdev(dev);
+	rc = bnxt_init_int_mode(bp);
 	if (rc)
 		goto init_err;
 
+	rc = register_netdev(dev);
+	if (rc)
+		goto init_err_clr_int;
+
 	netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
 		    board_info[ent->driver_data].name,
 		    (long)pci_resource_start(pdev, 0), dev->dev_addr);
@@ -7002,6 +7044,9 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	return 0;
 
+init_err_clr_int:
+	bnxt_clear_int_mode(bp);
+
 init_err:
 	pci_iounmap(pdev, bp->bar0);
 	pci_release_regions(pdev);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 8327d0d..1461355 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1024,6 +1024,7 @@ struct bnxt {
 #define BNXT_STATE_FN_RST_DONE	2
 
 	struct bnxt_irq	*irq_tbl;
+	int			total_irqs;
 	u8			mac_addr[ETH_ALEN];
 
 #ifdef CONFIG_BNXT_DCB
-- 
2.5.5

^ permalink raw reply related

* [PATCH 03/28] bnxt_en: Move function reset to bnxt_init_one().
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford; +Cc: linux-rdma, Michael Chan, David Miller, netdev, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>

Now that MSIX is enabled in bnxt_init_one(), resources may be allocated by
the RDMA driver before the network device is opened.  So we cannot do
function reset in bnxt_open() which will clear all the resources.

The proper place to do function reset now is in bnxt_init_one().
If we get AER, we'll do function reset as well.

Cc: David Miller <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 25 ++++++-------------------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  1 -
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9178bf8..8b00ef4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5613,22 +5613,7 @@ int bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 static int bnxt_open(struct net_device *dev)
 {
 	struct bnxt *bp = netdev_priv(dev);
-	int rc = 0;
 
-	if (!test_bit(BNXT_STATE_FN_RST_DONE, &bp->state)) {
-		rc = bnxt_hwrm_func_reset(bp);
-		if (rc) {
-			netdev_err(bp->dev, "hwrm chip reset failure rc: %x\n",
-				   rc);
-			rc = -EBUSY;
-			return rc;
-		}
-		/* Do func_reset during the 1st PF open only to prevent killing
-		 * the VFs when the PF is brought down and up.
-		 */
-		if (BNXT_PF(bp))
-			set_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
-	}
 	return __bnxt_open_nic(bp, true, true);
 }
 
@@ -7028,6 +7013,10 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	rc = bnxt_hwrm_func_reset(bp);
+	if (rc)
+		goto init_err;
+
 	rc = bnxt_init_int_mode(bp);
 	if (rc)
 		goto init_err;
@@ -7069,7 +7058,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
 					       pci_channel_state_t state)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
-	struct bnxt *bp = netdev_priv(netdev);
 
 	netdev_info(netdev, "PCI I/O error detected\n");
 
@@ -7084,8 +7072,6 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev,
 	if (netif_running(netdev))
 		bnxt_close(netdev);
 
-	/* So that func_reset will be done during slot_reset */
-	clear_bit(BNXT_STATE_FN_RST_DONE, &bp->state);
 	pci_disable_device(pdev);
 	rtnl_unlock();
 
@@ -7119,7 +7105,8 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev)
 	} else {
 		pci_set_master(pdev);
 
-		if (netif_running(netdev))
+		err = bnxt_hwrm_func_reset(bp);
+		if (!err && netif_running(netdev))
 			err = bnxt_open(netdev);
 
 		if (!err)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 1461355..0ee2cc4 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1021,7 +1021,6 @@ struct bnxt {
 	unsigned long		state;
 #define BNXT_STATE_OPEN		0
 #define BNXT_STATE_IN_SP_TASK	1
-#define BNXT_STATE_FN_RST_DONE	2
 
 	struct bnxt_irq	*irq_tbl;
 	int			total_irqs;
-- 
2.5.5

^ permalink raw reply related

* [PATCH 07/28] bnxt_en: Add interface to support RDMA driver.
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford; +Cc: linux-rdma, Michael Chan, David Miller, netdev, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>

Since the network driver and RDMA driver operate on the same PCI function,
we need to create an interface to allow the RDMA driver to share resources
with the network driver.

1. Create a new bnxt_en_dev struct which will be returned by
bnxt_ulp_probe() upon success.  After that, all calls from the RDMA driver
to bnxt_en will pass a pointer to this struct.

2. This struct contains additional function pointers to register, request
msix, send fw messages, register for async events.

3. If the RDMA driver wants to enable RDMA on the function, it needs to
call the function pointer bnxt_register_device().  A ulp_ops structure
is passed for upcalls from bnxt_en to the RDMA driver.

4. MSIX is requested by calling the function pointer bnxt_request_msix().

5. The RMDA driver can call firmware APIs using the bnxt_send_fw_msg()
function pointer.

6. 1 stats context is reserved when the RDMA driver registers.  MSIX
and completion rings are reserved when the RDMA driver requests for
MSIX.

7. When the RDMA driver calls bnxt_unregister_device(), all RDMA resources
will be cleaned up.

Cc: David Miller <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/Makefile   |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  34 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |   6 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 288 ++++++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h |  91 ++++++++
 5 files changed, 417 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h

diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index b233a86..6082ed1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c26735ea..19f26b8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -52,6 +52,7 @@
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
+#include "bnxt_ulp.h"
 #include "bnxt_sriov.h"
 #include "bnxt_ethtool.h"
 #include "bnxt_dcb.h"
@@ -1528,12 +1529,11 @@ static int bnxt_async_event_process(struct bnxt *bp,
 		set_bit(BNXT_RESET_TASK_SILENT_SP_EVENT, &bp->sp_event);
 		break;
 	default:
-		netdev_err(bp->dev, "unhandled ASYNC event (id 0x%x)\n",
-			   event_id);
 		goto async_event_process_exit;
 	}
 	schedule_work(&bp->sp_task);
 async_event_process_exit:
+	bnxt_ulp_async_events(bp, cmpl);
 	return 0;
 }
 
@@ -3547,7 +3547,7 @@ static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx)
 	return rc;
 }
 
-static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
 {
 	unsigned int ring = 0, grp_idx;
 	struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id];
@@ -3595,6 +3595,9 @@ static int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id)
 #endif
 	if ((bp->flags & BNXT_FLAG_STRIP_VLAN) || def_vlan)
 		req.flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE);
+	if (!vnic_id && bnxt_ulp_registered(bp->edev, BNXT_ROCE_ULP))
+		req.flags |=
+			cpu_to_le32(VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE);
 
 	return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 }
@@ -4842,6 +4845,16 @@ unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
 #endif
 }
 
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max)
+{
+	if (BNXT_PF(bp))
+		bp->pf.max_stat_ctxs = max;
+#if defined(CONFIG_BNXT_SRIOV)
+	else
+		bp->vf.max_stat_ctxs = max;
+#endif
+}
+
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
 {
 	if (BNXT_PF(bp))
@@ -4851,6 +4864,16 @@ unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
 #endif
 }
 
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max)
+{
+	if (BNXT_PF(bp))
+		bp->pf.max_cp_rings = max;
+#if defined(CONFIG_BNXT_SRIOV)
+	else
+		bp->vf.max_cp_rings = max;
+#endif
+}
+
 static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
 {
 	if (BNXT_PF(bp))
@@ -6767,6 +6790,8 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	pci_iounmap(pdev, bp->bar2);
 	pci_iounmap(pdev, bp->bar1);
 	pci_iounmap(pdev, bp->bar0);
+	kfree(bp->edev);
+	bp->edev = NULL;
 	free_netdev(dev);
 
 	pci_release_regions(pdev);
@@ -6936,6 +6961,7 @@ void bnxt_restore_pf_fw_resources(struct bnxt *bp)
 {
 	ASSERT_RTNL();
 	bnxt_hwrm_func_qcaps(bp);
+	bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP);
 }
 
 static void bnxt_parse_log_pcie_link(struct bnxt *bp)
@@ -7047,6 +7073,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto init_err;
 
+	bp->ulp_probe = bnxt_ulp_probe;
+
 	/* Get the MAX capabilities for this function */
 	rc = bnxt_hwrm_func_qcaps(bp);
 	if (rc) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index eec2415..16defe9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -972,6 +972,9 @@ struct bnxt {
 #define BNXT_SINGLE_PF(bp)	(BNXT_PF(bp) && !BNXT_NPAR(bp))
 #define BNXT_CHIP_TYPE_NITRO_A0(bp) ((bp)->flags & BNXT_FLAG_CHIP_NITRO_A0)
 
+	struct bnxt_en_dev	*edev;
+	struct bnxt_en_dev *	(*ulp_probe)(struct net_device *);
+
 	struct bnxt_napi	**bnapi;
 
 	struct bnxt_rx_ring_info	*rx_ring;
@@ -1242,9 +1245,12 @@ int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_func_rgtr_async_events(struct bnxt *bp, unsigned long *bmap,
 				     int bmap_size);
+int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id);
 int bnxt_hwrm_set_coal(struct bnxt *);
 unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+void bnxt_set_max_func_stat_ctxs(struct bnxt *bp, unsigned int max);
 unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
+void bnxt_set_max_func_cp_rings(struct bnxt *bp, unsigned int max);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
new file mode 100644
index 0000000..a018f4b
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -0,0 +1,288 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <asm/byteorder.h>
+#include <linux/bitmap.h>
+
+#include "bnxt_hsi.h"
+#include "bnxt.h"
+#include "bnxt_ulp.h"
+
+static int bnxt_register_dev(struct bnxt_en_dev *edev, int ulp_id,
+			     struct bnxt_ulp_ops *ulp_ops, void *handle)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+
+	ASSERT_RTNL();
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	if (rcu_access_pointer(ulp->ulp_ops)) {
+		netdev_err(bp->dev, "ulp id %d already registered\n", ulp_id);
+		return -EBUSY;
+	}
+	if (ulp_id == BNXT_ROCE_ULP) {
+		unsigned int max_stat_ctxs;
+
+		max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+		if (max_stat_ctxs <= BNXT_MIN_ROCE_STAT_CTXS ||
+		    bp->num_stat_ctxs == max_stat_ctxs)
+			return -ENOMEM;
+		bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs -
+					    BNXT_MIN_ROCE_STAT_CTXS);
+	}
+
+	atomic_set(&ulp->ref_count, 0);
+	ulp->handle = handle;
+	rcu_assign_pointer(ulp->ulp_ops, ulp_ops);
+
+	if (ulp_id == BNXT_ROCE_ULP) {
+		if (test_bit(BNXT_STATE_OPEN, &bp->state))
+			bnxt_hwrm_vnic_cfg(bp, 0);
+	}
+
+	return 0;
+}
+
+static int bnxt_unregister_dev(struct bnxt_en_dev *edev, int ulp_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+
+	ASSERT_RTNL();
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	if (!rcu_access_pointer(ulp->ulp_ops)) {
+		netdev_err(bp->dev, "ulp id %d not registered\n", ulp_id);
+		return -EINVAL;
+	}
+	if (ulp_id == BNXT_ROCE_ULP) {
+		unsigned int max_stat_ctxs;
+
+		max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp);
+		bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs + 1);
+	}
+	if (ulp->max_async_event_id)
+		bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0);
+
+	RCU_INIT_POINTER(ulp->ulp_ops, NULL);
+	synchronize_rcu();
+	ulp->max_async_event_id = 0;
+	ulp->async_events_bmap = NULL;
+	return 0;
+}
+
+static int bnxt_req_msix_vecs(struct bnxt_en_dev *edev, int ulp_id,
+			      struct bnxt_msix_entry *ent, int num_msix)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	int max_idx, max_cp_rings;
+	int avail_msix, i, idx;
+
+	ASSERT_RTNL();
+	if (ulp_id != BNXT_ROCE_ULP)
+		return -EINVAL;
+
+	if (!(bp->flags & BNXT_FLAG_USING_MSIX))
+		return -ENODEV;
+
+	max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+	max_idx = min_t(int, bp->total_irqs, max_cp_rings);
+	avail_msix = max_idx - bp->cp_nr_rings;
+	if (!avail_msix)
+		return -ENOMEM;
+	if (avail_msix > num_msix)
+		avail_msix = num_msix;
+
+	idx = max_idx - avail_msix;
+	for (i = 0; i < avail_msix; i++) {
+		ent[i].vector = bp->irq_tbl[idx + i].vector;
+		ent[i].ring_idx = idx + i;
+		ent[i].db_offset = (idx + i) * 0x80;
+	}
+	bnxt_set_max_func_irqs(bp, max_idx - avail_msix);
+	bnxt_set_max_func_cp_rings(bp, max_cp_rings - avail_msix);
+	edev->ulp_tbl[ulp_id].msix_requested = avail_msix;
+	return avail_msix;
+}
+
+static int bnxt_free_msix_vecs(struct bnxt_en_dev *edev, int ulp_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	int max_cp_rings, msix_requested;
+
+	ASSERT_RTNL();
+	if (ulp_id != BNXT_ROCE_ULP)
+		return -EINVAL;
+
+	max_cp_rings = bnxt_get_max_func_cp_rings(bp);
+	msix_requested = edev->ulp_tbl[ulp_id].msix_requested;
+	bnxt_set_max_func_cp_rings(bp, max_cp_rings + msix_requested);
+	edev->ulp_tbl[ulp_id].msix_requested = 0;
+	bnxt_set_max_func_irqs(bp, bp->total_irqs);
+	return 0;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id)
+{
+	ASSERT_RTNL();
+	if (bnxt_ulp_registered(bp->edev, ulp_id)) {
+		struct bnxt_en_dev *edev = bp->edev;
+		unsigned int msix_req, max;
+
+		msix_req = edev->ulp_tbl[ulp_id].msix_requested;
+		max = bnxt_get_max_func_cp_rings(bp);
+		bnxt_set_max_func_cp_rings(bp, max - msix_req);
+		max = bnxt_get_max_func_stat_ctxs(bp);
+		bnxt_set_max_func_stat_ctxs(bp, max - 1);
+	}
+}
+
+static int bnxt_send_msg(struct bnxt_en_dev *edev, int ulp_id,
+			 struct bnxt_fw_msg *fw_msg)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct input *req;
+	int rc;
+
+	mutex_lock(&bp->hwrm_cmd_lock);
+	req = fw_msg->msg;
+	req->resp_addr = cpu_to_le64(bp->hwrm_cmd_resp_dma_addr);
+	rc = _hwrm_send_message(bp, fw_msg->msg, fw_msg->msg_len,
+				fw_msg->timeout);
+	if (!rc) {
+		struct output *resp = bp->hwrm_cmd_resp_addr;
+		u32 len = le16_to_cpu(resp->resp_len);
+
+		if (fw_msg->resp_max_len < len)
+			len = fw_msg->resp_max_len;
+
+		memcpy(fw_msg->resp, resp, len);
+	}
+	mutex_unlock(&bp->hwrm_cmd_lock);
+	return rc;
+}
+
+void bnxt_ulp_stop(struct bnxt *bp)
+{
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	rcu_read_lock();
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		ops = rcu_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_stop)
+			continue;
+	}
+	rcu_read_unlock();
+}
+
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl)
+{
+	u16 event_id = le16_to_cpu(cmpl->event_id);
+	struct bnxt_en_dev *edev = bp->edev;
+	struct bnxt_ulp_ops *ops;
+	int i;
+
+	if (!edev)
+		return;
+
+	rcu_read_lock();
+	for (i = 0; i < BNXT_MAX_ULP; i++) {
+		struct bnxt_ulp *ulp = &edev->ulp_tbl[i];
+
+		ops = rcu_dereference(ulp->ulp_ops);
+		if (!ops || !ops->ulp_async_notifier)
+			continue;
+		if (!ulp->async_events_bmap ||
+		    event_id > ulp->max_async_event_id)
+			continue;
+
+		/* Read max_async_event_id first before testing the bitmap. */
+		smp_rmb();
+		if (test_bit(event_id, ulp->async_events_bmap))
+			ops->ulp_async_notifier(ulp->handle, cmpl);
+	}
+	rcu_read_unlock();
+}
+
+static int bnxt_register_async_events(struct bnxt_en_dev *edev, int ulp_id,
+				      unsigned long *events_bmap, u16 max_id)
+{
+	struct net_device *dev = edev->net;
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_ulp *ulp;
+
+	if (ulp_id >= BNXT_MAX_ULP)
+		return -EINVAL;
+
+	ulp = &edev->ulp_tbl[ulp_id];
+	ulp->async_events_bmap = events_bmap;
+	/* Make sure bnxt_ulp_async_events() sees this order */
+	smp_wmb();
+	ulp->max_async_event_id = max_id;
+	bnxt_hwrm_func_rgtr_async_events(bp, events_bmap, max_id + 1);
+	return 0;
+}
+
+static const struct bnxt_en_ops bnxt_en_ops_tbl = {
+	.bnxt_register_device	= bnxt_register_dev,
+	.bnxt_unregister_device	= bnxt_unregister_dev,
+	.bnxt_request_msix	= bnxt_req_msix_vecs,
+	.bnxt_free_msix		= bnxt_free_msix_vecs,
+	.bnxt_send_fw_msg	= bnxt_send_msg,
+	.bnxt_register_fw_async_events	= bnxt_register_async_events,
+};
+
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev)
+{
+	struct bnxt *bp = netdev_priv(dev);
+	struct bnxt_en_dev *edev;
+
+	edev = bp->edev;
+	if (!edev) {
+		edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+		if (!edev)
+			return ERR_PTR(-ENOMEM);
+		edev->en_ops = &bnxt_en_ops_tbl;
+		if (bp->flags & BNXT_FLAG_ROCEV1_CAP)
+			edev->flags |= BNXT_EN_FLAG_ROCEV1_CAP;
+		if (bp->flags & BNXT_FLAG_ROCEV2_CAP)
+			edev->flags |= BNXT_EN_FLAG_ROCEV2_CAP;
+		edev->net = dev;
+		edev->pdev = bp->pdev;
+		bp->edev = edev;
+	}
+	return bp->edev;
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
new file mode 100644
index 0000000..1ebccd8
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h
@@ -0,0 +1,91 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2016 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef BNXT_ULP_H
+#define BNXT_ULP_H
+
+#define BNXT_ROCE_ULP	0
+#define BNXT_OTHER_ULP	1
+#define BNXT_MAX_ULP	2
+
+#define BNXT_MIN_ROCE_CP_RINGS	2
+#define BNXT_MIN_ROCE_STAT_CTXS	1
+
+struct hwrm_async_event_cmpl;
+struct bnxt;
+
+struct bnxt_ulp_ops {
+	/* async_notifier() cannot sleep (in BH context) */
+	void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *);
+	void (*ulp_stop)(void *);
+	void (*ulp_start)(void *);
+	void (*ulp_sriov_config)(void *, int);
+};
+
+struct bnxt_msix_entry {
+	u32	vector;
+	u32	ring_idx;
+	u32	db_offset;
+};
+
+struct bnxt_fw_msg {
+	void	*msg;
+	int	msg_len;
+	void	*resp;
+	int	resp_max_len;
+	int	timeout;
+};
+
+struct bnxt_ulp {
+	void		*handle;
+	struct bnxt_ulp_ops __rcu *ulp_ops;
+	unsigned long	*async_events_bmap;
+	u16		max_async_event_id;
+	u16		msix_requested;
+	atomic_t	ref_count;
+};
+
+struct bnxt_en_dev {
+	struct net_device *net;
+	struct pci_dev *pdev;
+	u32 flags;
+	#define BNXT_EN_FLAG_ROCEV1_CAP		0x1
+	#define BNXT_EN_FLAG_ROCEV2_CAP		0x2
+	#define BNXT_EN_FLAG_ROCE_CAP		(BNXT_EN_FLAG_ROCEV1_CAP | \
+						 BNXT_EN_FLAG_ROCEV2_CAP)
+	const struct bnxt_en_ops	*en_ops;
+	struct bnxt_ulp			ulp_tbl[BNXT_MAX_ULP];
+};
+
+struct bnxt_en_ops {
+	int (*bnxt_register_device)(struct bnxt_en_dev *, int,
+				    struct bnxt_ulp_ops *, void *);
+	int (*bnxt_unregister_device)(struct bnxt_en_dev *, int);
+	int (*bnxt_request_msix)(struct bnxt_en_dev *, int,
+				 struct bnxt_msix_entry *, int);
+	int (*bnxt_free_msix)(struct bnxt_en_dev *, int);
+	int (*bnxt_send_fw_msg)(struct bnxt_en_dev *, int,
+				struct bnxt_fw_msg *);
+	int (*bnxt_register_fw_async_events)(struct bnxt_en_dev *, int,
+					     unsigned long *, u16);
+};
+
+static inline bool bnxt_ulp_registered(struct bnxt_en_dev *edev, int ulp_id)
+{
+	if (edev && rcu_access_pointer(edev->ulp_tbl[ulp_id].ulp_ops))
+		return true;
+	return false;
+}
+
+void bnxt_subtract_ulp_resources(struct bnxt *bp, int ulp_id);
+void bnxt_ulp_stop(struct bnxt *bp);
+void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl);
+struct bnxt_en_dev *bnxt_ulp_probe(struct net_device *dev);
+
+#endif
-- 
2.5.5

^ permalink raw reply related

* [PATCH 04/28] bnxt_en: Improve completion ring allocation for VFs.
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford; +Cc: linux-rdma, Michael Chan, David Miller, netdev, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>

All available remaining completion rings not used by the PF should be
made available for the VFs so that there are enough rings in the VF to
support RDMA.  The earlier workaround code of capping the rings by the
statistics context is removed.

When SRIOV is disabled, call a new function bnxt_restore_pf_fw_resources()
to restore FW resources.  Later on we need to add some logic to account
for RDMA resources.

Cc: David Miller <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  8 +++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h       |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 14 ++++----------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 8b00ef4..1f6be83 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4152,7 +4152,7 @@ static int bnxt_hwrm_func_qcfg(struct bnxt *bp)
 	return rc;
 }
 
-int bnxt_hwrm_func_qcaps(struct bnxt *bp)
+static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
 {
 	int rc = 0;
 	struct hwrm_func_qcaps_input req = {0};
@@ -6856,6 +6856,12 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
 	return rc;
 }
 
+void bnxt_restore_pf_fw_resources(struct bnxt *bp)
+{
+	ASSERT_RTNL();
+	bnxt_hwrm_func_qcaps(bp);
+}
+
 static void bnxt_parse_log_pcie_link(struct bnxt *bp)
 {
 	enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 0ee2cc4..43a4b17 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1234,7 +1234,6 @@ int _hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
-int bnxt_hwrm_func_qcaps(struct bnxt *);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
@@ -1245,4 +1244,5 @@ int bnxt_open_nic(struct bnxt *, bool, bool);
 int bnxt_close_nic(struct bnxt *, bool, bool);
 int bnxt_setup_mq_tc(struct net_device *dev, u8 tc);
 int bnxt_get_max_rings(struct bnxt *, int *, int *, bool);
+void bnxt_restore_pf_fw_resources(struct bnxt *bp);
 #endif
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index bff626a..c696025 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -420,15 +420,7 @@ static int bnxt_hwrm_func_cfg(struct bnxt *bp, int num_vfs)
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1);
 
 	/* Remaining rings are distributed equally amongs VF's for now */
-	/* TODO: the following workaroud is needed to restrict total number
-	 * of vf_cp_rings not exceed number of HW ring groups. This WA should
-	 * be removed once new HWRM provides HW ring groups capability in
-	 * hwrm_func_qcap.
-	 */
-	vf_cp_rings = min_t(u16, pf->max_cp_rings, pf->max_stat_ctxs);
-	vf_cp_rings = (vf_cp_rings - bp->cp_nr_rings) / num_vfs;
-	/* TODO: restore this logic below once the WA above is removed */
-	/* vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs; */
+	vf_cp_rings = (pf->max_cp_rings - bp->cp_nr_rings) / num_vfs;
 	vf_stat_ctx = (pf->max_stat_ctxs - bp->num_stat_ctxs) / num_vfs;
 	if (bp->flags & BNXT_FLAG_AGG_RINGS)
 		vf_rx_rings = (pf->max_rx_rings - bp->rx_nr_rings * 2) /
@@ -590,7 +582,9 @@ void bnxt_sriov_disable(struct bnxt *bp)
 
 	bp->pf.active_vfs = 0;
 	/* Reclaim all resources for the PF. */
-	bnxt_hwrm_func_qcaps(bp);
+	rtnl_lock();
+	bnxt_restore_pf_fw_resources(bp);
+	rtnl_unlock();
 }
 
 int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs)
-- 
2.5.5

^ permalink raw reply related

* [PATCH 05/28] bnxt_en: Reserve RDMA resources by default.
From: Selvin Xavier @ 2016-12-05  6:38 UTC (permalink / raw)
  To: dledford; +Cc: linux-rdma, Michael Chan, David Miller, netdev, Selvin Xavier
In-Reply-To: <1480919912-1079-1-git-send-email-selvin.xavier@broadcom.com>

From: Michael Chan <michael.chan@broadcom.com>

If the device supports RDMA, we'll setup network default rings so that
there are enough minimum resources for RDMA, if possible.  However, the
user can still increase network rings to the max if he wants.  The actual
RDMA resources won't be reserved until the RDMA driver registers.

Cc: David Miller <davem@davemloft.net>
Cc: <netdev@vger.kernel.org>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 58 ++++++++++++++++++++++++++++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  9 +++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1f6be83..7218d65 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4166,6 +4166,11 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp)
 	if (rc)
 		goto hwrm_func_qcaps_exit;
 
+	if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED))
+		bp->flags |= BNXT_FLAG_ROCEV1_CAP;
+	if (resp->flags & cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED))
+		bp->flags |= BNXT_FLAG_ROCEV2_CAP;
+
 	bp->tx_push_thresh = 0;
 	if (resp->flags &
 	    cpu_to_le32(FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED))
@@ -4808,6 +4813,24 @@ static int bnxt_setup_int_mode(struct bnxt *bp)
 	return rc;
 }
 
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp)
+{
+	if (BNXT_PF(bp))
+		return bp->pf.max_stat_ctxs;
+#if defined(CONFIG_BNXT_SRIOV)
+	return bp->vf.max_stat_ctxs;
+#endif
+}
+
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp)
+{
+	if (BNXT_PF(bp))
+		return bp->pf.max_cp_rings;
+#if defined(CONFIG_BNXT_SRIOV)
+	return bp->vf.max_cp_rings;
+#endif
+}
+
 static unsigned int bnxt_get_max_func_irqs(struct bnxt *bp)
 {
 	if (BNXT_PF(bp))
@@ -6832,6 +6855,39 @@ int bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, bool shared)
 	return bnxt_trim_rings(bp, max_rx, max_tx, cp, shared);
 }
 
+static int bnxt_get_dflt_rings(struct bnxt *bp, int *max_rx, int *max_tx,
+			       bool shared)
+{
+	int rc;
+
+	rc = bnxt_get_max_rings(bp, max_rx, max_tx, shared);
+	if (rc)
+		return rc;
+
+	if (bp->flags & BNXT_FLAG_ROCE_CAP) {
+		int max_cp, max_stat, max_irq;
+
+		/* Reserve minimum resources for RoCE */
+		max_cp = bnxt_get_max_func_cp_rings(bp);
+		max_stat = bnxt_get_max_func_stat_ctxs(bp);
+		max_irq = bnxt_get_max_func_irqs(bp);
+		if (max_cp <= BNXT_MIN_ROCE_CP_RINGS ||
+		    max_irq <= BNXT_MIN_ROCE_CP_RINGS ||
+		    max_stat <= BNXT_MIN_ROCE_STAT_CTXS)
+			return 0;
+
+		max_cp -= BNXT_MIN_ROCE_CP_RINGS;
+		max_irq -= BNXT_MIN_ROCE_CP_RINGS;
+		max_stat -= BNXT_MIN_ROCE_STAT_CTXS;
+		max_cp = min_t(int, max_cp, max_irq);
+		max_cp = min_t(int, max_cp, max_stat);
+		rc = bnxt_trim_rings(bp, max_rx, max_tx, max_cp, shared);
+		if (rc)
+			rc = 0;
+	}
+	return rc;
+}
+
 static int bnxt_set_dflt_rings(struct bnxt *bp)
 {
 	int dflt_rings, max_rx_rings, max_tx_rings, rc;
@@ -6840,7 +6896,7 @@ static int bnxt_set_dflt_rings(struct bnxt *bp)
 	if (sh)
 		bp->flags |= BNXT_FLAG_SHARED_RINGS;
 	dflt_rings = netif_get_num_default_rss_queues();
-	rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh);
+	rc = bnxt_get_dflt_rings(bp, &max_rx_rings, &max_tx_rings, sh);
 	if (rc)
 		return rc;
 	bp->rx_nr_rings = min_t(int, dflt_rings, max_rx_rings);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 43a4b17..d796836 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -387,6 +387,9 @@ struct rx_tpa_end_cmp_ext {
 #define DB_KEY_TX_PUSH						(0x4 << 28)
 #define DB_LONG_TX_PUSH						(0x2 << 24)
 
+#define BNXT_MIN_ROCE_CP_RINGS	2
+#define BNXT_MIN_ROCE_STAT_CTXS	1
+
 #define INVALID_HW_RING_ID	((u16)-1)
 
 /* The hardware supports certain page sizes.  Use the supported page sizes
@@ -953,6 +956,10 @@ struct bnxt {
 	#define BNXT_FLAG_PORT_STATS	0x400
 	#define BNXT_FLAG_UDP_RSS_CAP	0x800
 	#define BNXT_FLAG_EEE_CAP	0x1000
+	#define BNXT_FLAG_ROCEV1_CAP	0x8000
+	#define BNXT_FLAG_ROCEV2_CAP	0x10000
+	#define BNXT_FLAG_ROCE_CAP	(BNXT_FLAG_ROCEV1_CAP |	\
+					 BNXT_FLAG_ROCEV2_CAP)
 	#define BNXT_FLAG_CHIP_NITRO_A0	0x1000000
 
 	#define BNXT_FLAG_ALL_CONFIG_FEATS (BNXT_FLAG_TPA |		\
@@ -1234,6 +1241,8 @@ int _hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message(struct bnxt *, void *, u32, int);
 int hwrm_send_message_silent(struct bnxt *, void *, u32, int);
 int bnxt_hwrm_set_coal(struct bnxt *);
+unsigned int bnxt_get_max_func_stat_ctxs(struct bnxt *bp);
+unsigned int bnxt_get_max_func_cp_rings(struct bnxt *bp);
 void bnxt_set_max_func_irqs(struct bnxt *bp, unsigned int max);
 void bnxt_tx_disable(struct bnxt *bp);
 void bnxt_tx_enable(struct bnxt *bp);
-- 
2.5.5

^ permalink raw reply related

* [net-next][PATCH 00/18] net: RDS updates
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel

Series consist of:
 - RDMA transport fixes for map failure, listen sequence, handler panic and
   composite message notification.
 - Couple of sparse fixes.
 - Message logging improvements for bind failure, use once mr semantics
   and connection remote address, active end point.
 - Performance improvement for RDMA transport by reducing the post send
   pressure on the queue and spreading the CQ vectors.
 - Useful statistics for socket send/recv usage and receive cache usage.
   rds-tools already equipped to parse this info.
 - Additional RDS CMSG used by application to track the RDS message
   stages for certain type of traffic to find out latency spots.
   Can be enabled/disabled per socket.
    
Series generated against 'net-next'. Patchset is also available on
below git tree.

The following changes since commit adc176c5472214971d77c1a61c83db9b01e9cdc7:

  ipv6 addrconf: Implemented enhanced DAD (RFC7527) (2016-12-03 23:21:37 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux.git for_4.10/net-next/rds

for you to fetch changes up to 35b76a2744f63f936a8b2e5c306d47618883144b:

  RDS: IB: add missing connection cache usage info (2016-12-04 17:06:30 -0800)

----------------------------------------------------------------

Avinash Repaka (1):
  RDS: make message size limit compliant with spec

Qing Huang (1):
  RDS: RDMA: start rdma listening after init

Santosh Shilimkar (15):
  RDS: log the address on bind failure
  RDS: mark few internal functions static to make sparse build happy
  RDS: IB: include faddr in connection log
  RDS: IB: make the transport retry count smallest
  RDS: RDMA: fix the ib_map_mr_sg_zbva() argument
  RDS: RDMA: return appropriate error on rdma map failures
  RDS: IB: split the mr registration and invalidation path
  RDS: RDMA: silence the use_once mr log flood
  RDS: IB: track and log active side endpoint in connection
  RDS: IB: add few useful cache stasts
  RDS: IB: Add vector spreading for cqs
  RDS: RDMA: Fix the composite message user notification
  RDS: IB: fix panic due to handlers running post teardown
  RDS: add receive message trace used by application
  RDS: IB: add missing connection cache usage info

Venkat Venkatsubra (1):
  RDS: add stat for socket recv memory usage

 include/uapi/linux/rds.h | 34 ++++++++++++++++++
 net/rds/af_rds.c         | 28 +++++++++++++++
 net/rds/bind.c           |  4 +--
 net/rds/connection.c     |  2 +-
 net/rds/ib.c             | 12 +++++++
 net/rds/ib.h             | 22 ++++++++++--
 net/rds/ib_cm.c          | 89 ++++++++++++++++++++++++++++++++++++++----------
 net/rds/ib_frmr.c        | 16 +++++----
 net/rds/ib_recv.c        | 14 ++++++--
 net/rds/ib_send.c        | 29 +++++++++-------
 net/rds/ib_stats.c       |  2 ++
 net/rds/rdma.c           | 22 ++++++++++--
 net/rds/rdma_transport.c | 11 ++----
 net/rds/rds.h            | 17 +++++++++
 net/rds/recv.c           | 36 ++++++++++++++++++--
 net/rds/send.c           | 50 ++++++++++++++++++++++++---
 net/rds/tcp_listen.c     |  1 +
 net/rds/tcp_recv.c       |  5 +++
 18 files changed, 333 insertions(+), 61 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [net-next][PATCH 01/18] RDS: log the address on bind failure
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

It's useful to know the IP address when RDS fails to bind a
connection. Thus, adding it to the error message.

Orabug: 21894138
Reviewed-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/bind.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 095f6ce..3a915be 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
-		printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
-				"load rds_tcp or rds_rdma?\n");
+		pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
+				    __func__, &sin->sin_addr.s_addr);
 		goto out;
 	}
 
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 02/18] RDS: mark few internal functions static to make sparse build happy
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Fixes below warnings:
warning: symbol 'rds_send_probe' was not declared. Should it be static?
warning: symbol 'rds_send_ping' was not declared. Should it be static?
warning: symbol 'rds_tcp_accept_one_path' was not declared. Should it be static?
warning: symbol 'rds_walk_conn_path_info' was not declared. Should it be static?

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/connection.c | 2 +-
 net/rds/send.c       | 4 ++--
 net/rds/tcp_listen.c | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index fe9d31c..26533b2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -545,7 +545,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
-void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
+static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 			     struct rds_info_iterator *iter,
 			     struct rds_info_lengths *lens,
 			     int (*visitor)(struct rds_conn_path *, void *),
diff --git a/net/rds/send.c b/net/rds/send.c
index 77c8c6e..bb13c56 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1169,7 +1169,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
  * or
  *   RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
  */
-int
+static int
 rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 	       __be16 dport, u8 h_flags)
 {
@@ -1238,7 +1238,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	return rds_send_probe(cp, 0, dport, 0);
 }
 
-void
+static void
 rds_send_ping(struct rds_connection *conn)
 {
 	unsigned long flags;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index f74bab3..67d0929 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -79,6 +79,7 @@ int rds_tcp_keepalive(struct socket *sock)
  * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
  * by moving them to CONNECTING in this function.
  */
+static
 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
 	int i;
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 03/18] RDS: IB: include faddr in connection log
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Also use pr_* for it.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_cm.c   | 19 +++++++++----------
 net/rds/ib_recv.c |  4 ++--
 net/rds/ib_send.c |  4 ++--
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 5b2ab95..b9da1e5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -113,19 +113,18 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	}
 
 	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
-		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
-		       " no longer supported\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version));
+		pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version));
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-		       &conn->c_faddr,
-		       RDS_PROTOCOL_MAJOR(conn->c_version),
-		       RDS_PROTOCOL_MINOR(conn->c_version),
-		       ic->i_flowctl ? ", flow control" : "");
+		pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+			  &conn->c_laddr, &conn->c_faddr,
+			  RDS_PROTOCOL_MAJOR(conn->c_version),
+			  RDS_PROTOCOL_MINOR(conn->c_version),
+			  ic->i_flowctl ? ", flow control" : "");
 	}
 
 	/*
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 606a11f..6803b75 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -980,8 +980,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 	} else {
 		/* We expect errors as the qp is drained during shutdown */
 		if (rds_conn_up(conn) || rds_conn_connecting(conn))
-			rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-					  &conn->c_faddr,
+			rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+					  &conn->c_laddr, &conn->c_faddr,
 					  wc->status,
 					  ib_wc_status_msg(wc->status));
 	}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 84d90c9..19eca5c 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -300,8 +300,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
 	/* We expect errors as the qp is drained during shutdown */
 	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-		rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
-				  &conn->c_faddr, wc->status,
+		rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+				  &conn->c_laddr, &conn->c_faddr, wc->status,
 				  ib_wc_status_msg(wc->status));
 	}
 }
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 04/18] RDS: IB: make the transport retry count smallest
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Transport retry is not much useful since it indicate packet loss
in fabric so its better to failover fast rather than longer retry.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 45ac8e8..f4e8121 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -16,7 +16,7 @@
 #define RDS_IB_DEFAULT_SEND_WR		256
 #define RDS_IB_DEFAULT_FR_WR		512
 
-#define RDS_IB_DEFAULT_RETRY_COUNT	2
+#define RDS_IB_DEFAULT_RETRY_COUNT	1
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 05/18] RDS: RDMA: fix the ib_map_mr_sg_zbva() argument
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Fixes warning: Using plain integer as NULL pointer

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_frmr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d921adc..66b3d62 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
 	struct ib_send_wr *failed_wr;
 	struct ib_reg_wr reg_wr;
-	int ret;
+	int ret, off = 0;
 
 	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		cpu_relax();
 	}
 
-	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE);
+	ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+				&off, PAGE_SIZE);
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;
 
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 06/18] RDS: RDMA: start rdma listening after init
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

From: Qing Huang <qing.huang@oracle.com>

This prevents RDS from handling incoming rdma packets before RDS
completes initializing its recv/send components.

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma_transport.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 345f090..250c6b8 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -203,18 +203,13 @@ static int rds_rdma_init(void)
 {
 	int ret;
 
-	ret = rds_rdma_listen_init();
+	ret = rds_ib_init();
 	if (ret)
 		goto out;
 
-	ret = rds_ib_init();
+	ret = rds_rdma_listen_init();
 	if (ret)
-		goto err_ib_init;
-
-	goto out;
-
-err_ib_init:
-	rds_rdma_listen_stop();
+		rds_ib_exit();
 out:
 	return ret;
 }
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 08/18] RDS: IB: split the mr registration and invalidation path
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

MR invalidation in RDS is done in background thread and not in
data path like registration. So break the dependency between them
which helps to remove the performance bottleneck.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h      |  4 +++-
 net/rds/ib_cm.c   |  9 +++++++--
 net/rds/ib_frmr.c | 11 ++++++-----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f4e8121..f14c26d 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -14,7 +14,8 @@
 
 #define RDS_IB_DEFAULT_RECV_WR		1024
 #define RDS_IB_DEFAULT_SEND_WR		256
-#define RDS_IB_DEFAULT_FR_WR		512
+#define RDS_IB_DEFAULT_FR_WR		256
+#define RDS_IB_DEFAULT_FR_INV_WR	256
 
 #define RDS_IB_DEFAULT_RETRY_COUNT	1
 
@@ -125,6 +126,7 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t		i_fastreg_wrs;
+	atomic_t		i_fastunreg_wrs;
 
 	/* interrupt handling */
 	struct tasklet_struct	i_send_tasklet;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index b9da1e5..3002acf 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -382,7 +382,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	 * completion queue and send queue. This extra space is used for FRMR
 	 * registration and invalidation work requests
 	 */
-	fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+	fr_queue_space = rds_ibdev->use_fastreg ?
+			 (RDS_IB_DEFAULT_FR_WR + 1) +
+			 (RDS_IB_DEFAULT_FR_INV_WR + 1)
+			 : 0;
 
 	/* add the conn now so that connection establishment has the dev */
 	rds_ib_add_conn(rds_ibdev, conn);
@@ -444,6 +447,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
+	atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
@@ -766,7 +770,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
-			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
+			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) &&
+			   (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
 
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 66b3d62..48332a6 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -241,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (frmr->fr_state != FRMR_IS_INUSE)
 		goto out;
 
-	while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+	while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) {
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		cpu_relax();
 	}
 
@@ -261,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 	if (unlikely(ret)) {
 		frmr->fr_state = FRMR_IS_STALE;
 		frmr->fr_inv = false;
-		atomic_inc(&ibmr->ic->i_fastreg_wrs);
+		atomic_inc(&ibmr->ic->i_fastunreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
@@ -289,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		atomic_inc(&ic->i_fastreg_wrs);
+	} else {
+		atomic_inc(&ic->i_fastunreg_wrs);
 	}
-
-	atomic_inc(&ic->i_fastreg_wrs);
 }
 
 void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 09/18] RDS: RDMA: silence the use_once mr log flood
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

In absence of extension headers, message log will keep
flooding the console. As such even without use_once we can
clean up the MRs so its not really an error case message
so make it debug message

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 4c93bad..8151c49 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -415,7 +415,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 	if (!mr) {
-		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
+			 r_key);
 		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 		return;
 	}
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 10/18] RDS: IB: track and log active side endpoint in connection
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Useful to know the active and passive end points in a
RDS IB connection.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h    |  3 +++
 net/rds/ib_cm.c | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index f14c26d..97e7696 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -181,6 +181,9 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
+
+	/* Endpoint role in connection */
+	int			i_active_side;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 3002acf..4d1bf04 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -120,16 +120,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n",
+		pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+			  ic->i_active_side ? "Active" : "Passive",
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version),
 			  ic->i_flowctl ? ", flow control" : "");
 	}
 
-	/*
-	 * Init rings and fill recv. this needs to wait until protocol negotiation
-	 * is complete, since ring layout is different from 3.0 to 3.1.
+	/* Init rings and fill recv. this needs to wait until protocol
+	 * negotiation is complete, since ring layout is different
+	 * from 3.1 to 4.1.
 	 */
 	rds_ib_send_init_ring(ic);
 	rds_ib_recv_init_ring(ic);
@@ -685,6 +686,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 		if (ic->i_cm_id == cm_id)
 			ret = 0;
 	}
+	ic->i_active_side = true;
 	return ret;
 }
 
@@ -859,6 +861,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 	ic->i_sends = NULL;
 	vfree(ic->i_recvs);
 	ic->i_recvs = NULL;
+	ic->i_active_side = false;
 }
 
 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 11/18] RDS: IB: add few useful cache stasts
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

Tracks the ib receive cache total, incoming and frag allocations.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib.h       | 7 +++++++
 net/rds/ib_recv.c  | 6 ++++++
 net/rds/ib_stats.c | 2 ++
 3 files changed, 15 insertions(+)

diff --git a/net/rds/ib.h b/net/rds/ib.h
index 97e7696..4987387 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -151,6 +151,7 @@ struct rds_ib_connection {
 	u64			i_ack_recv;	/* last ACK received */
 	struct rds_ib_refill_cache i_cache_incs;
 	struct rds_ib_refill_cache i_cache_frags;
+	atomic_t		i_cache_allocs;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -254,6 +255,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rx_refill_from_cq;
 	uint64_t	s_ib_rx_refill_from_thread;
 	uint64_t	s_ib_rx_alloc_limit;
+	uint64_t	s_ib_rx_total_frags;
+	uint64_t	s_ib_rx_total_incs;
 	uint64_t	s_ib_rx_credit_updates;
 	uint64_t	s_ib_ack_sent;
 	uint64_t	s_ib_ack_send_failure;
@@ -276,6 +279,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_1m_reused;
 	uint64_t	s_ib_atomic_cswp;
 	uint64_t	s_ib_atomic_fadd;
+	uint64_t	s_ib_recv_added_to_cache;
+	uint64_t	s_ib_recv_removed_from_cache;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -406,6 +411,8 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
 #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+#define rds_ib_stats_add(member, count) \
+		rds_stats_add_which(rds_ib_stats, member, count)
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 6803b75..4b0f126 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic,
 	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
 
 	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+	atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+	rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 }
 
 /* Recycle inc after freeing attached frags */
@@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 			atomic_dec(&rds_ib_allocation);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_incs);
 	}
 	INIT_LIST_HEAD(&ibinc->ii_frags);
 	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
@@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
 	if (cache_item) {
 		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
+		rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
 	} else {
 		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
 		if (!frag)
@@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 			kmem_cache_free(rds_ib_frag_slab, frag);
 			return NULL;
 		}
+		rds_ib_stats_inc(s_ib_rx_total_frags);
 	}
 
 	INIT_LIST_HEAD(&frag->f_item);
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 7e78dca..9252ad1 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -55,6 +55,8 @@
 	"ib_rx_refill_from_cq",
 	"ib_rx_refill_from_thread",
 	"ib_rx_alloc_limit",
+	"ib_rx_total_frags",
+	"ib_rx_total_incs",
 	"ib_rx_credit_updates",
 	"ib_ack_sent",
 	"ib_ack_send_failure",
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 13/18] RDS: RDMA: Fix the composite message user notification
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

When application sends an RDS RDMA composite message consist of
RDMA transfer to be followed up by non RDMA payload, it expect to
be notified *only* when the full message gets delivered. RDS RDMA
notification doesn't behave this way though.

Thanks to Venkat for debug and root casuing the issue
where only first part of the message(RDMA) was
successfully delivered but remainder payload delivery failed.
In that case, application should not be notified with
a false positive of message delivery success.

Fix this case by making sure the user gets notified only after
the full message delivery.

Reviewed-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_send.c | 25 +++++++++++++++----------
 net/rds/rdma.c    | 10 ++++++++++
 net/rds/rds.h     |  1 +
 net/rds/send.c    |  4 +++-
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 19eca5c..5e72de1 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm,
 	complete(rm, notify_status);
 }
 
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
-				   struct rm_data_op *op,
-				   int wc_status)
-{
-	if (op->op_nents)
-		ib_dma_unmap_sg(ic->i_cm_id->device,
-				op->op_sg, op->op_nents,
-				DMA_TO_DEVICE);
-}
-
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
 				   struct rm_rdma_op *op,
 				   int wc_status)
@@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
 		rds_ib_stats_inc(s_ib_atomic_fadd);
 }
 
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, data);
+
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+
+	if (rm->rdma.op_active && rm->data.op_notify)
+		rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status);
+}
+
 /*
  * Unmap the resources associated with a struct send_work.
  *
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 8151c49..dd508e0 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -627,6 +627,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		}
 		op->op_notifier->n_user_token = args->user_token;
 		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+
+		/* Enable rmda notification on data operation for composite
+		 * rds messages and make sure notification is enabled only
+		 * for the data operation which follows it so that application
+		 * gets notified only after full message gets delivered.
+		 */
+		if (rm->data.op_sg) {
+			rm->rdma.op_notify = 0;
+			rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+		}
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
diff --git a/net/rds/rds.h b/net/rds/rds.h
index ebbf909..0bb8213 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -419,6 +419,7 @@ struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
+			unsigned int		op_notify:1;
 			unsigned int		op_nents;
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
diff --git a/net/rds/send.c b/net/rds/send.c
index 0a6f38b..45e025b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 	struct rm_rdma_op *ro;
 	struct rds_notifier *notifier;
 	unsigned long flags;
+	unsigned int notify = 0;
 
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
+	notify =  rm->rdma.op_notify | rm->data.op_notify;
 	ro = &rm->rdma;
 	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro->op_active && ro->op_notify && ro->op_notifier) {
+	    ro->op_active && notify && ro->op_notifier) {
 		notifier = ro->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 15/18] RDS: add stat for socket recv memory usage
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

From: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>

Tracks the receive side memory added to scokets and removed from sockets.

Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rds.h  | 3 +++
 net/rds/recv.c | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0bb8213..8ccd5a9 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -631,6 +631,9 @@ struct rds_statistics {
 	uint64_t	s_cong_update_received;
 	uint64_t	s_cong_send_error;
 	uint64_t	s_cong_send_blocked;
+	uint64_t	s_recv_bytes_added_to_socket;
+	uint64_t	s_recv_bytes_removed_from_socket;
+
 };
 
 /* af_rds.c */
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 9d0666e..ba19eee 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -94,6 +94,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 		return;
 
 	rs->rs_rcv_bytes += delta;
+	if (delta > 0)
+		rds_stats_add(s_recv_bytes_added_to_socket, delta);
+	else
+		rds_stats_add(s_recv_bytes_removed_from_socket, -delta);
 	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
 	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
-- 
1.9.1

^ permalink raw reply related

* [net-next][PATCH 16/18] RDS: make message size limit compliant with spec
From: Santosh Shilimkar @ 2016-12-05  6:57 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1480921073-9140-1-git-send-email-santosh.shilimkar@oracle.com>

From: Avinash Repaka <avinash.repaka@oracle.com>

RDS support max message size as 1M but the code doesn't check this
in all cases. Patch fixes it for RDMA & non-RDMA and RDS MR size
and its enforced irrespective of underlying transport.

Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/rdma.c |  9 ++++++++-
 net/rds/rds.h  |  3 +++
 net/rds/send.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index dd508e0..6e8db33 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -40,7 +40,6 @@
 /*
  * XXX
  *  - build with sparse
- *  - should we limit the size of a mr region?  let transport return failure?
  *  - should we detect duplicate keys on a socket?  hmm.
  *  - an rdma is an mlock, apply rlimit?
  */
@@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
+	/* Restrict the size of mr irrespective of underlying transport
+	 * To account for unaligned mr regions, subtract one from nr_pages
+	 */
+	if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
 		args->vec.addr, args->vec.bytes, nr_pages);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8ccd5a9..f713194 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...)
 #define RDS_FRAG_SHIFT	12
 #define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
 
+/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
+#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))
+
 #define RDS_CONG_MAP_BYTES	(65536 / 8)
 #define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
 #define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
diff --git a/net/rds/send.c b/net/rds/send.c
index 45e025b..5cc6403 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -994,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
 	return hash;
 }
 
+static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
+{
+	struct rds_rdma_args *args;
+	struct cmsghdr *cmsg;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
+			args = CMSG_DATA(cmsg);
+			*rdma_bytes += args->remote_vec.bytes;
+		}
+	}
+	return 0;
+}
+
 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
 	struct sock *sk = sock->sk;
@@ -1008,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
+	size_t total_payload_len = payload_len, rdma_payload_len = 0;
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
@@ -1040,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	}
 	release_sock(sk);
 
+	ret = rds_rdma_bytes(msg, &rdma_payload_len);
+	if (ret)
+		goto out;
+
+	total_payload_len += rdma_payload_len;
+	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
 	if (payload_len > rds_sk_sndbuf(rs)) {
 		ret = -EMSGSIZE;
 		goto out;
-- 
1.9.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox