Netdev List
 help / color / mirror / Atom feed
* Re: [Intel-wired-lan] [PATCH 06/15] ice: Initialize PF and setup miscellaneous interrupt
From: Shannon Nelson @ 2018-03-13  2:05 UTC (permalink / raw)
  To: Anirudh Venkataramanan, intel-wired-lan; +Cc: netdev
In-Reply-To: <20180309172136.9073-7-anirudh.venkataramanan@intel.com>

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:
> This patch continues the initialization flow as follows:
> 
> 1) Allocate and initialize necessary fields (like vsi, num_alloc_vsi,
>     irq_tracker, etc) in the ice_pf instance.
> 
> 2) Setup the miscellaneous interrupt handler. This also known as the
>     "other interrupt causes" (OIC) handler and is used to handle non
>     hotpath interrupts (like control queue events, link events,
>     exceptions, etc.
> 
> 3) Implement a background task to process admin queue receive (ARQ)
>     events received by the driver.
> 
> Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
> ---
>   drivers/net/ethernet/intel/ice/ice.h            |  84 +++
>   drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |   2 +
>   drivers/net/ethernet/intel/ice/ice_common.c     |   6 +
>   drivers/net/ethernet/intel/ice/ice_common.h     |   3 +
>   drivers/net/ethernet/intel/ice/ice_controlq.c   | 101 ++++
>   drivers/net/ethernet/intel/ice/ice_controlq.h   |   8 +
>   drivers/net/ethernet/intel/ice/ice_hw_autogen.h |  63 +++
>   drivers/net/ethernet/intel/ice/ice_main.c       | 719 +++++++++++++++++++++++-
>   drivers/net/ethernet/intel/ice/ice_txrx.h       |  43 ++
>   drivers/net/ethernet/intel/ice/ice_type.h       |  11 +
>   10 files changed, 1039 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/net/ethernet/intel/ice/ice_txrx.h
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index 9681e971bcab..c8079c852a48 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -26,29 +26,113 @@
>   #include <linux/compiler.h>
>   #include <linux/etherdevice.h>
>   #include <linux/pci.h>
> +#include <linux/workqueue.h>
>   #include <linux/aer.h>
> +#include <linux/interrupt.h>
> +#include <linux/timer.h>
>   #include <linux/delay.h>
>   #include <linux/bitmap.h>
> +#include <linux/if_bridge.h>
>   #include "ice_devids.h"
>   #include "ice_type.h"
> +#include "ice_txrx.h"
>   #include "ice_switch.h"
>   #include "ice_common.h"
>   #include "ice_sched.h"
>   
>   #define ICE_BAR0		0
> +#define ICE_INT_NAME_STR_LEN	(IFNAMSIZ + 16)
>   #define ICE_AQ_LEN		64
> +#define ICE_MIN_MSIX		2
> +#define ICE_MAX_VSI_ALLOC	130
> +#define ICE_MAX_TXQS		2048
> +#define ICE_MAX_RXQS		2048
> +#define ICE_RES_VALID_BIT	0x8000
> +#define ICE_RES_MISC_VEC_ID	(ICE_RES_VALID_BIT - 1)
>   
>   #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
>   
> +struct ice_res_tracker {
> +	u16 num_entries;
> +	u16 search_hint;
> +	u16 list[1];
> +};
> +
> +struct ice_sw {
> +	struct ice_pf *pf;
> +	u16 sw_id;		/* switch ID for this switch */
> +	u16 bridge_mode;	/* VEB/VEPA/Port Virtualizer */
> +};
> +
>   enum ice_state {
>   	__ICE_DOWN,
> +	__ICE_PFR_REQ,			/* set by driver and peers */
> +	__ICE_ADMINQ_EVENT_PENDING,
> +	__ICE_SERVICE_SCHED,
>   	__ICE_STATE_NBITS		/* must be last */
>   };
>   
> +/* struct that defines a VSI, associated with a dev */
> +struct ice_vsi {
> +	struct net_device *netdev;
> +	struct ice_port_info *port_info; /* back pointer to port_info */
> +	u16 vsi_num;			 /* HW (absolute) index of this VSI */
> +} ____cacheline_internodealigned_in_smp;
> +
> +enum ice_pf_flags {
> +	ICE_FLAG_MSIX_ENA,
> +	ICE_FLAG_FLTR_SYNC,
> +	ICE_FLAG_RSS_ENA,
> +	ICE_PF_FLAGS_NBITS		/* must be last */
> +};
> +
>   struct ice_pf {
>   	struct pci_dev *pdev;
> +	struct msix_entry *msix_entries;
> +	struct ice_res_tracker *irq_tracker;
> +	struct ice_vsi **vsi;		/* VSIs created by the driver */
> +	struct ice_sw *first_sw;	/* first switch created by firmware */
>   	DECLARE_BITMAP(state, __ICE_STATE_NBITS);
> +	DECLARE_BITMAP(avail_txqs, ICE_MAX_TXQS);
> +	DECLARE_BITMAP(avail_rxqs, ICE_MAX_RXQS);
> +	DECLARE_BITMAP(flags, ICE_PF_FLAGS_NBITS);
> +	unsigned long serv_tmr_period;
> +	unsigned long serv_tmr_prev;
> +	struct timer_list serv_tmr;
> +	struct work_struct serv_task;
> +	struct mutex avail_q_mutex;	/* protects access to avail_[rx|tx]qs */
> +	struct mutex sw_mutex;		/* lock for protecting VSI alloc flow */
>   	u32 msg_enable;
> +	u32 oicr_idx;		/* Other interrupt cause vector index */
> +	u32 num_lan_msix;	/* Total MSIX vectors for base driver */
> +	u32 num_avail_msix;	/* remaining MSIX vectors left unclaimed */
> +	u16 num_lan_tx;		/* num lan tx queues setup */
> +	u16 num_lan_rx;		/* num lan rx queues setup */
> +	u16 q_left_tx;		/* remaining num tx queues left unclaimed */
> +	u16 q_left_rx;		/* remaining num rx queues left unclaimed */
> +	u16 next_vsi;		/* Next free slot in pf->vsi[] - 0-based! */
> +	u16 num_alloc_vsi;
> +
>   	struct ice_hw hw;
> +	char int_name[ICE_INT_NAME_STR_LEN];
>   };
> +
> +/**
> + * ice_irq_dynamic_ena - Enable default interrupt generation settings
> + * @hw: pointer to hw struct
> + */
> +static inline void ice_irq_dynamic_ena(struct ice_hw *hw)
> +{
> +	u32 vector = ((struct ice_pf *)hw->back)->oicr_idx;
> +	int itr = ICE_ITR_NONE;
> +	u32 val;
> +
> +	/* clear the PBA here, as this function is meant to clean out all
> +	 * previous interrupts and enable the interrupt
> +	 */
> +	val = GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
> +	      (itr << GLINT_DYN_CTL_ITR_INDX_S);
> +
> +	wr32(hw, GLINT_DYN_CTL(vector), val);
> +}
>   #endif /* _ICE_H_ */
> diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> index 13e3b7f3e24d..1acd936eec49 100644
> --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> @@ -597,11 +597,13 @@ struct ice_aq_desc {
>   /* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */
>   #define ICE_AQ_LG_BUF	512
>   
> +#define ICE_AQ_FLAG_ERR_S	2
>   #define ICE_AQ_FLAG_LB_S	9
>   #define ICE_AQ_FLAG_RD_S	10
>   #define ICE_AQ_FLAG_BUF_S	12
>   #define ICE_AQ_FLAG_SI_S	13
>   
> +#define ICE_AQ_FLAG_ERR		BIT(ICE_AQ_FLAG_ERR_S) /* 0x4    */
>   #define ICE_AQ_FLAG_LB		BIT(ICE_AQ_FLAG_LB_S)  /* 0x200  */
>   #define ICE_AQ_FLAG_RD		BIT(ICE_AQ_FLAG_RD_S)  /* 0x400  */
>   #define ICE_AQ_FLAG_BUF		BIT(ICE_AQ_FLAG_BUF_S) /* 0x1000 */
> diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
> index 78677a3fe448..4b94f737d7f3 100644
> --- a/drivers/net/ethernet/intel/ice/ice_common.c
> +++ b/drivers/net/ethernet/intel/ice/ice_common.c
> @@ -298,6 +298,12 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
>   	if (status)
>   		return status;
>   
> +	/* set these values to minimum allowed */
> +	hw->itr_gran_200 = ICE_ITR_GRAN_MIN_200;
> +	hw->itr_gran_100 = ICE_ITR_GRAN_MIN_100;
> +	hw->itr_gran_50 = ICE_ITR_GRAN_MIN_50;
> +	hw->itr_gran_25 = ICE_ITR_GRAN_MIN_25;
> +
>   	status = ice_init_all_ctrlq(hw);
>   	if (status)
>   		goto err_unroll_cqinit;
> diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
> index 3e3b18fc421d..ab47204dfc5a 100644
> --- a/drivers/net/ethernet/intel/ice/ice_common.h
> +++ b/drivers/net/ethernet/intel/ice/ice_common.h
> @@ -31,6 +31,9 @@ enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req);
>   enum ice_status ice_init_all_ctrlq(struct ice_hw *hw);
>   void ice_shutdown_all_ctrlq(struct ice_hw *hw);
>   enum ice_status
> +ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
> +		  struct ice_rq_event_info *e, u16 *pending);
> +enum ice_status
>   ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res,
>   		enum ice_aq_res_access_type access);
>   void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res);
> diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c
> index b1143d66d4bd..3f63a20b45c0 100644
> --- a/drivers/net/ethernet/intel/ice/ice_controlq.c
> +++ b/drivers/net/ethernet/intel/ice/ice_controlq.c
> @@ -977,3 +977,104 @@ void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode)
>   	desc->opcode = cpu_to_le16(opcode);
>   	desc->flags = cpu_to_le16(ICE_AQ_FLAG_SI);
>   }
> +
> +/**
> + * ice_clean_rq_elem
> + * @hw: pointer to the hw struct
> + * @cq: pointer to the specific Control queue
> + * @e: event info from the receive descriptor, includes any buffers
> + * @pending: number of events that could be left to process
> + *
> + * This function cleans one Admin Receive Queue element and returns
> + * the contents through e.  It can also return how many events are
> + * left to process through 'pending'.
> + */
> +enum ice_status
> +ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
> +		  struct ice_rq_event_info *e, u16 *pending)
> +{
> +	u16 ntc = cq->rq.next_to_clean;
> +	enum ice_status ret_code = 0;
> +	struct ice_aq_desc *desc;
> +	struct ice_dma_mem *bi;
> +	u16 desc_idx;
> +	u16 datalen;
> +	u16 flags;
> +	u16 ntu;
> +
> +	/* pre-clean the event info */
> +	memset(&e->desc, 0, sizeof(e->desc));
> +
> +	/* take the lock before we start messing with the ring */
> +	mutex_lock(&cq->rq_lock);
> +
> +	if (!cq->rq.count) {
> +		ice_debug(hw, ICE_DBG_AQ_MSG,
> +			  "Control Receive queue not initialized.\n");
> +		ret_code = ICE_ERR_AQ_EMPTY;
> +		goto clean_rq_elem_err;
> +	}
> +
> +	/* set next_to_use to head */
> +	ntu = (u16)(rd32(hw, cq->rq.head) & cq->rq.head_mask);
> +
> +	if (ntu == ntc) {
> +		/* nothing to do - shouldn't need to update ring's values */
> +		ret_code = ICE_ERR_AQ_NO_WORK;
> +		goto clean_rq_elem_out;
> +	}
> +
> +	/* now clean the next descriptor */
> +	desc = ICE_CTL_Q_DESC(cq->rq, ntc);
> +	desc_idx = ntc;
> +
> +	flags = le16_to_cpu(desc->flags);
> +	if (flags & ICE_AQ_FLAG_ERR) {
> +		ret_code = ICE_ERR_AQ_ERROR;
> +		cq->rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval);
> +		ice_debug(hw, ICE_DBG_AQ_MSG,
> +			  "Control Receive Queue Event received with error 0x%x\n",
> +			  cq->rq_last_status);
> +	}
> +	memcpy(&e->desc, desc, sizeof(e->desc));
> +	datalen = le16_to_cpu(desc->datalen);
> +	e->msg_len = min(datalen, e->buf_len);
> +	if (e->msg_buf && e->msg_len)
> +		memcpy(e->msg_buf, cq->rq.r.rq_bi[desc_idx].va, e->msg_len);
> +
> +	ice_debug(hw, ICE_DBG_AQ_MSG, "ARQ: desc and buffer:\n");
> +
> +	ice_debug_cq(hw, ICE_DBG_AQ_CMD, (void *)desc, e->msg_buf,
> +		     cq->rq_buf_size);
> +
> +	/* Restore the original datalen and buffer address in the desc,
> +	 * FW updates datalen to indicate the event message size
> +	 */
> +	bi = &cq->rq.r.rq_bi[ntc];
> +	memset(desc, 0, sizeof(*desc));
> +
> +	desc->flags = cpu_to_le16(ICE_AQ_FLAG_BUF);
> +	if (cq->rq_buf_size > ICE_AQ_LG_BUF)
> +		desc->flags |= cpu_to_le16(ICE_AQ_FLAG_LB);
> +	desc->datalen = cpu_to_le16(bi->size);
> +	desc->params.generic.addr_high = cpu_to_le32(upper_32_bits(bi->pa));
> +	desc->params.generic.addr_low = cpu_to_le32(lower_32_bits(bi->pa));
> +
> +	/* set tail = the last cleaned desc index. */
> +	wr32(hw, cq->rq.tail, ntc);
> +	/* ntc is updated to tail + 1 */
> +	ntc++;
> +	if (ntc == cq->num_rq_entries)
> +		ntc = 0;
> +	cq->rq.next_to_clean = ntc;
> +	cq->rq.next_to_use = ntu;
> +
> +clean_rq_elem_out:
> +	/* Set pending if needed, unlock and return */
> +	if (pending)
> +		*pending = (u16)((ntc > ntu ? cq->rq.count : 0) + (ntu - ntc));
> +clean_rq_elem_err:
> +	mutex_unlock(&cq->rq_lock);
> +
> +	return ret_code;
> +}
> diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.h b/drivers/net/ethernet/intel/ice/ice_controlq.h
> index 835c035419a3..403613606652 100644
> --- a/drivers/net/ethernet/intel/ice/ice_controlq.h
> +++ b/drivers/net/ethernet/intel/ice/ice_controlq.h
> @@ -81,6 +81,14 @@ struct ice_sq_cd {
>   
>   #define ICE_CTL_Q_DETAILS(R, i) (&(((struct ice_sq_cd *)((R).cmd_buf))[i]))
>   
> +/* rq event information */
> +struct ice_rq_event_info {
> +	struct ice_aq_desc desc;
> +	u16 msg_len;
> +	u16 buf_len;
> +	u8 *msg_buf;
> +};
> +
>   /* Control Queue information */
>   struct ice_ctl_q_info {
>   	enum ice_ctl_q qtype;
> diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
> index e258a12099b8..700edc7e7280 100644
> --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
> +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
> @@ -28,6 +28,12 @@
>   #define PF_FW_ARQLEN			0x00080280
>   #define PF_FW_ARQLEN_ARQLEN_S		0
>   #define PF_FW_ARQLEN_ARQLEN_M		ICE_M(0x3FF, PF_FW_ARQLEN_ARQLEN_S)
> +#define PF_FW_ARQLEN_ARQVFE_S		28
> +#define PF_FW_ARQLEN_ARQVFE_M		BIT(PF_FW_ARQLEN_ARQVFE_S)
> +#define PF_FW_ARQLEN_ARQOVFL_S		29
> +#define PF_FW_ARQLEN_ARQOVFL_M		BIT(PF_FW_ARQLEN_ARQOVFL_S)
> +#define PF_FW_ARQLEN_ARQCRIT_S		30
> +#define PF_FW_ARQLEN_ARQCRIT_M		BIT(PF_FW_ARQLEN_ARQCRIT_S)
>   #define PF_FW_ARQLEN_ARQENABLE_S	31
>   #define PF_FW_ARQLEN_ARQENABLE_M	BIT(PF_FW_ARQLEN_ARQENABLE_S)
>   #define PF_FW_ARQT			0x00080480
> @@ -39,6 +45,12 @@
>   #define PF_FW_ATQLEN			0x00080200
>   #define PF_FW_ATQLEN_ATQLEN_S		0
>   #define PF_FW_ATQLEN_ATQLEN_M		ICE_M(0x3FF, PF_FW_ATQLEN_ATQLEN_S)
> +#define PF_FW_ATQLEN_ATQVFE_S		28
> +#define PF_FW_ATQLEN_ATQVFE_M		BIT(PF_FW_ATQLEN_ATQVFE_S)
> +#define PF_FW_ATQLEN_ATQOVFL_S		29
> +#define PF_FW_ATQLEN_ATQOVFL_M		BIT(PF_FW_ATQLEN_ATQOVFL_S)
> +#define PF_FW_ATQLEN_ATQCRIT_S		30
> +#define PF_FW_ATQLEN_ATQCRIT_M		BIT(PF_FW_ATQLEN_ATQCRIT_S)
>   #define PF_FW_ATQLEN_ATQENABLE_S	31
>   #define PF_FW_ATQLEN_ATQENABLE_M	BIT(PF_FW_ATQLEN_ATQENABLE_S)
>   #define PF_FW_ATQT			0x00080400
> @@ -57,6 +69,57 @@
>   #define PFGEN_CTRL			0x00091000
>   #define PFGEN_CTRL_PFSWR_S		0
>   #define PFGEN_CTRL_PFSWR_M		BIT(PFGEN_CTRL_PFSWR_S)
> +#define PFHMC_ERRORDATA			0x00520500
> +#define PFHMC_ERRORINFO			0x00520400
> +#define GLINT_DYN_CTL(_INT)		(0x00160000 + ((_INT) * 4))
> +#define GLINT_DYN_CTL_INTENA_S		0
> +#define GLINT_DYN_CTL_INTENA_M		BIT(GLINT_DYN_CTL_INTENA_S)
> +#define GLINT_DYN_CTL_CLEARPBA_S	1
> +#define GLINT_DYN_CTL_CLEARPBA_M	BIT(GLINT_DYN_CTL_CLEARPBA_S)
> +#define GLINT_DYN_CTL_ITR_INDX_S	3
> +#define GLINT_DYN_CTL_SW_ITR_INDX_S	25
> +#define GLINT_DYN_CTL_SW_ITR_INDX_M	ICE_M(0x3, GLINT_DYN_CTL_SW_ITR_INDX_S)
> +#define GLINT_DYN_CTL_INTENA_MSK_S	31
> +#define GLINT_DYN_CTL_INTENA_MSK_M	BIT(GLINT_DYN_CTL_INTENA_MSK_S)
> +#define GLINT_ITR(_i, _INT)		(0x00154000 + ((_i) * 8192 + (_INT) * 4))
> +#define PFINT_FW_CTL			0x0016C800
> +#define PFINT_FW_CTL_MSIX_INDX_S	0
> +#define PFINT_FW_CTL_MSIX_INDX_M	ICE_M(0x7FF, PFINT_FW_CTL_MSIX_INDX_S)
> +#define PFINT_FW_CTL_ITR_INDX_S		11
> +#define PFINT_FW_CTL_ITR_INDX_M		ICE_M(0x3, PFINT_FW_CTL_ITR_INDX_S)
> +#define PFINT_FW_CTL_CAUSE_ENA_S	30
> +#define PFINT_FW_CTL_CAUSE_ENA_M	BIT(PFINT_FW_CTL_CAUSE_ENA_S)
> +#define PFINT_OICR			0x0016CA00
> +#define PFINT_OICR_INTEVENT_S		0
> +#define PFINT_OICR_INTEVENT_M		BIT(PFINT_OICR_INTEVENT_S)
> +#define PFINT_OICR_HLP_RDY_S		14
> +#define PFINT_OICR_HLP_RDY_M		BIT(PFINT_OICR_HLP_RDY_S)
> +#define PFINT_OICR_CPM_RDY_S		15
> +#define PFINT_OICR_CPM_RDY_M		BIT(PFINT_OICR_CPM_RDY_S)
> +#define PFINT_OICR_ECC_ERR_S		16
> +#define PFINT_OICR_ECC_ERR_M		BIT(PFINT_OICR_ECC_ERR_S)
> +#define PFINT_OICR_MAL_DETECT_S		19
> +#define PFINT_OICR_MAL_DETECT_M		BIT(PFINT_OICR_MAL_DETECT_S)
> +#define PFINT_OICR_GRST_S		20
> +#define PFINT_OICR_GRST_M		BIT(PFINT_OICR_GRST_S)
> +#define PFINT_OICR_PCI_EXCEPTION_S	21
> +#define PFINT_OICR_PCI_EXCEPTION_M	BIT(PFINT_OICR_PCI_EXCEPTION_S)
> +#define PFINT_OICR_GPIO_S		22
> +#define PFINT_OICR_GPIO_M		BIT(PFINT_OICR_GPIO_S)
> +#define PFINT_OICR_STORM_DETECT_S	24
> +#define PFINT_OICR_STORM_DETECT_M	BIT(PFINT_OICR_STORM_DETECT_S)
> +#define PFINT_OICR_HMC_ERR_S		26
> +#define PFINT_OICR_HMC_ERR_M		BIT(PFINT_OICR_HMC_ERR_S)
> +#define PFINT_OICR_PE_CRITERR_S		28
> +#define PFINT_OICR_PE_CRITERR_M		BIT(PFINT_OICR_PE_CRITERR_S)
> +#define PFINT_OICR_CTL			0x0016CA80
> +#define PFINT_OICR_CTL_MSIX_INDX_S	0
> +#define PFINT_OICR_CTL_MSIX_INDX_M	ICE_M(0x7FF, PFINT_OICR_CTL_MSIX_INDX_S)
> +#define PFINT_OICR_CTL_ITR_INDX_S	11
> +#define PFINT_OICR_CTL_ITR_INDX_M	ICE_M(0x3, PFINT_OICR_CTL_ITR_INDX_S)
> +#define PFINT_OICR_CTL_CAUSE_ENA_S	30
> +#define PFINT_OICR_CTL_CAUSE_ENA_M	BIT(PFINT_OICR_CTL_CAUSE_ENA_S)
> +#define PFINT_OICR_ENA			0x0016C900
>   #define GLLAN_RCTL_0			0x002941F8
>   #define GLNVM_FLA			0x000B6108
>   #define GLNVM_FLA_LOCKED_S		6
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index 2ee4a0547ba3..b07ce86381bb 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -40,6 +40,294 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all), hw debug_mask (0x8XXXX
>   MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
>   #endif /* !CONFIG_DYNAMIC_DEBUG */
>   
> +static struct workqueue_struct *ice_wq;
> +
> +/**
> + * ice_search_res - Search the tracker for a block of resources
> + * @res: pointer to the resource
> + * @needed: size of the block needed
> + * @id: identifier to track owner
> + * Returns the base item index of the block, or -ENOMEM for error
> + */
> +static int ice_search_res(struct ice_res_tracker *res, u16 needed, u16 id)
> +{
> +	int start = res->search_hint;
> +	int end = start;
> +
> +	id |= ICE_RES_VALID_BIT;
> +
> +	do {
> +		/* skip already allocated entries */
> +		if (res->list[end++] & ICE_RES_VALID_BIT) {
> +			start = end;
> +			if ((start + needed) > res->num_entries)
> +				break;
> +		}
> +
> +		if (end == (start + needed)) {
> +			int i = start;
> +
> +			/* there was enough, so assign it to the requestor */
> +			while (i != end)
> +				res->list[i++] = id;
> +
> +			if (end == res->num_entries)
> +				end = 0;
> +
> +			res->search_hint = end;
> +			return start;
> +		}
> +	} while (1);
> +
> +	return -ENOMEM;
> +}
> +
> +/**
> + * ice_get_res - get a block of resources
> + * @pf: board private structure
> + * @res: pointer to the resource
> + * @needed: size of the block needed
> + * @id: identifier to track owner
> + *
> + * Returns the base item index of the block, or -ENOMEM for error
> + * The search_hint trick and lack of advanced fit-finding only works
> + * because we're highly likely to have all the same size lump requests.

The new naming for this resource tracking is much better than what 
someone used in i40e, but you can probably replace the "lump" reference 
here as well.

Now that there is a 2nd driver using essentially the same code, should 
there be some effort to make it generic and only have the code once in 
the kernel, if it isn't already available?

sln

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH 07/15] ice: Add support for VSI allocation and deallocation
From: Shannon Nelson @ 2018-03-13  2:05 UTC (permalink / raw)
  To: Anirudh Venkataramanan, intel-wired-lan; +Cc: netdev
In-Reply-To: <20180309172136.9073-8-anirudh.venkataramanan@intel.com>

On 3/9/2018 9:21 AM, Anirudh Venkataramanan wrote:
> This patch introduces data structures and functions to alloc/free
> VSIs. The driver represents a VSI using the ice_vsi structure.
> 
> Some noteworthy points about VSI allocation:
> 
> 1) A VSI is allocated in the firmware using the "add VSI" admin queue
>     command (implemented as ice_aq_add_vsi). The firmware returns an
>     identifier for the allocated VSI. The VSI context is used to program
>     certain aspects (loopback, queue map, etc.) of the VSI's configuration.
> 
> 2) A VSI is deleted using the "free VSI" admin queue command (implemented
>     as ice_aq_free_vsi).
> 
> 3) The driver represents a VSI using struct ice_vsi. This is allocated
>     and initialized as part of the ice_vsi_alloc flow, and deallocated
>     as part of the ice_vsi_delete flow.
> 
> 4) Once the VSI is created, a netdev is allocated and associated with it.
>     The VSI's ring and vector related data structures are also allocated
>     and initialized.
> 
> 5) A VSI's queues can either be contiguous or scattered. To do this, the
>     driver maintains a bitmap (vsi->avail_txqs) which is kept in sync with
>     the firmware's VSI queue allocation imap. If the VSI can't get a
>     contiguous queue allocation, it will fallback to scatter. This is
>     implemented in ice_vsi_get_qs which is called as part of the VSI setup
>     flow. In the release flow, the VSI's queues are released and the bitmap
>     is updated to reflect this by ice_vsi_put_qs.
> 
> Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
> ---
>   drivers/net/ethernet/intel/ice/ice.h            |   71 ++
>   drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  199 ++++
>   drivers/net/ethernet/intel/ice/ice_main.c       | 1108 +++++++++++++++++++++++
>   drivers/net/ethernet/intel/ice/ice_switch.c     |  115 +++
>   drivers/net/ethernet/intel/ice/ice_switch.h     |   21 +
>   drivers/net/ethernet/intel/ice/ice_txrx.h       |   26 +
>   drivers/net/ethernet/intel/ice/ice_type.h       |    4 +
>   7 files changed, 1544 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> index c8079c852a48..b169f3751cc9 100644
> --- a/drivers/net/ethernet/intel/ice/ice.h
> +++ b/drivers/net/ethernet/intel/ice/ice.h
> @@ -25,6 +25,8 @@
>   #include <linux/netdevice.h>
>   #include <linux/compiler.h>
>   #include <linux/etherdevice.h>
> +#include <linux/cpumask.h>
> +#include <linux/if_vlan.h>
>   #include <linux/pci.h>
>   #include <linux/workqueue.h>
>   #include <linux/aer.h>
> @@ -32,6 +34,7 @@
>   #include <linux/timer.h>
>   #include <linux/delay.h>
>   #include <linux/bitmap.h>
> +#include <linux/log2.h>
>   #include <linux/if_bridge.h>
>   #include "ice_devids.h"
>   #include "ice_type.h"
> @@ -41,17 +44,42 @@
>   #include "ice_sched.h"
>   
>   #define ICE_BAR0		0
> +#define ICE_DFLT_NUM_DESC	128
> +#define ICE_REQ_DESC_MULTIPLE	32
>   #define ICE_INT_NAME_STR_LEN	(IFNAMSIZ + 16)
>   #define ICE_AQ_LEN		64
>   #define ICE_MIN_MSIX		2
>   #define ICE_MAX_VSI_ALLOC	130
>   #define ICE_MAX_TXQS		2048
>   #define ICE_MAX_RXQS		2048
> +#define ICE_VSI_MAP_CONTIG	0
> +#define ICE_VSI_MAP_SCATTER	1
> +#define ICE_MAX_SCATTER_TXQS	16
> +#define ICE_MAX_SCATTER_RXQS	16
>   #define ICE_RES_VALID_BIT	0x8000
>   #define ICE_RES_MISC_VEC_ID	(ICE_RES_VALID_BIT - 1)
> +#define ICE_INVAL_Q_INDEX	0xffff
>   
>   #define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK)
>   
> +#define ICE_MAX_MTU	(ICE_AQ_SET_MAC_FRAME_SIZE_MAX - \
> +			 ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN)
> +
> +#define ICE_UP_TABLE_TRANSLATE(val, i) \
> +		(((val) << ICE_AQ_VSI_UP_TABLE_UP##i##_S) & \
> +		  ICE_AQ_VSI_UP_TABLE_UP##i##_M)
> +
> +struct ice_tc_info {
> +	u16 qoffset;
> +	u16 qcount;
> +};
> +
> +struct ice_tc_cfg {
> +	u8 numtc; /* Total number of enabled TCs */
> +	u8 ena_tc; /* TX map */
> +	struct ice_tc_info tc_info[ICE_MAX_TRAFFIC_CLASS];
> +};
> +
>   struct ice_res_tracker {
>   	u16 num_entries;
>   	u16 search_hint;
> @@ -75,8 +103,47 @@ enum ice_state {
>   /* struct that defines a VSI, associated with a dev */
>   struct ice_vsi {
>   	struct net_device *netdev;
> +	struct ice_sw *vsw;		 /* switch this VSI is on */
> +	struct ice_pf *back;		 /* back pointer to PF */
>   	struct ice_port_info *port_info; /* back pointer to port_info */
> +	struct ice_ring **rx_rings;	 /* rx ring array */
> +	struct ice_ring **tx_rings;	 /* tx ring array */
> +	struct ice_q_vector **q_vectors; /* q_vector array */
> +	DECLARE_BITMAP(state, __ICE_STATE_NBITS);
> +	int num_q_vectors;
> +	int base_vector;
> +	enum ice_vsi_type type;
>   	u16 vsi_num;			 /* HW (absolute) index of this VSI */
> +	u16 idx;			 /* software index in pf->vsi[] */
> +
> +	/* Interrupt thresholds */
> +	u16 work_lmt;
> +
> +	struct ice_aqc_vsi_props info;	 /* VSI properties */
> +
> +	/* queue information */
> +	u8 tx_mapping_mode;		 /* ICE_MAP_MODE_[CONTIG|SCATTER] */
> +	u8 rx_mapping_mode;		 /* ICE_MAP_MODE_[CONTIG|SCATTER] */
> +	u16 txq_map[ICE_MAX_TXQS];	 /* index in pf->avail_txqs */
> +	u16 rxq_map[ICE_MAX_RXQS];	 /* index in pf->avail_rxqs */
> +	u16 alloc_txq;			 /* Allocated Tx queues */
> +	u16 num_txq;			 /* Used Tx queues */
> +	u16 alloc_rxq;			 /* Allocated Rx queues */
> +	u16 num_rxq;			 /* Used Rx queues */
> +	u16 num_desc;
> +	struct ice_tc_cfg tc_cfg;
> +} ____cacheline_internodealigned_in_smp;
> +
> +/* struct that defines an interrupt vector */
> +struct ice_q_vector {
> +	struct ice_vsi *vsi;
> +	cpumask_t affinity_mask;
> +	struct napi_struct napi;
> +	struct ice_ring_container rx;
> +	struct ice_ring_container tx;
> +	u16 v_idx;			/* index in the vsi->q_vector array. */
> +	u8 num_ring_tx;			/* total number of tx rings in vector */
> +	u8 num_ring_rx;			/* total number of rx rings in vector */
>   } ____cacheline_internodealigned_in_smp;
>   
>   enum ice_pf_flags {
> @@ -117,6 +184,10 @@ struct ice_pf {
>   	char int_name[ICE_INT_NAME_STR_LEN];
>   };
>   
> +struct ice_netdev_priv {
> +	struct ice_vsi *vsi;
> +};
> +
>   /**
>    * ice_irq_dynamic_ena - Enable default interrupt generation settings
>    * @hw: pointer to hw struct
> diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> index 1acd936eec49..570169c99786 100644
> --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
> @@ -23,6 +23,7 @@
>    */
>   
>   #define ICE_AQC_TOPO_MAX_LEVEL_NUM	0x9
> +#define ICE_AQ_SET_MAC_FRAME_SIZE_MAX	9728
>   
>   struct ice_aqc_generic {
>   	__le32 param0;
> @@ -204,6 +205,199 @@ struct ice_aqc_get_sw_cfg_resp {
>   	struct ice_aqc_get_sw_cfg_resp_elem elements[1];
>   };
>   
> +/* Add VSI (indirect 0x0210)
> + * Update VSI (indirect 0x0211)
> + * Get VSI (indirect 0x0212)
> + * Free VSI (indirect 0x0213)
> + */
> +struct ice_aqc_add_get_update_free_vsi {
> +	__le16 vsi_num;
> +#define ICE_AQ_VSI_NUM_S	0
> +#define ICE_AQ_VSI_NUM_M	(0x03FF << ICE_AQ_VSI_NUM_S)
> +#define ICE_AQ_VSI_IS_VALID	BIT(15)
> +	__le16 cmd_flags;
> +#define ICE_AQ_VSI_KEEP_ALLOC	0x1
> +	u8 vf_id;
> +	u8 reserved;
> +	__le16 vsi_flags;
> +#define ICE_AQ_VSI_TYPE_S	0
> +#define ICE_AQ_VSI_TYPE_M	(0x3 << ICE_AQ_VSI_TYPE_S)
> +#define ICE_AQ_VSI_TYPE_VF	0x0
> +#define ICE_AQ_VSI_TYPE_VMDQ2	0x1
> +#define ICE_AQ_VSI_TYPE_PF	0x2
> +#define ICE_AQ_VSI_TYPE_EMP_MNG	0x3
> +	__le32 addr_high;
> +	__le32 addr_low;
> +};
> +
> +/* Response descriptor for:
> + * Add VSI (indirect 0x0210)
> + * Update VSI (indirect 0x0211)
> + * Free VSI (indirect 0x0213)
> + */
> +struct ice_aqc_add_update_free_vsi_resp {
> +	__le16 vsi_num;
> +	__le16 ext_status;
> +	__le16 vsi_used;
> +	__le16 vsi_free;
> +	__le32 addr_high;
> +	__le32 addr_low;
> +};
> +
> +struct ice_aqc_vsi_props {
> +	__le16 valid_sections;
> +#define ICE_AQ_VSI_PROP_SW_VALID		BIT(0)
> +#define ICE_AQ_VSI_PROP_SECURITY_VALID		BIT(1)
> +#define ICE_AQ_VSI_PROP_VLAN_VALID		BIT(2)
> +#define ICE_AQ_VSI_PROP_OUTER_TAG_VALID		BIT(3)
> +#define ICE_AQ_VSI_PROP_INGRESS_UP_VALID	BIT(4)
> +#define ICE_AQ_VSI_PROP_EGRESS_UP_VALID		BIT(5)
> +#define ICE_AQ_VSI_PROP_RXQ_MAP_VALID		BIT(6)
> +#define ICE_AQ_VSI_PROP_Q_OPT_VALID		BIT(7)
> +#define ICE_AQ_VSI_PROP_OUTER_UP_VALID		BIT(8)
> +#define ICE_AQ_VSI_PROP_FLOW_DIR_VALID		BIT(11)
> +#define ICE_AQ_VSI_PROP_PASID_VALID		BIT(12)
> +	/* switch section */
> +	u8 sw_id;
> +	u8 sw_flags;
> +#define ICE_AQ_VSI_SW_FLAG_ALLOW_LB		BIT(5)
> +#define ICE_AQ_VSI_SW_FLAG_LOCAL_LB		BIT(6)
> +#define ICE_AQ_VSI_SW_FLAG_SRC_PRUNE		BIT(7)
> +	u8 sw_flags2;
> +#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S	0
> +#define ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_M	\
> +				(0xF << ICE_AQ_VSI_SW_FLAG_RX_PRUNE_EN_S)
> +#define ICE_AQ_VSI_SW_FLAG_RX_VLAN_PRUNE_ENA	BIT(0)
> +#define ICE_AQ_VSI_SW_FLAG_LAN_ENA		BIT(4)
> +	u8 veb_stat_id;
> +#define ICE_AQ_VSI_SW_VEB_STAT_ID_S		0
> +#define ICE_AQ_VSI_SW_VEB_STAT_ID_M	(0x1F << ICE_AQ_VSI_SW_VEB_STAT_ID_S)
> +#define ICE_AQ_VSI_SW_VEB_STAT_ID_VALID		BIT(5)
> +	/* security section */
> +	u8 sec_flags;
> +#define ICE_AQ_VSI_SEC_FLAG_ALLOW_DEST_OVRD	BIT(0)
> +#define ICE_AQ_VSI_SEC_FLAG_ENA_MAC_ANTI_SPOOF	BIT(2)
> +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S	4
> +#define ICE_AQ_VSI_SEC_TX_PRUNE_ENA_M	(0xF << ICE_AQ_VSI_SEC_TX_PRUNE_ENA_S)
> +#define ICE_AQ_VSI_SEC_TX_VLAN_PRUNE_ENA	BIT(0)
> +	u8 sec_reserved;
> +	/* VLAN section */
> +	__le16 pvid; /* VLANS include priority bits */
> +	u8 pvlan_reserved[2];
> +	u8 port_vlan_flags;
> +#define ICE_AQ_VSI_PVLAN_MODE_S	0
> +#define ICE_AQ_VSI_PVLAN_MODE_M	(0x3 << ICE_AQ_VSI_PVLAN_MODE_S)
> +#define ICE_AQ_VSI_PVLAN_MODE_UNTAGGED	0x1
> +#define ICE_AQ_VSI_PVLAN_MODE_TAGGED	0x2
> +#define ICE_AQ_VSI_PVLAN_MODE_ALL	0x3
> +#define ICE_AQ_VSI_PVLAN_INSERT_PVID	BIT(2)
> +#define ICE_AQ_VSI_PVLAN_EMOD_S	3
> +#define ICE_AQ_VSI_PVLAN_EMOD_M	(0x3 << ICE_AQ_VSI_PVLAN_EMOD_S)
> +#define ICE_AQ_VSI_PVLAN_EMOD_STR_BOTH	(0x0 << ICE_AQ_VSI_PVLAN_EMOD_S)
> +#define ICE_AQ_VSI_PVLAN_EMOD_STR_UP	(0x1 << ICE_AQ_VSI_PVLAN_EMOD_S)
> +#define ICE_AQ_VSI_PVLAN_EMOD_STR	(0x2 << ICE_AQ_VSI_PVLAN_EMOD_S)
> +#define ICE_AQ_VSI_PVLAN_EMOD_NOTHING	(0x3 << ICE_AQ_VSI_PVLAN_EMOD_S)
> +	u8 pvlan_reserved2[3];
> +	/* ingress egress up sections */
> +	__le32 ingress_table; /* bitmap, 3 bits per up */
> +#define ICE_AQ_VSI_UP_TABLE_UP0_S	0
> +#define ICE_AQ_VSI_UP_TABLE_UP0_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP0_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP1_S	3
> +#define ICE_AQ_VSI_UP_TABLE_UP1_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP1_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP2_S	6
> +#define ICE_AQ_VSI_UP_TABLE_UP2_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP2_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP3_S	9
> +#define ICE_AQ_VSI_UP_TABLE_UP3_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP3_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP4_S	12
> +#define ICE_AQ_VSI_UP_TABLE_UP4_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP4_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP5_S	15
> +#define ICE_AQ_VSI_UP_TABLE_UP5_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP5_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP6_S	18
> +#define ICE_AQ_VSI_UP_TABLE_UP6_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP6_S)
> +#define ICE_AQ_VSI_UP_TABLE_UP7_S	21
> +#define ICE_AQ_VSI_UP_TABLE_UP7_M	(0x7 << ICE_AQ_VSI_UP_TABLE_UP7_S)
> +	__le32 egress_table;   /* same defines as for ingress table */
> +	/* outer tags section */
> +	__le16 outer_tag;
> +	u8 outer_tag_flags;
> +#define ICE_AQ_VSI_OUTER_TAG_MODE_S	0
> +#define ICE_AQ_VSI_OUTER_TAG_MODE_M	(0x3 << ICE_AQ_VSI_OUTER_TAG_MODE_S)
> +#define ICE_AQ_VSI_OUTER_TAG_NOTHING	0x0
> +#define ICE_AQ_VSI_OUTER_TAG_REMOVE	0x1
> +#define ICE_AQ_VSI_OUTER_TAG_COPY	0x2
> +#define ICE_AQ_VSI_OUTER_TAG_TYPE_S	2
> +#define ICE_AQ_VSI_OUTER_TAG_TYPE_M	(0x3 << ICE_AQ_VSI_OUTER_TAG_TYPE_S)
> +#define ICE_AQ_VSI_OUTER_TAG_NONE	0x0
> +#define ICE_AQ_VSI_OUTER_TAG_STAG	0x1
> +#define ICE_AQ_VSI_OUTER_TAG_VLAN_8100	0x2
> +#define ICE_AQ_VSI_OUTER_TAG_VLAN_9100	0x3
> +#define ICE_AQ_VSI_OUTER_TAG_INSERT	BIT(4)
> +#define ICE_AQ_VSI_OUTER_TAG_ACCEPT_HOST BIT(6)
> +	u8 outer_tag_reserved;
> +	/* queue mapping section */
> +	__le16 mapping_flags;
> +#define ICE_AQ_VSI_Q_MAP_CONTIG	0x0
> +#define ICE_AQ_VSI_Q_MAP_NONCONTIG	BIT(0)
> +	__le16 q_mapping[16];
> +#define ICE_AQ_VSI_Q_S		0
> +#define ICE_AQ_VSI_Q_M		(0x7FF << ICE_AQ_VSI_Q_S)
> +	__le16 tc_mapping[8];
> +#define ICE_AQ_VSI_TC_Q_OFFSET_S	0
> +#define ICE_AQ_VSI_TC_Q_OFFSET_M	(0x7FF << ICE_AQ_VSI_TC_Q_OFFSET_S)
> +#define ICE_AQ_VSI_TC_Q_NUM_S		11
> +#define ICE_AQ_VSI_TC_Q_NUM_M		(0xF << ICE_AQ_VSI_TC_Q_NUM_S)
> +	/* queueing option section */
> +	u8 q_opt_rss;
> +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_S	0
> +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_M	(0x3 << ICE_AQ_VSI_Q_OPT_RSS_LUT_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI	0x0
> +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_PF	0x2
> +#define ICE_AQ_VSI_Q_OPT_RSS_LUT_GBL	0x3
> +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S	2
> +#define ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M	(0xF << ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_S	6
> +#define ICE_AQ_VSI_Q_OPT_RSS_HASH_M	(0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_TPLZ	(0x0 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_SYM_TPLZ	(0x1 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_XOR	(0x2 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
> +#define ICE_AQ_VSI_Q_OPT_RSS_JHASH	(0x3 << ICE_AQ_VSI_Q_OPT_RSS_HASH_S)
> +	u8 q_opt_tc;
> +#define ICE_AQ_VSI_Q_OPT_TC_OVR_S	0
> +#define ICE_AQ_VSI_Q_OPT_TC_OVR_M	(0x1F << ICE_AQ_VSI_Q_OPT_TC_OVR_S)
> +#define ICE_AQ_VSI_Q_OPT_PROF_TC_OVR	BIT(7)
> +	u8 q_opt_flags;
> +#define ICE_AQ_VSI_Q_OPT_PE_FLTR_EN	BIT(0)
> +	u8 q_opt_reserved[3];
> +	/* outer up section */
> +	__le32 outer_up_table; /* same structure and defines as ingress tbl */
> +	/* section 10 */
> +	__le16 sect_10_reserved;
> +	/* flow director section */
> +	__le16 fd_options;
> +#define ICE_AQ_VSI_FD_ENABLE		BIT(0)
> +#define ICE_AQ_VSI_FD_TX_AUTO_ENABLE	BIT(1)
> +#define ICE_AQ_VSI_FD_PROG_ENABLE	BIT(3)
> +	__le16 max_fd_fltr_dedicated;
> +	__le16 max_fd_fltr_shared;
> +	__le16 fd_def_q;
> +#define ICE_AQ_VSI_FD_DEF_Q_S		0
> +#define ICE_AQ_VSI_FD_DEF_Q_M		(0x7FF << ICE_AQ_VSI_FD_DEF_Q_S)
> +#define ICE_AQ_VSI_FD_DEF_GRP_S	12
> +#define ICE_AQ_VSI_FD_DEF_GRP_M	(0x7 << ICE_AQ_VSI_FD_DEF_GRP_S)
> +	__le16 fd_report_opt;
> +#define ICE_AQ_VSI_FD_REPORT_Q_S	0
> +#define ICE_AQ_VSI_FD_REPORT_Q_M	(0x7FF << ICE_AQ_VSI_FD_REPORT_Q_S)
> +#define ICE_AQ_VSI_FD_DEF_PRIORITY_S	12
> +#define ICE_AQ_VSI_FD_DEF_PRIORITY_M	(0x7 << ICE_AQ_VSI_FD_DEF_PRIORITY_S)
> +#define ICE_AQ_VSI_FD_DEF_DROP		BIT(15)
> +	/* PASID section */
> +	__le32 pasid_id;
> +#define ICE_AQ_VSI_PASID_ID_S		0
> +#define ICE_AQ_VSI_PASID_ID_M		(0xFFFFF << ICE_AQ_VSI_PASID_ID_S)
> +#define ICE_AQ_VSI_PASID_ID_VALID	BIT(31)
> +	u8 reserved[24];
> +};
> +
>   /* Get Default Topology (indirect 0x0400) */
>   struct ice_aqc_get_topo {
>   	u8 port_num;
> @@ -590,6 +784,7 @@ struct ice_aq_desc {
>   		struct ice_aqc_query_txsched_res query_sched_res;
>   		struct ice_aqc_add_move_delete_elem add_move_delete_elem;
>   		struct ice_aqc_nvm nvm;
> +		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
>   		struct ice_aqc_get_link_status get_link_status;
>   	} params;
>   };
> @@ -640,6 +835,10 @@ enum ice_adminq_opc {
>   	/* internal switch commands */
>   	ice_aqc_opc_get_sw_cfg				= 0x0200,
>   
> +	/* VSI commands */
> +	ice_aqc_opc_add_vsi				= 0x0210,
> +	ice_aqc_opc_update_vsi				= 0x0211,
> +	ice_aqc_opc_free_vsi				= 0x0213,
>   	ice_aqc_opc_clear_pf_cfg			= 0x02A4,
>   
>   	/* transmit scheduler commands */
> diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> index b07ce86381bb..f9d0c99cae64 100644
> --- a/drivers/net/ethernet/intel/ice/ice_main.c
> +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> @@ -42,6 +42,37 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
>   
>   static struct workqueue_struct *ice_wq;
>   
> +static int ice_vsi_release(struct ice_vsi *vsi);
> +
> +/**
> + * ice_get_free_slot - get the next non-NULL location index in array
> + * @array: array to search
> + * @size: size of the array
> + * @curr: last known occupied index to be used as a search hint
> + *
> + * void * is being used to keep the functionality generic. This lets us use this
> + * function on any array of pointers.
> + */
> +static int ice_get_free_slot(void *array, int size, int curr)
> +{
> +	int **tmp_array = (int **)array;
> +	int next;
> +
> +	if (curr < (size - 1) && !tmp_array[curr + 1]) {
> +		next = curr + 1;
> +	} else {
> +		int i = 0;
> +
> +		while ((i < size) && (tmp_array[i]))
> +			i++;
> +		if (i == size)
> +			next = 0xffff;

Please use a named #define rather than a magic number

> +		else
> +			next = i;
> +	}
> +	return next;
> +}
> +
>   /**
>    * ice_search_res - Search the tracker for a block of resources
>    * @res: pointer to the resource
> @@ -340,6 +371,270 @@ static void ice_set_ctrlq_len(struct ice_hw *hw)
>   	hw->adminq.sq_buf_size = ICE_AQ_MAX_BUF_LEN;
>   }
>   
> +/**
> + * ice_vsi_delete - delete a VSI from the switch
> + * @vsi: pointer to VSI being removed
> + */
> +static void ice_vsi_delete(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	struct ice_vsi_ctx ctxt;
> +	enum ice_status status;
> +
> +	ctxt.vsi_num = vsi->vsi_num;
> +
> +	memcpy(&ctxt.info, &vsi->info, sizeof(struct ice_aqc_vsi_props));
> +
> +	status = ice_aq_free_vsi(&pf->hw, &ctxt, false, NULL);
> +	if (status)
> +		dev_err(&pf->pdev->dev, "Failed to delete VSI %i in FW\n",
> +			vsi->vsi_num);
> +}
> +
> +/**
> + * ice_vsi_setup_q_map - Setup a VSI queue map
> + * @vsi: the VSI being configured
> + * @ctxt: VSI context structure
> + */
> +static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
> +{
> +	u16 offset = 0, qmap = 0, pow = 0, qcount;
> +	u16 qcount_tx = vsi->alloc_txq;
> +	u16 qcount_rx = vsi->alloc_rxq;
> +	bool ena_tc0 = false;
> +	int i;
> +
> +	/* at least TC0 should be enabled by default */
> +	if (vsi->tc_cfg.numtc) {
> +		if (!(vsi->tc_cfg.ena_tc & BIT(0)))
> +			ena_tc0 =  true;
> +	} else {
> +		ena_tc0 =  true;
> +	}
> +
> +	if (ena_tc0) {
> +		vsi->tc_cfg.numtc++;
> +		vsi->tc_cfg.ena_tc |= 1;
> +	}
> +
> +	qcount = qcount_rx / vsi->tc_cfg.numtc;
> +
> +	/* find higher power-of-2 of qcount */
> +	pow = ilog2(qcount);
> +
> +	if (!is_power_of_2(qcount))
> +		pow++;
> +
> +	/* TC mapping is a function of the number of Rx queues assigned to the
> +	 * VSI for each traffic class and the offset of these queues.
> +	 * The first 10 bits are for queue offset for TC0, next 4 bits for no:of
> +	 * queues allocated to TC0. No:of queues is a power-of-2.
> +	 *
> +	 * If TC is not enabled, the queue offset is set to 0, and allocate one
> +	 * queue, this way, traffic for the given TC will be sent to the default
> +	 * queue.
> +	 *
> +	 * Setup number and offset of Rx queues for all TCs for the VSI
> +	 */
> +	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
> +		if (!(vsi->tc_cfg.ena_tc & BIT(i))) {
> +			/* TC is not enabled */
> +			vsi->tc_cfg.tc_info[i].qoffset = 0;
> +			vsi->tc_cfg.tc_info[i].qcount = 1;
> +			ctxt->info.tc_mapping[i] = 0;
> +			continue;
> +		}
> +
> +		/* TC is enabled */
> +		vsi->tc_cfg.tc_info[i].qoffset = offset;
> +		vsi->tc_cfg.tc_info[i].qcount = qcount;
> +
> +		qmap = ((offset << ICE_AQ_VSI_TC_Q_OFFSET_S) &
> +			ICE_AQ_VSI_TC_Q_OFFSET_M) |
> +			((pow << ICE_AQ_VSI_TC_Q_NUM_S) &
> +			 ICE_AQ_VSI_TC_Q_NUM_M);
> +		offset += qcount;
> +		ctxt->info.tc_mapping[i] = cpu_to_le16(qmap);
> +	}
> +
> +	vsi->num_txq = qcount_tx;
> +	vsi->num_rxq = offset;
> +
> +	/* Rx queue mapping */
> +	ctxt->info.mapping_flags |= cpu_to_le16(ICE_AQ_VSI_Q_MAP_CONTIG);
> +	/* q_mapping buffer holds the info for the first queue allocated for
> +	 * this VSI in the PF space and also the number of queues associated
> +	 * with this VSI.
> +	 */
> +	ctxt->info.q_mapping[0] = cpu_to_le16(vsi->rxq_map[0]);
> +	ctxt->info.q_mapping[1] = cpu_to_le16(vsi->num_rxq);
> +}
> +
> +/**
> + * ice_set_def_vsi_ctx - Set default VSI context before adding a VSI
> + * @ctxt: the VSI context being set
> + *
> + * This initializes a default VSI context for all sections except the Queues.
> + */
> +static void ice_set_def_vsi_ctx(struct ice_vsi_ctx *ctxt)
> +{
> +	u32 table = 0;
> +
> +	memset(&ctxt->info, 0, sizeof(ctxt->info));
> +	/* VSI's should be allocated from shared pool */
> +	ctxt->alloc_from_pool = true;
> +	/* Src pruning enabled by default */
> +	ctxt->info.sw_flags = ICE_AQ_VSI_SW_FLAG_SRC_PRUNE;
> +	/* Traffic from VSI can be sent to LAN */
> +	ctxt->info.sw_flags2 = ICE_AQ_VSI_SW_FLAG_LAN_ENA;
> +	/* Allow all packets untagged/tagged */
> +	ctxt->info.port_vlan_flags = ((ICE_AQ_VSI_PVLAN_MODE_ALL &
> +				       ICE_AQ_VSI_PVLAN_MODE_M) >>
> +				      ICE_AQ_VSI_PVLAN_MODE_S);
> +	/* Show VLAN/UP from packets in Rx descriptors */
> +	ctxt->info.port_vlan_flags |= ((ICE_AQ_VSI_PVLAN_EMOD_STR_BOTH &
> +					ICE_AQ_VSI_PVLAN_EMOD_M) >>
> +				       ICE_AQ_VSI_PVLAN_EMOD_S);
> +	/* Have 1:1 UP mapping for both ingress/egress tables */
> +	table |= ICE_UP_TABLE_TRANSLATE(0, 0);
> +	table |= ICE_UP_TABLE_TRANSLATE(1, 1);
> +	table |= ICE_UP_TABLE_TRANSLATE(2, 2);
> +	table |= ICE_UP_TABLE_TRANSLATE(3, 3);
> +	table |= ICE_UP_TABLE_TRANSLATE(4, 4);
> +	table |= ICE_UP_TABLE_TRANSLATE(5, 5);
> +	table |= ICE_UP_TABLE_TRANSLATE(6, 6);
> +	table |= ICE_UP_TABLE_TRANSLATE(7, 7);
> +	ctxt->info.ingress_table = cpu_to_le32(table);
> +	ctxt->info.egress_table = cpu_to_le32(table);
> +	/* Have 1:1 UP mapping for outer to inner UP table */
> +	ctxt->info.outer_up_table = cpu_to_le32(table);
> +	/* No Outer tag support outer_tag_flags remains to zero */
> +}
> +
> +/**
> + * ice_vsi_add - Create a new VSI or fetch preallocated VSI
> + * @vsi: the VSI being configured
> + *
> + * This initializes a VSI context depending on the VSI type to be added and
> + * passes it down to the add_vsi aq command to create a new VSI.
> + */
> +static int ice_vsi_add(struct ice_vsi *vsi)
> +{
> +	struct ice_vsi_ctx ctxt = { 0 };
> +	struct ice_pf *pf = vsi->back;
> +	struct ice_hw *hw = &pf->hw;
> +	int ret = 0;
> +
> +	switch (vsi->type) {
> +	case ICE_VSI_PF:
> +		ctxt.flags = ICE_AQ_VSI_TYPE_PF;
> +		break;
> +	default:
> +		return -ENODEV;
> +	}
> +
> +	ice_set_def_vsi_ctx(&ctxt);
> +	/* if the switch is in VEB mode, allow VSI loopback */
> +	if (vsi->vsw->bridge_mode == BRIDGE_MODE_VEB)
> +		ctxt.info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB;
> +
> +	ctxt.info.sw_id = vsi->port_info->sw_id;
> +	ice_vsi_setup_q_map(vsi, &ctxt);
> +
> +	ret = ice_aq_add_vsi(hw, &ctxt, NULL);
> +	if (ret) {
> +		dev_err(&vsi->back->pdev->dev,
> +			"Add VSI AQ call failed, err %d\n", ret);
> +		return -EIO;
> +	}
> +	vsi->info = ctxt.info;
> +	vsi->vsi_num = ctxt.vsi_num;
> +
> +	return ret;
> +}
> +
> +/**
> + * ice_vsi_clear_rings - Deallocates the Tx and Rx rings for VSI
> + * @vsi: the VSI having rings deallocated
> + */
> +static void ice_vsi_clear_rings(struct ice_vsi *vsi)
> +{
> +	int i;
> +
> +	if (vsi->tx_rings) {
> +		for (i = 0; i < vsi->alloc_txq; i++) {
> +			if (vsi->tx_rings[i]) {
> +				kfree_rcu(vsi->tx_rings[i], rcu);
> +				vsi->tx_rings[i] = NULL;
> +			}
> +		}
> +	}
> +	if (vsi->rx_rings) {
> +		for (i = 0; i < vsi->alloc_rxq; i++) {
> +			if (vsi->rx_rings[i]) {
> +				kfree_rcu(vsi->rx_rings[i], rcu);
> +				vsi->rx_rings[i] = NULL;
> +			}
> +		}
> +	}
> +}
> +
> +/**
> + * ice_vsi_alloc_rings - Allocates Tx and Rx rings for the VSI
> + * @vsi: VSI which is having rings allocated
> + */
> +static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int i;
> +
> +	/* Allocate tx_rings */
> +	for (i = 0; i < vsi->alloc_txq; i++) {
> +		struct ice_ring *ring;
> +
> +		/* allocate with kzalloc(), free with kfree_rcu() */
> +		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
> +
> +		if (!ring)
> +			goto err_out;
> +
> +		ring->q_index = i;
> +		ring->reg_idx = vsi->txq_map[i];
> +		ring->ring_active = false;
> +		ring->vsi = vsi;
> +		ring->netdev = vsi->netdev;
> +		ring->dev = &pf->pdev->dev;
> +		ring->count = vsi->num_desc;
> +
> +		vsi->tx_rings[i] = ring;
> +	}
> +
> +	/* Allocate rx_rings */
> +	for (i = 0; i < vsi->alloc_rxq; i++) {
> +		struct ice_ring *ring;
> +
> +		/* allocate with kzalloc(), free with kfree_rcu() */
> +		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
> +		if (!ring)
> +			goto err_out;
> +
> +		ring->q_index = i;
> +		ring->reg_idx = vsi->rxq_map[i];
> +		ring->ring_active = false;
> +		ring->vsi = vsi;
> +		ring->netdev = vsi->netdev;
> +		ring->dev = &pf->pdev->dev;
> +		ring->count = vsi->num_desc;
> +		vsi->rx_rings[i] = ring;
> +	}
> +
> +	return 0;
> +
> +err_out:
> +	ice_vsi_clear_rings(vsi);
> +	return -ENOMEM;
> +}
> +
>   /**
>    * ice_ena_misc_vector - enable the non-queue interrupts
>    * @pf: board private structure
> @@ -426,6 +721,188 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
>   	return ret;
>   }
>   
> +/**
> + * ice_vsi_map_rings_to_vectors - Map VSI rings to interrupt vectors
> + * @vsi: the VSI being configured
> + *
> + * This function maps descriptor rings to the queue-specific vectors allotted
> + * through the MSI-X enabling code. On a constrained vector budget, we map Tx
> + * and Rx rings to the vector as "efficiently" as possible.
> + */
> +static void ice_vsi_map_rings_to_vectors(struct ice_vsi *vsi)
> +{
> +	int q_vectors = vsi->num_q_vectors;
> +	int tx_rings_rem, rx_rings_rem;
> +	int v_id;
> +
> +	/* initially assigning remaining rings count to VSIs num queue value */
> +	tx_rings_rem = vsi->num_txq;
> +	rx_rings_rem = vsi->num_rxq;
> +
> +	for (v_id = 0; v_id < q_vectors; v_id++) {
> +		struct ice_q_vector *q_vector = vsi->q_vectors[v_id];
> +		int tx_rings_per_v, rx_rings_per_v, q_id, q_base;
> +
> +		/* Tx rings mapping to vector */
> +		tx_rings_per_v = DIV_ROUND_UP(tx_rings_rem, q_vectors - v_id);
> +		q_vector->num_ring_tx = tx_rings_per_v;
> +		q_vector->tx.ring = NULL;
> +		q_base = vsi->num_txq - tx_rings_rem;
> +
> +		for (q_id = q_base; q_id < (q_base + tx_rings_per_v); q_id++) {
> +			struct ice_ring *tx_ring = vsi->tx_rings[q_id];
> +
> +			tx_ring->q_vector = q_vector;
> +			tx_ring->next = q_vector->tx.ring;
> +			q_vector->tx.ring = tx_ring;
> +		}
> +		tx_rings_rem -= tx_rings_per_v;
> +
> +		/* Rx rings mapping to vector */
> +		rx_rings_per_v = DIV_ROUND_UP(rx_rings_rem, q_vectors - v_id);
> +		q_vector->num_ring_rx = rx_rings_per_v;
> +		q_vector->rx.ring = NULL;
> +		q_base = vsi->num_rxq - rx_rings_rem;
> +
> +		for (q_id = q_base; q_id < (q_base + rx_rings_per_v); q_id++) {
> +			struct ice_ring *rx_ring = vsi->rx_rings[q_id];
> +
> +			rx_ring->q_vector = q_vector;
> +			rx_ring->next = q_vector->rx.ring;
> +			q_vector->rx.ring = rx_ring;
> +		}
> +		rx_rings_rem -= rx_rings_per_v;
> +	}
> +}
> +
> +/**
> + * ice_vsi_set_num_qs - Set num queues, descriptors and vectors for a VSI
> + * @vsi: the VSI being configured
> + *
> + * Return 0 on success and a negative value on error
> + */
> +static void ice_vsi_set_num_qs(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +
> +	switch (vsi->type) {
> +	case ICE_VSI_PF:
> +		vsi->alloc_txq = pf->num_lan_tx;
> +		vsi->alloc_rxq = pf->num_lan_rx;
> +		vsi->num_desc = ALIGN(ICE_DFLT_NUM_DESC, ICE_REQ_DESC_MULTIPLE);
> +		vsi->num_q_vectors = max_t(int, pf->num_lan_rx, pf->num_lan_tx);
> +		break;
> +	default:
> +		dev_warn(&vsi->back->pdev->dev, "Unknown VSI type %d\n",
> +			 vsi->type);

There should be a break or return here.

> +	}
> +}
> +
> +/**
> + * ice_vsi_alloc_arrays - Allocate queue and vector pointer arrays for the vsi
> + * @vsi: VSI pointer
> + * @alloc_qvectors: a bool to specify if q_vectors need to be allocated.
> + *
> + * On error: returns error code (negative)
> + * On success: returns 0
> + */
> +static int ice_vsi_alloc_arrays(struct ice_vsi *vsi, bool alloc_qvectors)
> +{
> +	struct ice_pf *pf = vsi->back;
> +
> +	/* allocate memory for both Tx and Rx ring pointers */
> +	vsi->tx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_txq,
> +				     sizeof(struct ice_ring *), GFP_KERNEL);
> +	if (!vsi->tx_rings)
> +		goto err_txrings;
> +
> +	vsi->rx_rings = devm_kcalloc(&pf->pdev->dev, vsi->alloc_rxq,
> +				     sizeof(struct ice_ring *), GFP_KERNEL);
> +	if (!vsi->rx_rings)
> +		goto err_rxrings;
> +
> +	if (alloc_qvectors) {
> +		/* allocate memory for q_vector pointers */
> +		vsi->q_vectors = devm_kcalloc(&pf->pdev->dev,
> +					      vsi->num_q_vectors,
> +					      sizeof(struct ice_q_vector *),
> +					      GFP_KERNEL);
> +		if (!vsi->q_vectors)
> +			goto err_vectors;
> +	}
> +
> +	return 0;
> +
> +err_vectors:
> +	devm_kfree(&pf->pdev->dev, vsi->rx_rings);
> +err_rxrings:
> +	devm_kfree(&pf->pdev->dev, vsi->tx_rings);
> +err_txrings:
> +	return -ENOMEM;
> +}
> +
> +/**
> + * ice_vsi_alloc - Allocates the next available struct vsi in the PF
> + * @pf: board private structure
> + * @type: type of VSI
> + *
> + * returns a pointer to a VSI on success, NULL on failure.
> + */
> +static struct ice_vsi *ice_vsi_alloc(struct ice_pf *pf, enum ice_vsi_type type)
> +{
> +	struct ice_vsi *vsi = NULL;
> +
> +	/* Need to protect the allocation of the VSIs at the PF level */
> +	mutex_lock(&pf->sw_mutex);
> +
> +	/* If we have already allocated our maximum number of VSIs,
> +	 * pf->next_vsi will be 0xffff. If not, pf->next_vsi index
> +	 * is available to be populated
> +	 */
> +	if (pf->next_vsi == 0xffff) {

Magic number

> +		dev_dbg(&pf->pdev->dev, "out of VSI slots!\n");
> +		goto unlock_pf;
> +	}
> +
> +	vsi = devm_kzalloc(&pf->pdev->dev, sizeof(*vsi), GFP_KERNEL);
> +	if (!vsi)
> +		goto unlock_pf;
> +
> +	vsi->type = type;
> +	vsi->back = pf;
> +	set_bit(__ICE_DOWN, vsi->state);
> +	vsi->idx = pf->next_vsi;
> +	vsi->work_lmt = ICE_DFLT_IRQ_WORK;
> +
> +	ice_vsi_set_num_qs(vsi);
> +
> +	switch (vsi->type) {
> +	case ICE_VSI_PF:
> +		if (ice_vsi_alloc_arrays(vsi, true))
> +			goto err_rings;
> +
> +		break;
> +	default:
> +		dev_warn(&pf->pdev->dev, "Unknown VSI type %d\n", vsi->type);
> +		goto unlock_pf;
> +	}
> +
> +	/* fill VSI slot in the PF struct */
> +	pf->vsi[pf->next_vsi] = vsi;
> +
> +	/* prepare pf->next_vsi for next use */
> +	pf->next_vsi = ice_get_free_slot(pf->vsi, pf->num_alloc_vsi,
> +					 pf->next_vsi);
> +	goto unlock_pf;
> +
> +err_rings:
> +	devm_kfree(&pf->pdev->dev, vsi);
> +	vsi = NULL;
> +unlock_pf:
> +	mutex_unlock(&pf->sw_mutex);
> +	return vsi;
> +}
> +
>   /**
>    * ice_free_irq_msix_misc - Unroll misc vector setup
>    * @pf: board private structure
> @@ -507,6 +984,579 @@ static int ice_req_irq_msix_misc(struct ice_pf *pf)
>   	return 0;
>   }
>   
> +/**
> + * ice_vsi_get_qs_contig - Assign a contiguous chunk of queues to VSI
> + * @vsi: the VSI getting queues
> + *
> + * Return 0 on success and a negative value on error
> + */
> +static int ice_vsi_get_qs_contig(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int offset, ret = 0;
> +
> +	mutex_lock(&pf->avail_q_mutex);
> +	/* look for contiguous block of queues for tx */
> +	offset = bitmap_find_next_zero_area(pf->avail_txqs, ICE_MAX_TXQS,
> +					    0, vsi->alloc_txq, 0);
> +	if (offset < ICE_MAX_TXQS) {
> +		int i;
> +
> +		bitmap_set(pf->avail_txqs, offset, vsi->alloc_txq);
> +		for (i = 0; i < vsi->alloc_txq; i++)
> +			vsi->txq_map[i] = i + offset;
> +	} else {
> +		ret = -ENOMEM;
> +		vsi->tx_mapping_mode = ICE_VSI_MAP_SCATTER;
> +	}
> +
> +	/* look for contiguous block of queues for rx */
> +	offset = bitmap_find_next_zero_area(pf->avail_rxqs, ICE_MAX_RXQS,
> +					    0, vsi->alloc_rxq, 0);
> +	if (offset < ICE_MAX_RXQS) {
> +		int i;
> +
> +		bitmap_set(pf->avail_rxqs, offset, vsi->alloc_rxq);
> +		for (i = 0; i < vsi->alloc_rxq; i++)
> +			vsi->rxq_map[i] = i + offset;
> +	} else {
> +		ret = -ENOMEM;
> +		vsi->rx_mapping_mode = ICE_VSI_MAP_SCATTER;
> +	}
> +	mutex_unlock(&pf->avail_q_mutex);
> +
> +	return ret;
> +}
> +
> +/**
> + * ice_vsi_get_qs_scatter - Assign a scattered queues to VSI
> + * @vsi: the VSI getting queues
> + *
> + * Return 0 on success and a negative value on error
> + */
> +static int ice_vsi_get_qs_scatter(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int i, index = 0;
> +
> +	mutex_lock(&pf->avail_q_mutex);
> +
> +	if (vsi->tx_mapping_mode == ICE_VSI_MAP_SCATTER) {
> +		for (i = 0; i < vsi->alloc_txq; i++) {
> +			index = find_next_zero_bit(pf->avail_txqs,
> +						   ICE_MAX_TXQS, index);
> +			if (index < ICE_MAX_TXQS) {
> +				set_bit(index, pf->avail_txqs);
> +				vsi->txq_map[i] = index;
> +			} else {
> +				goto err_scatter_tx;
> +			}
> +		}
> +	}
> +
> +	if (vsi->rx_mapping_mode == ICE_VSI_MAP_SCATTER)
> +		for (i = 0; i < vsi->alloc_rxq; i++) {
> +			index = find_next_zero_bit(pf->avail_rxqs,
> +						   ICE_MAX_RXQS, index);
> +			if (index < ICE_MAX_RXQS) {
> +				set_bit(index, pf->avail_rxqs);
> +				vsi->rxq_map[i] = index;
> +			} else {
> +				goto err_scatter_rx;
> +			}
> +		}

Put {}'s around this if block, like you did on the previous one.

> +
> +	mutex_unlock(&pf->avail_q_mutex);
> +	return 0;
> +
> +err_scatter_rx:
> +	/* unflag any queues we have grabbed (i is failed position) */
> +	for (index = 0; index < i; index++) {
> +		clear_bit(vsi->rxq_map[index], pf->avail_rxqs);
> +		vsi->rxq_map[index] = 0;
> +	}
> +	i = vsi->alloc_txq;
> +err_scatter_tx:
> +	/* i is either position of failed attempt or vsi->alloc_txq */
> +	for (index = 0; index < i; index++) {
> +		clear_bit(vsi->txq_map[index], pf->avail_txqs);
> +		vsi->txq_map[index] = 0;
> +	}
> +
> +	mutex_unlock(&pf->avail_q_mutex);
> +	return -ENOMEM;
> +}
> +
> +/**
> + * ice_vsi_get_qs - Assign queues from PF to VSI
> + * @vsi: the VSI to assign queues to
> + *
> + * Returns 0 on success and a negative value on error
> + */
> +static int ice_vsi_get_qs(struct ice_vsi *vsi)
> +{
> +	int ret = 0;
> +
> +	vsi->tx_mapping_mode = ICE_VSI_MAP_CONTIG;
> +	vsi->rx_mapping_mode = ICE_VSI_MAP_CONTIG;
> +
> +	/* NOTE: ice_vsi_get_qs_contig() will set the rx/tx mapping
> +	 * modes individually to scatter if assigning contiguous queues
> +	 * to rx or tx fails
> +	 */
> +	ret = ice_vsi_get_qs_contig(vsi);
> +	if (ret < 0) {
> +		if (vsi->tx_mapping_mode == ICE_VSI_MAP_SCATTER)
> +			vsi->alloc_txq = max_t(u16, vsi->alloc_txq,
> +					       ICE_MAX_SCATTER_TXQS);
> +		if (vsi->rx_mapping_mode == ICE_VSI_MAP_SCATTER)
> +			vsi->alloc_rxq = max_t(u16, vsi->alloc_rxq,
> +					       ICE_MAX_SCATTER_RXQS);
> +		ret = ice_vsi_get_qs_scatter(vsi);
> +	}
> +
> +	return ret;
> +}
> +
> +/**
> + * ice_vsi_put_qs - Release queues from VSI to PF
> + * @vsi: the VSI thats going to release queues
> + */
> +static void ice_vsi_put_qs(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int i;
> +
> +	mutex_lock(&pf->avail_q_mutex);
> +
> +	for (i = 0; i < vsi->alloc_txq; i++) {
> +		clear_bit(vsi->txq_map[i], pf->avail_txqs);
> +		vsi->txq_map[i] = ICE_INVAL_Q_INDEX;
> +	}
> +
> +	for (i = 0; i < vsi->alloc_rxq; i++) {
> +		clear_bit(vsi->rxq_map[i], pf->avail_rxqs);
> +		vsi->rxq_map[i] = ICE_INVAL_Q_INDEX;
> +	}
> +
> +	mutex_unlock(&pf->avail_q_mutex);
> +}
> +
> +/**
> + * ice_free_q_vector - Free memory allocated for a specific interrupt vector
> + * @vsi: VSI having the memory freed
> + * @v_idx: index of the vector to be freed
> + */
> +static void ice_free_q_vector(struct ice_vsi *vsi, int v_idx)
> +{
> +	struct ice_q_vector *q_vector;
> +	struct ice_ring *ring;
> +
> +	if (!vsi->q_vectors[v_idx]) {
> +		dev_dbg(&vsi->back->pdev->dev, "Queue vector at index %d not found\n",
> +			v_idx);
> +		return;
> +	}
> +	q_vector = vsi->q_vectors[v_idx];
> +
> +	ice_for_each_ring(ring, q_vector->tx)
> +		ring->q_vector = NULL;
> +	ice_for_each_ring(ring, q_vector->rx)
> +		ring->q_vector = NULL;
> +
> +	/* only VSI with an associated netdev is set up with NAPI */
> +	if (vsi->netdev)
> +		netif_napi_del(&q_vector->napi);
> +
> +	devm_kfree(&vsi->back->pdev->dev, q_vector);
> +	vsi->q_vectors[v_idx] = NULL;
> +}
> +
> +/**
> + * ice_vsi_free_q_vectors - Free memory allocated for interrupt vectors
> + * @vsi: the VSI having memory freed
> + */
> +static void ice_vsi_free_q_vectors(struct ice_vsi *vsi)
> +{
> +	int v_idx;
> +
> +	for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++)
> +		ice_free_q_vector(vsi, v_idx);
> +}
> +
> +/**
> + * ice_cfg_netdev - Setup the netdev flags
> + * @vsi: the VSI being configured
> + *
> + * Returns 0 on success, negative value on failure
> + */
> +static int ice_cfg_netdev(struct ice_vsi *vsi)
> +{
> +	struct ice_netdev_priv *np;
> +	struct net_device *netdev;
> +	u8 mac_addr[ETH_ALEN];
> +
> +	netdev = alloc_etherdev_mqs(sizeof(struct ice_netdev_priv),
> +				    vsi->alloc_txq, vsi->alloc_rxq);
> +	if (!netdev)
> +		return -ENOMEM;
> +
> +	vsi->netdev = netdev;
> +	np = netdev_priv(netdev);
> +	np->vsi = vsi;
> +
> +	/* set features that user can change */
> +	netdev->hw_features = NETIF_F_SG	|
> +			      NETIF_F_HIGHDMA	|
> +			      NETIF_F_RXHASH;
> +
> +	/* enable features */
> +	netdev->features |= netdev->hw_features;
> +
> +	if (vsi->type == ICE_VSI_PF) {
> +		SET_NETDEV_DEV(netdev, &vsi->back->pdev->dev);
> +		ether_addr_copy(mac_addr, vsi->port_info->mac.perm_addr);
> +
> +		ether_addr_copy(netdev->dev_addr, mac_addr);
> +		ether_addr_copy(netdev->perm_addr, mac_addr);
> +	}
> +
> +	netdev->priv_flags |= IFF_UNICAST_FLT;
> +
> +	/* setup watchdog timeout value to be 5 second */
> +	netdev->watchdog_timeo = 5 * HZ;
> +
> +	netdev->min_mtu = ETH_MIN_MTU;
> +	netdev->max_mtu = ICE_MAX_MTU;
> +
> +	return 0;
> +}
> +
> +/**
> + * ice_vsi_free_arrays - clean up vsi resources
> + * @vsi: pointer to VSI being cleared
> + * @free_qvectors: bool to specify if q_vectors should be deallocated
> + */
> +static void ice_vsi_free_arrays(struct ice_vsi *vsi, bool free_qvectors)
> +{
> +	struct ice_pf *pf = vsi->back;
> +
> +	/* free the ring and vector containers */
> +	if (free_qvectors && vsi->q_vectors) {
> +		devm_kfree(&pf->pdev->dev, vsi->q_vectors);
> +		vsi->q_vectors = NULL;
> +	}
> +	if (vsi->tx_rings) {
> +		devm_kfree(&pf->pdev->dev, vsi->tx_rings);
> +		vsi->tx_rings = NULL;
> +	}
> +	if (vsi->rx_rings) {
> +		devm_kfree(&pf->pdev->dev, vsi->rx_rings);
> +		vsi->rx_rings = NULL;
> +	}
> +}
> +
> +/**
> + * ice_vsi_clear - clean up and deallocate the provided vsi
> + * @vsi: pointer to VSI being cleared
> + *
> + * This deallocates the vsi's queue resources, removes it from the PF's
> + * VSI array if necessary, and deallocates the VSI
> + *
> + * Returns 0 on success, negative on failure
> + */
> +static int ice_vsi_clear(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = NULL;
> +
> +	if (!vsi)
> +		return 0;
> +
> +	if (!vsi->back)
> +		return -EINVAL;
> +
> +	pf = vsi->back;
> +
> +	if (!pf->vsi[vsi->idx] || pf->vsi[vsi->idx] != vsi) {
> +		dev_dbg(&pf->pdev->dev, "vsi does not exist at pf->vsi[%d]\n",
> +			vsi->idx);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&pf->sw_mutex);
> +	/* updates the PF for this cleared vsi */
> +
> +	pf->vsi[vsi->idx] = NULL;
> +	if (vsi->idx < pf->next_vsi)
> +		pf->next_vsi = vsi->idx;
> +
> +	ice_vsi_free_arrays(vsi, true);
> +	mutex_unlock(&pf->sw_mutex);
> +	devm_kfree(&pf->pdev->dev, vsi);
> +
> +	return 0;
> +}
> +
> +/**
> + * ice_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
> + * @vsi: the VSI being configured
> + * @v_idx: index of the vector in the vsi struct
> + *
> + * We allocate one q_vector.  If allocation fails we return -ENOMEM.
> + */
> +static int ice_vsi_alloc_q_vector(struct ice_vsi *vsi, int v_idx)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	struct ice_q_vector *q_vector;
> +
> +	/* allocate q_vector */
> +	q_vector = devm_kzalloc(&pf->pdev->dev, sizeof(*q_vector), GFP_KERNEL);
> +	if (!q_vector)
> +		return -ENOMEM;
> +
> +	q_vector->vsi = vsi;
> +	q_vector->v_idx = v_idx;
> +	/* only set affinity_mask if the CPU is online */
> +	if (cpu_online(v_idx))
> +		cpumask_set_cpu(v_idx, &q_vector->affinity_mask);
> +
> +	/* tie q_vector and vsi together */
> +	vsi->q_vectors[v_idx] = q_vector;
> +
> +	return 0;
> +}
> +
> +/**
> + * ice_vsi_alloc_q_vectors - Allocate memory for interrupt vectors
> + * @vsi: the VSI being configured
> + *
> + * We allocate one q_vector per queue interrupt.  If allocation fails we
> + * return -ENOMEM.
> + */
> +static int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int v_idx = 0, num_q_vectors;
> +	int err;
> +
> +	if (vsi->q_vectors[0]) {
> +		dev_dbg(&pf->pdev->dev, "VSI %d has existing q_vectors\n",
> +			vsi->vsi_num);
> +		return -EEXIST;
> +	}
> +
> +	if (test_bit(ICE_FLAG_MSIX_ENA, pf->flags)) {
> +		num_q_vectors = vsi->num_q_vectors;
> +	} else {
> +		err = -EINVAL;
> +		goto err_out;
> +	}
> +
> +	for (v_idx = 0; v_idx < num_q_vectors; v_idx++) {
> +		err = ice_vsi_alloc_q_vector(vsi, v_idx);
> +		if (err)
> +			goto err_out;
> +	}
> +
> +	return 0;
> +
> +err_out:
> +	while (v_idx--)
> +		ice_free_q_vector(vsi, v_idx);
> +
> +	dev_err(&pf->pdev->dev,
> +		"Failed to allocate %d q_vector for VSI %d, ret=%d\n",
> +		vsi->num_q_vectors, vsi->vsi_num, err);
> +	vsi->num_q_vectors = 0;
> +	return err;
> +}
> +
> +/**
> + * ice_vsi_setup_vector_base - Set up the base vector for the given VSI
> + * @vsi: ptr to the VSI
> + *
> + * This should only be called after ice_vsi_alloc() which allocates the
> + * corresponding SW VSI structure and initializes num_queue_pairs for the
> + * newly allocated VSI.
> + *
> + * Returns 0 on success or negative on failure
> + */
> +static int ice_vsi_setup_vector_base(struct ice_vsi *vsi)
> +{
> +	struct ice_pf *pf = vsi->back;
> +	int num_q_vectors = 0;
> +
> +	if (vsi->base_vector) {
> +		dev_dbg(&pf->pdev->dev, "VSI %d has non-zero base vector %d\n",
> +			vsi->vsi_num, vsi->base_vector);
> +		return -EEXIST;
> +	}
> +
> +	if (!test_bit(ICE_FLAG_MSIX_ENA, pf->flags))
> +		return -ENOENT;
> +
> +	switch (vsi->type) {
> +	case ICE_VSI_PF:
> +		num_q_vectors = vsi->num_q_vectors;
> +		break;
> +	default:
> +		dev_warn(&vsi->back->pdev->dev, "Unknown VSI type %d\n",
> +			 vsi->type);

There should be a break here

sln

^ permalink raw reply

* [PATCH net-next 0/4] net: qualcomm: rmnet: Updates 2018-03-12
From: Subash Abhinov Kasiviswanathan @ 2018-03-13  2:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan

This series contains some minor updates for rmnet driver.

Patch 1 contains fixes for sparse warnings.
Patch 2 updates the copyright date to 2018.
Patch 3 is a cleanup in receive path.
Patch 4 has the implementation of the fill_info operation.

Subash Abhinov Kasiviswanathan (4):
  net: qualcomm: rmnet: Fix casting issues
  net: qualcomm: rmnet: Update copyright year to 2018
  net: qualcomm: rmnet: Remove unnecessary device assignment
  net: qualcomm: rmnet: Implement fill_info

 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 32 +++++++++++++++++++++-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h   |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h    |  4 +--
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c    |  6 ++--
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   |  8 +++---
 .../net/ethernet/qualcomm/rmnet/rmnet_private.h    |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c    |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h    |  2 +-
 10 files changed, 46 insertions(+), 16 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [PATCH net-next 1/4] net: qualcomm: rmnet: Fix casting issues
From: Subash Abhinov Kasiviswanathan @ 2018-03-13  2:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520907969-16696-1-git-send-email-subashab@codeaurora.org>

Fix warnings which were reported when running with sparse
(make C=1 CF=-D__CHECK_ENDIAN__)

drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c:81:15:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:271:37:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] <noident>
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:287:29:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] <noident>
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:310:22:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:319:13:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:49:18:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:50:18:
warning: cast to restricted __be32
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:74:21:
warning: cast to restricted __be16

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h         | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 4 ++--
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c    | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 6ce31e2..65b074e 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -44,7 +44,7 @@ struct rmnet_map_header {
 	u8  reserved_bit:1;
 	u8  cd_bit:1;
 	u8  mux_id;
-	u16 pkt_len;
+	__be16 pkt_len;
 }  __aligned(1);
 
 struct rmnet_map_dl_csum_trailer {
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index b0dbca0..b39b73b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -46,8 +46,8 @@ static u8 rmnet_map_do_flow_control(struct sk_buff *skb,
 	vnd = ep->egress_dev;
 
 	ip_family = cmd->flow_control.ip_family;
-	fc_seq = ntohs(cmd->flow_control.flow_control_seq_num);
-	qos_id = ntohl(cmd->flow_control.qos_id);
+	fc_seq = ntohs((__force __be16)cmd->flow_control.flow_control_seq_num);
+	qos_id = ntohl((__force __be32)cmd->flow_control.qos_id);
 
 	/* Ignore the ip family and pass the sequence number for both v4 and v6
 	 * sequence. User space does not support creating dedicated flows for
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index c74a6c5..4e342a3 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -307,7 +307,8 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 		return NULL;
 
 	maph = (struct rmnet_map_header *)skb->data;
-	packet_len = ntohs(maph->pkt_len) + sizeof(struct rmnet_map_header);
+	packet_len = ntohs((__force __be16)maph->pkt_len) +
+		     sizeof(struct rmnet_map_header);
 
 	if (port->data_format & RMNET_INGRESS_FORMAT_MAP_CKSUMV4)
 		packet_len += sizeof(struct rmnet_map_dl_csum_trailer);
@@ -316,7 +317,7 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 		return NULL;
 
 	/* Some hardware can send us empty frames. Catch them */
-	if (ntohs(maph->pkt_len) == 0)
+	if (ntohs((__force __be16)maph->pkt_len) == 0)
 		return NULL;
 
 	skbn = alloc_skb(packet_len + RMNET_MAP_DEAGGR_SPACING, GFP_ATOMIC);
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 2/4] net: qualcomm: rmnet: Update copyright year to 2018
From: Subash Abhinov Kasiviswanathan @ 2018-03-13  2:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520907969-16696-1-git-send-email-subashab@codeaurora.org>

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c      | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h      | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h         | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h     | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c         | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h         | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index c494918..096301a 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 00e4634..0b5b5da 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2014, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 601edec..c758248 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
index 3537e4c..1bc6828 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 65b074e..2aaf093 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index b39b73b..88f2595 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 4e342a3..fad3d0b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
index de0143e..98365ef 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2014, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index 346d310..2ea16a0 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
index 71e4c32..daab75c 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 3/4] net: qualcomm: rmnet: Remove unnecessary device assignment
From: Subash Abhinov Kasiviswanathan @ 2018-03-13  2:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520907969-16696-1-git-send-email-subashab@codeaurora.org>

Device of the de-aggregated skb is correctly assigned after inspecting
the mux_id, so remove the assignment in rmnet_map_deaggregate().

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index fad3d0b..134da43 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -324,7 +324,6 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 	if (!skbn)
 		return NULL;
 
-	skbn->dev = skb->dev;
 	skb_reserve(skbn, RMNET_MAP_DEAGGR_HEADROOM);
 	skb_put(skbn, packet_len);
 	memcpy(skbn->data, skb->data, packet_len);
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 4/4] net: qualcomm: rmnet: Implement fill_info
From: Subash Abhinov Kasiviswanathan @ 2018-03-13  2:26 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520907969-16696-1-git-send-email-subashab@codeaurora.org>

This is needed to query the mux_id and flags of a rmnet device.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 096301a..d0f3e0f 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -331,6 +331,35 @@ static size_t rmnet_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(struct ifla_vlan_flags)); /* IFLA_VLAN_FLAGS */
 }
 
+static int rmnet_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct rmnet_priv *priv = netdev_priv(dev);
+	struct net_device *real_dev;
+	struct ifla_vlan_flags f;
+	struct rmnet_port *port;
+
+	real_dev = priv->real_dev;
+
+	if (!rmnet_is_real_dev_registered(real_dev))
+		return -ENODEV;
+
+	if (nla_put_u16(skb, IFLA_VLAN_ID, priv->mux_id))
+		goto nla_put_failure;
+
+	port = rmnet_get_port_rtnl(real_dev);
+
+	f.flags = port->data_format;
+	f.mask  = ~0;
+
+	if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 struct rtnl_link_ops rmnet_link_ops __read_mostly = {
 	.kind		= "rmnet",
 	.maxtype	= __IFLA_VLAN_MAX,
@@ -341,6 +370,7 @@ struct rtnl_link_ops rmnet_link_ops __read_mostly = {
 	.dellink	= rmnet_dellink,
 	.get_size	= rmnet_get_size,
 	.changelink     = rmnet_changelink,
+	.fill_info	= rmnet_fill_info,
 };
 
 /* Needs either rcu_read_lock() or rtnl lock */
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH] net: hns: use put_device() if device_register fail
From: arvindY @ 2018-03-13  2:57 UTC (permalink / raw)
  To: Richard Weinberger
  Cc: David Miller, yisen.zhuang, salil.mehta, linyunsheng, LKML,
	netdev, linux-mtd
In-Reply-To: <CAFLxGvw1U2OZn4kU1O6uJ17yWYf1deE=+Cr56Foffdq7Q=5u3A@mail.gmail.com>



On Monday 12 March 2018 10:59 PM, Richard Weinberger wrote:
> On Mon, Mar 12, 2018 at 5:27 PM, arvindY <arvind.yadav.cs@gmail.com> wrote:
>>
>> On Monday 12 March 2018 08:13 PM, David Miller wrote:
>>> From: Arvind Yadav <arvind.yadav.cs@gmail.com>
>>> Date: Fri,  9 Mar 2018 16:11:17 +0530
>>>
>>>> if device_register() returned an error! Always use put_device()
>>>> to give up the reference initialized.
>>>>
>>>> Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
>>> I do not see anything giving cls_dev an initial non-zero reference
>>> count before this device_register() call.
>> Yes,  you are correct there is nothing to release (hnae_release).
> Wait, this is not what DaveM said.
> Since you sent also patches to MTD I care about this too.
>
> Do we have to call put_device() in any case after a failure of
> device_register() or not?
> In this case the release function is empty, so nothing is going to released.
> But technically a put_device() is needed and correct according to your
> change log.
>
> I have to admit I don't know all details of the driver core, maybe you
> can help me.
> device_register() calls device_initialize() followed by device_add().
> device_initialize() does a kobject_init() which again sets the
> reference counter to 1 via kref_init().
> In the next step device_add() does a get_device() --> reference
> counter goes up to 2.
> But in the exit path of the function a put_device() is done, also in
> case of an error.
> So device_register() always returns with reference count 1.
>
> If I understand this correctly a put_device() is mandatory.
> Also in this driver.
> Can you please enlighten me? :-)
>
Oh, I just miss understood david question.

But what ever you are telling that is correct.
device_initialize() will call kref_init() which is setting
refcount 1 by refcount_set(&kref->refcount, 1).
and device_add() will call kref_get() to increment refcount
by calling refcount_inc(&kref->refcount).
So we need put_device() to decrement and release the
kboject.
Internally it'll call kref_put() -> refcount_dec_and_test(&kref->refcount)
to decrement refcount.

~arvind

^ permalink raw reply

* [PATCH net-next v2] sctp: fix error return code in sctp_sendmsg_new_asoc()
From: Wei Yongjun @ 2018-03-13  3:03 UTC (permalink / raw)
  To: Vlad Yasevich, Neil Horman, Xin Long
  Cc: Wei Yongjun, linux-sctp, netdev, kernel-janitors
In-Reply-To: <1520856964-132516-1-git-send-email-weiyongjun1@huawei.com>

Return error code -EINVAL in the address len check error handling
case since 'err' can be overwrite to 0 by 'err = sctp_verify_addr()'
in the for loop.

Fixes: 2c0dbaa0c43d ("sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v1 -> v2: remove the 'err' initialization
---
 net/sctp/socket.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7d3476a..af5cf29 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1677,7 +1677,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 	struct sctp_association *asoc;
 	enum sctp_scope scope;
 	struct cmsghdr *cmsg;
-	int err = -EINVAL;
+	int err;
 
 	*tp = NULL;
 
@@ -1761,16 +1761,20 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 		memset(daddr, 0, sizeof(*daddr));
 		dlen = cmsg->cmsg_len - sizeof(struct cmsghdr);
 		if (cmsg->cmsg_type == SCTP_DSTADDRV4) {
-			if (dlen < sizeof(struct in_addr))
+			if (dlen < sizeof(struct in_addr)) {
+				err = -EINVAL;
 				goto free;
+			}
 
 			dlen = sizeof(struct in_addr);
 			daddr->v4.sin_family = AF_INET;
 			daddr->v4.sin_port = htons(asoc->peer.port);
 			memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen);
 		} else {
-			if (dlen < sizeof(struct in6_addr))
+			if (dlen < sizeof(struct in6_addr)) {
+				err = -EINVAL;
 				goto free;
+			}
 
 			dlen = sizeof(struct in6_addr);
 			daddr->v6.sin6_family = AF_INET6;

^ permalink raw reply related

* [PATCH v2] netfilter: nf_tables: remove VLA usage
From: Gustavo A. R. Silva @ 2018-03-13  3:16 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	David S. Miller
  Cc: netfilter-devel, coreteam, netdev, linux-kernel,
	Gustavo A. R. Silva

In preparation to enabling -Wvla, remove VLA and replace it
with dynamic memory allocation.

>From a security viewpoint, the use of Variable Length Arrays can be
a vector for stack overflow attacks. Also, in general, as the code
evolves it is easy to lose track of how big a VLA can get. Thus, we
can end up having segfaults that are hard to debug.

Also, fixed as part of the directive to remove all VLAs from
the kernel: https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
Changes in v2:
 - Use kmalloc_array instead of kcalloc.

 net/netfilter/nf_tables_api.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3f815b6..5a42e97 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4357,16 +4357,20 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 				       const struct nft_object_type *type,
 				       const struct nlattr *attr)
 {
-	struct nlattr *tb[type->maxattr + 1];
+	struct nlattr **tb;
 	const struct nft_object_ops *ops;
 	struct nft_object *obj;
-	int err;
+	int err = -ENOMEM;
+
+	tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL);
+	if (!tb)
+		goto err1;
 
 	if (attr) {
 		err = nla_parse_nested(tb, type->maxattr, attr, type->policy,
 				       NULL);
 		if (err < 0)
-			goto err1;
+			goto err2;
 	} else {
 		memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
 	}
@@ -4375,7 +4379,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 		ops = type->select_ops(ctx, (const struct nlattr * const *)tb);
 		if (IS_ERR(ops)) {
 			err = PTR_ERR(ops);
-			goto err1;
+			goto err2;
 		}
 	} else {
 		ops = type->ops;
@@ -4383,18 +4387,21 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 
 	err = -ENOMEM;
 	obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
-	if (obj == NULL)
-		goto err1;
+	if (!obj)
+		goto err2;
 
 	err = ops->init(ctx, (const struct nlattr * const *)tb, obj);
 	if (err < 0)
-		goto err2;
+		goto err3;
 
 	obj->ops = ops;
 
+	kfree(tb);
 	return obj;
-err2:
+err3:
 	kfree(obj);
+err2:
+	kfree(tb);
 err1:
 	return ERR_PTR(err);
 }
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3 net-next 1/4] net: macb: Reorganize macb_mii bringup
From: kbuild test robot @ 2018-03-13  4:25 UTC (permalink / raw)
  To: Brad Mouring
  Cc: kbuild-all, Nicolas Ferre, Rob Herring, David S . Miller,
	Michael Grzeschik, Andrew Lunn, Mark Rutland, netdev,
	Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180312171001.129209-2-brad.mouring@ni.com>

[-- Attachment #1: Type: text/plain, Size: 8078 bytes --]

Hi Brad,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on v4.16-rc4]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Brad-Mouring/net-macb-Reorganize-macb_mii-bringup/20180313-113743
config: x86_64-randconfig-x003-201810 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

Note: the linux-review/Brad-Mouring/net-macb-Reorganize-macb_mii-bringup/20180313-113743 HEAD dbe45eb4d14a7cd3d0b9c7ee8e7ca034c1ef3852 builds fine.
      It only hurts bisectibility.

All errors (new ones prefixed by >>):

   drivers/net/ethernet/cadence/macb_main.c: In function 'macb_mii_init':
>> drivers/net/ethernet/cadence/macb_main.c:596:8: error: 'i' undeclared (first use in this function)
      for (i = 0; i < PHY_MAX_ADDR; i++)
           ^
   drivers/net/ethernet/cadence/macb_main.c:596:8: note: each undeclared identifier is reported only once for each function it appears in

vim +/i +596 drivers/net/ethernet/cadence/macb_main.c

6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  564  
421d9df0 drivers/net/ethernet/cadence/macb.c      Cyrille Pitchen    2015-03-07  565  static int macb_mii_init(struct macb *bp)
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  566  {
84e0cdb0 drivers/net/ethernet/cadence/macb.c      Jamie Iles         2011-03-08  567  	struct macb_platform_data *pdata;
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  568  	struct device_node *np;
6576bbb8 drivers/net/ethernet/cadence/macb_main.c Brad Mouring       2018-03-12  569  	int err;
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  570  
3dbda77e drivers/net/macb.c                       Uwe Kleine-Koenig  2009-07-23  571  	/* Enable management port */
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  572  	macb_writel(bp, NCR, MACB_BIT(MPE));
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  573  
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  574  	bp->mii_bus = mdiobus_alloc();
aa50b552 drivers/net/ethernet/cadence/macb.c      Moritz Fischer     2016-03-29  575  	if (!bp->mii_bus) {
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  576  		err = -ENOMEM;
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  577  		goto err_out;
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  578  	}
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  579  
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  580  	bp->mii_bus->name = "MACB_mii_bus";
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  581  	bp->mii_bus->read = &macb_mdio_read;
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  582  	bp->mii_bus->write = &macb_mdio_write;
98d5e57e drivers/net/ethernet/cadence/macb.c      Florian Fainelli   2012-01-09  583  	snprintf(bp->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
98d5e57e drivers/net/ethernet/cadence/macb.c      Florian Fainelli   2012-01-09  584  		 bp->pdev->name, bp->pdev->id);
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  585  	bp->mii_bus->priv = bp;
cf669660 drivers/net/ethernet/cadence/macb.c      Florian Fainelli   2016-05-02  586  	bp->mii_bus->parent = &bp->pdev->dev;
c607a0d9 drivers/net/ethernet/cadence/macb.c      Jingoo Han         2013-08-30  587  	pdata = dev_get_platdata(&bp->pdev->dev);
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  588  
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  589  	dev_set_drvdata(&bp->dev->dev, bp->mii_bus);
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  590  
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  591  	np = bp->pdev->dev.of_node;
dacdbb4d drivers/net/ethernet/cadence/macb.c      Michael Grzeschik  2017-06-23  592  
6576bbb8 drivers/net/ethernet/cadence/macb_main.c Brad Mouring       2018-03-12  593  	if (np) {
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  594  		err = of_mdiobus_register(bp->mii_bus, np);
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  595  	} else {
83a77e9e drivers/net/ethernet/cadence/macb.c      Bartosz Folta      2016-12-14 @596  		for (i = 0; i < PHY_MAX_ADDR; i++)
83a77e9e drivers/net/ethernet/cadence/macb.c      Bartosz Folta      2016-12-14  597  			bp->mii_bus->irq[i] = PHY_POLL;
83a77e9e drivers/net/ethernet/cadence/macb.c      Bartosz Folta      2016-12-14  598  
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  599  		if (pdata)
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  600  			bp->mii_bus->phy_mask = pdata->phy_mask;
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  601  
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  602  		err = mdiobus_register(bp->mii_bus);
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  603  	}
89e5785f drivers/net/macb.c                       Haavard Skinnemoen 2006-11-09  604  
148cbb53 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-22  605  	if (err)
e7f4dc35 drivers/net/ethernet/cadence/macb.c      Andrew Lunn        2016-01-06  606  		goto err_out_free_mdiobus;
89e5785f drivers/net/macb.c                       Haavard Skinnemoen 2006-11-09  607  
7daa78e3 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-27  608  	err = macb_mii_probe(bp->dev);
7daa78e3 drivers/net/ethernet/cadence/macb.c      Boris Brezillon    2013-08-27  609  	if (err)
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  610  		goto err_out_unregister_bus;
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  611  
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  612  	return 0;
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  613  
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  614  err_out_unregister_bus:
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  615  	mdiobus_unregister(bp->mii_bus);
9ce98140 drivers/net/ethernet/cadence/macb_main.c Michael Grzeschik  2017-11-08  616  	if (np && of_phy_is_fixed_link(np))
9ce98140 drivers/net/ethernet/cadence/macb_main.c Michael Grzeschik  2017-11-08  617  		of_phy_deregister_fixed_link(np);
6576bbb8 drivers/net/ethernet/cadence/macb_main.c Brad Mouring       2018-03-12  618  err_out_free_mdiobus:
6576bbb8 drivers/net/ethernet/cadence/macb_main.c Brad Mouring       2018-03-12  619  	of_node_put(bp->phy_node);
298cf9be drivers/net/macb.c                       Lennert Buytenhek  2008-10-08  620  	mdiobus_free(bp->mii_bus);
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  621  err_out:
6c36a707 drivers/net/macb.c                       frederic RODO      2007-07-12  622  	return err;
89e5785f drivers/net/macb.c                       Haavard Skinnemoen 2006-11-09  623  }
89e5785f drivers/net/macb.c                       Haavard Skinnemoen 2006-11-09  624  

:::::: The code at line 596 was first introduced by commit
:::::: 83a77e9ec4150ee4acc635638f7dedd9da523a26 net: macb: Added PCI wrapper for Platform Driver.

:::::: TO: Bartosz Folta <bfolta@cadence.com>
:::::: CC: David S. Miller <davem@davemloft.net>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29486 bytes --]

^ permalink raw reply

* Re: [PATCH v3] kernel.h: Skip single-eval logic on literals in min()/max()
From: Kees Cook @ 2018-03-13  4:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Linux Kernel Mailing List, Josh Poimboeuf,
	Rasmus Villemoes, Gustavo A. R. Silva, Tobin C. Harding,
	Steven Rostedt, Jonathan Corbet, Chris Mason, Josef Bacik,
	David Sterba, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Ingo Molnar, Peter Zijlstra, Thomas Gleixner,
	Masahiro Yamada, Borislav Petkov, Randy Dunlap <rdunl
In-Reply-To: <CA+55aFwK7W0SsaP5eR+2TxOj-j_Mu_E11fRw2Gk8ptV71ebvww@mail.gmail.com>

On Mon, Mar 12, 2018 at 4:57 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, Mar 12, 2018 at 3:55 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>>
>> Replacing the __builtin_choose_expr() with ?: works of course.
>
> Hmm. That sounds like the right thing to do. We were so myopically
> staring at the __builtin_choose_expr() problem that we overlooked the
> obvious solution.
>
> Using __builtin_constant_p() together with a ?: is in fact our common
> pattern, so that should be fine. The only real reason to use
> __builtin_choose_expr() is if you want to get the *type* to vary
> depending on which side you choose, but that's not an issue for
> min/max.

This doesn't solve it for -Wvla, unfortunately. That was the point of
Josh's original suggestion of __builtin_choose_expr().

Try building with KCFLAGS=-Wval and checking net/ipv6/proc.c:

net/ipv6/proc.c: In function ‘snmp6_seq_show_item’:
net/ipv6/proc.c:198:2: warning: ISO C90 forbids array ‘buff’ whose
size can’t be evaluated [-Wvla]
  unsigned long buff[SNMP_MIB_MAX];
  ^~~~~~~~


-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH 01/15] ice: Add basic driver framework for Intel(R) E800 Series
From: Jakub Kicinski @ 2018-03-13  4:41 UTC (permalink / raw)
  To: Jeff Kirsher; +Cc: Anirudh Venkataramanan, netdev, intel-wired-lan
In-Reply-To: <1520873790.19785.4.camel@intel.com>

On Mon, 12 Mar 2018 09:56:30 -0700, Jeff Kirsher wrote:
> On Fri, 2018-03-09 at 14:16 -0800, Jakub Kicinski wrote:
> > On Fri,  9 Mar 2018 09:21:22 -0800, Anirudh Venkataramanan wrote:  
> > > diff --git a/Documentation/networking/ice.txt
> > > b/Documentation/networking/ice.txt
> > > new file mode 100644
> > > index 000000000000..6261c46378e1
> > > --- /dev/null
> > > +++ b/Documentation/networking/ice.txt  
> > 
> > nit: ice.rst, maybe?  
> 
> If that is the desired direction for kernel documentation, then we can
> work on generating *.rst for all of our wired ethernet drivers.  For
> now, we just have the simple text file.
> 
> Currently there are only 5 networking drivers that have a *.rst for
> documentation, so it looks like an opportunity to convert all the
> networking documentation to *.rst format, if that is desired. 

Oh, just a suggestion, I had a quick look and your file actually works
pretty nicely as rst though I'm not an expert..

^ permalink raw reply

* Re: [PATCH 1/1] net: check dev->reg_state before deref of napi netdev_ops
From: Josh Elsasser @ 2018-03-13  5:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: David Miller, Greg Kroah-Hartman, Eric Dumazet, Sasha Levin,
	Willem de Bruijn, Alexander Potapenko, Michal Kubeček,
	Linux Kernel Network Developers, LKML
In-Reply-To: <CAM_iQpVWQ+4YKX=ZCCpJ3fzbexNVYJ8wONz9QnwuxBtbQwx3Lg@mail.gmail.com>

On Mon, Mar 12, 2018 at 4:17 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Sun, Mar 11, 2018 at 12:22 PM, Josh Elsasser <jelsasser@appneta.com> wrote:
>> init_dummy_netdev() leaves its netdev_ops pointer zeroed. This leads
>> to a NULL pointer dereference when sk_busy_loop fires against an iwlwifi
>> wireless adapter and checks napi->dev->netdev_ops->ndo_busy_poll.
>>
>> Avoid this by ensuring that napi->dev is not a dummy device before
>> dereferencing napi dev's netdev_ops, preventing the following panic:
>
> Hmm, how about just checking ->netdev_ops? Checking reg_state looks
> odd, although works.

Fair point. I was trying to differentiate between an unexpected
NULL pointer and a dummy netdev, but I guess it was clever
at the expense of readability.

I'll push up a v2 that just does the obvious.

^ permalink raw reply

* [PATCH 0/1] net: avoid a kernel panic during sk_busy_loop
From: Josh Elsasser @ 2018-03-13  5:31 UTC (permalink / raw)
  To: davem
  Cc: Josh Elsasser, Greg Kroah-Hartman, Eric Dumazet, Willem de Bruijn,
	Cong Wang, Michal Kubeček, Vlad Yasevich, netdev,
	linux-kernel

V2: just check napi->dev->netdev_ops instead of getting clever with the
netdev registration state.

Original cover letter:

Hi Dave,

I stumbled across a reproducible kernel panic while playing around with
busy_poll on a Linux 4.9.86 kernel. There's an unfortunate interaction
between init_dummy_netdev, which doesn't bother to fill in netdev_ops, and
sk_busy_loop, which assumed netdev_ops is a valid pointer.

To reproduce on the device under test (DUT), I did:

  $ ip addr show dev wlan0
  8: wlan0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq [...]
      inet 172.16.122.6/23 brd 172.16.123.255 scope global wlan0
  $ sysctl -w net.core.busy_read=50
  $ nc -l 172.16.122.6 5001

Then transmitted some data to this socket from a second host:

  $ echo "foo" | nc 172.16.122.6 5001

The DUT immediately hits a kernel panic.

I've attached a patch that applies cleanly to the 4.9.87 stable release.
This fix isn't necessary for net/net-next (ndo_busy_poll was removed in
linux-4.11), but a further backport of this commit is likely required for
any stable releases older than linux-4.5.

I hope this is the right way to raise something like this. I couldn't find
a clear answer from the -stable and netdev on how to handle bugs in features
that no longer exist in mainline.

Thanks,
    Josh

^ permalink raw reply

* [PATCH v2 1/1] net: check before dereferencing netdev_ops during busy poll
From: Josh Elsasser @ 2018-03-13  5:32 UTC (permalink / raw)
  To: davem
  Cc: Josh Elsasser, Greg Kroah-Hartman, Eric Dumazet, Willem de Bruijn,
	Alexander Potapenko, Cong Wang, Vlad Yasevich,
	Michal Kubeček, netdev, linux-kernel
In-Reply-To: <20180313053248.13654-1-jelsasser@appneta.com>

init_dummy_netdev() leaves its netdev_ops pointer zeroed. This leads
to a NULL pointer dereference when sk_busy_loop fires against an iwlwifi
wireless adapter and checks napi->dev->netdev_ops->ndo_busy_poll.

Avoid this by ensuring napi->dev->netdev_ops is valid before following
the pointer, avoiding the following panic when busy polling on a dummy
netdev:

  BUG: unable to handle kernel NULL pointer dereference at 00000000000000c8
  IP: [<ffffffff817b4b72>] sk_busy_loop+0x92/0x2f0
  Call Trace:
   [<ffffffff815a3134>] ? uart_write_room+0x74/0xf0
   [<ffffffff817964a9>] sock_poll+0x99/0xa0
   [<ffffffff81223142>] do_sys_poll+0x2e2/0x520
   [<ffffffff8118d3fc>] ? get_page_from_freelist+0x3bc/0xa30
   [<ffffffff810ada22>] ? update_curr+0x62/0x140
   [<ffffffff811ea671>] ? __slab_free+0xa1/0x2a0
   [<ffffffff811ea671>] ? __slab_free+0xa1/0x2a0
   [<ffffffff8179dbb1>] ? skb_free_head+0x21/0x30
   [<ffffffff81221bd0>] ? poll_initwait+0x50/0x50
   [<ffffffff811eaa36>] ? kmem_cache_free+0x1c6/0x1e0
   [<ffffffff815a4884>] ? uart_write+0x124/0x1d0
   [<ffffffff810bd1cd>] ? remove_wait_queue+0x4d/0x60
   [<ffffffff810bd224>] ? __wake_up+0x44/0x50
   [<ffffffff81582731>] ? tty_write_unlock+0x31/0x40
   [<ffffffff8158c5c6>] ? tty_ldisc_deref+0x16/0x20
   [<ffffffff81584820>] ? tty_write+0x1e0/0x2f0
   [<ffffffff81587e50>] ? process_echoes+0x80/0x80
   [<ffffffff8120c17b>] ? __vfs_write+0x2b/0x130
   [<ffffffff8120d09a>] ? vfs_write+0x15a/0x1a0
   [<ffffffff81223455>] SyS_poll+0x75/0x100
   [<ffffffff819a6524>] entry_SYSCALL_64_fastpath+0x24/0xcf

Commit 79e7fff47b7b ("net: remove support for per driver ndo_busy_poll()")
indirectly fixed this upstream in linux-4.11 by removing the offending
pointer usage. No other users of napi->dev touch its netdev_ops.

Fixes: 060212928670 ("net: add low latency socket poll")
Fixes: ce6aea93f751 ("net: network drivers no longer need to implement ndo_busy_poll()") - 4.9.y
Signed-off-by: Josh Elsasser <jelsasser@appneta.com>
---
 net/core/dev.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8898618bf341..1f50c131ed15 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5042,7 +5042,10 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		goto out;
 
 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
-	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
+	if (napi->dev->netdev_ops)
+		busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
+	else
+		busy_poll = NULL;
 
 	do {
 		rc = 0;
-- 
2.11.0

^ permalink raw reply related

* BUG_ON triggered in skb_segment
From: Yonghong Song @ 2018-03-13  5:45 UTC (permalink / raw)
  To: Daniel Borkmann, Eric Dumazet, Alexei Starovoitov, netdev,
	Martin Lau, Yonghong Song

Hi,

One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
net-next function skb_segment, line 3667.

3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
3473                             netdev_features_t features)
3474 {
3475         struct sk_buff *segs = NULL;
3476         struct sk_buff *tail = NULL;
...
3665                 while (pos < offset + len) {
3666                         if (i >= nfrags) {
3667                                 BUG_ON(skb_headlen(list_skb));
3668
3669                                 i = 0;
3670                                 nfrags = 
skb_shinfo(list_skb)->nr_frags;
3671                                 frag = skb_shinfo(list_skb)->frags;
3672                                 frag_skb = list_skb;
...

call stack:
...
#0 [ffff883ffef034f8] machine_kexec at ffffffff81044c41
  #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
  #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
  #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
  #4 [ffff883ffef03668] die at ffffffff8101deb2
  #5 [ffff883ffef03698] do_trap at ffffffff8101a700
  #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
  #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
  #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
     [exception RIP: skb_segment+3044]
     RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
     RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
     RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
     RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
     R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
     R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
  #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
#10 [ffff883ffef03990] tcp4_gso_segment at ffffffff818717d8
#11 [ffff883ffef039b0] inet_gso_segment at ffffffff81882c9b
#12 [ffff883ffef03a10] skb_mac_gso_segment at ffffffff817f39b8
#13 [ffff883ffef03a38] __skb_gso_segment at ffffffff817f3ac9
#14 [ffff883ffef03a68] validate_xmit_skb at ffffffff817f3eed
#15 [ffff883ffef03aa8] validate_xmit_skb_list at ffffffff817f40a2
#16 [ffff883ffef03ad8] sch_direct_xmit at ffffffff81824efb
#17 [ffff883ffef03b20] __qdisc_run at ffffffff818251aa
#18 [ffff883ffef03b90] __dev_queue_xmit at ffffffff817f45ed
#19 [ffff883ffef03c08] dev_queue_xmit at ffffffff817f4b90
#20 [ffff883ffef03c18] __bpf_redirect at ffffffff81812b66
#21 [ffff883ffef03c40] skb_do_redirect at ffffffff81813209
#22 [ffff883ffef03c60] __netif_receive_skb_core at ffffffff817f310d
#23 [ffff883ffef03cc8] __netif_receive_skb at ffffffff817f32e8
#24 [ffff883ffef03ce8] netif_receive_skb_internal at ffffffff817f5538
#25 [ffff883ffef03d10] napi_gro_complete at ffffffff817f56c0
#26 [ffff883ffef03d28] dev_gro_receive at ffffffff817f5ea6
#27 [ffff883ffef03d78] napi_gro_receive at ffffffff817f6168
#28 [ffff883ffef03da0] mlx5e_handle_rx_cqe_mpwrq at ffffffff817381c2
#29 [ffff883ffef03e30] mlx5e_poll_rx_cq at ffffffff817386c2
#30 [ffff883ffef03e80] mlx5e_napi_poll at ffffffff8173926e
#31 [ffff883ffef03ed0] net_rx_action at ffffffff817f5a6e
#32 [ffff883ffef03f48] __softirqentry_text_start at ffffffff81c000c3
#33 [ffff883ffef03fa8] irq_exit at ffffffff8108f515
#34 [ffff883ffef03fb8] do_IRQ at ffffffff81a01b11
--- <IRQ stack> ---
bt: cannot transition from IRQ stack to current process stack:
         IRQ stack pointer: ffff883ffef034f8
     process stack pointer: ffffffff81a01ae9
        current stack base: ffffc9000c5c4000
...
Setup:
=====

The test will involve three machines:
   M_ipv6 <-> M_nat <-> M_ipv4

The M_nat will do ipv4<->ipv6 address translation and then forward packet
to proper destination. The control plane will configure M_nat properly
will understand virtual ipv4 address for machine M_ipv6, and
virtual ipv6 address for machine M_ipv4.

M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
The program uses bpf_skb_change_proto to do protocol conversion.
bpf_skb_change_proto will adjust skb header_len and len properly
based on protocol change.
After the conversion, the program will make proper change on
ethhdr and ip4/6 header, recalculate checksum, and send the packet out
through bpf_redirect.

Experiment:
===========

MTU: 1500B for all three machines.

The tso/lro/gro are enabled on the M_nat box.

ping works on both ways of M_ipv6 <-> M_ipv4.
It works for transfering a small file (4KB) between M_ipv6 and M_ipv4 
(both ways).
Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4, failed with 
the above BUG_ON, really fast.
Did not really test from M_ipv4 to M_ipv6 with large file.

The error path likely to be (also from the above call stack):
   nic -> lro/gro -> bpf_program -> gso (BUG_ON)

In one of experiments, I explicitly printed the skb->len and 
skb->data_len. The values are below:
   skb_segment: len 2856, data_len 2686
They should be equal to avoid BUG.

In another experiment, I got:
   skb_segment: len 1428, data_len 1258

In both cases, the difference is 170 bytes. Not sure whether
this is just a coincidence or not.

Workaround:
===========

A workaround to avoid BUG_ON is to disable lro/gro. This way,
kernel will not receive big packets and hence gso is not really called.

I am not familiar with gso code. Does anybody hit this BUG_ON before?
Any suggestion on how to debug this?

Thanks!

Yonghong

^ permalink raw reply

* Re: [PATCH v2 1/1] net: check before dereferencing netdev_ops during busy poll
From: Eric Dumazet @ 2018-03-13  5:50 UTC (permalink / raw)
  To: Josh Elsasser, davem
  Cc: Greg Kroah-Hartman, Eric Dumazet, Willem de Bruijn,
	Alexander Potapenko, Cong Wang, Vlad Yasevich,
	Michal Kubeček, netdev, linux-kernel
In-Reply-To: <20180313053248.13654-2-jelsasser@appneta.com>



On 03/12/2018 10:32 PM, Josh Elsasser wrote:
> init_dummy_netdev() leaves its netdev_ops pointer zeroed. This leads
> to a NULL pointer dereference when sk_busy_loop fires against an iwlwifi
> wireless adapter and checks napi->dev->netdev_ops->ndo_busy_poll.
> 
> Avoid this by ensuring napi->dev->netdev_ops is valid before following
> the pointer, avoiding the following panic when busy polling on a dummy
> netdev:
>


> 
> Fixes: 060212928670 ("net: add low latency socket poll")
> Fixes: ce6aea93f751 ("net: network drivers no longer need to implement ndo_busy_poll()") - 4.9.y
> Signed-off-by: Josh Elsasser <jelsasser@appneta.com>
> ---
>   net/core/dev.c | 5 ++++-
>   1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 8898618bf341..1f50c131ed15 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5042,7 +5042,10 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
>   		goto out;
>   
>   	/* Note: ndo_busy_poll method is optional in linux-4.5 */
> -	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
> +	if (napi->dev->netdev_ops)
> +		busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
> +	else
> +		busy_poll = NULL;
>   
>   	do {
>   		rc = 0;
> 
We could instead setup a non NULL netdev_ops pointer on these 'dummy' 
devices to not add a check in fast path, but I presume we do
not really care since this fix is for old kernels, and considering how 
long it took to discover this bug.

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* RE: [PATCH v4 2/6] staging: fsl-dpaa2/ethsw: Add Freescale DPAA2 Ethernet Switch driver
From: Razvan Stefanescu @ 2018-03-13  5:51 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: gregkh@linuxfoundation.org, devel@driverdev.osuosl.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	Alexander Graf, arnd@arndb.de, Alexandru Marginean,
	Ruxandra Ioana Ciocoi Radulescu, Ioana Ciornei, Laurentiu Tudor,
	stuyoder@gmail.com
In-Reply-To: <20180312143654.GB21068@lunn.ch>



> -----Original Message-----
> From: Andrew Lunn [mailto:andrew@lunn.ch]
> Sent: Monday, March 12, 2018 4:37 PM
> To: Razvan Stefanescu <razvan.stefanescu@nxp.com>
> Cc: gregkh@linuxfoundation.org; devel@driverdev.osuosl.org; linux-
> kernel@vger.kernel.org; netdev@vger.kernel.org; Alexander Graf
> <agraf@suse.de>; arnd@arndb.de; Alexandru Marginean
> <alexandru.marginean@nxp.com>; Ruxandra Ioana Ciocoi Radulescu
> <ruxandra.radulescu@nxp.com>; Ioana Ciornei <ioana.ciornei@nxp.com>;
> Laurentiu Tudor <laurentiu.tudor@nxp.com>; stuyoder@gmail.com
> Subject: Re: [PATCH v4 2/6] staging: fsl-dpaa2/ethsw: Add Freescale DPAA2
> Ethernet Switch driver
> 
> > +static int port_netdevice_event(struct notifier_block *unused,
> > +				unsigned long event, void *ptr)
> > +{
> > +	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
> > +	struct netdev_notifier_changeupper_info *info = ptr;
> > +	struct net_device *upper_dev;
> > +	int err = 0;
> > +
> > +	if (netdev->netdev_ops != &ethsw_port_ops)
> > +		return NOTIFY_DONE;
> > +
> > +	/* Handle just upper dev link/unlink for the moment */
> > +	if (event == NETDEV_CHANGEUPPER) {
> > +		upper_dev = info->upper_dev;
> > +		if (netif_is_bridge_master(upper_dev)) {
> > +			if (info->linking)
> > +				err = port_bridge_join(netdev);
> > +			else
> > +				err = port_bridge_leave(netdev);
> > +		}
> > +	}
> > +
> > +	return notifier_from_errno(err);
> > +}
> 
> I could be missing something here, but don't you need to pass to
> port_bridge_join() which bridge the port is joining. There can be
> multiple bridges, so you need to ensure the port joins the correct
> bridge.
> 
Thank you for noticing this. I'll add proper checks in next version.

	Razvan

> 	Andrew

^ permalink raw reply

* [PATCH net 1/2] net: Fix vlan untag for bridge and vlan_dev with reorder_hdr off
From: Toshiaki Makita @ 2018-03-13  5:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: Toshiaki Makita, netdev, Brandon Carpenter, Vlad Yasevich
In-Reply-To: <1520920288-2483-1-git-send-email-makita.toshiaki@lab.ntt.co.jp>

When we have a bridge with vlan_filtering on and a vlan device on top of
it, packets would be corrupted in skb_vlan_untag() called from
br_dev_xmit().

The problem sits in skb_reorder_vlan_header() used in skb_vlan_untag(),
which makes use of skb->mac_len. In this function mac_len is meant for
handling rx path with vlan devices with reorder_header disabled, but in
tx path mac_len is typically 0 and cannot be used, which is the problem
in this case.

The current code even does not properly handle rx path (skb_vlan_untag()
called from __netif_receive_skb_core()) with reorder_header off actually.

In rx path single tag case, it works as follows:

- Before skb_reorder_vlan_header()

 mac_header                                data
   v                                        v
   +-------------------+-------------+------+----
   |        ETH        |    VLAN     | ETH  |
   |       ADDRS       | TPID | TCI  | TYPE |
   +-------------------+-------------+------+----
   <-------- mac_len --------->
                       <------------->
                        to be removed

- After skb_reorder_vlan_header()

            mac_header                     data
                 v                          v
                 +-------------------+------+----
                 |        ETH        | ETH  |
                 |       ADDRS       | TYPE |
                 +-------------------+------+----
                 <-------- mac_len --------->

This is ok, but in rx double tag case, it corrupts packets:

- Before skb_reorder_vlan_header()

 mac_header                                              data
   v                                                      v
   +-------------------+-------------+-------------+------+----
   |        ETH        |    VLAN     |    VLAN     | ETH  |
   |       ADDRS       | TPID | TCI  | TPID | TCI  | TYPE |
   +-------------------+-------------+-------------+------+----
   <--------------- mac_len ---------------->
                                     <------------->
                                    should be removed
                       <--------------------------->
                         actually will be removed

- After skb_reorder_vlan_header()

            mac_header                                   data
                 v                                        v
                               +-------------------+------+----
                               |        ETH        | ETH  |
                               |       ADDRS       | TYPE |
                               +-------------------+------+----
                 <--------------- mac_len ---------------->

So, two of vlan tags are both removed while only inner one should be
removed and mac_header (and mac_len) is broken.

skb_vlan_untag() is meant for removing the vlan header at (skb->data - 2),
so use skb->data and skb->mac_header to calculate the right offset.

Reported-by: Brandon Carpenter <brandon.carpenter@cypherpath.com>
Fixes: a6e18ff11170 ("vlan: Fix untag operations of stacked vlans with REORDER_HEADER off")
Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 include/uapi/linux/if_ether.h | 1 +
 net/core/skbuff.c             | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 8bbbcb5..820de5d 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -30,6 +30,7 @@
  */
 
 #define ETH_ALEN	6		/* Octets in one ethernet addr	 */
+#define ETH_TLEN	2		/* Octets in ethernet type field */
 #define ETH_HLEN	14		/* Total octets in header.	 */
 #define ETH_ZLEN	60		/* Min. octets in frame sans FCS */
 #define ETH_DATA_LEN	1500		/* Max. octets in payload	 */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index baf9905..b103f46 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5020,13 +5020,16 @@ bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
 
 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
 {
+	int mac_len;
+
 	if (skb_cow(skb, skb_headroom(skb)) < 0) {
 		kfree_skb(skb);
 		return NULL;
 	}
 
-	memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
-		2 * ETH_ALEN);
+	mac_len = skb->data - skb_mac_header(skb);
+	memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
+		mac_len - VLAN_HLEN - ETH_TLEN);
 	skb->mac_header += VLAN_HLEN;
 	return skb;
 }
-- 
1.8.3.1

^ permalink raw reply related

* Re: BUG_ON triggered in skb_segment
From: Eric Dumazet @ 2018-03-13  6:04 UTC (permalink / raw)
  To: Yonghong Song, Daniel Borkmann, Eric Dumazet, Alexei Starovoitov,
	netdev, Martin Lau
In-Reply-To: <9265b93f-253d-6b8c-f2b8-4b54eff1835c@fb.com>



On 03/12/2018 10:45 PM, Yonghong Song wrote:
> Hi,
> 
> One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
> net-next function skb_segment, line 3667.
> 
> 3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
> 3473                             netdev_features_t features)
> 3474 {
> 3475         struct sk_buff *segs = NULL;
> 3476         struct sk_buff *tail = NULL;
> ...
> 3665                 while (pos < offset + len) {
> 3666                         if (i >= nfrags) {
> 3667                                 BUG_ON(skb_headlen(list_skb));
> 3668
> 3669                                 i = 0;
> 3670                                 nfrags = 
> skb_shinfo(list_skb)->nr_frags;
> 3671                                 frag = skb_shinfo(list_skb)->frags;
> 3672                                 frag_skb = list_skb;
> ...
> 
> call stack:
> ...
> #0 [ffff883ffef034f8] machine_kexec at ffffffff81044c41
>   #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
>   #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
>   #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
>   #4 [ffff883ffef03668] die at ffffffff8101deb2
>   #5 [ffff883ffef03698] do_trap at ffffffff8101a700
>   #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
>   #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
>   #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
>      [exception RIP: skb_segment+3044]
>      RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
>      RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
>      RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
>      RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
>      R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
>      R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
>      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>   #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
> #10 [ffff883ffef03990] tcp4_gso_segment at ffffffff818717d8
> #11 [ffff883ffef039b0] inet_gso_segment at ffffffff81882c9b
> #12 [ffff883ffef03a10] skb_mac_gso_segment at ffffffff817f39b8
> #13 [ffff883ffef03a38] __skb_gso_segment at ffffffff817f3ac9
> #14 [ffff883ffef03a68] validate_xmit_skb at ffffffff817f3eed
> #15 [ffff883ffef03aa8] validate_xmit_skb_list at ffffffff817f40a2
> #16 [ffff883ffef03ad8] sch_direct_xmit at ffffffff81824efb
> #17 [ffff883ffef03b20] __qdisc_run at ffffffff818251aa
> #18 [ffff883ffef03b90] __dev_queue_xmit at ffffffff817f45ed
> #19 [ffff883ffef03c08] dev_queue_xmit at ffffffff817f4b90
> #20 [ffff883ffef03c18] __bpf_redirect at ffffffff81812b66
> #21 [ffff883ffef03c40] skb_do_redirect at ffffffff81813209
> #22 [ffff883ffef03c60] __netif_receive_skb_core at ffffffff817f310d
> #23 [ffff883ffef03cc8] __netif_receive_skb at ffffffff817f32e8
> #24 [ffff883ffef03ce8] netif_receive_skb_internal at ffffffff817f5538
> #25 [ffff883ffef03d10] napi_gro_complete at ffffffff817f56c0
> #26 [ffff883ffef03d28] dev_gro_receive at ffffffff817f5ea6
> #27 [ffff883ffef03d78] napi_gro_receive at ffffffff817f6168
> #28 [ffff883ffef03da0] mlx5e_handle_rx_cqe_mpwrq at ffffffff817381c2
> #29 [ffff883ffef03e30] mlx5e_poll_rx_cq at ffffffff817386c2
> #30 [ffff883ffef03e80] mlx5e_napi_poll at ffffffff8173926e
> #31 [ffff883ffef03ed0] net_rx_action at ffffffff817f5a6e
> #32 [ffff883ffef03f48] __softirqentry_text_start at ffffffff81c000c3
> #33 [ffff883ffef03fa8] irq_exit at ffffffff8108f515
> #34 [ffff883ffef03fb8] do_IRQ at ffffffff81a01b11
> --- <IRQ stack> ---
> bt: cannot transition from IRQ stack to current process stack:
>          IRQ stack pointer: ffff883ffef034f8
>      process stack pointer: ffffffff81a01ae9
>         current stack base: ffffc9000c5c4000
> ...
> Setup:
> =====
> 
> The test will involve three machines:
>    M_ipv6 <-> M_nat <-> M_ipv4
> 
> The M_nat will do ipv4<->ipv6 address translation and then forward packet
> to proper destination. The control plane will configure M_nat properly
> will understand virtual ipv4 address for machine M_ipv6, and
> virtual ipv6 address for machine M_ipv4.
> 
> M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
> The program uses bpf_skb_change_proto to do protocol conversion.
> bpf_skb_change_proto will adjust skb header_len and len properly
> based on protocol change.
> After the conversion, the program will make proper change on
> ethhdr and ip4/6 header, recalculate checksum, and send the packet out
> through bpf_redirect.
> 
> Experiment:
> ===========
> 
> MTU: 1500B for all three machines.
> 
> The tso/lro/gro are enabled on the M_nat box.
> 
> ping works on both ways of M_ipv6 <-> M_ipv4.
> It works for transfering a small file (4KB) between M_ipv6 and M_ipv4 
> (both ways).
> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4, failed with 
> the above BUG_ON, really fast.
> Did not really test from M_ipv4 to M_ipv6 with large file.
> 
> The error path likely to be (also from the above call stack):
>    nic -> lro/gro -> bpf_program -> gso (BUG_ON)
> 
> In one of experiments, I explicitly printed the skb->len and 
> skb->data_len. The values are below:
>    skb_segment: len 2856, data_len 2686
> They should be equal to avoid BUG.
> 
> In another experiment, I got:
>    skb_segment: len 1428, data_len 1258
> 
> In both cases, the difference is 170 bytes. Not sure whether
> this is just a coincidence or not.
> 
> Workaround:
> ===========
> 
> A workaround to avoid BUG_ON is to disable lro/gro. This way,
> kernel will not receive big packets and hence gso is not really called.
> 
> I am not familiar with gso code. Does anybody hit this BUG_ON before?
> Any suggestion on how to debug this?
> 

skb_segment() works if incoming GRO packet is not modified in its geometry.

In your case it seems you had to adjust gso_size (calling 
skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks 
skb_segment() badly, because geometry changes, unless you had specific 
MTU/MSS restrictions.

You will have to make skb_segment() more generic if you really want this.

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Yonghong Song @ 2018-03-13  6:08 UTC (permalink / raw)
  To: Eric Dumazet, Daniel Borkmann, Alexei Starovoitov, netdev,
	Martin Lau
In-Reply-To: <875f59f2-d1ec-c47c-cdd7-2ce4985c5143@gmail.com>



On 3/12/18 11:04 PM, Eric Dumazet wrote:
> 
> 
> On 03/12/2018 10:45 PM, Yonghong Song wrote:
>> Hi,
>>
>> One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
>> net-next function skb_segment, line 3667.
>>
>> 3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
>> 3473                             netdev_features_t features)
>> 3474 {
>> 3475         struct sk_buff *segs = NULL;
>> 3476         struct sk_buff *tail = NULL;
>> ...
>> 3665                 while (pos < offset + len) {
>> 3666                         if (i >= nfrags) {
>> 3667                                 BUG_ON(skb_headlen(list_skb));
>> 3668
>> 3669                                 i = 0;
>> 3670                                 nfrags = 
>> skb_shinfo(list_skb)->nr_frags;
>> 3671                                 frag = skb_shinfo(list_skb)->frags;
>> 3672                                 frag_skb = list_skb;
>> ...
>>
>> call stack:
>> ...
>> #0 [ffff883ffef034f8] machine_kexec at ffffffff81044c41
>>   #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
>>   #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
>>   #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
>>   #4 [ffff883ffef03668] die at ffffffff8101deb2
>>   #5 [ffff883ffef03698] do_trap at ffffffff8101a700
>>   #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
>>   #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
>>   #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
>>      [exception RIP: skb_segment+3044]
>>      RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
>>      RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
>>      RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
>>      RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
>>      R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
>>      R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
>>      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>>   #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
>> #10 [ffff883ffef03990] tcp4_gso_segment at ffffffff818717d8
>> #11 [ffff883ffef039b0] inet_gso_segment at ffffffff81882c9b
>> #12 [ffff883ffef03a10] skb_mac_gso_segment at ffffffff817f39b8
>> #13 [ffff883ffef03a38] __skb_gso_segment at ffffffff817f3ac9
>> #14 [ffff883ffef03a68] validate_xmit_skb at ffffffff817f3eed
>> #15 [ffff883ffef03aa8] validate_xmit_skb_list at ffffffff817f40a2
>> #16 [ffff883ffef03ad8] sch_direct_xmit at ffffffff81824efb
>> #17 [ffff883ffef03b20] __qdisc_run at ffffffff818251aa
>> #18 [ffff883ffef03b90] __dev_queue_xmit at ffffffff817f45ed
>> #19 [ffff883ffef03c08] dev_queue_xmit at ffffffff817f4b90
>> #20 [ffff883ffef03c18] __bpf_redirect at ffffffff81812b66
>> #21 [ffff883ffef03c40] skb_do_redirect at ffffffff81813209
>> #22 [ffff883ffef03c60] __netif_receive_skb_core at ffffffff817f310d
>> #23 [ffff883ffef03cc8] __netif_receive_skb at ffffffff817f32e8
>> #24 [ffff883ffef03ce8] netif_receive_skb_internal at ffffffff817f5538
>> #25 [ffff883ffef03d10] napi_gro_complete at ffffffff817f56c0
>> #26 [ffff883ffef03d28] dev_gro_receive at ffffffff817f5ea6
>> #27 [ffff883ffef03d78] napi_gro_receive at ffffffff817f6168
>> #28 [ffff883ffef03da0] mlx5e_handle_rx_cqe_mpwrq at ffffffff817381c2
>> #29 [ffff883ffef03e30] mlx5e_poll_rx_cq at ffffffff817386c2
>> #30 [ffff883ffef03e80] mlx5e_napi_poll at ffffffff8173926e
>> #31 [ffff883ffef03ed0] net_rx_action at ffffffff817f5a6e
>> #32 [ffff883ffef03f48] __softirqentry_text_start at ffffffff81c000c3
>> #33 [ffff883ffef03fa8] irq_exit at ffffffff8108f515
>> #34 [ffff883ffef03fb8] do_IRQ at ffffffff81a01b11
>> --- <IRQ stack> ---
>> bt: cannot transition from IRQ stack to current process stack:
>>          IRQ stack pointer: ffff883ffef034f8
>>      process stack pointer: ffffffff81a01ae9
>>         current stack base: ffffc9000c5c4000
>> ...
>> Setup:
>> =====
>>
>> The test will involve three machines:
>>    M_ipv6 <-> M_nat <-> M_ipv4
>>
>> The M_nat will do ipv4<->ipv6 address translation and then forward packet
>> to proper destination. The control plane will configure M_nat properly
>> will understand virtual ipv4 address for machine M_ipv6, and
>> virtual ipv6 address for machine M_ipv4.
>>
>> M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
>> The program uses bpf_skb_change_proto to do protocol conversion.
>> bpf_skb_change_proto will adjust skb header_len and len properly
>> based on protocol change.
>> After the conversion, the program will make proper change on
>> ethhdr and ip4/6 header, recalculate checksum, and send the packet out
>> through bpf_redirect.
>>
>> Experiment:
>> ===========
>>
>> MTU: 1500B for all three machines.
>>
>> The tso/lro/gro are enabled on the M_nat box.
>>
>> ping works on both ways of M_ipv6 <-> M_ipv4.
>> It works for transfering a small file (4KB) between M_ipv6 and M_ipv4 
>> (both ways).
>> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4, failed 
>> with the above BUG_ON, really fast.
>> Did not really test from M_ipv4 to M_ipv6 with large file.
>>
>> The error path likely to be (also from the above call stack):
>>    nic -> lro/gro -> bpf_program -> gso (BUG_ON)
>>
>> In one of experiments, I explicitly printed the skb->len and 
>> skb->data_len. The values are below:
>>    skb_segment: len 2856, data_len 2686
>> They should be equal to avoid BUG.
>>
>> In another experiment, I got:
>>    skb_segment: len 1428, data_len 1258
>>
>> In both cases, the difference is 170 bytes. Not sure whether
>> this is just a coincidence or not.
>>
>> Workaround:
>> ===========
>>
>> A workaround to avoid BUG_ON is to disable lro/gro. This way,
>> kernel will not receive big packets and hence gso is not really called.
>>
>> I am not familiar with gso code. Does anybody hit this BUG_ON before?
>> Any suggestion on how to debug this?
>>
> 
> skb_segment() works if incoming GRO packet is not modified in its geometry.
> 
> In your case it seems you had to adjust gso_size (calling 
> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks 
> skb_segment() badly, because geometry changes, unless you had specific 
> MTU/MSS restrictions.
> 
> You will have to make skb_segment() more generic if you really want this.

In net/core/filter.c function bpf_skb_change_proto, which is called
in the bpf program, does some GSO adjustment. Could you help check
whether it satisfies my above use case or not? Thanks!

^ permalink raw reply

* linux-next: build warning after merge of the net-next tree
From: Stephen Rothwell @ 2018-03-13  6:11 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List,
	Gustavo A. R. Silva

[-- Attachment #1: Type: text/plain, Size: 392 bytes --]

Hi all,

After merging the net-next tree, today's linux-next build (sparc
defconfig) produced this warning:

net/core/pktgen.c: In function 'pktgen_if_write':
net/core/pktgen.c:1710:1: warning: the frame size of 1048 bytes is larger than 1024 bytes [-Wframe-larger-than=]
 }
 ^

Introduced by commit

  35951393bbff ("pktgen: Remove VLA usage")

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Yunsheng Lin @ 2018-03-13  6:18 UTC (permalink / raw)
  To: Yonghong Song, Daniel Borkmann, Eric Dumazet, Alexei Starovoitov,
	netdev, Martin Lau
In-Reply-To: <9265b93f-253d-6b8c-f2b8-4b54eff1835c@fb.com>

Hi, Song

On 2018/3/13 13:45, Yonghong Song wrote:
> Hi,
> 
> One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
> net-next function skb_segment, line 3667.
> 
> 3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
> 3473                             netdev_features_t features)
> 3474 {
> 3475         struct sk_buff *segs = NULL;
> 3476         struct sk_buff *tail = NULL;
> ...
> 3665                 while (pos < offset + len) {
> 3666                         if (i >= nfrags) {
> 3667                                 BUG_ON(skb_headlen(list_skb));
> 3668
> 3669                                 i = 0;
> 3670                                 nfrags = skb_shinfo(list_skb)->nr_frags;
> 3671                                 frag = skb_shinfo(list_skb)->frags;
> 3672                                 frag_skb = list_skb;
> ...
> 
> call stack:
> ...
> #0 [ffff883ffef034f8] machine_kexec at ffffffff81044c41
>  #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
>  #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
>  #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
>  #4 [ffff883ffef03668] die at ffffffff8101deb2
>  #5 [ffff883ffef03698] do_trap at ffffffff8101a700
>  #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
>  #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
>  #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
>     [exception RIP: skb_segment+3044]
>     RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
>     RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
>     RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
>     RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
>     R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
>     R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>  #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
> #10 [ffff883ffef03990] tcp4_gso_segment at ffffffff818717d8
> #11 [ffff883ffef039b0] inet_gso_segment at ffffffff81882c9b
> #12 [ffff883ffef03a10] skb_mac_gso_segment at ffffffff817f39b8
> #13 [ffff883ffef03a38] __skb_gso_segment at ffffffff817f3ac9
> #14 [ffff883ffef03a68] validate_xmit_skb at ffffffff817f3eed
> #15 [ffff883ffef03aa8] validate_xmit_skb_list at ffffffff817f40a2
> #16 [ffff883ffef03ad8] sch_direct_xmit at ffffffff81824efb
> #17 [ffff883ffef03b20] __qdisc_run at ffffffff818251aa
> #18 [ffff883ffef03b90] __dev_queue_xmit at ffffffff817f45ed
> #19 [ffff883ffef03c08] dev_queue_xmit at ffffffff817f4b90
> #20 [ffff883ffef03c18] __bpf_redirect at ffffffff81812b66
> #21 [ffff883ffef03c40] skb_do_redirect at ffffffff81813209
> #22 [ffff883ffef03c60] __netif_receive_skb_core at ffffffff817f310d
> #23 [ffff883ffef03cc8] __netif_receive_skb at ffffffff817f32e8
> #24 [ffff883ffef03ce8] netif_receive_skb_internal at ffffffff817f5538
> #25 [ffff883ffef03d10] napi_gro_complete at ffffffff817f56c0
> #26 [ffff883ffef03d28] dev_gro_receive at ffffffff817f5ea6
> #27 [ffff883ffef03d78] napi_gro_receive at ffffffff817f6168
> #28 [ffff883ffef03da0] mlx5e_handle_rx_cqe_mpwrq at ffffffff817381c2
> #29 [ffff883ffef03e30] mlx5e_poll_rx_cq at ffffffff817386c2
> #30 [ffff883ffef03e80] mlx5e_napi_poll at ffffffff8173926e
> #31 [ffff883ffef03ed0] net_rx_action at ffffffff817f5a6e
> #32 [ffff883ffef03f48] __softirqentry_text_start at ffffffff81c000c3
> #33 [ffff883ffef03fa8] irq_exit at ffffffff8108f515
> #34 [ffff883ffef03fb8] do_IRQ at ffffffff81a01b11
> --- <IRQ stack> ---
> bt: cannot transition from IRQ stack to current process stack:
>         IRQ stack pointer: ffff883ffef034f8
>     process stack pointer: ffffffff81a01ae9
>        current stack base: ffffc9000c5c4000
> ...
> Setup:
> =====
> 
> The test will involve three machines:
>   M_ipv6 <-> M_nat <-> M_ipv4
> 
> The M_nat will do ipv4<->ipv6 address translation and then forward packet
> to proper destination. The control plane will configure M_nat properly
> will understand virtual ipv4 address for machine M_ipv6, and
> virtual ipv6 address for machine M_ipv4.
> 
> M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
> The program uses bpf_skb_change_proto to do protocol conversion.
> bpf_skb_change_proto will adjust skb header_len and len properly
> based on protocol change.
> After the conversion, the program will make proper change on
> ethhdr and ip4/6 header, recalculate checksum, and send the packet out
> through bpf_redirect.
> 
> Experiment:
> ===========
> 
> MTU: 1500B for all three machines.
> 
> The tso/lro/gro are enabled on the M_nat box.
> 
> ping works on both ways of M_ipv6 <-> M_ipv4.
> It works for transfering a small file (4KB) between M_ipv6 and M_ipv4 (both ways).
> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4, failed with the above BUG_ON, really fast.
> Did not really test from M_ipv4 to M_ipv6 with large file.
> 
> The error path likely to be (also from the above call stack):
>   nic -> lro/gro -> bpf_program -> gso (BUG_ON)
> 
> In one of experiments, I explicitly printed the skb->len and skb->data_len. The values are below:
>   skb_segment: len 2856, data_len 2686
> They should be equal to avoid BUG.
> 
> In another experiment, I got:
>   skb_segment: len 1428, data_len 1258
> 
> In both cases, the difference is 170 bytes. Not sure whether
> this is just a coincidence or not.
> 
> Workaround:
> ===========
> 
> A workaround to avoid BUG_ON is to disable lro/gro. This way,
> kernel will not receive big packets and hence gso is not really called.
> 
> I am not familiar with gso code. Does anybody hit this BUG_ON before?
> Any suggestion on how to debug this?

When the bpf_program do ipv4<->ipv6 address translation , shinfo->gso_type
maybe need to be changed, for example, SKB_GSO_TCPV4 -> SKB_GSO_TCPV6.
I am not sure if there are other field related to gro need to be changed,
you may want to debug it.

You may call call skb_mac_gso_segment with the packet after address
translation to debug this problem.

Hope this will help.

> 
> Thanks!
> 
> Yonghong
> 
> 

^ permalink raw reply

* Re: Problem with bridge (mcast-to-ucast + hairpin) and Broadcom's 802.11f in their FullMAC fw
From: Rafał Miłecki @ 2018-03-13  6:23 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Lüssing, Felix Fietkau, Arend van Spriel, Franky Lin,
	Hante Meuleman, Chi-Hsien Lin, Wright Feng, Pieter-Paul Giesberts,
	Network Development,
	bridge-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	brcm80211-dev-list.pdl-dY08KVG/lbpWk0Htik3J/w,
	brcm80211-dev-list-+wT8y+m8/X5BDgjK7y7TUQ
In-Reply-To: <20180312160103.1a043936@xeon-e3>

On 03/13/2018 12:01 AM, Stephen Hemminger wrote:
> On Mon, 12 Mar 2018 23:42:48 +0100
> Rafał Miłecki <zajec5-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>
>> 2) Blame bridge + mcast-to-ucast + hairpin for 802.11f incompatibility
>>
>> If we agree that 802.11f support in FullMAC firmware is acceptable, then
>> we have to make sure Linux's bridge doesn't break it by passing 802.11f
>> (broadcast) frames back to the source interface. That would require a
>> check like in below diff + proper code for handling such packets. I'm
>> afraid I'm not familiar with bridge code enough to complete that.
>>
>> diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
>> index edae702..9e5d6ea 100644
>> --- a/net/bridge/br_input.c
>> +++ b/net/bridge/br_input.c
>> @@ -126,6 +126,27 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
>>   	}
>>   }
>>
>> +static bool br_skb_is_iapp_add_packet(struct sk_buff *skb)
>> +{
>> +	const u8 iapp_add_packet[6] __aligned(2) = {
>> +		0x00, 0x01, 0xaf, 0x81, 0x01, 0x00,
>> +	};
>> +#if !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
>> +	const u16 *a = (const u16 *)skb->data;
>> +	const u16 *b = (const u16 *)iapp_add_packet;
>> +#endif
>> +
>> +	if (skb->len != 6)
>> +		return false;
>> +
>> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
>> +	return !(((*(const u32 *)skb->data) ^ (*(const u32 *)iapp_add_packet)) |
>> +	         ((*(const u16 *)(skb->data + 4)) ^ (*(const u16 *)(iapp_add_packet + 4))));
>> +#else
>> +	return !((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]));
>> +#endif
>> +}
>> +
>>   /* note: already called with rcu_read_lock */
>>   int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
>>   {
>> @@ -155,6 +176,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
>>   	if (is_multicast_ether_addr(dest)) {
>>   		/* by definition the broadcast is also a multicast address */
>>   		if (is_broadcast_ether_addr(dest)) {
>> +			if (br_skb_is_iapp_add_packet(skb))
>> +				pr_warn("This packet should not be passed back to the source interface!\n");
>>   			pkt_type = BR_PKT_BROADCAST;
>>   			local_rcv = true;
>>   		} else {
>
>
> Don't like bridge doing special case code for magic received values directly in input path.
> Really needs to be generic which is why I suggested ebtables.

We need in-bridge solution only if we decide to support FullMAC
firmwares with 802.11f implementation.

In that case is this possible to use ebtables as a workaround at all?
Can I really use ebtables to set switch to don't pass 802.11f ADD frames
back to the original interface?

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox