Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 07/15] ice: Refactor VSI allocation, deletion and rebuild flow
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

This patch refactors aspects of the VSI allocation, deletion and rebuild
flow. Some of the more noteworthy changes are described below.

1) On reset, all switch filters applied in the hardware are lost. In
   the rebuild flow, only MAC and broadcast filters are being restored.
   Instead, use a new function ice_replay_all_fltr to restore all the
   filters that were previously added. To do this, remove calls to
   ice_remove_vsi_fltr to prevent cleaning out the internal bookkeeping
   structures that ice_replay_all_fltr uses to replay filters.

2) Introduce a new state bit __ICE_PREPARED_FOR_RESET to distinguish the
   PF that requested the reset (and consequently prepared for it) from
   the rest of the PFs. These other PFs will prepare for reset only
   when they receive an interrupt from the firmware.

3) Use new functions ice_add_vsi and ice_free_vsi to create and destroy
   VSIs respectively. These functions accept a handle to uniquely
   identify a VSI. This same handle is required to rebuild the VSI post
   reset. To prevent confusion, the existing ice_vsi_add was renamed to
   ice_vsi_init.

4) Enhance ice_vsi_setup for the upcoming SR-IOV changes and expose a
   new wrapper function ice_pf_vsi_setup to create PF VSIs. Rework the
   error handling path in ice_setup_pf_sw.

5) Introduce a new function ice_vsi_release_all to release all PF VSIs.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h          |   2 +
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   1 +
 drivers/net/ethernet/intel/ice/ice_common.c   |   2 +
 drivers/net/ethernet/intel/ice/ice_main.c     | 371 +++++++++++-------
 drivers/net/ethernet/intel/ice/ice_switch.c   | 353 +++++++++++++++--
 drivers/net/ethernet/intel/ice/ice_switch.h   |  14 +-
 drivers/net/ethernet/intel/ice/ice_type.h     |   8 +-
 7 files changed, 580 insertions(+), 171 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 868f4a1d0f72..e17030db0bee 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -62,6 +62,7 @@ extern const char ice_drv_ver[];
 #define ICE_RES_VALID_BIT	0x8000
 #define ICE_RES_MISC_VEC_ID	(ICE_RES_VALID_BIT - 1)
 #define ICE_INVAL_Q_INDEX	0xffff
+#define ICE_INVAL_VFID		256
 
 #define ICE_VSIQF_HKEY_ARRAY_SIZE	((VSIQF_HKEY_MAX_INDEX + 1) *	4)
 
@@ -122,6 +123,7 @@ struct ice_sw {
 enum ice_state {
 	__ICE_DOWN,
 	__ICE_NEEDS_RESTART,
+	__ICE_PREPARED_FOR_RESET,	/* set by driver when prepared */
 	__ICE_RESET_RECOVERY_PENDING,	/* set by driver when reset starts */
 	__ICE_PFR_REQ,			/* set by driver and peers */
 	__ICE_CORER_REQ,		/* set by driver and peers */
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 87b304db9cad..55e8275ce2ee 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1253,6 +1253,7 @@ struct ice_aq_desc {
 		struct ice_aqc_add_txqs add_txqs;
 		struct ice_aqc_dis_txqs dis_txqs;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
+		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
 		struct ice_aqc_alloc_free_res_cmd sw_res_ctrl;
 		struct ice_aqc_set_event_mask set_event_mask;
 		struct ice_aqc_get_link_status get_link_status;
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 4c6b1038dc5f..b2bb42def038 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -711,6 +711,8 @@ enum ice_status ice_reset(struct ice_hw *hw, enum ice_reset_req req)
 		ice_debug(hw, ICE_DBG_INIT, "GlobalR requested\n");
 		val = GLGEN_RTRIG_GLOBR_M;
 		break;
+	default:
+		return ICE_ERR_PARAM;
 	}
 
 	val |= rd32(hw, GLGEN_RTRIG);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 014a2f3ea76c..1ef63bf98cd8 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -32,6 +32,7 @@ static const struct net_device_ops ice_netdev_ops;
 static void ice_pf_dis_all_vsi(struct ice_pf *pf);
 static void ice_rebuild(struct ice_pf *pf);
 static int ice_vsi_release(struct ice_vsi *vsi);
+static void ice_vsi_release_all(struct ice_pf *pf);
 static void ice_update_vsi_stats(struct ice_vsi *vsi);
 static void ice_update_pf_stats(struct ice_pf *pf);
 
@@ -456,23 +457,13 @@ static void
 ice_prepare_for_reset(struct ice_pf *pf)
 {
 	struct ice_hw *hw = &pf->hw;
-	u32 v;
-
-	ice_for_each_vsi(pf, v)
-		if (pf->vsi[v])
-			ice_remove_vsi_fltr(hw, pf->vsi[v]->vsi_num);
-
-	dev_dbg(&pf->pdev->dev, "Tearing down internal switch for reset\n");
 
 	/* disable the VSIs and their queues that are not already DOWN */
-	/* pf_dis_all_vsi modifies netdev structures -rtnl_lock needed */
 	ice_pf_dis_all_vsi(pf);
 
-	ice_for_each_vsi(pf, v)
-		if (pf->vsi[v])
-			pf->vsi[v]->vsi_num = 0;
-
 	ice_shutdown_all_ctrlq(hw);
+
+	set_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 }
 
 /**
@@ -490,26 +481,32 @@ static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 	WARN_ON(in_interrupt());
 
 	/* PFR is a bit of a special case because it doesn't result in an OICR
-	 * interrupt. So for PFR, we prepare for reset, issue the reset and
-	 * rebuild sequentially.
+	 * interrupt. Set pending bit here which otherwise gets set in the
+	 * OICR handler.
 	 */
-	if (reset_type == ICE_RESET_PFR) {
+	if (reset_type == ICE_RESET_PFR)
 		set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
-		ice_prepare_for_reset(pf);
-	}
+
+	ice_prepare_for_reset(pf);
 
 	/* trigger the reset */
 	if (ice_reset(hw, reset_type)) {
 		dev_err(dev, "reset %d failed\n", reset_type);
 		set_bit(__ICE_RESET_FAILED, pf->state);
 		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 		return;
 	}
 
+	/* PFR is a bit of a special case because it doesn't result in an OICR
+	 * interrupt. So for PFR, rebuild after the reset and clear the reset-
+	 * associated state bits.
+	 */
 	if (reset_type == ICE_RESET_PFR) {
 		pf->pfr_count++;
 		ice_rebuild(pf);
 		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 	}
 }
 
@@ -519,20 +516,23 @@ static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
  */
 static void ice_reset_subtask(struct ice_pf *pf)
 {
-	enum ice_reset_req reset_type;
-
-	rtnl_lock();
+	enum ice_reset_req reset_type = ICE_RESET_INVAL;
 
 	/* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an
-	 * OICR interrupt. The OICR handler (ice_misc_intr) determines what
-	 * type of reset happened and sets __ICE_RESET_RECOVERY_PENDING bit in
-	 * pf->state. So if reset/recovery is pending (as indicated by this bit)
-	 * we do a rebuild and return.
+	 * OICR interrupt. The OICR handler (ice_misc_intr) determines what type
+	 * of reset is pending and sets bits in pf->state indicating the reset
+	 * type and __ICE_RESET_RECOVERY_PENDING.  So, if the latter bit is set
+	 * prepare for pending reset if not already (for PF software-initiated
+	 * global resets the software should already be prepared for it as
+	 * indicated by __ICE_PREPARED_FOR_RESET; for global resets initiated
+	 * by firmware or software on other PFs, that bit is not set so prepare
+	 * for the reset now), poll for reset done, rebuild and return.
 	 */
 	if (ice_is_reset_recovery_pending(pf->state)) {
 		clear_bit(__ICE_GLOBR_RECV, pf->state);
 		clear_bit(__ICE_CORER_RECV, pf->state);
-		ice_prepare_for_reset(pf);
+		if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state))
+			ice_prepare_for_reset(pf);
 
 		/* make sure we are ready to rebuild */
 		if (ice_check_reset(&pf->hw)) {
@@ -541,29 +541,32 @@ static void ice_reset_subtask(struct ice_pf *pf)
 			/* done with reset. start rebuild */
 			pf->hw.reset_ongoing = false;
 			ice_rebuild(pf);
+			/* clear bit to resume normal operations, but
+			 * ICE_NEEDS_RESTART bit is set incase rebuild failed
+			 */
+			clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+			clear_bit(__ICE_PREPARED_FOR_RESET, pf->state);
 		}
-		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
-		goto unlock;
+
+		return;
 	}
 
 	/* No pending resets to finish processing. Check for new resets */
+	if (test_and_clear_bit(__ICE_PFR_REQ, pf->state))
+		reset_type = ICE_RESET_PFR;
+	if (test_and_clear_bit(__ICE_CORER_REQ, pf->state))
+		reset_type = ICE_RESET_CORER;
 	if (test_and_clear_bit(__ICE_GLOBR_REQ, pf->state))
 		reset_type = ICE_RESET_GLOBR;
-	else if (test_and_clear_bit(__ICE_CORER_REQ, pf->state))
-		reset_type = ICE_RESET_CORER;
-	else if (test_and_clear_bit(__ICE_PFR_REQ, pf->state))
-		reset_type = ICE_RESET_PFR;
-	else
-		goto unlock;
+	/* If no valid reset type requested just return */
+	if (reset_type == ICE_RESET_INVAL)
+		return;
 
-	/* reset if not already down or resetting */
+	/* reset if not already down or busy */
 	if (!test_bit(__ICE_DOWN, pf->state) &&
 	    !test_bit(__ICE_CFG_BUSY, pf->state)) {
 		ice_do_reset(pf, reset_type);
 	}
-
-unlock:
-	rtnl_unlock();
 }
 
 /**
@@ -970,7 +973,8 @@ static void ice_clean_adminq_subtask(struct ice_pf *pf)
 static void ice_service_task_schedule(struct ice_pf *pf)
 {
 	if (!test_bit(__ICE_DOWN, pf->state) &&
-	    !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state))
+	    !test_and_set_bit(__ICE_SERVICE_SCHED, pf->state) &&
+	    !test_bit(__ICE_NEEDS_RESTART, pf->state))
 		queue_work(ice_wq, &pf->serv_task);
 }
 
@@ -1013,9 +1017,10 @@ static void ice_service_task(struct work_struct *work)
 	/* process reset requests first */
 	ice_reset_subtask(pf);
 
-	/* bail if a reset/recovery cycle is pending */
+	/* bail if a reset/recovery cycle is pending or rebuild failed */
 	if (ice_is_reset_recovery_pending(pf->state) ||
-	    test_bit(__ICE_SUSPENDED, pf->state)) {
+	    test_bit(__ICE_SUSPENDED, pf->state) ||
+	    test_bit(__ICE_NEEDS_RESTART, pf->state)) {
 		ice_service_task_complete(pf);
 		return;
 	}
@@ -1160,7 +1165,7 @@ static void ice_vsi_delete(struct ice_vsi *vsi)
 
 	memcpy(&ctxt.info, &vsi->info, sizeof(struct ice_aqc_vsi_props));
 
-	status = ice_aq_free_vsi(&pf->hw, &ctxt, false, NULL);
+	status = ice_free_vsi(&pf->hw, vsi->idx, &ctxt, false, NULL);
 	if (status)
 		dev_err(&pf->pdev->dev, "Failed to delete VSI %i in FW\n",
 			vsi->vsi_num);
@@ -1423,13 +1428,13 @@ static void ice_set_rss_vsi_ctx(struct ice_vsi_ctx *ctxt, struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_add - Create a new VSI or fetch preallocated VSI
+ * ice_vsi_init - Create and initialize a VSI
  * @vsi: the VSI being configured
  *
  * This initializes a VSI context depending on the VSI type to be added and
  * passes it down to the add_vsi aq command to create a new VSI.
  */
-static int ice_vsi_add(struct ice_vsi *vsi)
+static int ice_vsi_init(struct ice_vsi *vsi)
 {
 	struct ice_vsi_ctx ctxt = { 0 };
 	struct ice_pf *pf = vsi->back;
@@ -1456,13 +1461,17 @@ static int ice_vsi_add(struct ice_vsi *vsi)
 	ctxt.info.sw_id = vsi->port_info->sw_id;
 	ice_vsi_setup_q_map(vsi, &ctxt);
 
-	ret = ice_aq_add_vsi(hw, &ctxt, NULL);
+	ret = ice_add_vsi(hw, vsi->idx, &ctxt, NULL);
 	if (ret) {
-		dev_err(&vsi->back->pdev->dev,
-			"Add VSI AQ call failed, err %d\n", ret);
+		dev_err(&pf->pdev->dev,
+			"Add VSI failed, err %d\n", ret);
 		return -EIO;
 	}
+
+	/* keep context for update VSI operations */
 	vsi->info = ctxt.info;
+
+	/* record VSI number returned */
 	vsi->vsi_num = ctxt.vsi_num;
 
 	return ret;
@@ -2652,14 +2661,12 @@ static int ice_vsi_cfg_rss(struct ice_vsi *vsi)
 }
 
 /**
- * ice_vsi_reinit_setup - return resource and reallocate resource for a VSI
- * @vsi: pointer to the ice_vsi
- *
- * This reallocates the VSIs queue resources
+ * ice_vsi_rebuild - Rebuild VSI after reset
+ * @vsi: vsi to be rebuild
  *
  * Returns 0 on success and negative value on failure
  */
-static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
+static int ice_vsi_rebuild(struct ice_vsi *vsi)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	int ret, i;
@@ -2675,7 +2682,7 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 	ice_vsi_set_num_qs(vsi);
 
 	/* Initialize VSI struct elements and create VSI in FW */
-	ret = ice_vsi_add(vsi);
+	ret = ice_vsi_init(vsi);
 	if (ret < 0)
 		goto err_vsi;
 
@@ -2685,19 +2692,7 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 
 	switch (vsi->type) {
 	case ICE_VSI_PF:
-		if (!vsi->netdev) {
-			ret = ice_cfg_netdev(vsi);
-			if (ret)
-				goto err_rings;
-
-			ret = register_netdev(vsi->netdev);
-			if (ret)
-				goto err_rings;
-
-			netif_carrier_off(vsi->netdev);
-			netif_tx_stop_all_queues(vsi->netdev);
-		}
-
+		/* fall through */
 		ret = ice_vsi_alloc_q_vectors(vsi);
 		if (ret)
 			goto err_rings;
@@ -2749,21 +2744,23 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 /**
  * ice_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
- * @type: VSI type
  * @pi: pointer to the port_info instance
+ * @type: VSI type
+ * @vf_id: defines VF id to which this VSI connects. This field is meant to be
+ *         used only for ICE_VSI_VF VSI type. For other VSI types, should
+ *         fill-in ICE_INVAL_VFID as input.
  *
  * This allocates the sw VSI structure and its queue resources.
  *
- * Returns pointer to the successfully allocated and configure VSI sw struct on
- * success, otherwise returns NULL on failure.
+ * Returns pointer to the successfully allocated and configured VSI sw struct on
+ * success, NULL on failure.
  */
 static struct ice_vsi *
-ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
-	      struct ice_port_info *pi)
+ice_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi,
+	      enum ice_vsi_type type, u16 __always_unused vf_id)
 {
 	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	struct device *dev = &pf->pdev->dev;
-	struct ice_vsi_ctx ctxt = { 0 };
 	struct ice_vsi *vsi;
 	int ret, i;
 
@@ -2786,12 +2783,10 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 	ice_vsi_set_rss_params(vsi);
 
 	/* create the VSI */
-	ret = ice_vsi_add(vsi);
+	ret = ice_vsi_init(vsi);
 	if (ret)
 		goto err_vsi;
 
-	ctxt.vsi_num = vsi->vsi_num;
-
 	switch (vsi->type) {
 	case ICE_VSI_PF:
 		ret = ice_cfg_netdev(vsi);
@@ -2860,10 +2855,7 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 		vsi->netdev = NULL;
 	}
 err_cfg_netdev:
-	ret = ice_aq_free_vsi(&pf->hw, &ctxt, false, NULL);
-	if (ret)
-		dev_err(&vsi->back->pdev->dev,
-			"Free VSI AQ call failed, err %d\n", ret);
+	ice_vsi_delete(vsi);
 err_vsi:
 	ice_vsi_put_qs(vsi);
 err_get_qs:
@@ -2874,6 +2866,20 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 	return NULL;
 }
 
+/**
+ * ice_pf_vsi_setup - Set up a PF VSI
+ * @pf: board private structure
+ * @pi: pointer to the port_info instance
+ *
+ * Returns pointer to the successfully allocated VSI sw struct on success,
+ * otherwise returns NULL on failure.
+ */
+static struct ice_vsi *
+ice_pf_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi)
+{
+	return ice_vsi_setup(pf, pi, ICE_VSI_PF, ICE_INVAL_VFID);
+}
+
 /**
  * ice_vsi_add_vlan - Add vsi membership for given vlan
  * @vsi: the vsi being configured
@@ -3021,50 +3027,48 @@ static int ice_setup_pf_sw(struct ice_pf *pf)
 	struct ice_vsi *vsi;
 	int status = 0;
 
-	if (!ice_is_reset_recovery_pending(pf->state)) {
-		vsi = ice_vsi_setup(pf, ICE_VSI_PF, pf->hw.port_info);
-		if (!vsi) {
-			status = -ENOMEM;
-			goto error_exit;
-		}
-	} else {
-		vsi = pf->vsi[0];
-		status = ice_vsi_reinit_setup(vsi);
-		if (status < 0)
-			return -EIO;
+	if (ice_is_reset_recovery_pending(pf->state))
+		return -EBUSY;
+
+	vsi = ice_pf_vsi_setup(pf, pf->hw.port_info);
+	if (!vsi) {
+		status = -ENOMEM;
+		goto unroll_vsi_setup;
 	}
 
-	/* tmp_add_list contains a list of MAC addresses for which MAC
-	 * filters need to be programmed. Add the VSI's unicast MAC to
-	 * this list
+	/* To add a MAC filter, first add the MAC to a list and then
+	 * pass the list to ice_add_mac.
 	 */
+
+	 /* Add a unicast MAC filter so the VSI can get its packets */
 	status = ice_add_mac_to_list(vsi, &tmp_add_list,
 				     vsi->port_info->mac.perm_addr);
 	if (status)
-		goto error_exit;
+		goto unroll_vsi_setup;
 
 	/* VSI needs to receive broadcast traffic, so add the broadcast
-	 * MAC address to the list.
+	 * MAC address to the list as well.
 	 */
 	eth_broadcast_addr(broadcast);
 	status = ice_add_mac_to_list(vsi, &tmp_add_list, broadcast);
 	if (status)
-		goto error_exit;
+		goto free_mac_list;
 
 	/* program MAC filters for entries in tmp_add_list */
 	status = ice_add_mac(&pf->hw, &tmp_add_list);
 	if (status) {
 		dev_err(&pf->pdev->dev, "Could not add MAC filters\n");
 		status = -ENOMEM;
-		goto error_exit;
+		goto free_mac_list;
 	}
 
 	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
 	return status;
 
-error_exit:
+free_mac_list:
 	ice_free_fltr_list(&pf->pdev->dev, &tmp_add_list);
 
+unroll_vsi_setup:
 	if (vsi) {
 		ice_vsi_free_q_vectors(vsi);
 		if (vsi->netdev && vsi->netdev->reg_state == NETREG_REGISTERED)
@@ -3453,24 +3457,13 @@ static int ice_probe(struct pci_dev *pdev,
 static void ice_remove(struct pci_dev *pdev)
 {
 	struct ice_pf *pf = pci_get_drvdata(pdev);
-	int i = 0;
-	int err;
 
 	if (!pf)
 		return;
 
 	set_bit(__ICE_DOWN, pf->state);
 
-	for (i = 0; i < pf->num_alloc_vsi; i++) {
-		if (!pf->vsi[i])
-			continue;
-
-		err = ice_vsi_release(pf->vsi[i]);
-		if (err)
-			dev_dbg(&pf->pdev->dev, "Failed to release VSI index %d (err %d)\n",
-				i, err);
-	}
-
+	ice_vsi_release_all(pf);
 	ice_free_irq_msix_misc(pf);
 	ice_clear_interrupt_scheme(pf);
 	ice_deinit_pf(pf);
@@ -3517,7 +3510,7 @@ static int __init ice_module_init(void)
 	pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver);
 	pr_info("%s\n", ice_copyright);
 
-	ice_wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, KBUILD_MODNAME);
+	ice_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, KBUILD_MODNAME);
 	if (!ice_wq) {
 		pr_err("Failed to create workqueue\n");
 		return -ENOMEM;
@@ -5104,8 +5097,14 @@ static int ice_vsi_release(struct ice_vsi *vsi)
 	if (!vsi->back)
 		return -ENODEV;
 	pf = vsi->back;
-
-	if (vsi->netdev) {
+	/* do not unregister and free netdevs while driver is in the reset
+	 * recovery pending state. Since reset/rebuild happens through PF
+	 * service task workqueue, its not a good idea to unregister netdev
+	 * that is associated to the PF that is running the work queue items
+	 * currently. This is done to avoid check_flush_dependency() warning
+	 * on this wq
+	 */
+	if (vsi->netdev && !ice_is_reset_recovery_pending(pf->state)) {
 		unregister_netdev(vsi->netdev);
 		free_netdev(vsi->netdev);
 		vsi->netdev = NULL;
@@ -5131,11 +5130,39 @@ static int ice_vsi_release(struct ice_vsi *vsi)
 	pf->q_left_tx += vsi->alloc_txq;
 	pf->q_left_rx += vsi->alloc_rxq;
 
-	ice_vsi_clear(vsi);
+	/* retain SW VSI data structure since it is needed to unregister and
+	 * free VSI netdev when PF is not in reset recovery pending state,\
+	 * for ex: during rmmod.
+	 */
+	if (!ice_is_reset_recovery_pending(pf->state))
+		ice_vsi_clear(vsi);
 
 	return 0;
 }
 
+/**
+ * ice_vsi_release_all - Delete all VSIs
+ * @pf: PF from which all VSIs are being removed
+ */
+static void ice_vsi_release_all(struct ice_pf *pf)
+{
+	int err, i;
+
+	if (!pf->vsi)
+		return;
+
+	for (i = 0; i < pf->num_alloc_vsi; i++) {
+		if (!pf->vsi[i])
+			continue;
+
+		err = ice_vsi_release(pf->vsi[i]);
+		if (err)
+			dev_dbg(&pf->pdev->dev,
+				"Failed to release pf->vsi[%d], err %d, vsi_num = %d\n",
+				i, err, pf->vsi[i]->vsi_num);
+	}
+}
+
 /**
  * ice_dis_vsi - pause a VSI
  * @vsi: the VSI being paused
@@ -5148,27 +5175,31 @@ static void ice_dis_vsi(struct ice_vsi *vsi)
 	set_bit(__ICE_NEEDS_RESTART, vsi->state);
 
 	if (vsi->netdev && netif_running(vsi->netdev) &&
-	    vsi->type == ICE_VSI_PF)
+	    vsi->type == ICE_VSI_PF) {
+		rtnl_lock();
 		vsi->netdev->netdev_ops->ndo_stop(vsi->netdev);
-
-	ice_vsi_close(vsi);
+		rtnl_unlock();
+	} else {
+		ice_vsi_close(vsi);
+	}
 }
 
 /**
  * ice_ena_vsi - resume a VSI
  * @vsi: the VSI being resume
  */
-static void ice_ena_vsi(struct ice_vsi *vsi)
+static int ice_ena_vsi(struct ice_vsi *vsi)
 {
-	if (!test_and_clear_bit(__ICE_NEEDS_RESTART, vsi->state))
-		return;
+	int err = 0;
+
+	if (test_and_clear_bit(__ICE_NEEDS_RESTART, vsi->state))
+		if (vsi->netdev && netif_running(vsi->netdev)) {
+			rtnl_lock();
+			err = vsi->netdev->netdev_ops->ndo_open(vsi->netdev);
+			rtnl_unlock();
+		}
 
-	if (vsi->netdev && netif_running(vsi->netdev))
-		vsi->netdev->netdev_ops->ndo_open(vsi->netdev);
-	else if (ice_vsi_open(vsi))
-		/* this clears the DOWN bit */
-		dev_dbg(&vsi->back->pdev->dev, "Failed open VSI 0x%04X on switch 0x%04X\n",
-			vsi->vsi_num, vsi->vsw->sw_id);
+	return err;
 }
 
 /**
@@ -5188,13 +5219,47 @@ static void ice_pf_dis_all_vsi(struct ice_pf *pf)
  * ice_pf_ena_all_vsi - Resume all VSIs on a PF
  * @pf: the PF
  */
-static void ice_pf_ena_all_vsi(struct ice_pf *pf)
+static int ice_pf_ena_all_vsi(struct ice_pf *pf)
 {
 	int v;
 
 	ice_for_each_vsi(pf, v)
 		if (pf->vsi[v])
-			ice_ena_vsi(pf->vsi[v]);
+			if (ice_ena_vsi(pf->vsi[v]))
+				return -EIO;
+
+	return 0;
+}
+
+/**
+ * ice_vsi_rebuild_all - rebuild all VSIs in pf
+ * @pf: the PF
+ */
+static int ice_vsi_rebuild_all(struct ice_pf *pf)
+{
+	int i;
+
+	/* loop through pf->vsi array and reinit the VSI if found */
+	for (i = 0; i < pf->num_alloc_vsi; i++) {
+		int err;
+
+		if (!pf->vsi[i])
+			continue;
+
+		err = ice_vsi_rebuild(pf->vsi[i]);
+		if (err) {
+			dev_err(&pf->pdev->dev,
+				"VSI at index %d rebuild failed\n",
+				pf->vsi[i]->idx);
+			return err;
+		}
+
+		dev_info(&pf->pdev->dev,
+			 "VSI at index %d rebuilt. vsi_num = 0x%x\n",
+			 pf->vsi[i]->idx, pf->vsi[i]->vsi_num);
+	}
+
+	return 0;
 }
 
 /**
@@ -5216,13 +5281,13 @@ static void ice_rebuild(struct ice_pf *pf)
 	ret = ice_init_all_ctrlq(hw);
 	if (ret) {
 		dev_err(dev, "control queues init failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
 	ret = ice_clear_pf_cfg(hw);
 	if (ret) {
 		dev_err(dev, "clear PF configuration failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
 	ice_clear_pxe_mode(hw);
@@ -5230,14 +5295,24 @@ static void ice_rebuild(struct ice_pf *pf)
 	ret = ice_get_caps(hw);
 	if (ret) {
 		dev_err(dev, "ice_get_caps failed %d\n", ret);
-		goto fail_reset;
+		goto err_init_ctrlq;
 	}
 
-	/* basic nic switch setup */
-	err = ice_setup_pf_sw(pf);
+	err = ice_sched_init_port(hw->port_info);
+	if (err)
+		goto err_sched_init_port;
+
+	err = ice_vsi_rebuild_all(pf);
 	if (err) {
-		dev_err(dev, "ice_setup_pf_sw failed\n");
-		goto fail_reset;
+		dev_err(dev, "ice_vsi_rebuild_all failed\n");
+		goto err_vsi_rebuild;
+	}
+
+	ret = ice_replay_all_fltr(&pf->hw);
+	if (ret) {
+		dev_err(&pf->pdev->dev,
+			"error replaying switch filter rules\n");
+		goto err_vsi_rebuild;
 	}
 
 	/* start misc vector */
@@ -5245,20 +5320,35 @@ static void ice_rebuild(struct ice_pf *pf)
 		err = ice_req_irq_msix_misc(pf);
 		if (err) {
 			dev_err(dev, "misc vector setup failed: %d\n", err);
-			goto fail_reset;
+			goto err_vsi_rebuild;
 		}
 	}
 
 	/* restart the VSIs that were rebuilt and running before the reset */
-	ice_pf_ena_all_vsi(pf);
+	err = ice_pf_ena_all_vsi(pf);
+	if (err) {
+		dev_err(&pf->pdev->dev, "error enabling VSIs\n");
+		/* no need to disable VSIs in tear down path in ice_rebuild()
+		 * since its already taken care in ice_vsi_open()
+		 */
+		goto err_vsi_rebuild;
+	}
 
+	/* if we get here, reset flow is successful */
+	clear_bit(__ICE_RESET_FAILED, pf->state);
 	return;
 
-fail_reset:
+err_vsi_rebuild:
+	ice_vsi_release_all(pf);
+err_sched_init_port:
+	ice_sched_cleanup_all(hw);
+err_init_ctrlq:
 	ice_shutdown_all_ctrlq(hw);
 	set_bit(__ICE_RESET_FAILED, pf->state);
 clear_recovery:
-	set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+	/* set this bit in PF state to control service task scheduling */
+	set_bit(__ICE_NEEDS_RESTART, pf->state);
+	dev_err(dev, "Rebuild failed, unload and reload driver\n");
 }
 
 /**
@@ -5431,6 +5521,11 @@ static int ice_open(struct net_device *netdev)
 	struct ice_vsi *vsi = np->vsi;
 	int err;
 
+	if (test_bit(__ICE_NEEDS_RESTART, vsi->back->state)) {
+		netdev_err(netdev, "driver needs to be unloaded and reloaded\n");
+		return -EIO;
+	}
+
 	netif_carrier_off(netdev);
 
 	err = ice_vsi_open(vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 2693bebef977..ac66c96a123d 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -169,17 +169,17 @@ ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf,
  *
  * Add a VSI context to the hardware (0x0210)
  */
-enum ice_status
+static enum ice_status
 ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	       struct ice_sq_cd *cd)
 {
 	struct ice_aqc_add_update_free_vsi_resp *res;
 	struct ice_aqc_add_get_update_free_vsi *cmd;
-	enum ice_status status;
 	struct ice_aq_desc desc;
+	enum ice_status status;
 
 	cmd = &desc.params.vsi_cmd;
-	res = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
+	res = &desc.params.add_update_free_vsi_res;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_vsi);
 
@@ -203,6 +203,42 @@ ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	return status;
 }
 
+/**
+ * ice_aq_free_vsi
+ * @hw: pointer to the hw struct
+ * @vsi_ctx: pointer to a VSI context struct
+ * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
+ * @cd: pointer to command details structure or NULL
+ *
+ * Free VSI context info from hardware (0x0213)
+ */
+static enum ice_status
+ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
+		bool keep_vsi_alloc, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_add_update_free_vsi_resp *resp;
+	struct ice_aqc_add_get_update_free_vsi *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.vsi_cmd;
+	resp = &desc.params.add_update_free_vsi_res;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_free_vsi);
+
+	cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID);
+	if (keep_vsi_alloc)
+		cmd->cmd_flags = cpu_to_le16(ICE_AQ_VSI_KEEP_ALLOC);
+
+	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+	if (!status) {
+		vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used);
+		vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free);
+	}
+
+	return status;
+}
+
 /**
  * ice_aq_update_vsi
  * @hw: pointer to the hw struct
@@ -221,7 +257,7 @@ ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 	enum ice_status status;
 
 	cmd = &desc.params.vsi_cmd;
-	resp = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
+	resp = &desc.params.add_update_free_vsi_res;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_vsi);
 
@@ -241,38 +277,202 @@ ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 }
 
 /**
- * ice_aq_free_vsi
+ * ice_update_fltr_vsi_map - update given filter VSI map
+ * @list_head: list for which filters needs to be updated
+ * @list_lock: filter lock which needs to be updated
+ * @old_vsi_num: old VSI HW id
+ * @new_vsi_num: new VSI HW id
+ *
+ * update the VSI map for a given filter list
+ */
+static void
+ice_update_fltr_vsi_map(struct list_head *list_head,
+			struct mutex *list_lock, u16 old_vsi_num,
+			u16 new_vsi_num)
+{
+	struct ice_fltr_mgmt_list_entry *itr;
+
+	mutex_lock(list_lock);
+	if (list_empty(list_head))
+		goto exit_update_map;
+
+	list_for_each_entry(itr, list_head, list_entry) {
+		if (itr->vsi_list_info &&
+		    test_bit(old_vsi_num, itr->vsi_list_info->vsi_map)) {
+			clear_bit(old_vsi_num, itr->vsi_list_info->vsi_map);
+			set_bit(new_vsi_num, itr->vsi_list_info->vsi_map);
+		} else if (itr->fltr_info.fltr_act == ICE_FWD_TO_VSI &&
+			   itr->fltr_info.fwd_id.vsi_id == old_vsi_num) {
+			itr->fltr_info.fwd_id.vsi_id = new_vsi_num;
+			itr->fltr_info.src = new_vsi_num;
+		}
+	}
+exit_update_map:
+	mutex_unlock(list_lock);
+}
+
+/**
+ * ice_update_all_fltr_vsi_map - update all filters VSI map
+ * @hw: pointer to the hardware structure
+ * @old_vsi_num: old VSI HW id
+ * @new_vsi_num: new VSI HW id
+ *
+ * update all filters VSI map
+ */
+static void
+ice_update_all_fltr_vsi_map(struct ice_hw *hw, u16 old_vsi_num, u16 new_vsi_num)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	u8 i;
+
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		struct list_head *head = &sw->recp_list[i].filt_rules;
+		struct mutex *lock; /* Lock to protect filter rule list */
+
+		lock = &sw->recp_list[i].filt_rule_lock;
+		ice_update_fltr_vsi_map(head, lock, old_vsi_num,
+					new_vsi_num);
+	}
+}
+
+/**
+ * ice_is_vsi_valid - check whether the VSI is valid or not
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * check whether the VSI is valid or not
+ */
+static bool ice_is_vsi_valid(struct ice_hw *hw, u16 vsi_handle)
+{
+	return vsi_handle < ICE_MAX_VSI && hw->vsi_ctx[vsi_handle];
+}
+
+/**
+ * ice_get_hw_vsi_num - return the hw VSI number
  * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * return the hw VSI number
+ * Caution: call this function only if VSI is valid (ice_is_vsi_valid)
+ */
+static u16 ice_get_hw_vsi_num(struct ice_hw *hw, u16 vsi_handle)
+{
+	return hw->vsi_ctx[vsi_handle]->vsi_num;
+}
+
+/**
+ * ice_get_vsi_ctx - return the VSI context entry for a given VSI handle
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * return the VSI context entry for a given VSI handle
+ */
+static struct ice_vsi_ctx *ice_get_vsi_ctx(struct ice_hw *hw, u16 vsi_handle)
+{
+	return (vsi_handle >= ICE_MAX_VSI) ? NULL : hw->vsi_ctx[vsi_handle];
+}
+
+/**
+ * ice_save_vsi_ctx - save the VSI context for a given VSI handle
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ * @vsi: VSI context pointer
+ *
+ * save the VSI context entry for a given VSI handle
+ */
+static void ice_save_vsi_ctx(struct ice_hw *hw, u16 vsi_handle,
+			     struct ice_vsi_ctx *vsi)
+{
+	hw->vsi_ctx[vsi_handle] = vsi;
+}
+
+/**
+ * ice_clear_vsi_ctx - clear the VSI context entry
+ * @hw: pointer to the hw struct
+ * @vsi_handle: VSI handle
+ *
+ * clear the VSI context entry
+ */
+static void ice_clear_vsi_ctx(struct ice_hw *hw, u16 vsi_handle)
+{
+	struct ice_vsi_ctx *vsi;
+
+	vsi = ice_get_vsi_ctx(hw, vsi_handle);
+	if (vsi) {
+		devm_kfree(ice_hw_to_dev(hw), vsi);
+		hw->vsi_ctx[vsi_handle] = NULL;
+	}
+}
+
+/**
+ * ice_add_vsi - add VSI context to the hardware and VSI handle list
+ * @hw: pointer to the hw struct
+ * @vsi_handle: unique VSI handle provided by drivers
  * @vsi_ctx: pointer to a VSI context struct
- * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
  * @cd: pointer to command details structure or NULL
  *
- * Get VSI context info from hardware (0x0213)
+ * Add a VSI context to the hardware also add it into the VSI handle list.
+ * If this function gets called after reset for existing VSIs then update
+ * with the new HW VSI number in the corresponding VSI handle list entry.
  */
 enum ice_status
-ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-		bool keep_vsi_alloc, struct ice_sq_cd *cd)
+ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	    struct ice_sq_cd *cd)
 {
-	struct ice_aqc_add_update_free_vsi_resp *resp;
-	struct ice_aqc_add_get_update_free_vsi *cmd;
-	struct ice_aq_desc desc;
+	struct ice_vsi_ctx *tmp_vsi_ctx;
 	enum ice_status status;
 
-	cmd = &desc.params.vsi_cmd;
-	resp = (struct ice_aqc_add_update_free_vsi_resp *)&desc.params.raw;
-
-	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_free_vsi);
+	if (vsi_handle >= ICE_MAX_VSI)
+		return ICE_ERR_PARAM;
+	status = ice_aq_add_vsi(hw, vsi_ctx, cd);
+	if (status)
+		return status;
+	tmp_vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle);
+	if (!tmp_vsi_ctx) {
+		/* Create a new vsi context */
+		tmp_vsi_ctx = devm_kzalloc(ice_hw_to_dev(hw),
+					   sizeof(*tmp_vsi_ctx), GFP_KERNEL);
+		if (!tmp_vsi_ctx) {
+			ice_aq_free_vsi(hw, vsi_ctx, false, cd);
+			return ICE_ERR_NO_MEMORY;
+		}
+		*tmp_vsi_ctx = *vsi_ctx;
+		ice_save_vsi_ctx(hw, vsi_handle, tmp_vsi_ctx);
+	} else {
+		/* update with new HW VSI num */
+		if (tmp_vsi_ctx->vsi_num != vsi_ctx->vsi_num) {
+			/* update all filter lists with new HW VSI num */
+			ice_update_all_fltr_vsi_map(hw, tmp_vsi_ctx->vsi_num,
+						    vsi_ctx->vsi_num);
+			tmp_vsi_ctx->vsi_num = vsi_ctx->vsi_num;
+		}
+	}
 
-	cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID);
-	if (keep_vsi_alloc)
-		cmd->cmd_flags = cpu_to_le16(ICE_AQ_VSI_KEEP_ALLOC);
+	return status;
+}
 
-	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
-	if (!status) {
-		vsi_ctx->vsis_allocd = le16_to_cpu(resp->vsi_used);
-		vsi_ctx->vsis_unallocated = le16_to_cpu(resp->vsi_free);
-	}
+/**
+ * ice_free_vsi- free VSI context from hardware and VSI handle list
+ * @hw: pointer to the hw struct
+ * @vsi_handle: unique VSI handle
+ * @vsi_ctx: pointer to a VSI context struct
+ * @keep_vsi_alloc: keep VSI allocation as part of this PF's resources
+ * @cd: pointer to command details structure or NULL
+ *
+ * Free VSI context info from hardware as well as from VSI handle list
+ */
+enum ice_status
+ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	     bool keep_vsi_alloc, struct ice_sq_cd *cd)
+{
+	enum ice_status status;
 
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+	vsi_ctx->vsi_num = ice_get_hw_vsi_num(hw, vsi_handle);
+	status = ice_aq_free_vsi(hw, vsi_ctx, keep_vsi_alloc, cd);
+	if (!status)
+		ice_clear_vsi_ctx(hw, vsi_handle);
 	return status;
 }
 
@@ -1516,6 +1716,25 @@ ice_add_vlan(struct ice_hw *hw, struct list_head *v_list)
 	return 0;
 }
 
+/**
+ * ice_rem_sw_rule_info
+ * @hw: pointer to the hardware structure
+ * @rule_head: pointer to the switch list structure that we want to delete
+ */
+static void
+ice_rem_sw_rule_info(struct ice_hw *hw, struct list_head *rule_head)
+{
+	if (!list_empty(rule_head)) {
+		struct ice_fltr_mgmt_list_entry *entry;
+		struct ice_fltr_mgmt_list_entry *tmp;
+
+		list_for_each_entry_safe(entry, tmp, rule_head, list_entry) {
+			list_del(&entry->list_entry);
+			devm_kfree(ice_hw_to_dev(hw), entry);
+		}
+	}
+}
+
 /**
  * ice_cfg_dflt_vsi - change state of VSI to set/clear default
  * @hw: pointer to the hardware structure
@@ -1822,3 +2041,89 @@ void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_id)
 	ice_remove_vsi_lkup_fltr(hw, vsi_id, ICE_SW_LKUP_ETHERTYPE_MAC);
 	ice_remove_vsi_lkup_fltr(hw, vsi_id, ICE_SW_LKUP_PROMISC_VLAN);
 }
+
+/**
+ * ice_replay_fltr - Replay all the filters stored by a specific list head
+ * @hw: pointer to the hardware structure
+ * @list_head: list for which filters needs to be replayed
+ * @recp_id: Recipe id for which rules need to be replayed
+ */
+static enum ice_status
+ice_replay_fltr(struct ice_hw *hw, u8 recp_id, struct list_head *list_head)
+{
+	struct ice_fltr_mgmt_list_entry *itr;
+	struct list_head l_head;
+	enum ice_status status = 0;
+
+	if (list_empty(list_head))
+		return status;
+
+	/* Move entries from the given list_head to a temporary l_head so that
+	 * they can be replayed. Otherwise when trying to re-add the same
+	 * filter, the function will return already exists
+	 */
+	list_replace_init(list_head, &l_head);
+
+	/* Mark the given list_head empty by reinitializing it so filters
+	 * could be added again by *handler
+	 */
+	list_for_each_entry(itr, &l_head, list_entry) {
+		struct ice_fltr_list_entry f_entry;
+
+		f_entry.fltr_info = itr->fltr_info;
+		if (itr->vsi_count < 2 && recp_id != ICE_SW_LKUP_VLAN) {
+			status = ice_add_rule_internal(hw, recp_id, &f_entry);
+			if (status)
+				goto end;
+			continue;
+		}
+
+		/* Add a filter per vsi separately */
+		while (1) {
+			u16 vsi;
+
+			vsi = find_first_bit(itr->vsi_list_info->vsi_map,
+					     ICE_MAX_VSI);
+			if (vsi == ICE_MAX_VSI)
+				break;
+
+			clear_bit(vsi, itr->vsi_list_info->vsi_map);
+			f_entry.fltr_info.fwd_id.vsi_id = vsi;
+			f_entry.fltr_info.fltr_act = ICE_FWD_TO_VSI;
+			if (recp_id == ICE_SW_LKUP_VLAN)
+				status = ice_add_vlan_internal(hw, &f_entry);
+			else
+				status = ice_add_rule_internal(hw, recp_id,
+							       &f_entry);
+			if (status)
+				goto end;
+		}
+	}
+end:
+	/* Clear the filter management list */
+	ice_rem_sw_rule_info(hw, &l_head);
+	return status;
+}
+
+/**
+ * ice_replay_all_fltr - replay all filters stored in bookkeeping lists
+ * @hw: pointer to the hardware structure
+ *
+ * NOTE: This function does not clean up partially added filters on error.
+ * It is up to caller of the function to issue a reset or fail early.
+ */
+enum ice_status ice_replay_all_fltr(struct ice_hw *hw)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	enum ice_status status = 0;
+	u8 i;
+
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		struct list_head *head = &sw->recp_list[i].filt_rules;
+
+		status = ice_replay_fltr(hw, i, head);
+		if (status)
+			return status;
+	}
+	return status;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 09a3b6d6541a..11a481644dac 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -158,15 +158,14 @@ struct ice_fltr_mgmt_list_entry {
 
 /* VSI related commands */
 enum ice_status
-ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-	       struct ice_sq_cd *cd);
-enum ice_status
 ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 		  struct ice_sq_cd *cd);
 enum ice_status
-ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
-		bool keep_vsi_alloc, struct ice_sq_cd *cd);
-
+ice_add_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	    struct ice_sq_cd *cd);
+enum ice_status
+ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
+	     bool keep_vsi_alloc, struct ice_sq_cd *cd);
 enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw);
 
 /* Switch/bridge related commands */
@@ -177,6 +176,9 @@ enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *m_list);
 enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list);
 enum ice_status
 ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction);
+
+enum ice_status ice_replay_all_fltr(struct ice_hw *hw);
+
 enum ice_status ice_init_def_sw_recp(struct ice_hw *hw);
 
 #endif /* _ICE_SWITCH_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 473d82fdb570..8b7efa87ce56 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -149,9 +149,10 @@ struct ice_mac_info {
 
 /* Various RESET request, These are not tied with HW reset types */
 enum ice_reset_req {
-	ICE_RESET_PFR	= 0,
-	ICE_RESET_CORER	= 1,
-	ICE_RESET_GLOBR	= 2,
+	ICE_RESET_INVAL	= 0,
+	ICE_RESET_PFR	= 1,
+	ICE_RESET_CORER	= 2,
+	ICE_RESET_GLOBR	= 3,
 };
 
 /* Bus parameters */
@@ -283,6 +284,7 @@ struct ice_hw {
 	u8 sw_entry_point_layer;
 	u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM];
 
+	struct ice_vsi_ctx *vsi_ctx[ICE_MAX_VSI];
 	u8 evb_veb;		/* true for VEB, false for VEPA */
 	u8 reset_ongoing;	/* true if hw is in reset, false otherwise */
 	struct ice_bus_info bus;
-- 
2.17.1

^ permalink raw reply related

* [net-next 11/15] ice: Implement ice_bridge_getlink and ice_bridge_setlink
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Md Fahad Iqbal Polash, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>

ice_bridge_getlink returns the current bridge mode using
ndo_dflt_bridge_getlink and the mode parameter available in
first_switch->bridge_mode.

ice_bridge_setlink is invoked when the bridge mode needs to
changed. The value to be changed to is available as a netlink
message which is parsed in this function. If the mode has to
be changed, switch_flags is set appropriately (set ALLOW_LB
for VEB mode and clear it for VEPA mode) and ice_aq_update_vsi
is called. Also change the unicast switch filter rules.

Signed-off-by: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c   | 140 +++++++++++++++++++-
 drivers/net/ethernet/intel/ice/ice_switch.c |  41 ++++++
 drivers/net/ethernet/intel/ice/ice_switch.h |   1 +
 3 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index fccecb6fa618..cbeae1355593 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -3599,7 +3599,11 @@ static int ice_probe(struct pci_dev *pdev,
 		goto err_msix_misc_unroll;
 	}
 
-	pf->first_sw->bridge_mode = BRIDGE_MODE_VEB;
+	if (hw->evb_veb)
+		pf->first_sw->bridge_mode = BRIDGE_MODE_VEB;
+	else
+		pf->first_sw->bridge_mode = BRIDGE_MODE_VEPA;
+
 	pf->first_sw->pf = pf;
 
 	/* record the sw_id available for later use */
@@ -5695,6 +5699,138 @@ int ice_get_rss(struct ice_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size)
 	return 0;
 }
 
+/**
+ * ice_bridge_getlink - Get the hardware bridge mode
+ * @skb: skb buff
+ * @pid: process id
+ * @seq: RTNL message seq
+ * @dev: the netdev being configured
+ * @filter_mask: filter mask passed in
+ * @nlflags: netlink flags passed in
+ *
+ * Return the bridge mode (VEB/VEPA)
+ */
+static int
+ice_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+		   struct net_device *dev, u32 filter_mask, int nlflags)
+{
+	struct ice_netdev_priv *np = netdev_priv(dev);
+	struct ice_vsi *vsi = np->vsi;
+	struct ice_pf *pf = vsi->back;
+	u16 bmode;
+
+	bmode = pf->first_sw->bridge_mode;
+
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, bmode, 0, 0, nlflags,
+				       filter_mask, NULL);
+}
+
+/**
+ * ice_vsi_update_bridge_mode - Update VSI for switching bridge mode (VEB/VEPA)
+ * @vsi: Pointer to VSI structure
+ * @bmode: Hardware bridge mode (VEB/VEPA)
+ *
+ * Returns 0 on success, negative on failure
+ */
+static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode)
+{
+	struct device *dev = &vsi->back->pdev->dev;
+	struct ice_aqc_vsi_props *vsi_props;
+	struct ice_hw *hw = &vsi->back->hw;
+	struct ice_vsi_ctx ctxt = { 0 };
+	enum ice_status status;
+
+	vsi_props = &vsi->info;
+	ctxt.info = vsi->info;
+
+	if (bmode == BRIDGE_MODE_VEB)
+		/* change from VEPA to VEB mode */
+		ctxt.info.sw_flags |= ICE_AQ_VSI_SW_FLAG_ALLOW_LB;
+	else
+		/* change from VEB to VEPA mode */
+		ctxt.info.sw_flags &= ~ICE_AQ_VSI_SW_FLAG_ALLOW_LB;
+	ctxt.vsi_num = vsi->vsi_num;
+	ctxt.info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_SW_VALID);
+	status = ice_aq_update_vsi(hw, &ctxt, NULL);
+	if (status) {
+		dev_err(dev, "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n",
+			bmode, status, hw->adminq.sq_last_status);
+		return -EIO;
+	}
+	/* Update sw flags for book keeping */
+	vsi_props->sw_flags = ctxt.info.sw_flags;
+
+	return 0;
+}
+
+/**
+ * ice_bridge_setlink - Set the hardware bridge mode
+ * @dev: the netdev being configured
+ * @nlh: RTNL message
+ * @flags: bridge setlink flags
+ *
+ * Sets the bridge mode (VEB/VEPA) of the switch to which the netdev (VSI) is
+ * hooked up to. Iterates through the PF VSI list and sets the loopback mode (if
+ * not already set for all VSIs connected to this switch. And also update the
+ * unicast switch filter rules for the corresponding switch of the netdev.
+ */
+static int
+ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
+		   u16 __always_unused flags)
+{
+	struct ice_netdev_priv *np = netdev_priv(dev);
+	struct ice_pf *pf = np->vsi->back;
+	struct nlattr *attr, *br_spec;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status;
+	struct ice_sw *pf_sw;
+	int rem, v, err = 0;
+
+	pf_sw = pf->first_sw;
+	/* find the attribute in the netlink message */
+	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+
+	nla_for_each_nested(attr, br_spec, rem) {
+		__u16 mode;
+
+		if (nla_type(attr) != IFLA_BRIDGE_MODE)
+			continue;
+		mode = nla_get_u16(attr);
+		if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB)
+			return -EINVAL;
+		/* Continue  if bridge mode is not being flipped */
+		if (mode == pf_sw->bridge_mode)
+			continue;
+		/* Iterates through the PF VSI list and update the loopback
+		 * mode of the VSI
+		 */
+		ice_for_each_vsi(pf, v) {
+			if (!pf->vsi[v])
+				continue;
+			err = ice_vsi_update_bridge_mode(pf->vsi[v], mode);
+			if (err)
+				return err;
+		}
+
+		hw->evb_veb = (mode == BRIDGE_MODE_VEB);
+		/* Update the unicast switch filter rules for the corresponding
+		 * switch of the netdev
+		 */
+		status = ice_update_sw_rule_bridge_mode(hw);
+		if (status) {
+			netdev_err(dev, "update SW_RULE for bridge mode failed,  = %d err %d aq_err %d\n",
+				   mode, status, hw->adminq.sq_last_status);
+			/* revert hw->evb_veb */
+			hw->evb_veb = (pf_sw->bridge_mode == BRIDGE_MODE_VEB);
+			return -EIO;
+		}
+
+		pf_sw->bridge_mode = mode;
+	}
+
+	return 0;
+}
+
 /**
  * ice_tx_timeout - Respond to a Tx Hang
  * @netdev: network interface device structure
@@ -5907,6 +6043,8 @@ static const struct net_device_ops ice_netdev_ops = {
 	.ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid,
 	.ndo_set_features = ice_set_features,
+	.ndo_bridge_getlink = ice_bridge_getlink,
+	.ndo_bridge_setlink = ice_bridge_setlink,
 	.ndo_fdb_add = ice_fdb_add,
 	.ndo_fdb_del = ice_fdb_del,
 	.ndo_tx_timeout = ice_tx_timeout,
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index ac66c96a123d..65b4e1cca6be 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -1130,6 +1130,47 @@ ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info)
 	return status;
 }
 
+/**
+ * ice_update_sw_rule_bridge_mode
+ * @hw: pointer to the hw struct
+ *
+ * Updates unicast switch filter rules based on VEB/VEPA mode
+ */
+enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	struct ice_fltr_mgmt_list_entry *fm_entry;
+	enum ice_status status = 0;
+	struct list_head *rule_head;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+
+	rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock;
+	rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules;
+
+	mutex_lock(rule_lock);
+	list_for_each_entry(fm_entry, rule_head, list_entry) {
+		struct ice_fltr_info *fi = &fm_entry->fltr_info;
+		u8 *addr = fi->l_data.mac.mac_addr;
+
+		/* Update unicast Tx rules to reflect the selected
+		 * VEB/VEPA mode
+		 */
+		if ((fi->flag & ICE_FLTR_TX) && is_unicast_ether_addr(addr) &&
+		    (fi->fltr_act == ICE_FWD_TO_VSI ||
+		     fi->fltr_act == ICE_FWD_TO_VSI_LIST ||
+		     fi->fltr_act == ICE_FWD_TO_Q ||
+		     fi->fltr_act == ICE_FWD_TO_QGRP)) {
+			status = ice_update_pkt_fwd_rule(hw, fi);
+			if (status)
+				break;
+		}
+	}
+
+	mutex_unlock(rule_lock);
+
+	return status;
+}
+
 /**
  * ice_add_update_vsi_list
  * @hw: pointer to the hardware structure
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 11a481644dac..646389ca1238 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -169,6 +169,7 @@ ice_free_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx,
 enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw);
 
 /* Switch/bridge related commands */
+enum ice_status ice_update_sw_rule_bridge_mode(struct ice_hw *hw);
 enum ice_status ice_add_mac(struct ice_hw *hw, struct list_head *m_lst);
 enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst);
 void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_id);
-- 
2.17.1

^ permalink raw reply related

* [net-next 09/15] ice: Clean up register file
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

This patch cleans up the existing register definitions.

1) Several instances of long defines names used in the BIT() macro
   were replaced to use the actual values they represent. As a
   result some defines for shifts (ending with _S) that were used
   only to create bitmasks were removed completely.

2) Apply more consistent tab spacing.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_hw_autogen.h   | 417 ++++++++----------
 1 file changed, 188 insertions(+), 229 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 6076fc87df9d..067ca26a1d94 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -6,251 +6,210 @@
 #ifndef _ICE_HW_AUTOGEN_H_
 #define _ICE_HW_AUTOGEN_H_
 
-#define QTX_COMM_DBELL(_DBQM)		(0x002C0000 + ((_DBQM) * 4))
-#define PF_FW_ARQBAH			0x00080180
-#define PF_FW_ARQBAL			0x00080080
-#define PF_FW_ARQH			0x00080380
-#define PF_FW_ARQH_ARQH_S		0
-#define PF_FW_ARQH_ARQH_M		ICE_M(0x3FF, PF_FW_ARQH_ARQH_S)
-#define PF_FW_ARQLEN			0x00080280
-#define PF_FW_ARQLEN_ARQLEN_S		0
-#define PF_FW_ARQLEN_ARQLEN_M		ICE_M(0x3FF, PF_FW_ARQLEN_ARQLEN_S)
-#define PF_FW_ARQLEN_ARQVFE_S		28
-#define PF_FW_ARQLEN_ARQVFE_M		BIT(PF_FW_ARQLEN_ARQVFE_S)
-#define PF_FW_ARQLEN_ARQOVFL_S		29
-#define PF_FW_ARQLEN_ARQOVFL_M		BIT(PF_FW_ARQLEN_ARQOVFL_S)
-#define PF_FW_ARQLEN_ARQCRIT_S		30
-#define PF_FW_ARQLEN_ARQCRIT_M		BIT(PF_FW_ARQLEN_ARQCRIT_S)
-#define PF_FW_ARQLEN_ARQENABLE_S	31
-#define PF_FW_ARQLEN_ARQENABLE_M	BIT(PF_FW_ARQLEN_ARQENABLE_S)
-#define PF_FW_ARQT			0x00080480
-#define PF_FW_ATQBAH			0x00080100
-#define PF_FW_ATQBAL			0x00080000
-#define PF_FW_ATQH			0x00080300
-#define PF_FW_ATQH_ATQH_S		0
-#define PF_FW_ATQH_ATQH_M		ICE_M(0x3FF, PF_FW_ATQH_ATQH_S)
-#define PF_FW_ATQLEN			0x00080200
-#define PF_FW_ATQLEN_ATQLEN_S		0
-#define PF_FW_ATQLEN_ATQLEN_M		ICE_M(0x3FF, PF_FW_ATQLEN_ATQLEN_S)
-#define PF_FW_ATQLEN_ATQVFE_S		28
-#define PF_FW_ATQLEN_ATQVFE_M		BIT(PF_FW_ATQLEN_ATQVFE_S)
-#define PF_FW_ATQLEN_ATQOVFL_S		29
-#define PF_FW_ATQLEN_ATQOVFL_M		BIT(PF_FW_ATQLEN_ATQOVFL_S)
-#define PF_FW_ATQLEN_ATQCRIT_S		30
-#define PF_FW_ATQLEN_ATQCRIT_M		BIT(PF_FW_ATQLEN_ATQCRIT_S)
-#define PF_FW_ATQLEN_ATQENABLE_S	31
-#define PF_FW_ATQLEN_ATQENABLE_M	BIT(PF_FW_ATQLEN_ATQENABLE_S)
-#define PF_FW_ATQT			0x00080400
-
+#define QTX_COMM_DBELL(_DBQM)			(0x002C0000 + ((_DBQM) * 4))
+#define PF_FW_ARQBAH				0x00080180
+#define PF_FW_ARQBAL				0x00080080
+#define PF_FW_ARQH				0x00080380
+#define PF_FW_ARQH_ARQH_M			ICE_M(0x3FF, 0)
+#define PF_FW_ARQLEN				0x00080280
+#define PF_FW_ARQLEN_ARQLEN_M			ICE_M(0x3FF, 0)
+#define PF_FW_ARQLEN_ARQVFE_M			BIT(28)
+#define PF_FW_ARQLEN_ARQOVFL_M			BIT(29)
+#define PF_FW_ARQLEN_ARQCRIT_M			BIT(30)
+#define PF_FW_ARQLEN_ARQENABLE_M		BIT(31)
+#define PF_FW_ARQT				0x00080480
+#define PF_FW_ATQBAH				0x00080100
+#define PF_FW_ATQBAL				0x00080000
+#define PF_FW_ATQH				0x00080300
+#define PF_FW_ATQH_ATQH_M			ICE_M(0x3FF, 0)
+#define PF_FW_ATQLEN				0x00080200
+#define PF_FW_ATQLEN_ATQLEN_M			ICE_M(0x3FF, 0)
+#define PF_FW_ATQLEN_ATQVFE_M			BIT(28)
+#define PF_FW_ATQLEN_ATQOVFL_M			BIT(29)
+#define PF_FW_ATQLEN_ATQCRIT_M			BIT(30)
+#define PF_FW_ATQLEN_ATQENABLE_M		BIT(31)
+#define PF_FW_ATQT				0x00080400
 #define GLFLXP_RXDID_FLAGS(_i, _j)		(0x0045D000 + ((_i) * 4 + (_j) * 256))
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S	0
-#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M	ICE_M(0x3F, GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S)
+#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M	ICE_M(0x3F, 0)
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S	8
-#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M	ICE_M(0x3F, GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S)
+#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_M	ICE_M(0x3F, 8)
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S	16
-#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M	ICE_M(0x3F, GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_S)
+#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_2_M	ICE_M(0x3F, 16)
 #define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S	24
-#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M	ICE_M(0x3F, GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_S)
+#define GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_3_M	ICE_M(0x3F, 24)
 #define GLFLXP_RXDID_FLX_WRD_0(_i)		(0x0045c800 + ((_i) * 4))
 #define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S	0
-#define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M	ICE_M(0xFF, GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_S)
+#define GLFLXP_RXDID_FLX_WRD_0_PROT_MDID_M	ICE_M(0xFF, 0)
 #define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_S	30
-#define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_M	ICE_M(0x3, GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_S)
+#define GLFLXP_RXDID_FLX_WRD_0_RXDID_OPCODE_M	ICE_M(0x3, 30)
 #define GLFLXP_RXDID_FLX_WRD_1(_i)		(0x0045c900 + ((_i) * 4))
 #define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_S	0
-#define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_M	ICE_M(0xFF, GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_S)
+#define GLFLXP_RXDID_FLX_WRD_1_PROT_MDID_M	ICE_M(0xFF, 0)
 #define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_S	30
-#define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_M	ICE_M(0x3, GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_S)
+#define GLFLXP_RXDID_FLX_WRD_1_RXDID_OPCODE_M	ICE_M(0x3, 30)
 #define GLFLXP_RXDID_FLX_WRD_2(_i)		(0x0045ca00 + ((_i) * 4))
 #define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_S	0
-#define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_M	ICE_M(0xFF, GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_S)
+#define GLFLXP_RXDID_FLX_WRD_2_PROT_MDID_M	ICE_M(0xFF, 0)
 #define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_S	30
-#define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_M	ICE_M(0x3, GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_S)
+#define GLFLXP_RXDID_FLX_WRD_2_RXDID_OPCODE_M	ICE_M(0x3, 30)
 #define GLFLXP_RXDID_FLX_WRD_3(_i)		(0x0045cb00 + ((_i) * 4))
 #define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_S	0
-#define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_M	ICE_M(0xFF, GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_S)
+#define GLFLXP_RXDID_FLX_WRD_3_PROT_MDID_M	ICE_M(0xFF, 0)
 #define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_S	30
-#define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_M	ICE_M(0x3, GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_S)
-
-#define QRXFLXP_CNTXT(_QRX)		(0x00480000 + ((_QRX) * 4))
-#define QRXFLXP_CNTXT_RXDID_IDX_S	0
-#define QRXFLXP_CNTXT_RXDID_IDX_M	ICE_M(0x3F, QRXFLXP_CNTXT_RXDID_IDX_S)
-#define QRXFLXP_CNTXT_RXDID_PRIO_S	8
-#define QRXFLXP_CNTXT_RXDID_PRIO_M	ICE_M(0x7, QRXFLXP_CNTXT_RXDID_PRIO_S)
-#define QRXFLXP_CNTXT_TS_S		11
-#define QRXFLXP_CNTXT_TS_M		BIT(QRXFLXP_CNTXT_TS_S)
-#define GLGEN_RSTAT			0x000B8188
-#define GLGEN_RSTAT_DEVSTATE_S		0
-#define GLGEN_RSTAT_DEVSTATE_M		ICE_M(0x3, GLGEN_RSTAT_DEVSTATE_S)
-#define GLGEN_RSTCTL			0x000B8180
-#define GLGEN_RSTCTL_GRSTDEL_S		0
-#define GLGEN_RSTCTL_GRSTDEL_M		ICE_M(0x3F, GLGEN_RSTCTL_GRSTDEL_S)
-#define GLGEN_RSTAT_RESET_TYPE_S	2
-#define GLGEN_RSTAT_RESET_TYPE_M	ICE_M(0x3, GLGEN_RSTAT_RESET_TYPE_S)
-#define GLGEN_RTRIG			0x000B8190
-#define GLGEN_RTRIG_CORER_S		0
-#define GLGEN_RTRIG_CORER_M		BIT(GLGEN_RTRIG_CORER_S)
-#define GLGEN_RTRIG_GLOBR_S		1
-#define GLGEN_RTRIG_GLOBR_M		BIT(GLGEN_RTRIG_GLOBR_S)
-#define GLGEN_STAT			0x000B612C
-#define PFGEN_CTRL			0x00091000
-#define PFGEN_CTRL_PFSWR_S		0
-#define PFGEN_CTRL_PFSWR_M		BIT(PFGEN_CTRL_PFSWR_S)
-#define PFGEN_STATE			0x00088000
-#define PRTGEN_STATUS			0x000B8100
-#define PFHMC_ERRORDATA			0x00520500
-#define PFHMC_ERRORINFO			0x00520400
-#define GLINT_DYN_CTL(_INT)		(0x00160000 + ((_INT) * 4))
-#define GLINT_DYN_CTL_INTENA_S		0
-#define GLINT_DYN_CTL_INTENA_M		BIT(GLINT_DYN_CTL_INTENA_S)
-#define GLINT_DYN_CTL_CLEARPBA_S	1
-#define GLINT_DYN_CTL_CLEARPBA_M	BIT(GLINT_DYN_CTL_CLEARPBA_S)
-#define GLINT_DYN_CTL_SWINT_TRIG_S	2
-#define GLINT_DYN_CTL_SWINT_TRIG_M	BIT(GLINT_DYN_CTL_SWINT_TRIG_S)
-#define GLINT_DYN_CTL_ITR_INDX_S	3
-#define GLINT_DYN_CTL_SW_ITR_INDX_S	25
-#define GLINT_DYN_CTL_SW_ITR_INDX_M	ICE_M(0x3, GLINT_DYN_CTL_SW_ITR_INDX_S)
-#define GLINT_DYN_CTL_INTENA_MSK_S	31
-#define GLINT_DYN_CTL_INTENA_MSK_M	BIT(GLINT_DYN_CTL_INTENA_MSK_S)
-#define GLINT_ITR(_i, _INT)		(0x00154000 + ((_i) * 8192 + (_INT) * 4))
-#define PFINT_FW_CTL			0x0016C800
-#define PFINT_FW_CTL_MSIX_INDX_S	0
-#define PFINT_FW_CTL_MSIX_INDX_M	ICE_M(0x7FF, PFINT_FW_CTL_MSIX_INDX_S)
-#define PFINT_FW_CTL_ITR_INDX_S		11
-#define PFINT_FW_CTL_ITR_INDX_M		ICE_M(0x3, PFINT_FW_CTL_ITR_INDX_S)
-#define PFINT_FW_CTL_CAUSE_ENA_S	30
-#define PFINT_FW_CTL_CAUSE_ENA_M	BIT(PFINT_FW_CTL_CAUSE_ENA_S)
-#define PFINT_OICR			0x0016CA00
-#define PFINT_OICR_ECC_ERR_S		16
-#define PFINT_OICR_ECC_ERR_M		BIT(PFINT_OICR_ECC_ERR_S)
-#define PFINT_OICR_MAL_DETECT_S		19
-#define PFINT_OICR_MAL_DETECT_M		BIT(PFINT_OICR_MAL_DETECT_S)
-#define PFINT_OICR_GRST_S		20
-#define PFINT_OICR_GRST_M		BIT(PFINT_OICR_GRST_S)
-#define PFINT_OICR_PCI_EXCEPTION_S	21
-#define PFINT_OICR_PCI_EXCEPTION_M	BIT(PFINT_OICR_PCI_EXCEPTION_S)
-#define PFINT_OICR_HMC_ERR_S		26
-#define PFINT_OICR_HMC_ERR_M		BIT(PFINT_OICR_HMC_ERR_S)
-#define PFINT_OICR_PE_CRITERR_S		28
-#define PFINT_OICR_PE_CRITERR_M		BIT(PFINT_OICR_PE_CRITERR_S)
-#define PFINT_OICR_CTL			0x0016CA80
-#define PFINT_OICR_CTL_MSIX_INDX_S	0
-#define PFINT_OICR_CTL_MSIX_INDX_M	ICE_M(0x7FF, PFINT_OICR_CTL_MSIX_INDX_S)
-#define PFINT_OICR_CTL_ITR_INDX_S	11
-#define PFINT_OICR_CTL_ITR_INDX_M	ICE_M(0x3, PFINT_OICR_CTL_ITR_INDX_S)
-#define PFINT_OICR_CTL_CAUSE_ENA_S	30
-#define PFINT_OICR_CTL_CAUSE_ENA_M	BIT(PFINT_OICR_CTL_CAUSE_ENA_S)
-#define PFINT_OICR_ENA			0x0016C900
-#define QINT_RQCTL(_QRX)		(0x00150000 + ((_QRX) * 4))
-#define QINT_RQCTL_MSIX_INDX_S		0
-#define QINT_RQCTL_ITR_INDX_S		11
-#define QINT_RQCTL_CAUSE_ENA_S		30
-#define QINT_RQCTL_CAUSE_ENA_M		BIT(QINT_RQCTL_CAUSE_ENA_S)
-#define QINT_TQCTL(_DBQM)		(0x00140000 + ((_DBQM) * 4))
-#define QINT_TQCTL_MSIX_INDX_S		0
-#define QINT_TQCTL_ITR_INDX_S		11
-#define QINT_TQCTL_CAUSE_ENA_S		30
-#define QINT_TQCTL_CAUSE_ENA_M		BIT(QINT_TQCTL_CAUSE_ENA_S)
-#define GLLAN_RCTL_0			0x002941F8
-#define QRX_CONTEXT(_i, _QRX)		(0x00280000 + ((_i) * 8192 + (_QRX) * 4))
-#define QRX_CTRL(_QRX)			(0x00120000 + ((_QRX) * 4))
-#define QRX_CTRL_MAX_INDEX		2047
-#define QRX_CTRL_QENA_REQ_S		0
-#define QRX_CTRL_QENA_REQ_M		BIT(QRX_CTRL_QENA_REQ_S)
-#define QRX_CTRL_QENA_STAT_S		2
-#define QRX_CTRL_QENA_STAT_M		BIT(QRX_CTRL_QENA_STAT_S)
-#define QRX_ITR(_QRX)			(0x00292000 + ((_QRX) * 4))
-#define QRX_TAIL(_QRX)			(0x00290000 + ((_QRX) * 4))
-#define GLNVM_FLA			0x000B6108
-#define GLNVM_FLA_LOCKED_S		6
-#define GLNVM_FLA_LOCKED_M		BIT(GLNVM_FLA_LOCKED_S)
-#define GLNVM_GENS			0x000B6100
-#define GLNVM_GENS_SR_SIZE_S		5
-#define GLNVM_GENS_SR_SIZE_M		ICE_M(0x7, GLNVM_GENS_SR_SIZE_S)
-#define GLNVM_ULD			0x000B6008
-#define GLNVM_ULD_CORER_DONE_S		3
-#define GLNVM_ULD_CORER_DONE_M		BIT(GLNVM_ULD_CORER_DONE_S)
-#define GLNVM_ULD_GLOBR_DONE_S		4
-#define GLNVM_ULD_GLOBR_DONE_M		BIT(GLNVM_ULD_GLOBR_DONE_S)
-#define PF_FUNC_RID			0x0009E880
-#define PF_FUNC_RID_FUNC_NUM_S		0
-#define PF_FUNC_RID_FUNC_NUM_M		ICE_M(0x7, PF_FUNC_RID_FUNC_NUM_S)
-#define GLPRT_BPRCH(_i)			(0x00381384 + ((_i) * 8))
-#define GLPRT_BPRCL(_i)			(0x00381380 + ((_i) * 8))
-#define GLPRT_BPTCH(_i)			(0x00381244 + ((_i) * 8))
-#define GLPRT_BPTCL(_i)			(0x00381240 + ((_i) * 8))
-#define GLPRT_CRCERRS(_i)		(0x00380100 + ((_i) * 8))
-#define GLPRT_GORCH(_i)			(0x00380004 + ((_i) * 8))
-#define GLPRT_GORCL(_i)			(0x00380000 + ((_i) * 8))
-#define GLPRT_GOTCH(_i)			(0x00380B44 + ((_i) * 8))
-#define GLPRT_GOTCL(_i)			(0x00380B40 + ((_i) * 8))
-#define GLPRT_ILLERRC(_i)		(0x003801C0 + ((_i) * 8))
-#define GLPRT_LXOFFRXC(_i)		(0x003802C0 + ((_i) * 8))
-#define GLPRT_LXOFFTXC(_i)		(0x00381180 + ((_i) * 8))
-#define GLPRT_LXONRXC(_i)		(0x00380280 + ((_i) * 8))
-#define GLPRT_LXONTXC(_i)		(0x00381140 + ((_i) * 8))
-#define GLPRT_MLFC(_i)			(0x00380040 + ((_i) * 8))
-#define GLPRT_MPRCH(_i)			(0x00381344 + ((_i) * 8))
-#define GLPRT_MPRCL(_i)			(0x00381340 + ((_i) * 8))
-#define GLPRT_MPTCH(_i)			(0x00381204 + ((_i) * 8))
-#define GLPRT_MPTCL(_i)			(0x00381200 + ((_i) * 8))
-#define GLPRT_MRFC(_i)			(0x00380080 + ((_i) * 8))
-#define GLPRT_PRC1023H(_i)		(0x00380A04 + ((_i) * 8))
-#define GLPRT_PRC1023L(_i)		(0x00380A00 + ((_i) * 8))
-#define GLPRT_PRC127H(_i)		(0x00380944 + ((_i) * 8))
-#define GLPRT_PRC127L(_i)		(0x00380940 + ((_i) * 8))
-#define GLPRT_PRC1522H(_i)		(0x00380A44 + ((_i) * 8))
-#define GLPRT_PRC1522L(_i)		(0x00380A40 + ((_i) * 8))
-#define GLPRT_PRC255H(_i)		(0x00380984 + ((_i) * 8))
-#define GLPRT_PRC255L(_i)		(0x00380980 + ((_i) * 8))
-#define GLPRT_PRC511H(_i)		(0x003809C4 + ((_i) * 8))
-#define GLPRT_PRC511L(_i)		(0x003809C0 + ((_i) * 8))
-#define GLPRT_PRC64H(_i)		(0x00380904 + ((_i) * 8))
-#define GLPRT_PRC64L(_i)		(0x00380900 + ((_i) * 8))
-#define GLPRT_PRC9522H(_i)		(0x00380A84 + ((_i) * 8))
-#define GLPRT_PRC9522L(_i)		(0x00380A80 + ((_i) * 8))
-#define GLPRT_PTC1023H(_i)		(0x00380C84 + ((_i) * 8))
-#define GLPRT_PTC1023L(_i)		(0x00380C80 + ((_i) * 8))
-#define GLPRT_PTC127H(_i)		(0x00380BC4 + ((_i) * 8))
-#define GLPRT_PTC127L(_i)		(0x00380BC0 + ((_i) * 8))
-#define GLPRT_PTC1522H(_i)		(0x00380CC4 + ((_i) * 8))
-#define GLPRT_PTC1522L(_i)		(0x00380CC0 + ((_i) * 8))
-#define GLPRT_PTC255H(_i)		(0x00380C04 + ((_i) * 8))
-#define GLPRT_PTC255L(_i)		(0x00380C00 + ((_i) * 8))
-#define GLPRT_PTC511H(_i)		(0x00380C44 + ((_i) * 8))
-#define GLPRT_PTC511L(_i)		(0x00380C40 + ((_i) * 8))
-#define GLPRT_PTC64H(_i)		(0x00380B84 + ((_i) * 8))
-#define GLPRT_PTC64L(_i)		(0x00380B80 + ((_i) * 8))
-#define GLPRT_PTC9522H(_i)		(0x00380D04 + ((_i) * 8))
-#define GLPRT_PTC9522L(_i)		(0x00380D00 + ((_i) * 8))
-#define GLPRT_RFC(_i)			(0x00380AC0 + ((_i) * 8))
-#define GLPRT_RJC(_i)			(0x00380B00 + ((_i) * 8))
-#define GLPRT_RLEC(_i)			(0x00380140 + ((_i) * 8))
-#define GLPRT_ROC(_i)			(0x00380240 + ((_i) * 8))
-#define GLPRT_RUC(_i)			(0x00380200 + ((_i) * 8))
-#define GLPRT_TDOLD(_i)			(0x00381280 + ((_i) * 8))
-#define GLPRT_UPRCH(_i)			(0x00381304 + ((_i) * 8))
-#define GLPRT_UPRCL(_i)			(0x00381300 + ((_i) * 8))
-#define GLPRT_UPTCH(_i)			(0x003811C4 + ((_i) * 8))
-#define GLPRT_UPTCL(_i)			(0x003811C0 + ((_i) * 8))
-#define GLV_BPRCH(_i)			(0x003B6004 + ((_i) * 8))
-#define GLV_BPRCL(_i)			(0x003B6000 + ((_i) * 8))
-#define GLV_BPTCH(_i)			(0x0030E004 + ((_i) * 8))
-#define GLV_BPTCL(_i)			(0x0030E000 + ((_i) * 8))
-#define GLV_GORCH(_i)			(0x003B0004 + ((_i) * 8))
-#define GLV_GORCL(_i)			(0x003B0000 + ((_i) * 8))
-#define GLV_GOTCH(_i)			(0x00300004 + ((_i) * 8))
-#define GLV_GOTCL(_i)			(0x00300000 + ((_i) * 8))
-#define GLV_MPRCH(_i)			(0x003B4004 + ((_i) * 8))
-#define GLV_MPRCL(_i)			(0x003B4000 + ((_i) * 8))
-#define GLV_MPTCH(_i)			(0x0030C004 + ((_i) * 8))
-#define GLV_MPTCL(_i)			(0x0030C000 + ((_i) * 8))
-#define GLV_RDPC(_i)			(0x00294C04 + ((_i) * 4))
-#define GLV_TEPC(_VSI)			(0x00312000 + ((_VSI) * 4))
-#define GLV_UPRCH(_i)			(0x003B2004 + ((_i) * 8))
-#define GLV_UPRCL(_i)			(0x003B2000 + ((_i) * 8))
-#define GLV_UPTCH(_i)			(0x0030A004 + ((_i) * 8))
-#define GLV_UPTCL(_i)			(0x0030A000 + ((_i) * 8))
-#define VSIQF_HKEY_MAX_INDEX		12
+#define GLFLXP_RXDID_FLX_WRD_3_RXDID_OPCODE_M	ICE_M(0x3, 30)
+#define QRXFLXP_CNTXT(_QRX)			(0x00480000 + ((_QRX) * 4))
+#define QRXFLXP_CNTXT_RXDID_IDX_S		0
+#define QRXFLXP_CNTXT_RXDID_IDX_M		ICE_M(0x3F, 0)
+#define QRXFLXP_CNTXT_RXDID_PRIO_S		8
+#define QRXFLXP_CNTXT_RXDID_PRIO_M		ICE_M(0x7, 8)
+#define GLGEN_RSTAT				0x000B8188
+#define GLGEN_RSTAT_DEVSTATE_M			ICE_M(0x3, 0)
+#define GLGEN_RSTCTL				0x000B8180
+#define GLGEN_RSTCTL_GRSTDEL_S			0
+#define GLGEN_RSTCTL_GRSTDEL_M			ICE_M(0x3F, GLGEN_RSTCTL_GRSTDEL_S)
+#define GLGEN_RSTAT_RESET_TYPE_S		2
+#define GLGEN_RSTAT_RESET_TYPE_M		ICE_M(0x3, 2)
+#define GLGEN_RTRIG				0x000B8190
+#define GLGEN_RTRIG_CORER_M			BIT(0)
+#define GLGEN_RTRIG_GLOBR_M			BIT(1)
+#define GLGEN_STAT				0x000B612C
+#define PFGEN_CTRL				0x00091000
+#define PFGEN_CTRL_PFSWR_M			BIT(0)
+#define PFGEN_STATE				0x00088000
+#define PRTGEN_STATUS				0x000B8100
+#define PFHMC_ERRORDATA				0x00520500
+#define PFHMC_ERRORINFO				0x00520400
+#define GLINT_DYN_CTL(_INT)			(0x00160000 + ((_INT) * 4))
+#define GLINT_DYN_CTL_INTENA_M			BIT(0)
+#define GLINT_DYN_CTL_CLEARPBA_M		BIT(1)
+#define GLINT_DYN_CTL_SWINT_TRIG_M		BIT(2)
+#define GLINT_DYN_CTL_ITR_INDX_S		3
+#define GLINT_DYN_CTL_SW_ITR_INDX_M		ICE_M(0x3, 25)
+#define GLINT_DYN_CTL_INTENA_MSK_M		BIT(31)
+#define GLINT_ITR(_i, _INT)			(0x00154000 + ((_i) * 8192 + (_INT) * 4))
+#define PFINT_FW_CTL				0x0016C800
+#define PFINT_FW_CTL_MSIX_INDX_M		ICE_M(0x7FF, 0)
+#define PFINT_FW_CTL_ITR_INDX_S			11
+#define PFINT_FW_CTL_ITR_INDX_M			ICE_M(0x3, 11)
+#define PFINT_FW_CTL_CAUSE_ENA_M		BIT(30)
+#define PFINT_OICR				0x0016CA00
+#define PFINT_OICR_ECC_ERR_M			BIT(16)
+#define PFINT_OICR_MAL_DETECT_M			BIT(19)
+#define PFINT_OICR_GRST_M			BIT(20)
+#define PFINT_OICR_PCI_EXCEPTION_M		BIT(21)
+#define PFINT_OICR_HMC_ERR_M			BIT(26)
+#define PFINT_OICR_PE_CRITERR_M			BIT(28)
+#define PFINT_OICR_CTL				0x0016CA80
+#define PFINT_OICR_CTL_MSIX_INDX_M		ICE_M(0x7FF, 0)
+#define PFINT_OICR_CTL_ITR_INDX_S		11
+#define PFINT_OICR_CTL_ITR_INDX_M		ICE_M(0x3, 11)
+#define PFINT_OICR_CTL_CAUSE_ENA_M		BIT(30)
+#define PFINT_OICR_ENA				0x0016C900
+#define QINT_RQCTL(_QRX)			(0x00150000 + ((_QRX) * 4))
+#define QINT_RQCTL_MSIX_INDX_S			0
+#define QINT_RQCTL_ITR_INDX_S			11
+#define QINT_RQCTL_CAUSE_ENA_M			BIT(30)
+#define QINT_TQCTL(_DBQM)			(0x00140000 + ((_DBQM) * 4))
+#define QINT_TQCTL_MSIX_INDX_S			0
+#define QINT_TQCTL_ITR_INDX_S			11
+#define QINT_TQCTL_CAUSE_ENA_M			BIT(30)
+#define QRX_CONTEXT(_i, _QRX)			(0x00280000 + ((_i) * 8192 + (_QRX) * 4))
+#define QRX_CTRL(_QRX)				(0x00120000 + ((_QRX) * 4))
+#define QRX_CTRL_MAX_INDEX			2047
+#define QRX_CTRL_QENA_REQ_S			0
+#define QRX_CTRL_QENA_REQ_M			BIT(0)
+#define QRX_CTRL_QENA_STAT_S			2
+#define QRX_CTRL_QENA_STAT_M			BIT(2)
+#define QRX_ITR(_QRX)				(0x00292000 + ((_QRX) * 4))
+#define QRX_TAIL(_QRX)				(0x00290000 + ((_QRX) * 4))
+#define GLNVM_FLA				0x000B6108
+#define GLNVM_FLA_LOCKED_M			BIT(6)
+#define GLNVM_GENS				0x000B6100
+#define GLNVM_GENS_SR_SIZE_S			5
+#define GLNVM_GENS_SR_SIZE_M			ICE_M(0x7, 5)
+#define GLNVM_ULD				0x000B6008
+#define GLNVM_ULD_CORER_DONE_M			BIT(3)
+#define GLNVM_ULD_GLOBR_DONE_M			BIT(4)
+#define PF_FUNC_RID				0x0009E880
+#define PF_FUNC_RID_FUNC_NUM_S			0
+#define PF_FUNC_RID_FUNC_NUM_M			ICE_M(0x7, 0)
+#define GLPRT_BPRCH(_i)				(0x00381384 + ((_i) * 8))
+#define GLPRT_BPRCL(_i)				(0x00381380 + ((_i) * 8))
+#define GLPRT_BPTCH(_i)				(0x00381244 + ((_i) * 8))
+#define GLPRT_BPTCL(_i)				(0x00381240 + ((_i) * 8))
+#define GLPRT_CRCERRS(_i)			(0x00380100 + ((_i) * 8))
+#define GLPRT_GORCH(_i)				(0x00380004 + ((_i) * 8))
+#define GLPRT_GORCL(_i)				(0x00380000 + ((_i) * 8))
+#define GLPRT_GOTCH(_i)				(0x00380B44 + ((_i) * 8))
+#define GLPRT_GOTCL(_i)				(0x00380B40 + ((_i) * 8))
+#define GLPRT_ILLERRC(_i)			(0x003801C0 + ((_i) * 8))
+#define GLPRT_LXOFFRXC(_i)			(0x003802C0 + ((_i) * 8))
+#define GLPRT_LXOFFTXC(_i)			(0x00381180 + ((_i) * 8))
+#define GLPRT_LXONRXC(_i)			(0x00380280 + ((_i) * 8))
+#define GLPRT_LXONTXC(_i)			(0x00381140 + ((_i) * 8))
+#define GLPRT_MLFC(_i)				(0x00380040 + ((_i) * 8))
+#define GLPRT_MPRCH(_i)				(0x00381344 + ((_i) * 8))
+#define GLPRT_MPRCL(_i)				(0x00381340 + ((_i) * 8))
+#define GLPRT_MPTCH(_i)				(0x00381204 + ((_i) * 8))
+#define GLPRT_MPTCL(_i)				(0x00381200 + ((_i) * 8))
+#define GLPRT_MRFC(_i)				(0x00380080 + ((_i) * 8))
+#define GLPRT_PRC1023H(_i)			(0x00380A04 + ((_i) * 8))
+#define GLPRT_PRC1023L(_i)			(0x00380A00 + ((_i) * 8))
+#define GLPRT_PRC127H(_i)			(0x00380944 + ((_i) * 8))
+#define GLPRT_PRC127L(_i)			(0x00380940 + ((_i) * 8))
+#define GLPRT_PRC1522H(_i)			(0x00380A44 + ((_i) * 8))
+#define GLPRT_PRC1522L(_i)			(0x00380A40 + ((_i) * 8))
+#define GLPRT_PRC255H(_i)			(0x00380984 + ((_i) * 8))
+#define GLPRT_PRC255L(_i)			(0x00380980 + ((_i) * 8))
+#define GLPRT_PRC511H(_i)			(0x003809C4 + ((_i) * 8))
+#define GLPRT_PRC511L(_i)			(0x003809C0 + ((_i) * 8))
+#define GLPRT_PRC64H(_i)			(0x00380904 + ((_i) * 8))
+#define GLPRT_PRC64L(_i)			(0x00380900 + ((_i) * 8))
+#define GLPRT_PRC9522H(_i)			(0x00380A84 + ((_i) * 8))
+#define GLPRT_PRC9522L(_i)			(0x00380A80 + ((_i) * 8))
+#define GLPRT_PTC1023H(_i)			(0x00380C84 + ((_i) * 8))
+#define GLPRT_PTC1023L(_i)			(0x00380C80 + ((_i) * 8))
+#define GLPRT_PTC127H(_i)			(0x00380BC4 + ((_i) * 8))
+#define GLPRT_PTC127L(_i)			(0x00380BC0 + ((_i) * 8))
+#define GLPRT_PTC1522H(_i)			(0x00380CC4 + ((_i) * 8))
+#define GLPRT_PTC1522L(_i)			(0x00380CC0 + ((_i) * 8))
+#define GLPRT_PTC255H(_i)			(0x00380C04 + ((_i) * 8))
+#define GLPRT_PTC255L(_i)			(0x00380C00 + ((_i) * 8))
+#define GLPRT_PTC511H(_i)			(0x00380C44 + ((_i) * 8))
+#define GLPRT_PTC511L(_i)			(0x00380C40 + ((_i) * 8))
+#define GLPRT_PTC64H(_i)			(0x00380B84 + ((_i) * 8))
+#define GLPRT_PTC64L(_i)			(0x00380B80 + ((_i) * 8))
+#define GLPRT_PTC9522H(_i)			(0x00380D04 + ((_i) * 8))
+#define GLPRT_PTC9522L(_i)			(0x00380D00 + ((_i) * 8))
+#define GLPRT_RFC(_i)				(0x00380AC0 + ((_i) * 8))
+#define GLPRT_RJC(_i)				(0x00380B00 + ((_i) * 8))
+#define GLPRT_RLEC(_i)				(0x00380140 + ((_i) * 8))
+#define GLPRT_ROC(_i)				(0x00380240 + ((_i) * 8))
+#define GLPRT_RUC(_i)				(0x00380200 + ((_i) * 8))
+#define GLPRT_TDOLD(_i)				(0x00381280 + ((_i) * 8))
+#define GLPRT_UPRCH(_i)				(0x00381304 + ((_i) * 8))
+#define GLPRT_UPRCL(_i)				(0x00381300 + ((_i) * 8))
+#define GLPRT_UPTCH(_i)				(0x003811C4 + ((_i) * 8))
+#define GLPRT_UPTCL(_i)				(0x003811C0 + ((_i) * 8))
+#define GLV_BPRCH(_i)				(0x003B6004 + ((_i) * 8))
+#define GLV_BPRCL(_i)				(0x003B6000 + ((_i) * 8))
+#define GLV_BPTCH(_i)				(0x0030E004 + ((_i) * 8))
+#define GLV_BPTCL(_i)				(0x0030E000 + ((_i) * 8))
+#define GLV_GORCH(_i)				(0x003B0004 + ((_i) * 8))
+#define GLV_GORCL(_i)				(0x003B0000 + ((_i) * 8))
+#define GLV_GOTCH(_i)				(0x00300004 + ((_i) * 8))
+#define GLV_GOTCL(_i)				(0x00300000 + ((_i) * 8))
+#define GLV_MPRCH(_i)				(0x003B4004 + ((_i) * 8))
+#define GLV_MPRCL(_i)				(0x003B4000 + ((_i) * 8))
+#define GLV_MPTCH(_i)				(0x0030C004 + ((_i) * 8))
+#define GLV_MPTCL(_i)				(0x0030C000 + ((_i) * 8))
+#define GLV_RDPC(_i)				(0x00294C04 + ((_i) * 4))
+#define GLV_TEPC(_VSI)				(0x00312000 + ((_VSI) * 4))
+#define GLV_UPRCH(_i)				(0x003B2004 + ((_i) * 8))
+#define GLV_UPRCL(_i)				(0x003B2000 + ((_i) * 8))
+#define GLV_UPTCH(_i)				(0x0030A004 + ((_i) * 8))
+#define GLV_UPTCL(_i)				(0x0030A000 + ((_i) * 8))
+#define VSIQF_HKEY_MAX_INDEX			12
 
 #endif /* _ICE_HW_AUTOGEN_H_ */
-- 
2.17.1

^ permalink raw reply related

* [net-next 05/15] ice: Code optimization for ice_fill_sw_rule()
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Zhenning Xiao, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Zhenning Xiao <zhenning.xiao@intel.com>

Use the buffer in the s_rule structure directly instead of using
a local array eth_hdr[DUMMY_ETH_HDR_LEN]

Signed-off-by: Zhenning Xiao <zhenning.xiao@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_switch.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 6b7ec2ae5ad6..d8b18cabc3a8 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -464,8 +464,9 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
 		 struct ice_aqc_sw_rules_elem *s_rule, enum ice_adminq_opc opc)
 {
 	u16 vlan_id = ICE_MAX_VLAN_ID + 1;
-	u8 eth_hdr[DUMMY_ETH_HDR_LEN];
 	void *daddr = NULL;
+	u16 eth_hdr_sz;
+	u8 *eth_hdr;
 	u32 act = 0;
 	__be16 *off;
 
@@ -477,8 +478,11 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
 		return;
 	}
 
+	eth_hdr_sz = sizeof(dummy_eth_header);
+	eth_hdr = s_rule->pdata.lkup_tx_rx.hdr;
+
 	/* initialize the ether header with a dummy header */
-	memcpy(eth_hdr, dummy_eth_header, sizeof(dummy_eth_header));
+	memcpy(eth_hdr, dummy_eth_header, eth_hdr_sz);
 	ice_fill_sw_info(hw, f_info);
 
 	switch (f_info->fltr_act) {
@@ -536,7 +540,7 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
 		daddr = f_info->l_data.ethertype_mac.mac_addr;
 		/* fall-through */
 	case ICE_SW_LKUP_ETHERTYPE:
-		off = (__be16 *)&eth_hdr[ICE_ETH_ETHTYPE_OFFSET];
+		off = (__be16 *)(eth_hdr + ICE_ETH_ETHTYPE_OFFSET);
 		*off = cpu_to_be16(f_info->l_data.ethertype_mac.ethertype);
 		break;
 	case ICE_SW_LKUP_MAC_VLAN:
@@ -563,18 +567,16 @@ ice_fill_sw_rule(struct ice_hw *hw, struct ice_fltr_info *f_info,
 	s_rule->pdata.lkup_tx_rx.act = cpu_to_le32(act);
 
 	if (daddr)
-		ether_addr_copy(&eth_hdr[ICE_ETH_DA_OFFSET], daddr);
+		ether_addr_copy(eth_hdr + ICE_ETH_DA_OFFSET, daddr);
 
 	if (!(vlan_id > ICE_MAX_VLAN_ID)) {
-		off = (__be16 *)&eth_hdr[ICE_ETH_VLAN_TCI_OFFSET];
+		off = (__be16 *)(eth_hdr + ICE_ETH_VLAN_TCI_OFFSET);
 		*off = cpu_to_be16(vlan_id);
 	}
 
 	/* Create the switch rule with the final dummy Ethernet header */
 	if (opc != ice_aqc_opc_update_sw_rules)
-		s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(sizeof(eth_hdr));
-
-	memcpy(s_rule->pdata.lkup_tx_rx.hdr, eth_hdr, sizeof(eth_hdr));
+		s_rule->pdata.lkup_tx_rx.hdr_len = cpu_to_le16(eth_hdr_sz);
 }
 
 /**
-- 
2.17.1

^ permalink raw reply related

* [net-next 04/15] ice: Prevent control queue operations during reset
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

Once reset is issued, the driver loses all control queue interfaces.
Exercising control queue operations during reset is incorrect and
may result in long timeouts.

This patch introduces a new field 'reset_ongoing' in the hw structure.
This is set to 1 by the core driver when it receives a reset interrupt.
ice_sq_send_cmd checks reset_ongoing before actually issuing the control
queue operation. If a reset is in progress, it returns a soft error code
(ICE_ERR_RESET_PENDING) to the caller. The caller may or may not have to
take any action based on this return. Once the driver knows that the
reset is done, it has to set reset_ongoing back to 0. This will allow
control queue operations to be posted to the hardware again.

This "bail out" logic was specifically added to ice_sq_send_cmd (which
is pretty low level function) so that we have one solution in one place
that applies to all types of control queues.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_controlq.c |  3 ++
 drivers/net/ethernet/intel/ice/ice_main.c     | 34 ++++++++++++++++---
 drivers/net/ethernet/intel/ice/ice_status.h   |  1 +
 drivers/net/ethernet/intel/ice/ice_type.h     |  1 +
 4 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c
index 62be72fdc8f3..1fe026a65d75 100644
--- a/drivers/net/ethernet/intel/ice/ice_controlq.c
+++ b/drivers/net/ethernet/intel/ice/ice_controlq.c
@@ -806,6 +806,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 	u16 retval = 0;
 	u32 val = 0;
 
+	/* if reset is in progress return a soft error */
+	if (hw->reset_ongoing)
+		return ICE_ERR_RESET_ONGOING;
 	mutex_lock(&cq->sq_lock);
 
 	cq->sq_last_status = ICE_AQ_RC_OK;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f1e80eed2fd6..014a2f3ea76c 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -535,10 +535,13 @@ static void ice_reset_subtask(struct ice_pf *pf)
 		ice_prepare_for_reset(pf);
 
 		/* make sure we are ready to rebuild */
-		if (ice_check_reset(&pf->hw))
+		if (ice_check_reset(&pf->hw)) {
 			set_bit(__ICE_RESET_FAILED, pf->state);
-		else
+		} else {
+			/* done with reset. start rebuild */
+			pf->hw.reset_ongoing = false;
 			ice_rebuild(pf);
+		}
 		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
 		goto unlock;
 	}
@@ -1754,7 +1757,8 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 		 * We also make note of which reset happened so that peer
 		 * devices/drivers can be informed.
 		 */
-		if (!test_bit(__ICE_RESET_RECOVERY_PENDING, pf->state)) {
+		if (!test_and_set_bit(__ICE_RESET_RECOVERY_PENDING,
+				      pf->state)) {
 			if (reset == ICE_RESET_CORER)
 				set_bit(__ICE_CORER_RECV, pf->state);
 			else if (reset == ICE_RESET_GLOBR)
@@ -1762,7 +1766,20 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 			else
 				set_bit(__ICE_EMPR_RECV, pf->state);
 
-			set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+			/* There are couple of different bits at play here.
+			 * hw->reset_ongoing indicates whether the hardware is
+			 * in reset. This is set to true when a reset interrupt
+			 * is received and set back to false after the driver
+			 * has determined that the hardware is out of reset.
+			 *
+			 * __ICE_RESET_RECOVERY_PENDING in pf->state indicates
+			 * that a post reset rebuild is required before the
+			 * driver is operational again. This is set above.
+			 *
+			 * As this is the start of the reset/rebuild cycle, set
+			 * both to indicate that.
+			 */
+			hw->reset_ongoing = true;
 		}
 	}
 
@@ -4185,7 +4202,14 @@ static int ice_vsi_stop_tx_rings(struct ice_vsi *vsi)
 	}
 	status = ice_dis_vsi_txq(vsi->port_info, vsi->num_txq, q_ids, q_teids,
 				 NULL);
-	if (status) {
+	/* if the disable queue command was exercised during an active reset
+	 * flow, ICE_ERR_RESET_ONGOING is returned. This is not an error as
+	 * the reset operation disables queues at the hardware level anyway.
+	 */
+	if (status == ICE_ERR_RESET_ONGOING) {
+		dev_dbg(&pf->pdev->dev,
+			"Reset in progress. LAN Tx queues already disabled\n");
+	} else if (status) {
 		dev_err(&pf->pdev->dev,
 			"Failed to disable LAN Tx queues, error: %d\n",
 			status);
diff --git a/drivers/net/ethernet/intel/ice/ice_status.h b/drivers/net/ethernet/intel/ice/ice_status.h
index 9a95c4ffd7d7..d2dae913d81e 100644
--- a/drivers/net/ethernet/intel/ice/ice_status.h
+++ b/drivers/net/ethernet/intel/ice/ice_status.h
@@ -20,6 +20,7 @@ enum ice_status {
 	ICE_ERR_ALREADY_EXISTS			= -14,
 	ICE_ERR_DOES_NOT_EXIST			= -15,
 	ICE_ERR_MAX_LIMIT			= -17,
+	ICE_ERR_RESET_ONGOING			= -18,
 	ICE_ERR_BUF_TOO_SHORT			= -52,
 	ICE_ERR_NVM_BLANK_MODE			= -53,
 	ICE_ERR_AQ_ERROR			= -100,
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index e1a0d0e5d074..adfe131bd3f3 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -295,6 +295,7 @@ struct ice_hw {
 	u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM];
 
 	u8 evb_veb;		/* true for VEB, false for VEPA */
+	u8 reset_ongoing;	/* true if hw is in reset, false otherwise */
 	struct ice_bus_info bus;
 	struct ice_nvm_info nvm;
 	struct ice_hw_dev_caps dev_caps;	/* device capabilities */
-- 
2.17.1

^ permalink raw reply related

* [net-next 08/15] ice: Implement handlers for ethtool PHY/link operations
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Chinh Cao, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Chinh Cao <chinh.t.cao@intel.com>

This patch implements handlers for ethtool get_link_ksettings and
set_link_ksettings. Helper functions use by these handlers are also
introduced in this patch.

Signed-off-by: Chinh Cao <chinh.t.cao@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   8 +-
 drivers/net/ethernet/intel/ice/ice_common.c   | 121 ++-
 drivers/net/ethernet/intel/ice/ice_common.h   |  14 +-
 drivers/net/ethernet/intel/ice/ice_ethtool.c  | 801 +++++++++++++++++-
 4 files changed, 891 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 55e8275ce2ee..3dadb2b01b5c 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -920,9 +920,11 @@ struct ice_aqc_set_phy_cfg_data {
 	u8 caps;
 #define ICE_AQ_PHY_ENA_TX_PAUSE_ABILITY		BIT(0)
 #define ICE_AQ_PHY_ENA_RX_PAUSE_ABILITY		BIT(1)
-#define ICE_AQ_PHY_ENA_LOW_POWER		BIT(2)
-#define ICE_AQ_PHY_ENA_LINK			BIT(3)
-#define ICE_AQ_PHY_ENA_ATOMIC_LINK		BIT(5)
+#define ICE_AQ_PHY_ENA_LOW_POWER	BIT(2)
+#define ICE_AQ_PHY_ENA_LINK		BIT(3)
+#define ICE_AQ_PHY_ENA_AUTO_LINK_UPDT	BIT(5)
+#define ICE_AQ_PHY_ENA_LESM		BIT(6)
+#define ICE_AQ_PHY_ENA_AUTO_FEC		BIT(7)
 	u8 low_power_ctrl;
 	__le16 eee_cap; /* Value from ice_aqc_get_phy_caps */
 	__le16 eeer_value;
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index b2bb42def038..52c2bf4f108e 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -125,7 +125,7 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size,
  *
  * Returns the various PHY capabilities supported on the Port (0x0600)
  */
-static enum ice_status
+enum ice_status
 ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode,
 		    struct ice_aqc_get_phy_caps_data *pcaps,
 		    struct ice_sq_cd *cd)
@@ -1408,6 +1408,110 @@ void ice_clear_pxe_mode(struct ice_hw *hw)
 		ice_aq_clear_pxe_mode(hw);
 }
 
+/**
+ * ice_get_link_speed_based_on_phy_type - returns link speed
+ * @phy_type_low: lower part of phy_type
+ *
+ * This helper function will convert a phy_type_low to its corresponding link
+ * speed.
+ * Note: In the structure of phy_type_low, there should be one bit set, as
+ * this function will convert one phy type to its speed.
+ * If no bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned
+ * If more than one bit gets set, ICE_LINK_SPEED_UNKNOWN will be returned
+ */
+static u16
+ice_get_link_speed_based_on_phy_type(u64 phy_type_low)
+{
+	u16 speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN;
+
+	switch (phy_type_low) {
+	case ICE_PHY_TYPE_LOW_100BASE_TX:
+	case ICE_PHY_TYPE_LOW_100M_SGMII:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_100MB;
+		break;
+	case ICE_PHY_TYPE_LOW_1000BASE_T:
+	case ICE_PHY_TYPE_LOW_1000BASE_SX:
+	case ICE_PHY_TYPE_LOW_1000BASE_LX:
+	case ICE_PHY_TYPE_LOW_1000BASE_KX:
+	case ICE_PHY_TYPE_LOW_1G_SGMII:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_1000MB;
+		break;
+	case ICE_PHY_TYPE_LOW_2500BASE_T:
+	case ICE_PHY_TYPE_LOW_2500BASE_X:
+	case ICE_PHY_TYPE_LOW_2500BASE_KX:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_2500MB;
+		break;
+	case ICE_PHY_TYPE_LOW_5GBASE_T:
+	case ICE_PHY_TYPE_LOW_5GBASE_KR:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_5GB;
+		break;
+	case ICE_PHY_TYPE_LOW_10GBASE_T:
+	case ICE_PHY_TYPE_LOW_10G_SFI_DA:
+	case ICE_PHY_TYPE_LOW_10GBASE_SR:
+	case ICE_PHY_TYPE_LOW_10GBASE_LR:
+	case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1:
+	case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_10G_SFI_C2C:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_10GB;
+		break;
+	case ICE_PHY_TYPE_LOW_25GBASE_T:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR_S:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR1:
+	case ICE_PHY_TYPE_LOW_25GBASE_SR:
+	case ICE_PHY_TYPE_LOW_25GBASE_LR:
+	case ICE_PHY_TYPE_LOW_25GBASE_KR:
+	case ICE_PHY_TYPE_LOW_25GBASE_KR_S:
+	case ICE_PHY_TYPE_LOW_25GBASE_KR1:
+	case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_25G_AUI_C2C:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_25GB;
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_CR4:
+	case ICE_PHY_TYPE_LOW_40GBASE_SR4:
+	case ICE_PHY_TYPE_LOW_40GBASE_LR4:
+	case ICE_PHY_TYPE_LOW_40GBASE_KR4:
+	case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_40G_XLAUI:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_40GB;
+		break;
+	default:
+		speed_phy_type_low = ICE_AQ_LINK_SPEED_UNKNOWN;
+		break;
+	}
+
+	return speed_phy_type_low;
+}
+
+/**
+ * ice_update_phy_type
+ * @phy_type_low: pointer to the lower part of phy_type
+ * @link_speeds_bitmap: targeted link speeds bitmap
+ *
+ * Note: For the link_speeds_bitmap structure, you can check it at
+ * [ice_aqc_get_link_status->link_speed]. Caller can pass in
+ * link_speeds_bitmap include multiple speeds.
+ *
+ * The value of phy_type_low will present a certain link speed. This helper
+ * function will turn on bits in the phy_type_low based on the value of
+ * link_speeds_bitmap input parameter.
+ */
+void ice_update_phy_type(u64 *phy_type_low, u16 link_speeds_bitmap)
+{
+	u16 speed = ICE_AQ_LINK_SPEED_UNKNOWN;
+	u64 pt_low;
+	int index;
+
+	/* We first check with low part of phy_type */
+	for (index = 0; index <= ICE_PHY_TYPE_LOW_MAX_INDEX; index++) {
+		pt_low = BIT_ULL(index);
+		speed = ice_get_link_speed_based_on_phy_type(pt_low);
+
+		if (link_speeds_bitmap & speed)
+			*phy_type_low |= BIT_ULL(index);
+	}
+}
+
 /**
  * ice_aq_set_phy_cfg
  * @hw: pointer to the hw struct
@@ -1420,19 +1524,18 @@ void ice_clear_pxe_mode(struct ice_hw *hw)
  * mode as the PF may not have the privilege to set some of the PHY Config
  * parameters. This status will be indicated by the command response (0x0601).
  */
-static enum ice_status
+enum ice_status
 ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport,
 		   struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd)
 {
-	struct ice_aqc_set_phy_cfg *cmd;
 	struct ice_aq_desc desc;
 
 	if (!cfg)
 		return ICE_ERR_PARAM;
 
-	cmd = &desc.params.set_phy;
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg);
-	cmd->lport_num = lport;
+	desc.params.set_phy.lport_num = lport;
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
 
 	return ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd);
 }
@@ -1481,12 +1584,12 @@ ice_update_link_info(struct ice_port_info *pi)
  * ice_set_fc
  * @pi: port information structure
  * @aq_failures: pointer to status code, specific to ice_set_fc routine
- * @atomic_restart: enable automatic link update
+ * @ena_auto_link_update: enable automatic link update
  *
  * Set the requested flow control mode.
  */
 enum ice_status
-ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool atomic_restart)
+ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool ena_auto_link_update)
 {
 	struct ice_aqc_set_phy_cfg_data cfg = { 0 };
 	struct ice_aqc_get_phy_caps_data *pcaps;
@@ -1536,8 +1639,8 @@ ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool atomic_restart)
 		int retry_count, retry_max = 10;
 
 		/* Auto restart link so settings take effect */
-		if (atomic_restart)
-			cfg.caps |= ICE_AQ_PHY_ENA_ATOMIC_LINK;
+		if (ena_auto_link_update)
+			cfg.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
 		/* Copy over all the old settings */
 		cfg.phy_type_low = pcaps->phy_type_low;
 		cfg.low_power_ctrl = pcaps->low_power_ctrl;
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 6455b6952ec8..409b9957f407 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -58,12 +58,24 @@ enum ice_status
 ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc,
 		void *buf, u16 buf_size, struct ice_sq_cd *cd);
 enum ice_status ice_aq_get_fw_ver(struct ice_hw *hw, struct ice_sq_cd *cd);
+
+enum ice_status
+ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode,
+		    struct ice_aqc_get_phy_caps_data *caps,
+		    struct ice_sq_cd *cd);
+void
+ice_update_phy_type(u64 *phy_type_low, u16 link_speeds_bitmap);
 enum ice_status
 ice_aq_manage_mac_write(struct ice_hw *hw, u8 *mac_addr, u8 flags,
 			struct ice_sq_cd *cd);
 enum ice_status ice_clear_pf_cfg(struct ice_hw *hw);
 enum ice_status
-ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool atomic_restart);
+ice_aq_set_phy_cfg(struct ice_hw *hw, u8 lport,
+		   struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd);
+enum ice_status
+ice_set_fc(struct ice_port_info *pi, u8 *aq_failures,
+	   bool ena_auto_link_update);
+
 enum ice_status
 ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 			   struct ice_sq_cd *cd);
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index c71a9b528d6d..db2c502ae932 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -332,58 +332,473 @@ ice_get_ethtool_stats(struct net_device *netdev,
 	}
 }
 
-static int
-ice_get_link_ksettings(struct net_device *netdev,
-		       struct ethtool_link_ksettings *ks)
+/**
+ * ice_phy_type_to_ethtool - convert the phy_types to ethtool link modes
+ * @netdev: network interface device structure
+ * @ks: ethtool link ksettings struct to fill out
+ */
+static void ice_phy_type_to_ethtool(struct net_device *netdev,
+				    struct ethtool_link_ksettings *ks)
 {
 	struct ice_netdev_priv *np = netdev_priv(netdev);
 	struct ice_link_status *hw_link_info;
 	struct ice_vsi *vsi = np->vsi;
-	bool link_up;
+	u64 phy_types_low;
 
 	hw_link_info = &vsi->port_info->phy.link_info;
-	link_up = hw_link_info->link_info & ICE_AQ_LINK_UP;
+	phy_types_low = vsi->port_info->phy.phy_type_low;
+
+	ethtool_link_ksettings_zero_link_mode(ks, supported);
+	ethtool_link_ksettings_zero_link_mode(ks, advertising);
+
+	if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_100M_SGMII) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     100baseT_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_100MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     100baseT_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_1G_SGMII) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseT_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     1000baseT_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_KX) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseKX_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     1000baseKX_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_SX ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_LX) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseX_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_1000MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     1000baseX_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     2500baseT_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     2500baseT_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_X ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     2500baseX_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_2500MB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     2500baseX_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     5000baseT_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_5GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     5000baseT_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_DA ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_10G_SFI_C2C) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseT_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     10000baseT_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseKR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     10000baseKR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_SR) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseSR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     10000baseSR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_LR) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseLR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_10GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     10000baseLR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25G_AUI_C2C) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseCR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     25000baseCR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_SR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_LR) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseSR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     25000baseSR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseKR_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_25GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     25000baseKR_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseKR4_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     40000baseKR4_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_40G_XLAUI) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseCR4_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     40000baseCR4_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_SR4) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseSR4_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     40000baseSR4_Full);
+	}
+	if (phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_LR4) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseLR4_Full);
+		if (hw_link_info->req_speeds & ICE_AQ_LINK_SPEED_40GB)
+			ethtool_link_ksettings_add_link_mode(ks, advertising,
+							     40000baseLR4_Full);
+	}
 
-	ethtool_link_ksettings_add_link_mode(ks, supported,
-					     10000baseT_Full);
-	ethtool_link_ksettings_add_link_mode(ks, advertising,
-					     10000baseT_Full);
+	/* Autoneg PHY types */
+	if (phy_types_low & ICE_PHY_TYPE_LOW_100BASE_TX ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_1000BASE_KX ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_2500BASE_KX ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_5GBASE_KR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_10GBASE_KR_CR1 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_T ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR_S ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_CR1 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR_S ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_25GBASE_KR1 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_CR4 ||
+	    phy_types_low & ICE_PHY_TYPE_LOW_40GBASE_KR4) {
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     Autoneg);
+	}
+}
 
-	/* set speed and duplex */
-	if (link_up) {
-		switch (hw_link_info->link_speed) {
-		case ICE_AQ_LINK_SPEED_100MB:
-			ks->base.speed = SPEED_100;
-			break;
-		case ICE_AQ_LINK_SPEED_2500MB:
-			ks->base.speed = SPEED_2500;
-			break;
-		case ICE_AQ_LINK_SPEED_5GB:
-			ks->base.speed = SPEED_5000;
-			break;
-		case ICE_AQ_LINK_SPEED_10GB:
-			ks->base.speed = SPEED_10000;
-			break;
-		case ICE_AQ_LINK_SPEED_25GB:
-			ks->base.speed = SPEED_25000;
-			break;
-		case ICE_AQ_LINK_SPEED_40GB:
-			ks->base.speed = SPEED_40000;
-			break;
-		default:
-			ks->base.speed = SPEED_UNKNOWN;
-			break;
-		}
+#define TEST_SET_BITS_TIMEOUT	50
+#define TEST_SET_BITS_SLEEP_MAX	2000
+#define TEST_SET_BITS_SLEEP_MIN	1000
 
-		ks->base.duplex = DUPLEX_FULL;
-	} else {
-		ks->base.speed = SPEED_UNKNOWN;
-		ks->base.duplex = DUPLEX_UNKNOWN;
+/**
+ * ice_get_settings_link_up - Get Link settings for when link is up
+ * @ks: ethtool ksettings to fill in
+ * @netdev: network interface device structure
+ */
+static void ice_get_settings_link_up(struct ethtool_link_ksettings *ks,
+				     struct net_device *netdev)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ethtool_link_ksettings cap_ksettings;
+	struct ice_link_status *link_info;
+	struct ice_vsi *vsi = np->vsi;
+	bool unrecog_phy_low = false;
+
+	link_info = &vsi->port_info->phy.link_info;
+
+	/* Initialize supported and advertised settings based on phy settings */
+	switch (link_info->phy_type_low) {
+	case ICE_PHY_TYPE_LOW_100BASE_TX:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     100baseT_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     100baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_100M_SGMII:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     100baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_1000BASE_T:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseT_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     1000baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_1G_SGMII:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_1000BASE_SX:
+	case ICE_PHY_TYPE_LOW_1000BASE_LX:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseX_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_1000BASE_KX:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     1000baseKX_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     1000baseKX_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_2500BASE_T:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     2500baseT_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     2500baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_2500BASE_X:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     2500baseX_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_2500BASE_KX:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     2500baseX_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     2500baseX_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_5GBASE_T:
+	case ICE_PHY_TYPE_LOW_5GBASE_KR:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     5000baseT_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     5000baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_10GBASE_T:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseT_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     10000baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_10G_SFI_DA:
+	case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_10G_SFI_C2C:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseT_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_10GBASE_SR:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseSR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_10GBASE_LR:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseLR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     10000baseKR_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     10000baseKR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_25GBASE_T:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR_S:
+	case ICE_PHY_TYPE_LOW_25GBASE_CR1:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseCR_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     25000baseCR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseCR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_25GBASE_SR:
+	case ICE_PHY_TYPE_LOW_25GBASE_LR:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseSR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_25GBASE_KR:
+	case ICE_PHY_TYPE_LOW_25GBASE_KR1:
+	case ICE_PHY_TYPE_LOW_25GBASE_KR_S:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     25000baseKR_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     25000baseKR_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_CR4:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseCR4_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     40000baseCR4_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC:
+	case ICE_PHY_TYPE_LOW_40G_XLAUI:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseCR4_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_SR4:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseSR4_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_LR4:
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseLR4_Full);
+		break;
+	case ICE_PHY_TYPE_LOW_40GBASE_KR4:
+		ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, supported,
+						     40000baseKR4_Full);
+		ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg);
+		ethtool_link_ksettings_add_link_mode(ks, advertising,
+						     40000baseKR4_Full);
+		break;
+	default:
+		unrecog_phy_low = true;
+	}
+
+	if (unrecog_phy_low) {
+		/* if we got here and link is up something bad is afoot */
+		netdev_info(netdev, "WARNING: Unrecognized PHY_Low (0x%llx).\n",
+			    (u64)link_info->phy_type_low);
 	}
 
+	/* Now that we've worked out everything that could be supported by the
+	 * current PHY type, get what is supported by the NVM and intersect
+	 * them to get what is truly supported
+	 */
+	memset(&cap_ksettings, 0, sizeof(struct ethtool_link_ksettings));
+	ice_phy_type_to_ethtool(netdev, &cap_ksettings);
+	ethtool_intersect_link_masks(ks, &cap_ksettings);
+
+	switch (link_info->link_speed) {
+	case ICE_AQ_LINK_SPEED_40GB:
+		ks->base.speed = SPEED_40000;
+		break;
+	case ICE_AQ_LINK_SPEED_25GB:
+		ks->base.speed = SPEED_25000;
+		break;
+	case ICE_AQ_LINK_SPEED_20GB:
+		ks->base.speed = SPEED_20000;
+		break;
+	case ICE_AQ_LINK_SPEED_10GB:
+		ks->base.speed = SPEED_10000;
+		break;
+	case ICE_AQ_LINK_SPEED_5GB:
+		ks->base.speed = SPEED_5000;
+		break;
+	case ICE_AQ_LINK_SPEED_2500MB:
+		ks->base.speed = SPEED_2500;
+		break;
+	case ICE_AQ_LINK_SPEED_1000MB:
+		ks->base.speed = SPEED_1000;
+		break;
+	case ICE_AQ_LINK_SPEED_100MB:
+		ks->base.speed = SPEED_100;
+		break;
+	default:
+		netdev_info(netdev,
+			    "WARNING: Unrecognized link_speed (0x%x).\n",
+			    link_info->link_speed);
+		break;
+	}
+	ks->base.duplex = DUPLEX_FULL;
+}
+
+/**
+ * ice_get_settings_link_down - Get the Link settings when link is down
+ * @ks: ethtool ksettings to fill in
+ * @netdev: network interface device structure
+ *
+ * Reports link settings that can be determined when link is down
+ */
+static void
+ice_get_settings_link_down(struct ethtool_link_ksettings *ks,
+			   struct net_device __always_unused *netdev)
+{
+	/* link is down and the driver needs to fall back on
+	 * supported phy types to figure out what info to display
+	 */
+	ice_phy_type_to_ethtool(netdev, ks);
+
+	/* With no link, speed and duplex are unknown */
+	ks->base.speed = SPEED_UNKNOWN;
+	ks->base.duplex = DUPLEX_UNKNOWN;
+}
+
+/**
+ * ice_get_link_ksettings - Get Link Speed and Duplex settings
+ * @netdev: network interface device structure
+ * @ks: ethtool ksettings
+ *
+ * Reports speed/duplex settings based on media_type
+ */
+static int ice_get_link_ksettings(struct net_device *netdev,
+				  struct ethtool_link_ksettings *ks)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_link_status *hw_link_info;
+	struct ice_vsi *vsi = np->vsi;
+
+	ethtool_link_ksettings_zero_link_mode(ks, supported);
+	ethtool_link_ksettings_zero_link_mode(ks, advertising);
+	hw_link_info = &vsi->port_info->phy.link_info;
+
+	/* set speed and duplex */
+	if (hw_link_info->link_info & ICE_AQ_LINK_UP)
+		ice_get_settings_link_up(ks, netdev);
+	else
+		ice_get_settings_link_down(ks, netdev);
+
 	/* set autoneg settings */
-	ks->base.autoneg = ((hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ?
-			    AUTONEG_ENABLE : AUTONEG_DISABLE);
+	ks->base.autoneg = (hw_link_info->an_info & ICE_AQ_AN_COMPLETED) ?
+		AUTONEG_ENABLE : AUTONEG_DISABLE;
 
 	/* set media type settings */
 	switch (vsi->port_info->phy.media_type) {
@@ -441,6 +856,311 @@ ice_get_link_ksettings(struct net_device *netdev,
 	return 0;
 }
 
+/**
+ * ice_ksettings_find_adv_link_speed - Find advertising link speed
+ * @ks: ethtool ksettings
+ */
+static u16
+ice_ksettings_find_adv_link_speed(const struct ethtool_link_ksettings *ks)
+{
+	u16 adv_link_speed = 0;
+
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  100baseT_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_100MB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseX_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseT_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  1000baseKX_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_1000MB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  2500baseT_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  2500baseX_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_2500MB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  5000baseT_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_5GB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  10000baseT_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  10000baseKR_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_10GB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  10000baseSR_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  10000baseLR_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_10GB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  25000baseCR_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  25000baseSR_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  25000baseKR_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_25GB;
+	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  40000baseCR4_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  40000baseSR4_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  40000baseLR4_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  40000baseKR4_Full))
+		adv_link_speed |= ICE_AQ_LINK_SPEED_40GB;
+
+	return adv_link_speed;
+}
+
+/**
+ * ice_setup_autoneg
+ * @p: port info
+ * @ks: ethtool_link_ksettings
+ * @config: configuration that will be sent down to FW
+ * @autoneg_enabled: autonegotiation is enabled or not
+ * @autoneg_changed: will there a change in autonegotiation
+ * @netdev: network interface device structure
+ *
+ * Setup PHY autonegotiation feature
+ */
+static int
+ice_setup_autoneg(struct ice_port_info *p, struct ethtool_link_ksettings *ks,
+		  struct ice_aqc_set_phy_cfg_data *config,
+		  u8 autoneg_enabled, u8 *autoneg_changed,
+		  struct net_device *netdev)
+{
+	int err = 0;
+
+	*autoneg_changed = 0;
+
+	/* Check autoneg */
+	if (autoneg_enabled == AUTONEG_ENABLE) {
+		/* If autoneg was not already enabled */
+		if (!(p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED)) {
+			/* If autoneg is not supported, return error */
+			if (!ethtool_link_ksettings_test_link_mode(ks,
+								   supported,
+								   Autoneg)) {
+				netdev_info(netdev, "Autoneg not supported on this phy.\n");
+				err = -EINVAL;
+			} else {
+				/* Autoneg is allowed to change */
+				config->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+				*autoneg_changed = 1;
+			}
+		}
+	} else {
+		/* If autoneg is currently enabled */
+		if (p->phy.link_info.an_info & ICE_AQ_AN_COMPLETED) {
+			/* If autoneg is supported 10GBASE_T is the only phy
+			 * that can disable it, so otherwise return error
+			 */
+			if (ethtool_link_ksettings_test_link_mode(ks,
+								  supported,
+								  Autoneg)) {
+				netdev_info(netdev, "Autoneg cannot be disabled on this phy\n");
+				err = -EINVAL;
+			} else {
+				/* Autoneg is allowed to change */
+				config->caps &= ~ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+				*autoneg_changed = 1;
+			}
+		}
+	}
+
+	return err;
+}
+
+/**
+ * ice_set_link_ksettings - Set Speed and Duplex
+ * @netdev: network interface device structure
+ * @ks: ethtool ksettings
+ *
+ * Set speed/duplex per media_types advertised/forced
+ */
+static int ice_set_link_ksettings(struct net_device *netdev,
+				  const struct ethtool_link_ksettings *ks)
+{
+	u8 autoneg, timeout = TEST_SET_BITS_TIMEOUT, lport = 0;
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ethtool_link_ksettings safe_ks, copy_ks;
+	struct ice_aqc_get_phy_caps_data *abilities;
+	u16 adv_link_speed, curr_link_speed, idx;
+	struct ice_aqc_set_phy_cfg_data config;
+	struct ice_pf *pf = np->vsi->back;
+	struct ice_port_info *p;
+	u8 autoneg_changed = 0;
+	enum ice_status status;
+	u64 phy_type_low;
+	int err = 0;
+	bool linkup;
+
+	p = np->vsi->port_info;
+
+	if (!p)
+		return -EOPNOTSUPP;
+
+	/* Check if this is lan vsi */
+	for (idx = 0 ; idx <  pf->num_alloc_vsi ; idx++) {
+		if (pf->vsi[idx]->type == ICE_VSI_PF) {
+			if (np->vsi != pf->vsi[idx])
+				return -EOPNOTSUPP;
+			break;
+		}
+	}
+
+	if (p->phy.media_type != ICE_MEDIA_BASET &&
+	    p->phy.media_type != ICE_MEDIA_FIBER &&
+	    p->phy.media_type != ICE_MEDIA_BACKPLANE &&
+	    p->phy.media_type != ICE_MEDIA_DA &&
+	    p->phy.link_info.link_info & ICE_AQ_LINK_UP)
+		return -EOPNOTSUPP;
+
+	/* copy the ksettings to copy_ks to avoid modifying the original */
+	memcpy(&copy_ks, ks, sizeof(struct ethtool_link_ksettings));
+
+	/* save autoneg out of ksettings */
+	autoneg = copy_ks.base.autoneg;
+
+	memset(&safe_ks, 0, sizeof(safe_ks));
+
+	/* Get link modes supported by hardware.*/
+	ice_phy_type_to_ethtool(netdev, &safe_ks);
+
+	/* and check against modes requested by user.
+	 * Return an error if unsupported mode was set.
+	 */
+	if (!bitmap_subset(copy_ks.link_modes.advertising,
+			   safe_ks.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS))
+		return -EINVAL;
+
+	/* get our own copy of the bits to check against */
+	memset(&safe_ks, 0, sizeof(struct ethtool_link_ksettings));
+	safe_ks.base.cmd = copy_ks.base.cmd;
+	safe_ks.base.link_mode_masks_nwords =
+		copy_ks.base.link_mode_masks_nwords;
+	ice_get_link_ksettings(netdev, &safe_ks);
+
+	/* set autoneg back to what it currently is */
+	copy_ks.base.autoneg = safe_ks.base.autoneg;
+	/* we don't compare the speed */
+	copy_ks.base.speed = safe_ks.base.speed;
+
+	/* If copy_ks.base and safe_ks.base are not the same now, then they are
+	 * trying to set something that we do not support.
+	 */
+	if (memcmp(&copy_ks.base, &safe_ks.base,
+		   sizeof(struct ethtool_link_settings)))
+		return -EOPNOTSUPP;
+
+	while (test_and_set_bit(__ICE_CFG_BUSY, pf->state)) {
+		timeout--;
+		if (!timeout)
+			return -EBUSY;
+		usleep_range(TEST_SET_BITS_SLEEP_MIN, TEST_SET_BITS_SLEEP_MAX);
+	}
+
+	abilities = devm_kzalloc(&pf->pdev->dev, sizeof(*abilities),
+				 GFP_KERNEL);
+	if (!abilities)
+		return -ENOMEM;
+
+	/* Get the current phy config */
+	status = ice_aq_get_phy_caps(p, false, ICE_AQC_REPORT_SW_CFG, abilities,
+				     NULL);
+	if (status) {
+		err = -EAGAIN;
+		goto done;
+	}
+
+	/* Copy abilities to config in case autoneg is not set below */
+	memset(&config, 0, sizeof(struct ice_aqc_set_phy_cfg_data));
+	config.caps = abilities->caps & ~ICE_AQC_PHY_AN_MODE;
+	if (abilities->caps & ICE_AQC_PHY_AN_MODE)
+		config.caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+
+	/* Check autoneg */
+	err = ice_setup_autoneg(p, &safe_ks, &config, autoneg, &autoneg_changed,
+				netdev);
+
+	if (err)
+		goto done;
+
+	/* Call to get the current link speed */
+	p->phy.get_link_info = true;
+	status = ice_get_link_status(p, &linkup);
+	if (status) {
+		err = -EAGAIN;
+		goto done;
+	}
+
+	curr_link_speed = p->phy.link_info.link_speed;
+	adv_link_speed = ice_ksettings_find_adv_link_speed(ks);
+
+	/* If speed didn't get set, set it to what it currently is.
+	 * This is needed because if advertise is 0 (as it is when autoneg
+	 * is disabled) then speed won't get set.
+	 */
+	if (!adv_link_speed)
+		adv_link_speed = curr_link_speed;
+
+	/* Convert the advertise link speeds to their corresponded PHY_TYPE */
+	ice_update_phy_type(&phy_type_low, adv_link_speed);
+
+	if (!autoneg_changed && adv_link_speed == curr_link_speed) {
+		netdev_info(netdev, "Nothing changed, exiting without setting anything.\n");
+		goto done;
+	}
+
+	/* copy over the rest of the abilities */
+	config.low_power_ctrl = abilities->low_power_ctrl;
+	config.eee_cap = abilities->eee_cap;
+	config.eeer_value = abilities->eeer_value;
+	config.link_fec_opt = abilities->link_fec_options;
+
+	/* save the requested speeds */
+	p->phy.link_info.req_speeds = adv_link_speed;
+
+	/* set link and auto negotiation so changes take effect */
+	config.caps |= ICE_AQ_PHY_ENA_LINK;
+
+	if (phy_type_low) {
+		config.phy_type_low = cpu_to_le64(phy_type_low) &
+			abilities->phy_type_low;
+	} else {
+		err = -EAGAIN;
+		netdev_info(netdev, "Nothing changed. No PHY_TYPE is corresponded to advertised link speed.\n");
+		goto done;
+	}
+
+	/* If link is up put link down */
+	if (p->phy.link_info.link_info & ICE_AQ_LINK_UP) {
+		/* Tell the OS link is going down, the link will go
+		 * back up when fw says it is ready asynchronously
+		 */
+		ice_print_link_msg(np->vsi, false);
+		netif_carrier_off(netdev);
+		netif_tx_stop_all_queues(netdev);
+	}
+
+	/* make the aq call */
+	status = ice_aq_set_phy_cfg(&pf->hw, lport, &config, NULL);
+	if (status) {
+		netdev_info(netdev, "Set phy config failed,\n");
+		err = -EAGAIN;
+	}
+
+done:
+	devm_kfree(&pf->pdev->dev, abilities);
+	clear_bit(__ICE_CFG_BUSY, pf->state);
+
+	return err;
+}
+
 /**
  * ice_get_rxnfc - command to get RX flow classification rules
  * @netdev: network interface device structure
@@ -933,6 +1653,7 @@ static int ice_set_rxfh(struct net_device *netdev, const u32 *indir,
 
 static const struct ethtool_ops ice_ethtool_ops = {
 	.get_link_ksettings	= ice_get_link_ksettings,
+	.set_link_ksettings	= ice_set_link_ksettings,
 	.get_drvinfo            = ice_get_drvinfo,
 	.get_regs_len           = ice_get_regs_len,
 	.get_regs               = ice_get_regs,
-- 
2.17.1

^ permalink raw reply related

* [net-next 06/15] ice: Refactor switch rule management structures and functions
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Shannon Nelson, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

This patch is an adaptation of the work originally done by Grishma
Kotecha <grishma.kotecha@intel.com> that in summary refactors the
switch filtering logic in the driver. More specifically,
 - Update the recipe structure to also store list of rules
 - Update the existing code for recipes like MAC, VLAN, ethtype etc to
   use list head that is attached to switch recipe structure
 - Add a common function to search for a rule entry and add a new rule
   entry. Update the code to use this new function.
 - Refactor the rem_handle_vsi_list function to simplify the logic

CC: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   2 +
 drivers/net/ethernet/intel/ice/ice_common.c   |  36 +-
 drivers/net/ethernet/intel/ice/ice_switch.c   | 967 ++++++++----------
 drivers/net/ethernet/intel/ice/ice_switch.h   |  35 +-
 drivers/net/ethernet/intel/ice/ice_type.h     |  13 +-
 5 files changed, 500 insertions(+), 553 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 9a33fb95c0ea..87b304db9cad 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -443,6 +443,8 @@ struct ice_aqc_vsi_props {
 	u8 reserved[24];
 };
 
+#define ICE_MAX_NUM_RECIPES 64
+
 /* Add/Update/Remove/Get switch rules (indirect 0x02A0, 0x02A1, 0x02A2, 0x02A3)
  */
 struct ice_aqc_sw_rules {
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 2a1e13576ce2..4c6b1038dc5f 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -388,20 +388,7 @@ static enum ice_status ice_init_fltr_mgmt_struct(struct ice_hw *hw)
 
 	INIT_LIST_HEAD(&sw->vsi_list_map_head);
 
-	mutex_init(&sw->mac_list_lock);
-	INIT_LIST_HEAD(&sw->mac_list_head);
-
-	mutex_init(&sw->vlan_list_lock);
-	INIT_LIST_HEAD(&sw->vlan_list_head);
-
-	mutex_init(&sw->eth_m_list_lock);
-	INIT_LIST_HEAD(&sw->eth_m_list_head);
-
-	mutex_init(&sw->promisc_list_lock);
-	INIT_LIST_HEAD(&sw->promisc_list_head);
-
-	mutex_init(&sw->mac_vlan_list_lock);
-	INIT_LIST_HEAD(&sw->mac_vlan_list_head);
+	ice_init_def_sw_recp(hw);
 
 	return 0;
 }
@@ -415,19 +402,28 @@ static void ice_cleanup_fltr_mgmt_struct(struct ice_hw *hw)
 	struct ice_switch_info *sw = hw->switch_info;
 	struct ice_vsi_list_map_info *v_pos_map;
 	struct ice_vsi_list_map_info *v_tmp_map;
+	struct ice_sw_recipe *recps;
+	u8 i;
 
 	list_for_each_entry_safe(v_pos_map, v_tmp_map, &sw->vsi_list_map_head,
 				 list_entry) {
 		list_del(&v_pos_map->list_entry);
 		devm_kfree(ice_hw_to_dev(hw), v_pos_map);
 	}
+	recps = hw->switch_info->recp_list;
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		struct ice_fltr_mgmt_list_entry *lst_itr, *tmp_entry;
+
+		recps[i].root_rid = i;
+		mutex_destroy(&recps[i].filt_rule_lock);
+		list_for_each_entry_safe(lst_itr, tmp_entry,
+					 &recps[i].filt_rules, list_entry) {
+			list_del(&lst_itr->list_entry);
+			devm_kfree(ice_hw_to_dev(hw), lst_itr);
+		}
+	}
 
-	mutex_destroy(&sw->mac_list_lock);
-	mutex_destroy(&sw->vlan_list_lock);
-	mutex_destroy(&sw->eth_m_list_lock);
-	mutex_destroy(&sw->promisc_list_lock);
-	mutex_destroy(&sw->mac_vlan_list_lock);
-
+	devm_kfree(ice_hw_to_dev(hw), sw->recp_list);
 	devm_kfree(ice_hw_to_dev(hw), sw);
 }
 
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index d8b18cabc3a8..2693bebef977 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -85,6 +85,35 @@ ice_aq_alloc_free_res(struct ice_hw *hw, u16 num_entries,
 	return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
 }
 
+/**
+ * ice_init_def_sw_recp - initialize the recipe book keeping tables
+ * @hw: pointer to the hw struct
+ *
+ * Allocate memory for the entire recipe table and initialize the structures/
+ * entries corresponding to basic recipes.
+ */
+enum ice_status
+ice_init_def_sw_recp(struct ice_hw *hw)
+{
+	struct ice_sw_recipe *recps;
+	u8 i;
+
+	recps = devm_kcalloc(ice_hw_to_dev(hw), ICE_MAX_NUM_RECIPES,
+			     sizeof(struct ice_sw_recipe), GFP_KERNEL);
+	if (!recps)
+		return ICE_ERR_NO_MEMORY;
+
+	for (i = 0; i < ICE_SW_LKUP_LAST; i++) {
+		recps[i].root_rid = i;
+		INIT_LIST_HEAD(&recps[i].filt_rules);
+		mutex_init(&recps[i].filt_rule_lock);
+	}
+
+	hw->switch_info->recp_list = recps;
+
+	return 0;
+}
+
 /**
  * ice_aq_get_sw_cfg - get switch configuration
  * @hw: pointer to the hardware structure
@@ -818,10 +847,10 @@ static enum ice_status
 ice_create_pkt_fwd_rule(struct ice_hw *hw,
 			struct ice_fltr_list_entry *f_entry)
 {
-	struct ice_switch_info *sw = hw->switch_info;
 	struct ice_fltr_mgmt_list_entry *fm_entry;
 	struct ice_aqc_sw_rules_elem *s_rule;
 	enum ice_sw_lkup_type l_type;
+	struct ice_sw_recipe *recp;
 	enum ice_status status;
 
 	s_rule = devm_kzalloc(ice_hw_to_dev(hw),
@@ -862,31 +891,9 @@ ice_create_pkt_fwd_rule(struct ice_hw *hw,
 	 * calls remove filter AQ command
 	 */
 	l_type = fm_entry->fltr_info.lkup_type;
-	if (l_type == ICE_SW_LKUP_MAC) {
-		mutex_lock(&sw->mac_list_lock);
-		list_add(&fm_entry->list_entry, &sw->mac_list_head);
-		mutex_unlock(&sw->mac_list_lock);
-	} else if (l_type == ICE_SW_LKUP_VLAN) {
-		mutex_lock(&sw->vlan_list_lock);
-		list_add(&fm_entry->list_entry, &sw->vlan_list_head);
-		mutex_unlock(&sw->vlan_list_lock);
-	} else if (l_type == ICE_SW_LKUP_ETHERTYPE ||
-		   l_type == ICE_SW_LKUP_ETHERTYPE_MAC) {
-		mutex_lock(&sw->eth_m_list_lock);
-		list_add(&fm_entry->list_entry, &sw->eth_m_list_head);
-		mutex_unlock(&sw->eth_m_list_lock);
-	} else if (l_type == ICE_SW_LKUP_PROMISC ||
-		   l_type == ICE_SW_LKUP_PROMISC_VLAN) {
-		mutex_lock(&sw->promisc_list_lock);
-		list_add(&fm_entry->list_entry, &sw->promisc_list_head);
-		mutex_unlock(&sw->promisc_list_lock);
-	} else if (fm_entry->fltr_info.lkup_type == ICE_SW_LKUP_MAC_VLAN) {
-		mutex_lock(&sw->mac_vlan_list_lock);
-		list_add(&fm_entry->list_entry, &sw->mac_vlan_list_head);
-		mutex_unlock(&sw->mac_vlan_list_lock);
-	} else {
-		status = ICE_ERR_NOT_IMPL;
-	}
+	recp = &hw->switch_info->recp_list[l_type];
+	list_add(&fm_entry->list_entry, &recp->filt_rules);
+
 ice_create_pkt_fwd_rule_exit:
 	devm_kfree(ice_hw_to_dev(hw), s_rule);
 	return status;
@@ -895,19 +902,15 @@ ice_create_pkt_fwd_rule(struct ice_hw *hw,
 /**
  * ice_update_pkt_fwd_rule
  * @hw: pointer to the hardware structure
- * @rule_id: rule of previously created switch rule to update
- * @vsi_list_id: VSI list id to be updated with
- * @f_info: ice_fltr_info to pull other information for switch rule
+ * @f_info: filter information for switch rule
  *
  * Call AQ command to update a previously created switch rule with a
  * VSI list id
  */
 static enum ice_status
-ice_update_pkt_fwd_rule(struct ice_hw *hw, u16 rule_id, u16 vsi_list_id,
-			struct ice_fltr_info f_info)
+ice_update_pkt_fwd_rule(struct ice_hw *hw, struct ice_fltr_info *f_info)
 {
 	struct ice_aqc_sw_rules_elem *s_rule;
-	struct ice_fltr_info tmp_fltr;
 	enum ice_status status;
 
 	s_rule = devm_kzalloc(ice_hw_to_dev(hw),
@@ -915,14 +918,9 @@ ice_update_pkt_fwd_rule(struct ice_hw *hw, u16 rule_id, u16 vsi_list_id,
 	if (!s_rule)
 		return ICE_ERR_NO_MEMORY;
 
-	tmp_fltr = f_info;
-	tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST;
-	tmp_fltr.fwd_id.vsi_list_id = vsi_list_id;
-
-	ice_fill_sw_rule(hw, &tmp_fltr, s_rule,
-			 ice_aqc_opc_update_sw_rules);
+	ice_fill_sw_rule(hw, f_info, s_rule, ice_aqc_opc_update_sw_rules);
 
-	s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(rule_id);
+	s_rule->pdata.lkup_tx_rx.index = cpu_to_le16(f_info->fltr_rule_id);
 
 	/* Update switch rule with new rule set to forward VSI list */
 	status = ice_aq_sw_rules(hw, s_rule, ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1,
@@ -933,7 +931,7 @@ ice_update_pkt_fwd_rule(struct ice_hw *hw, u16 rule_id, u16 vsi_list_id,
 }
 
 /**
- * ice_handle_vsi_list_mgmt
+ * ice_add_update_vsi_list
  * @hw: pointer to the hardware structure
  * @m_entry: pointer to current filter management list entry
  * @cur_fltr: filter information from the book keeping entry
@@ -954,10 +952,10 @@ ice_update_pkt_fwd_rule(struct ice_hw *hw, u16 rule_id, u16 vsi_list_id,
  *		using the update switch rule command
  */
 static enum ice_status
-ice_handle_vsi_list_mgmt(struct ice_hw *hw,
-			 struct ice_fltr_mgmt_list_entry *m_entry,
-			 struct ice_fltr_info *cur_fltr,
-			 struct ice_fltr_info *new_fltr)
+ice_add_update_vsi_list(struct ice_hw *hw,
+			struct ice_fltr_mgmt_list_entry *m_entry,
+			struct ice_fltr_info *cur_fltr,
+			struct ice_fltr_info *new_fltr)
 {
 	enum ice_status status = 0;
 	u16 vsi_list_id = 0;
@@ -977,8 +975,8 @@ ice_handle_vsi_list_mgmt(struct ice_hw *hw,
 		 * a part of a VSI list. So, create a VSI list with the old and
 		 * new VSIs.
 		 */
+		struct ice_fltr_info tmp_fltr;
 		u16 vsi_id_arr[2];
-		u16 fltr_rule;
 
 		/* A rule already exists with the new VSI being added */
 		if (cur_fltr->fwd_id.vsi_id == new_fltr->fwd_id.vsi_id)
@@ -992,12 +990,14 @@ ice_handle_vsi_list_mgmt(struct ice_hw *hw,
 		if (status)
 			return status;
 
-		fltr_rule = cur_fltr->fltr_rule_id;
+		tmp_fltr = *new_fltr;
+		tmp_fltr.fltr_rule_id = cur_fltr->fltr_rule_id;
+		tmp_fltr.fltr_act = ICE_FWD_TO_VSI_LIST;
+		tmp_fltr.fwd_id.vsi_list_id = vsi_list_id;
 		/* Update the previous switch rule of "MAC forward to VSI" to
 		 * "MAC fwd to VSI list"
 		 */
-		status = ice_update_pkt_fwd_rule(hw, fltr_rule, vsi_list_id,
-						 *new_fltr);
+		status = ice_update_pkt_fwd_rule(hw, &tmp_fltr);
 		if (status)
 			return status;
 
@@ -1042,54 +1042,245 @@ ice_handle_vsi_list_mgmt(struct ice_hw *hw,
 }
 
 /**
- * ice_find_mac_entry
+ * ice_find_rule_entry - Search a rule entry
  * @hw: pointer to the hardware structure
- * @mac_addr: MAC address to search for
+ * @recp_id: lookup type for which the specified rule needs to be searched
+ * @f_info: rule information
  *
- * Helper function to search for a MAC entry using a given MAC address
- * Returns pointer to the entry if found.
+ * Helper function to search for a given rule entry
+ * Returns pointer to entry storing the rule if found
  */
 static struct ice_fltr_mgmt_list_entry *
-ice_find_mac_entry(struct ice_hw *hw, u8 *mac_addr)
+ice_find_rule_entry(struct ice_hw *hw, u8 recp_id, struct ice_fltr_info *f_info)
 {
-	struct ice_fltr_mgmt_list_entry *m_list_itr, *mac_ret = NULL;
+	struct ice_fltr_mgmt_list_entry *list_itr, *ret = NULL;
 	struct ice_switch_info *sw = hw->switch_info;
-
-	mutex_lock(&sw->mac_list_lock);
-	list_for_each_entry(m_list_itr, &sw->mac_list_head, list_entry) {
-		u8 *buf = &m_list_itr->fltr_info.l_data.mac.mac_addr[0];
-
-		if (ether_addr_equal(buf, mac_addr)) {
-			mac_ret = m_list_itr;
+	struct list_head *list_head;
+
+	list_head = &sw->recp_list[recp_id].filt_rules;
+	list_for_each_entry(list_itr, list_head, list_entry) {
+		if (!memcmp(&f_info->l_data, &list_itr->fltr_info.l_data,
+			    sizeof(f_info->l_data)) &&
+		    f_info->flag == list_itr->fltr_info.flag) {
+			ret = list_itr;
 			break;
 		}
 	}
-	mutex_unlock(&sw->mac_list_lock);
-	return mac_ret;
+	return ret;
 }
 
 /**
- * ice_add_shared_mac - Add one MAC shared filter rule
+ * ice_add_rule_internal - add rule for a given lookup type
  * @hw: pointer to the hardware structure
+ * @recp_id: lookup type (recipe id) for which rule has to be added
  * @f_entry: structure containing MAC forwarding information
  *
- * Adds or updates the book keeping list for the MAC addresses
+ * Adds or updates the rule lists for a given recipe
  */
 static enum ice_status
-ice_add_shared_mac(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry)
+ice_add_rule_internal(struct ice_hw *hw, u8 recp_id,
+		      struct ice_fltr_list_entry *f_entry)
 {
+	struct ice_switch_info *sw = hw->switch_info;
 	struct ice_fltr_info *new_fltr, *cur_fltr;
 	struct ice_fltr_mgmt_list_entry *m_entry;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+	enum ice_status status = 0;
 
-	new_fltr = &f_entry->fltr_info;
+	rule_lock = &sw->recp_list[recp_id].filt_rule_lock;
 
-	m_entry = ice_find_mac_entry(hw, &new_fltr->l_data.mac.mac_addr[0]);
-	if (!m_entry)
+	mutex_lock(rule_lock);
+	new_fltr = &f_entry->fltr_info;
+	if (new_fltr->flag & ICE_FLTR_RX)
+		new_fltr->src = hw->port_info->lport;
+	else if (new_fltr->flag & ICE_FLTR_TX)
+		new_fltr->src = f_entry->fltr_info.fwd_id.vsi_id;
+
+	m_entry = ice_find_rule_entry(hw, recp_id, new_fltr);
+	if (!m_entry) {
+		mutex_unlock(rule_lock);
 		return ice_create_pkt_fwd_rule(hw, f_entry);
+	}
 
 	cur_fltr = &m_entry->fltr_info;
+	status = ice_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr);
+	mutex_unlock(rule_lock);
+
+	return status;
+}
+
+/**
+ * ice_remove_vsi_list_rule
+ * @hw: pointer to the hardware structure
+ * @vsi_list_id: VSI list id generated as part of allocate resource
+ * @lkup_type: switch rule filter lookup type
+ *
+ * The VSI list should be emptied before this function is called to remove the
+ * VSI list.
+ */
+static enum ice_status
+ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id,
+			 enum ice_sw_lkup_type lkup_type)
+{
+	struct ice_aqc_sw_rules_elem *s_rule;
+	enum ice_status status;
+	u16 s_rule_size;
+
+	s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(0);
+	s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL);
+	if (!s_rule)
+		return ICE_ERR_NO_MEMORY;
+
+	s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR);
+	s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id);
+
+	/* Free the vsi_list resource that we allocated. It is assumed that the
+	 * list is empty at this point.
+	 */
+	status = ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type,
+					    ice_aqc_opc_free_res);
 
-	return ice_handle_vsi_list_mgmt(hw, m_entry, cur_fltr, new_fltr);
+	devm_kfree(ice_hw_to_dev(hw), s_rule);
+	return status;
+}
+
+/**
+ * ice_rem_update_vsi_list
+ * @hw: pointer to the hardware structure
+ * @vsi_id: ID of the VSI to remove
+ * @fm_list: filter management entry for which the VSI list management needs to
+ *           be done
+ */
+static enum ice_status
+ice_rem_update_vsi_list(struct ice_hw *hw, u16 vsi_id,
+			struct ice_fltr_mgmt_list_entry *fm_list)
+{
+	enum ice_sw_lkup_type lkup_type;
+	enum ice_status status = 0;
+	u16 vsi_list_id;
+
+	if (fm_list->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST ||
+	    fm_list->vsi_count == 0)
+		return ICE_ERR_PARAM;
+
+	/* A rule with the VSI being removed does not exist */
+	if (!test_bit(vsi_id, fm_list->vsi_list_info->vsi_map))
+		return ICE_ERR_DOES_NOT_EXIST;
+
+	lkup_type = fm_list->fltr_info.lkup_type;
+	vsi_list_id = fm_list->fltr_info.fwd_id.vsi_list_id;
+
+	status = ice_update_vsi_list_rule(hw, &vsi_id, 1, vsi_list_id, true,
+					  ice_aqc_opc_update_sw_rules,
+					  lkup_type);
+	if (status)
+		return status;
+
+	fm_list->vsi_count--;
+	clear_bit(vsi_id, fm_list->vsi_list_info->vsi_map);
+
+	if ((fm_list->vsi_count == 1 && lkup_type != ICE_SW_LKUP_VLAN) ||
+	    (fm_list->vsi_count == 0 && lkup_type == ICE_SW_LKUP_VLAN)) {
+		struct ice_vsi_list_map_info *vsi_list_info =
+			fm_list->vsi_list_info;
+		u16 rem_vsi_id;
+
+		rem_vsi_id = find_first_bit(vsi_list_info->vsi_map,
+					    ICE_MAX_VSI);
+		if (rem_vsi_id == ICE_MAX_VSI)
+			return ICE_ERR_OUT_OF_RANGE;
+
+		status = ice_update_vsi_list_rule(hw, &rem_vsi_id, 1,
+						  vsi_list_id, true,
+						  ice_aqc_opc_update_sw_rules,
+						  lkup_type);
+		if (status)
+			return status;
+
+		/* Remove the VSI list since it is no longer used */
+		status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type);
+		if (status)
+			return status;
+
+		/* Change the list entry action from VSI_LIST to VSI */
+		fm_list->fltr_info.fltr_act = ICE_FWD_TO_VSI;
+		fm_list->fltr_info.fwd_id.vsi_id = rem_vsi_id;
+
+		list_del(&vsi_list_info->list_entry);
+		devm_kfree(ice_hw_to_dev(hw), vsi_list_info);
+		fm_list->vsi_list_info = NULL;
+	}
+
+	return status;
+}
+
+/**
+ * ice_remove_rule_internal - Remove a filter rule of a given type
+ * @hw: pointer to the hardware structure
+ * @recp_id: recipe id for which the rule needs to removed
+ * @f_entry: rule entry containing filter information
+ */
+static enum ice_status
+ice_remove_rule_internal(struct ice_hw *hw, u8 recp_id,
+			 struct ice_fltr_list_entry *f_entry)
+{
+	struct ice_switch_info *sw = hw->switch_info;
+	struct ice_fltr_mgmt_list_entry *list_elem;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+	enum ice_status status = 0;
+	bool remove_rule = false;
+	u16 vsi_id;
+
+	rule_lock = &sw->recp_list[recp_id].filt_rule_lock;
+	mutex_lock(rule_lock);
+	list_elem = ice_find_rule_entry(hw, recp_id, &f_entry->fltr_info);
+	if (!list_elem) {
+		status = ICE_ERR_DOES_NOT_EXIST;
+		goto exit;
+	}
+
+	if (list_elem->fltr_info.fltr_act != ICE_FWD_TO_VSI_LIST) {
+		remove_rule = true;
+	} else {
+		vsi_id = f_entry->fltr_info.fwd_id.vsi_id;
+		status = ice_rem_update_vsi_list(hw, vsi_id, list_elem);
+		if (status)
+			goto exit;
+		/* if vsi count goes to zero after updating the vsi list */
+		if (list_elem->vsi_count == 0)
+			remove_rule = true;
+	}
+
+	if (remove_rule) {
+		/* Remove the lookup rule */
+		struct ice_aqc_sw_rules_elem *s_rule;
+
+		s_rule = devm_kzalloc(ice_hw_to_dev(hw),
+				      ICE_SW_RULE_RX_TX_NO_HDR_SIZE,
+				      GFP_KERNEL);
+		if (!s_rule) {
+			status = ICE_ERR_NO_MEMORY;
+			goto exit;
+		}
+
+		ice_fill_sw_rule(hw, &list_elem->fltr_info, s_rule,
+				 ice_aqc_opc_remove_sw_rules);
+
+		status = ice_aq_sw_rules(hw, s_rule,
+					 ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1,
+					 ice_aqc_opc_remove_sw_rules, NULL);
+		if (status)
+			goto exit;
+
+		/* Remove a book keeping from the list */
+		devm_kfree(ice_hw_to_dev(hw), s_rule);
+
+		list_del(&list_elem->list_entry);
+		devm_kfree(ice_hw_to_dev(hw), list_elem);
+	}
+exit:
+	mutex_unlock(rule_lock);
+	return status;
 }
 
 /**
@@ -1108,7 +1299,10 @@ ice_add_mac(struct ice_hw *hw, struct list_head *m_list)
 {
 	struct ice_aqc_sw_rules_elem *s_rule, *r_iter;
 	struct ice_fltr_list_entry *m_list_itr;
+	struct list_head *rule_head;
 	u16 elem_sent, total_elem_left;
+	struct ice_switch_info *sw;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
 	enum ice_status status = 0;
 	u16 num_unicast = 0;
 	u16 s_rule_size;
@@ -1116,48 +1310,62 @@ ice_add_mac(struct ice_hw *hw, struct list_head *m_list)
 	if (!m_list || !hw)
 		return ICE_ERR_PARAM;
 
+	s_rule = NULL;
+	sw = hw->switch_info;
+	rule_lock = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rule_lock;
 	list_for_each_entry(m_list_itr, m_list, list_entry) {
 		u8 *add = &m_list_itr->fltr_info.l_data.mac.mac_addr[0];
 
-		if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC)
-			return ICE_ERR_PARAM;
-		if (is_zero_ether_addr(add))
+		m_list_itr->fltr_info.flag = ICE_FLTR_TX;
+		if (m_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_MAC ||
+		    is_zero_ether_addr(add))
 			return ICE_ERR_PARAM;
 		if (is_unicast_ether_addr(add) && !hw->ucast_shared) {
 			/* Don't overwrite the unicast address */
-			if (ice_find_mac_entry(hw, add))
+			mutex_lock(rule_lock);
+			if (ice_find_rule_entry(hw, ICE_SW_LKUP_MAC,
+						&m_list_itr->fltr_info)) {
+				mutex_unlock(rule_lock);
 				return ICE_ERR_ALREADY_EXISTS;
+			}
+			mutex_unlock(rule_lock);
 			num_unicast++;
 		} else if (is_multicast_ether_addr(add) ||
 			   (is_unicast_ether_addr(add) && hw->ucast_shared)) {
-			status = ice_add_shared_mac(hw, m_list_itr);
-			if (status) {
-				m_list_itr->status = ICE_FLTR_STATUS_FW_FAIL;
-				return status;
-			}
-			m_list_itr->status = ICE_FLTR_STATUS_FW_SUCCESS;
+			m_list_itr->status =
+				ice_add_rule_internal(hw, ICE_SW_LKUP_MAC,
+						      m_list_itr);
+			if (m_list_itr->status)
+				return m_list_itr->status;
 		}
 	}
 
+	mutex_lock(rule_lock);
 	/* Exit if no suitable entries were found for adding bulk switch rule */
-	if (!num_unicast)
-		return 0;
+	if (!num_unicast) {
+		status = 0;
+		goto ice_add_mac_exit;
+	}
+
+	rule_head = &sw->recp_list[ICE_SW_LKUP_MAC].filt_rules;
 
 	/* Allocate switch rule buffer for the bulk update for unicast */
 	s_rule_size = ICE_SW_RULE_RX_TX_ETH_HDR_SIZE;
 	s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size,
 			      GFP_KERNEL);
-	if (!s_rule)
-		return ICE_ERR_NO_MEMORY;
+	if (!s_rule) {
+		status = ICE_ERR_NO_MEMORY;
+		goto ice_add_mac_exit;
+	}
 
 	r_iter = s_rule;
 	list_for_each_entry(m_list_itr, m_list, list_entry) {
 		struct ice_fltr_info *f_info = &m_list_itr->fltr_info;
-		u8 *addr = &f_info->l_data.mac.mac_addr[0];
+		u8 *mac_addr = &f_info->l_data.mac.mac_addr[0];
 
-		if (is_unicast_ether_addr(addr)) {
-			ice_fill_sw_rule(hw, &m_list_itr->fltr_info,
-					 r_iter, ice_aqc_opc_add_sw_rules);
+		if (is_unicast_ether_addr(mac_addr)) {
+			ice_fill_sw_rule(hw, &m_list_itr->fltr_info, r_iter,
+					 ice_aqc_opc_add_sw_rules);
 			r_iter = (struct ice_aqc_sw_rules_elem *)
 				((u8 *)r_iter + s_rule_size);
 		}
@@ -1185,11 +1393,10 @@ ice_add_mac(struct ice_hw *hw, struct list_head *m_list)
 	r_iter = s_rule;
 	list_for_each_entry(m_list_itr, m_list, list_entry) {
 		struct ice_fltr_info *f_info = &m_list_itr->fltr_info;
-		u8 *addr = &f_info->l_data.mac.mac_addr[0];
-		struct ice_switch_info *sw = hw->switch_info;
+		u8 *mac_addr = &f_info->l_data.mac.mac_addr[0];
 		struct ice_fltr_mgmt_list_entry *fm_entry;
 
-		if (is_unicast_ether_addr(addr)) {
+		if (is_unicast_ether_addr(mac_addr)) {
 			f_info->fltr_rule_id =
 				le16_to_cpu(r_iter->pdata.lkup_tx_rx.index);
 			f_info->fltr_act = ICE_FWD_TO_VSI;
@@ -1205,45 +1412,20 @@ ice_add_mac(struct ice_hw *hw, struct list_head *m_list)
 			/* The book keeping entries will get removed when
 			 * base driver calls remove filter AQ command
 			 */
-			mutex_lock(&sw->mac_list_lock);
-			list_add(&fm_entry->list_entry, &sw->mac_list_head);
-			mutex_unlock(&sw->mac_list_lock);
 
+			list_add(&fm_entry->list_entry, rule_head);
 			r_iter = (struct ice_aqc_sw_rules_elem *)
 				((u8 *)r_iter + s_rule_size);
 		}
 	}
 
 ice_add_mac_exit:
-	devm_kfree(ice_hw_to_dev(hw), s_rule);
+	mutex_unlock(rule_lock);
+	if (s_rule)
+		devm_kfree(ice_hw_to_dev(hw), s_rule);
 	return status;
 }
 
-/**
- * ice_find_vlan_entry
- * @hw: pointer to the hardware structure
- * @vlan_id: VLAN id to search for
- *
- * Helper function to search for a VLAN entry using a given VLAN id
- * Returns pointer to the entry if found.
- */
-static struct ice_fltr_mgmt_list_entry *
-ice_find_vlan_entry(struct ice_hw *hw, u16 vlan_id)
-{
-	struct ice_fltr_mgmt_list_entry *vlan_list_itr, *vlan_ret = NULL;
-	struct ice_switch_info *sw = hw->switch_info;
-
-	mutex_lock(&sw->vlan_list_lock);
-	list_for_each_entry(vlan_list_itr, &sw->vlan_list_head, list_entry)
-		if (vlan_list_itr->fltr_info.l_data.vlan.vlan_id == vlan_id) {
-			vlan_ret = vlan_list_itr;
-			break;
-		}
-
-	mutex_unlock(&sw->vlan_list_lock);
-	return vlan_ret;
-}
-
 /**
  * ice_add_vlan_internal - Add one VLAN based filter rule
  * @hw: pointer to the hardware structure
@@ -1252,20 +1434,22 @@ ice_find_vlan_entry(struct ice_hw *hw, u16 vlan_id)
 static enum ice_status
 ice_add_vlan_internal(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry)
 {
+	struct ice_switch_info *sw = hw->switch_info;
 	struct ice_fltr_info *new_fltr, *cur_fltr;
 	struct ice_fltr_mgmt_list_entry *v_list_itr;
-	u16 vlan_id;
+	struct mutex *rule_lock; /* Lock to protect filter rule list */
+	enum ice_status status = 0;
 
 	new_fltr = &f_entry->fltr_info;
 	/* VLAN id should only be 12 bits */
 	if (new_fltr->l_data.vlan.vlan_id > ICE_MAX_VLAN_ID)
 		return ICE_ERR_PARAM;
 
-	vlan_id = new_fltr->l_data.vlan.vlan_id;
-	v_list_itr = ice_find_vlan_entry(hw, vlan_id);
+	rule_lock = &sw->recp_list[ICE_SW_LKUP_VLAN].filt_rule_lock;
+	mutex_lock(rule_lock);
+	v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN, new_fltr);
 	if (!v_list_itr) {
 		u16 vsi_id = ICE_VSI_INVAL_ID;
-		enum ice_status status;
 		u16 vsi_list_id = 0;
 
 		if (new_fltr->fltr_act == ICE_FWD_TO_VSI) {
@@ -1279,26 +1463,33 @@ ice_add_vlan_internal(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry)
 							  &vsi_list_id,
 							  lkup_type);
 			if (status)
-				return status;
+				goto exit;
 			new_fltr->fltr_act = ICE_FWD_TO_VSI_LIST;
 			new_fltr->fwd_id.vsi_list_id = vsi_list_id;
 		}
 
 		status = ice_create_pkt_fwd_rule(hw, f_entry);
 		if (!status && vsi_id != ICE_VSI_INVAL_ID) {
-			v_list_itr = ice_find_vlan_entry(hw, vlan_id);
-			if (!v_list_itr)
-				return ICE_ERR_DOES_NOT_EXIST;
+			v_list_itr = ice_find_rule_entry(hw, ICE_SW_LKUP_VLAN,
+							 new_fltr);
+			if (!v_list_itr) {
+				status = ICE_ERR_DOES_NOT_EXIST;
+				goto exit;
+			}
 			v_list_itr->vsi_list_info =
 				ice_create_vsi_list_map(hw, &vsi_id, 1,
 							vsi_list_id);
 		}
 
-		return status;
+		goto exit;
 	}
 
 	cur_fltr = &v_list_itr->fltr_info;
-	return ice_handle_vsi_list_mgmt(hw, v_list_itr, cur_fltr, new_fltr);
+	status = ice_add_update_vsi_list(hw, v_list_itr, cur_fltr, new_fltr);
+
+exit:
+	mutex_unlock(rule_lock);
+	return status;
 }
 
 /**
@@ -1315,326 +1506,25 @@ ice_add_vlan(struct ice_hw *hw, struct list_head *v_list)
 		return ICE_ERR_PARAM;
 
 	list_for_each_entry(v_list_itr, v_list, list_entry) {
-		enum ice_status status;
-
 		if (v_list_itr->fltr_info.lkup_type != ICE_SW_LKUP_VLAN)
 			return ICE_ERR_PARAM;
-
-		status = ice_add_vlan_internal(hw, v_list_itr);
-		if (status) {
-			v_list_itr->status = ICE_FLTR_STATUS_FW_FAIL;
-			return status;
-		}
-		v_list_itr->status = ICE_FLTR_STATUS_FW_SUCCESS;
+		v_list_itr->fltr_info.flag = ICE_FLTR_TX;
+		v_list_itr->status = ice_add_vlan_internal(hw, v_list_itr);
+		if (v_list_itr->status)
+			return v_list_itr->status;
 	}
 	return 0;
 }
 
 /**
- * ice_remove_vsi_list_rule
- * @hw: pointer to the hardware structure
- * @vsi_list_id: VSI list id generated as part of allocate resource
- * @lkup_type: switch rule filter lookup type
- */
-static enum ice_status
-ice_remove_vsi_list_rule(struct ice_hw *hw, u16 vsi_list_id,
-			 enum ice_sw_lkup_type lkup_type)
-{
-	struct ice_aqc_sw_rules_elem *s_rule;
-	enum ice_status status;
-	u16 s_rule_size;
-
-	s_rule_size = (u16)ICE_SW_RULE_VSI_LIST_SIZE(0);
-	s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL);
-	if (!s_rule)
-		return ICE_ERR_NO_MEMORY;
-
-	s_rule->type = cpu_to_le16(ICE_AQC_SW_RULES_T_VSI_LIST_CLEAR);
-	s_rule->pdata.vsi_list.index = cpu_to_le16(vsi_list_id);
-	/* FW expects number of VSIs in vsi_list resource to be 0 for clear
-	 * command. Since memory is zero'ed out during initialization, it's not
-	 * necessary to explicitly initialize the variable to 0.
-	 */
-
-	status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1,
-				 ice_aqc_opc_remove_sw_rules, NULL);
-	if (!status)
-		/* Free the vsi_list resource that we allocated */
-		status = ice_aq_alloc_free_vsi_list(hw, &vsi_list_id, lkup_type,
-						    ice_aqc_opc_free_res);
-
-	devm_kfree(ice_hw_to_dev(hw), s_rule);
-	return status;
-}
-
-/**
- * ice_handle_rem_vsi_list_mgmt
- * @hw: pointer to the hardware structure
- * @vsi_id: ID of the VSI to remove
- * @fm_list_itr: filter management entry for which the VSI list management
- * needs to be done
- */
-static enum ice_status
-ice_handle_rem_vsi_list_mgmt(struct ice_hw *hw, u16 vsi_id,
-			     struct ice_fltr_mgmt_list_entry *fm_list_itr)
-{
-	struct ice_switch_info *sw = hw->switch_info;
-	enum ice_status status = 0;
-	enum ice_sw_lkup_type lkup_type;
-	bool is_last_elem = true;
-	bool conv_list = false;
-	bool del_list = false;
-	u16 vsi_list_id;
-
-	lkup_type = fm_list_itr->fltr_info.lkup_type;
-	vsi_list_id = fm_list_itr->fltr_info.fwd_id.vsi_list_id;
-
-	if (fm_list_itr->vsi_count > 1) {
-		status = ice_update_vsi_list_rule(hw, &vsi_id, 1, vsi_list_id,
-						  true,
-						  ice_aqc_opc_update_sw_rules,
-						  lkup_type);
-		if (status)
-			return status;
-		fm_list_itr->vsi_count--;
-		is_last_elem = false;
-		clear_bit(vsi_id, fm_list_itr->vsi_list_info->vsi_map);
-	}
-
-	/* For non-VLAN rules that forward packets to a VSI list, convert them
-	 * to forwarding packets to a VSI if there is only one VSI left in the
-	 * list.  Unused lists are then removed.
-	 * VLAN rules need to use VSI lists even with only one VSI.
-	 */
-	if (fm_list_itr->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST) {
-		if (lkup_type == ICE_SW_LKUP_VLAN) {
-			del_list = is_last_elem;
-		} else if (fm_list_itr->vsi_count == 1) {
-			conv_list = true;
-			del_list = true;
-		}
-	}
-
-	if (del_list) {
-		/* Remove the VSI list since it is no longer used */
-		struct ice_vsi_list_map_info *vsi_list_info =
-			fm_list_itr->vsi_list_info;
-
-		status = ice_remove_vsi_list_rule(hw, vsi_list_id, lkup_type);
-		if (status)
-			return status;
-
-		if (conv_list) {
-			u16 rem_vsi_id;
-
-			rem_vsi_id = find_first_bit(vsi_list_info->vsi_map,
-						    ICE_MAX_VSI);
-
-			/* Error out when the expected last element is not in
-			 * the VSI list map
-			 */
-			if (rem_vsi_id == ICE_MAX_VSI)
-				return ICE_ERR_OUT_OF_RANGE;
-
-			/* Change the list entry action from VSI_LIST to VSI */
-			fm_list_itr->fltr_info.fltr_act = ICE_FWD_TO_VSI;
-			fm_list_itr->fltr_info.fwd_id.vsi_id = rem_vsi_id;
-		}
-
-		list_del(&vsi_list_info->list_entry);
-		devm_kfree(ice_hw_to_dev(hw), vsi_list_info);
-		fm_list_itr->vsi_list_info = NULL;
-	}
-
-	if (conv_list) {
-		/* Convert the rule's forward action to forwarding packets to
-		 * a VSI
-		 */
-		struct ice_aqc_sw_rules_elem *s_rule;
-
-		s_rule = devm_kzalloc(ice_hw_to_dev(hw),
-				      ICE_SW_RULE_RX_TX_ETH_HDR_SIZE,
-				      GFP_KERNEL);
-		if (!s_rule)
-			return ICE_ERR_NO_MEMORY;
-
-		ice_fill_sw_rule(hw, &fm_list_itr->fltr_info, s_rule,
-				 ice_aqc_opc_update_sw_rules);
-
-		s_rule->pdata.lkup_tx_rx.index =
-			cpu_to_le16(fm_list_itr->fltr_info.fltr_rule_id);
-
-		status = ice_aq_sw_rules(hw, s_rule,
-					 ICE_SW_RULE_RX_TX_ETH_HDR_SIZE, 1,
-					 ice_aqc_opc_update_sw_rules, NULL);
-		devm_kfree(ice_hw_to_dev(hw), s_rule);
-		if (status)
-			return status;
-	}
-
-	if (is_last_elem) {
-		/* Remove the lookup rule */
-		struct ice_aqc_sw_rules_elem *s_rule;
-
-		s_rule = devm_kzalloc(ice_hw_to_dev(hw),
-				      ICE_SW_RULE_RX_TX_NO_HDR_SIZE,
-				      GFP_KERNEL);
-		if (!s_rule)
-			return ICE_ERR_NO_MEMORY;
-
-		ice_fill_sw_rule(hw, &fm_list_itr->fltr_info, s_rule,
-				 ice_aqc_opc_remove_sw_rules);
-
-		status = ice_aq_sw_rules(hw, s_rule,
-					 ICE_SW_RULE_RX_TX_NO_HDR_SIZE, 1,
-					 ice_aqc_opc_remove_sw_rules, NULL);
-		if (status)
-			return status;
-
-		/* Remove a book keeping entry from the MAC address list */
-		mutex_lock(&sw->mac_list_lock);
-		list_del(&fm_list_itr->list_entry);
-		mutex_unlock(&sw->mac_list_lock);
-		devm_kfree(ice_hw_to_dev(hw), fm_list_itr);
-		devm_kfree(ice_hw_to_dev(hw), s_rule);
-	}
-	return status;
-}
-
-/**
- * ice_remove_mac_entry
- * @hw: pointer to the hardware structure
- * @f_entry: structure containing MAC forwarding information
- */
-static enum ice_status
-ice_remove_mac_entry(struct ice_hw *hw, struct ice_fltr_list_entry *f_entry)
-{
-	struct ice_fltr_mgmt_list_entry *m_entry;
-	u16 vsi_id;
-	u8 *add;
-
-	add = &f_entry->fltr_info.l_data.mac.mac_addr[0];
-
-	m_entry = ice_find_mac_entry(hw, add);
-	if (!m_entry)
-		return ICE_ERR_PARAM;
-
-	vsi_id = f_entry->fltr_info.fwd_id.vsi_id;
-	return ice_handle_rem_vsi_list_mgmt(hw, vsi_id, m_entry);
-}
-
-/**
- * ice_remove_mac - remove a MAC address based filter rule
- * @hw: pointer to the hardware structure
- * @m_list: list of MAC addresses and forwarding information
- *
- * This function removes either a MAC filter rule or a specific VSI from a
- * VSI list for a multicast MAC address.
- *
- * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by
- * ice_add_mac. Caller should be aware that this call will only work if all
- * the entries passed into m_list were added previously. It will not attempt to
- * do a partial remove of entries that were found.
- */
-enum ice_status
-ice_remove_mac(struct ice_hw *hw, struct list_head *m_list)
-{
-	struct ice_aqc_sw_rules_elem *s_rule, *r_iter;
-	u8 s_rule_size = ICE_SW_RULE_RX_TX_NO_HDR_SIZE;
-	struct ice_switch_info *sw = hw->switch_info;
-	struct ice_fltr_mgmt_list_entry *m_entry;
-	struct ice_fltr_list_entry *m_list_itr;
-	u16 elem_sent, total_elem_left;
-	enum ice_status status = 0;
-	u16 num_unicast = 0;
-
-	if (!m_list)
-		return ICE_ERR_PARAM;
-
-	list_for_each_entry(m_list_itr, m_list, list_entry) {
-		u8 *addr = m_list_itr->fltr_info.l_data.mac.mac_addr;
-
-		if (is_unicast_ether_addr(addr) && !hw->ucast_shared)
-			num_unicast++;
-		else if (is_multicast_ether_addr(addr) ||
-			 (is_unicast_ether_addr(addr) && hw->ucast_shared))
-			ice_remove_mac_entry(hw, m_list_itr);
-	}
-
-	/* Exit if no unicast addresses found. Multicast switch rules
-	 * were added individually
-	 */
-	if (!num_unicast)
-		return 0;
-
-	/* Allocate switch rule buffer for the bulk update for unicast */
-	s_rule = devm_kcalloc(ice_hw_to_dev(hw), num_unicast, s_rule_size,
-			      GFP_KERNEL);
-	if (!s_rule)
-		return ICE_ERR_NO_MEMORY;
-
-	r_iter = s_rule;
-	list_for_each_entry(m_list_itr, m_list, list_entry) {
-		u8 *addr = m_list_itr->fltr_info.l_data.mac.mac_addr;
-
-		if (is_unicast_ether_addr(addr)) {
-			m_entry = ice_find_mac_entry(hw, addr);
-			if (!m_entry) {
-				status = ICE_ERR_DOES_NOT_EXIST;
-				goto ice_remove_mac_exit;
-			}
-
-			ice_fill_sw_rule(hw, &m_entry->fltr_info,
-					 r_iter, ice_aqc_opc_remove_sw_rules);
-			r_iter = (struct ice_aqc_sw_rules_elem *)
-				((u8 *)r_iter + s_rule_size);
-		}
-	}
-
-	/* Call AQ bulk switch rule update for all unicast addresses */
-	r_iter = s_rule;
-	/* Call AQ switch rule in AQ_MAX chunk */
-	for (total_elem_left = num_unicast; total_elem_left > 0;
-	     total_elem_left -= elem_sent) {
-		struct ice_aqc_sw_rules_elem *entry = r_iter;
-
-		elem_sent = min(total_elem_left,
-				(u16)(ICE_AQ_MAX_BUF_LEN / s_rule_size));
-		status = ice_aq_sw_rules(hw, entry, elem_sent * s_rule_size,
-					 elem_sent, ice_aqc_opc_remove_sw_rules,
-					 NULL);
-		if (status)
-			break;
-		r_iter = (struct ice_aqc_sw_rules_elem *)
-			((u8 *)r_iter + s_rule_size);
-	}
-
-	list_for_each_entry(m_list_itr, m_list, list_entry) {
-		u8 *addr = m_list_itr->fltr_info.l_data.mac.mac_addr;
-
-		if (is_unicast_ether_addr(addr)) {
-			m_entry = ice_find_mac_entry(hw, addr);
-			if (!m_entry)
-				return ICE_ERR_OUT_OF_RANGE;
-			mutex_lock(&sw->mac_list_lock);
-			list_del(&m_entry->list_entry);
-			mutex_unlock(&sw->mac_list_lock);
-			devm_kfree(ice_hw_to_dev(hw), m_entry);
-		}
-	}
-
-ice_remove_mac_exit:
-	devm_kfree(ice_hw_to_dev(hw), s_rule);
-	return status;
-}
-
-/**
- * ice_cfg_dflt_vsi - add filter rule to set/unset given VSI as default
- * VSI for the switch (represented by swid)
+ * ice_cfg_dflt_vsi - change state of VSI to set/clear default
  * @hw: pointer to the hardware structure
  * @vsi_id: number of VSI to set as default
  * @set: true to add the above mentioned switch rule, false to remove it
  * @direction: ICE_FLTR_RX or ICE_FLTR_TX
+ *
+ * add filter rule to set/unset given VSI as default VSI for the switch
+ * (represented by swid)
  */
 enum ice_status
 ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction)
@@ -1706,26 +1596,38 @@ ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction)
 }
 
 /**
- * ice_remove_vlan_internal - Remove one VLAN based filter rule
+ * ice_remove_mac - remove a MAC address based filter rule
  * @hw: pointer to the hardware structure
- * @f_entry: filter entry containing one VLAN information
+ * @m_list: list of MAC addresses and forwarding information
+ *
+ * This function removes either a MAC filter rule or a specific VSI from a
+ * VSI list for a multicast MAC address.
+ *
+ * Returns ICE_ERR_DOES_NOT_EXIST if a given entry was not added by
+ * ice_add_mac. Caller should be aware that this call will only work if all
+ * the entries passed into m_list were added previously. It will not attempt to
+ * do a partial remove of entries that were found.
  */
-static enum ice_status
-ice_remove_vlan_internal(struct ice_hw *hw,
-			 struct ice_fltr_list_entry *f_entry)
+enum ice_status
+ice_remove_mac(struct ice_hw *hw, struct list_head *m_list)
 {
-	struct ice_fltr_info *new_fltr;
-	struct ice_fltr_mgmt_list_entry *v_list_elem;
-	u16 vsi_id;
-
-	new_fltr = &f_entry->fltr_info;
+	struct ice_fltr_list_entry *list_itr;
 
-	v_list_elem = ice_find_vlan_entry(hw, new_fltr->l_data.vlan.vlan_id);
-	if (!v_list_elem)
+	if (!m_list)
 		return ICE_ERR_PARAM;
 
-	vsi_id = f_entry->fltr_info.fwd_id.vsi_id;
-	return ice_handle_rem_vsi_list_mgmt(hw, vsi_id, v_list_elem);
+	list_for_each_entry(list_itr, m_list, list_entry) {
+		enum ice_sw_lkup_type l_type = list_itr->fltr_info.lkup_type;
+
+		if (l_type != ICE_SW_LKUP_MAC)
+			return ICE_ERR_PARAM;
+		list_itr->status = ice_remove_rule_internal(hw,
+							    ICE_SW_LKUP_MAC,
+							    list_itr);
+		if (list_itr->status)
+			return list_itr->status;
+	}
+	return 0;
 }
 
 /**
@@ -1737,20 +1639,78 @@ enum ice_status
 ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list)
 {
 	struct ice_fltr_list_entry *v_list_itr;
-	enum ice_status status = 0;
 
 	if (!v_list || !hw)
 		return ICE_ERR_PARAM;
 
 	list_for_each_entry(v_list_itr, v_list, list_entry) {
-		status = ice_remove_vlan_internal(hw, v_list_itr);
-		if (status) {
-			v_list_itr->status = ICE_FLTR_STATUS_FW_FAIL;
-			return status;
-		}
-		v_list_itr->status = ICE_FLTR_STATUS_FW_SUCCESS;
+		enum ice_sw_lkup_type l_type = v_list_itr->fltr_info.lkup_type;
+
+		if (l_type != ICE_SW_LKUP_VLAN)
+			return ICE_ERR_PARAM;
+		v_list_itr->status = ice_remove_rule_internal(hw,
+							      ICE_SW_LKUP_VLAN,
+							      v_list_itr);
+		if (v_list_itr->status)
+			return v_list_itr->status;
 	}
-	return status;
+	return 0;
+}
+
+/**
+ * ice_vsi_uses_fltr - Determine if given VSI uses specified filter
+ * @fm_entry: filter entry to inspect
+ * @vsi_id: ID of VSI to compare with filter info
+ */
+static bool
+ice_vsi_uses_fltr(struct ice_fltr_mgmt_list_entry *fm_entry, u16 vsi_id)
+{
+	return ((fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI &&
+		 fm_entry->fltr_info.fwd_id.vsi_id == vsi_id) ||
+		(fm_entry->fltr_info.fltr_act == ICE_FWD_TO_VSI_LIST &&
+		 (test_bit(vsi_id, fm_entry->vsi_list_info->vsi_map))));
+}
+
+/**
+ * ice_add_entry_to_vsi_fltr_list - Add copy of fltr_list_entry to remove list
+ * @hw: pointer to the hardware structure
+ * @vsi_id: ID of VSI to remove filters from
+ * @vsi_list_head: pointer to the list to add entry to
+ * @fi: pointer to fltr_info of filter entry to copy & add
+ *
+ * Helper function, used when creating a list of filters to remove from
+ * a specific VSI. The entry added to vsi_list_head is a COPY of the
+ * original filter entry, with the exception of fltr_info.fltr_act and
+ * fltr_info.fwd_id fields. These are set such that later logic can
+ * extract which VSI to remove the fltr from, and pass on that information.
+ */
+static enum ice_status
+ice_add_entry_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_id,
+			       struct list_head *vsi_list_head,
+			       struct ice_fltr_info *fi)
+{
+	struct ice_fltr_list_entry *tmp;
+
+	/* this memory is freed up in the caller function
+	 * once filters for this VSI are removed
+	 */
+	tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return ICE_ERR_NO_MEMORY;
+
+	tmp->fltr_info = *fi;
+
+	/* Overwrite these fields to indicate which VSI to remove filter from,
+	 * so find and remove logic can extract the information from the
+	 * list entries. Note that original entries will still have proper
+	 * values.
+	 */
+	tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI;
+	tmp->fltr_info.fwd_id.vsi_id = vsi_id;
+
+	list_add(&tmp->list_entry, vsi_list_head);
+
+	return 0;
 }
 
 /**
@@ -1759,6 +1719,12 @@ ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list)
  * @vsi_id: ID of VSI to remove filters from
  * @lkup_list_head: pointer to the list that has certain lookup type filters
  * @vsi_list_head: pointer to the list pertaining to VSI with vsi_id
+ *
+ * Locates all filters in lkup_list_head that are used by the given VSI,
+ * and adds COPIES of those entries to vsi_list_head (intended to be used
+ * to remove the listed filters).
+ * Note that this means all entries in vsi_list_head must be explicitly
+ * deallocated by the caller when done with list.
  */
 static enum ice_status
 ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_id,
@@ -1766,46 +1732,25 @@ ice_add_to_vsi_fltr_list(struct ice_hw *hw, u16 vsi_id,
 			 struct list_head *vsi_list_head)
 {
 	struct ice_fltr_mgmt_list_entry *fm_entry;
+	enum ice_status status = 0;
 
 	/* check to make sure VSI id is valid and within boundary */
-	if (vsi_id >=
-	    (sizeof(fm_entry->vsi_list_info->vsi_map) * BITS_PER_BYTE - 1))
+	if (vsi_id >= ICE_MAX_VSI)
 		return ICE_ERR_PARAM;
 
 	list_for_each_entry(fm_entry, lkup_list_head, list_entry) {
 		struct ice_fltr_info *fi;
 
 		fi = &fm_entry->fltr_info;
-		if ((fi->fltr_act == ICE_FWD_TO_VSI &&
-		     fi->fwd_id.vsi_id == vsi_id) ||
-		    (fi->fltr_act == ICE_FWD_TO_VSI_LIST &&
-		     (test_bit(vsi_id, fm_entry->vsi_list_info->vsi_map)))) {
-			struct ice_fltr_list_entry *tmp;
-
-			/* this memory is freed up in the caller function
-			 * ice_remove_vsi_lkup_fltr() once filters for
-			 * this VSI are removed
-			 */
-			tmp = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*tmp),
-					   GFP_KERNEL);
-			if (!tmp)
-				return ICE_ERR_NO_MEMORY;
-
-			memcpy(&tmp->fltr_info, fi, sizeof(*fi));
-
-			/* Expected below fields to be set to ICE_FWD_TO_VSI and
-			 * the particular VSI id since we are only removing this
-			 * one VSI
-			 */
-			if (fi->fltr_act == ICE_FWD_TO_VSI_LIST) {
-				tmp->fltr_info.fltr_act = ICE_FWD_TO_VSI;
-				tmp->fltr_info.fwd_id.vsi_id = vsi_id;
-			}
+		if (!ice_vsi_uses_fltr(fm_entry, vsi_id))
+			continue;
 
-			list_add(&tmp->list_entry, vsi_list_head);
-		}
+		status = ice_add_entry_to_vsi_fltr_list(hw, vsi_id,
+							vsi_list_head, fi);
+		if (status)
+			return status;
 	}
-	return 0;
+	return status;
 }
 
 /**
@@ -1821,46 +1766,40 @@ ice_remove_vsi_lkup_fltr(struct ice_hw *hw, u16 vsi_id,
 	struct ice_switch_info *sw = hw->switch_info;
 	struct ice_fltr_list_entry *fm_entry;
 	struct list_head remove_list_head;
+	struct list_head *rule_head;
 	struct ice_fltr_list_entry *tmp;
+	struct mutex *rule_lock;	/* Lock to protect filter rule list */
 	enum ice_status status;
 
 	INIT_LIST_HEAD(&remove_list_head);
+	rule_lock = &sw->recp_list[lkup].filt_rule_lock;
+	rule_head = &sw->recp_list[lkup].filt_rules;
+	mutex_lock(rule_lock);
+	status = ice_add_to_vsi_fltr_list(hw, vsi_id, rule_head,
+					  &remove_list_head);
+	mutex_unlock(rule_lock);
+	if (status)
+		return;
+
 	switch (lkup) {
 	case ICE_SW_LKUP_MAC:
-		mutex_lock(&sw->mac_list_lock);
-		status = ice_add_to_vsi_fltr_list(hw, vsi_id,
-						  &sw->mac_list_head,
-						  &remove_list_head);
-		mutex_unlock(&sw->mac_list_lock);
-		if (!status) {
-			ice_remove_mac(hw, &remove_list_head);
-			goto free_fltr_list;
-		}
+		ice_remove_mac(hw, &remove_list_head);
 		break;
 	case ICE_SW_LKUP_VLAN:
-		mutex_lock(&sw->vlan_list_lock);
-		status = ice_add_to_vsi_fltr_list(hw, vsi_id,
-						  &sw->vlan_list_head,
-						  &remove_list_head);
-		mutex_unlock(&sw->vlan_list_lock);
-		if (!status) {
-			ice_remove_vlan(hw, &remove_list_head);
-			goto free_fltr_list;
-		}
+		ice_remove_vlan(hw, &remove_list_head);
 		break;
 	case ICE_SW_LKUP_MAC_VLAN:
 	case ICE_SW_LKUP_ETHERTYPE:
 	case ICE_SW_LKUP_ETHERTYPE_MAC:
 	case ICE_SW_LKUP_PROMISC:
-	case ICE_SW_LKUP_PROMISC_VLAN:
 	case ICE_SW_LKUP_DFLT:
-		ice_debug(hw, ICE_DBG_SW,
-			  "Remove filters for this lookup type hasn't been implemented yet\n");
+	case ICE_SW_LKUP_PROMISC_VLAN:
+	case ICE_SW_LKUP_LAST:
+	default:
+		ice_debug(hw, ICE_DBG_SW, "Unsupported lookup type %d\n", lkup);
 		break;
 	}
 
-	return;
-free_fltr_list:
 	list_for_each_entry_safe(fm_entry, tmp, &remove_list_head, list_entry) {
 		list_del(&fm_entry->list_entry);
 		devm_kfree(ice_hw_to_dev(hw), fm_entry);
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 9b8ec128ee31..09a3b6d6541a 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -39,6 +39,7 @@ enum ice_sw_lkup_type {
 	ICE_SW_LKUP_DFLT = 5,
 	ICE_SW_LKUP_ETHERTYPE_MAC = 8,
 	ICE_SW_LKUP_PROMISC_VLAN = 9,
+	ICE_SW_LKUP_LAST
 };
 
 struct ice_fltr_info {
@@ -98,6 +99,31 @@ struct ice_fltr_info {
 	u8 lan_en;	/* Indicate if packet can be forwarded to the uplink */
 };
 
+struct ice_sw_recipe {
+	struct list_head l_entry;
+
+	/* To protect modification of filt_rule list
+	 * defined below
+	 */
+	struct mutex filt_rule_lock;
+
+	/* List of type ice_fltr_mgmt_list_entry */
+	struct list_head filt_rules;
+
+	/* linked list of type recipe_list_entry */
+	struct list_head rg_list;
+	/* linked list of type ice_sw_fv_list_entry*/
+	struct list_head fv_list;
+	struct ice_aqc_recipe_data_elem *r_buf;
+	u8 recp_count;
+	u8 root_rid;
+	u8 num_profs;
+	u8 *prof_ids;
+
+	/* recipe bitmap: what all recipes makes this recipe */
+	DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES);
+};
+
 /* Bookkeeping structure to hold bitmap of VSIs corresponding to VSI list id */
 struct ice_vsi_list_map_info {
 	struct list_head list_entry;
@@ -105,15 +131,9 @@ struct ice_vsi_list_map_info {
 	u16 vsi_list_id;
 };
 
-enum ice_sw_fltr_status {
-	ICE_FLTR_STATUS_NEW = 0,
-	ICE_FLTR_STATUS_FW_SUCCESS,
-	ICE_FLTR_STATUS_FW_FAIL,
-};
-
 struct ice_fltr_list_entry {
 	struct list_head list_entry;
-	enum ice_sw_fltr_status status;
+	enum ice_status status;
 	struct ice_fltr_info fltr_info;
 };
 
@@ -157,5 +177,6 @@ enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *m_list);
 enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list);
 enum ice_status
 ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction);
+enum ice_status ice_init_def_sw_recp(struct ice_hw *hw);
 
 #endif /* _ICE_SWITCH_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index adfe131bd3f3..473d82fdb570 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -253,19 +253,8 @@ struct ice_port_info {
 };
 
 struct ice_switch_info {
-	/* Switch VSI lists to MAC/VLAN translation */
-	struct mutex mac_list_lock;		/* protect MAC list */
-	struct list_head mac_list_head;
-	struct mutex vlan_list_lock;		/* protect VLAN list */
-	struct list_head vlan_list_head;
-	struct mutex eth_m_list_lock;	/* protect ethtype list */
-	struct list_head eth_m_list_head;
-	struct mutex promisc_list_lock;	/* protect promisc mode list */
-	struct list_head promisc_list_head;
-	struct mutex mac_vlan_list_lock;	/* protect MAC-VLAN list */
-	struct list_head mac_vlan_list_head;
-
 	struct list_head vsi_list_map_head;
+	struct ice_sw_recipe *recp_list;
 };
 
 /* Port hardware description */
-- 
2.17.1

^ permalink raw reply related

* [net-next 02/15] ice: Updates to Tx scheduler code
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

1) The maximum device nodes is a global value and shared by the whole
   device. Add element AQ command would fail if there is no space to
   add new nodes so the check for max nodes isn't required. So remove
   ice_sched_get_num_nodes_per_layer and ice_sched_val_max_nodes.

2) In ice_sched_add_elems, set default node's CIR/EIR bandwidth weight.

3) Fix default scheduler topology buffer size as the firmware expects
   a 4KB buffer at all times, and will error out if one of any other
   size is provided.

4) In the latest spec, max children per node per layer is replaced by
   max sibling group size. Now it provides the max children of the below
   layer node, not the current layer node.

5) Fix some newline/whitespace issues for consistency.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   5 +-
 drivers/net/ethernet/intel/ice/ice_common.c   |   7 +
 drivers/net/ethernet/intel/ice/ice_sched.c    | 161 ++++++------------
 drivers/net/ethernet/intel/ice/ice_type.h     |   2 +
 4 files changed, 61 insertions(+), 114 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index a0614f472658..9a33fb95c0ea 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -771,9 +771,8 @@ struct ice_aqc_layer_props {
 	u8 chunk_size;
 	__le16 max_device_nodes;
 	__le16 max_pf_nodes;
-	u8 rsvd0[2];
-	__le16 max_shared_rate_lmtr;
-	__le16 max_children;
+	u8 rsvd0[4];
+	__le16 max_sibl_grp_sz;
 	__le16 max_cir_rl_profiles;
 	__le16 max_eir_rl_profiles;
 	__le16 max_srl_profiles;
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 53cbfd942d03..b315655eab27 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -527,6 +527,13 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
 	if (status)
 		goto err_unroll_sched;
 
+	/* need a valid SW entry point to build a Tx tree */
+	if (!hw->sw_entry_point_layer) {
+		ice_debug(hw, ICE_DBG_SCHED, "invalid sw entry point\n");
+		status = ICE_ERR_CFG;
+		goto err_unroll_sched;
+	}
+
 	status = ice_init_fltr_mgmt_struct(hw);
 	if (status)
 		goto err_unroll_sched;
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c
index eeae199469b6..9b7b50554952 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.c
+++ b/drivers/net/ethernet/intel/ice/ice_sched.c
@@ -17,7 +17,6 @@ ice_sched_add_root_node(struct ice_port_info *pi,
 {
 	struct ice_sched_node *root;
 	struct ice_hw *hw;
-	u16 max_children;
 
 	if (!pi)
 		return ICE_ERR_PARAM;
@@ -28,8 +27,8 @@ ice_sched_add_root_node(struct ice_port_info *pi,
 	if (!root)
 		return ICE_ERR_NO_MEMORY;
 
-	max_children = le16_to_cpu(hw->layer_info[0].max_children);
-	root->children = devm_kcalloc(ice_hw_to_dev(hw), max_children,
+	/* coverity[suspicious_sizeof] */
+	root->children = devm_kcalloc(ice_hw_to_dev(hw), hw->max_children[0],
 				      sizeof(*root), GFP_KERNEL);
 	if (!root->children) {
 		devm_kfree(ice_hw_to_dev(hw), root);
@@ -100,7 +99,6 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer,
 	struct ice_sched_node *parent;
 	struct ice_sched_node *node;
 	struct ice_hw *hw;
-	u16 max_children;
 
 	if (!pi)
 		return ICE_ERR_PARAM;
@@ -120,9 +118,10 @@ ice_sched_add_node(struct ice_port_info *pi, u8 layer,
 	node = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*node), GFP_KERNEL);
 	if (!node)
 		return ICE_ERR_NO_MEMORY;
-	max_children = le16_to_cpu(hw->layer_info[layer].max_children);
-	if (max_children) {
-		node->children = devm_kcalloc(ice_hw_to_dev(hw), max_children,
+	if (hw->max_children[layer]) {
+		/* coverity[suspicious_sizeof] */
+		node->children = devm_kcalloc(ice_hw_to_dev(hw),
+					      hw->max_children[layer],
 					      sizeof(*node), GFP_KERNEL);
 		if (!node->children) {
 			devm_kfree(ice_hw_to_dev(hw), node);
@@ -192,14 +191,17 @@ ice_sched_remove_elems(struct ice_hw *hw, struct ice_sched_node *parent,
 	buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
 	if (!buf)
 		return ICE_ERR_NO_MEMORY;
+
 	buf->hdr.parent_teid = parent->info.node_teid;
 	buf->hdr.num_elems = cpu_to_le16(num_nodes);
 	for (i = 0; i < num_nodes; i++)
 		buf->teid[i] = cpu_to_le32(node_teids[i]);
+
 	status = ice_aq_delete_sched_elems(hw, 1, buf, buf_size,
 					   &num_groups_removed, NULL);
 	if (status || num_groups_removed != 1)
 		ice_debug(hw, ICE_DBG_SCHED, "remove elements failed\n");
+
 	devm_kfree(ice_hw_to_dev(hw), buf);
 	return status;
 }
@@ -592,13 +594,16 @@ static void ice_sched_clear_port(struct ice_port_info *pi)
  */
 void ice_sched_cleanup_all(struct ice_hw *hw)
 {
-	if (!hw || !hw->port_info)
+	if (!hw)
 		return;
 
-	if (hw->layer_info)
+	if (hw->layer_info) {
 		devm_kfree(ice_hw_to_dev(hw), hw->layer_info);
+		hw->layer_info = NULL;
+	}
 
-	ice_sched_clear_port(hw->port_info);
+	if (hw->port_info)
+		ice_sched_clear_port(hw->port_info);
 
 	hw->num_tx_sched_layers = 0;
 	hw->num_tx_sched_phys_layers = 0;
@@ -671,9 +676,13 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node,
 			ICE_AQC_ELEM_VALID_EIR;
 		buf->generic[i].data.generic = 0;
 		buf->generic[i].data.cir_bw.bw_profile_idx =
-			ICE_SCHED_DFLT_RL_PROF_ID;
+			cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID);
+		buf->generic[i].data.cir_bw.bw_alloc =
+			cpu_to_le16(ICE_SCHED_DFLT_BW_WT);
 		buf->generic[i].data.eir_bw.bw_profile_idx =
-			ICE_SCHED_DFLT_RL_PROF_ID;
+			cpu_to_le16(ICE_SCHED_DFLT_RL_PROF_ID);
+		buf->generic[i].data.eir_bw.bw_alloc =
+			cpu_to_le16(ICE_SCHED_DFLT_BW_WT);
 	}
 
 	status = ice_aq_add_sched_elems(hw, 1, buf, buf_size,
@@ -697,7 +706,6 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node,
 
 		teid = le32_to_cpu(buf->generic[i].node_teid);
 		new_node = ice_sched_find_node_by_teid(parent, teid);
-
 		if (!new_node) {
 			ice_debug(hw, ICE_DBG_SCHED,
 				  "Node is missing for teid =%d\n", teid);
@@ -710,7 +718,6 @@ ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node,
 		/* add it to previous node sibling pointer */
 		/* Note: siblings are not linked across branches */
 		prev = ice_sched_get_first_node(hw, tc_node, layer);
-
 		if (prev && prev != new_node) {
 			while (prev->sibling)
 				prev = prev->sibling;
@@ -760,8 +767,7 @@ ice_sched_add_nodes_to_layer(struct ice_port_info *pi,
 		return ICE_ERR_PARAM;
 
 	/* max children per node per layer */
-	max_child_nodes =
-	    le16_to_cpu(hw->layer_info[parent->tx_sched_layer].max_children);
+	max_child_nodes = hw->max_children[parent->tx_sched_layer];
 
 	/* current number of children + required nodes exceed max children ? */
 	if ((parent->num_children + num_nodes) > max_child_nodes) {
@@ -850,78 +856,6 @@ static u8 ice_sched_get_vsi_layer(struct ice_hw *hw)
 	return hw->sw_entry_point_layer;
 }
 
-/**
- * ice_sched_get_num_nodes_per_layer - Get the total number of nodes per layer
- * @pi: pointer to the port info struct
- * @layer: layer number
- *
- * This function calculates the number of nodes present in the scheduler tree
- * including all the branches for a given layer
- */
-static u16
-ice_sched_get_num_nodes_per_layer(struct ice_port_info *pi, u8 layer)
-{
-	struct ice_hw *hw;
-	u16 num_nodes = 0;
-	u8 i;
-
-	if (!pi)
-		return num_nodes;
-
-	hw = pi->hw;
-
-	/* Calculate the number of nodes for all TCs */
-	for (i = 0; i < pi->root->num_children; i++) {
-		struct ice_sched_node *tc_node, *node;
-
-		tc_node = pi->root->children[i];
-
-		/* Get the first node */
-		node = ice_sched_get_first_node(hw, tc_node, layer);
-		if (!node)
-			continue;
-
-		/* count the siblings */
-		while (node) {
-			num_nodes++;
-			node = node->sibling;
-		}
-	}
-
-	return num_nodes;
-}
-
-/**
- * ice_sched_val_max_nodes - check max number of nodes reached or not
- * @pi: port information structure
- * @new_num_nodes_per_layer: pointer to the new number of nodes array
- *
- * This function checks whether the scheduler tree layers have enough space to
- * add new nodes
- */
-static enum ice_status
-ice_sched_validate_for_max_nodes(struct ice_port_info *pi,
-				 u16 *new_num_nodes_per_layer)
-{
-	struct ice_hw *hw = pi->hw;
-	u8 i, qg_layer;
-	u16 num_nodes;
-
-	qg_layer = ice_sched_get_qgrp_layer(hw);
-
-	/* walk through all the layers from SW entry point to qgroup layer */
-	for (i = hw->sw_entry_point_layer; i <= qg_layer; i++) {
-		num_nodes = ice_sched_get_num_nodes_per_layer(pi, i);
-		if (num_nodes + new_num_nodes_per_layer[i] >
-		    le16_to_cpu(hw->layer_info[i].max_pf_nodes)) {
-			ice_debug(hw, ICE_DBG_SCHED,
-				  "max nodes reached for layer = %d\n", i);
-			return ICE_ERR_CFG;
-		}
-	}
-	return 0;
-}
-
 /**
  * ice_rm_dflt_leaf_node - remove the default leaf node in the tree
  * @pi: port information structure
@@ -1003,14 +937,12 @@ enum ice_status ice_sched_init_port(struct ice_port_info *pi)
 	hw = pi->hw;
 
 	/* Query the Default Topology from FW */
-	buf = devm_kcalloc(ice_hw_to_dev(hw), ICE_TXSCHED_MAX_BRANCHES,
-			   sizeof(*buf), GFP_KERNEL);
+	buf = devm_kzalloc(ice_hw_to_dev(hw), ICE_AQ_MAX_BUF_LEN, GFP_KERNEL);
 	if (!buf)
 		return ICE_ERR_NO_MEMORY;
 
 	/* Query default scheduling tree topology */
-	status = ice_aq_get_dflt_topo(hw, pi->lport, buf,
-				      sizeof(*buf) * ICE_TXSCHED_MAX_BRANCHES,
+	status = ice_aq_get_dflt_topo(hw, pi->lport, buf, ICE_AQ_MAX_BUF_LEN,
 				      &num_branches, NULL);
 	if (status)
 		goto err_init_port;
@@ -1097,6 +1029,8 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw)
 {
 	struct ice_aqc_query_txsched_res_resp *buf;
 	enum ice_status status = 0;
+	__le16 max_sibl;
+	u8 i;
 
 	if (hw->layer_info)
 		return status;
@@ -1115,7 +1049,20 @@ enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw)
 	hw->flattened_layers = buf->sched_props.flattening_bitmap;
 	hw->max_cgds = buf->sched_props.max_pf_cgds;
 
-	 hw->layer_info = devm_kmemdup(ice_hw_to_dev(hw), buf->layer_props,
+	/* max sibling group size of current layer refers to the max children
+	 * of the below layer node.
+	 * layer 1 node max children will be layer 2 max sibling group size
+	 * layer 2 node max children will be layer 3 max sibling group size
+	 * and so on. This array will be populated from root (index 0) to
+	 * qgroup layer 7. Leaf node has no children.
+	 */
+	for (i = 0; i < hw->num_tx_sched_layers; i++) {
+		max_sibl = buf->layer_props[i].max_sibl_grp_sz;
+		hw->max_children[i] = le16_to_cpu(max_sibl);
+	}
+
+	hw->layer_info = (struct ice_aqc_layer_props *)
+			  devm_kmemdup(ice_hw_to_dev(hw), buf->layer_props,
 				       (hw->num_tx_sched_layers *
 					sizeof(*hw->layer_info)),
 				       GFP_KERNEL);
@@ -1202,7 +1149,7 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_id, u8 tc,
 	u8 qgrp_layer;
 
 	qgrp_layer = ice_sched_get_qgrp_layer(pi->hw);
-	max_children = le16_to_cpu(pi->hw->layer_info[qgrp_layer].max_children);
+	max_children = pi->hw->max_children[qgrp_layer];
 
 	list_elem = ice_sched_get_vsi_info_entry(pi, vsi_id);
 	if (!list_elem)
@@ -1278,10 +1225,8 @@ ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_qs, u16 *num_nodes)
 
 	/* calculate num nodes from q group to VSI layer */
 	for (i = qgl; i > vsil; i--) {
-		u16 max_children = le16_to_cpu(hw->layer_info[i].max_children);
-
 		/* round to the next integer if there is a remainder */
-		num = DIV_ROUND_UP(num, max_children);
+		num = DIV_ROUND_UP(num, hw->max_children[i]);
 
 		/* need at least one node */
 		num_nodes[i] = num ? num : 1;
@@ -1311,16 +1256,13 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_id,
 	u16 num_added = 0;
 	u8 i, qgl, vsil;
 
-	status = ice_sched_validate_for_max_nodes(pi, num_nodes);
-	if (status)
-		return status;
-
 	qgl = ice_sched_get_qgrp_layer(hw);
 	vsil = ice_sched_get_vsi_layer(hw);
 	parent = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
 	for (i = vsil + 1; i <= qgl; i++) {
 		if (!parent)
 			return ICE_ERR_CFG;
+
 		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
 						      num_nodes[i],
 						      &first_node_teid,
@@ -1398,8 +1340,8 @@ ice_sched_calc_vsi_support_nodes(struct ice_hw *hw,
 				 struct ice_sched_node *tc_node, u16 *num_nodes)
 {
 	struct ice_sched_node *node;
-	u16 max_child;
-	u8 i, vsil;
+	u8 vsil;
+	int i;
 
 	vsil = ice_sched_get_vsi_layer(hw);
 	for (i = vsil; i >= hw->sw_entry_point_layer; i--)
@@ -1412,12 +1354,10 @@ ice_sched_calc_vsi_support_nodes(struct ice_hw *hw,
 			/* If intermediate nodes are reached max children
 			 * then add a new one.
 			 */
-			node = ice_sched_get_first_node(hw, tc_node, i);
-			max_child = le16_to_cpu(hw->layer_info[i].max_children);
-
+			node = ice_sched_get_first_node(hw, tc_node, (u8)i);
 			/* scan all the siblings */
 			while (node) {
-				if (node->num_children < max_child)
+				if (node->num_children < hw->max_children[i])
 					break;
 				node = node->sibling;
 			}
@@ -1451,10 +1391,6 @@ ice_sched_add_vsi_support_nodes(struct ice_port_info *pi, u16 vsi_id,
 	if (!pi)
 		return ICE_ERR_PARAM;
 
-	status = ice_sched_validate_for_max_nodes(pi, num_nodes);
-	if (status)
-		return status;
-
 	vsil = ice_sched_get_vsi_layer(pi->hw);
 	for (i = pi->hw->sw_entry_point_layer; i <= vsil; i++) {
 		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent,
@@ -1479,6 +1415,7 @@ ice_sched_add_vsi_support_nodes(struct ice_port_info *pi, u16 vsi_id,
 		if (i == vsil)
 			parent->vsi_id = vsi_id;
 	}
+
 	return 0;
 }
 
@@ -1633,9 +1570,11 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_id, u8 tc, u16 maxqs,
 		status = ice_sched_add_vsi_to_topo(pi, vsi_id, tc);
 		if (status)
 			return status;
+
 		vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
 		if (!vsi_node)
 			return ICE_ERR_CFG;
+
 		vsi->vsi_node[tc] = vsi_node;
 		vsi_node->in_use = true;
 	}
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 97c366e0ca59..df953f5f219b 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -204,6 +204,7 @@ enum ice_agg_type {
 };
 
 #define ICE_SCHED_DFLT_RL_PROF_ID	0
+#define ICE_SCHED_DFLT_BW_WT		1
 
 /* vsi type list entry to locate corresponding vsi/ag nodes */
 struct ice_sched_vsi_info {
@@ -286,6 +287,7 @@ struct ice_hw {
 	u8 flattened_layers;
 	u8 max_cgds;
 	u8 sw_entry_point_layer;
+	u16 max_children[ICE_AQC_TOPO_MAX_LEVEL_NUM];
 
 	u8 evb_veb;		/* true for VEB, false for VEPA */
 	struct ice_bus_info bus;
-- 
2.17.1

^ permalink raw reply related

* [net-next 03/15] ice: Update request resource command to latest specification
From: Jeff Kirsher @ 2018-08-28 19:04 UTC (permalink / raw)
  To: davem
  Cc: Dan Nowlin, netdev, nhorman, sassmann, jogreene,
	Anirudh Venkataramanan, Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Dan Nowlin <dan.nowlin@intel.com>

Align Request Resource Ownership AQ command (0x0008) to the latest
specification. This includes:

- Correcting the resource IDs for the Global Cfg and Change locks.
- new enum ICE_CHANGE_LOCK_RES_ID
- new enum ICE_GLOBAL_CFG_LOCK_RES_ID
- Altering the flow for Global Config Lock to allow only the first PF to
  download the package.

Signed-off-by: Dan Nowlin <dan.nowlin@intel.com>
Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_common.c | 75 ++++++++++++++++-----
 drivers/net/ethernet/intel/ice/ice_common.h |  2 +-
 drivers/net/ethernet/intel/ice/ice_nvm.c    |  2 +-
 drivers/net/ethernet/intel/ice/ice_type.h   |  9 ++-
 4 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index b315655eab27..2a1e13576ce2 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -967,7 +967,22 @@ enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading)
  * @timeout: the maximum time in ms that the driver may hold the resource
  * @cd: pointer to command details structure or NULL
  *
- * requests common resource using the admin queue commands (0x0008)
+ * Requests common resource using the admin queue commands (0x0008).
+ * When attempting to acquire the Global Config Lock, the driver can
+ * learn of three states:
+ *  1) ICE_SUCCESS -        acquired lock, and can perform download package
+ *  2) ICE_ERR_AQ_ERROR -   did not get lock, driver should fail to load
+ *  3) ICE_ERR_AQ_NO_WORK - did not get lock, but another driver has
+ *                          successfully downloaded the package; the driver does
+ *                          not have to download the package and can continue
+ *                          loading
+ *
+ * Note that if the caller is in an acquire lock, perform action, release lock
+ * phase of operation, it is possible that the FW may detect a timeout and issue
+ * a CORER. In this case, the driver will receive a CORER interrupt and will
+ * have to determine its cause. The calling thread that is handling this flow
+ * will likely get an error propagated back to it indicating the Download
+ * Package, Update Package or the Release Resource AQ commands timed out.
  */
 static enum ice_status
 ice_aq_req_res(struct ice_hw *hw, enum ice_aq_res_ids res,
@@ -985,13 +1000,43 @@ ice_aq_req_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 	cmd_resp->res_id = cpu_to_le16(res);
 	cmd_resp->access_type = cpu_to_le16(access);
 	cmd_resp->res_number = cpu_to_le32(sdp_number);
+	cmd_resp->timeout = cpu_to_le32(*timeout);
+	*timeout = 0;
 
 	status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+
 	/* The completion specifies the maximum time in ms that the driver
 	 * may hold the resource in the Timeout field.
-	 * If the resource is held by someone else, the command completes with
-	 * busy return value and the timeout field indicates the maximum time
-	 * the current owner of the resource has to free it.
+	 */
+
+	/* Global config lock response utilizes an additional status field.
+	 *
+	 * If the Global config lock resource is held by some other driver, the
+	 * command completes with ICE_AQ_RES_GLBL_IN_PROG in the status field
+	 * and the timeout field indicates the maximum time the current owner
+	 * of the resource has to free it.
+	 */
+	if (res == ICE_GLOBAL_CFG_LOCK_RES_ID) {
+		if (le16_to_cpu(cmd_resp->status) == ICE_AQ_RES_GLBL_SUCCESS) {
+			*timeout = le32_to_cpu(cmd_resp->timeout);
+			return 0;
+		} else if (le16_to_cpu(cmd_resp->status) ==
+			   ICE_AQ_RES_GLBL_IN_PROG) {
+			*timeout = le32_to_cpu(cmd_resp->timeout);
+			return ICE_ERR_AQ_ERROR;
+		} else if (le16_to_cpu(cmd_resp->status) ==
+			   ICE_AQ_RES_GLBL_DONE) {
+			return ICE_ERR_AQ_NO_WORK;
+		}
+
+		/* invalid FW response, force a timeout immediately */
+		*timeout = 0;
+		return ICE_ERR_AQ_ERROR;
+	}
+
+	/* If the resource is held by some other driver, the command completes
+	 * with a busy return value and the timeout field indicates the maximum
+	 * time the current owner of the resource has to free it.
 	 */
 	if (!status || hw->adminq.sq_last_status == ICE_AQ_RC_EBUSY)
 		*timeout = le32_to_cpu(cmd_resp->timeout);
@@ -1030,30 +1075,28 @@ ice_aq_release_res(struct ice_hw *hw, enum ice_aq_res_ids res, u8 sdp_number,
  * @hw: pointer to the HW structure
  * @res: resource id
  * @access: access type (read or write)
+ * @timeout: timeout in milliseconds
  *
  * This function will attempt to acquire the ownership of a resource.
  */
 enum ice_status
 ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res,
-		enum ice_aq_res_access_type access)
+		enum ice_aq_res_access_type access, u32 timeout)
 {
 #define ICE_RES_POLLING_DELAY_MS	10
 	u32 delay = ICE_RES_POLLING_DELAY_MS;
+	u32 time_left = timeout;
 	enum ice_status status;
-	u32 time_left = 0;
-	u32 timeout;
 
 	status = ice_aq_req_res(hw, res, access, 0, &time_left, NULL);
 
-	/* An admin queue return code of ICE_AQ_RC_EEXIST means that another
-	 * driver has previously acquired the resource and performed any
-	 * necessary updates; in this case the caller does not obtain the
-	 * resource and has no further work to do.
+	/* A return code of ICE_ERR_AQ_NO_WORK means that another driver has
+	 * previously acquired the resource and performed any necessary updates;
+	 * in this case the caller does not obtain the resource and has no
+	 * further work to do.
 	 */
-	if (hw->adminq.sq_last_status == ICE_AQ_RC_EEXIST) {
-		status = ICE_ERR_AQ_NO_WORK;
+	if (status == ICE_ERR_AQ_NO_WORK)
 		goto ice_acquire_res_exit;
-	}
 
 	if (status)
 		ice_debug(hw, ICE_DBG_RES,
@@ -1066,11 +1109,9 @@ ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 		timeout = (timeout > delay) ? timeout - delay : 0;
 		status = ice_aq_req_res(hw, res, access, 0, &time_left, NULL);
 
-		if (hw->adminq.sq_last_status == ICE_AQ_RC_EEXIST) {
+		if (status == ICE_ERR_AQ_NO_WORK)
 			/* lock free, but no work to do */
-			status = ICE_ERR_AQ_NO_WORK;
 			break;
-		}
 
 		if (!status)
 			/* lock acquired */
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 9a5519130af1..6455b6952ec8 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -23,7 +23,7 @@ enum ice_status
 ice_get_link_status(struct ice_port_info *pi, bool *link_up);
 enum ice_status
 ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res,
-		enum ice_aq_res_access_type access);
+		enum ice_aq_res_access_type access, u32 timeout);
 void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res);
 enum ice_status ice_init_nvm(struct ice_hw *hw);
 enum ice_status
diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c
index 295a8cd87fc1..3274c543283c 100644
--- a/drivers/net/ethernet/intel/ice/ice_nvm.c
+++ b/drivers/net/ethernet/intel/ice/ice_nvm.c
@@ -137,7 +137,7 @@ ice_acquire_nvm(struct ice_hw *hw, enum ice_aq_res_access_type access)
 	if (hw->nvm.blank_nvm_mode)
 		return 0;
 
-	return ice_acquire_res(hw, ICE_NVM_RES_ID, access);
+	return ice_acquire_res(hw, ICE_NVM_RES_ID, access, ICE_NVM_TIMEOUT);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index df953f5f219b..e1a0d0e5d074 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -34,10 +34,15 @@ static inline bool ice_is_tc_ena(u8 bitmap, u8 tc)
 enum ice_aq_res_ids {
 	ICE_NVM_RES_ID = 1,
 	ICE_SPD_RES_ID,
-	ICE_GLOBAL_CFG_LOCK_RES_ID,
-	ICE_CHANGE_LOCK_RES_ID
+	ICE_CHANGE_LOCK_RES_ID,
+	ICE_GLOBAL_CFG_LOCK_RES_ID
 };
 
+/* FW update timeout definitions are in milliseconds */
+#define ICE_NVM_TIMEOUT			180000
+#define ICE_CHANGE_LOCK_TIMEOUT		1000
+#define ICE_GLOBAL_CFG_LOCK_TIMEOUT	3000
+
 enum ice_aq_res_access_type {
 	ICE_RES_READ = 1,
 	ICE_RES_WRITE
-- 
2.17.1

^ permalink raw reply related

* [net-next 00/15][pull request] 100GbE Intel Wired LAN Driver Updates 2018-08-28
From: Jeff Kirsher @ 2018-08-28 19:03 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene

This series contains new features and implementation updates for the
ice driver.

Anirudh reworks the current flex programming logic to add support for
a second flex descriptor profile.  Updated the transmit scheduler
code to handle changes to the spec, specifically the firmware expects
a 4KB buffer at all times so fix the default scheduler topology buffer
size.  Also the maximum children per node per layer is replaced by
maximum sibling group size.  Adds a check to ensure a reset is not in
progress before exercising a control queue operation.  Refactored the
switch rule management functions and structures to simply the logic and
to add a common function to search for a rule entry and add a new rule
entry.  Refactored the VSI allocation, deletion and rebuild flow so that
on reset we can restore all the filters that were previously added.  Did
some spring cleaning of define names and macros.

Dan updates the admin queue command for requesting resource ownership
to the latest specification by adding new enum's and change the locks.

Zhenning optimizes the driver by using the existing buffer in a
structure directly versus a local array.

Chinh implements handlers for ethtool for get and set link settings.

Sudheer implements transmit hang/timeout detection and malicious driver
detection in the driver.

Md Fahad Iqbal implements the get and set bridge mode operations.

Hieu adds the ability for firmware logging during initialization.

Brett updates the driver to only enable VSI transmit and receive pruning
when VLAN 0 is active, and when VLAN 0 is removed/not active, pruning is
disabled.

Akeem adds a flag to use for stopping the service task.

The following are changes since commit 050cdc6c9501abcd64720b8cc3e7941efee9547d:
  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 100GbE

Akeem G Abodunrin (1):
  ice: Introduce SERVICE_DIS flag and service routine functions

Anirudh Venkataramanan (7):
  ice: Rework flex descriptor programming
  ice: Updates to Tx scheduler code
  ice: Prevent control queue operations during reset
  ice: Refactor switch rule management structures and functions
  ice: Refactor VSI allocation, deletion and rebuild flow
  ice: Clean up register file
  ice: Fix and update driver version string

Brett Creeley (1):
  ice: Enable VSI Rx/Tx pruning only when VLAN 0 is active

Chinh Cao (1):
  ice: Implement handlers for ethtool PHY/link operations

Dan Nowlin (1):
  ice: Update request resource command to latest specification

Hieu Tran (1):
  ice: Enable firmware logging during device initialization.

Md Fahad Iqbal Polash (1):
  ice: Implement ice_bridge_getlink and ice_bridge_setlink

Sudheer Mogilappagari (1):
  ice: Add support for Tx hang, Tx timeout and malicious driver
    detection

Zhenning Xiao (1):
  ice: Code optimization for ice_fill_sw_rule()

 drivers/net/ethernet/intel/ice/ice.h          |    7 +
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |   99 +-
 drivers/net/ethernet/intel/ice/ice_common.c   |  525 +++++-
 drivers/net/ethernet/intel/ice/ice_common.h   |   17 +-
 drivers/net/ethernet/intel/ice/ice_controlq.c |    3 +
 drivers/net/ethernet/intel/ice/ice_ethtool.c  |  801 ++++++++-
 .../net/ethernet/intel/ice/ice_hw_autogen.h   |  456 +++---
 .../net/ethernet/intel/ice/ice_lan_tx_rx.h    |   24 +-
 drivers/net/ethernet/intel/ice/ice_main.c     |  964 +++++++++--
 drivers/net/ethernet/intel/ice/ice_nvm.c      |    2 +-
 drivers/net/ethernet/intel/ice/ice_sched.c    |  161 +-
 drivers/net/ethernet/intel/ice/ice_status.h   |    1 +
 drivers/net/ethernet/intel/ice/ice_switch.c   | 1459 ++++++++++-------
 drivers/net/ethernet/intel/ice/ice_switch.h   |   50 +-
 drivers/net/ethernet/intel/ice/ice_txrx.c     |    1 +
 drivers/net/ethernet/intel/ice/ice_txrx.h     |    1 +
 drivers/net/ethernet/intel/ice/ice_type.h     |   52 +-
 17 files changed, 3375 insertions(+), 1248 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [net-next 01/15] ice: Rework flex descriptor programming
From: Jeff Kirsher @ 2018-08-28 19:03 UTC (permalink / raw)
  To: davem
  Cc: Anirudh Venkataramanan, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20180828190413.29869-1-jeffrey.t.kirsher@intel.com>

From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>

The driver can support two flex descriptor profiles, ICE_RXDID_FLEX_NIC
and ICE_RXDID_FLEX_NIC_2. This patch reworks the current flex programming
logic to add support for the latter profile.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_common.c   | 102 ++++++++++++++----
 .../net/ethernet/intel/ice/ice_lan_tx_rx.h    |  24 +++--
 2 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 661beea6af79..53cbfd942d03 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -7,16 +7,16 @@
 
 #define ICE_PF_RESET_WAIT_COUNT	200
 
-#define ICE_NIC_FLX_ENTRY(hw, mdid, idx) \
-	wr32((hw), GLFLXP_RXDID_FLX_WRD_##idx(ICE_RXDID_FLEX_NIC), \
+#define ICE_PROG_FLEX_ENTRY(hw, rxdid, mdid, idx) \
+	wr32((hw), GLFLXP_RXDID_FLX_WRD_##idx(rxdid), \
 	     ((ICE_RX_OPC_MDID << \
 	       GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_S) & \
 	      GLFLXP_RXDID_FLX_WRD_##idx##_RXDID_OPCODE_M) | \
 	     (((mdid) << GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_S) & \
 	      GLFLXP_RXDID_FLX_WRD_##idx##_PROT_MDID_M))
 
-#define ICE_NIC_FLX_FLG_ENTRY(hw, flg_0, flg_1, flg_2, flg_3, idx) \
-	wr32((hw), GLFLXP_RXDID_FLAGS(ICE_RXDID_FLEX_NIC, idx), \
+#define ICE_PROG_FLG_ENTRY(hw, rxdid, flg_0, flg_1, flg_2, flg_3, idx) \
+	wr32((hw), GLFLXP_RXDID_FLAGS(rxdid, idx), \
 	     (((flg_0) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_S) & \
 	      GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_M) | \
 	     (((flg_1) << GLFLXP_RXDID_FLAGS_FLEXIFLAG_4N_1_S) & \
@@ -290,30 +290,85 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
 }
 
 /**
- * ice_init_flex_parser - initialize rx flex parser
+ * ice_init_flex_flags
  * @hw: pointer to the hardware structure
+ * @prof_id: Rx Descriptor Builder profile ID
  *
- * Function to initialize flex descriptors
+ * Function to initialize Rx flex flags
  */
-static void ice_init_flex_parser(struct ice_hw *hw)
+static void ice_init_flex_flags(struct ice_hw *hw, enum ice_rxdid prof_id)
 {
 	u8 idx = 0;
 
-	ICE_NIC_FLX_ENTRY(hw, ICE_RX_MDID_HASH_LOW, 0);
-	ICE_NIC_FLX_ENTRY(hw, ICE_RX_MDID_HASH_HIGH, 1);
-	ICE_NIC_FLX_ENTRY(hw, ICE_RX_MDID_FLOW_ID_LOWER, 2);
-	ICE_NIC_FLX_ENTRY(hw, ICE_RX_MDID_FLOW_ID_HIGH, 3);
-	ICE_NIC_FLX_FLG_ENTRY(hw, ICE_RXFLG_PKT_FRG, ICE_RXFLG_UDP_GRE,
-			      ICE_RXFLG_PKT_DSI, ICE_RXFLG_FIN, idx++);
-	ICE_NIC_FLX_FLG_ENTRY(hw, ICE_RXFLG_SYN, ICE_RXFLG_RST,
-			      ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx++);
-	ICE_NIC_FLX_FLG_ENTRY(hw, ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI,
-			      ICE_RXFLG_EVLAN_x8100, ICE_RXFLG_EVLAN_x9100,
-			      idx++);
-	ICE_NIC_FLX_FLG_ENTRY(hw, ICE_RXFLG_VLAN_x8100, ICE_RXFLG_TNL_VLAN,
-			      ICE_RXFLG_TNL_MAC, ICE_RXFLG_TNL0, idx++);
-	ICE_NIC_FLX_FLG_ENTRY(hw, ICE_RXFLG_TNL1, ICE_RXFLG_TNL2,
-			      ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx);
+	/* Flex-flag fields (0-2) are programmed with FLG64 bits with layout:
+	 * flexiflags0[5:0] - TCP flags, is_packet_fragmented, is_packet_UDP_GRE
+	 * flexiflags1[3:0] - Not used for flag programming
+	 * flexiflags2[7:0] - Tunnel and VLAN types
+	 * 2 invalid fields in last index
+	 */
+	switch (prof_id) {
+	/* Rx flex flags are currently programmed for the NIC profiles only.
+	 * Different flag bit programming configurations can be added per
+	 * profile as needed.
+	 */
+	case ICE_RXDID_FLEX_NIC:
+	case ICE_RXDID_FLEX_NIC_2:
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_PKT_FRG,
+				   ICE_RXFLG_UDP_GRE, ICE_RXFLG_PKT_DSI,
+				   ICE_RXFLG_FIN, idx++);
+		/* flex flag 1 is not used for flexi-flag programming, skipping
+		 * these four FLG64 bits.
+		 */
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_SYN, ICE_RXFLG_RST,
+				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_PKT_DSI,
+				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_EVLAN_x8100,
+				   ICE_RXFLG_EVLAN_x9100, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_VLAN_x8100,
+				   ICE_RXFLG_TNL_VLAN, ICE_RXFLG_TNL_MAC,
+				   ICE_RXFLG_TNL0, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_TNL1, ICE_RXFLG_TNL2,
+				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx);
+		break;
+
+	default:
+		ice_debug(hw, ICE_DBG_INIT,
+			  "Flag programming for profile ID %d not supported\n",
+			  prof_id);
+	}
+}
+
+/**
+ * ice_init_flex_flds
+ * @hw: pointer to the hardware structure
+ * @prof_id: Rx Descriptor Builder profile ID
+ *
+ * Function to initialize flex descriptors
+ */
+static void ice_init_flex_flds(struct ice_hw *hw, enum ice_rxdid prof_id)
+{
+	enum ice_flex_rx_mdid mdid;
+
+	switch (prof_id) {
+	case ICE_RXDID_FLEX_NIC:
+	case ICE_RXDID_FLEX_NIC_2:
+		ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_LOW, 0);
+		ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_HASH_HIGH, 1);
+		ICE_PROG_FLEX_ENTRY(hw, prof_id, ICE_RX_MDID_FLOW_ID_LOWER, 2);
+
+		mdid = (prof_id == ICE_RXDID_FLEX_NIC_2) ?
+			ICE_RX_MDID_SRC_VSI : ICE_RX_MDID_FLOW_ID_HIGH;
+
+		ICE_PROG_FLEX_ENTRY(hw, prof_id, mdid, 3);
+
+		ice_init_flex_flags(hw, prof_id);
+		break;
+
+	default:
+		ice_debug(hw, ICE_DBG_INIT,
+			  "Field init for profile ID %d not supported\n",
+			  prof_id);
+	}
 }
 
 /**
@@ -494,7 +549,8 @@ enum ice_status ice_init_hw(struct ice_hw *hw)
 	if (status)
 		goto err_unroll_fltr_mgmt_struct;
 
-	ice_init_flex_parser(hw);
+	ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC);
+	ice_init_flex_flds(hw, ICE_RXDID_FLEX_NIC_2);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index 068dbc740b76..94504023d86e 100644
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -188,23 +188,25 @@ struct ice_32b_rx_flex_desc_nic {
  * with a specific metadata (profile 7 reserved for HW)
  */
 enum ice_rxdid {
-	ICE_RXDID_START			= 0,
-	ICE_RXDID_LEGACY_0		= ICE_RXDID_START,
-	ICE_RXDID_LEGACY_1,
-	ICE_RXDID_FLX_START,
-	ICE_RXDID_FLEX_NIC		= ICE_RXDID_FLX_START,
-	ICE_RXDID_FLX_LAST		= 63,
-	ICE_RXDID_LAST			= ICE_RXDID_FLX_LAST
+	ICE_RXDID_LEGACY_0		= 0,
+	ICE_RXDID_LEGACY_1		= 1,
+	ICE_RXDID_FLEX_NIC		= 2,
+	ICE_RXDID_FLEX_NIC_2		= 6,
+	ICE_RXDID_HW			= 7,
+	ICE_RXDID_LAST			= 63,
 };
 
 /* Receive Flex Descriptor Rx opcode values */
 #define ICE_RX_OPC_MDID		0x01
 
 /* Receive Descriptor MDID values */
-#define ICE_RX_MDID_FLOW_ID_LOWER	5
-#define ICE_RX_MDID_FLOW_ID_HIGH	6
-#define ICE_RX_MDID_HASH_LOW		56
-#define ICE_RX_MDID_HASH_HIGH		57
+enum ice_flex_rx_mdid {
+	ICE_RX_MDID_FLOW_ID_LOWER	= 5,
+	ICE_RX_MDID_FLOW_ID_HIGH,
+	ICE_RX_MDID_SRC_VSI		= 19,
+	ICE_RX_MDID_HASH_LOW		= 56,
+	ICE_RX_MDID_HASH_HIGH,
+};
 
 /* Rx Flag64 packet flag bits */
 enum ice_rx_flg64_bits {
-- 
2.17.1

^ permalink raw reply related

* Re: phys_port_id in switchdev mode?
From: Jakub Kicinski @ 2018-08-28 18:43 UTC (permalink / raw)
  To: Florian Fainelli, Or Gerlitz, Simon Horman, Andy Gospodarek,
	mchan@broadcom.com, Jiri Pirko, Alexander Duyck, Frederick Botha
  Cc: nick viljoen, netdev@vger.kernel.org
In-Reply-To: <20180828200539.1c0fe607@cakuba.netronome.com>

Ugh, CC: netdev..

On Tue, 28 Aug 2018 20:05:39 +0200, Jakub Kicinski wrote:
> Hi!
> 
> I wonder if we can use phys_port_id in switchdev to group together
> interfaces of a single PCI PF?  Here is the problem:
> 
> With a mix of PF and VF interfaces it gets increasingly difficult to
> figure out which one corresponds to which PF.  We can identify which
> *representor* is which, by means of phys_port_name and devlink
> flavours.  But if the actual VF/PF interfaces are also present on the
> same host, it gets confusing when one tries to identify the PF they
> came from.  Generally one has to resort of matching between PCI DBDF of
> the PF and VFs or read relevant info out of ethtool -i.
> 
> In multi host scenario this is particularly painful, as there seems to
> be no immediately obvious way to match PCI interface ID of a card (0,
> 1, 2, 3, 4...) to the DBDF we have connected.
> 
> Another angle to this is legacy SR-IOV NDOs.  User space picks a netdev
> from /sys/bus/pci/$VF_DBDF/physfn/net/ to run the NDOs on in somehow
> random manner, which means we have to provide those for all devices with
> link to the PF (all reprs).  And we have to link them (a) because it's
> right (tm) and (b) to get correct naming.  The only reliable way to make
> user space (libvirt) choose the repr it should run the NDOs on (which is
> IMHO the corresponding PF repr) is to set phys_port_id on actual VFs,
> VF reprs, PFs and PF reprs to a value corresponding to the *PCI PF*,
> not the external/Ethernet port when in switchdev mode.  User space
> should understand phys_port_id in this context, given it was originally
> introduced for matching VFs to ports.
> 
> I hope this explanation makes sense, and is correct.  Please point out
> errors in my understanding, any comments would be appreciated! :)
> 
> Jiri?  Or?  Others?

^ permalink raw reply

* [PATCH net-next 2/2] selftests/net: add ip_defrag selftest
From: Peter Oskolkov @ 2018-08-28 18:36 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: Peter Oskolkov
In-Reply-To: <20180828183620.101597-1-posk@google.com>

This test creates a raw IPv4 socket, fragments a largish UDP
datagram and sends the fragments out of order.

Then repeats in a loop with different message and fragment lengths.

Then does the same with overlapping fragments (with overlapping
fragments the expectation is that the recv times out).

Tested:

root@<host># time ./ip_defrag.sh
ipv4 defrag
PASS
ipv4 defrag with overlaps
PASS

real    1m7.679s
user    0m0.628s
sys     0m2.242s

A similar test for IPv6 is to follow.

Signed-off-by: Peter Oskolkov <posk@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/net/.gitignore   |   2 +
 tools/testing/selftests/net/Makefile     |   4 +-
 tools/testing/selftests/net/ip_defrag.c  | 313 +++++++++++++++++++++++
 tools/testing/selftests/net/ip_defrag.sh |  29 +++
 4 files changed, 346 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/ip_defrag.c
 create mode 100755 tools/testing/selftests/net/ip_defrag.sh

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 78b24cf76f40..2836e0cf2d81 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -14,3 +14,5 @@ udpgso_bench_rx
 udpgso_bench_tx
 tcp_inq
 tls
+ip_defrag
+
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 9cca68e440a0..cccdb2295567 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,13 +5,13 @@ CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd
-TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx
+TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 
diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c
new file mode 100644
index 000000000000..55fdcdc78eef
--- /dev/null
+++ b/tools/testing/selftests/net/ip_defrag.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+static bool		cfg_do_ipv4;
+static bool		cfg_do_ipv6;
+static bool		cfg_verbose;
+static bool		cfg_overlap;
+static unsigned short	cfg_port = 9000;
+
+const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
+
+#define IP4_HLEN	(sizeof(struct iphdr))
+#define IP6_HLEN	(sizeof(struct ip6_hdr))
+#define UDP_HLEN	(sizeof(struct udphdr))
+
+static int msg_len;
+static int max_frag_len;
+
+#define MSG_LEN_MAX	60000	/* Max UDP payload length. */
+
+#define IP4_MF		(1u << 13)  /* IPv4 MF flag. */
+
+static uint8_t udp_payload[MSG_LEN_MAX];
+static uint8_t ip_frame[IP_MAXPACKET];
+static uint16_t ip_id = 0xabcd;
+static int msg_counter;
+static int frag_counter;
+static unsigned int seed;
+
+/* Receive a UDP packet. Validate it matches udp_payload. */
+static void recv_validate_udp(int fd_udp)
+{
+	ssize_t ret;
+	static uint8_t recv_buff[MSG_LEN_MAX];
+
+	ret = recv(fd_udp, recv_buff, msg_len, 0);
+	msg_counter++;
+
+	if (cfg_overlap) {
+		if (ret != -1)
+			error(1, 0, "recv: expected timeout; got %d; seed = %u",
+				(int)ret, seed);
+		if (errno != ETIMEDOUT && errno != EAGAIN)
+			error(1, errno, "recv: expected timeout: %d; seed = %u",
+				 errno, seed);
+		return;  /* OK */
+	}
+
+	if (ret == -1)
+		error(1, errno, "recv: msg_len = %d max_frag_len = %d",
+			msg_len, max_frag_len);
+	if (ret != msg_len)
+		error(1, 0, "recv: wrong size: %d vs %d", (int)ret, msg_len);
+	if (memcmp(udp_payload, recv_buff, msg_len))
+		error(1, 0, "recv: wrong data");
+}
+
+static uint32_t raw_checksum(uint8_t *buf, int len, uint32_t sum)
+{
+	int i;
+
+	for (i = 0; i < (len & ~1U); i += 2) {
+		sum += (u_int16_t)ntohs(*((u_int16_t *)(buf + i)));
+		if (sum > 0xffff)
+			sum -= 0xffff;
+	}
+
+	if (i < len) {
+		sum += buf[i] << 8;
+		if (sum > 0xffff)
+			sum -= 0xffff;
+	}
+
+	return sum;
+}
+
+static uint16_t udp_checksum(struct ip *iphdr, struct udphdr *udphdr)
+{
+	uint32_t sum = 0;
+
+	sum = raw_checksum((uint8_t *)&iphdr->ip_src, 2 * sizeof(iphdr->ip_src),
+				IPPROTO_UDP + (uint32_t)(UDP_HLEN + msg_len));
+	sum = raw_checksum((uint8_t *)udp_payload, msg_len, sum);
+	sum = raw_checksum((uint8_t *)udphdr, UDP_HLEN, sum);
+	return htons(0xffff & ~sum);
+}
+
+static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen,
+				struct ip *iphdr, int offset)
+{
+	int frag_len;
+	int res;
+
+	if (msg_len - offset <= max_frag_len) {
+		/* This is the last fragment. */
+		frag_len = IP4_HLEN + msg_len - offset;
+		iphdr->ip_off = htons((offset + UDP_HLEN) / 8);
+	} else {
+		frag_len = IP4_HLEN + max_frag_len;
+		iphdr->ip_off = htons((offset + UDP_HLEN) / 8 | IP4_MF);
+	}
+	iphdr->ip_len = htons(frag_len);
+	memcpy(ip_frame + IP4_HLEN, udp_payload + offset,
+		 frag_len - IP4_HLEN);
+
+	res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
+	if (res < 0)
+		error(1, errno, "send_fragment");
+	if (res != frag_len)
+		error(1, 0, "send_fragment: %d vs %d", res, frag_len);
+
+	frag_counter++;
+}
+
+static void send_udp_frags_v4(int fd_raw, struct sockaddr *addr, socklen_t alen)
+{
+	struct ip *iphdr = (struct ip *)ip_frame;
+	struct udphdr udphdr;
+	int res;
+	int offset;
+	int frag_len;
+
+	/* Send the UDP datagram using raw IP fragments: the 0th fragment
+	 * has the UDP header; other fragments are pieces of udp_payload
+	 * split in chunks of frag_len size.
+	 *
+	 * Odd fragments (1st, 3rd, 5th, etc.) are sent out first, then
+	 * even fragments (0th, 2nd, etc.) are sent out.
+	 */
+	memset(iphdr, 0, sizeof(*iphdr));
+	iphdr->ip_hl = 5;
+	iphdr->ip_v = 4;
+	iphdr->ip_tos = 0;
+	iphdr->ip_id = htons(ip_id++);
+	iphdr->ip_ttl = 0x40;
+	iphdr->ip_p = IPPROTO_UDP;
+	iphdr->ip_src.s_addr = htonl(INADDR_LOOPBACK);
+	iphdr->ip_dst = addr4;
+	iphdr->ip_sum = 0;
+
+	/* Odd fragments. */
+	offset = 0;
+	while (offset < msg_len) {
+		send_fragment(fd_raw, addr, alen, iphdr, offset);
+		offset += 2 * max_frag_len;
+	}
+
+	if (cfg_overlap) {
+		/* Send an extra random fragment. */
+		offset = rand() % (UDP_HLEN + msg_len - 1);
+		/* sendto() returns EINVAL if offset + frag_len is too small. */
+		frag_len = IP4_HLEN + UDP_HLEN + rand() % 256;
+		iphdr->ip_off = htons(offset / 8 | IP4_MF);
+		iphdr->ip_len = htons(frag_len);
+		res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
+		if (res < 0)
+			error(1, errno, "sendto overlap");
+		if (res != frag_len)
+			error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len);
+		frag_counter++;
+	}
+
+	/* Zeroth fragment (UDP header). */
+	frag_len = IP4_HLEN + UDP_HLEN;
+	iphdr->ip_len = htons(frag_len);
+	iphdr->ip_off = htons(IP4_MF);
+
+	udphdr.source = htons(cfg_port + 1);
+	udphdr.dest = htons(cfg_port);
+	udphdr.len = htons(UDP_HLEN + msg_len);
+	udphdr.check = 0;
+	udphdr.check = udp_checksum(iphdr, &udphdr);
+
+	memcpy(ip_frame + IP4_HLEN, &udphdr, UDP_HLEN);
+	res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
+	if (res < 0)
+		error(1, errno, "sendto UDP header");
+	if (res != frag_len)
+		error(1, 0, "sendto UDP header: %d vs %d", (int)res, frag_len);
+	frag_counter++;
+
+	/* Even fragments. */
+	offset = max_frag_len;
+	while (offset < msg_len) {
+		send_fragment(fd_raw, addr, alen, iphdr, offset);
+		offset += 2 * max_frag_len;
+	}
+}
+
+static void run_test(struct sockaddr *addr, socklen_t alen)
+{
+	int fd_tx_udp, fd_tx_raw, fd_rx_udp;
+	struct timeval tv = { .tv_sec = 0, .tv_usec = 10 * 1000 };
+	int idx;
+
+	/* Initialize the payload. */
+	for (idx = 0; idx < MSG_LEN_MAX; ++idx)
+		udp_payload[idx] = idx % 256;
+
+	/* Open sockets. */
+	fd_tx_udp = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fd_tx_udp == -1)
+		error(1, errno, "socket tx_udp");
+
+	fd_tx_raw = socket(addr->sa_family, SOCK_RAW, IPPROTO_RAW);
+	if (fd_tx_raw == -1)
+		error(1, errno, "socket tx_raw");
+
+	fd_rx_udp = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fd_rx_udp == -1)
+		error(1, errno, "socket rx_udp");
+	if (bind(fd_rx_udp, addr, alen))
+		error(1, errno, "bind");
+	/* Fail fast. */
+	if (setsockopt(fd_rx_udp, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+		error(1, errno, "setsockopt rcv timeout");
+
+	for (msg_len = 1; msg_len < MSG_LEN_MAX; msg_len += (rand() % 4096)) {
+		if (cfg_verbose)
+			printf("msg_len: %d\n", msg_len);
+		max_frag_len = addr->sa_family == AF_INET ? 8 : 1280;
+		for (; max_frag_len < 1500 && max_frag_len <= msg_len;
+				max_frag_len += 8) {
+			send_udp_frags_v4(fd_tx_raw, addr, alen);
+			recv_validate_udp(fd_rx_udp);
+		}
+	}
+
+	/* Cleanup. */
+	if (close(fd_tx_raw))
+		error(1, errno, "close tx_raw");
+	if (close(fd_tx_udp))
+		error(1, errno, "close tx_udp");
+	if (close(fd_rx_udp))
+		error(1, errno, "close rx_udp");
+
+	if (cfg_verbose)
+		printf("processed %d messages, %d fragments\n",
+			msg_counter, frag_counter);
+
+	fprintf(stderr, "PASS\n");
+}
+
+
+static void run_test_v4(void)
+{
+	struct sockaddr_in addr = {0};
+
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(cfg_port);
+	addr.sin_addr = addr4;
+
+	run_test((void *)&addr, sizeof(addr));
+}
+
+static void run_test_v6(void)
+{
+	fprintf(stderr, "NOT IMPL.\n");
+	exit(1);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "46ov")) != -1) {
+		switch (c) {
+		case '4':
+			cfg_do_ipv4 = true;
+			break;
+		case '6':
+			cfg_do_ipv6 = true;
+			break;
+		case 'o':
+			cfg_overlap = true;
+			break;
+		case 'v':
+			cfg_verbose = true;
+			break;
+		default:
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	seed = time(NULL);
+	srand(seed);
+
+	if (cfg_do_ipv4)
+		run_test_v4();
+	if (cfg_do_ipv6)
+		run_test_v6();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/ip_defrag.sh b/tools/testing/selftests/net/ip_defrag.sh
new file mode 100755
index 000000000000..53a1ed46790d
--- /dev/null
+++ b/tools/testing/selftests/net/ip_defrag.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a couple of IP defragmentation tests.
+
+set +x
+set -e
+
+echo "ipv4 defrag"
+
+run_v4() {
+sysctl -w net.ipv4.ipfrag_high_thresh=9000000 &> /dev/null
+sysctl -w net.ipv4.ipfrag_low_thresh=7000000 &> /dev/null
+./ip_defrag -4
+}
+export -f run_v4
+
+./in_netns.sh "run_v4"
+
+echo "ipv4 defrag with overlaps"
+run_v4o() {
+sysctl -w net.ipv4.ipfrag_high_thresh=9000000 &> /dev/null
+sysctl -w net.ipv4.ipfrag_low_thresh=7000000 &> /dev/null
+./ip_defrag -4o
+}
+export -f run_v4o
+
+./in_netns.sh "run_v4o"
+
-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply related

* [PATCH net-next 1/2] ip: fail fast on IP defrag errors
From: Peter Oskolkov @ 2018-08-28 18:36 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: Peter Oskolkov

The current behavior of IP defragmentation is inconsistent:
- some overlapping/wrong length fragments are dropped without
  affecting the queue;
- most overlapping fragments cause the whole frag queue to be dropped.

This patch brings consistency: if a bad fragment is detected,
the whole frag queue is dropped. Two major benefits:
- fail fast: corrupted frag queues are cleared immediately, instead of
  by timeout;
- testing of overlapping fragments is now much easier: any kind of
  random fragment length mutation now leads to the frag queue being
  discarded (IP packet dropped); before this patch, some overlaps were
  "corrected", with tests not seeing expected packet drops.

Note that in one case (see "if (end&7)" conditional) the current
behavior is preserved as there are concerns that this could be
legitimate padding.

Signed-off-by: Peter Oskolkov <posk@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/ip_fragment.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 88281fbce88c..330f62353b11 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -382,7 +382,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		 */
 		if (end < qp->q.len ||
 		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
-			goto err;
+			goto discard_qp;
 		qp->q.flags |= INET_FRAG_LAST_IN;
 		qp->q.len = end;
 	} else {
@@ -394,20 +394,20 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		if (end > qp->q.len) {
 			/* Some bits beyond end -> corruption. */
 			if (qp->q.flags & INET_FRAG_LAST_IN)
-				goto err;
+				goto discard_qp;
 			qp->q.len = end;
 		}
 	}
 	if (end == offset)
-		goto err;
+		goto discard_qp;
 
 	err = -ENOMEM;
 	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
-		goto err;
+		goto discard_qp;
 
 	err = pskb_trim_rcsum(skb, end - offset);
 	if (err)
-		goto err;
+		goto discard_qp;
 
 	/* Note : skb->rbnode and skb->dev share the same location. */
 	dev = skb->dev;
@@ -423,6 +423,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	 * We do the same here for IPv4 (and increment an snmp counter).
 	 */
 
+	err = -EINVAL;
 	/* Find out where to put this fragment.  */
 	prev_tail = qp->q.fragments_tail;
 	if (!prev_tail)
@@ -431,7 +432,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		/* This is the common case: skb goes to the end. */
 		/* Detect and discard overlaps. */
 		if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
-			goto discard_qp;
+			goto overlap;
 		if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
 			ip4_frag_append_to_last_run(&qp->q, skb);
 		else
@@ -450,7 +451,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 						FRAG_CB(skb1)->frag_run_len)
 				rbn = &parent->rb_right;
 			else /* Found an overlap with skb1. */
-				goto discard_qp;
+				goto overlap;
 		} while (*rbn);
 		/* Here we have parent properly set, and rbn pointing to
 		 * one of its NULL left/right children. Insert skb.
@@ -487,16 +488,18 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		skb->_skb_refdst = 0UL;
 		err = ip_frag_reasm(qp, skb, prev_tail, dev);
 		skb->_skb_refdst = orefdst;
+		if (err)
+			inet_frag_kill(&qp->q);
 		return err;
 	}
 
 	skb_dst_drop(skb);
 	return -EINPROGRESS;
 
+overlap:
+	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
 discard_qp:
 	inet_frag_kill(&qp->q);
-	err = -EINVAL;
-	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
 err:
 	kfree_skb(skb);
 	return err;
-- 
2.19.0.rc0.228.g281dcd1b4d0-goog

^ permalink raw reply related

* [PATCH net-next] liquidio: fix race condition in instruction completion processing
From: Felix Manlunas @ 2018-08-28 18:32 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	felix.manlunas, ricardo.farrington

From: Rick Farrington <ricardo.farrington@cavium.com>

In lio_enable_irq, the pkt_in_done count register was being cleared to
zero.  However, there could be some completed instructions which were not
yet processed due to budget and limit constraints.
So, only write this register with the number of actual completions
that were processed.

Signed-off-by: Rick Farrington <ricardo.farrington@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/octeon_device.c   | 5 +++--
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h       | 2 ++
 drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index f878a55..d0ed6c4 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -1450,8 +1450,9 @@ void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq)
 	}
 	if (iq) {
 		spin_lock_bh(&iq->lock);
-		writel(iq->pkt_in_done, iq->inst_cnt_reg);
-		iq->pkt_in_done = 0;
+		writel(iq->pkts_processed, iq->inst_cnt_reg);
+		iq->pkt_in_done -= iq->pkts_processed;
+		iq->pkts_processed = 0;
 		/* this write needs to be flushed before we release the lock */
 		mmiowb();
 		spin_unlock_bh(&iq->lock);
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
index 2327062..aecd0d3 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
@@ -94,6 +94,8 @@ struct octeon_instr_queue {
 
 	u32 pkt_in_done;
 
+	u32 pkts_processed;
+
 	/** A spinlock to protect access to the input ring.*/
 	spinlock_t iq_flush_running_lock;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c
index 8f746e1..f943aa7 100644
--- a/drivers/net/ethernet/cavium/liquidio/request_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c
@@ -123,6 +123,7 @@ int octeon_init_instr_queue(struct octeon_device *oct,
 	iq->do_auto_flush = 1;
 	iq->db_timeout = (u32)conf->db_timeout;
 	atomic_set(&iq->instr_pending, 0);
+	iq->pkts_processed = 0;
 
 	/* Initialize the spinlock for this instruction queue */
 	spin_lock_init(&iq->lock);
@@ -495,6 +496,7 @@ static inline void __copy_cmd_into_iq(struct octeon_instr_queue *iq,
 				lio_process_iq_request_list(oct, iq, 0);
 
 		if (inst_processed) {
+			iq->pkts_processed += inst_processed;
 			atomic_sub(inst_processed, &iq->instr_pending);
 			iq->stats.instr_processed += inst_processed;
 		}
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next] net: nixge: Add support for 64-bit platforms
From: Moritz Fischer @ 2018-08-28 22:16 UTC (permalink / raw)
  To: davem
  Cc: keescook, netdev, linux-kernel, alex.williams, Moritz Fischer,
	Florian Fainelli

Add support for 64-bit platforms to driver.

The hardware only supports 32-bit register accesses
so the accesses need to be split up into two writes
when setting the current and tail descriptor values.

Signed-off-by: Moritz Fischer <mdf@kernel.org>
Cc: Florian Fainelli <f.fainelli@gmail.com>
---
Hi Dave,

I'll submit the other changes for fixed_phy / non MDIO support
as a separate series since I'm ironing out some issues on ARM
with devicetree overlays.

Thanks,
Moritz
---
 drivers/net/ethernet/ni/Kconfig |   3 +-
 drivers/net/ethernet/ni/nixge.c | 168 ++++++++++++++++++++++----------
 2 files changed, 116 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/ni/Kconfig b/drivers/net/ethernet/ni/Kconfig
index aa41e5f6e437..04e315704f71 100644
--- a/drivers/net/ethernet/ni/Kconfig
+++ b/drivers/net/ethernet/ni/Kconfig
@@ -18,8 +18,9 @@ if NET_VENDOR_NI
 
 config NI_XGE_MANAGEMENT_ENET
 	tristate "National Instruments XGE management enet support"
-	depends on ARCH_ZYNQ
+	depends on HAS_IOMEM && HAS_DMA
 	select PHYLIB
+	select OF_MDIO
 	help
 	  Simple LAN device for debug or management purposes. Can
 	  support either 10G or 1G PHYs via SFP+ ports.
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 76efed058f33..74cf52e3fb09 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -106,10 +106,10 @@
 	(NIXGE_JUMBO_MTU + NIXGE_HDR_SIZE + NIXGE_TRL_SIZE)
 
 struct nixge_hw_dma_bd {
-	u32 next;
-	u32 reserved1;
-	u32 phys;
-	u32 reserved2;
+	u32 next_lo;
+	u32 next_hi;
+	u32 phys_lo;
+	u32 phys_hi;
 	u32 reserved3;
 	u32 reserved4;
 	u32 cntrl;
@@ -119,11 +119,39 @@ struct nixge_hw_dma_bd {
 	u32 app2;
 	u32 app3;
 	u32 app4;
-	u32 sw_id_offset;
-	u32 reserved5;
+	u32 sw_id_offset_lo;
+	u32 sw_id_offset_hi;
 	u32 reserved6;
 };
 
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+#define nixge_hw_dma_bd_set_addr(bd, field, addr) \
+	do { \
+		(bd)->field##_lo = lower_32_bits(((u64)addr)); \
+		(bd)->field##_hi = upper_32_bits(((u64)addr)); \
+	} while (0)
+#else
+#define nixge_hw_dma_bd_set_addr(bd, field, addr) \
+	((bd)->field##_lo = lower_32_bits((addr)))
+#endif
+
+#define nixge_hw_dma_bd_set_phys(bd, addr) \
+	nixge_hw_dma_bd_set_addr((bd), phys, (addr))
+
+#define nixge_hw_dma_bd_set_next(bd, addr) \
+	nixge_hw_dma_bd_set_addr((bd), next, (addr))
+
+#define nixge_hw_dma_bd_set_offset(bd, addr) \
+	nixge_hw_dma_bd_set_addr((bd), sw_id_offset, (addr))
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+#define nixge_hw_dma_bd_get_addr(bd, field) \
+	(dma_addr_t)((((u64)(bd)->field##_hi) << 32) | ((bd)->field##_lo))
+#else
+#define nixge_hw_dma_bd_get_addr(bd, field) \
+	(dma_addr_t)((bd)->field##_lo)
+#endif
+
 struct nixge_tx_skb {
 	struct sk_buff *skb;
 	dma_addr_t mapping;
@@ -176,6 +204,15 @@ static void nixge_dma_write_reg(struct nixge_priv *priv, off_t offset, u32 val)
 	writel(val, priv->dma_regs + offset);
 }
 
+static void nixge_dma_write_desc_reg(struct nixge_priv *priv, off_t offset,
+				     dma_addr_t addr)
+{
+	writel(lower_32_bits(addr), priv->dma_regs + offset);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	writel(upper_32_bits(addr), priv->dma_regs + offset + 4);
+#endif
+}
+
 static u32 nixge_dma_read_reg(const struct nixge_priv *priv, off_t offset)
 {
 	return readl(priv->dma_regs + offset);
@@ -202,13 +239,22 @@ static u32 nixge_ctrl_read_reg(struct nixge_priv *priv, off_t offset)
 static void nixge_hw_dma_bd_release(struct net_device *ndev)
 {
 	struct nixge_priv *priv = netdev_priv(ndev);
+	dma_addr_t phys_addr;
+	struct sk_buff *skb;
 	int i;
 
 	for (i = 0; i < RX_BD_NUM; i++) {
-		dma_unmap_single(ndev->dev.parent, priv->rx_bd_v[i].phys,
-				 NIXGE_MAX_JUMBO_FRAME_SIZE, DMA_FROM_DEVICE);
-		dev_kfree_skb((struct sk_buff *)
-			      (priv->rx_bd_v[i].sw_id_offset));
+		phys_addr = nixge_hw_dma_bd_get_addr(&priv->rx_bd_v[i],
+						     phys);
+
+		dma_unmap_single(ndev->dev.parent, phys_addr,
+				 NIXGE_MAX_JUMBO_FRAME_SIZE,
+				 DMA_FROM_DEVICE);
+
+		skb = (struct sk_buff *)
+			nixge_hw_dma_bd_get_addr(&priv->rx_bd_v[i],
+						 sw_id_offset);
+		dev_kfree_skb(skb);
 	}
 
 	if (priv->rx_bd_v)
@@ -231,6 +277,7 @@ static int nixge_hw_dma_bd_init(struct net_device *ndev)
 {
 	struct nixge_priv *priv = netdev_priv(ndev);
 	struct sk_buff *skb;
+	dma_addr_t phys;
 	u32 cr;
 	int i;
 
@@ -259,27 +306,30 @@ static int nixge_hw_dma_bd_init(struct net_device *ndev)
 		goto out;
 
 	for (i = 0; i < TX_BD_NUM; i++) {
-		priv->tx_bd_v[i].next = priv->tx_bd_p +
-				      sizeof(*priv->tx_bd_v) *
-				      ((i + 1) % TX_BD_NUM);
+		nixge_hw_dma_bd_set_next(&priv->tx_bd_v[i],
+					 priv->tx_bd_p +
+					 sizeof(*priv->tx_bd_v) *
+					 ((i + 1) % TX_BD_NUM));
 	}
 
 	for (i = 0; i < RX_BD_NUM; i++) {
-		priv->rx_bd_v[i].next = priv->rx_bd_p +
-				      sizeof(*priv->rx_bd_v) *
-				      ((i + 1) % RX_BD_NUM);
+		nixge_hw_dma_bd_set_next(&priv->rx_bd_v[i],
+					 priv->rx_bd_p
+					 + sizeof(*priv->rx_bd_v) *
+					 ((i + 1) % RX_BD_NUM));
 
 		skb = netdev_alloc_skb_ip_align(ndev,
 						NIXGE_MAX_JUMBO_FRAME_SIZE);
 		if (!skb)
 			goto out;
 
-		priv->rx_bd_v[i].sw_id_offset = (u32)skb;
-		priv->rx_bd_v[i].phys =
-			dma_map_single(ndev->dev.parent,
-				       skb->data,
-				       NIXGE_MAX_JUMBO_FRAME_SIZE,
-				       DMA_FROM_DEVICE);
+		nixge_hw_dma_bd_set_offset(&priv->rx_bd_v[i], skb);
+		phys = dma_map_single(ndev->dev.parent, skb->data,
+				      NIXGE_MAX_JUMBO_FRAME_SIZE,
+				      DMA_FROM_DEVICE);
+
+		nixge_hw_dma_bd_set_phys(&priv->rx_bd_v[i], phys);
+
 		priv->rx_bd_v[i].cntrl = NIXGE_MAX_JUMBO_FRAME_SIZE;
 	}
 
@@ -312,18 +362,18 @@ static int nixge_hw_dma_bd_init(struct net_device *ndev)
 	/* Populate the tail pointer and bring the Rx Axi DMA engine out of
 	 * halted state. This will make the Rx side ready for reception.
 	 */
-	nixge_dma_write_reg(priv, XAXIDMA_RX_CDESC_OFFSET, priv->rx_bd_p);
+	nixge_dma_write_desc_reg(priv, XAXIDMA_RX_CDESC_OFFSET, priv->rx_bd_p);
 	cr = nixge_dma_read_reg(priv, XAXIDMA_RX_CR_OFFSET);
 	nixge_dma_write_reg(priv, XAXIDMA_RX_CR_OFFSET,
 			    cr | XAXIDMA_CR_RUNSTOP_MASK);
-	nixge_dma_write_reg(priv, XAXIDMA_RX_TDESC_OFFSET, priv->rx_bd_p +
+	nixge_dma_write_desc_reg(priv, XAXIDMA_RX_TDESC_OFFSET, priv->rx_bd_p +
 			    (sizeof(*priv->rx_bd_v) * (RX_BD_NUM - 1)));
 
 	/* Write to the RS (Run-stop) bit in the Tx channel control register.
 	 * Tx channel is now ready to run. But only after we write to the
 	 * tail pointer register that the Tx channel will start transmitting.
 	 */
-	nixge_dma_write_reg(priv, XAXIDMA_TX_CDESC_OFFSET, priv->tx_bd_p);
+	nixge_dma_write_desc_reg(priv, XAXIDMA_TX_CDESC_OFFSET, priv->tx_bd_p);
 	cr = nixge_dma_read_reg(priv, XAXIDMA_TX_CR_OFFSET);
 	nixge_dma_write_reg(priv, XAXIDMA_TX_CR_OFFSET,
 			    cr | XAXIDMA_CR_RUNSTOP_MASK);
@@ -451,7 +501,7 @@ static int nixge_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	struct nixge_priv *priv = netdev_priv(ndev);
 	struct nixge_hw_dma_bd *cur_p;
 	struct nixge_tx_skb *tx_skb;
-	dma_addr_t tail_p;
+	dma_addr_t tail_p, cur_phys;
 	skb_frag_t *frag;
 	u32 num_frag;
 	u32 ii;
@@ -466,15 +516,16 @@ static int nixge_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 		return NETDEV_TX_OK;
 	}
 
-	cur_p->phys = dma_map_single(ndev->dev.parent, skb->data,
-				     skb_headlen(skb), DMA_TO_DEVICE);
-	if (dma_mapping_error(ndev->dev.parent, cur_p->phys))
+	cur_phys = dma_map_single(ndev->dev.parent, skb->data,
+				  skb_headlen(skb), DMA_TO_DEVICE);
+	if (dma_mapping_error(ndev->dev.parent, cur_phys))
 		goto drop;
+	nixge_hw_dma_bd_set_phys(cur_p, cur_phys);
 
 	cur_p->cntrl = skb_headlen(skb) | XAXIDMA_BD_CTRL_TXSOF_MASK;
 
 	tx_skb->skb = NULL;
-	tx_skb->mapping = cur_p->phys;
+	tx_skb->mapping = cur_phys;
 	tx_skb->size = skb_headlen(skb);
 	tx_skb->mapped_as_page = false;
 
@@ -485,16 +536,17 @@ static int nixge_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 		tx_skb = &priv->tx_skb[priv->tx_bd_tail];
 		frag = &skb_shinfo(skb)->frags[ii];
 
-		cur_p->phys = skb_frag_dma_map(ndev->dev.parent, frag, 0,
-					       skb_frag_size(frag),
-					       DMA_TO_DEVICE);
-		if (dma_mapping_error(ndev->dev.parent, cur_p->phys))
+		cur_phys = skb_frag_dma_map(ndev->dev.parent, frag, 0,
+					    skb_frag_size(frag),
+					    DMA_TO_DEVICE);
+		if (dma_mapping_error(ndev->dev.parent, cur_phys))
 			goto frag_err;
+		nixge_hw_dma_bd_set_phys(cur_p, cur_phys);
 
 		cur_p->cntrl = skb_frag_size(frag);
 
 		tx_skb->skb = NULL;
-		tx_skb->mapping = cur_p->phys;
+		tx_skb->mapping = cur_phys;
 		tx_skb->size = skb_frag_size(frag);
 		tx_skb->mapped_as_page = true;
 	}
@@ -506,7 +558,7 @@ static int nixge_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
 	tail_p = priv->tx_bd_p + sizeof(*priv->tx_bd_v) * priv->tx_bd_tail;
 	/* Start the transfer */
-	nixge_dma_write_reg(priv, XAXIDMA_TX_TDESC_OFFSET, tail_p);
+	nixge_dma_write_desc_reg(priv, XAXIDMA_TX_TDESC_OFFSET, tail_p);
 	++priv->tx_bd_tail;
 	priv->tx_bd_tail %= TX_BD_NUM;
 
@@ -537,7 +589,7 @@ static int nixge_recv(struct net_device *ndev, int budget)
 	struct nixge_priv *priv = netdev_priv(ndev);
 	struct sk_buff *skb, *new_skb;
 	struct nixge_hw_dma_bd *cur_p;
-	dma_addr_t tail_p = 0;
+	dma_addr_t tail_p = 0, cur_phys = 0;
 	u32 packets = 0;
 	u32 length = 0;
 	u32 size = 0;
@@ -549,13 +601,15 @@ static int nixge_recv(struct net_device *ndev, int budget)
 		tail_p = priv->rx_bd_p + sizeof(*priv->rx_bd_v) *
 			 priv->rx_bd_ci;
 
-		skb = (struct sk_buff *)(cur_p->sw_id_offset);
+		skb = (struct sk_buff *)nixge_hw_dma_bd_get_addr(cur_p,
+								 sw_id_offset);
 
 		length = cur_p->status & XAXIDMA_BD_STS_ACTUAL_LEN_MASK;
 		if (length > NIXGE_MAX_JUMBO_FRAME_SIZE)
 			length = NIXGE_MAX_JUMBO_FRAME_SIZE;
 
-		dma_unmap_single(ndev->dev.parent, cur_p->phys,
+		dma_unmap_single(ndev->dev.parent,
+				 nixge_hw_dma_bd_get_addr(cur_p, phys),
 				 NIXGE_MAX_JUMBO_FRAME_SIZE,
 				 DMA_FROM_DEVICE);
 
@@ -579,16 +633,17 @@ static int nixge_recv(struct net_device *ndev, int budget)
 		if (!new_skb)
 			return packets;
 
-		cur_p->phys = dma_map_single(ndev->dev.parent, new_skb->data,
-					     NIXGE_MAX_JUMBO_FRAME_SIZE,
-					     DMA_FROM_DEVICE);
-		if (dma_mapping_error(ndev->dev.parent, cur_p->phys)) {
+		cur_phys = dma_map_single(ndev->dev.parent, new_skb->data,
+					  NIXGE_MAX_JUMBO_FRAME_SIZE,
+					  DMA_FROM_DEVICE);
+		if (dma_mapping_error(ndev->dev.parent, cur_phys)) {
 			/* FIXME: bail out and clean up */
 			netdev_err(ndev, "Failed to map ...\n");
 		}
+		nixge_hw_dma_bd_set_phys(cur_p, cur_phys);
 		cur_p->cntrl = NIXGE_MAX_JUMBO_FRAME_SIZE;
 		cur_p->status = 0;
-		cur_p->sw_id_offset = (u32)new_skb;
+		nixge_hw_dma_bd_set_offset(cur_p, new_skb);
 
 		++priv->rx_bd_ci;
 		priv->rx_bd_ci %= RX_BD_NUM;
@@ -599,7 +654,7 @@ static int nixge_recv(struct net_device *ndev, int budget)
 	ndev->stats.rx_bytes += size;
 
 	if (tail_p)
-		nixge_dma_write_reg(priv, XAXIDMA_RX_TDESC_OFFSET, tail_p);
+		nixge_dma_write_desc_reg(priv, XAXIDMA_RX_TDESC_OFFSET, tail_p);
 
 	return packets;
 }
@@ -637,6 +692,7 @@ static irqreturn_t nixge_tx_irq(int irq, void *_ndev)
 	struct nixge_priv *priv = netdev_priv(_ndev);
 	struct net_device *ndev = _ndev;
 	unsigned int status;
+	dma_addr_t phys;
 	u32 cr;
 
 	status = nixge_dma_read_reg(priv, XAXIDMA_TX_SR_OFFSET);
@@ -650,9 +706,11 @@ static irqreturn_t nixge_tx_irq(int irq, void *_ndev)
 		return IRQ_NONE;
 	}
 	if (status & XAXIDMA_IRQ_ERROR_MASK) {
+		phys = nixge_hw_dma_bd_get_addr(&priv->tx_bd_v[priv->tx_bd_ci],
+						phys);
+
 		netdev_err(ndev, "DMA Tx error 0x%x\n", status);
-		netdev_err(ndev, "Current BD is at: 0x%x\n",
-			   (priv->tx_bd_v[priv->tx_bd_ci]).phys);
+		netdev_err(ndev, "Current BD is at: 0x%llx\n", (u64)phys);
 
 		cr = nixge_dma_read_reg(priv, XAXIDMA_TX_CR_OFFSET);
 		/* Disable coalesce, delay timer and error interrupts */
@@ -678,6 +736,7 @@ static irqreturn_t nixge_rx_irq(int irq, void *_ndev)
 	struct nixge_priv *priv = netdev_priv(_ndev);
 	struct net_device *ndev = _ndev;
 	unsigned int status;
+	dma_addr_t phys;
 	u32 cr;
 
 	status = nixge_dma_read_reg(priv, XAXIDMA_RX_SR_OFFSET);
@@ -697,9 +756,10 @@ static irqreturn_t nixge_rx_irq(int irq, void *_ndev)
 		return IRQ_NONE;
 	}
 	if (status & XAXIDMA_IRQ_ERROR_MASK) {
+		phys = nixge_hw_dma_bd_get_addr(&priv->rx_bd_v[priv->rx_bd_ci],
+						phys);
 		netdev_err(ndev, "DMA Rx error 0x%x\n", status);
-		netdev_err(ndev, "Current BD is at: 0x%x\n",
-			   (priv->rx_bd_v[priv->rx_bd_ci]).phys);
+		netdev_err(ndev, "Current BD is at: 0x%llx\n", (u64)phys);
 
 		cr = nixge_dma_read_reg(priv, XAXIDMA_TX_CR_OFFSET);
 		/* Disable coalesce, delay timer and error interrupts */
@@ -735,10 +795,10 @@ static void nixge_dma_err_handler(unsigned long data)
 		tx_skb = &lp->tx_skb[i];
 		nixge_tx_skb_unmap(lp, tx_skb);
 
-		cur_p->phys = 0;
+		nixge_hw_dma_bd_set_phys(cur_p, 0);
 		cur_p->cntrl = 0;
 		cur_p->status = 0;
-		cur_p->sw_id_offset = 0;
+		nixge_hw_dma_bd_set_offset(cur_p, 0);
 	}
 
 	for (i = 0; i < RX_BD_NUM; i++) {
@@ -779,18 +839,18 @@ static void nixge_dma_err_handler(unsigned long data)
 	/* Populate the tail pointer and bring the Rx Axi DMA engine out of
 	 * halted state. This will make the Rx side ready for reception.
 	 */
-	nixge_dma_write_reg(lp, XAXIDMA_RX_CDESC_OFFSET, lp->rx_bd_p);
+	nixge_dma_write_desc_reg(lp, XAXIDMA_RX_CDESC_OFFSET, lp->rx_bd_p);
 	cr = nixge_dma_read_reg(lp, XAXIDMA_RX_CR_OFFSET);
 	nixge_dma_write_reg(lp, XAXIDMA_RX_CR_OFFSET,
 			    cr | XAXIDMA_CR_RUNSTOP_MASK);
-	nixge_dma_write_reg(lp, XAXIDMA_RX_TDESC_OFFSET, lp->rx_bd_p +
+	nixge_dma_write_desc_reg(lp, XAXIDMA_RX_TDESC_OFFSET, lp->rx_bd_p +
 			    (sizeof(*lp->rx_bd_v) * (RX_BD_NUM - 1)));
 
 	/* Write to the RS (Run-stop) bit in the Tx channel control register.
 	 * Tx channel is now ready to run. But only after we write to the
 	 * tail pointer register that the Tx channel will start transmitting
 	 */
-	nixge_dma_write_reg(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p);
+	nixge_dma_write_desc_reg(lp, XAXIDMA_TX_CDESC_OFFSET, lp->tx_bd_p);
 	cr = nixge_dma_read_reg(lp, XAXIDMA_TX_CR_OFFSET);
 	nixge_dma_write_reg(lp, XAXIDMA_TX_CR_OFFSET,
 			    cr | XAXIDMA_CR_RUNSTOP_MASK);
-- 
2.18.0

^ permalink raw reply related

* [PATCH net-next] liquidio: remove unnecessary delay when processing IQ responses
From: Felix Manlunas @ 2018-08-28 18:19 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	felix.manlunas, ricardo.farrington

From: Rick Farrington <ricardo.farrington@cavium.com>

Signed-off-by: Rick Farrington <ricardo.farrington@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c
index 8f746e1..0a06fbb 100644
--- a/drivers/net/ethernet/cavium/liquidio/request_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c
@@ -459,7 +459,7 @@ static inline void __copy_cmd_into_iq(struct octeon_instr_queue *iq,
 
 	if (atomic_read(&oct->response_list
 			[OCTEON_ORDERED_SC_LIST].pending_req_count))
-		queue_delayed_work(cwq->wq, &cwq->wk.work, msecs_to_jiffies(1));
+		queue_work(cwq->wq, &cwq->wk.work.work);
 
 	return inst_count;
 }
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH bpf-next 01/11] xdp: implement convert_to_xdp_frame for MEM_TYPE_ZERO_COPY
From: Björn Töpel @ 2018-08-28 17:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Karlsson, Magnus, Magnus Karlsson, Duyck, Alexander H,
	Alexander Duyck, ast, Daniel Borkmann, Netdev, Brandeburg, Jesse,
	Singhai, Anjali, peter.waskiewicz.jr, Björn Töpel,
	michael.lundkvist, Willem de Bruijn, John Fastabend,
	Jakub Kicinski, neerav.parikh, MykytaI Iziumtsev, Francois Ozog
In-Reply-To: <20180828161102.45a00204@redhat.com>

Den tis 28 aug. 2018 kl 16:11 skrev Jesper Dangaard Brouer <brouer@redhat.com>:
>
> On Tue, 28 Aug 2018 14:44:25 +0200
> Björn Töpel <bjorn.topel@gmail.com> wrote:
>
> > From: Björn Töpel <bjorn.topel@intel.com>
> >
> > This commit adds proper MEM_TYPE_ZERO_COPY support for
> > convert_to_xdp_frame. Converting a MEM_TYPE_ZERO_COPY xdp_buff to an
> > xdp_frame is done by transforming the MEM_TYPE_ZERO_COPY buffer into a
> > MEM_TYPE_PAGE_ORDER0 frame. This is costly, and in the future it might
> > make sense to implement a more sophisticated thread-safe alloc/free
> > scheme for MEM_TYPE_ZERO_COPY, so that no allocation and copy is
> > required in the fast-path.
>
> This is going to be slow. Especially the dev_alloc_page() call, which
> for small frames is likely going to be slower than the data copy.
> I guess this is a good first step, but I do hope we will circle back and
> optimize this later.  (It would also be quite easy to use
> MEM_TYPE_PAGE_POOL instead to get page recycling in devmap redirect case).
>

Yes, slow. :-( Still, I think this is a good starting point, and then
introduce a page pool in later performance oriented series to make XDP
faster for the AF_XDP scenario.

But I'm definitely on your side here; This need to be addressed -- but
not now IMO.


And thanks for spending time on the series!
Björn

> I would have liked the MEM_TYPE_ZERO_COPY frame to travel one level
> deeper into the redirect-core code.  Allowing devmap to send these
> frame without copy, and allow cpumap to do the dev_alloc_page() call
> (+copy) on the remote CPU.
>
>
> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> > ---
> >  include/net/xdp.h |  5 +++--
> >  net/core/xdp.c    | 39 +++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 42 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 76b95256c266..0d5c6fb4b2e2 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -91,6 +91,8 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
> >       frame->dev_rx = NULL;
> >  }
> >
> > +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
> > +
> >  /* Convert xdp_buff to xdp_frame */
> >  static inline
> >  struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> > @@ -99,9 +101,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> >       int metasize;
> >       int headroom;
> >
> > -     /* TODO: implement clone, copy, use "native" MEM_TYPE */
> >       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> > -             return NULL;
> > +             return xdp_convert_zc_to_xdp_frame(xdp);
> >
> >       /* Assure headroom is available for storing info */
> >       headroom = xdp->data - xdp->data_hard_start;
> > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > index 89b6785cef2a..be6cb2f0e722 100644
> > --- a/net/core/xdp.c
> > +++ b/net/core/xdp.c
> > @@ -398,3 +398,42 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> >       info->flags = bpf->flags;
> >  }
> >  EXPORT_SYMBOL_GPL(xdp_attachment_setup);
> > +
> > +struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
> > +{
> > +     unsigned int metasize, headroom, totsize;
> > +     void *addr, *data_to_copy;
> > +     struct xdp_frame *xdpf;
> > +     struct page *page;
> > +
> > +     /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
> > +     metasize = xdp_data_meta_unsupported(xdp) ? 0 :
> > +                xdp->data - xdp->data_meta;
> > +     headroom = xdp->data - xdp->data_hard_start;
> > +     totsize = xdp->data_end - xdp->data + metasize;
> > +
> > +     if (sizeof(*xdpf) + totsize > PAGE_SIZE)
> > +             return NULL;
> > +
> > +     page = dev_alloc_page();
> > +     if (!page)
> > +             return NULL;
> > +
> > +     addr = page_to_virt(page);
> > +     xdpf = addr;
> > +     memset(xdpf, 0, sizeof(*xdpf));
> > +
> > +     addr += sizeof(*xdpf);
> > +     data_to_copy = metasize ? xdp->data_meta : xdp->data;
> > +     memcpy(addr, data_to_copy, totsize);
> > +
> > +     xdpf->data = addr + metasize;
> > +     xdpf->len = totsize - metasize;
> > +     xdpf->headroom = 0;
> > +     xdpf->metasize = metasize;
> > +     xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
> > +
> > +     xdp_return_buff(xdp);
> > +     return xdpf;
> > +}
> > +EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
>
>
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: Oops running iptables -F OUTPUT
From: Andreas Schwab @ 2018-08-28 17:28 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Nicholas Piggin, <netdev@vger.kernel.org>, linuxppc-dev,
	Jessica Yu, Michael Ellerman, Will Deacon, Ingo Molnar,
	Andrew Morton, linux-arch
In-Reply-To: <CAKv+Gu_hPaxrVtsBOoviRraYk4FWnT9zQVCVF=i27xd_nGHryw@mail.gmail.com>

On Aug 28 2018, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:

> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 6a501b25dd85..57d09d5ceb1a 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -779,7 +779,6 @@ EXPORT_SYMBOL(__per_cpu_offset);
>
>  void __init setup_per_cpu_areas(void)
>  {
> -       const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
>         size_t atom_size;
>         unsigned long delta;
>         unsigned int cpu;
> @@ -795,7 +794,9 @@ void __init setup_per_cpu_areas(void)
>         else
>                 atom_size = 1 << 20;
>
> -       rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
> +       rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
> +                                   PERCPU_DYNAMIC_RESERVE,
> +                                   atom_size, pcpu_cpu_distance,
>                                     pcpu_fc_alloc, pcpu_fc_free);
>         if (rc < 0)
>                 panic("cannot initialize percpu area (err=%d)", rc);

That didn't help.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] i40e: report correct statistics when XDP is enabled
From: Björn Töpel @ 2018-08-28 17:10 UTC (permalink / raw)
  To: Paul Menzel
  Cc: Jesper Dangaard Brouer, netdev, intel-wired-lan, Magnus Karlsson,
	Magnus Karlsson
In-Reply-To: <a20ab03d-25e0-126a-d996-5c1750de0639@molgen.mpg.de>

On 2018-08-28 19:00, Paul Menzel wrote:
> Dear Björn,
> 
> 
> On 08/24/18 16:00, Jesper Dangaard Brouer wrote:
>> On Fri, 24 Aug 2018 13:21:59 +0200
>> Björn Töpel <bjorn.topel@intel.com> wrote:
>>
>>> When XDP is enabled, the driver will report incorrect
>>> statistics. Received frames will reported as transmitted frames.
>>>
>>> This commits fixes the i40e implementation of ndo_get_stats64 (struct
> 
> Should you send a v2, then please use singular for *commit*:
> 
> This commit ….
> 
>>> net_device_ops), so that iproute2 will report correct statistics
>>> (e.g. when running "ip -stats link show dev eth0") even when XDP is
>>> enabled.
> 
> In the future, I’d be great, if you could describe your fix in the
> commit message too. For example, why the if statement needs to move up.
>

Thanks for the review, Paul. I'll address your comments, if we'll end up
with V2.


Björn

>>> Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
>>> Fixes: 74608d17fe29 ("i40e: add support for XDP_TX action")
>>
>> Stable candidate:
>>   $ git describe --contains 74608d17fe29
>>   v4.13-rc1~157^2~128^2~13
>>
>>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>>
>> It works for me:
>>
>> Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
>>
>> I'm explicitly _not_ ACK'ing the patch, as I think the your code changes
>> below makes it harder to follow whether a TX or RX ring is getting
>> updated. But it is 100% up to the driver maintainers to say if this is
>> acceptable from a maintenance PoV.
>>
>>> ---
>>>   drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++++++++++----------
>>>   1 file changed, 13 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
>>> index e40c023cc7b6..7c122dd3faa1 100644
>>> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
>>> @@ -425,9 +425,9 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>>>   				  struct rtnl_link_stats64 *stats)
>>>   {
>>>   	struct i40e_netdev_priv *np = netdev_priv(netdev);
>>> -	struct i40e_ring *tx_ring, *rx_ring;
>>>   	struct i40e_vsi *vsi = np->vsi;
>>>   	struct rtnl_link_stats64 *vsi_stats = i40e_get_vsi_stats_struct(vsi);
>>> +	struct i40e_ring *ring;
>>>   	int i;
>>>   
>>>   	if (test_bit(__I40E_VSI_DOWN, vsi->state))
>>> @@ -441,24 +441,26 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>>>   		u64 bytes, packets;
>>>   		unsigned int start;
>>>   
>>> -		tx_ring = READ_ONCE(vsi->tx_rings[i]);
>>> -		if (!tx_ring)
>>> +		ring = READ_ONCE(vsi->tx_rings[i]);
>>> +		if (!ring)
>>>   			continue;
>>> -		i40e_get_netdev_stats_struct_tx(tx_ring, stats);
>>> +		i40e_get_netdev_stats_struct_tx(ring, stats);
>>>   
>>> -		rx_ring = &tx_ring[1];
>>> +		if (i40e_enabled_xdp_vsi(vsi)) {
>>> +			ring++;
>>> +			i40e_get_netdev_stats_struct_tx(ring, stats);
>>> +		}
>>>   
>>> +		ring++;
>>>   		do {
>>> -			start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
>>> -			packets = rx_ring->stats.packets;
>>> -			bytes   = rx_ring->stats.bytes;
>>> -		} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
>>> +			start   = u64_stats_fetch_begin_irq(&ring->syncp);
>>> +			packets = ring->stats.packets;
>>> +			bytes   = ring->stats.bytes;
>>> +		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
>>>   
>>>   		stats->rx_packets += packets;
>>>   		stats->rx_bytes   += bytes;
>>>   
>>> -		if (i40e_enabled_xdp_vsi(vsi))
>>> -			i40e_get_netdev_stats_struct_tx(&rx_ring[1], stats);
>>>   	}
>>>   	rcu_read_unlock();
>>>   
> 
> 
> Kind regards,
> 
> Paul
> 

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] i40e: report correct statistics when XDP is enabled
From: Paul Menzel @ 2018-08-28 17:00 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Jesper Dangaard Brouer, netdev, intel-wired-lan, Magnus Karlsson,
	Magnus Karlsson
In-Reply-To: <20180824160016.245b08d8@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 3498 bytes --]

Dear Björn,


On 08/24/18 16:00, Jesper Dangaard Brouer wrote:
> On Fri, 24 Aug 2018 13:21:59 +0200
> Björn Töpel <bjorn.topel@intel.com> wrote:
> 
>> When XDP is enabled, the driver will report incorrect
>> statistics. Received frames will reported as transmitted frames.
>>
>> This commits fixes the i40e implementation of ndo_get_stats64 (struct

Should you send a v2, then please use singular for *commit*:

This commit ….

>> net_device_ops), so that iproute2 will report correct statistics
>> (e.g. when running "ip -stats link show dev eth0") even when XDP is
>> enabled.

In the future, I’d be great, if you could describe your fix in the
commit message too. For example, why the if statement needs to move up.

>> Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> Fixes: 74608d17fe29 ("i40e: add support for XDP_TX action")
> 
> Stable candidate:
>  $ git describe --contains 74608d17fe29
>  v4.13-rc1~157^2~128^2~13
> 
>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> 
> It works for me:
> 
> Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> I'm explicitly _not_ ACK'ing the patch, as I think the your code changes
> below makes it harder to follow whether a TX or RX ring is getting
> updated. But it is 100% up to the driver maintainers to say if this is
> acceptable from a maintenance PoV.
> 
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++++++++++----------
>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
>> index e40c023cc7b6..7c122dd3faa1 100644
>> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
>> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
>> @@ -425,9 +425,9 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>>  				  struct rtnl_link_stats64 *stats)
>>  {
>>  	struct i40e_netdev_priv *np = netdev_priv(netdev);
>> -	struct i40e_ring *tx_ring, *rx_ring;
>>  	struct i40e_vsi *vsi = np->vsi;
>>  	struct rtnl_link_stats64 *vsi_stats = i40e_get_vsi_stats_struct(vsi);
>> +	struct i40e_ring *ring;
>>  	int i;
>>  
>>  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
>> @@ -441,24 +441,26 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>>  		u64 bytes, packets;
>>  		unsigned int start;
>>  
>> -		tx_ring = READ_ONCE(vsi->tx_rings[i]);
>> -		if (!tx_ring)
>> +		ring = READ_ONCE(vsi->tx_rings[i]);
>> +		if (!ring)
>>  			continue;
>> -		i40e_get_netdev_stats_struct_tx(tx_ring, stats);
>> +		i40e_get_netdev_stats_struct_tx(ring, stats);
>>  
>> -		rx_ring = &tx_ring[1];
>> +		if (i40e_enabled_xdp_vsi(vsi)) {
>> +			ring++;
>> +			i40e_get_netdev_stats_struct_tx(ring, stats);
>> +		}
>>  
>> +		ring++;
>>  		do {
>> -			start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
>> -			packets = rx_ring->stats.packets;
>> -			bytes   = rx_ring->stats.bytes;
>> -		} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
>> +			start   = u64_stats_fetch_begin_irq(&ring->syncp);
>> +			packets = ring->stats.packets;
>> +			bytes   = ring->stats.bytes;
>> +		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
>>  
>>  		stats->rx_packets += packets;
>>  		stats->rx_bytes   += bytes;
>>  
>> -		if (i40e_enabled_xdp_vsi(vsi))
>> -			i40e_get_netdev_stats_struct_tx(&rx_ring[1], stats);
>>  	}
>>  	rcu_read_unlock();
>>  


Kind regards,

Paul


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply

* Re: [PATCH net 1/2] net_sched: reject unknown tcfa_action values
From: Cong Wang @ 2018-08-28 16:59 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller, Davide Caratti
In-Reply-To: <0bfdb4f6816e8eae8e86e97b78eb86d3adba0a01.1535465984.git.pabeni@redhat.com>

On Tue, Aug 28, 2018 at 7:25 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> +int tcf_action_destroy_one(struct tc_action *a, int bind)
> +{
> +       struct tc_action *actions[] = { a, NULL };
> +
> +       return tcf_action_destroy(actions, bind);
> +}

Make it static.


> +
>  static int tcf_action_put(struct tc_action *p)
>  {
>         return __tcf_action_put(p, false);
> @@ -881,17 +888,16 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
>         if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
>                 err = tcf_action_goto_chain_init(a, tp);
>                 if (err) {
> -                       struct tc_action *actions[] = { a, NULL };
> -
> -                       tcf_action_destroy(actions, bind);
>                         NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
> +                       tcf_action_destroy_one(a, bind);
>                         return ERR_PTR(err);
>                 }
>         }
>
>         if (!tcf_action_valid(a->tcfa_action)) {
>                 NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead");


You need to adjust this extack too.



> -               a->tcfa_action = TC_ACT_UNSPEC;
> +               tcf_action_destroy_one(a, bind);
> +               return ERR_PTR(-EINVAL);
>         }

Thanks.

^ permalink raw reply

* [PATCH net-next] net: thunderbolt: Convert to use SPDX identifier
From: Mika Westerberg @ 2018-08-28 16:58 UTC (permalink / raw)
  To: David S . Miller; +Cc: Michael Jamet, Yehezkel Bernat, Mika Westerberg, netdev

This gets rid of the licence boilerblate in favor of SPDX identifier
which only takes a single line comment.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/net/thunderbolt.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
index e0d6760f3219..c48c3a1eb1f8 100644
--- a/drivers/net/thunderbolt.c
+++ b/drivers/net/thunderbolt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Networking over Thunderbolt cable using Apple ThunderboltIP protocol
  *
@@ -5,10 +6,6 @@
  * Authors: Amir Levy <amir.jer.levy@intel.com>
  *          Michael Jamet <michael.jamet@intel.com>
  *          Mika Westerberg <mika.westerberg@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/atomic.h>
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH net] net/sched: act_pedit: fix dump of extended layered op
From: Cong Wang @ 2018-08-28 16:48 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jamal Hadi Salim, David Miller, Linux Kernel Network Developers,
	Amir Vadai
In-Reply-To: <69b7819408d62dc49aa242c239fec558fa9acd8d.1535403029.git.dcaratti@redhat.com>

On Mon, Aug 27, 2018 at 1:56 PM Davide Caratti <dcaratti@redhat.com> wrote:
>
> in the (rare) case of failure in nla_nest_start(), missing NULL checks in
> tcf_pedit_key_ex_dump() can make the following command
>
>  # tc action add action pedit ex munge ip ttl set 64
>
> dereference a NULL pointer:
>
>  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
>  PGD 800000007d1cd067 P4D 800000007d1cd067 PUD 7acd3067 PMD 0
>  Oops: 0002 [#1] SMP PTI
>  CPU: 0 PID: 3336 Comm: tc Tainted: G            E     4.18.0.pedit+ #425
>  Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
>  RIP: 0010:tcf_pedit_dump+0x19d/0x358 [act_pedit]
>  Code: be 02 00 00 00 48 89 df 66 89 44 24 20 e8 9b b1 fd e0 85 c0 75 46 8b 83 c8 00 00 00 49 83 c5 08 48 03 83 d0 00 00 00 4d 39 f5 <66> 89 04 25 00 00 00 00 0f 84 81 01 00 00 41 8b 45 00 48 8d 4c 24
>  RSP: 0018:ffffb5d4004478a8 EFLAGS: 00010246
>  RAX: ffff8880fcda2070 RBX: ffff8880fadd2900 RCX: 0000000000000000
>  RDX: 0000000000000002 RSI: ffffb5d4004478ca RDI: ffff8880fcda206e
>  RBP: ffff8880fb9cb900 R08: 0000000000000008 R09: ffff8880fcda206e
>  R10: ffff8880fadd2900 R11: 0000000000000000 R12: ffff8880fd26cf40
>  R13: ffff8880fc957430 R14: ffff8880fc957430 R15: ffff8880fb9cb988
>  FS:  00007f75a537a740(0000) GS:ffff8880fda00000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 0000000000000000 CR3: 000000007a2fa005 CR4: 00000000001606f0
>  Call Trace:
>   ? __nla_reserve+0x38/0x50
>   tcf_action_dump_1+0xd2/0x130
>   tcf_action_dump+0x6a/0xf0
>   tca_get_fill.constprop.31+0xa3/0x120
>   tcf_action_add+0xd1/0x170
>   tc_ctl_action+0x137/0x150
>   rtnetlink_rcv_msg+0x263/0x2d0
>   ? _cond_resched+0x15/0x40
>   ? rtnl_calcit.isra.30+0x110/0x110
>   netlink_rcv_skb+0x4d/0x130
>   netlink_unicast+0x1a3/0x250
>   netlink_sendmsg+0x2ae/0x3a0
>   sock_sendmsg+0x36/0x40
>   ___sys_sendmsg+0x26f/0x2d0
>   ? do_wp_page+0x8e/0x5f0
>   ? handle_pte_fault+0x6c3/0xf50
>   ? __handle_mm_fault+0x38e/0x520
>   ? __sys_sendmsg+0x5e/0xa0
>   __sys_sendmsg+0x5e/0xa0
>   do_syscall_64+0x5b/0x180
>   entry_SYSCALL_64_after_hwframe+0x44/0xa9
>  RIP: 0033:0x7f75a4583ba0
>  Code: c3 48 8b 05 f2 62 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d fd c3 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ae cc 00 00 48 89 04 24
>  RSP: 002b:00007fff60ee7418 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
>  RAX: ffffffffffffffda RBX: 00007fff60ee7540 RCX: 00007f75a4583ba0
>  RDX: 0000000000000000 RSI: 00007fff60ee7490 RDI: 0000000000000003
>  RBP: 000000005b842d3e R08: 0000000000000002 R09: 0000000000000000
>  R10: 00007fff60ee6ea0 R11: 0000000000000246 R12: 0000000000000000
>  R13: 00007fff60ee7554 R14: 0000000000000001 R15: 000000000066c100
>  Modules linked in: act_pedit(E) ip6table_filter ip6_tables iptable_filter binfmt_misc crct10dif_pclmul ext4 crc32_pclmul mbcache ghash_clmulni_intel jbd2 pcbc snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd snd_timer cryptd glue_helper snd joydev pcspkr soundcore virtio_balloon i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi virtio_net net_failover virtio_blk virtio_console failover qxl crc32c_intel drm_kms_helper syscopyarea serio_raw sysfillrect sysimgblt fb_sys_fops ttm drm ata_piix virtio_pci libata virtio_ring i2c_core virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_pedit]
>  CR2: 0000000000000000
>
> Like it's done for other TC actions, give up dumping pedit rules and return
> an error if nla_nest_start() returns NULL.

Looks good to me,

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

While you are at it, please fix act_tunnel_key too.

Thanks.

^ permalink raw reply

* [bpf-next PATCH 2/2] bpf: use --cgroup in test_suite if supplied
From: John Fastabend @ 2018-08-28 16:10 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: netdev
In-Reply-To: <20180828160921.24004.71893.stgit@john-Precision-Tower-5810>

If the user supplies a --cgroup value in the arguments when running
the test_suite go ahaead and run the self tests there. I use this
to test with multiple cgroup users.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/testing/selftests/bpf/test_sockmap.c |   53 ++++++++++++++++------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index a0e77c6..ac7de38 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1345,9 +1345,9 @@ static int populate_progs(char *bpf_file)
 	return 0;
 }
 
-static int __test_suite(char *bpf_file)
+static int __test_suite(int cg_fd, char *bpf_file)
 {
-	int cg_fd, err;
+	int err, cleanup = cg_fd;
 
 	err = populate_progs(bpf_file);
 	if (err < 0) {
@@ -1355,22 +1355,24 @@ static int __test_suite(char *bpf_file)
 		return err;
 	}
 
-	if (setup_cgroup_environment()) {
-		fprintf(stderr, "ERROR: cgroup env failed\n");
-		return -EINVAL;
-	}
-
-	cg_fd = create_and_get_cgroup(CG_PATH);
 	if (cg_fd < 0) {
-		fprintf(stderr,
-			"ERROR: (%i) open cg path failed: %s\n",
-			cg_fd, optarg);
-		return cg_fd;
-	}
+		if (setup_cgroup_environment()) {
+			fprintf(stderr, "ERROR: cgroup env failed\n");
+			return -EINVAL;
+		}
+
+		cg_fd = create_and_get_cgroup(CG_PATH);
+		if (cg_fd < 0) {
+			fprintf(stderr,
+				"ERROR: (%i) open cg path failed: %s\n",
+				cg_fd, optarg);
+			return cg_fd;
+		}
 
-	if (join_cgroup(CG_PATH)) {
-		fprintf(stderr, "ERROR: failed to join cgroup\n");
-		return -EINVAL;
+		if (join_cgroup(CG_PATH)) {
+			fprintf(stderr, "ERROR: failed to join cgroup\n");
+			return -EINVAL;
+		}
 	}
 
 	/* Tests basic commands and APIs with range of iov values */
@@ -1391,20 +1393,24 @@ static int __test_suite(char *bpf_file)
 
 out:
 	printf("Summary: %i PASSED %i FAILED\n", passed, failed);
-	cleanup_cgroup_environment();
-	close(cg_fd);
+	if (cleanup < 0) {
+		cleanup_cgroup_environment();
+		close(cg_fd);
+	}
 	return err;
 }
 
-static int test_suite(void)
+static int test_suite(int cg_fd)
 {
 	int err;
 
-	err = __test_suite(BPF_SOCKMAP_FILENAME);
+	err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
 	if (err)
 		goto out;
-	err = __test_suite(BPF_SOCKHASH_FILENAME);
+	err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
 out:
+	if (cg_fd > -1)
+		close(cg_fd);
 	return err;
 }
 
@@ -1417,7 +1423,7 @@ int main(int argc, char **argv)
 	int test = PING_PONG;
 
 	if (argc < 2)
-		return test_suite();
+		return test_suite(-1);
 
 	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
 				  long_options, &longindex)) != -1) {
@@ -1483,6 +1489,9 @@ int main(int argc, char **argv)
 		}
 	}
 
+	if (argc <= 3 && cg_fd)
+		return test_suite(cg_fd);
+
 	if (!cg_fd) {
 		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
 			argv[0]);

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox