Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 13/15] ice: Update Tx scheduler tree for VSI multi-Tx queue support
From: Anirudh Venkataramanan @ 2018-03-15 23:48 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev
In-Reply-To: <20180315234802.31336-1-anirudh.venkataramanan@intel.com>

This patch adds the ability for a VSI to use multiple Tx queues. More
specifically, the patch
    1) Provides the ability to update the Tx scheduler tree in the
       firmware. The driver can configure the Tx scheduler tree by
       adding/removing multiple Tx queues per TC per VSI.

    2) Allows a VSI to reconfigure its Tx queues during runtime.

    3) Synchronizes the Tx scheduler update operations using locks.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h            |   7 +
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  28 +
 drivers/net/ethernet/intel/ice/ice_common.c     |  54 ++
 drivers/net/ethernet/intel/ice/ice_common.h     |   3 +
 drivers/net/ethernet/intel/ice/ice_main.c       |  20 +-
 drivers/net/ethernet/intel/ice/ice_sched.c      | 886 ++++++++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_sched.h      |   4 +
 drivers/net/ethernet/intel/ice/ice_type.h       |   7 +
 8 files changed, 1006 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 6014ef9c36e1..cb1e8a127af1 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -56,6 +56,7 @@ extern const char ice_drv_ver[];
 #define ICE_MIN_NUM_DESC	8
 #define ICE_MAX_NUM_DESC	8160
 #define ICE_REQ_DESC_MULTIPLE	32
+#define ICE_DFLT_TRAFFIC_CLASS	BIT(0)
 #define ICE_INT_NAME_STR_LEN	(IFNAMSIZ + 16)
 #define ICE_ETHTOOL_FWVER_LEN	32
 #define ICE_AQ_LEN		64
@@ -275,6 +276,12 @@ static inline void ice_irq_dynamic_ena(struct ice_hw *hw, struct ice_vsi *vsi,
 	wr32(hw, GLINT_DYN_CTL(vector), val);
 }
 
+static inline void ice_vsi_set_tc_cfg(struct ice_vsi *vsi)
+{
+	vsi->tc_cfg.ena_tc =  ICE_DFLT_TRAFFIC_CLASS;	/* bit 0 only */
+	vsi->tc_cfg.numtc = 1;
+}
+
 void ice_set_ethtool_ops(struct net_device *netdev);
 int ice_up(struct ice_vsi *vsi);
 int ice_down(struct ice_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 2c8d8533f87d..62509635fc5e 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -645,6 +645,25 @@ struct ice_aqc_get_topo {
 	__le32 addr_low;
 };
 
+/* Update TSE (indirect 0x0403)
+ * Get TSE (indirect 0x0404)
+ */
+struct ice_aqc_get_cfg_elem {
+	__le16 num_elem_req;	/* Used by commands: elements requested */
+	__le16 num_elem_resp;	/* Used by responses: element count returned */
+	__le32 reserved;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+/* This is the buffer for:
+ * Suspend Nodes (indirect 0x0409)
+ * Resume Nodes (indirect 0x040A)
+ */
+struct ice_aqc_suspend_resume_elem {
+	__le32 teid[1];		/* one little-endian node teid per element */
+};
+
 /* Add TSE (indirect 0x0401)
  * Delete TSE (indirect 0x040F)
  * Move TSE (indirect 0x0408)
@@ -705,6 +724,11 @@ struct ice_aqc_txsched_topo_grp_info_hdr {
 	__le16 reserved2;
 };
 
+struct ice_aqc_add_elem {
+	struct ice_aqc_txsched_topo_grp_info_hdr hdr;
+	struct ice_aqc_txsched_elem_data generic[1]; /* hdr.num_elems entries */
+};
+
 struct ice_aqc_get_topo_elem {
 	struct ice_aqc_txsched_topo_grp_info_hdr hdr;
 	struct ice_aqc_txsched_elem_data
@@ -1195,6 +1219,7 @@ struct ice_aq_desc {
 		struct ice_aqc_get_sw_cfg get_sw_conf;
 		struct ice_aqc_sw_rules sw_rules;
 		struct ice_aqc_get_topo get_topo;
+		struct ice_aqc_get_cfg_elem get_update_elem;
 		struct ice_aqc_query_txsched_res query_sched_res;
 		struct ice_aqc_add_move_delete_elem add_move_delete_elem;
 		struct ice_aqc_nvm nvm;
@@ -1272,6 +1297,9 @@ enum ice_adminq_opc {
 
 	/* transmit scheduler commands */
 	ice_aqc_opc_get_dflt_topo			= 0x0400,
+	ice_aqc_opc_add_sched_elems			= 0x0401,
+	ice_aqc_opc_suspend_sched_elems			= 0x0409,
+	ice_aqc_opc_resume_sched_elems			= 0x040A,
 	ice_aqc_opc_delete_sched_elems			= 0x040F,
 	ice_aqc_opc_query_sched_res			= 0x0412,
 
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 43cca9370444..958161a21115 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -2103,3 +2103,57 @@ ice_dis_vsi_txq(struct ice_port_info *pi, u8 num_queues, u16 *q_ids,
 	mutex_unlock(&pi->sched_lock);
 	return status;
 }
+
+/**
+ * ice_cfg_vsi_qs - configure the queues of a new or existing VSI
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc_bitmap: bitmap of enabled TCs (bit n set means TC n is enabled)
+ * @maxqs: array of max queue counts, indexed by TC
+ * @owner: node owner (lan or rdma)
+ *
+ * Calls ice_sched_cfg_vsi() for each TC that has a TC node, under sched_lock.
+ */
+static enum ice_status
+ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_id, u8 tc_bitmap,
+	       u16 *maxqs, u8 owner)
+{
+	enum ice_status status = 0;
+	u8 i;
+
+	if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+		return ICE_ERR_CFG;
+
+	mutex_lock(&pi->sched_lock);
+
+	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+		/* configuration is possible only if TC node is present */
+		if (!ice_sched_get_tc_node(pi, i))
+			continue;
+
+		status = ice_sched_cfg_vsi(pi, vsi_id, i, maxqs[i], owner,
+					   ice_is_tc_ena(tc_bitmap, i));
+		if (status)
+			break;
+	}
+
+	mutex_unlock(&pi->sched_lock);
+	return status;
+}
+
+/**
+ * ice_cfg_vsi_lan - configure VSI lan queues
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc_bitmap: bitmap of enabled TCs
+ * @max_lanqs: array of max lan queue counts, indexed by TC
+ *
+ * Wrapper around ice_cfg_vsi_qs() with the owner fixed to lan.
+ */
+enum ice_status
+ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_id, u8 tc_bitmap,
+		u16 *max_lanqs)
+{
+	return ice_cfg_vsi_qs(pi, vsi_id, tc_bitmap, max_lanqs,
+			      ICE_SCHED_NODE_OWNER_LAN);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index bc52b7bcc78c..3e33a47cb61a 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -83,6 +83,9 @@ enum ice_status
 ice_dis_vsi_txq(struct ice_port_info *pi, u8 num_queues, u16 *q_ids,
 		u32 *q_teids, struct ice_sq_cd *cmd_details);
 enum ice_status
+ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_id, u8 tc_bitmap,
+		u16 *max_lanqs);
+enum ice_status
 ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_id, u8 tc, u8 num_qgrps,
 		struct ice_aqc_add_tx_qgrp *buf, u16 buf_size,
 		struct ice_sq_cd *cd);
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 1dbe510e80fa..8eef9a4c1d13 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2099,10 +2099,11 @@ static struct ice_vsi *
 ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 	      struct ice_port_info *pi)
 {
+	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
 	struct device *dev = &pf->pdev->dev;
 	struct ice_vsi_ctx ctxt = { 0 };
 	struct ice_vsi *vsi;
-	int ret;
+	int ret, i;
 
 	vsi = ice_vsi_alloc(pf, type);
 	if (!vsi) {
@@ -2170,6 +2171,20 @@ ice_vsi_setup(struct ice_pf *pf, enum ice_vsi_type type,
 		 */
 		goto err_rings;
 	}
+
+	ice_vsi_set_tc_cfg(vsi);
+
+	/* configure VSI nodes based on number of queues and TC's */
+	for (i = 0; i < vsi->tc_cfg.numtc; i++)
+		max_txqs[i] = vsi->num_txq;
+
+	ret = ice_cfg_vsi_lan(vsi->port_info, vsi->vsi_num,
+			      vsi->tc_cfg.ena_tc, max_txqs);
+	if (ret) {
+		dev_info(&pf->pdev->dev, "Failed VSI lan queue config\n");
+		goto err_rings;
+	}
+
 	return vsi;
 
 err_rings:
@@ -2412,8 +2427,7 @@ static void ice_determine_q_usage(struct ice_pf *pf)
 	q_left_tx = pf->hw.func_caps.common_cap.num_txq;
 	q_left_rx = pf->hw.func_caps.common_cap.num_rxq;
 
-	/* initial support for only 1 tx queue */
-	pf->num_lan_tx = 1;
+	pf->num_lan_tx = min_t(int, q_left_tx, num_online_cpus());
 
 	/* only 1 rx queue unless RSS is enabled */
 	if (!test_bit(ICE_FLAG_RSS_ENA, pf->flags))
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c
index 74dbdb3d5df2..57d678eed61e 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.c
+++ b/drivers/net/ethernet/intel/ice/ice_sched.c
@@ -373,6 +373,110 @@ ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport,
 	return status;
 }
 
+/**
+ * ice_aq_add_sched_elems - adds scheduling element
+ * @hw: pointer to the hw struct
+ * @grps_req: the number of groups that are requested to be added
+ * @buf: pointer to the add-elements buffer sent with the command
+ * @buf_size: buffer size in bytes
+ * @grps_added: on success, set to the number of groups added; may be NULL
+ * @cd: pointer to command details structure or NULL
+ *
+ * Add scheduling elements (0x0401)
+ */
+static enum ice_status
+ice_aq_add_sched_elems(struct ice_hw *hw, u16 grps_req,
+		       struct ice_aqc_add_elem *buf, u16 buf_size,
+		       u16 *grps_added, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_add_move_delete_elem *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.add_move_delete_elem;
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_sched_elems);
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+	cmd->num_grps_req = cpu_to_le16(grps_req);
+	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+	if (!status && grps_added)
+		*grps_added = le16_to_cpu(cmd->num_grps_updated);
+
+	return status;
+}
+
+/**
+ * ice_suspend_resume_elems - suspend/resume scheduler elements
+ * @hw: pointer to the hw struct
+ * @elems_req: number of elements to suspend or resume
+ * @buf: pointer to the buffer holding the element teids
+ * @buf_size: buffer size in bytes
+ * @elems_ret: on success, set to the element count returned; may be NULL
+ * @cd: pointer to command details structure or NULL
+ * @cmd_code: command code for suspend or resume
+ *
+ * Sends @cmd_code with the given buffer; shared by the suspend/resume wrappers
+ */
+static enum ice_status
+ice_suspend_resume_elems(struct ice_hw *hw, u16 elems_req,
+			 struct ice_aqc_suspend_resume_elem *buf, u16 buf_size,
+			 u16 *elems_ret, struct ice_sq_cd *cd,
+			 enum ice_adminq_opc cmd_code)
+{
+	struct ice_aqc_get_cfg_elem *cmd;
+	struct ice_aq_desc desc;
+	enum ice_status status;
+
+	cmd = &desc.params.get_update_elem;
+	ice_fill_dflt_direct_cmd_desc(&desc, cmd_code);
+	cmd->num_elem_req = cpu_to_le16(elems_req);
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+	if (!status && elems_ret)
+		*elems_ret = le16_to_cpu(cmd->num_elem_resp);
+	return status;
+}
+
+/**
+ * ice_aq_suspend_sched_elems - suspend scheduler elements
+ * @hw: pointer to the hw struct
+ * @elems_req: number of elements to suspend
+ * @buf: pointer to the buffer holding the element teids
+ * @buf_size: buffer size in bytes
+ * @elems_ret: on success, number of elements suspended; may be NULL
+ * @cd: pointer to command details structure or NULL
+ *
+ * Suspend scheduling elements (0x0409)
+ */
+static enum ice_status
+ice_aq_suspend_sched_elems(struct ice_hw *hw, u16 elems_req,
+			   struct ice_aqc_suspend_resume_elem *buf,
+			   u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd)
+{
+	return ice_suspend_resume_elems(hw, elems_req, buf, buf_size, elems_ret,
+					cd, ice_aqc_opc_suspend_sched_elems);
+}
+
+/**
+ * ice_aq_resume_sched_elems - resume scheduler elements
+ * @hw: pointer to the hw struct
+ * @elems_req: number of elements to resume
+ * @buf: pointer to the buffer holding the element teids
+ * @buf_size: buffer size in bytes
+ * @elems_ret: on success, number of elements resumed; may be NULL
+ * @cd: pointer to command details structure or NULL
+ *
+ * Resume scheduling elements (0x040A)
+ */
+static enum ice_status
+ice_aq_resume_sched_elems(struct ice_hw *hw, u16 elems_req,
+			  struct ice_aqc_suspend_resume_elem *buf,
+			  u16 buf_size, u16 *elems_ret, struct ice_sq_cd *cd)
+{
+	return ice_suspend_resume_elems(hw, elems_req, buf, buf_size, elems_ret,
+					cd, ice_aqc_opc_resume_sched_elems);
+}
+
 /**
  * ice_aq_query_sched_res - query scheduler resource
  * @hw: pointer to the hw struct
@@ -393,6 +497,46 @@ ice_aq_query_sched_res(struct ice_hw *hw, u16 buf_size,
 	return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
 }
 
+/**
+ * ice_sched_suspend_resume_elems - suspend or resume hw nodes
+ * @hw: pointer to the hw struct
+ * @num_nodes: number of nodes
+ * @node_teids: array of node teids (CPU byte order) to suspend or resume
+ * @suspend: true means suspend / false means resume
+ *
+ * Note: a short element count is only logged; the AQ status is returned as-is
+ */
+static enum ice_status
+ice_sched_suspend_resume_elems(struct ice_hw *hw, u8 num_nodes, u32 *node_teids,
+			       bool suspend)
+{
+	struct ice_aqc_suspend_resume_elem *buf;
+	u16 i, buf_size, num_elem_ret = 0;
+	enum ice_status status;
+
+	buf_size = sizeof(*buf) * num_nodes;
+	buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+
+	for (i = 0; i < num_nodes; i++)
+		buf->teid[i] = cpu_to_le32(node_teids[i]);
+
+	if (suspend)
+		status = ice_aq_suspend_sched_elems(hw, num_nodes, buf,
+						    buf_size, &num_elem_ret,
+						    NULL);
+	else
+		status = ice_aq_resume_sched_elems(hw, num_nodes, buf,
+						   buf_size, &num_elem_ret,
+						   NULL);
+	if (status || num_elem_ret != num_nodes)
+		ice_debug(hw, ICE_DBG_SCHED, "suspend/resume failed\n");
+
+	devm_kfree(ice_hw_to_dev(hw), buf);
+	return status;
+}
+
 /**
  * ice_sched_clear_tx_topo - clears the schduler tree nodes
  * @pi: port information structure
@@ -476,6 +620,215 @@ void ice_sched_cleanup_all(struct ice_hw *hw)
 	hw->max_cgds = 0;
 }
 
+/**
+ * ice_sched_create_vsi_info_entry - create an empty new VSI entry
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ *
+ * Allocates a zeroed entry, links it into pi->vsi_info_list; NULL on failure
+ */
+static struct ice_sched_vsi_info *
+ice_sched_create_vsi_info_entry(struct ice_port_info *pi, u16 vsi_id)
+{
+	struct ice_sched_vsi_info *vsi_elem;
+
+	if (!pi)
+		return NULL;
+
+	vsi_elem = devm_kzalloc(ice_hw_to_dev(pi->hw), sizeof(*vsi_elem),
+				GFP_KERNEL);
+	if (!vsi_elem)
+		return NULL;
+
+	list_add(&vsi_elem->list_entry, &pi->vsi_info_list);
+	vsi_elem->vsi_id = vsi_id;
+	return vsi_elem;
+}
+
+/**
+ * ice_sched_add_elems - add nodes to hw and SW DB
+ * @pi: port information structure
+ * @tc_node: pointer to the branch node
+ * @parent: pointer to the parent node
+ * @layer: layer number to add nodes
+ * @num_nodes: number of nodes
+ * @num_nodes_added: set to @num_nodes once the hw add succeeds
+ * @first_node_teid: if new nodes are added then return the teid of first node
+ *
+ * Adds nodes to hw and SW DB for a given layer; non-zero status on any failure
+ */
+static enum ice_status
+ice_sched_add_elems(struct ice_port_info *pi, struct ice_sched_node *tc_node,
+		    struct ice_sched_node *parent, u8 layer, u16 num_nodes,
+		    u16 *num_nodes_added, u32 *first_node_teid)
+{
+	struct ice_sched_node *prev, *new_node;
+	struct ice_aqc_add_elem *buf;
+	u16 i, num_groups_added = 0;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u16 buf_size;
+	u32 teid;
+
+	buf_size = sizeof(*buf) + sizeof(*buf->generic) * (num_nodes - 1);
+	buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+
+	buf->hdr.parent_teid = parent->info.node_teid;
+	buf->hdr.num_elems = cpu_to_le16(num_nodes);
+	for (i = 0; i < num_nodes; i++) {
+		buf->generic[i].parent_teid = parent->info.node_teid;
+		buf->generic[i].data.elem_type = ICE_AQC_ELEM_TYPE_SE_GENERIC;
+		buf->generic[i].data.valid_sections =
+			ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR |
+			ICE_AQC_ELEM_VALID_EIR;
+		buf->generic[i].data.generic = 0;
+		buf->generic[i].data.cir_bw.bw_profile_idx =
+			ICE_SCHED_DFLT_RL_PROF_ID;
+		buf->generic[i].data.eir_bw.bw_profile_idx =
+			ICE_SCHED_DFLT_RL_PROF_ID;
+	}
+
+	status = ice_aq_add_sched_elems(hw, 1, buf, buf_size,
+					&num_groups_added, NULL);
+	if (status || num_groups_added != 1) {
+		ice_debug(hw, ICE_DBG_SCHED, "add elements failed\n");
+		devm_kfree(ice_hw_to_dev(hw), buf);
+		return ICE_ERR_CFG;
+	}
+
+	*num_nodes_added = num_nodes;
+	/* add nodes to the SW DB */
+	for (i = 0; i < num_nodes; i++) {
+		status = ice_sched_add_node(pi, layer, &buf->generic[i]);
+		if (status) {
+			ice_debug(hw, ICE_DBG_SCHED,
+				  "add nodes in SW DB failed status =%d\n",
+				  status);
+			break;
+		}
+
+		teid = le32_to_cpu(buf->generic[i].node_teid);
+		new_node = ice_sched_find_node_by_teid(parent, teid);
+		if (!new_node) {
+			/* status is 0 here; don't report success to caller */
+			status = ICE_ERR_CFG;
+			ice_debug(hw, ICE_DBG_SCHED,
+				  "Node is missing for teid =%d\n", teid);
+			break;
+		}
+
+		new_node->sibling = NULL;
+		new_node->tc_num = tc_node->tc_num;
+
+		/* add it to previous node sibling pointer */
+		/* Note: siblings are not linked across branches */
+		prev = ice_sched_get_first_node(hw, tc_node, layer);
+		if (prev && prev != new_node) {
+			while (prev->sibling)
+				prev = prev->sibling;
+			prev->sibling = new_node;
+		}
+
+		if (i == 0)
+			*first_node_teid = teid;
+	}
+
+	devm_kfree(ice_hw_to_dev(hw), buf);
+	return status;
+}
+
+/**
+ * ice_sched_add_nodes_to_layer - Add nodes to a given layer
+ * @pi: port information structure
+ * @tc_node: pointer to TC node
+ * @parent: pointer to parent node
+ * @layer: layer number to add nodes
+ * @num_nodes: number of nodes to be added
+ * @first_node_teid: pointer to the first node teid
+ * @num_nodes_added: set to the number of nodes added (0 if none requested)
+ *
+ * Adds nodes to a layer, spilling over to sibling parents when one is full.
+ */
+static enum ice_status
+ice_sched_add_nodes_to_layer(struct ice_port_info *pi,
+			     struct ice_sched_node *tc_node,
+			     struct ice_sched_node *parent, u8 layer,
+			     u16 num_nodes, u32 *first_node_teid,
+			     u16 *num_nodes_added)
+{
+	u32 *first_teid_ptr = first_node_teid;
+	u16 new_num_nodes, max_child_nodes;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u16 num_added = 0;
+	u32 temp;
+
+	/* callers reuse one counter per layer; clear it before any return */
+	*num_nodes_added = 0;
+	if (!num_nodes)
+		return status;
+
+	if (!parent || layer < hw->sw_entry_point_layer)
+		return ICE_ERR_PARAM;
+
+	/* max children per node per layer */
+	max_child_nodes =
+	    le16_to_cpu(hw->layer_info[parent->tx_sched_layer].max_children);
+
+	/* current number of children + required nodes exceed max children ? */
+	if ((parent->num_children + num_nodes) > max_child_nodes) {
+		/* Fail if the parent is a TC node */
+		if (parent == tc_node)
+			return ICE_ERR_CFG;
+
+		/* utilize all the spaces if the parent is not full */
+		if (parent->num_children < max_child_nodes) {
+			new_num_nodes = max_child_nodes - parent->num_children;
+			/* this recursion is intentional, and wouldn't
+			 * go more than 2 calls
+			 */
+			status = ice_sched_add_nodes_to_layer(pi, tc_node,
+							      parent, layer,
+							      new_num_nodes,
+							      first_node_teid,
+							      &num_added);
+			if (status)
+				return status;
+
+			*num_nodes_added += num_added;
+		}
+		/* Don't modify the first node teid memory if the first node was
+		 * added already in the above call. Instead send some temp
+		 * memory for all other recursive calls.
+		 */
+		if (num_added)
+			first_teid_ptr = &temp;
+
+		new_num_nodes = num_nodes - num_added;
+
+		/* This parent is full, try the next sibling */
+		parent = parent->sibling;
+
+		/* this recursion is intentional, for 1024 queues
+		 * per VSI, it goes max of 16 iterations.
+		 * 1024 / 8 = 128 layer 8 nodes
+		 * 128 /8 = 16 (add 8 nodes per iteration)
+		 */
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent,
+						      layer, new_num_nodes,
+						      first_teid_ptr,
+						      &num_added);
+		*num_nodes_added += num_added;
+		return status;
+	}
+
+	status = ice_sched_add_elems(pi, tc_node, parent, layer, num_nodes,
+				     num_nodes_added, first_node_teid);
+	return status;
+}
+
 /**
  * ice_sched_get_qgrp_layer - get the current queue group layer number
  * @hw: pointer to the hw struct
@@ -488,6 +841,101 @@ static u8 ice_sched_get_qgrp_layer(struct ice_hw *hw)
 	return hw->num_tx_sched_layers - ICE_QGRP_LAYER_OFFSET;
 }
 
+/**
+ * ice_sched_get_vsi_layer - get the current VSI layer number
+ * @hw: pointer to the hw struct
+ *
+ * The returned layer is never less than hw->sw_entry_point_layer
+ */
+static u8 ice_sched_get_vsi_layer(struct ice_hw *hw)
+{
+	/* Num Layers       VSI layer
+	 *     9               6
+	 *     7               4
+	 *     5 or less       sw_entry_point_layer
+	 */
+	/* calculate the vsi layer based on number of layers. */
+	if (hw->num_tx_sched_layers > ICE_VSI_LAYER_OFFSET + 1) {
+		u8 layer = hw->num_tx_sched_layers - ICE_VSI_LAYER_OFFSET;
+
+		if (layer > hw->sw_entry_point_layer)
+			return layer;
+	}
+	return hw->sw_entry_point_layer;
+}
+
+/**
+ * ice_sched_get_num_nodes_per_layer - Get the total number of nodes per layer
+ * @pi: pointer to the port info struct
+ * @layer: layer number
+ *
+ * Walks every child of the root (one per TC branch) and counts the nodes that
+ * are linked through sibling pointers at the given layer; 0 if @pi is NULL
+ */
+static u16
+ice_sched_get_num_nodes_per_layer(struct ice_port_info *pi, u8 layer)
+{
+	struct ice_hw *hw;
+	u16 num_nodes = 0;
+	u8 i;
+
+	if (!pi)
+		return num_nodes;
+
+	hw = pi->hw;
+
+	/* Calculate the number of nodes for all TCs */
+	for (i = 0; i < pi->root->num_children; i++) {
+		struct ice_sched_node *tc_node, *node;
+
+		tc_node = pi->root->children[i];
+
+		/* Get the first node */
+		node = ice_sched_get_first_node(hw, tc_node, layer);
+		if (!node)
+			continue;
+
+		/* count the siblings */
+		while (node) {
+			num_nodes++;
+			node = node->sibling;
+		}
+	}
+
+	return num_nodes;
+}
+
+/**
+ * ice_sched_validate_for_max_nodes - check max number of nodes reached or not
+ * @pi: port information structure
+ * @new_num_nodes_per_layer: nodes to be added per layer, indexed by layer
+ *
+ * This function checks whether the scheduler tree layers have enough space to
+ * add new nodes; returns ICE_ERR_CFG if any layer would exceed max_pf_nodes
+ */
+static enum ice_status
+ice_sched_validate_for_max_nodes(struct ice_port_info *pi,
+				 u16 *new_num_nodes_per_layer)
+{
+	struct ice_hw *hw = pi->hw;
+	u8 i, qg_layer;
+	u16 num_nodes;
+
+	qg_layer = ice_sched_get_qgrp_layer(hw);
+
+	/* walk through all the layers from SW entry point to qgroup layer */
+	for (i = hw->sw_entry_point_layer; i <= qg_layer; i++) {
+		num_nodes = ice_sched_get_num_nodes_per_layer(pi, i);
+		if (num_nodes + new_num_nodes_per_layer[i] >
+		    le16_to_cpu(hw->layer_info[i].max_pf_nodes)) {
+			ice_debug(hw, ICE_DBG_SCHED,
+				  "max nodes reached for layer = %d\n", i);
+			return ICE_ERR_CFG;
+		}
+	}
+	return 0;
+}
+
 /**
  * ice_rm_dflt_leaf_node - remove the default leaf node in the tree
  * @pi: port information structure
@@ -530,6 +978,7 @@ ice_sched_rm_dflt_nodes(struct ice_port_info *pi)
 	struct ice_sched_node *node;
 
 	ice_rm_dflt_leaf_node(pi);
+
 	/* remove the default nodes except TC and root nodes */
 	node = pi->root;
 	while (node) {
@@ -539,6 +988,7 @@ ice_sched_rm_dflt_nodes(struct ice_port_info *pi)
 			ice_free_sched_node(pi, node);
 			break;
 		}
+
 		if (!node->num_children)
 			break;
 		node = node->children[0];
@@ -734,8 +1184,10 @@ ice_sched_find_node_in_subtree(struct ice_hw *hw, struct ice_sched_node *base,
 
 		if (node == child)
 			return true;
+
 		if (child->tx_sched_layer > node->tx_sched_layer)
 			return false;
+
 		/* this recursion is intentional, and wouldn't
 		 * go more than 8 calls
 		 */
@@ -765,13 +1217,17 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_id, u8 tc,
 
 	qgrp_layer = ice_sched_get_qgrp_layer(pi->hw);
 	max_children = le16_to_cpu(pi->hw->layer_info[qgrp_layer].max_children);
+
 	list_elem = ice_sched_get_vsi_info_entry(pi, vsi_id);
 	if (!list_elem)
 		goto lan_q_exit;
+
 	vsi_node = list_elem->vsi_node[tc];
+
 	/* validate invalid VSI id */
 	if (!vsi_node)
 		goto lan_q_exit;
+
 	/* get the first q group node from VSI sub-tree */
 	qgrp_node = ice_sched_get_first_node(pi->hw, vsi_node, qgrp_layer);
 	while (qgrp_node) {
@@ -782,6 +1238,436 @@ ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_id, u8 tc,
 				break;
 		qgrp_node = qgrp_node->sibling;
 	}
+
 lan_q_exit:
 	return qgrp_node;
 }
+
+/**
+ * ice_sched_get_vsi_node - Get a VSI node based on VSI id
+ * @hw: pointer to the hw struct
+ * @tc_node: pointer to the TC node
+ * @vsi_id: VSI id
+ *
+ * Searches the VSI layer of the given TC branch for a node whose vsi_id
+ * matches; returns NULL if there is none
+ */
+static struct ice_sched_node *
+ice_sched_get_vsi_node(struct ice_hw *hw, struct ice_sched_node *tc_node,
+		       u16 vsi_id)
+{
+	struct ice_sched_node *node;
+	u8 vsi_layer;
+
+	vsi_layer = ice_sched_get_vsi_layer(hw);
+	node = ice_sched_get_first_node(hw, tc_node, vsi_layer);
+
+	/* Check whether it already exists */
+	while (node) {
+		if (node->vsi_id == vsi_id)
+			return node;
+		node = node->sibling;
+	}
+
+	return node;
+}
+
+/**
+ * ice_sched_calc_vsi_child_nodes - calculate number of VSI child nodes
+ * @hw: pointer to the hw struct
+ * @num_qs: number of queues
+ * @num_nodes: per-layer node counts to fill in, indexed by layer
+ *
+ * Fills @num_nodes[i] for every layer i between the VSI layer (exclusive) and
+ * the q group layer (inclusive); each such layer gets at least one node.
+ */
+static void
+ice_sched_calc_vsi_child_nodes(struct ice_hw *hw, u16 num_qs, u16 *num_nodes)
+{
+	u16 num = num_qs;
+	u8 i, qgl, vsil;
+
+	qgl = ice_sched_get_qgrp_layer(hw);
+	vsil = ice_sched_get_vsi_layer(hw);
+
+	/* calculate num nodes from q group to VSI layer */
+	for (i = qgl; i > vsil; i--) {
+		u16 max_children = le16_to_cpu(hw->layer_info[i].max_children);
+
+		/* round to the next integer if there is a remainder */
+		num = DIV_ROUND_UP(num, max_children);
+
+		/* need at least one node */
+		num_nodes[i] = num ? num : 1;
+	}
+}
+
+/**
+ * ice_sched_add_vsi_child_nodes - add VSI child nodes to tree
+ * @pi: port information structure
+ * @vsi_id: VSI id
+ * @tc_node: pointer to the TC node
+ * @num_nodes: number of nodes to add per layer, indexed by layer
+ * @owner: node owner (lan or rdma)
+ *
+ * This function adds the VSI child nodes to tree. It gets called for
+ * lan and rdma separately.
+ */
+static enum ice_status
+ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_id,
+			      struct ice_sched_node *tc_node, u16 *num_nodes,
+			      u8 owner)
+{
+	struct ice_sched_node *parent, *node;
+	struct ice_hw *hw = pi->hw;
+	enum ice_status status;
+	u32 first_node_teid;
+	u16 num_added = 0;
+	u8 i, qgl, vsil;
+
+	status = ice_sched_validate_for_max_nodes(pi, num_nodes);
+	if (status)
+		return status;
+
+	qgl = ice_sched_get_qgrp_layer(hw);
+	vsil = ice_sched_get_vsi_layer(hw);
+	parent = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
+	for (i = vsil + 1; i <= qgl; i++) {
+		if (!parent)
+			return ICE_ERR_CFG;
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent, i,
+						      num_nodes[i],
+						      &first_node_teid,
+						      &num_added);
+		if (status || num_nodes[i] != num_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes; mark it and its siblings with the owner
+		 */
+		if (num_added) {
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+			node = parent;
+			while (node) {
+				node->owner = owner;
+				node = node->sibling;
+			}
+		} else {
+			parent = parent->children[0];
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_sched_rm_vsi_child_nodes - remove VSI child nodes from the tree
+ * @pi: port information structure
+ * @vsi_node: pointer to the VSI node
+ * @num_nodes: number of nodes to remove per layer, indexed by layer
+ * @owner: node owner (lan or rdma)
+ *
+ * Frees up to num_nodes[i] childless nodes owned by @owner in each layer below
+ * the VSI node, starting from the q group layer.
+ */
+static void
+ice_sched_rm_vsi_child_nodes(struct ice_port_info *pi,
+			     struct ice_sched_node *vsi_node, u16 *num_nodes,
+			     u8 owner)
+{
+	struct ice_sched_node *node, *next;
+	u8 i, qgl, vsil;
+	u16 num;
+
+	qgl = ice_sched_get_qgrp_layer(pi->hw);
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+
+	for (i = qgl; i > vsil; i--) {
+		num = num_nodes[i];
+		node = ice_sched_get_first_node(pi->hw, vsi_node, i);
+		while (node && num) {
+			next = node->sibling;
+			if (node->owner == owner && !node->num_children) {
+				ice_free_sched_node(pi, node);
+				num--;
+			}
+			node = next;
+		}
+	}
+}
+
+/**
+ * ice_sched_calc_vsi_support_nodes - calculate number of VSI support nodes
+ * @hw: pointer to the hw struct
+ * @tc_node: pointer to TC node
+ * @num_nodes: per-layer node counts to increment, indexed by layer
+ *
+ * Increments @num_nodes for each layer from the VSI layer up to the SW entry
+ * point layer where a new node is needed. NOTE(review): the u8 loop counter
+ * would wrap if sw_entry_point_layer were 0 - confirm it never is.
+ */
+static void
+ice_sched_calc_vsi_support_nodes(struct ice_hw *hw,
+				 struct ice_sched_node *tc_node, u16 *num_nodes)
+{
+	struct ice_sched_node *node;
+	u16 max_child;
+	u8 i, vsil;
+
+	vsil = ice_sched_get_vsi_layer(hw);
+	for (i = vsil; i >= hw->sw_entry_point_layer; i--)
+		/* Add intermediate nodes if TC has no children and
+		 * need at least one node for VSI
+		 */
+		if (!tc_node->num_children || i == vsil) {
+			num_nodes[i]++;
+		} else {
+			/* If intermediate nodes are reached max children
+			 * then add a new one.
+			 */
+			node = ice_sched_get_first_node(hw, tc_node, i);
+			max_child = le16_to_cpu(hw->layer_info[i].max_children);
+
+			/* scan all the siblings */
+			while (node) {
+				if (node->num_children < max_child)
+					break;
+				node = node->sibling;
+			}
+
+			/* all the nodes are full, allocate a new one */
+			if (!node)
+				num_nodes[i]++;
+		}
+}
+
+/**
+ * ice_sched_add_vsi_support_nodes - add VSI supported nodes into tx tree
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc_node: pointer to TC node
+ * @num_nodes: number of nodes to add per layer, indexed by layer
+ *
+ * This function adds the VSI supported nodes into tx tree including the
+ * VSI, its parent and intermediate nodes in below layers
+ */
+static enum ice_status
+ice_sched_add_vsi_support_nodes(struct ice_port_info *pi, u16 vsi_id,
+				struct ice_sched_node *tc_node, u16 *num_nodes)
+{
+	struct ice_sched_node *parent = tc_node;
+	enum ice_status status;
+	u32 first_node_teid;
+	u16 num_added = 0;
+	u8 i, vsil;
+
+	if (!pi)
+		return ICE_ERR_PARAM;
+
+	status = ice_sched_validate_for_max_nodes(pi, num_nodes);
+	if (status)
+		return status;
+
+	vsil = ice_sched_get_vsi_layer(pi->hw);
+	for (i = pi->hw->sw_entry_point_layer; i <= vsil; i++) {
+		status = ice_sched_add_nodes_to_layer(pi, tc_node, parent,
+						      i, num_nodes[i],
+						      &first_node_teid,
+						      &num_added);
+		if (status || num_nodes[i] != num_added)
+			return ICE_ERR_CFG;
+
+		/* The newly added node can be a new parent for the next
+		 * layer nodes; otherwise descend through the first child
+		 */
+		if (num_added)
+			parent = ice_sched_find_node_by_teid(tc_node,
+							     first_node_teid);
+		else
+			parent = parent->children[0];
+
+		if (!parent)
+			return ICE_ERR_CFG;
+
+		if (i == vsil)
+			parent->vsi_id = vsi_id;
+	}
+	return 0;
+}
+
+/**
+ * ice_sched_add_vsi_to_topo - add a new VSI into tree
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc: TC number
+ *
+ * Computes the support nodes needed for this VSI on @tc and adds them
+ */
+static enum ice_status
+ice_sched_add_vsi_to_topo(struct ice_port_info *pi, u16 vsi_id, u8 tc)
+{
+	u16 num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
+	struct ice_sched_node *tc_node;
+	struct ice_hw *hw = pi->hw;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_PARAM;
+
+	/* calculate number of supported nodes needed for this VSI */
+	ice_sched_calc_vsi_support_nodes(hw, tc_node, num_nodes);
+
+	/* add vsi supported nodes to tc subtree */
+	return ice_sched_add_vsi_support_nodes(pi, vsi_id, tc_node, num_nodes);
+}
+
+/**
+ * ice_sched_update_vsi_child_nodes - update VSI child nodes
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc: TC number
+ * @new_numqs: new number of max queues
+ * @owner: owner of this subtree
+ *
+ * Grows or shrinks the VSI child nodes to fit @new_numqs (lan owner only)
+ */
+static enum ice_status
+ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_id, u8 tc,
+				 u16 new_numqs, u8 owner)
+{
+	u16 prev_num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
+	u16 new_num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
+	struct ice_sched_node *vsi_node;
+	struct ice_sched_node *tc_node;
+	struct ice_sched_vsi_info *vsi;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+	u16 prev_numqs;
+	u8 i;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_CFG;
+
+	vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
+	if (!vsi_node)
+		return ICE_ERR_CFG;
+
+	vsi = ice_sched_get_vsi_info_entry(pi, vsi_id);
+	if (!vsi)
+		return ICE_ERR_CFG;
+
+	if (owner == ICE_SCHED_NODE_OWNER_LAN)
+		prev_numqs = vsi->max_lanq[tc];
+	else
+		return ICE_ERR_PARAM;
+
+	/* num queues are not changed */
+	if (prev_numqs == new_numqs)
+		return status;
+
+	/* calculate number of nodes based on prev/new number of qs */
+	if (prev_numqs)
+		ice_sched_calc_vsi_child_nodes(hw, prev_numqs, prev_num_nodes);
+
+	if (new_numqs)
+		ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes);
+
+	if (prev_numqs > new_numqs) {
+		for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++)
+			new_num_nodes[i] = prev_num_nodes[i] - new_num_nodes[i];
+
+		ice_sched_rm_vsi_child_nodes(pi, vsi_node, new_num_nodes,
+					     owner);
+	} else {
+		for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++)
+			new_num_nodes[i] -= prev_num_nodes[i];
+
+		status = ice_sched_add_vsi_child_nodes(pi, vsi_id, tc_node,
+						       new_num_nodes, owner);
+		if (status)
+			return status;
+	}
+
+	if (owner == ICE_SCHED_NODE_OWNER_LAN)
+		vsi->max_lanq[tc] = new_numqs;
+
+	return status;
+}
+
+/**
+ * ice_sched_cfg_vsi - configure the new/existing VSI
+ * @pi: port information structure
+ * @vsi_id: VSI Id
+ * @tc: TC number
+ * @maxqs: max number of queues
+ * @owner: lan or rdma
+ * @enable: TC enabled or disabled
+ *
+ * This function adds/updates VSI nodes based on the number of queues. If TC is
+ * enabled and VSI is in suspended state then resume the VSI back. If TC is
+ * disabled then suspend the VSI if it is not already.
+ */
+enum ice_status
+ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_id, u8 tc, u16 maxqs,
+		  u8 owner, bool enable)
+{
+	struct ice_sched_node *vsi_node, *tc_node;
+	struct ice_sched_vsi_info *vsi;
+	enum ice_status status = 0;
+	struct ice_hw *hw = pi->hw;
+
+	tc_node = ice_sched_get_tc_node(pi, tc);
+	if (!tc_node)
+		return ICE_ERR_PARAM;
+
+	vsi = ice_sched_get_vsi_info_entry(pi, vsi_id);
+	if (!vsi)
+		vsi = ice_sched_create_vsi_info_entry(pi, vsi_id);
+	if (!vsi)
+		return ICE_ERR_NO_MEMORY;
+
+	vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
+
+	/* suspend the VSI if tc is not enabled */
+	if (!enable) {
+		if (vsi_node && vsi_node->in_use) {
+			u32 teid = le32_to_cpu(vsi_node->info.node_teid);
+
+			status = ice_sched_suspend_resume_elems(hw, 1, &teid,
+								true);
+			if (!status)
+				vsi_node->in_use = false;
+		}
+		return status;
+	}
+
+	/* TC is enabled, if it is a new VSI then add it to the tree */
+	if (!vsi_node) {
+		status = ice_sched_add_vsi_to_topo(pi, vsi_id, tc);
+		if (status)
+			return status;
+		vsi_node = ice_sched_get_vsi_node(hw, tc_node, vsi_id);
+		if (!vsi_node)
+			return ICE_ERR_CFG;
+		vsi->vsi_node[tc] = vsi_node;
+		vsi_node->in_use = true;
+	}
+
+	/* update the VSI child nodes */
+	status = ice_sched_update_vsi_child_nodes(pi, vsi_id, tc, maxqs, owner);
+	if (status)
+		return status;
+
+	/* TC is enabled, resume the VSI if it is in the suspend state */
+	if (!vsi_node->in_use) {
+		u32 teid = le32_to_cpu(vsi_node->info.node_teid);
+
+		status = ice_sched_suspend_resume_elems(hw, 1, &teid, false);
+		if (!status)
+			vsi_node->in_use = true;
+	}
+
+	return status;
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.h b/drivers/net/ethernet/intel/ice/ice_sched.h
index a3a9fc14603a..a17ca145c8bc 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.h
+++ b/drivers/net/ethernet/intel/ice/ice_sched.h
@@ -21,6 +21,7 @@
 #include "ice_common.h"
 
 #define ICE_QGRP_LAYER_OFFSET	2
+#define ICE_VSI_LAYER_OFFSET	4
 
 struct ice_sched_agg_vsi_info {
 	struct list_head list_entry;
@@ -50,4 +51,7 @@ struct ice_sched_node *ice_sched_get_tc_node(struct ice_port_info *pi, u8 tc);
 struct ice_sched_node *
 ice_sched_get_free_qparent(struct ice_port_info *pi, u16 vsi_id, u8 tc,
 			   u8 owner);
+enum ice_status
+ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_id, u8 tc, u16 maxqs,
+		  u8 owner, bool enable);
 #endif /* _ICE_SCHED_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index a576d173b07d..ce091e83b60a 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -24,6 +24,11 @@
 #include "ice_controlq.h"
 #include "ice_lan_tx_rx.h"
 
+/**
+ * ice_is_tc_ena - check whether a traffic class is enabled in a TC bitmap
+ * @bitmap: bitmap of enabled traffic classes, one bit per TC
+ * @tc: traffic class number to test
+ *
+ * The u8 bitmap is copied into an unsigned long before test_bit() is used.
+ * Casting the address of the u8 to (unsigned long *) would make test_bit()
+ * read a full long from a one-byte object, which reads past the object and
+ * picks the wrong byte on big-endian machines.
+ *
+ * Returns true if bit @tc is set in @bitmap; false for any @tc that does not
+ * fit in the 8-bit bitmap.
+ */
+static inline bool ice_is_tc_ena(u8 bitmap, u8 tc)
+{
+	unsigned long tc_bitmap = bitmap;
+
+	return tc < BITS_PER_BYTE && test_bit(tc, &tc_bitmap);
+}
+
 /* debug masks - set these bits in hw->debug_mask to control output */
 #define ICE_DBG_INIT		BIT_ULL(1)
 #define ICE_DBG_QCTX		BIT_ULL(6)
@@ -208,6 +213,8 @@ enum ice_agg_type {
 	ICE_AGG_TYPE_QG
 };
 
+#define ICE_SCHED_DFLT_RL_PROF_ID	0
+
 /* vsi type list entry to locate corresponding vsi/ag nodes */
 struct ice_sched_vsi_info {
 	struct ice_sched_node *vsi_node[ICE_MAX_TRAFFIC_CLASS];
-- 
2.14.3

^ permalink raw reply related

* [PATCH v2 14/15] ice: Support link events, reset and rebuild
From: Anirudh Venkataramanan @ 2018-03-15 23:48 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev
In-Reply-To: <20180315234802.31336-1-anirudh.venkataramanan@intel.com>

Link events are posted to a PF's admin receive queue (ARQ). This patch
adds the ability to detect and process link events.

This patch also adds the ability to process resets.

The driver can process the following resets:
    1) EMP Reset (EMPR)
    2) Global Reset (GLOBR)
    3) Core Reset (CORER)
    4) Physical Function Reset (PFR)

EMPR is the largest level of reset that the driver can handle. An EMPR
resets the manageability block and also the data path, including PHY and
link for all the PFs. The affected PFs are notified of this event through
a miscellaneous interrupt.

GLOBR is a subset of EMPR. It does everything EMPR does except that it
doesn't reset the manageability block.

CORER is a subset of GLOBR. It does everything GLOBR does but doesn't
reset PHY and link.

PFR is a subset of CORER and affects only the given physical function.
In other words, PFR can be thought of as a CORER for a single PF. Since
only the issuing PF is affected, a PFR doesn't result in the miscellaneous
interrupt being triggered.

All the resets have the following in common:
1) Tx/Rx is halted and all queues are stopped.
2) All the VSIs and filters programmed for the PF are lost and have to be
   reprogrammed.
3) Control queue interfaces are reset and have to be reprogrammed.

In the rebuild flow, control queues are reinitialized, VSIs are reallocated
and filters are restored.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h            |  19 +
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  19 +
 drivers/net/ethernet/intel/ice/ice_common.c     |  60 +++
 drivers/net/ethernet/intel/ice/ice_common.h     |   5 +
 drivers/net/ethernet/intel/ice/ice_hw_autogen.h |   2 +
 drivers/net/ethernet/intel/ice/ice_main.c       | 581 +++++++++++++++++++++++-
 drivers/net/ethernet/intel/ice/ice_type.h       |   1 +
 7 files changed, 681 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index cb1e8a127af1..6d7d03b80dbf 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -92,6 +92,11 @@ extern const char ice_drv_ver[];
 #define ICE_RX_DESC(R, i) (&(((union ice_32b_rx_flex_desc *)((R)->desc))[i]))
 #define ICE_TX_CTX_DESC(R, i) (&(((struct ice_tx_ctx_desc *)((R)->desc))[i]))
 
+/* Macro for each VSI in a PF: steps index i over all pf->num_alloc_vsi
+ * entries; users in this file test pf->vsi[i] for NULL before using it
+ */
+#define ice_for_each_vsi(pf, i) \
+	for ((i) = 0; (i) < (pf)->num_alloc_vsi; (i)++)
+
+/* Macros for each tx/rx ring in a VSI */
 #define ice_for_each_txq(vsi, i) \
 	for ((i) = 0; (i) < (vsi)->num_txq; (i)++)
 
@@ -123,7 +128,16 @@ struct ice_sw {
 
 enum ice_state {
 	__ICE_DOWN,
+	__ICE_NEEDS_RESTART,
+	__ICE_RESET_RECOVERY_PENDING,	/* set by driver when reset starts */
 	__ICE_PFR_REQ,			/* set by driver and peers */
+	__ICE_CORER_REQ,		/* set by driver and peers */
+	__ICE_GLOBR_REQ,		/* set by driver and peers */
+	__ICE_CORER_RECV,		/* set by OICR handler */
+	__ICE_GLOBR_RECV,		/* set by OICR handler */
+	__ICE_EMPR_RECV,		/* set by OICR handler */
+	__ICE_SUSPENDED,		/* set on module remove path */
+	__ICE_RESET_FAILED,		/* set by reset/rebuild */
 	__ICE_ADMINQ_EVENT_PENDING,
 	__ICE_CFG_BUSY,
 	__ICE_SERVICE_SCHED,
@@ -240,6 +254,11 @@ struct ice_pf {
 	u16 q_left_rx;		/* remaining num rx queues left unclaimed */
 	u16 next_vsi;		/* Next free slot in pf->vsi[] - 0-based! */
 	u16 num_alloc_vsi;
+	u16 corer_count;	/* Core reset count */
+	u16 globr_count;	/* Global reset count */
+	u16 empr_count;		/* EMP reset count */
+	u16 pfr_count;		/* PF reset count */
+
 	struct ice_hw_port_stats stats;
 	struct ice_hw_port_stats stats_prev;
 	struct ice_hw hw;
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 62509635fc5e..8cade22c1cf6 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1023,6 +1023,23 @@ struct ice_aqc_get_link_status_data {
 	__le64 reserved4;
 };
 
+/* Set event mask command (direct 0x0613)
+ *
+ * Carried in the params union of struct ice_aq_desc as set_event_mask.
+ * lport_num is the port number the mask is sent for and event_mask is a
+ * little endian bit field built from the ICE_AQ_LINK_EVENT_* values below.
+ */
+struct ice_aqc_set_event_mask {
+	u8	lport_num;
+	u8	reserved[7];
+	__le16	event_mask;
+#define ICE_AQ_LINK_EVENT_UPDOWN		BIT(1)
+#define ICE_AQ_LINK_EVENT_MEDIA_NA		BIT(2)
+#define ICE_AQ_LINK_EVENT_LINK_FAULT		BIT(3)
+#define ICE_AQ_LINK_EVENT_PHY_TEMP_ALARM	BIT(4)
+#define ICE_AQ_LINK_EVENT_EXCESSIVE_ERRORS	BIT(5)
+#define ICE_AQ_LINK_EVENT_SIGNAL_DETECT		BIT(6)
+#define ICE_AQ_LINK_EVENT_AN_COMPLETED		BIT(7)
+#define ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL	BIT(8)
+#define ICE_AQ_LINK_EVENT_PORT_TX_SUSPENDED	BIT(9)
+	u8	reserved1[6];
+};
+
 /* NVM Read command (indirect 0x0701)
  * NVM Erase commands (direct 0x0702)
  * NVM Update commands (indirect 0x0703)
@@ -1229,6 +1246,7 @@ struct ice_aq_desc {
 		struct ice_aqc_dis_txqs dis_txqs;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
 		struct ice_aqc_alloc_free_res_cmd sw_res_ctrl;
+		struct ice_aqc_set_event_mask set_event_mask;
 		struct ice_aqc_get_link_status get_link_status;
 	} params;
 };
@@ -1308,6 +1326,7 @@ enum ice_adminq_opc {
 	ice_aqc_opc_set_phy_cfg				= 0x0601,
 	ice_aqc_opc_restart_an				= 0x0605,
 	ice_aqc_opc_get_link_status			= 0x0607,
+	ice_aqc_opc_set_event_mask			= 0x0613,
 
 	/* NVM commands */
 	ice_aqc_opc_nvm_read				= 0x0701,
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 958161a21115..316262b3c7a0 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -1441,6 +1441,39 @@ ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool atomic_restart)
 	return status;
 }
 
+/**
+ * ice_get_link_status - get status of the HW network link
+ * @pi: port information structure
+ * @link_up: pointer to bool (true/false = linkup/linkdown)
+ *
+ * Variable link_up is true if link is up, false if link is down.
+ * The variable link_up is invalid if status is non zero. As a
+ * result of this call, link status reporting becomes enabled
+ *
+ * The link info is only refreshed through ice_update_link_info() when
+ * pi->phy.get_link_info is set; otherwise the cached link info is used.
+ *
+ * Returns ICE_ERR_PARAM if @pi or @link_up is NULL, otherwise the status of
+ * the refresh (0 when no refresh was needed or it succeeded).
+ */
+enum ice_status ice_get_link_status(struct ice_port_info *pi, bool *link_up)
+{
+	struct ice_phy_info *phy_info;
+	enum ice_status status = 0;
+
+	/* both pointers are dereferenced below, so reject either being NULL */
+	if (!pi || !link_up)
+		return ICE_ERR_PARAM;
+
+	phy_info = &pi->phy;
+
+	if (phy_info->get_link_info) {
+		status = ice_update_link_info(pi);
+
+		if (status)
+			ice_debug(pi->hw, ICE_DBG_LINK,
+				  "get link status error, status = %d\n",
+				  status);
+	}
+
+	/* written even when the refresh failed; callers must check status */
+	*link_up = phy_info->link_info.link_info & ICE_AQ_LINK_UP;
+
+	return status;
+}
+
 /**
  * ice_aq_set_link_restart_an
  * @pi: pointer to the port information structure
@@ -1470,6 +1503,33 @@ ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 	return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd);
 }
 
+/**
+ * ice_aq_set_event_mask - send the Set event mask admin queue command
+ * @hw: pointer to the hw struct
+ * @port_num: port number of the physical function
+ * @mask: event mask to be set
+ * @cd: pointer to command details structure or NULL
+ *
+ * Set event mask (0x0613)
+ *
+ * Fills a default direct command descriptor for the opcode, stores @port_num
+ * and the little endian form of @mask in it and sends it without a data
+ * buffer. Returns the status of ice_aq_send_cmd().
+ */
+enum ice_status
+ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask,
+		      struct ice_sq_cd *cd)
+{
+	struct ice_aqc_set_event_mask *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.set_event_mask;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask);
+
+	cmd->lport_num = port_num;
+
+	cmd->event_mask = cpu_to_le16(mask);
+
+	return ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+}
+
 /**
  * __ice_aq_get_set_rss_lut
  * @hw: pointer to the hardware structure
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 3e33a47cb61a..2921f3c6ce4b 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -34,6 +34,8 @@ enum ice_status
 ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 		  struct ice_rq_event_info *e, u16 *pending);
 enum ice_status
+ice_get_link_status(struct ice_port_info *pi, bool *link_up);
+enum ice_status
 ice_acquire_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 		enum ice_aq_res_access_type access);
 void ice_release_res(struct ice_hw *hw, enum ice_aq_res_ids res);
@@ -80,6 +82,9 @@ enum ice_status
 ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
 		     struct ice_link_status *link, struct ice_sq_cd *cd);
 enum ice_status
+ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask,
+		      struct ice_sq_cd *cd);
+enum ice_status
 ice_dis_vsi_txq(struct ice_port_info *pi, u8 num_queues, u16 *q_ids,
 		u32 *q_teids, struct ice_sq_cd *cmd_details);
 enum ice_status
diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
index 0d24ec3ca975..c371043c8946 100644
--- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
+++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h
@@ -99,6 +99,8 @@
 #define GLGEN_RSTCTL			0x000B8180
 #define GLGEN_RSTCTL_GRSTDEL_S		0
 #define GLGEN_RSTCTL_GRSTDEL_M		ICE_M(0x3F, GLGEN_RSTCTL_GRSTDEL_S)
+#define GLGEN_RSTAT_RESET_TYPE_S	2
+#define GLGEN_RSTAT_RESET_TYPE_M	ICE_M(0x3, GLGEN_RSTAT_RESET_TYPE_S)
 #define GLGEN_RTRIG			0x000B8190
 #define GLGEN_RTRIG_CORER_S		0
 #define GLGEN_RTRIG_CORER_M		BIT(GLGEN_RTRIG_CORER_S)
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 8eef9a4c1d13..90f44be5f858 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -43,6 +43,8 @@ MODULE_PARM_DESC(debug, "netif level (0=none,...,16=all)");
 static struct workqueue_struct *ice_wq;
 static const struct net_device_ops ice_netdev_ops;
 
+static void ice_pf_dis_all_vsi(struct ice_pf *pf);
+static void ice_rebuild(struct ice_pf *pf);
 static int ice_vsi_release(struct ice_vsi *vsi);
 static void ice_update_vsi_stats(struct ice_vsi *vsi);
 static void ice_update_pf_stats(struct ice_pf *pf);
@@ -230,6 +232,132 @@ static void ice_free_fltr_list(struct device *dev, struct list_head *h)
 	}
 }
 
+/**
+ * ice_is_reset_recovery_pending - check if a reset/recovery cycle is pending
+ * @state: pf state field
+ *
+ * Returns true if the __ICE_RESET_RECOVERY_PENDING bit is set in @state.
+ */
+static bool ice_is_reset_recovery_pending(unsigned long int *state)
+{
+	return test_bit(__ICE_RESET_RECOVERY_PENDING, state);
+}
+
+/**
+ * ice_prepare_for_reset - prep for the core to reset
+ * @pf: board private structure
+ *
+ * Inform or close all dependent features in prep for reset.
+ *
+ * In order: calls ice_remove_vsi_fltr() for every allocated VSI, disables
+ * all VSIs, zeroes the vsi_num of every allocated VSI and shuts down all
+ * control queues.
+ */
+static void
+ice_prepare_for_reset(struct ice_pf *pf)
+{
+	struct ice_hw *hw = &pf->hw;
+	u32 v;
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v])
+			ice_remove_vsi_fltr(hw, pf->vsi[v]->vsi_num);
+
+	dev_dbg(&pf->pdev->dev, "Tearing down internal switch for reset\n");
+
+	/* disable the VSIs and their queues that are not already DOWN */
+	/* pf_dis_all_vsi modifies netdev structures - rtnl_lock is needed;
+	 * ice_reset_subtask() takes it before reaching this function
+	 */
+	ice_pf_dis_all_vsi(pf);
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v])
+			pf->vsi[v]->vsi_num = 0;
+
+	ice_shutdown_all_ctrlq(hw);
+}
+
+/**
+ * ice_do_reset - Initiate one of many types of resets
+ * @pf: board private structure
+ * @reset_type: reset type requested before this function was called
+ *
+ * For ICE_RESET_PFR the prepare, reset and rebuild steps all run here. For
+ * any other reset type only the reset is triggered. If ice_reset() fails,
+ * __ICE_RESET_FAILED is set and __ICE_RESET_RECOVERY_PENDING is cleared.
+ * Must not be called from interrupt context (see the WARN_ON below).
+ */
+static void ice_do_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
+{
+	struct device *dev = &pf->pdev->dev;
+	struct ice_hw *hw = &pf->hw;
+
+	dev_dbg(dev, "reset_type 0x%x requested\n", reset_type);
+	WARN_ON(in_interrupt());
+
+	/* PFR is a bit of a special case because it doesn't result in an OICR
+	 * interrupt. So for PFR, we prepare for reset, issue the reset and
+	 * rebuild sequentially.
+	 */
+	if (reset_type == ICE_RESET_PFR) {
+		set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		ice_prepare_for_reset(pf);
+	}
+
+	/* trigger the reset; on failure no rebuild is attempted */
+	if (ice_reset(hw, reset_type)) {
+		dev_err(dev, "reset %d failed\n", reset_type);
+		set_bit(__ICE_RESET_FAILED, pf->state);
+		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		return;
+	}
+
+	if (reset_type == ICE_RESET_PFR) {
+		pf->pfr_count++;
+		ice_rebuild(pf);
+		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+	}
+}
+
+/**
+ * ice_reset_subtask - Set up for resetting the device and driver
+ * @pf: board private structure
+ *
+ * Runs with rtnl_lock held for its whole body. If a reset/recovery cycle is
+ * already pending it is finished here (prepare, check, rebuild). Otherwise
+ * at most one requested reset is started, GLOBR taking priority over CORER
+ * and CORER over PFR.
+ *
+ * NOTE(review): only __ICE_GLOBR_RECV and __ICE_CORER_RECV are cleared here;
+ * nothing in this function clears __ICE_EMPR_RECV - confirm whether that is
+ * intended.
+ */
+static void ice_reset_subtask(struct ice_pf *pf)
+{
+	enum ice_reset_req reset_type;
+
+	rtnl_lock();
+
+	/* When a CORER/GLOBR/EMPR is about to happen, the hardware triggers an
+	 * OICR interrupt. The OICR handler (ice_misc_intr) determines what
+	 * type of reset happened and sets __ICE_RESET_RECOVERY_PENDING bit in
+	 * pf->state. So if reset/recovery is pending (as indicated by this bit)
+	 * we do a rebuild and return.
+	 */
+	if (ice_is_reset_recovery_pending(pf->state)) {
+		clear_bit(__ICE_GLOBR_RECV, pf->state);
+		clear_bit(__ICE_CORER_RECV, pf->state);
+		ice_prepare_for_reset(pf);
+
+		/* make sure we are ready to rebuild; a non-zero result from
+		 * ice_check_reset() marks the reset as failed instead
+		 */
+		if (ice_check_reset(&pf->hw))
+			set_bit(__ICE_RESET_FAILED, pf->state);
+		else
+			ice_rebuild(pf);
+		clear_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		goto unlock;
+	}
+
+	/* No pending resets to finish processing. Check for new resets */
+	if (test_and_clear_bit(__ICE_GLOBR_REQ, pf->state))
+		reset_type = ICE_RESET_GLOBR;
+	else if (test_and_clear_bit(__ICE_CORER_REQ, pf->state))
+		reset_type = ICE_RESET_CORER;
+	else if (test_and_clear_bit(__ICE_PFR_REQ, pf->state))
+		reset_type = ICE_RESET_PFR;
+	else
+		goto unlock;
+
+	/* reset if not already down or resetting; the request bit has already
+	 * been cleared above, so a request seen while the PF is down or busy
+	 * is dropped rather than retried
+	 */
+	if (!test_bit(__ICE_DOWN, pf->state) &&
+	    !test_bit(__ICE_CFG_BUSY, pf->state)) {
+		ice_do_reset(pf, reset_type);
+	}
+
+unlock:
+	rtnl_unlock();
+}
+
 /**
  * ice_watchdog_subtask - periodic tasks not using event driven scheduling
  * @pf: board private structure
@@ -328,6 +456,144 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup)
 		    speed, fc);
 }
 
+/**
+ * ice_init_link_events - enable/initialize link events
+ * @pi: pointer to the port_info instance
+ *
+ * Sends an event mask with every bit set except UPDOWN, MEDIA_NA and
+ * MODULE_QUAL_FAIL, then calls ice_aq_get_link_info() with ena_lse set to
+ * true. NOTE(review): presumably a set mask bit suppresses that event, so
+ * only the three listed events are reported - confirm against the AQ spec.
+ *
+ * Returns -EIO on failure, 0 on success
+ */
+static int ice_init_link_events(struct ice_port_info *pi)
+{
+	u16 mask;
+
+	mask = ~((u16)(ICE_AQ_LINK_EVENT_UPDOWN | ICE_AQ_LINK_EVENT_MEDIA_NA |
+		       ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL));
+
+	if (ice_aq_set_event_mask(pi->hw, pi->lport, mask, NULL)) {
+		dev_dbg(ice_hw_to_dev(pi->hw),
+			"Failed to set link event mask for port %d\n",
+			pi->lport);
+		return -EIO;
+	}
+
+	if (ice_aq_get_link_info(pi, true, NULL, NULL)) {
+		dev_dbg(ice_hw_to_dev(pi->hw),
+			"Failed to enable link events for port %d\n",
+			pi->lport);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vsi_link_event - update the vsi's netdev
+ * @vsi: the vsi on which the link event occurred
+ * @link_up: whether or not the vsi needs to be set up or down
+ *
+ * Does nothing for a NULL @vsi, a VSI whose __ICE_DOWN bit is set, a VSI
+ * that is not of type ICE_VSI_PF, or a PF VSI without a netdev. Otherwise
+ * the carrier is switched on/off and all Tx queues are woken/stopped to
+ * match @link_up.
+ */
+static void ice_vsi_link_event(struct ice_vsi *vsi, bool link_up)
+{
+	if (!vsi || test_bit(__ICE_DOWN, vsi->state))
+		return;
+
+	if (vsi->type == ICE_VSI_PF) {
+		if (!vsi->netdev) {
+			dev_dbg(&vsi->back->pdev->dev,
+				"vsi->netdev is not initialized!\n");
+			return;
+		}
+		if (link_up) {
+			netif_carrier_on(vsi->netdev);
+			netif_tx_wake_all_queues(vsi->netdev);
+		} else {
+			netif_carrier_off(vsi->netdev);
+			netif_tx_stop_all_queues(vsi->netdev);
+		}
+	}
+}
+
+/**
+ * ice_link_event - process the link event
+ * @pf: pf that the link event is associated with
+ * @pi: port_info for the port that the link event is associated with
+ *
+ * Saves the current link info as the old link info, forces a refresh through
+ * ice_get_link_status() and then walks every VSI of @pf. VSIs on the same
+ * logical port get the link message printed and their netdev updated, unless
+ * link state and speed are unchanged and the VSI is down, has no netdev, or
+ * its carrier state already matches.
+ *
+ * Returns -EIO if ice_get_link_status() fails
+ * Returns 0 on success
+ */
+static int
+ice_link_event(struct ice_pf *pf, struct ice_port_info *pi)
+{
+	u8 new_link_speed, old_link_speed;
+	struct ice_phy_info *phy_info;
+	bool new_link_same_as_old;
+	bool new_link, old_link;
+	u8 lport;
+	u16 v;
+
+	phy_info = &pi->phy;
+	phy_info->link_info_old = phy_info->link_info;
+	/* Force ice_get_link_status() to update link info */
+	phy_info->get_link_info = true;
+
+	old_link = (phy_info->link_info_old.link_info & ICE_AQ_LINK_UP);
+	old_link_speed = phy_info->link_info_old.link_speed;
+
+	lport = pi->lport;
+	if (ice_get_link_status(pi, &new_link)) {
+		dev_dbg(&pf->pdev->dev,
+			"Could not get link status for port %d\n", lport);
+		return -EIO;
+	}
+
+	new_link_speed = phy_info->link_info.link_speed;
+
+	new_link_same_as_old = (new_link == old_link &&
+				new_link_speed == old_link_speed);
+
+	ice_for_each_vsi(pf, v) {
+		struct ice_vsi *vsi = pf->vsi[v];
+
+		if (!vsi || !vsi->port_info)
+			continue;
+
+		/* Nothing changed: skip VSIs that are down, that have no
+		 * netdev (netif_carrier_ok() would dereference NULL), or
+		 * whose carrier state already reflects the link state.
+		 */
+		if (new_link_same_as_old &&
+		    (test_bit(__ICE_DOWN, vsi->state) ||
+		     !vsi->netdev ||
+		     new_link == netif_carrier_ok(vsi->netdev)))
+			continue;
+
+		if (vsi->port_info->lport == lport) {
+			ice_print_link_msg(vsi, new_link);
+			ice_vsi_link_event(vsi, new_link);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ice_handle_link_event - handle link event via ARQ
+ * @pf: pf that the link event is associated with
+ *
+ * Return -EINVAL if port_info is null
+ * Otherwise return the result of ice_link_event() (0 on success)
+ */
+static int ice_handle_link_event(struct ice_pf *pf)
+{
+	struct ice_port_info *port_info;
+	int status;
+
+	port_info = pf->hw.port_info;
+	if (!port_info)
+		return -EINVAL;
+
+	status = ice_link_event(pf, port_info);
+	if (status)
+		dev_dbg(&pf->pdev->dev,
+			"Could not process link event, error %d\n", status);
+
+	return status;
+}
+
 /**
  * __ice_clean_ctrlq - helper function to clean controlq rings
  * @pf: ptr to struct ice_pf
@@ -342,6 +608,10 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type)
 	const char *qtype;
 	u32 oldval, val;
 
+	/* Do not clean control queue if/when PF reset fails */
+	if (test_bit(__ICE_RESET_FAILED, pf->state))
+		return 0;
+
 	switch (q_type) {
 	case ICE_CTL_Q_ADMIN:
 		cq = &hw->adminq;
@@ -408,6 +678,7 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type)
 
 	do {
 		enum ice_status ret;
+		u16 opcode;
 
 		ret = ice_clean_rq_elem(hw, cq, &event, &pending);
 		if (ret == ICE_ERR_AQ_NO_WORK)
@@ -418,6 +689,21 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type)
 				ret);
 			break;
 		}
+
+		opcode = le16_to_cpu(event.desc.opcode);
+
+		switch (opcode) {
+		case ice_aqc_opc_get_link_status:
+			if (ice_handle_link_event(pf))
+				dev_err(&pf->pdev->dev,
+					"Could not handle link event");
+			break;
+		default:
+			dev_dbg(&pf->pdev->dev,
+				"%s Receive Queue unknown event 0x%04x ignored\n",
+				qtype, opcode);
+			break;
+		}
 	} while (pending && (i++ < ICE_DFLT_IRQ_WORK));
 
 	devm_kfree(&pf->pdev->dev, event.msg_buf);
@@ -497,6 +783,17 @@ static void ice_service_task(struct work_struct *work)
 	unsigned long start_time = jiffies;
 
 	/* subtasks */
+
+	/* process reset requests first */
+	ice_reset_subtask(pf);
+
+	/* bail if a reset/recovery cycle is pending */
+	if (ice_is_reset_recovery_pending(pf->state) ||
+	    test_bit(__ICE_SUSPENDED, pf->state)) {
+		ice_service_task_complete(pf);
+		return;
+	}
+
 	ice_watchdog_subtask(pf);
 	ice_clean_adminq_subtask(pf);
 
@@ -1222,6 +1519,37 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 	if (!(oicr & PFINT_OICR_INTEVENT_M))
 		goto ena_intr;
 
+	if (oicr & PFINT_OICR_GRST_M) {
+		u32 reset;
+		/* we have a reset warning */
+		ena_mask &= ~PFINT_OICR_GRST_M;
+		reset = (rd32(hw, GLGEN_RSTAT) & GLGEN_RSTAT_RESET_TYPE_M) >>
+			GLGEN_RSTAT_RESET_TYPE_S;
+
+		if (reset == ICE_RESET_CORER)
+			pf->corer_count++;
+		else if (reset == ICE_RESET_GLOBR)
+			pf->globr_count++;
+		else
+			pf->empr_count++;
+
+		/* If a reset cycle isn't already in progress, we set a bit in
+		 * pf->state so that the service task can start a reset/rebuild.
+		 * We also make note of which reset happened so that peer
+		 * devices/drivers can be informed.
+		 */
+		if (!test_bit(__ICE_RESET_RECOVERY_PENDING, pf->state)) {
+			if (reset == ICE_RESET_CORER)
+				set_bit(__ICE_CORER_RECV, pf->state);
+			else if (reset == ICE_RESET_GLOBR)
+				set_bit(__ICE_GLOBR_RECV, pf->state);
+			else
+				set_bit(__ICE_EMPR_RECV, pf->state);
+
+			set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+		}
+	}
+
 	if (oicr & PFINT_OICR_HMC_ERR_M) {
 		ena_mask &= ~PFINT_OICR_HMC_ERR_M;
 		dev_dbg(&pf->pdev->dev,
@@ -1240,9 +1568,10 @@ static irqreturn_t ice_misc_intr(int __always_unused irq, void *data)
 		 */
 		if (oicr & (PFINT_OICR_PE_CRITERR_M |
 			    PFINT_OICR_PCI_EXCEPTION_M |
-			    PFINT_OICR_ECC_ERR_M))
+			    PFINT_OICR_ECC_ERR_M)) {
 			set_bit(__ICE_PFR_REQ, pf->state);
-
+			ice_service_task_schedule(pf);
+		}
 		ena_mask &= ~oicr;
 	}
 	ret = IRQ_HANDLED;
@@ -1499,6 +1828,13 @@ static int ice_req_irq_msix_misc(struct ice_pf *pf)
 			 dev_driver_string(&pf->pdev->dev),
 			 dev_name(&pf->pdev->dev));
 
+	/* Do not request IRQ but do enable OICR interrupt since settings are
+	 * lost during reset. Note that this function is called only during
+	 * rebuild path and not while reset is in progress.
+	 */
+	if (ice_is_reset_recovery_pending(pf->state))
+		goto skip_req_irq;
+
 	/* reserve one vector in irq_tracker for misc interrupts */
 	oicr_idx = ice_get_res(pf, pf->irq_tracker, 1, ICE_RES_MISC_VEC_ID);
 	if (oicr_idx < 0)
@@ -1517,6 +1853,7 @@ static int ice_req_irq_msix_misc(struct ice_pf *pf)
 		return err;
 	}
 
+skip_req_irq:
 	ice_ena_misc_vector(pf);
 
 	val = (pf->oicr_idx & PFINT_OICR_CTL_MSIX_INDX_M) |
@@ -2084,6 +2421,100 @@ static int ice_vsi_cfg_rss(struct ice_vsi *vsi)
 	return err;
 }
 
+/**
+ * ice_vsi_reinit_setup - return resource and reallocate resource for a VSI
+ * @vsi: pointer to the ice_vsi
+ *
+ * This reallocates the VSIs queue resources
+ *
+ * The VSI's q_vectors, its irq_tracker reservation, rings and arrays are
+ * released first. The VSI is then added again and, for a PF VSI, its netdev
+ * (only if none exists yet), q_vectors, vector base and rings are set up
+ * before ice_cfg_vsi_lan() is called with the enabled TC bitmap.
+ *
+ * On failure the VSI is cleared and __ICE_RESET_FAILED is set in the PF state.
+ * NOTE(review): the err_rings path unregisters and frees vsi->netdev whenever
+ * it is set, including a netdev that already existed on entry - confirm this
+ * is intended.
+ *
+ * Returns 0 on success and negative value on failure
+ */
+static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
+{
+	u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 };
+	int ret, i;
+
+	if (!vsi)
+		return -EINVAL;
+
+	ice_vsi_free_q_vectors(vsi);
+	ice_free_res(vsi->back->irq_tracker, vsi->base_vector, vsi->idx);
+	vsi->base_vector = 0;
+	ice_vsi_clear_rings(vsi);
+	ice_vsi_free_arrays(vsi, false);
+	ice_vsi_set_num_qs(vsi);
+
+	/* Initialize VSI struct elements and create VSI in FW */
+	ret = ice_vsi_add(vsi);
+	if (ret < 0)
+		goto err_vsi;
+
+	ret = ice_vsi_alloc_arrays(vsi, false);
+	if (ret < 0)
+		goto err_vsi;
+
+	switch (vsi->type) {
+	case ICE_VSI_PF:
+		if (!vsi->netdev) {
+			ret = ice_cfg_netdev(vsi);
+			if (ret)
+				goto err_rings;
+
+			ret = register_netdev(vsi->netdev);
+			if (ret)
+				goto err_rings;
+
+			netif_carrier_off(vsi->netdev);
+			netif_tx_stop_all_queues(vsi->netdev);
+		}
+
+		ret = ice_vsi_alloc_q_vectors(vsi);
+		if (ret)
+			goto err_rings;
+
+		ret = ice_vsi_setup_vector_base(vsi);
+		if (ret)
+			goto err_vectors;
+
+		ret = ice_vsi_alloc_rings(vsi);
+		if (ret)
+			goto err_vectors;
+
+		ice_vsi_map_rings_to_vectors(vsi);
+		break;
+	default:
+		break;
+	}
+
+	ice_vsi_set_tc_cfg(vsi);
+
+	/* configure VSI nodes based on number of queues and TC's; every one
+	 * of the first numtc entries is given the VSI's full Tx queue count
+	 */
+	for (i = 0; i < vsi->tc_cfg.numtc; i++)
+		max_txqs[i] = vsi->num_txq;
+
+	ret = ice_cfg_vsi_lan(vsi->port_info, vsi->vsi_num,
+			      vsi->tc_cfg.ena_tc, max_txqs);
+	if (ret) {
+		dev_info(&vsi->back->pdev->dev,
+			 "Failed VSI lan queue config\n");
+		goto err_vectors;
+	}
+	return 0;
+
+err_vectors:
+	ice_vsi_free_q_vectors(vsi);
+err_rings:
+	if (vsi->netdev) {
+		unregister_netdev(vsi->netdev);
+		free_netdev(vsi->netdev);
+		vsi->netdev = NULL;
+	}
+err_vsi:
+	ice_vsi_clear(vsi);
+	set_bit(__ICE_RESET_FAILED, vsi->back->state);
+	return ret;
+}
+
 /**
  * ice_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
@@ -2359,10 +2790,17 @@ static int ice_setup_pf_sw(struct ice_pf *pf)
 	struct ice_vsi *vsi;
 	int status = 0;
 
-	vsi = ice_vsi_setup(pf, ICE_VSI_PF, pf->hw.port_info);
-	if (!vsi) {
-		status = -ENOMEM;
-		goto error_exit;
+	if (!ice_is_reset_recovery_pending(pf->state)) {
+		vsi = ice_vsi_setup(pf, ICE_VSI_PF, pf->hw.port_info);
+		if (!vsi) {
+			status = -ENOMEM;
+			goto error_exit;
+		}
+	} else {
+		vsi = pf->vsi[0];
+		status = ice_vsi_reinit_setup(vsi);
+		if (status < 0)
+			return -EIO;
 	}
 
 	/* tmp_add_list contains a list of MAC addresses for which MAC
@@ -2751,6 +3189,12 @@ static int ice_probe(struct pci_dev *pdev,
 	/* since everything is good, start the service timer */
 	mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period));
 
+	err = ice_init_link_events(pf->hw.port_info);
+	if (err) {
+		dev_err(&pdev->dev, "ice_init_link_events failed: %d\n", err);
+		goto err_alloc_sw_unroll;
+	}
+
 	return 0;
 
 err_alloc_sw_unroll:
@@ -4231,6 +4675,131 @@ static int ice_vsi_release(struct ice_vsi *vsi)
 	return 0;
 }
 
+/**
+ * ice_dis_vsi - pause a VSI
+ * @vsi: the VSI being paused
+ *
+ * Does nothing if the VSI's __ICE_DOWN bit is already set. Otherwise
+ * __ICE_NEEDS_RESTART is set in the VSI state, the netdev's ndo_stop is
+ * called for a running PF VSI netdev, and the VSI is closed.
+ */
+static void ice_dis_vsi(struct ice_vsi *vsi)
+{
+	if (test_bit(__ICE_DOWN, vsi->state))
+		return;
+
+	set_bit(__ICE_NEEDS_RESTART, vsi->state);
+
+	if (vsi->netdev && netif_running(vsi->netdev) &&
+	    vsi->type == ICE_VSI_PF)
+		vsi->netdev->netdev_ops->ndo_stop(vsi->netdev);
+
+	ice_vsi_close(vsi);
+}
+
+/**
+ * ice_ena_vsi - resume a VSI
+ * @vsi: the VSI being resumed
+ *
+ * Only acts on a VSI whose __ICE_NEEDS_RESTART bit is set; the bit is
+ * cleared by the test. A running netdev is reopened through its ndo_open,
+ * otherwise ice_vsi_open() is called and a failure is logged.
+ * NOTE(review): the return value of ndo_open is ignored here.
+ */
+static void ice_ena_vsi(struct ice_vsi *vsi)
+{
+	if (!test_and_clear_bit(__ICE_NEEDS_RESTART, vsi->state))
+		return;
+
+	if (vsi->netdev && netif_running(vsi->netdev))
+		vsi->netdev->netdev_ops->ndo_open(vsi->netdev);
+	else if (ice_vsi_open(vsi))
+		/* this clears the DOWN bit */
+		dev_dbg(&vsi->back->pdev->dev, "Failed open VSI 0x%04X on switch 0x%04X\n",
+			vsi->vsi_num, vsi->vsw->sw_id);
+}
+
+/**
+ * ice_pf_dis_all_vsi - Pause all VSIs on a PF
+ * @pf: the PF
+ *
+ * Calls ice_dis_vsi() on every non-NULL entry of pf->vsi[].
+ */
+static void ice_pf_dis_all_vsi(struct ice_pf *pf)
+{
+	int v;
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v])
+			ice_dis_vsi(pf->vsi[v]);
+}
+
+/**
+ * ice_pf_ena_all_vsi - Resume all VSIs on a PF
+ * @pf: the PF
+ *
+ * Calls ice_ena_vsi() on every non-NULL entry of pf->vsi[].
+ */
+static void ice_pf_ena_all_vsi(struct ice_pf *pf)
+{
+	int v;
+
+	ice_for_each_vsi(pf, v)
+		if (pf->vsi[v])
+			ice_ena_vsi(pf->vsi[v]);
+}
+
+/**
+ * ice_rebuild - rebuild after reset
+ * @pf: pf to rebuild
+ *
+ * Reinitializes the control queues, clears the PF configuration and PXE
+ * mode, reads the capabilities, sets up the PF switch, re-enables the misc
+ * vector when MSI-X is in use and finally restarts the VSIs. The rebuild is
+ * skipped when the PF is down. Any failing step shuts the control queues
+ * down again and sets __ICE_RESET_FAILED.
+ *
+ * NOTE(review): the clear_recovery label sets __ICE_RESET_RECOVERY_PENDING
+ * rather than clearing it; ice_do_reset() and ice_reset_subtask() both clear
+ * the bit right after this function returns - confirm which is intended.
+ */
+static void ice_rebuild(struct ice_pf *pf)
+{
+	struct device *dev = &pf->pdev->dev;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status ret;
+	int err;
+
+	if (test_bit(__ICE_DOWN, pf->state))
+		goto clear_recovery;
+
+	dev_dbg(dev, "rebuilding pf\n");
+
+	ret = ice_init_all_ctrlq(hw);
+	if (ret) {
+		dev_err(dev, "control queues init failed %d\n", ret);
+		goto fail_reset;
+	}
+
+	ret = ice_clear_pf_cfg(hw);
+	if (ret) {
+		dev_err(dev, "clear PF configuration failed %d\n", ret);
+		goto fail_reset;
+	}
+
+	ice_clear_pxe_mode(hw);
+
+	ret = ice_get_caps(hw);
+	if (ret) {
+		dev_err(dev, "ice_get_caps failed %d\n", ret);
+		goto fail_reset;
+	}
+
+	/* basic nic switch setup */
+	err = ice_setup_pf_sw(pf);
+	if (err) {
+		dev_err(dev, "ice_setup_pf_sw failed\n");
+		goto fail_reset;
+	}
+
+	/* start misc vector */
+	if (test_bit(ICE_FLAG_MSIX_ENA, pf->flags)) {
+		err = ice_req_irq_msix_misc(pf);
+		if (err) {
+			dev_err(dev, "misc vector setup failed: %d\n", err);
+			goto fail_reset;
+		}
+	}
+
+	/* restart the VSIs that were rebuilt and running before the reset */
+	ice_pf_ena_all_vsi(pf);
+
+	return;
+
+fail_reset:
+	ice_shutdown_all_ctrlq(hw);
+	set_bit(__ICE_RESET_FAILED, pf->state);
+clear_recovery:
+	set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
+}
+
 /**
  * ice_set_rss - Set RSS keys and lut
  * @vsi: Pointer to VSI structure
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index ce091e83b60a..100cb3cf8364 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -31,6 +31,7 @@ static inline bool ice_is_tc_ena(u8 bitmap, u8 tc)
 
 /* debug masks - set these bits in hw->debug_mask to control output */
 #define ICE_DBG_INIT		BIT_ULL(1)
+#define ICE_DBG_LINK		BIT_ULL(4)
 #define ICE_DBG_QCTX		BIT_ULL(6)
 #define ICE_DBG_NVM		BIT_ULL(7)
 #define ICE_DBG_LAN		BIT_ULL(8)
-- 
2.14.3

^ permalink raw reply related

* [PATCH v2 15/15] ice: Implement filter sync, NDO operations and bump version
From: Anirudh Venkataramanan @ 2018-03-15 23:48 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev
In-Reply-To: <20180315234802.31336-1-anirudh.venkataramanan@intel.com>

This patch implements multiple pieces of functionality:

1. Added ice_vsi_sync_filters, which is called through the service task
   to push filter updates to the hardware.

2. Add support to enable/disable promiscuous mode on an interface.
   Enabling/disabling promiscuous mode on an interface results in
   addition/removal of a promisc filter rule through ice_vsi_sync_filters.

3. Implement handlers for ndo_set_mac_address, ndo_change_mtu,
   ndo_poll_controller and ndo_set_rx_mode.

This patch also marks the end of the driver addition by bumping up the
driver version.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h            |  14 +
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |  21 +
 drivers/net/ethernet/intel/ice/ice_common.c     |  28 ++
 drivers/net/ethernet/intel/ice/ice_common.h     |   3 +
 drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h  |  12 +
 drivers/net/ethernet/intel/ice/ice_main.c       | 567 +++++++++++++++++++++++-
 drivers/net/ethernet/intel/ice/ice_switch.c     |  77 ++++
 drivers/net/ethernet/intel/ice/ice_switch.h     |   2 +
 drivers/net/ethernet/intel/ice/ice_type.h       |   5 +
 9 files changed, 728 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 6d7d03b80dbf..9bb8a99b929e 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -139,11 +139,20 @@ enum ice_state {
 	__ICE_SUSPENDED,		/* set on module remove path */
 	__ICE_RESET_FAILED,		/* set by reset/rebuild */
 	__ICE_ADMINQ_EVENT_PENDING,
+	__ICE_FLTR_OVERFLOW_PROMISC,
 	__ICE_CFG_BUSY,
 	__ICE_SERVICE_SCHED,
 	__ICE_STATE_NBITS		/* must be last */
 };
 
+enum ice_vsi_flags {
+	ICE_VSI_FLAG_UMAC_FLTR_CHANGED,
+	ICE_VSI_FLAG_MMAC_FLTR_CHANGED,
+	ICE_VSI_FLAG_VLAN_FLTR_CHANGED,
+	ICE_VSI_FLAG_PROMISC_CHANGED,
+	ICE_VSI_FLAG_NBITS		/* must be last; it sizes the 'flags'
+					 * bitmap declared in struct ice_vsi
+					 */
+};
+
 /* struct that defines a VSI, associated with a dev */
 struct ice_vsi {
 	struct net_device *netdev;
@@ -158,7 +167,9 @@ struct ice_vsi {
 
 	u64 tx_linearize;
 	DECLARE_BITMAP(state, __ICE_STATE_NBITS);
+	DECLARE_BITMAP(flags, ICE_VSI_FLAG_NBITS);
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	unsigned int current_netdev_flags;
 	u32 tx_restart;
 	u32 tx_busy;
 	u32 rx_buf_failed;
@@ -189,6 +200,9 @@ struct ice_vsi {
 	struct ice_eth_stats eth_stats;
 	struct ice_eth_stats eth_stats_prev;
 
+	struct list_head tmp_sync_list;		/* MAC filters to be synced */
+	struct list_head tmp_unsync_list;	/* MAC filters to be unsynced */
+
 	bool irqs_ready;
 	bool current_isup;		 /* Sync 'link up' logging */
 	bool stat_offsets_loaded;
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 8cade22c1cf6..fc19c287ebc5 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -149,6 +149,24 @@ struct ice_aqc_manage_mac_read_resp {
 	u8 mac_addr[ETH_ALEN];
 };
 
+/* Manage MAC address, write command - direct (0x0108)
+ * Filled in by ice_aq_manage_mac_write, which sets flags, sah and sal.
+ */
+struct ice_aqc_manage_mac_write {
+	u8 port_num;
+	u8 flags;	/* ICE_AQC_MAN_MAC_* values defined below */
+#define ICE_AQC_MAN_MAC_WR_MC_MAG_EN		BIT(0)
+#define ICE_AQC_MAN_MAC_WR_WOL_LAA_PFR_KEEP	BIT(1)
+#define ICE_AQC_MAN_MAC_WR_S		6
+#define ICE_AQC_MAN_MAC_WR_M		(3 << ICE_AQC_MAN_MAC_WR_S)
+#define ICE_AQC_MAN_MAC_UPDATE_LAA	0
+#define ICE_AQC_MAN_MAC_UPDATE_LAA_WOL	(BIT(0) << ICE_AQC_MAN_MAC_WR_S)
+	/* High 16 bits of MAC address in big endian order */
+	__be16 sah;
+	/* Low 32 bits of MAC address in big endian order */
+	__be32 sal;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
 /* Clear PXE Command and response (direct 0x0110) */
 struct ice_aqc_clear_pxe {
 	u8 rx_cnt;
@@ -1228,6 +1246,7 @@ struct ice_aq_desc {
 		struct ice_aqc_q_shutdown q_shutdown;
 		struct ice_aqc_req_res res_owner;
 		struct ice_aqc_manage_mac_read mac_read;
+		struct ice_aqc_manage_mac_write mac_write;
 		struct ice_aqc_clear_pxe clear_pxe;
 		struct ice_aqc_list_caps get_cap;
 		struct ice_aqc_get_phy_caps get_phy;
@@ -1272,6 +1291,7 @@ enum ice_aq_err {
 	ICE_AQ_RC_ENOMEM	= 9,  /* Out of memory */
 	ICE_AQ_RC_EBUSY		= 12, /* Device or resource busy */
 	ICE_AQ_RC_EEXIST	= 13, /* object already exists */
+	ICE_AQ_RC_ENOSPC	= 16, /* No space left or allocation failure */
 };
 
 /* Admin Queue command opcodes */
@@ -1290,6 +1310,7 @@ enum ice_adminq_opc {
 
 	/* manage MAC address */
 	ice_aqc_opc_manage_mac_read			= 0x0107,
+	ice_aqc_opc_manage_mac_write			= 0x0108,
 
 	/* PXE */
 	ice_aqc_opc_clear_pxe_mode			= 0x0110,
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index 316262b3c7a0..a36220792026 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -1246,6 +1246,34 @@ enum ice_status ice_get_caps(struct ice_hw *hw)
 	return status;
 }
 
+/**
+ * ice_aq_manage_mac_write - manage MAC address write command
+ * @hw: pointer to the hw struct
+ * @mac_addr: MAC address to be written as LAA/LAA+WoL/Port address
+ * @flags: flags to control write behavior
+ * @cd: pointer to command details structure or NULL
+ *
+ * This function is used to write MAC address to the NVM (0x0108).
+ *
+ * The address is placed in the descriptor as big endian values built from
+ * the individual bytes of @mac_addr, so the result depends neither on host
+ * endianness nor on the alignment of @mac_addr.
+ *
+ * Returns the status of ice_aq_send_cmd.
+ */
+enum ice_status
+ice_aq_manage_mac_write(struct ice_hw *hw, u8 *mac_addr, u8 flags,
+			struct ice_sq_cd *cd)
+{
+	struct ice_aqc_manage_mac_write *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.mac_write;
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write);
+
+	cmd->flags = flags;
+
+	/* Prep values for sah, sal: assemble the numeric high 16 and low 32
+	 * bits byte by byte (mac_addr[0] is the most significant byte) and
+	 * let htons/htonl store them in big endian order. Reading the address
+	 * through a u16/u32 pointer would byte-swap it on little endian hosts.
+	 */
+	cmd->sah = htons((u16)mac_addr[0] << 8 | mac_addr[1]);
+	cmd->sal = htonl((u32)mac_addr[2] << 24 | (u32)mac_addr[3] << 16 |
+			 (u32)mac_addr[4] << 8 | mac_addr[5]);
+
+	return ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+}
+
 /**
  * ice_aq_clear_pxe_mode
  * @hw: pointer to the hw struct
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 2921f3c6ce4b..8e87d3b95d5f 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -72,6 +72,9 @@ enum ice_status
 ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc,
 		void *buf, u16 buf_size, struct ice_sq_cd *cd);
 enum ice_status ice_aq_get_fw_ver(struct ice_hw *hw, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_manage_mac_write(struct ice_hw *hw, u8 *mac_addr, u8 flags,
+			struct ice_sq_cd *cd);
 enum ice_status ice_clear_pf_cfg(struct ice_hw *hw);
 enum ice_status
 ice_set_fc(struct ice_port_info *pi, u8 *aq_failures, bool atomic_restart);
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index b1f38624da21..67fb0a9ba0df 100644
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -367,6 +367,18 @@ enum ice_tx_desc_len_fields {
 	ICE_TX_DESC_LEN_L4_LEN_S	= 14 /* 4 BITS */
 };
 
+#define ICE_TXD_QW1_MACLEN_M (0x7FUL << ICE_TX_DESC_LEN_MACLEN_S)
+#define ICE_TXD_QW1_IPLEN_M  (0x7FUL << ICE_TX_DESC_LEN_IPLEN_S)
+#define ICE_TXD_QW1_L4LEN_M  (0xFUL << ICE_TX_DESC_LEN_L4_LEN_S)
+
+/* Tx descriptor field limits in bytes: the largest raw value each length
+ * field can hold (its mask shifted back down) multiplied by the field's
+ * unit, which is a word for MACLEN and a dword for IPLEN and L4LEN.
+ */
+#define ICE_TXD_MACLEN_MAX ((ICE_TXD_QW1_MACLEN_M >> \
+			     ICE_TX_DESC_LEN_MACLEN_S) * ICE_BYTES_PER_WORD)
+#define ICE_TXD_IPLEN_MAX ((ICE_TXD_QW1_IPLEN_M >> \
+			    ICE_TX_DESC_LEN_IPLEN_S) * ICE_BYTES_PER_DWORD)
+#define ICE_TXD_L4LEN_MAX ((ICE_TXD_QW1_L4LEN_M >> \
+			    ICE_TX_DESC_LEN_L4_LEN_S) * ICE_BYTES_PER_DWORD)
+
 #define ICE_TXD_QW1_TX_BUF_SZ_S	34
 #define ICE_TXD_QW1_L2TAG1_S	48
 
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 90f44be5f858..faffb6c8f05c 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -21,7 +21,7 @@
 
 #include "ice.h"
 
-#define DRV_VERSION	"ice-0.0.1-k"
+#define DRV_VERSION	"ice-0.7.0-k"
 #define DRV_SUMMARY	"Intel(R) Ethernet Connection E800 Series Linux Driver"
 const char ice_drv_ver[] = DRV_VERSION;
 static const char ice_driver_string[] = DRV_SUMMARY;
@@ -214,6 +214,48 @@ static int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list,
 	return 0;
 }
 
+/**
+ * ice_add_mac_to_sync_list - creates list of mac addresses to be synced
+ * @netdev: the net device on which the sync is happening
+ * @addr: mac address to sync
+ *
+ * Sync callback handed to the in-kernel address list helpers (such as
+ * __dev_uc_sync and __dev_mc_sync). It only queues @addr on the VSI's
+ * tmp_sync_list; the queued entries are later passed to ice_add_mac, which
+ * adds the mac filters to the hardware.
+ *
+ * Returns 0 on success, -EINVAL if the address could not be queued.
+ */
+static int ice_add_mac_to_sync_list(struct net_device *netdev, const u8 *addr)
+{
+	struct ice_netdev_priv *priv = netdev_priv(netdev);
+	struct ice_vsi *vsi = priv->vsi;
+	int ret;
+
+	ret = ice_add_mac_to_list(vsi, &vsi->tmp_sync_list, addr);
+
+	return ret ? -EINVAL : 0;
+}
+
+/**
+ * ice_add_mac_to_unsync_list - creates list of mac addresses to be unsynced
+ * @netdev: the net device on which the unsync is happening
+ * @addr: mac address to unsync
+ *
+ * Unsync callback handed to the in-kernel address list helpers (such as
+ * __dev_uc_sync and __dev_mc_sync). It only queues @addr on the VSI's
+ * tmp_unsync_list; the queued entries are later passed to ice_remove_mac,
+ * which deletes the mac filters from the hardware.
+ *
+ * Returns 0 on success, -EINVAL if the address could not be queued.
+ */
+static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr)
+{
+	struct ice_netdev_priv *priv = netdev_priv(netdev);
+	struct ice_vsi *vsi = priv->vsi;
+	int ret;
+
+	ret = ice_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr);
+
+	return ret ? -EINVAL : 0;
+}
+
 /**
  * ice_free_fltr_list - free filter lists helper
  * @dev: pointer to the device struct
@@ -232,6 +274,183 @@ static void ice_free_fltr_list(struct device *dev, struct list_head *h)
 	}
 }
 
+/**
+ * ice_vsi_fltr_changed - check if filter state changed
+ * @vsi: VSI to be checked
+ *
+ * Returns true if any of the UMAC, MMAC or VLAN "filter changed" bits is set
+ * in vsi->flags, false otherwise.
+ */
+static bool ice_vsi_fltr_changed(struct ice_vsi *vsi)
+{
+	if (test_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags))
+		return true;
+
+	if (test_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags))
+		return true;
+
+	return test_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags);
+}
+
+/**
+ * ice_vsi_sync_fltr - Update the VSI filter list to the HW
+ * @vsi: ptr to the VSI
+ *
+ * Push any outstanding VSI filter changes through the AdminQ.
+ *
+ * The __ICE_CFG_BUSY bit in vsi->state is held for the duration of the call,
+ * sleeping until it can be taken. MAC addresses reported by __dev_uc_sync and
+ * __dev_mc_sync are collected on temporary lists and then removed from or
+ * added to the hardware. The default VSI rules are updated afterwards when
+ * IFF_PROMISC changed, when the filter table overflowed, or when an earlier
+ * update is still pending. On failure the matching "changed" flags are set
+ * again so that a later call retries.
+ *
+ * Returns 0 on success, -EINVAL if the VSI has no netdev, -ENOMEM if filter
+ * removal failed with ICE_ERR_NO_MEMORY, and -EIO if adding filters or
+ * updating the default VSI rules failed. Other removal failures and the
+ * first out-of-space failure while adding are logged but not returned.
+ */
+static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
+{
+	struct device *dev = &vsi->back->pdev->dev;
+	struct net_device *netdev = vsi->netdev;
+	bool promisc_forced_on = false;
+	struct ice_pf *pf = vsi->back;
+	struct ice_hw *hw = &pf->hw;
+	enum ice_status status = 0;
+	u32 changed_flags = 0;
+	int err = 0;
+
+	if (!vsi->netdev)
+		return -EINVAL;
+
+	while (test_and_set_bit(__ICE_CFG_BUSY, vsi->state))
+		usleep_range(1000, 2000);
+
+	changed_flags = vsi->current_netdev_flags ^ vsi->netdev->flags;
+	vsi->current_netdev_flags = vsi->netdev->flags;
+
+	INIT_LIST_HEAD(&vsi->tmp_sync_list);
+	INIT_LIST_HEAD(&vsi->tmp_unsync_list);
+
+	if (ice_vsi_fltr_changed(vsi)) {
+		clear_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags);
+		clear_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags);
+		clear_bit(ICE_VSI_FLAG_VLAN_FLTR_CHANGED, vsi->flags);
+
+		/* grab the netdev's addr_list_lock */
+		netif_addr_lock_bh(netdev);
+		__dev_uc_sync(netdev, ice_add_mac_to_sync_list,
+			      ice_add_mac_to_unsync_list);
+		__dev_mc_sync(netdev, ice_add_mac_to_sync_list,
+			      ice_add_mac_to_unsync_list);
+		/* our temp lists are populated. release lock */
+		netif_addr_unlock_bh(netdev);
+	}
+
+	/* Remove mac addresses in the unsync list */
+	status = ice_remove_mac(hw, &vsi->tmp_unsync_list);
+	ice_free_fltr_list(dev, &vsi->tmp_unsync_list);
+	if (status) {
+		netdev_err(netdev, "Failed to delete MAC filters\n");
+		/* if we failed because of alloc failures, just bail */
+		if (status == ICE_ERR_NO_MEMORY) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* Add mac addresses in the sync list */
+	status = ice_add_mac(hw, &vsi->tmp_sync_list);
+	ice_free_fltr_list(dev, &vsi->tmp_sync_list);
+	if (status) {
+		netdev_err(netdev, "Failed to add MAC filters\n");
+		/* If there is no more space for new umac filters, vsi
+		 * should go into promiscuous mode. There should be some
+		 * space reserved for promiscuous filters.
+		 */
+		if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOSPC &&
+		    !test_and_set_bit(__ICE_FLTR_OVERFLOW_PROMISC,
+				      vsi->state)) {
+			promisc_forced_on = true;
+			netdev_warn(netdev,
+				    "Reached MAC filter limit, forcing promisc mode on VSI %d\n",
+				    vsi->vsi_num);
+		} else {
+			err = -EIO;
+			goto out;
+		}
+	}
+	/* check for changes in promiscuous modes */
+	if (changed_flags & IFF_ALLMULTI)
+		netdev_warn(netdev, "Unsupported configuration\n");
+
+	if (((changed_flags & IFF_PROMISC) || promisc_forced_on) ||
+	    test_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags)) {
+		clear_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags);
+		if (vsi->current_netdev_flags & IFF_PROMISC) {
+			/* Apply TX filter rule to get traffic from VMs */
+			status = ice_cfg_dflt_vsi(hw, vsi->vsi_num, true,
+						  ICE_FLTR_TX);
+			if (status) {
+				netdev_err(netdev, "Error setting default VSI %i tx rule\n",
+					   vsi->vsi_num);
+				vsi->current_netdev_flags &= ~IFF_PROMISC;
+				err = -EIO;
+				goto out_promisc;
+			}
+			/* Apply RX filter rule to get traffic from wire */
+			status = ice_cfg_dflt_vsi(hw, vsi->vsi_num, true,
+						  ICE_FLTR_RX);
+			if (status) {
+				netdev_err(netdev, "Error setting default VSI %i rx rule\n",
+					   vsi->vsi_num);
+				vsi->current_netdev_flags &= ~IFF_PROMISC;
+				err = -EIO;
+				goto out_promisc;
+			}
+		} else {
+			/* Clear TX filter rule to stop traffic from VMs */
+			status = ice_cfg_dflt_vsi(hw, vsi->vsi_num, false,
+						  ICE_FLTR_TX);
+			if (status) {
+				netdev_err(netdev, "Error clearing default VSI %i tx rule\n",
+					   vsi->vsi_num);
+				vsi->current_netdev_flags |= IFF_PROMISC;
+				err = -EIO;
+				goto out_promisc;
+			}
+			/* Clear filter RX to remove traffic from wire */
+			status = ice_cfg_dflt_vsi(hw, vsi->vsi_num, false,
+						  ICE_FLTR_RX);
+			if (status) {
+				netdev_err(netdev, "Error clearing default VSI %i rx rule\n",
+					   vsi->vsi_num);
+				vsi->current_netdev_flags |= IFF_PROMISC;
+				err = -EIO;
+				goto out_promisc;
+			}
+		}
+	}
+	goto exit;
+
+out_promisc:
+	set_bit(ICE_VSI_FLAG_PROMISC_CHANGED, vsi->flags);
+	goto exit;
+out:
+	/* if something went wrong then set the changed flag so we try again */
+	set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags);
+	set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags);
+exit:
+	clear_bit(__ICE_CFG_BUSY, vsi->state);
+	return err;
+}
+
+/**
+ * ice_sync_fltr_subtask - Sync the VSI filter list with HW
+ * @pf: board private structure
+ *
+ * Does nothing unless ICE_FLAG_FLTR_SYNC is set in pf->flags. Otherwise the
+ * flag is cleared and every allocated VSI with pending filter changes is
+ * synced; the first failure sets the flag again and stops the walk.
+ */
+static void ice_sync_fltr_subtask(struct ice_pf *pf)
+{
+	int idx;
+
+	if (!pf)
+		return;
+
+	if (!test_bit(ICE_FLAG_FLTR_SYNC, pf->flags))
+		return;
+
+	clear_bit(ICE_FLAG_FLTR_SYNC, pf->flags);
+
+	for (idx = 0; idx < pf->num_alloc_vsi; idx++) {
+		struct ice_vsi *vsi = pf->vsi[idx];
+
+		/* nothing to do for empty slots or unchanged VSIs */
+		if (!vsi || !ice_vsi_fltr_changed(vsi))
+			continue;
+
+		if (ice_vsi_sync_fltr(vsi)) {
+			/* come back and try again later */
+			set_bit(ICE_FLAG_FLTR_SYNC, pf->flags);
+			break;
+		}
+	}
+}
+
 /**
  * ice_is_reset_recovery_pending - schedule a reset
  * @state: pf state field
@@ -794,6 +1013,7 @@ static void ice_service_task(struct work_struct *work)
 		return;
 	}
 
+	ice_sync_fltr_subtask(pf);
 	ice_watchdog_subtask(pf);
 	ice_clean_adminq_subtask(pf);
 
@@ -2505,6 +2725,7 @@ static int ice_vsi_reinit_setup(struct ice_vsi *vsi)
 	ice_vsi_free_q_vectors(vsi);
 err_rings:
 	if (vsi->netdev) {
+		vsi->current_netdev_flags = 0;
 		unregister_netdev(vsi->netdev);
 		free_netdev(vsi->netdev);
 		vsi->netdev = NULL;
@@ -3314,6 +3535,197 @@ static void __exit ice_module_exit(void)
 }
 module_exit(ice_module_exit);
 
+/**
+ * ice_set_mac_address - NDO callback to set mac address
+ * @netdev: network interface device structure
+ * @pi: pointer to an address structure
+ *
+ * The filter rule for the current address is removed and a rule for the new
+ * address is added before netdev->dev_addr is updated. The new address is
+ * then written to firmware; a failure of that write is only logged.
+ *
+ * Returns 0 on success (including when the address is already in use),
+ * -EADDRNOTAVAIL if the address is invalid or a filter update failed, and
+ * -EBUSY if the device is down or a reset recovery is pending.
+ */
+static int ice_set_mac_address(struct net_device *netdev, void *pi)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_vsi *vsi = np->vsi;
+	struct ice_pf *pf = vsi->back;
+	struct ice_hw *hw = &pf->hw;
+	struct sockaddr *addr = pi;
+	enum ice_status status;
+	LIST_HEAD(a_mac_list);
+	LIST_HEAD(r_mac_list);
+	u8 flags = 0;
+	int err;
+	u8 *mac;
+
+	mac = (u8 *)addr->sa_data;
+
+	if (!is_valid_ether_addr(mac))
+		return -EADDRNOTAVAIL;
+
+	if (ether_addr_equal(netdev->dev_addr, mac)) {
+		netdev_warn(netdev, "already using mac %pM\n", mac);
+		return 0;
+	}
+
+	if (test_bit(__ICE_DOWN, pf->state) ||
+	    ice_is_reset_recovery_pending(pf->state)) {
+		netdev_err(netdev, "can't set mac %pM. device not ready\n",
+			   mac);
+		return -EBUSY;
+	}
+
+	/* When we change the mac address we also have to change the mac address
+	 * based filter rules that were created previously for the old mac
+	 * address. So first, we remove the old filter rule using ice_remove_mac
+	 * and then create a new filter rule using ice_add_mac. Note that for
+	 * both these operations, we first need to form a "list" of mac
+	 * addresses (even though in this case, we have only 1 mac address to be
+	 * added/removed) and this is done using ice_add_mac_to_list. Depending
+	 * on the ensuing operation this "list" of mac addresses is either to be
+	 * added or removed from the filter.
+	 */
+	err = ice_add_mac_to_list(vsi, &r_mac_list, netdev->dev_addr);
+	if (err) {
+		err = -EADDRNOTAVAIL;
+		goto free_lists;
+	}
+
+	status = ice_remove_mac(hw, &r_mac_list);
+	if (status) {
+		err = -EADDRNOTAVAIL;
+		goto free_lists;
+	}
+
+	err = ice_add_mac_to_list(vsi, &a_mac_list, mac);
+	if (err) {
+		err = -EADDRNOTAVAIL;
+		goto free_lists;
+	}
+
+	status = ice_add_mac(hw, &a_mac_list);
+	if (status) {
+		err = -EADDRNOTAVAIL;
+		goto free_lists;
+	}
+
+free_lists:
+	/* free list entries; reached on both the success and failure paths */
+	ice_free_fltr_list(&pf->pdev->dev, &r_mac_list);
+	ice_free_fltr_list(&pf->pdev->dev, &a_mac_list);
+
+	if (err) {
+		netdev_err(netdev, "can't set mac %pM. filter update failed\n",
+			   mac);
+		return err;
+	}
+
+	/* change the netdev's mac address */
+	memcpy(netdev->dev_addr, mac, netdev->addr_len);
+	netdev_dbg(vsi->netdev, "updated mac address to %pM\n",
+		   netdev->dev_addr);
+
+	/* write new mac address to the firmware */
+	flags = ICE_AQC_MAN_MAC_UPDATE_LAA_WOL;
+	status = ice_aq_manage_mac_write(hw, mac, flags, NULL);
+	if (status) {
+		netdev_err(netdev, "can't set mac %pM. write to firmware failed.\n",
+			   mac);
+	}
+	return 0;
+}
+
+/**
+ * ice_set_rx_mode - NDO callback to set the netdev filters
+ * @netdev: network interface device structure
+ *
+ * Marks the VSI's unicast and multicast MAC filters as changed, requests a
+ * filter sync on the PF and schedules the service task to perform it.
+ */
+static void ice_set_rx_mode(struct net_device *netdev)
+{
+	struct ice_netdev_priv *priv = netdev_priv(netdev);
+	struct ice_vsi *vsi = priv->vsi;
+	struct ice_pf *pf;
+
+	if (!vsi)
+		return;
+
+	pf = vsi->back;
+
+	/* ndo_set_rx_mode may be triggered even without a change in netdev
+	 * flags, so always flag the filters for synchronization
+	 */
+	set_bit(ICE_VSI_FLAG_UMAC_FLTR_CHANGED, vsi->flags);
+	set_bit(ICE_VSI_FLAG_MMAC_FLTR_CHANGED, vsi->flags);
+	set_bit(ICE_FLAG_FLTR_SYNC, pf->flags);
+
+	/* the worker thread applies the new filter changes */
+	ice_service_task_schedule(pf);
+}
+
+/**
+ * ice_fdb_add - add an entry to the hardware database
+ * @ndm: the input from the stack
+ * @tb: pointer to array of nladdr (unused)
+ * @dev: the net device pointer
+ * @addr: the MAC address entry being added
+ * @vid: VLAN id
+ * @flags: instructions from stack about fdb operation
+ *
+ * Requests with a VLAN id, or with a non-zero state that lacks
+ * NUD_PERMANENT, are rejected with -EINVAL. Otherwise @addr is added to the
+ * device's unicast or multicast address list. An -EEXIST result is reported
+ * only when NLM_F_EXCL is set in @flags.
+ */
+static int ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[],
+		       struct net_device *dev, const unsigned char *addr,
+		       u16 vid, u16 flags)
+{
+	int err = -EINVAL;
+
+	if (vid) {
+		netdev_err(dev, "VLANs aren't supported yet for dev_uc|mc_add()\n");
+		return -EINVAL;
+	}
+
+	if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
+		netdev_err(dev, "FDB only supports static addresses\n");
+		return -EINVAL;
+	}
+
+	if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
+		err = dev_uc_add_excl(dev, addr);
+	else if (is_multicast_ether_addr(addr))
+		err = dev_mc_add_excl(dev, addr);
+
+	/* Only return duplicate errors if NLM_F_EXCL is set */
+	if (err == -EEXIST && !(flags & NLM_F_EXCL))
+		return 0;
+
+	return err;
+}
+
+/**
+ * ice_fdb_del - delete an entry from the hardware database
+ * @ndm: the input from the stack
+ * @tb: pointer to array of nladdr (unused)
+ * @dev: the net device pointer
+ * @addr: the MAC address entry being deleted
+ * @vid: VLAN id
+ *
+ * Only static (NUD_PERMANENT) entries can be deleted; any other request is
+ * rejected with -EINVAL. Otherwise returns the result of removing @addr from
+ * the device's unicast or multicast address list.
+ */
+static int ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[],
+		       struct net_device *dev, const unsigned char *addr,
+		       __always_unused u16 vid)
+{
+	int err;
+
+	/* reject entries that are not static, as the message below says */
+	if (!(ndm->ndm_state & NUD_PERMANENT)) {
+		netdev_err(dev, "FDB only supports static addresses\n");
+		return -EINVAL;
+	}
+
+	if (is_unicast_ether_addr(addr))
+		err = dev_uc_del(dev, addr);
+	else if (is_multicast_ether_addr(addr))
+		err = dev_mc_del(dev, addr);
+	else
+		err = -EINVAL;
+
+	return err;
+}
+
 /**
  * ice_vsi_manage_vlan_insertion - Manage VLAN insertion for the VSI for Tx
  * @vsi: the vsi being changed
@@ -3704,6 +4116,8 @@ static int ice_vsi_cfg(struct ice_vsi *vsi)
 {
 	int err;
 
+	ice_set_rx_mode(vsi->netdev);
+
 	err = ice_restore_vlan(vsi);
 	if (err)
 		return err;
@@ -4393,6 +4807,30 @@ void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats)
 	stats->rx_length_errors = vsi_stats->rx_length_errors;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/**
+ * ice_netpoll - polling "interrupt" handler
+ * @netdev: network interface device structure
+ *
+ * Used by netconsole to send skbs without having to re-enable interrupts.
+ * This is not called in the normal interrupt path. Every q_vector of the VSI
+ * is cleaned directly, provided the VSI is up and MSI-X is enabled.
+ */
+static void ice_netpoll(struct net_device *netdev)
+{
+	struct ice_netdev_priv *priv = netdev_priv(netdev);
+	struct ice_vsi *vsi = priv->vsi;
+	struct ice_pf *pf = vsi->back;
+	int vec;
+
+	if (test_bit(__ICE_DOWN, vsi->state))
+		return;
+
+	if (!test_bit(ICE_FLAG_MSIX_ENA, pf->flags))
+		return;
+
+	for (vec = 0; vec < vsi->num_q_vectors; vec++)
+		ice_msix_clean_rings(0, vsi->q_vectors[vec]);
+}
+#endif /* CONFIG_NET_POLL_CONTROLLER */
+
 /**
  * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI
  * @vsi: VSI having NAPI disabled
@@ -4800,6 +5238,73 @@ static void ice_rebuild(struct ice_pf *pf)
 	set_bit(__ICE_RESET_RECOVERY_PENDING, pf->state);
 }
 
+/**
+ * ice_change_mtu - NDO callback to change the MTU
+ * @netdev: network interface device structure
+ * @new_mtu: new value for maximum frame size
+ *
+ * Rejects values outside netdev->min_mtu..netdev->max_mtu, waits for up to
+ * 100 sleeps of 1-2 ms for a pending reset recovery to finish, then stores
+ * the new MTU and, if the VSI was up, brings it down and back up.
+ *
+ * Returns 0 on success (including when the MTU is unchanged), -EINVAL for an
+ * out-of-range MTU, -EBUSY if the reset does not complete in time, or the
+ * error from ice_down/ice_up.
+ */
+static int ice_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct ice_netdev_priv *np = netdev_priv(netdev);
+	struct ice_vsi *vsi = np->vsi;
+	struct ice_pf *pf = vsi->back;
+	u8 count = 0;
+
+	if (new_mtu == netdev->mtu) {
+		netdev_warn(netdev, "mtu is already %d\n", netdev->mtu);
+		return 0;
+	}
+
+	if (new_mtu < netdev->min_mtu) {
+		netdev_err(netdev, "new mtu invalid. min_mtu is %d\n",
+			   netdev->min_mtu);
+		return -EINVAL;
+	} else if (new_mtu > netdev->max_mtu) {
+		/* report the limit that was actually exceeded */
+		netdev_err(netdev, "new mtu invalid. max_mtu is %d\n",
+			   netdev->max_mtu);
+		return -EINVAL;
+	}
+	/* if a reset is in progress, wait for some time for it to complete */
+	do {
+		if (ice_is_reset_recovery_pending(pf->state)) {
+			count++;
+			usleep_range(1000, 2000);
+		} else {
+			break;
+		}
+	} while (count < 100);
+
+	if (count == 100) {
+		netdev_err(netdev, "can't change mtu. Device is busy\n");
+		return -EBUSY;
+	}
+
+	netdev->mtu = new_mtu;
+
+	/* if VSI is up, bring it down and then back up */
+	if (!test_and_set_bit(__ICE_DOWN, vsi->state)) {
+		int err;
+
+		err = ice_down(vsi);
+		if (err) {
+			netdev_err(netdev, "change mtu if_down err %d\n", err);
+			return err;
+		}
+
+		err = ice_up(vsi);
+		if (err) {
+			netdev_err(netdev, "change mtu if_up err %d\n", err);
+			return err;
+		}
+	}
+
+	netdev_dbg(netdev, "changed mtu to %d\n", new_mtu);
+	return 0;
+}
+
 /**
  * ice_set_rss - Set RSS keys and lut
  * @vsi: Pointer to VSI structure
@@ -4933,12 +5438,72 @@ static int ice_stop(struct net_device *netdev)
 	return 0;
 }
 
+/**
+ * ice_features_check - Validate encapsulated packet conforms to limits
+ * @skb: skb buffer
+ * @netdev: This port's netdev
+ * @features: Offload features that the stack believes apply
+ *
+ * Returns @features, with GSO removed when the MSS is below 64 bytes and with
+ * both checksum and GSO offloads removed when any header length has bits set
+ * outside the matching ICE_TXD_*LEN_MAX value.
+ */
+static netdev_features_t
+ice_features_check(struct sk_buff *skb,
+		   struct net_device __always_unused *netdev,
+		   netdev_features_t features)
+{
+	const netdev_features_t offloads = NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK;
+	size_t hdr_len;
+
+	/* A frame that is not CHECKSUM_PARTIAL requests neither checksum
+	 * offload nor GSO, so there is nothing to validate for it.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return features;
+
+	/* GSO cannot be supported when the MSS would be less than 64 bytes,
+	 * so drop it in that case.
+	 */
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_size < 64)
+		features &= ~NETIF_F_GSO_MASK;
+
+	/* start of data up to the network header */
+	hdr_len = skb_network_header(skb) - skb->data;
+	if (hdr_len & ~(ICE_TXD_MACLEN_MAX))
+		return features & ~offloads;
+
+	/* network header up to the transport header */
+	hdr_len = skb_transport_header(skb) - skb_network_header(skb);
+	if (hdr_len & ~(ICE_TXD_IPLEN_MAX))
+		return features & ~offloads;
+
+	if (!skb->encapsulation)
+		return features;
+
+	/* transport header up to the inner network header */
+	hdr_len = skb_inner_network_header(skb) - skb_transport_header(skb);
+	if (hdr_len & ~(ICE_TXD_L4LEN_MAX))
+		return features & ~offloads;
+
+	/* inner network header up to the inner transport header */
+	hdr_len = skb_inner_transport_header(skb) -
+		  skb_inner_network_header(skb);
+	if (hdr_len & ~(ICE_TXD_IPLEN_MAX))
+		return features & ~offloads;
+
+	return features;
+}
+
 static const struct net_device_ops ice_netdev_ops = {
 	.ndo_open = ice_open,
 	.ndo_stop = ice_stop,
 	.ndo_start_xmit = ice_start_xmit,
+	.ndo_features_check = ice_features_check,
+	.ndo_set_rx_mode = ice_set_rx_mode,
+	.ndo_set_mac_address = ice_set_mac_address,
+	.ndo_validate_addr = eth_validate_addr,
+	.ndo_change_mtu = ice_change_mtu,
 	.ndo_get_stats64 = ice_get_stats64,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller = ice_netpoll,
+#endif /* CONFIG_NET_POLL_CONTROLLER */
 	.ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid,
 	.ndo_set_features = ice_set_features,
+	.ndo_fdb_add = ice_fdb_add,
+	.ndo_fdb_del = ice_fdb_del,
 };
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 424090f598c3..fbdfeed6aa4e 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -1640,6 +1640,83 @@ ice_remove_mac(struct ice_hw *hw, struct list_head *m_list)
 	return status;
 }
 
+/**
+ * ice_cfg_dflt_vsi - add filter rule to set/unset given VSI as default
+ * VSI for the switch (represented by swid)
+ * @hw: pointer to the hardware structure
+ * @vsi_id: number of VSI to set as default
+ * @set: true to add the above mentioned switch rule, false to remove it
+ * @direction: ICE_FLTR_RX or ICE_FLTR_TX
+ *
+ * On success the default VSI number and rule index cached in hw->port_info
+ * for the given direction are recorded (set) or invalidated (unset). The
+ * cached rule index is what a later removal uses to identify the rule.
+ *
+ * Returns ICE_ERR_NO_MEMORY if the rule buffer cannot be allocated, otherwise
+ * the status of ice_aq_sw_rules.
+ */
+enum ice_status
+ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction)
+{
+	struct ice_aqc_sw_rules_elem *s_rule;
+	struct ice_fltr_info f_info;
+	enum ice_adminq_opc opcode;
+	enum ice_status status;
+	u16 s_rule_size;
+
+	s_rule_size = set ? ICE_SW_RULE_RX_TX_ETH_HDR_SIZE :
+			    ICE_SW_RULE_RX_TX_NO_HDR_SIZE;
+	s_rule = devm_kzalloc(ice_hw_to_dev(hw), s_rule_size, GFP_KERNEL);
+	if (!s_rule)
+		return ICE_ERR_NO_MEMORY;
+
+	memset(&f_info, 0, sizeof(f_info));
+
+	f_info.lkup_type = ICE_SW_LKUP_DFLT;
+	f_info.flag = direction;
+	f_info.fltr_act = ICE_FWD_TO_VSI;
+	f_info.fwd_id.vsi_id = vsi_id;
+
+	if (f_info.flag & ICE_FLTR_RX) {
+		f_info.src = hw->port_info->lport;
+		if (!set)
+			f_info.fltr_rule_id =
+				hw->port_info->dflt_rx_vsi_rule_id;
+	} else if (f_info.flag & ICE_FLTR_TX) {
+		f_info.src = vsi_id;
+		if (!set)
+			f_info.fltr_rule_id =
+				hw->port_info->dflt_tx_vsi_rule_id;
+	}
+
+	if (set)
+		opcode = ice_aqc_opc_add_sw_rules;
+	else
+		opcode = ice_aqc_opc_remove_sw_rules;
+
+	ice_fill_sw_rule(hw, &f_info, s_rule, opcode);
+
+	status = ice_aq_sw_rules(hw, s_rule, s_rule_size, 1, opcode, NULL);
+	if (status || !(f_info.flag & ICE_FLTR_TX_RX))
+		goto out;
+	if (set) {
+		u16 index = le16_to_cpu(s_rule->pdata.lkup_tx_rx.index);
+
+		if (f_info.flag & ICE_FLTR_TX) {
+			hw->port_info->dflt_tx_vsi_num = vsi_id;
+			hw->port_info->dflt_tx_vsi_rule_id = index;
+		} else if (f_info.flag & ICE_FLTR_RX) {
+			hw->port_info->dflt_rx_vsi_num = vsi_id;
+			hw->port_info->dflt_rx_vsi_rule_id = index;
+		}
+	} else {
+		if (f_info.flag & ICE_FLTR_TX) {
+			hw->port_info->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL;
+			hw->port_info->dflt_tx_vsi_rule_id = ICE_INVAL_ACT;
+		} else if (f_info.flag & ICE_FLTR_RX) {
+			hw->port_info->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL;
+			hw->port_info->dflt_rx_vsi_rule_id = ICE_INVAL_ACT;
+		}
+	}
+
+out:
+	devm_kfree(ice_hw_to_dev(hw), s_rule);
+	return status;
+}
+
 /**
  * ice_remove_vlan_internal - Remove one VLAN based filter rule
  * @hw: pointer to the hardware structure
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h
index 60c63264a393..b063b87f1558 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.h
+++ b/drivers/net/ethernet/intel/ice/ice_switch.h
@@ -169,5 +169,7 @@ enum ice_status ice_remove_mac(struct ice_hw *hw, struct list_head *m_lst);
 void ice_remove_vsi_fltr(struct ice_hw *hw, u16 vsi_id);
 enum ice_status ice_add_vlan(struct ice_hw *hw, struct list_head *m_list);
 enum ice_status ice_remove_vlan(struct ice_hw *hw, struct list_head *v_list);
+enum ice_status
+ice_cfg_dflt_vsi(struct ice_hw *hw, u16 vsi_id, bool set, u8 direction);
 
 #endif /* _ICE_SWITCH_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h
index 100cb3cf8364..a9e8e28b64be 100644
--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -24,6 +24,9 @@
 #include "ice_controlq.h"
 #include "ice_lan_tx_rx.h"
 
+#define ICE_BYTES_PER_WORD	2
+#define ICE_BYTES_PER_DWORD	4
+
 static inline bool ice_is_tc_ena(u8 bitmap, u8 tc)
 {
 	return test_bit(tc, (unsigned long *)&bitmap);
@@ -241,7 +244,9 @@ struct ice_port_info {
 	u8 port_state;
 #define ICE_SCHED_PORT_STATE_INIT	0x0
 #define ICE_SCHED_PORT_STATE_READY	0x1
+	u16 dflt_tx_vsi_rule_id;
 	u16 dflt_tx_vsi_num;
+	u16 dflt_rx_vsi_rule_id;
 	u16 dflt_rx_vsi_num;
 	struct ice_fc_info fc;
 	struct ice_mac_info mac;
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH v4 1/2] kernel.h: Introduce const_max() for VLA removal
From: Kees Cook @ 2018-03-15 23:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Josh Poimboeuf, Rasmus Villemoes, Randy Dunlap,
	Miguel Ojeda, Ingo Molnar, David Laight, Ian Abbott, linux-input,
	linux-btrfs, Network Development, Linux Kernel Mailing List,
	Kernel Hardening
In-Reply-To: <CA+55aFyRV9KXzeQZpVYsZYVUJm-ASgu_4_1+8Y8-0KH-YT2M8Q@mail.gmail.com>

On Thu, Mar 15, 2018 at 4:46 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> What I'm *not* so much ok with is "const_max(5,sizeof(x))" erroring
> out, or silently causing insane behavior due to hidden subtle type
> casts..

Yup! I like it as an explicit argument. Thanks!

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: [PATCH v2 12/15] ice: Add stats and ethtool support
From: Stephen Hemminger @ 2018-03-15 23:52 UTC (permalink / raw)
  To: Anirudh Venkataramanan
  Cc: intel-wired-lan, netdev, Andrew Lunn, Jakub Kicinski
In-Reply-To: <20180315234802.31336-13-anirudh.venkataramanan@intel.com>

On Thu, 15 Mar 2018 16:47:59 -0700
Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> wrote:

> +
> +static const struct ice_stats ice_gstrings_vsi_stats[] = {
> +	ICE_VSI_STAT("tx_unicast", eth_stats.tx_unicast),
> +	ICE_VSI_STAT("rx_unicast", eth_stats.rx_unicast),
> +	ICE_VSI_STAT("tx_multicast", eth_stats.tx_multicast),
> +	ICE_VSI_STAT("rx_multicast", eth_stats.rx_multicast),
> +	ICE_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast),
> +	ICE_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast),
> +	ICE_VSI_STAT("tx_bytes", eth_stats.tx_bytes),
> +	ICE_VSI_STAT("rx_bytes", eth_stats.rx_bytes),
> +	ICE_VSI_STAT("rx_discards", eth_stats.rx_discards),
> +	ICE_VSI_STAT("tx_errors", eth_stats.tx_errors),
> +	ICE_VSI_STAT("tx_linearize", tx_linearize),
> +	ICE_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol),
> +	ICE_VSI_STAT("rx_alloc_fail", rx_buf_failed),
> +	ICE_VSI_STAT("rx_pg_alloc_fail", rx_page_failed),
> +};
> +

Ignoring feedback from maintainers is unlikely to help get your driver adopted.

^ permalink raw reply

* Re: [PATCH] mlx5: Remove call to ida_pre_get
From: Saeed Mahameed @ 2018-03-15 23:58 UTC (permalink / raw)
  To: Matan Barak, Maor Gottlieb, leon@kernel.org, willy@infradead.org
  Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org
In-Reply-To: <20180315025724.GB9973@bombadil.infradead.org>

On Wed, 2018-03-14 at 19:57 -0700, Matthew Wilcox wrote:
> From: Matthew Wilcox <mawilcox@microsoft.com>
> 
> The mlx5 driver calls ida_pre_get() in a loop for no readily apparent
> reason.  The driver uses ida_simple_get() which will call
> ida_pre_get()
> by itself and there's no need to use ida_pre_get() unless using
> ida_get_new().
> 

Hi Matthew,

Is this causing any issues, or is it just a simple cleanup?

Adding Maor, the author of this change,

I believe the idea is to speed up insert_fte (which calls
ida_simple_get) since insert_fte runs under the FTE write semaphore,
in this case if ida_pre_get was successful before taking the semaphore
for all the FTE nodes in the loop, this will be a huge win for
ida_simple_get which will immediately return success without even
trying to allocate.

so it is a best effort to speed up critical path.

Maor, if this is really the case and this is not causing any issues,
then we need to consider adding a comment.

> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> index 10e16381f20a..3ba07c7096ef 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
> @@ -1647,7 +1647,6 @@ try_add_to_existing_fg(struct mlx5_flow_table
> *ft,
>  
>  	list_for_each_entry(iter, match_head, list) {
>  		nested_down_read_ref_node(&iter->g->node,
> FS_LOCK_PARENT);
> -		ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL);
>  	}
>  
>  search_again_locked:
> 

^ permalink raw reply

* Re: [PATCH v6 0/6] staging: Introduce DPAA2 Ethernet Switch driver
From: Andrew Lunn @ 2018-03-16  0:15 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: devel, stuyoder, arnd, gregkh, alexandru.marginean, agraf,
	linux-kernel, Razvan Stefanescu, ioana.ciornei, netdev,
	laurentiu.tudor
In-Reply-To: <20180315105642.szn2zhrsgwsv35yf@mwanda>

On Thu, Mar 15, 2018 at 01:56:42PM +0300, Dan Carpenter wrote:
> On Thu, Mar 15, 2018 at 12:44:37AM +0100, Andrew Lunn wrote:
> > On Wed, Mar 14, 2018 at 10:55:52AM -0500, Razvan Stefanescu wrote:
> > > This patchset introduces the Ethernet Switch Driver for Freescale/NXP SoCs
> > > with DPAA2 (DataPath Acceleration Architecture v2). The driver manages
> > > switch objects discovered on the fsl-mc bus. A description of the driver
> > > can be found in the associated README file.
> > 
> > Hi Greg
> > 
> > This code has much better quality than the usual stuff in staging. I
> > see no reason not to merge it. 
> 
> Yeah.  It seems pretty decent.  Stuart, Laurentiu, care to comment?
> 
> Meanwhile, netdev and DaveM aren't even on the CC list and they're the
> ones to ultimately decide.

The patches are for staging, so it is GregKH who decides at this
point, not really DaveM.

       Andrew

^ permalink raw reply

* Re: [PATCH 6/7] e1000: eliminate duplicate barriers on weakly-ordered archs
From: Alexander Duyck @ 2018-03-16  0:25 UTC (permalink / raw)
  To: Sinan Kaya
  Cc: Netdev, Timur Tabi, sulrich, linux-arm-msm, linux-arm-kernel,
	Jeff Kirsher, intel-wired-lan, LKML
In-Reply-To: <39dc5bb4-02b1-bf7e-fbfc-17fc484e4fb7@codeaurora.org>

On Thu, Mar 15, 2018 at 4:30 PM, Sinan Kaya <okaya@codeaurora.org> wrote:
> On 3/14/2018 9:41 PM, Alexander Duyck wrote:
>>>  }
>>>
>> So you missed the writel in e1000_xmit_frame. You should probably get
>> that one too while you are doing these updates. The wmb() is in
>> e1000_tx_queue().
>>
>
> I brought wmb() outside along with the next descriptor assignment to be
> similar to the rest of the other code.
>
> if wmb() and writel() are not visible in the same function, let's not touch
> the code.

Maybe for e1000 we should just skip the driver entirely. Odds are you
aren't going to have any e1000 parts running on ARM anyway since most
of them are legacy PCI or PCI-X parts that were made over 10 years
ago. Most of your efforts would probably be best spent on igb, igbvf,
ixgbe, ixgbevf, i40e, i40evf, and fm10k.

^ permalink raw reply

* Re: [PATCH net-next] net: ethernet: ti: cpsw: enable vlan rx vlan offload
From: Andrew Lunn @ 2018-03-16  0:29 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: David S. Miller, netdev, Sekhar Nori, linux-kernel, linux-omap
In-Reply-To: <20180315201550.21487-1-grygorii.strashko@ti.com>

On Thu, Mar 15, 2018 at 03:15:50PM -0500, Grygorii Strashko wrote:
> In VLAN_AWARE mode CPSW can insert VLAN header encapsulation word on Host
> port 0 egress (RX) before the packet data if RX_VLAN_ENCAP bit is set in
> CPSW_CONTROL register. VLAN header encapsulation word has following format:
> 
>  HDR_PKT_Priority bits 29-31 - Header Packet VLAN prio (Highest prio: 7)
>  HDR_PKT_CFI 	  bits 28 - Header Packet VLAN CFI bit.
>  HDR_PKT_Vid 	  bits 27-16 - Header Packet VLAN ID
>  PKT_Type         bits 8-9 - Packet Type. Indicates whether the packet is
>                  	VLAN-tagged, priority-tagged, or non-tagged.
> 	00: VLAN-tagged packet
> 	01: Reserved
> 	10: Priority-tagged packet
> 	11: Non-tagged packet
> 
> This feature can be used to implement TX VLAN offload in case of
> VLAN-tagged packets and to insert VLAN tag in case Non-tagged packet was
> received on port with PVID set. As per documentation, CPSW never modifies
> packet data on Host egress (RX) and as result, without this feature
> enabled, Host port will not be able to receive properly packets which
> entered switch non-tagged through external Port with PVID set (when
> non-tagged packet forwarded from external Port with PVID set to another
> external Port - packet will be VLAN tagged properly).

So, I think it is time to discuss the future of this driver. It should
really be replaced by a switchdev/DSA driver. There are plenty of
carrots for a new driver: Better statistics, working ethtool support
for all the PHYs, better user experience, etc. But maybe now it is
time for the stick. Should we Maintainers decide that no new features
should be added to the existing drivers, just bug fixes?

       Andrew

^ permalink raw reply

* Re: [bpf-next PATCH v2 05/18] bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data
From: Daniel Borkmann @ 2018-03-16  0:37 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: John Fastabend, davem, ast, davejwatson, netdev
In-Reply-To: <20180315230605.vndmzwxso57puskx@ast-mbp>

On 03/16/2018 12:06 AM, Alexei Starovoitov wrote:
> On Thu, Mar 15, 2018 at 11:55:39PM +0100, Daniel Borkmann wrote:
>> On 03/15/2018 11:20 PM, Alexei Starovoitov wrote:
>>> On Thu, Mar 15, 2018 at 11:17:12PM +0100, Daniel Borkmann wrote:
>>>> On 03/15/2018 10:59 PM, Alexei Starovoitov wrote:
>>>>> On Mon, Mar 12, 2018 at 12:23:29PM -0700, John Fastabend wrote:
>>>>>>  
>>>>>> +/* User return codes for SK_MSG prog type. */
>>>>>> +enum sk_msg_action {
>>>>>> +	SK_MSG_DROP = 0,
>>>>>> +	SK_MSG_PASS,
>>>>>> +};
>>>>>
>>>>> do we really need new enum here?
>>>>> It's the same as 'enum sk_action' and SK_DROP == SK_MSG_DROP
>>>>> and there will be only drop/pass in both enums.
>>>>> Also I don't see where these two new SK_MSG_* are used...
>>>>>
>>>>>> +
>>>>>> +/* user accessible metadata for SK_MSG packet hook, new fields must
>>>>>> + * be added to the end of this structure
>>>>>> + */
>>>>>> +struct sk_msg_md {
>>>>>> +	__u32 data;
>>>>>> +	__u32 data_end;
>>>>>> +};
>>>>>
>>>>> I think it's time for me to ask for forgiveness :)
>>>>
>>>> :-)
>>>>
>>>>> I used __u32 for data and data_end only because all other fields
>>>>> in __sk_buff were __u32 at the time and I couldn't easily figure out
>>>>> how to teach verifier to recognize 8-byte rewrites.
>>>>> Unfortunately my mistake stuck and was copied over into xdp.
>>>>> Since this is new struct let's do it right and add
>>>>> 'void *data, *data_end' here,
>>>>> since bpf prog will use them as 'void *' pointers.
>>>>> There are no compat issues here, since bpf is always 64-bit.
>>>>
>>>> But at least offset-wise when you do the ctx rewrite this would then
>>>> be a bit more tricky when you have 64 bit kernel with 32 bit user
>>>> space since void * members are in each cases at different offset. So
>>>> unless I'm missing something, this still should either be __u32 or
>>>> __u64 instead of void *, no?
>>>
>>> there is no 32-bit user space. these structs are seen by bpf progs only
>>> and bpf is 64-bit only too.
>>> unless I'm missing your point.
>>
>> Ok, so lets say you have 32 bit LLVM binary and compile the prog where
>> you access md->data_end. Given the void * in the struct will that access
>> end up being BPF_W at ctx offset 4 or BPF_DW at ctx offset 8 from clang
>> perspective (iow, is the back end treating this special and always use
>> fixed BPF_DW in such case)? If not and it would be the first case with
>> offset 4, then we could have the case that underlying 64 bit kernel is
>> expecting ctx offset 8 for doing the md ctx conversion.
> 
> i'm still not quite following.
> Whether llvm itself is 32-bit binary or it's arm32 or sparc32 binary
> doesn't matter. It will produce the same 64-bit bpf code.
> It will see 'void *' deref from this struct and will emit DW.
> Maybe the confusion is from the newly added -mattr=+alu32 flag?
> That option doesn't change that sizeof(void*)==8.
> It only allows backend to emit 32-bit alu insns.

Ok, so the conclusion we reached is that while the BPF target is unconditionally 64 bit,
it depends which clang front end you use for compilation wrt structs. E.g.
on 32 bit native (e.g. arm) clang front end it would compile the ctx void *
pointers as 4 byte while using clang -target bpf it would compile it as 8
byte. The native clang front end is needed in case of tracing when accessing
pt_regs for walking data structures, but not for networking use case, so
always using -target bpf there is proper way. Meaning there would be no
confusion on the void * since size will always be 8 regardless of underlying
arch being 32 or 64 bit or clang/llvm binary being 32 bit on 64 bit kernel.
Thus, sticking to void * would be fine, but definitely samples/sockmap/Makefile
must be fixed as well, such that people don't copy it wrongly.

Cheers,
Daniel

^ permalink raw reply

* Re: [PATCH net] net/sched: act_simple: don't leak 'index' in the error path
From: Cong Wang @ 2018-03-16  0:43 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Jamal Hadi Salim, Jiri Pirko, David S. Miller,
	Linux Kernel Network Developers
In-Reply-To: <1521067427.2750.40.camel@redhat.com>

On Wed, Mar 14, 2018 at 3:43 PM, Davide Caratti <dcaratti@redhat.com> wrote:
> hello Cong, thank you for reviewing this.
>
> On Wed, 2018-03-14 at 11:41 -0700, Cong Wang wrote:
>> On Tue, Mar 13, 2018 at 7:13 PM, Davide Caratti <dcaratti@redhat.com> wrote:
>>
>> Looks like we just need to replace the tcf_idr_cleanup() with
>> tcf_idr_release()? Which is also simpler.
>
> I just tried it on act_simple, and I can confirm: 'index' does not leak
> anymore if alloc_defdata() fails to kzalloc(), and then tcf_idr_release()
> is called in place of tcf_idr_cleanup().

Good.

>
>> Looks like all other callers of tcf_idr_cleanup() need to be replaced too,
>> but I don't audit all of them...
>
> no problem, I can try to do that, it's not going to be a big series
> anyway.


Please audit all of them.


>
> while at it, I will also fix other spots where the same bug can be
> reproduced, even if tcf_idr_cleanup() is not there: for example, when
> tcf_vlan_init() fails allocating struct tcf_vlan_params *p,
>
> ASSERT_RTNL();
> p = kzalloc(sizeof(*p), GFP_KERNEL);
> if (!p) {
>         if (ovr)
>                 tcf_idr_release(*a, bind);
>         return -ENOMEM;
> }
>
> the following behavior can be observed:
>
> # tc actions flush action vlan
> # tc actions add action vlan pop index 5
> RTNETLINK answers: Cannot allocate memory
> We have an error talking to the kernel
> # tc actions add action vlan pop index 5
> RTNETLINK answers: No space left on device
> We have an error talking to the kernel
> # tc actions add action vlan pop index 5
> RTNETLINK answers: No space left on device
> We have an error talking to the kernel
>
> Probably testing the value of 'ovr' here is wrong, or maybe it's
> not enough: I will also verify what happens using 'replace'
> keyword instead of 'add'.

Please fix it separately if really needed, and it would be nicer
if you can add your test cases to tools/testing/selftests/tc-testing/.

Thanks!

^ permalink raw reply

* Re: WARNING: CPU: 3 PID: 0 at net/sched/sch_hfsc.c:1388 hfsc_dequeue+0x319/0x350 [sch_hfsc]
From: Cong Wang @ 2018-03-16  0:48 UTC (permalink / raw)
  To: Marco Berizzi; +Cc: Linux Kernel Network Developers, Jamal Hadi Salim
In-Reply-To: <495487313.1094743.1521015026754@mail.libero.it>

On Wed, Mar 14, 2018 at 1:10 AM, Marco Berizzi <pupilla@libero.it> wrote:
>> Il 9 marzo 2018 alle 0.14 Cong Wang <xiyou.wangcong@gmail.com> ha scritto:
>>
>>
>> On Thu, Mar 8, 2018 at 8:02 AM, Marco Berizzi <pupilla@libero.it> wrote:
>> >> Marco Berizzi wrote:
>> >>
>> >>
>> >> Hello everyone,
>> >>
>> >> Yesterday I got this error on a slackware linux 4.16-rc4 system
>> >> running as a traffic shaping gateway and netfilter nat.
>> >> The error arose after a partial ISP network outage,
>> >> so unfortunately it will not be trivial for me to reproduce it again.
>> >
>> > Hello everyone,
>> >
>> > I'm getting this error twice/day, so fortunately I'm able to
>> > reproduce it.
>>
>> IIRC, there was a patch for this, but it got lost...
>>
>> I will take a look anyway.
>
> ok, thanks for the response. Let me know when there will be a patch
> available to test.

It has been reported here:
https://bugzilla.kernel.org/show_bug.cgi?id=109581

And there is a workaround from Konstantin:
https://patchwork.ozlabs.org/patch/803885/

Unfortunately I don't think that is a real fix, we probably need to
fix HFSC itself rather than just workaround the qlen==0. It is not
trivial since HFSC implementation is not easy to understand.
Maybe Jamal knows better than me.


Thanks

^ permalink raw reply

* Re: [PATCH net-next v3 0/7] ibmvnic: Update TX pool and TX routines
From: Thomas Falcon @ 2018-03-16  0:48 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont, davem
In-Reply-To: <1521129763-21030-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

On 03/15/2018 11:02 AM, Thomas Falcon wrote:
> This patch restructures the TX pool data structure and provides a
> separate TX pool array for TSO transmissions. This is already used
> in some way due to our unique DMA situation, namely that we cannot
> use single DMA mappings for packet data. Previously, both buffer
> arrays used the same pool entry. This restructuring allows for
> some additional cleanup in the driver code, especially in some
> places in the device transmit routine.
>
> In addition, it allows us to more easily track the consumer
> and producer indexes of a particular pool. This has been
> further improved by better tracking of in-use buffers to
> prevent possible data corruption in case an invalid buffer
> entry is used.
>
> v3: Forgot to update TX pool cleaning function to handle new data
> structures. Included 7th patch for that.
>
> v2: Fix typo in 3/6 commit subject line
>
> Thomas Falcon (7):
>   ibmvnic: Generalize TX pool structure
>   ibmvnic: Update and clean up reset TX pool routine
>   ibmvnic: Update release TX pool routine
>   ibmvnic: Update TX pool initialization routine
>   ibmvnic: Update TX and TX completion routines
>   ibmvnic: Improve TX buffer accounting
>   ibmvnic: Update TX pool cleaning routine
>
>  drivers/net/ethernet/ibm/ibmvnic.c | 275 +++++++++++++++++++++----------------
>  drivers/net/ethernet/ibm/ibmvnic.h |   8 +-
>  2 files changed, 160 insertions(+), 123 deletions(-)
>
Sorry again, I need to send another version because of a bug in the 7th patch.

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH v2 12/15] ice: Add stats and ethtool support
From: Alexander Duyck @ 2018-03-16  0:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Anirudh Venkataramanan, Jakub Kicinski, Netdev, intel-wired-lan,
	Andrew Lunn
In-Reply-To: <20180315165247.165ac10e@xeon-e3>

On Thu, Mar 15, 2018 at 4:52 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Thu, 15 Mar 2018 16:47:59 -0700
> Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> wrote:
>
>> +
>> +static const struct ice_stats ice_gstrings_vsi_stats[] = {
>> +     ICE_VSI_STAT("tx_unicast", eth_stats.tx_unicast),
>> +     ICE_VSI_STAT("rx_unicast", eth_stats.rx_unicast),
>> +     ICE_VSI_STAT("tx_multicast", eth_stats.tx_multicast),
>> +     ICE_VSI_STAT("rx_multicast", eth_stats.rx_multicast),
>> +     ICE_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast),
>> +     ICE_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast),
>> +     ICE_VSI_STAT("tx_bytes", eth_stats.tx_bytes),
>> +     ICE_VSI_STAT("rx_bytes", eth_stats.rx_bytes),
>> +     ICE_VSI_STAT("rx_discards", eth_stats.rx_discards),
>> +     ICE_VSI_STAT("tx_errors", eth_stats.tx_errors),
>> +     ICE_VSI_STAT("tx_linearize", tx_linearize),
>> +     ICE_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol),
>> +     ICE_VSI_STAT("rx_alloc_fail", rx_buf_failed),
>> +     ICE_VSI_STAT("rx_pg_alloc_fail", rx_page_failed),
>> +};
>> +
>
> Ignoring feedback from maintainers is unlikely to help get your driver adopted.

Your feedback wasn't ignored, the netdev stats are gone. I double
checked and there was this in addition to the netdev stats before so I
think the suggestion to remove the netdev stats was just taken
literally.

The VSI is a slightly different entity from the netdev itself. A
netdev can be backed by a VSI in the case of the PF, but the VSI can
be used in other ways such as what we did in i40e where we were using
it to spawn queue groups to work with mqprio as a filter target and in
that case the queue groups wouldn't have a netdev directly associated
with them so in that case it might make sense to leave these as
separate stats.

- Alex

^ permalink raw reply

* Re: [PATCH 6/7] e1000: eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  0:50 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Netdev, Timur Tabi, sulrich, linux-arm-msm, linux-arm-kernel,
	Jeff Kirsher, intel-wired-lan, LKML
In-Reply-To: <CAKgT0UcfEwL9Jas149a=h3R4zeu4oA=uywJ9dpYD9kDdnpWQFA@mail.gmail.com>

On 3/15/2018 8:25 PM, Alexander Duyck wrote:
> On Thu, Mar 15, 2018 at 4:30 PM, Sinan Kaya <okaya@codeaurora.org> wrote:
>> On 3/14/2018 9:41 PM, Alexander Duyck wrote:
>>>>  }
>>>>
>>> So you missed the writel in e1000_xmit_frame. You should probably get
>>> that one too while you are doing these updates. The wmb() is in
>>> e1000_tx_queue().
>>>
>>
>> I brought wmb() outside along with the next descriptor assignment to be
>> similar to the rest of the other code.
>>
>> if wmb() and writel() are not visible in the same function, let's not touch
>> the code.
> 
> Maybe for e1000 we should just skip the driver entirely. Odds are you
> aren't going to have any e1000 parts running on ARM anyway since most
> of them are legacy PCI or PCI-X parts that were made over 10 years
> ago. Most of your efforts would probably be best spent on igb, igbvf,
> ixgbe, ixgbevf, i40e, i40evf, and fm10k.
> 

Sure. I'll drop it.

-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the rdma-fixes tree
From: Stephen Rothwell @ 2018-03-16  0:56 UTC (permalink / raw)
  To: David Miller, Networking, Doug Ledford, Jason Gunthorpe
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Mark Bloch,
	Leon Romanovsky

[-- Attachment #1: Type: text/plain, Size: 5754 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  drivers/infiniband/hw/mlx5/main.c

between commit:

  42cea83f9524 ("IB/mlx5: Fix cleanup order on unload")

from the rdma-fixes tree and commit:

  b5ca15ad7e61 ("IB/mlx5: Add proper representors support")

from the net-next tree.

I fixed it up (see below and the merge fix patch as well) and can
carry the fix as necessary. This is now fixed as far as linux-next is
concerned, but any non trivial conflicts should be mentioned to your
upstream maintainer when your tree is submitted for merging.  You may
also want to consider cooperating with the maintainer of the conflicting
tree to minimise any particularly complex conflicts.

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 16 Mar 2018 11:54:01 +1100
Subject: [PATCH] IB/mlx5: merge fix for "Fix cleanup order on unload"

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 drivers/infiniband/hw/mlx5/ib_rep.c  | 6 +++---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 61cc3d7db257..7fb997dadd80 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -33,9 +33,9 @@ static const struct mlx5_ib_profile rep_profile = {
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES,
-		     mlx5_ib_stage_umr_res_init,
-		     mlx5_ib_stage_umr_res_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
+		     mlx5_ib_stage_post_ib_reg_umr_init,
+		     NULL),
 	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
 		     mlx5_ib_stage_class_attr_init,
 		     NULL),
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7ec753ec7962..c45a7abdbe3e 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1071,8 +1071,7 @@ int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev);
 void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev);
 int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev);
 void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev);
 int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev);
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
-- 
2.16.1

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/infiniband/hw/mlx5/main.c
index da091de4e69d,d9474b95d8e5..000000000000
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@@ -4860,19 -4999,19 +4996,19 @@@ int mlx5_ib_stage_ib_reg_init(struct ml
  	return ib_register_device(&dev->ib_dev, NULL);
  }
  
 -void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
 +static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
  {
 -	ib_unregister_device(&dev->ib_dev);
 +	destroy_umrc_res(dev);
  }
  
- static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
 -int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev)
++void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
  {
 -	return create_umr_res(dev);
 +	ib_unregister_device(&dev->ib_dev);
  }
  
- static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 -void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev)
++int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
  {
 -	destroy_umrc_res(dev);
 +	return create_umr_res(dev);
  }
  
  static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
@@@ -4999,6 -5144,48 +5144,48 @@@ static const struct mlx5_ib_profile pf_
  		     NULL),
  };
  
+ static const struct mlx5_ib_profile nic_rep_profile = {
+ 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
+ 		     mlx5_ib_stage_init_init,
+ 		     mlx5_ib_stage_init_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
+ 		     mlx5_ib_stage_flow_db_init,
+ 		     mlx5_ib_stage_flow_db_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
+ 		     mlx5_ib_stage_caps_init,
+ 		     NULL),
+ 	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
+ 		     mlx5_ib_stage_rep_non_default_cb,
+ 		     NULL),
+ 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
+ 		     mlx5_ib_stage_rep_roce_init,
+ 		     mlx5_ib_stage_rep_roce_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
+ 		     mlx5_ib_stage_dev_res_init,
+ 		     mlx5_ib_stage_dev_res_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
+ 		     mlx5_ib_stage_counters_init,
+ 		     mlx5_ib_stage_counters_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_UAR,
+ 		     mlx5_ib_stage_uar_init,
+ 		     mlx5_ib_stage_uar_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
+ 		     mlx5_ib_stage_bfrag_init,
+ 		     mlx5_ib_stage_bfrag_cleanup),
+ 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
+ 		     mlx5_ib_stage_ib_reg_init,
+ 		     mlx5_ib_stage_ib_reg_cleanup),
 -	STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES,
 -		     mlx5_ib_stage_umr_res_init,
 -		     mlx5_ib_stage_umr_res_cleanup),
++	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
++		     mlx5_ib_stage_post_ib_reg_umr_init,
++		     NULL),
+ 	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
+ 		     mlx5_ib_stage_class_attr_init,
+ 		     NULL),
+ 	STAGE_CREATE(MLX5_IB_STAGE_REP_REG,
+ 		     mlx5_ib_stage_rep_reg_init,
+ 		     mlx5_ib_stage_rep_reg_cleanup),
+ };
+ 
  static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num)
  {
  	struct mlx5_ib_multiport_info *mpi;

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply related

* [PATCH v2 0/6] Eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich; +Cc: Sinan Kaya, linux-arm-msm, linux-arm-kernel

Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

This results in the CPU observing two barriers back to back before executing
the register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

I did a regex search for wmb() followed by writel() in each drivers
directory.
I scrubbed the ones I care about and posted this series. Note also that
I have one Infiniband patch in the series.

I considered "ease of change", "popular usage" and "performance critical
path" as the determining criteria for my filtering.

We used relaxed API heavily on ARM for a long time but
it did not exist on other architectures. For this reason, relaxed
architectures have been paying double penalty in order to use the common
drivers.

Now that relaxed API is present on all architectures, we can go and scrub
all drivers to see what needs to change and what can remain.

We start with the most used ones and hope to increase the coverage over time.
It will take a while to cover all drivers.

Changes since v1:

i40e/i40evf: Eliminate duplicate barriers on weakly-ordered archs
missed writel calls in:
i40e:
  i40e_program_fdir_filter
  i40e_clean_rx_irq
  i40e_tx_map
i40evf:
  i40e_clean_rx_irq
  i40e_tx_map

ixgbe: eliminate duplicate barriers on weakly-ordered archs
missed the writel at the end of ixgbe_tx_map

RDMA/qedr: eliminate duplicate barriers on weakly-ordered archs
dropped since applied

igbvf: eliminate duplicate barriers on weakly-ordered archs
missed the writel at the end of igbvf_tx_queue_adv()

igb: eliminate duplicate barriers on weakly-ordered archs
missed the writel at the end of igb_tx_map()

e1000: eliminate duplicate barriers on weakly-ordered archs
dropped

ixgbevf: eliminate duplicate barriers on weakly-ordered archs
split into two and remove extra barrier.

Sinan Kaya (6):
  i40e/i40evf: Eliminate duplicate barriers on weakly-ordered archs
  ixgbe: eliminate duplicate barriers on weakly-ordered archs
  igbvf: eliminate duplicate barriers on weakly-ordered archs
  igb: eliminate duplicate barriers on weakly-ordered archs
  ixgbevf: keep writel() closer to wmb()
  ixgbevf: eliminate duplicate barriers on weakly-ordered archs

 drivers/net/ethernet/intel/i40e/i40e_txrx.c       | 8 ++++----
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c     | 4 ++--
 drivers/net/ethernet/intel/igb/igb_main.c         | 4 ++--
 drivers/net/ethernet/intel/igbvf/netdev.c         | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 8 ++++----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      | 5 -----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++--
 7 files changed, 16 insertions(+), 21 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v2 1/6] i40e/i40evf: Eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

This results in the CPU observing two barriers back to back before executing
the register write.

Since code already has an explicit barrier call, changing writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 8 ++++----
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index e554aa6cf..9455869 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -185,7 +185,7 @@ static int i40e_program_fdir_filter(struct i40e_fdir_filter *fdir_data,
 	/* Mark the data descriptor to be watched */
 	first->next_to_watch = tx_desc;
 
-	writel(tx_ring->next_to_use, tx_ring->tail);
+	writel_relaxed(tx_ring->next_to_use, tx_ring->tail);
 	return 0;
 
 dma_fail:
@@ -1375,7 +1375,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
 	 * such as IA-64).
 	 */
 	wmb();
-	writel(val, rx_ring->tail);
+	writel_relaxed(val, rx_ring->tail);
 }
 
 /**
@@ -2258,7 +2258,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		 */
 		wmb();
 
-		writel(xdp_ring->next_to_use, xdp_ring->tail);
+		writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail);
 	}
 
 	rx_ring->skb = skb;
@@ -3286,7 +3286,7 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 
 	/* notify HW of packet */
 	if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
-		writel(i, tx_ring->tail);
+		writel_relaxed(i, tx_ring->tail);
 
 		/* we need this if more than one processor can write to our tail
 		 * at a time, it synchronizes IO on IA64/Altix systems
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 357d605..56eea20 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -667,7 +667,7 @@ static inline void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val)
 	 * such as IA-64).
 	 */
 	wmb();
-	writel(val, rx_ring->tail);
+	writel_relaxed(val, rx_ring->tail);
 }
 
 /**
@@ -2243,7 +2243,7 @@ static inline void i40evf_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 
 	/* notify HW of packet */
 	if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
-		writel(i, tx_ring->tail);
+		writel_relaxed(i, tx_ring->tail);
 
 		/* we need this if more than one processor can write to our tail
 		 * at a time, it synchronizes IO on IA64/Altix systems
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 2/6] ixgbe: eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

As a result, the CPU observes two barriers back to back before executing
the register write.

Since the code already has an explicit barrier call, change writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0da5aa2..58ed70f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1692,7 +1692,7 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 		 * such as IA-64).
 		 */
 		wmb();
-		writel(i, rx_ring->tail);
+		writel_relaxed(i, rx_ring->tail);
 	}
 }
 
@@ -2453,7 +2453,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		 * know there are new descriptors to fetch.
 		 */
 		wmb();
-		writel(ring->next_to_use, ring->tail);
+		writel_relaxed(ring->next_to_use, ring->tail);
 
 		xdp_do_flush_map();
 	}
@@ -8078,7 +8078,7 @@ static int ixgbe_tx_map(struct ixgbe_ring *tx_ring,
 	ixgbe_maybe_stop_tx(tx_ring, DESC_NEEDED);
 
 	if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
-		writel(i, tx_ring->tail);
+		writel_relaxed(i, tx_ring->tail);
 
 		/* we need this if more than one processor can write to our tail
 		 * at a time, it synchronizes IO on IA64/Altix systems
@@ -10014,7 +10014,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 	 * are new descriptors to fetch.
 	 */
 	wmb();
-	writel(ring->next_to_use, ring->tail);
+	writel_relaxed(ring->next_to_use, ring->tail);
 
 	return;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 3/6] igbvf: eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

As a result, the CPU observes two barriers back to back before executing
the register write.

Since the code already has an explicit barrier call, change writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/igbvf/netdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index 4214c15..edb1c34 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -251,7 +251,7 @@ static void igbvf_alloc_rx_buffers(struct igbvf_ring *rx_ring,
 		 * such as IA-64).
 		*/
 		wmb();
-		writel(i, adapter->hw.hw_addr + rx_ring->tail);
+		writel_relaxed(i, adapter->hw.hw_addr + rx_ring->tail);
 	}
 }
 
@@ -2297,7 +2297,7 @@ static inline void igbvf_tx_queue_adv(struct igbvf_adapter *adapter,
 
 	tx_ring->buffer_info[first].next_to_watch = tx_desc;
 	tx_ring->next_to_use = i;
-	writel(i, adapter->hw.hw_addr + tx_ring->tail);
+	writel_relaxed(i, adapter->hw.hw_addr + tx_ring->tail);
 	/* we need this if more than one processor can write to our tail
 	 * at a time, it synchronizes IO on IA64/Altix systems
 	 */
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 4/6] igb: eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Code includes wmb() followed by writel(). writel() already has a barrier
on some architectures like arm64.

As a result, the CPU observes two barriers back to back before executing
the register write.

Since the code already has an explicit barrier call, change writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index b88fae7..82aea92 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5671,7 +5671,7 @@ static int igb_tx_map(struct igb_ring *tx_ring,
 	igb_maybe_stop_tx(tx_ring, DESC_NEEDED);
 
 	if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
-		writel(i, tx_ring->tail);
+		writel_relaxed(i, tx_ring->tail);
 
 		/* we need this if more than one processor can write to our tail
 		 * at a time, it synchronizes IO on IA64/Altix systems
@@ -8072,7 +8072,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 		 * such as IA-64).
 		 */
 		wmb();
-		writel(i, rx_ring->tail);
+		writel_relaxed(i, rx_ring->tail);
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 5/6] ixgbevf: keep writel() closer to wmb()
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Remove ixgbevf_write_tail() in favor of moving writel() close to
wmb().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      | 5 -----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++--
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index f695242..11e893e 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -244,11 +244,6 @@ static inline u16 ixgbevf_desc_unused(struct ixgbevf_ring *ring)
 	return ((ntc > ntu) ? 0 : ring->count) + ntc - ntu - 1;
 }
 
-static inline void ixgbevf_write_tail(struct ixgbevf_ring *ring, u32 value)
-{
-	writel(value, ring->tail);
-}
-
 #define IXGBEVF_RX_DESC(R, i)	\
 	(&(((union ixgbe_adv_rx_desc *)((R)->desc))[i]))
 #define IXGBEVF_TX_DESC(R, i)	\
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 9b3d43d..b65f691 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -659,7 +659,7 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
 		 * such as IA-64).
 		 */
 		wmb();
-		ixgbevf_write_tail(rx_ring, i);
+		writel(i, rx_ring->tail);
 	}
 }
 
@@ -3644,7 +3644,7 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 	tx_ring->next_to_use = i;
 
 	/* notify HW of packet */
-	ixgbevf_write_tail(tx_ring, i);
+	writel(value, tx_ring->tail);
 
 	return;
 dma_error:
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 6/6] ixgbevf: eliminate duplicate barriers on weakly-ordered archs
From: Sinan Kaya @ 2018-03-16  1:04 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Jeff Kirsher,
	intel-wired-lan, linux-kernel
In-Reply-To: <1521162296-19729-1-git-send-email-okaya@codeaurora.org>

Code includes wmb() followed by writel() in multiple places. writel()
already has a barrier on some architectures like arm64.

As a result, the CPU observes two barriers back to back before executing
the register write.

Since the code already has an explicit barrier call, change writel() to
writel_relaxed().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index b65f691..9e2e0fd 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3644,7 +3644,7 @@ static void ixgbevf_tx_map(struct ixgbevf_ring *tx_ring,
 	tx_ring->next_to_use = i;
 
 	/* notify HW of packet */
-	writel(value, tx_ring->tail);
+	writel_relaxed(value, tx_ring->tail);
 
 	return;
 dma_error:
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v2 5/6] ixgbevf: keep writel() closer to wmb()
From: Sinan Kaya @ 2018-03-16  1:13 UTC (permalink / raw)
  To: netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, Jeff Kirsher, intel-wired-lan,
	linux-kernel
In-Reply-To: <1521162296-19729-6-git-send-email-okaya@codeaurora.org>

On 3/15/2018 9:04 PM, Sinan Kaya wrote:
>  	/* notify HW of packet */
> -	ixgbevf_write_tail(tx_ring, i);
> +	writel(value, tx_ring->tail);
>  

Oops, copy-paste mistake: the first argument should be 'i', not 'value'.

I'll hold off on posting v3 until I hear more feedback.

-- 
Sinan Kaya
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the rdma-fixes tree
From: Doug Ledford @ 2018-03-16  1:18 UTC (permalink / raw)
  To: Stephen Rothwell, David Miller, Networking, Jason Gunthorpe
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Mark Bloch,
	Leon Romanovsky
In-Reply-To: <20180316115610.3d7f232a@canb.auug.org.au>

[-- Attachment #1: Type: text/plain, Size: 4229 bytes --]

On Fri, 2018-03-16 at 11:56 +1100, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the net-next tree got a conflict in:
> 
>   drivers/infiniband/hw/mlx5/main.c
> 
> between commit:
> 
>   42cea83f9524 ("IB/mlx5: Fix cleanup order on unload")
> 
> from the rdma-fixes tree and commit:
> 
>   b5ca15ad7e61 ("IB/mlx5: Add proper representors support")
> 
> from the net-next tree.

We are aware of the merge conflict.  This is a result of the fact that
code had been submitted to the for-next area (the representors support)
and after that an issue was found by the syzkaller bot that deserved rc
fix status and which conflicted.  The fixup you list below is
insufficient to fix the merge conflict.  The full fixup can be found in
the rdma tree from where I merged the for-rc branch into the for-next
branch and created a complete fixup of the merge conflict.  The problem
is that one patch changes the device init stage flow, while the other
patch duplicates the normal device init stage flow to the representor
device stage flow.  To resolve the conflict, you not only have to resolve
the contextual diffs, but you also have to duplicate the changes to the normal
device stage flow into the representor device stage flow.  It is very
far from a trivial merge.  We were planning on talking to Dave about
this issue tomorrow, but you beat us to raising the issue ;-).

Here's the commit (from the rdma git repo) with the proper merge fix
(although it also has other minor merge stuff that needs to be ignored):

2d873449a202 (Merge branch 'k.o/wip/dl-for-rc' into k.o/wip/dl-for-next)

> I fixed it up (see below and the merge fix patch as well) and can
> carry the fix as necessary. This is now fixed as far as linux-next is
> concerned, but any non trivial conflicts should be mentioned to your
> upstream maintainer when your tree is submitted for merging.  You may
> also want to consider cooperating with the maintainer of the conflicting
> tree to minimise any particularly complex conflicts.
> 
> From: Stephen Rothwell <sfr@canb.auug.org.au>
> Date: Fri, 16 Mar 2018 11:54:01 +1100
> Subject: [PATCH] IB/mlx5: merge fix for "Fix cleanup order on unload"
> 
> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
> ---
>  drivers/infiniband/hw/mlx5/ib_rep.c  | 6 +++---
>  drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +--
>  2 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
> index 61cc3d7db257..7fb997dadd80 100644
> --- a/drivers/infiniband/hw/mlx5/ib_rep.c
> +++ b/drivers/infiniband/hw/mlx5/ib_rep.c
> @@ -33,9 +33,9 @@ static const struct mlx5_ib_profile rep_profile = {
>  	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
>  		     mlx5_ib_stage_ib_reg_init,
>  		     mlx5_ib_stage_ib_reg_cleanup),
> -	STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES,
> -		     mlx5_ib_stage_umr_res_init,
> -		     mlx5_ib_stage_umr_res_cleanup),
> +	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
> +		     mlx5_ib_stage_post_ib_reg_umr_init,
> +		     NULL),
>  	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
>  		     mlx5_ib_stage_class_attr_init,
>  		     NULL),
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index 7ec753ec7962..c45a7abdbe3e 100644
> --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -1071,8 +1071,7 @@ int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev);
>  void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev);
>  int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev);
>  void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev);
> -int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev);
> -void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev);
> +int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev);
>  int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev);
>  void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
>  		      const struct mlx5_ib_profile *profile,
> -- 
> 2.16.1
> 

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox