Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 06/10] net: hns3: Add some interface for the support of DCB feature
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

This patch adds some new interfaces and exports some existing
ones from hclge_tm and hclge_main to support the upcoming DCB
feature.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    |  3 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  3 ++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 48 ++++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  6 +++
 4 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c27b460..49a11d5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -30,7 +30,6 @@
 #define HCLGE_64BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_64_bit_stats, f))
 #define HCLGE_32BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_32_bit_stats, f))
 
-static int hclge_rss_init_hw(struct hclge_dev *hdev);
 static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
 				     enum hclge_mta_dmac_sel_type mta_mac_sel,
 				     bool enable);
@@ -2660,7 +2659,7 @@ static int hclge_get_tc_size(struct hnae3_handle *handle)
 	return hdev->rss_size_max;
 }
 
-static int hclge_rss_init_hw(struct hclge_dev *hdev)
+int hclge_rss_init_hw(struct hclge_dev *hdev)
 {
 	const  u8 hfunc = HCLGE_RSS_HASH_ALGO_TOEPLITZ;
 	struct hclge_vport *vport = hdev->vport;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 4fc36f0..394b587 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -515,4 +515,7 @@ static inline int hclge_get_queue_id(struct hnae3_queue *queue)
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex);
 int hclge_set_vf_vlan_common(struct hclge_dev *vport, int vfid,
 			     bool is_kill, u16 vlan, u8 qos, __be16 proto);
+
+int hclge_buffer_alloc(struct hclge_dev *hdev);
+int hclge_rss_init_hw(struct hclge_dev *hdev);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 2bc7d63c..e158e66 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -884,10 +884,14 @@ static int hclge_tm_pri_dwrr_cfg(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tm_map_cfg(struct hclge_dev *hdev)
+int hclge_tm_map_cfg(struct hclge_dev *hdev)
 {
 	int ret;
 
+	ret = hclge_up_to_tc_map(hdev);
+	if (ret)
+		return ret;
+
 	ret = hclge_tm_pg_to_pri_map(hdev);
 	if (ret)
 		return ret;
@@ -995,7 +999,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tm_schd_mode_hw(struct hclge_dev *hdev)
+int hclge_tm_schd_mode_hw(struct hclge_dev *hdev)
 {
 	int ret;
 
@@ -1093,7 +1097,45 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev)
 			return ret;
 	}
 
-	return hclge_up_to_tc_map(hdev);
+	return 0;
+}
+
+int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc)
+{
+	struct hclge_vport *vport = hdev->vport;
+	struct hnae3_knic_private_info *kinfo;
+	u32 i, k;
+
+	for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) {
+		if (prio_tc[i] >= hdev->tm_info.num_tc)
+			return -EINVAL;
+		hdev->tm_info.prio_tc[i] = prio_tc[i];
+
+		for (k = 0;  k < hdev->num_alloc_vport; k++) {
+			kinfo = &vport[k].nic.kinfo;
+			kinfo->prio_tc[i] = prio_tc[i];
+		}
+	}
+	return 0;
+}
+
+void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc)
+{
+	u8 i, bit_map = 0;
+
+	hdev->tm_info.num_tc = num_tc;
+
+	for (i = 0; i < hdev->tm_info.num_tc; i++)
+		bit_map |= BIT(i);
+
+	if (!bit_map) {
+		bit_map = 1;
+		hdev->tm_info.num_tc = 1;
+	}
+
+	hdev->hw_tc_map = bit_map;
+
+	hclge_tm_schd_info_init(hdev);
 }
 
 int hclge_tm_init_hw(struct hclge_dev *hdev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 19a01e4..bf59961 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -112,4 +112,10 @@ struct hclge_port_shapping_cmd {
 
 int hclge_tm_schd_init(struct hclge_dev *hdev);
 int hclge_pause_setup_hw(struct hclge_dev *hdev);
+int hclge_tm_schd_mode_hw(struct hclge_dev *hdev);
+int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc);
+void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc);
+int hclge_tm_dwrr_cfg(struct hclge_dev *hdev);
+int hclge_tm_map_cfg(struct hclge_dev *hdev);
+int hclge_tm_init_hw(struct hclge_dev *hdev);
 #endif
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 05/10] net: hns3: Add tc-based TM support for sriov enabled port
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

When SR-IOV is enabled and TM is in tc-based mode, the VFs' TM
parameters are not set during the TM initialization process.
This patch adds tc-based TM support for SR-IOV enabled ports,
using the information in the vport struct.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 49 ++++++++++++++--------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 33090d0..2bc7d63c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -389,13 +389,13 @@ static int hclge_tm_pri_schd_mode_cfg(struct hclge_dev *hdev, u8 pri_id)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
-static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id)
+static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id, u8 mode)
 {
 	struct hclge_desc desc;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_QS_SCH_MODE_CFG, false);
 
-	if (hdev->tm_info.tc_info[qs_id].tc_sch_mode == HCLGE_SCH_MODE_DWRR)
+	if (mode == HCLGE_SCH_MODE_DWRR)
 		desc.data[1] = cpu_to_le32(HCLGE_TM_TX_SCHD_DWRR_MSK);
 	else
 		desc.data[1] = 0;
@@ -639,17 +639,18 @@ static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev)
 {
 	struct hclge_vport *vport = hdev->vport;
 	int ret;
-	u32 i;
+	u32 i, k;
 
 	if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) {
 		/* Cfg qs -> pri mapping, one by one mapping */
-		for (i = 0; i < hdev->tm_info.num_tc; i++) {
-			ret = hclge_tm_qs_to_pri_map_cfg(hdev, i, i);
-			if (ret)
-				return ret;
-		}
+		for (k = 0; k < hdev->num_alloc_vport; k++)
+			for (i = 0; i < hdev->tm_info.num_tc; i++) {
+				ret = hclge_tm_qs_to_pri_map_cfg(
+					hdev, vport[k].qs_offset + i, i);
+				if (ret)
+					return ret;
+			}
 	} else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) {
-		int k;
 		/* Cfg qs -> pri mapping,  qs = tc, pri = vf, 8 qs -> 1 pri */
 		for (k = 0; k < hdev->num_alloc_vport; k++)
 			for (i = 0; i < HNAE3_MAX_TC; i++) {
@@ -798,10 +799,11 @@ static int hclge_tm_pri_shaper_cfg(struct hclge_dev *hdev)
 
 static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev)
 {
+	struct hclge_vport *vport = hdev->vport;
 	struct hclge_pg_info *pg_info;
 	u8 dwrr;
 	int ret;
-	u32 i;
+	u32 i, k;
 
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
 		pg_info =
@@ -812,9 +814,13 @@ static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev)
 		if (ret)
 			return ret;
 
-		ret = hclge_tm_qs_weight_cfg(hdev, i, dwrr);
-		if (ret)
-			return ret;
+		for (k = 0; k < hdev->num_alloc_vport; k++) {
+			ret = hclge_tm_qs_weight_cfg(
+				hdev, vport[k].qs_offset + i,
+				vport[k].dwrr);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -945,7 +951,10 @@ static int hclge_tm_schd_mode_vnet_base_cfg(struct hclge_vport *vport)
 		return ret;
 
 	for (i = 0; i < kinfo->num_tc; i++) {
-		ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i);
+		u8 sch_mode = hdev->tm_info.tc_info[i].tc_sch_mode;
+
+		ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i,
+						sch_mode);
 		if (ret)
 			return ret;
 	}
@@ -957,7 +966,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 {
 	struct hclge_vport *vport = hdev->vport;
 	int ret;
-	u8 i;
+	u8 i, k;
 
 	if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) {
 		for (i = 0; i < hdev->tm_info.num_tc; i++) {
@@ -965,9 +974,13 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev)
 			if (ret)
 				return ret;
 
-			ret = hclge_tm_qs_schd_mode_cfg(hdev, i);
-			if (ret)
-				return ret;
+			for (k = 0; k < hdev->num_alloc_vport; k++) {
+				ret = hclge_tm_qs_schd_mode_cfg(
+					hdev, vport[k].qs_offset + i,
+					HCLGE_SCH_MODE_DWRR);
+				if (ret)
+					return ret;
+			}
 		}
 	} else {
 		for (i = 0; i < hdev->num_alloc_vport; i++) {
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 04/10] net: hns3: Add support for port shaper setting in TM module
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

This patch adds a tm_port_shaper cmd and sets the port shaper
to HCLGE_ETHER_MAX_RATE during the TM initialization process.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 33 ++++++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  4 +++
 2 files changed, 37 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 0b4b5d9..33090d0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -301,6 +301,35 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
+static int hclge_tm_port_shaper_cfg(struct hclge_dev *hdev)
+{
+	struct hclge_port_shapping_cmd *shap_cfg_cmd;
+	struct hclge_desc desc;
+	u8 ir_u, ir_b, ir_s;
+	int ret;
+
+	ret = hclge_shaper_para_calc(HCLGE_ETHER_MAX_RATE,
+				     HCLGE_SHAPER_LVL_PORT,
+				     &ir_b, &ir_u, &ir_s);
+	if (ret)
+		return ret;
+
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_PORT_SHAPPING, false);
+	shap_cfg_cmd = (struct hclge_port_shapping_cmd *)desc.data;
+
+	hclge_tm_set_field(shap_cfg_cmd->port_shapping_para, IR_B, ir_b);
+	hclge_tm_set_field(shap_cfg_cmd->port_shapping_para, IR_U, ir_u);
+	hclge_tm_set_field(shap_cfg_cmd->port_shapping_para, IR_S, ir_s);
+	hclge_tm_set_field(shap_cfg_cmd->port_shapping_para,
+			   BS_B, HCLGE_SHAPER_BS_U_DEF);
+	hclge_tm_set_field(shap_cfg_cmd->port_shapping_para,
+			   BS_S, HCLGE_SHAPER_BS_S_DEF);
+	shap_cfg_cmd->port_shapping_para =
+				cpu_to_le32(shap_cfg_cmd->port_shapping_para);
+
+	return hclge_cmd_send(&hdev->hw, &desc, 1);
+}
+
 static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
 				     enum hclge_shap_bucket bucket, u8 pri_id,
 				     u8 ir_b, u8 ir_u, u8 ir_s,
@@ -864,6 +893,10 @@ static int hclge_tm_shaper_cfg(struct hclge_dev *hdev)
 {
 	int ret;
 
+	ret = hclge_tm_port_shaper_cfg(hdev);
+	if (ret)
+		return ret;
+
 	ret = hclge_tm_pg_shaper_cfg(hdev);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 8ecd83c..19a01e4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -99,6 +99,10 @@ struct hclge_pfc_en_cmd {
 	u8 pri_en_bitmap;
 };
 
+struct hclge_port_shapping_cmd {
+	__le32 port_shapping_para;
+};
+
 #define hclge_tm_set_field(dest, string, val) \
 			hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \
 				       (HCLGE_TM_SHAP_##string##_LSH), val)
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 03/10] net: hns3: Add support for PFC setting in TM module
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

This patch adds a pfc_pause_en cmd and uses it to configure
the PFC option according to fc_mode in hdev->tm_info.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 68 ++++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  5 ++
 2 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 73a75d7..0b4b5d9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -124,6 +124,20 @@ static int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
+static int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap,
+				  u8 pfc_bitmap)
+{
+	struct hclge_desc desc;
+	struct hclge_pfc_en_cmd *pfc = (struct hclge_pfc_en_cmd *)&desc.data;
+
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PFC_PAUSE_EN, false);
+
+	pfc->tx_rx_en_bitmap = tx_rx_bitmap;
+	pfc->pri_en_bitmap = pfc_bitmap;
+
+	return hclge_cmd_send(&hdev->hw, &desc, 1);
+}
+
 static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id)
 {
 	u8 tc;
@@ -969,20 +983,64 @@ static int hclge_tm_schd_setup_hw(struct hclge_dev *hdev)
 	return hclge_tm_schd_mode_hw(hdev);
 }
 
+static int hclge_pfc_setup_hw(struct hclge_dev *hdev)
+{
+	u8 enable_bitmap = 0;
+
+	if (hdev->tm_info.fc_mode == HCLGE_FC_PFC)
+		enable_bitmap = HCLGE_TX_MAC_PAUSE_EN_MSK |
+				HCLGE_RX_MAC_PAUSE_EN_MSK;
+
+	return hclge_pfc_pause_en_cfg(hdev, enable_bitmap,
+				      hdev->tm_info.hw_pfc_map);
+}
+
+static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
+{
+	bool tx_en, rx_en;
+
+	switch (hdev->tm_info.fc_mode) {
+	case HCLGE_FC_NONE:
+		tx_en = false;
+		rx_en = false;
+		break;
+	case HCLGE_FC_RX_PAUSE:
+		tx_en = false;
+		rx_en = true;
+		break;
+	case HCLGE_FC_TX_PAUSE:
+		tx_en = true;
+		rx_en = false;
+		break;
+	case HCLGE_FC_FULL:
+		tx_en = true;
+		rx_en = true;
+		break;
+	default:
+		tx_en = true;
+		rx_en = true;
+	}
+
+	return hclge_mac_pause_en_cfg(hdev, tx_en, rx_en);
+}
+
 int hclge_pause_setup_hw(struct hclge_dev *hdev)
 {
-	bool en = hdev->tm_info.fc_mode != HCLGE_FC_PFC;
 	int ret;
 	u8 i;
 
-	ret = hclge_mac_pause_en_cfg(hdev, en, en);
-	if (ret)
-		return ret;
+	if (hdev->tm_info.fc_mode != HCLGE_FC_PFC)
+		return hclge_mac_pause_setup_hw(hdev);
 
-	/* Only DCB-supported dev supports qset back pressure setting */
+	/* Only DCB-supported dev supports qset back pressure and pfc cmd */
 	if (!hnae3_dev_dcb_supported(hdev))
 		return 0;
 
+	/* When MAC is GE Mode, hdev does not support pfc setting */
+	ret = hclge_pfc_setup_hw(hdev);
+	if (ret)
+		dev_warn(&hdev->pdev->dev, "set pfc pause failed:%d\n", ret);
+
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
 		ret = hclge_tm_qs_bp_cfg(hdev, i);
 		if (ret)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 85158b0..8ecd83c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -94,6 +94,11 @@ struct hclge_bp_to_qs_map_cmd {
 	u32 rsvd1;
 };
 
+struct hclge_pfc_en_cmd {
+	u8 tx_rx_en_bitmap;
+	u8 pri_en_bitmap;
+};
+
 #define hclge_tm_set_field(dest, string, val) \
 			hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \
 				       (HCLGE_TM_SHAP_##string##_LSH), val)
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 02/10] net: hns3: Add support for dynamically buffer reallocation
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

Currently, buffer allocation can only happen at init. When
doing buffer reallocation after init, care must be taken with
the memory that priv_buf points to.
This patch fixes it by using dynamically allocated temporary
memory. Because buffer reallocation is only done at init or
when setting up the DCB parameters, and priv_buf is only used
during the buffer allocation process, it is OK to use
dynamically allocated temporary memory.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   5 +
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 150 +++++++++++----------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   2 -
 3 files changed, 87 insertions(+), 70 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index a81c6cb..6b6d28e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -322,6 +322,11 @@ struct hclge_shared_buf {
 	u32 buf_size;
 };
 
+struct hclge_pkt_buf_alloc {
+	struct hclge_priv_buf priv_buf[HCLGE_MAX_TC_NUM];
+	struct hclge_shared_buf s_buf;
+};
+
 #define HCLGE_RX_COM_WL_EN_B	15
 struct hclge_rx_com_wl_buf {
 	__le16 high_wl;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index dfe0fd2..c27b460 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1324,7 +1324,8 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
+static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
@@ -1340,7 +1341,7 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
 	for (i = 0; i < HCLGE_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		buf_size = priv->tx_buf_size;
 		req->tx_pkt_buff[i] =
 			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
@@ -1357,9 +1358,10 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tx_buffer_alloc(struct hclge_dev *hdev)
+static int hclge_tx_buffer_alloc(struct hclge_dev *hdev,
+				 struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	int ret = hclge_cmd_alloc_tx_buff(hdev);
+	int ret = hclge_cmd_alloc_tx_buff(hdev, buf_alloc);
 
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
@@ -1392,13 +1394,14 @@ static int hclge_get_pfc_enalbe_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc enabled TCs, which have private buffer */
-static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_pfc_priv_num(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if ((hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
 			cnt++;
@@ -1408,13 +1411,14 @@ static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc disabled TCs, which have private buffer */
-static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev,
+				     struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
@@ -1424,33 +1428,35 @@ static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
 	return cnt;
 }
 
-static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev)
+static u32 hclge_get_rx_priv_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	u32 rx_priv = 0;
 	int i;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (priv->enable)
 			rx_priv += priv->buf_size;
 	}
 	return rx_priv;
 }
 
-static u32 hclge_get_tx_buff_alloced(struct hclge_dev *hdev)
+static u32 hclge_get_tx_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	u32 tx_buf = 0, i;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		tx_buf += priv->tx_buf_size;
 	}
 	return tx_buf;
 }
 
-static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
+static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc,
+				u32 rx_all)
 {
 	u32 shared_buf_min, shared_buf_tc, shared_std;
 	int tc_num, pfc_enable_num;
@@ -1471,30 +1477,31 @@ static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
 			hdev->mps;
 	shared_std = max_t(u32, shared_buf_min, shared_buf_tc);
 
-	rx_priv = hclge_get_rx_priv_buff_alloced(hdev);
+	rx_priv = hclge_get_rx_priv_buff_alloced(buf_alloc);
 	if (rx_all <= rx_priv + shared_std)
 		return false;
 
 	shared_buf = rx_all - rx_priv;
-	hdev->s_buf.buf_size = shared_buf;
-	hdev->s_buf.self.high = shared_buf;
-	hdev->s_buf.self.low =  2 * hdev->mps;
+	buf_alloc->s_buf.buf_size = shared_buf;
+	buf_alloc->s_buf.self.high = shared_buf;
+	buf_alloc->s_buf.self.low =  2 * hdev->mps;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
 		if ((hdev->hw_tc_map & BIT(i)) &&
 		    (hdev->tm_info.hw_pfc_map & BIT(i))) {
-			hdev->s_buf.tc_thrd[i].low = hdev->mps;
-			hdev->s_buf.tc_thrd[i].high = 2 * hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].high = 2 * hdev->mps;
 		} else {
-			hdev->s_buf.tc_thrd[i].low = 0;
-			hdev->s_buf.tc_thrd[i].high = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = 0;
+			buf_alloc->s_buf.tc_thrd[i].high = hdev->mps;
 		}
 	}
 
 	return true;
 }
 
-static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
+static int hclge_tx_buffer_calc(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	u32 i, total_size;
@@ -1503,7 +1510,7 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
 
 	/* alloc tx buffer for all enabled tc */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (total_size < HCLGE_DEFAULT_TX_BUF)
 			return -ENOMEM;
@@ -1521,22 +1528,24 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
 
 /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs
  * @hdev: pointer to struct hclge_dev
+ * @buf_alloc: pointer to buffer calculation data
  * @return: 0: calculate sucessful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev)
+int hclge_rx_buffer_calc(struct hclge_dev *hdev,
+			 struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	u32 rx_all = hdev->pkt_buf_size;
 	int no_pfc_priv_num, pfc_priv_num;
 	struct hclge_priv_buf *priv;
 	int i;
 
-	rx_all -= hclge_get_tx_buff_alloced(hdev);
+	rx_all -= hclge_get_tx_buff_alloced(buf_alloc);
 
 	/* When DCB is not supported, rx private
 	 * buffer is not allocated.
 	 */
 	if (!hnae3_dev_dcb_supported(hdev)) {
-		if (!hclge_is_rx_buf_ok(hdev, rx_all))
+		if (!hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 			return -ENOMEM;
 
 		return 0;
@@ -1544,7 +1553,7 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 
 	/* step 1, try to alloc private buffer for all enabled tc */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i)) {
 			priv->enable = 1;
 			if (hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1565,14 +1574,14 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 2, try to decrease the buffer size of
 	 * no pfc TC's private buffer
 	 */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		priv->enable = 0;
 		priv->wl.low = 0;
@@ -1595,18 +1604,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 3, try to reduce the number of pfc disabled TCs,
 	 * which have private buffer
 	 */
 	/* get the total no pfc enable TC number, which have private buffer */
-	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev);
+	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i))) {
@@ -1618,22 +1627,22 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 			no_pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    no_pfc_priv_num == 0)
 			break;
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 4, try to reduce the number of pfc enabled TCs
 	 * which have private buffer.
 	 */
-	pfc_priv_num = hclge_get_pfc_priv_num(hdev);
+	pfc_priv_num = hclge_get_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1645,17 +1654,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 			pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    pfc_priv_num == 0)
 			break;
 	}
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	return -ENOMEM;
 }
 
-static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
+static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_buff *req;
 	struct hclge_desc desc;
@@ -1667,7 +1677,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 	/* Alloc private buffer TCs */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		struct hclge_priv_buf *priv = &hdev->priv_buf[i];
+		struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i];
 
 		req->buf_num[i] =
 			cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S);
@@ -1676,7 +1686,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 	}
 
 	req->shared_buf =
-		cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
+		cpu_to_le16((buf_alloc->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
 			    (1 << HCLGE_TC0_PRI_BUF_EN_B));
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -1691,7 +1701,8 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 #define HCLGE_PRIV_ENABLE(a) ((a) > 0 ? 1 : 0)
 
-static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
+static int hclge_rx_priv_wl_config(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_wl_buf *req;
 	struct hclge_priv_buf *priv;
@@ -1711,7 +1722,9 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 			desc[i].flag &= ~cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
 
 		for (j = 0; j < HCLGE_TC_NUM_ONE_DESC; j++) {
-			priv = &hdev->priv_buf[i * HCLGE_TC_NUM_ONE_DESC + j];
+			u32 idx = i * HCLGE_TC_NUM_ONE_DESC + j;
+
+			priv = &buf_alloc->priv_buf[idx];
 			req->tc_wl[j].high =
 				cpu_to_le16(priv->wl.high >> HCLGE_BUF_UNIT_S);
 			req->tc_wl[j].high |=
@@ -1736,9 +1749,10 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_thrd_config(struct hclge_dev *hdev)
+static int hclge_common_thrd_config(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *s_buf = &hdev->s_buf;
+	struct hclge_shared_buf *s_buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_thrd *req;
 	struct hclge_desc desc[2];
 	struct hclge_tc_thrd *tc;
@@ -1782,9 +1796,10 @@ static int hclge_common_thrd_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_wl_config(struct hclge_dev *hdev)
+static int hclge_common_wl_config(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *buf = &hdev->s_buf;
+	struct hclge_shared_buf *buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_wl *req;
 	struct hclge_desc desc;
 	int ret;
@@ -1814,69 +1829,68 @@ static int hclge_common_wl_config(struct hclge_dev *hdev)
 
 int hclge_buffer_alloc(struct hclge_dev *hdev)
 {
+	struct hclge_pkt_buf_alloc *pkt_buf;
 	int ret;
 
-	hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM,
-					    sizeof(struct hclge_priv_buf),
-					    GFP_KERNEL | __GFP_ZERO);
-	if (!hdev->priv_buf)
+	pkt_buf = kzalloc(sizeof(*pkt_buf), GFP_KERNEL);
+	if (!pkt_buf)
 		return -ENOMEM;
 
-	ret = hclge_tx_buffer_calc(hdev);
+	ret = hclge_tx_buffer_calc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc tx buffer size for all TCs %d\n", ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_tx_buffer_alloc(hdev);
+	ret = hclge_tx_buffer_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not alloc tx buffers %d\n", ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_buffer_calc(hdev);
+	ret = hclge_rx_buffer_calc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc rx priv buffer size for all TCs %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_priv_buf_alloc(hdev);
+	ret = hclge_rx_priv_buf_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev, "could not alloc rx priv buffer %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
 	if (hnae3_dev_dcb_supported(hdev)) {
-		ret = hclge_rx_priv_wl_config(hdev);
+		ret = hclge_rx_priv_wl_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure rx private waterline %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 
-		ret = hclge_common_thrd_config(hdev);
+		ret = hclge_common_thrd_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure common threshold %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 	}
 
-	ret = hclge_common_wl_config(hdev);
-	if (ret) {
+	ret = hclge_common_wl_config(hdev, pkt_buf);
+	if (ret)
 		dev_err(&hdev->pdev->dev,
 			"could not configure common waterline %d\n", ret);
-		return ret;
-	}
 
-	return 0;
+out:
+	kfree(pkt_buf);
+	return ret;
 }
 
 static int hclge_init_roce_base_info(struct hclge_vport *vport)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 9fcfd93..4fc36f0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -463,8 +463,6 @@ struct hclge_dev {
 
 	u32 pkt_buf_size; /* Total pf buf size for tx/rx */
 	u32 mps; /* Max packet size */
-	struct hclge_priv_buf *priv_buf;
-	struct hclge_shared_buf s_buf;
 
 	enum hclge_mta_dmac_sel_type mta_mac_sel_type;
 	bool enable_mta; /* Mutilcast filter enable */
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 01/10] net: hns3: Support for dynamically assigning tx buffer to TC
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1505992913-107256-1-git-send-email-linyunsheng@huawei.com>

This patch adds support for dynamically assigning tx buffers to a
TC when that TC is enabled.
TCs that are not enabled get no tx buffer, which leaves more buffer
space for the rx direction and helps avoid packet loss.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  1 +
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 69 ++++++++++++++++++----
 2 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 758cf39..a81c6cb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -311,6 +311,7 @@ struct hclge_tc_thrd {
 struct hclge_priv_buf {
 	struct hclge_waterline wl;	/* Waterline for low and high*/
 	u32 buf_size;	/* TC private buffer size */
+	u32 tx_buf_size;
 	u32 enable;	/* Enable TC private buffer or not */
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index d27618b..dfe0fd2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1324,23 +1324,28 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
+static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
 {
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
 #define HCLGE_BUF_SIZE_UPDATE_EN_MSK	BIT(15)
 	struct hclge_tx_buff_alloc *req;
+	struct hclge_priv_buf *priv;
 	struct hclge_desc desc;
+	u32 buf_size;
 	int ret;
 	u8 i;
 
 	req = (struct hclge_tx_buff_alloc *)desc.data;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
-	for (i = 0; i < HCLGE_TC_NUM; i++)
+	for (i = 0; i < HCLGE_TC_NUM; i++) {
+		priv = &hdev->priv_buf[i];
+		buf_size = priv->tx_buf_size;
 		req->tx_pkt_buff[i] =
 			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
 				     HCLGE_BUF_SIZE_UPDATE_EN_MSK);
+	}
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
 	if (ret) {
@@ -1352,9 +1357,9 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
 	return 0;
 }
 
-static int hclge_tx_buffer_alloc(struct hclge_dev *hdev, u32 buf_size)
+static int hclge_tx_buffer_alloc(struct hclge_dev *hdev)
 {
-	int ret = hclge_cmd_alloc_tx_buff(hdev, buf_size);
+	int ret = hclge_cmd_alloc_tx_buff(hdev);
 
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
@@ -1433,6 +1438,18 @@ static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev)
 	return rx_priv;
 }
 
+static u32 hclge_get_tx_buff_alloced(struct hclge_dev *hdev)
+{
+	struct hclge_priv_buf *priv;
+	u32 tx_buf = 0, i;
+
+	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
+		priv = &hdev->priv_buf[i];
+		tx_buf += priv->tx_buf_size;
+	}
+	return tx_buf;
+}
+
 static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
 {
 	u32 shared_buf_min, shared_buf_tc, shared_std;
@@ -1477,18 +1494,44 @@ static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
 	return true;
 }
 
+static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
+{
+	struct hclge_priv_buf *priv;
+	u32 i, total_size;
+
+	total_size = hdev->pkt_buf_size;
+
+	/* alloc tx buffer for all enabled tc */
+	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
+		priv = &hdev->priv_buf[i];
+
+		if (total_size < HCLGE_DEFAULT_TX_BUF)
+			return -ENOMEM;
+
+		if (hdev->hw_tc_map & BIT(i))
+			priv->tx_buf_size = HCLGE_DEFAULT_TX_BUF;
+		else
+			priv->tx_buf_size = 0;
+
+		total_size -= priv->tx_buf_size;
+	}
+
+	return 0;
+}
+
 /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs
  * @hdev: pointer to struct hclge_dev
- * @tx_size: the allocated tx buffer for all TCs
  * @return: 0: calculate sucessful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
+int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 {
-	u32 rx_all = hdev->pkt_buf_size - tx_size;
+	u32 rx_all = hdev->pkt_buf_size;
 	int no_pfc_priv_num, pfc_priv_num;
 	struct hclge_priv_buf *priv;
 	int i;
 
+	rx_all -= hclge_get_tx_buff_alloced(hdev);
+
 	/* When DCB is not supported, rx private
 	 * buffer is not allocated.
 	 */
@@ -1771,7 +1814,6 @@ static int hclge_common_wl_config(struct hclge_dev *hdev)
 
 int hclge_buffer_alloc(struct hclge_dev *hdev)
 {
-	u32 tx_buf_size = HCLGE_DEFAULT_TX_BUF;
 	int ret;
 
 	hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM,
@@ -1780,14 +1822,21 @@ int hclge_buffer_alloc(struct hclge_dev *hdev)
 	if (!hdev->priv_buf)
 		return -ENOMEM;
 
-	ret = hclge_tx_buffer_alloc(hdev, tx_buf_size);
+	ret = hclge_tx_buffer_calc(hdev);
+	if (ret) {
+		dev_err(&hdev->pdev->dev,
+			"could not calc tx buffer size for all TCs %d\n", ret);
+		return ret;
+	}
+
+	ret = hclge_tx_buffer_alloc(hdev);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not alloc tx buffers %d\n", ret);
 		return ret;
 	}
 
-	ret = hclge_rx_buffer_calc(hdev, tx_buf_size);
+	ret = hclge_rx_buffer_calc(hdev);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc rx priv buffer size for all TCs %d\n",
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 00/10] Add support for DCB feature in hns3 driver
From: Yunsheng Lin @ 2017-09-21 11:21 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel

This patchset first makes some DCB-related enhancements and then
adds support for the DCB feature.

This patchset depends on the following patchset:
https://patchwork.ozlabs.org/cover/815646/
https://patchwork.ozlabs.org/cover/816145/

High Level Architecture:

       [ tc qdisc ]	       [ lldpad ]
             |                     |
             |                     |
             |                     |
       [ hns3_enet ]        [ hns3_dcbnl ]
             \                    /
                \              /
                   \        /
                 [ hclge_dcb ]
                   /      \
                /            \
             /                  \
     [ hclge_main ]        [ hclge_tm ]

The current patchset supports the following functionality:
1. Use of tc qdisc to configure the tc num and prio_tc_map.
2. Use of lldptool to configure the tc schedule mode, tc
   bandwidth (if the schedule mode is ETS), prio_tc_map and
   PFC parameters.

Yunsheng Lin (10):
  net: hns3: Support for dynamically assigning tx buffer to TC
  net: hns3: Add support for dynamically buffer reallocation
  net: hns3: Add support for PFC setting in TM module
  net: hns3: Add support for port shaper setting in TM module
  net: hns3: Add tc-based TM support for sriov enabled port
  net: hns3: Add some interface for the support of DCB feature
  net: hns3: Add hclge_dcb module for the support of DCB feature
  net: hns3: Add dcb netlink interface for the support of DCB feature
  net: hns3: Setting for fc_mode and dcb enable flag in TM module
  net: hns3: Add mqprio support when interacting with network stack

 drivers/net/ethernet/hisilicon/Kconfig             |   9 +
 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |  20 ++
 .../net/ethernet/hisilicon/hns3/hns3pf/Makefile    |   4 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   6 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 327 +++++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h |  21 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 219 +++++++++-----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   8 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 232 +++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  15 +
 .../ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c    | 106 +++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 137 +++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h |   7 +
 13 files changed, 983 insertions(+), 128 deletions(-)
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH net 2/4] net:ethernet:aquantia: Fix Tx queue hangups
From: Yunsheng Lin @ 2017-09-21 11:19 UTC (permalink / raw)
  To: Igor Russkikh, David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus
In-Reply-To: <cef3863edd8d504d7406f781c97260c52f21e156.1505915085.git.igor.russkikh@aquantia.com>

Hi, Igor

On 2017/9/21 18:53, Igor Russkikh wrote:
> Driver did a poor job in managing its Tx queues: Sometimes it could stop
> tx queues due to link down condition in aq_nic_xmit - but never waked up
> them. That led to Tx path total suspend.
> This patch fixes this and improves generic queue management:
> - introduces queue restart counter
> - uses generic netif_ interface to disable and enable tx path
> - refactors link up/down condition and introduces dmesg log event when
>   link changes.
> - introduces new constant for minimum descriptors count required for queue
>   wakeup
> 
> Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
> ---
>  drivers/net/ethernet/aquantia/atlantic/aq_cfg.h  |  4 ++
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 91 +++++++++++-------------
>  drivers/net/ethernet/aquantia/atlantic/aq_nic.h  |  2 -
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  4 ++
>  drivers/net/ethernet/aquantia/atlantic/aq_vec.c  |  8 +--
>  6 files changed, 76 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> index 2149864..0fdaaa6 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
> @@ -51,6 +51,10 @@
>  
>  #define AQ_CFG_SKB_FRAGS_MAX   32U
>  
> +/* Number of descriptors available in one ring to resume this ring queue
> + */
> +#define AQ_CFG_RESTART_DESC_THRES   (AQ_CFG_SKB_FRAGS_MAX * 2)
> +
>  #define AQ_CFG_NAPI_WEIGHT     64U
>  
>  #define AQ_CFG_MULTICAST_ADDRESS_MAX     32U
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> index f281392..24f573c 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
> @@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self)
>  	return 0;
>  }
>  
> +static int aq_nic_update_link_status(struct aq_nic_s *self)
> +{
> +	int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
> +
> +	if (err < 0)
> +		return -1;


Why not just return err instead of a hard-coded -1?

> +
> +	if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps)
> +		pr_info("%s: link change old %d new %d\n",
> +			AQ_CFG_DRV_NAME, self->link_status.mbps,
> +			self->aq_hw->aq_link_status.mbps);

You have ndev in struct aq_nic_s *self, so why not use netdev_*?


> +
> +	self->link_status = self->aq_hw->aq_link_status;
> +	if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) {
> +		aq_utils_obj_set(&self->header.flags,
> +				 AQ_NIC_FLAG_STARTED);
> +		aq_utils_obj_clear(&self->header.flags,
> +				   AQ_NIC_LINK_DOWN);
> +		netif_carrier_on(self->ndev);
> +		netif_tx_wake_all_queues(self->ndev);
> +	}
> +	if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
> +		netif_carrier_off(self->ndev);
> +		netif_tx_disable(self->ndev);
> +		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
> +	}
> +	return 0;
> +}
> +
>  static void aq_nic_service_timer_cb(unsigned long param)
>  {
>  	struct aq_nic_s *self = (struct aq_nic_s *)param;
> @@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param)
>  	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
>  		goto err_exit;
>  
> -	err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
> -	if (err < 0)
> +	err = aq_nic_update_link_status(self);
> +	if (err)
>  		goto err_exit;
>  
> -	self->link_status = self->aq_hw->aq_link_status;
> -
>  	self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw,
>  		    self->aq_nic_cfg.is_interrupt_moderation);
>  
> -	if (self->link_status.mbps) {
> -		aq_utils_obj_set(&self->header.flags,
> -				 AQ_NIC_FLAG_STARTED);
> -		aq_utils_obj_clear(&self->header.flags,
> -				   AQ_NIC_LINK_DOWN);
> -		netif_carrier_on(self->ndev);
> -	} else {
> -		netif_carrier_off(self->ndev);
> -		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
> -	}
> -
>  	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
>  	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
>  	for (i = AQ_DIMOF(self->aq_vec); i--;) {
> @@ -240,7 +256,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
>  int aq_nic_ndev_register(struct aq_nic_s *self)
>  {
>  	int err = 0;
> -	unsigned int i = 0U;
>  
>  	if (!self->ndev) {
>  		err = -EINVAL;
> @@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self)
>  
>  	netif_carrier_off(self->ndev);
>  
> -	for (i = AQ_CFG_VECS_MAX; i--;)
> -		aq_nic_ndev_queue_stop(self, i);
> +	netif_tx_disable(self->ndev);
>  
>  	err = register_netdev(self->ndev);
>  	if (err < 0)
> @@ -319,12 +333,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev)
>  		err = -EINVAL;
>  		goto err_exit;
>  	}
> -	if (netif_running(ndev)) {
> -		unsigned int i;
> -
> -		for (i = AQ_CFG_VECS_MAX; i--;)
> -			netif_stop_subqueue(ndev, i);
> -	}
> +	if (netif_running(ndev))
> +		netif_tx_disable(ndev);
>  
>  	for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs;
>  		self->aq_vecs++) {
> @@ -384,16 +394,6 @@ int aq_nic_init(struct aq_nic_s *self)
>  	return err;
>  }
>  
> -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx)
> -{
> -	netif_start_subqueue(self->ndev, idx);
> -}
> -
> -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx)
> -{
> -	netif_stop_subqueue(self->ndev, idx);
> -}
> -
>  int aq_nic_start(struct aq_nic_s *self)
>  {
>  	struct aq_vec_s *aq_vec = NULL;
> @@ -452,10 +452,6 @@ int aq_nic_start(struct aq_nic_s *self)
>  			goto err_exit;
>  	}
>  
> -	for (i = 0U, aq_vec = self->aq_vec[0];
> -		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
> -		aq_nic_ndev_queue_start(self, i);
> -
>  	err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
>  	if (err < 0)
>  		goto err_exit;
> @@ -464,6 +460,8 @@ int aq_nic_start(struct aq_nic_s *self)
>  	if (err < 0)
>  		goto err_exit;
>  
> +	netif_tx_start_all_queues(self->ndev);
> +
>  err_exit:
>  	return err;
>  }
> @@ -603,7 +601,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  	unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
>  	unsigned int tc = 0U;
>  	int err = NETDEV_TX_OK;
> -	bool is_nic_in_bad_state;
>  
>  	frags = skb_shinfo(skb)->nr_frags + 1;
>  
> @@ -614,13 +611,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  		goto err_exit;
>  	}
>  
> -	is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags,
> -						AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
> -						(aq_ring_avail_dx(ring) <
> -						AQ_CFG_SKB_FRAGS_MAX);
> +	aq_ring_update_queue_state(ring);
>  
> -	if (is_nic_in_bad_state) {
> -		aq_nic_ndev_queue_stop(self, ring->idx);
> +	/* Above status update may stop the queue. Check this. */
> +	if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
>  		err = NETDEV_TX_BUSY;
>  		goto err_exit;
>  	}
> @@ -632,9 +626,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
>  						      ring,
>  						      frags);
>  		if (err >= 0) {
> -			if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1)
> -				aq_nic_ndev_queue_stop(self, ring->idx);
> -
>  			++ring->stats.tx.packets;
>  			ring->stats.tx.bytes += skb->len;
>  		}
> @@ -906,9 +897,7 @@ int aq_nic_stop(struct aq_nic_s *self)
>  	struct aq_vec_s *aq_vec = NULL;
>  	unsigned int i = 0U;
>  
> -	for (i = 0U, aq_vec = self->aq_vec[0];
> -		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
> -		aq_nic_ndev_queue_stop(self, i);
> +	netif_tx_disable(self->ndev);
>  
>  	del_timer_sync(&self->service_timer);
>  
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> index 7fc2a5e..0ddd556 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
> @@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self);
>  int aq_nic_init(struct aq_nic_s *self);
>  int aq_nic_cfg_start(struct aq_nic_s *self);
>  int aq_nic_ndev_register(struct aq_nic_s *self);
> -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx);
> -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx);
>  void aq_nic_ndev_free(struct aq_nic_s *self);
>  int aq_nic_start(struct aq_nic_s *self);
>  int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb);
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> index 4eee199..02f79b0 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
> @@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self)
>  	return 0;
>  }
>  
> +void aq_ring_update_queue_state(struct aq_ring_s *ring)
> +{
> +	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
> +		aq_ring_queue_stop(ring);
> +	else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES)
> +		aq_ring_queue_wake(ring);
> +}
> +
> +void aq_ring_queue_wake(struct aq_ring_s *ring)
> +{
> +	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
> +
> +	if (__netif_subqueue_stopped(ndev, ring->idx)) {
> +		netif_wake_subqueue(ndev, ring->idx);
> +		ring->stats.tx.queue_restarts++;
> +	}
> +}
> +
> +void aq_ring_queue_stop(struct aq_ring_s *ring)
> +{
> +	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
> +
> +	if (!__netif_subqueue_stopped(ndev, ring->idx))
> +		netif_stop_subqueue(ndev, ring->idx);
> +}
> +
>  void aq_ring_tx_clean(struct aq_ring_s *self)
>  {
>  	struct device *dev = aq_nic_get_dev(self->aq_nic);
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> index 782176c..24523b5 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
> @@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s {
>  	u64 errors;
>  	u64 packets;
>  	u64 bytes;
> +	u64 queue_restarts;
>  };
>  
>  union aq_ring_stats_s {
> @@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
>  int aq_ring_init(struct aq_ring_s *self);
>  void aq_ring_rx_deinit(struct aq_ring_s *self);
>  void aq_ring_free(struct aq_ring_s *self);
> +void aq_ring_update_queue_state(struct aq_ring_s *ring);
> +void aq_ring_queue_wake(struct aq_ring_s *ring);
> +void aq_ring_queue_stop(struct aq_ring_s *ring);
>  void aq_ring_tx_clean(struct aq_ring_s *self);
>  int aq_ring_rx_clean(struct aq_ring_s *self,
>  		     struct napi_struct *napi,
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> index ebf5880..305ff8f 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
> @@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
>  			if (ring[AQ_VEC_TX_ID].sw_head !=
>  			    ring[AQ_VEC_TX_ID].hw_head) {
>  				aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
> -
> -				if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) >
> -				    AQ_CFG_SKB_FRAGS_MAX) {
> -					aq_nic_ndev_queue_start(self->aq_nic,
> -						ring[AQ_VEC_TX_ID].idx);
> -				}
> +				aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]);
>  				was_tx_cleaned = true;
>  			}
>  
> @@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self,
>  		stats_tx->packets += tx->packets;
>  		stats_tx->bytes += tx->bytes;
>  		stats_tx->errors += tx->errors;
> +		stats_tx->queue_restarts += tx->queue_restarts;
>  	}
>  }
>  
> 

^ permalink raw reply

* Re: [patch net-next 03/12] ipmr: Add FIB notification access functions
From: Nikolay Aleksandrov @ 2017-09-21 11:19 UTC (permalink / raw)
  To: Jiri Pirko, netdev; +Cc: davem, yotamg, idosch, mlxsw
In-Reply-To: <20170921064338.1282-4-jiri@resnulli.us>

On 21/09/17 09:43, Jiri Pirko wrote:
> From: Yotam Gigi <yotamg@mellanox.com>
> 
> Make the ipmr module register as a FIB notifier. To do that, implement both
> the ipmr_seq_read and ipmr_dump ops.
> 
> The ipmr_seq_read op returns a sequence counter that is incremented on
> every notification related operation done by the ipmr. To implement that,
> add a sequence counter in the netns_ipv4 struct and increment it whenever a
> new MFC route or VIF are added or deleted. The sequence operations are
> protected by the RTNL lock.
> 
> The ipmr_dump iterates the list of MFC routes and the list of VIF entries
> and sends notifications about them. The entries dump is done under RCU.
> 
> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/linux/mroute.h   |  15 ++++++
>  include/net/netns/ipv4.h |   3 ++
>  net/ipv4/ipmr.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 151 insertions(+), 2 deletions(-)
> 
[snip]
> +
> +static int ipmr_dump(struct net *net, struct notifier_block *nb)
> +{
> +	struct mr_table *mrt;
> +	int err;
> +
> +	err = ipmr_rules_dump(net, nb);
> +	if (err)
> +		return err;
> +
> +	ipmr_for_each_table(mrt, net) {
> +		struct vif_device *v = &mrt->vif_table[0];
> +		struct mfc_cache *mfc;
> +		int vifi;
> +
> +		/* Notifiy on table VIF entries */
> +		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
> +			if (!v->dev)
> +				continue;
> +
> +			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
> +						     v, vifi, mrt->id);
> +		}

The VIF table is protected by mrt_lock (an rwlock). Holding only RCU here
does not guarantee that v->dev stays valid: it can become NULL right after
the check above. For details, see vif_delete() in net/ipv4/ipmr.c. You need
to hold at least mrt_lock for reading.

> +
> +		/* Notify on table MFC entries */
> +		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
> +			call_ipmr_mfc_entry_notifier(nb, net,
> +						     FIB_EVENT_ENTRY_ADD, mfc,
> +						     mrt->id);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct fib_notifier_ops ipmr_notifier_ops_template = {
> +	.family		= RTNL_FAMILY_IPMR,
> +	.fib_seq_read	= ipmr_seq_read,
> +	.fib_dump	= ipmr_dump,
> +	.owner		= THIS_MODULE,
> +};
> +
> +int __net_init ipmr_notifier_init(struct net *net)
> +{
> +	struct fib_notifier_ops *ops;
> +
> +	net->ipv4.ipmr_seq = 0;
> +
> +	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
> +	if (IS_ERR(ops))
> +		return PTR_ERR(ops);
> +	net->ipv4.ipmr_notifier_ops = ops;
> +
> +	return 0;
> +}
> +
> +static void __net_exit ipmr_notifier_exit(struct net *net)
> +{
> +	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
> +	net->ipv4.ipmr_notifier_ops = NULL;
> +}
> +
>  /* Setup for IP multicast routing */
>  static int __net_init ipmr_net_init(struct net *net)
>  {
>  	int err;
>  
> +	err = ipmr_notifier_init(net);
> +	if (err)
> +		goto ipmr_notifier_fail;
> +
>  	err = ipmr_rules_init(net);
>  	if (err < 0)
> -		goto fail;
> +		goto ipmr_rules_fail;
>  
>  #ifdef CONFIG_PROC_FS
>  	err = -ENOMEM;
> @@ -3074,7 +3202,9 @@ static int __net_init ipmr_net_init(struct net *net)
>  proc_vif_fail:
>  	ipmr_rules_exit(net);
>  #endif
> -fail:
> +ipmr_rules_fail:
> +	ipmr_notifier_exit(net);
> +ipmr_notifier_fail:
>  	return err;
>  }
>  
> @@ -3084,6 +3214,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
>  	remove_proc_entry("ip_mr_cache", net->proc_net);
>  	remove_proc_entry("ip_mr_vif", net->proc_net);
>  #endif
> +	ipmr_notifier_exit(net);
>  	ipmr_rules_exit(net);
>  }
>  
> 

^ permalink raw reply

* Re: [PATCH net] bpf: one perf event close won't free bpf program attached by another perf event
From: Peter Zijlstra @ 2017-09-21 11:17 UTC (permalink / raw)
  To: Yonghong Song; +Cc: Steven Rostedt, ast, daniel, netdev, kernel-team
In-Reply-To: <9e968490-87ae-7a79-9e59-0dcc840a93f5@fb.com>

On Wed, Sep 20, 2017 at 10:20:13PM -0700, Yonghong Song wrote:
> > (2). trace_event_call->perf_events are per cpu data structure, that
> > means, some filtering logic is needed to avoid the same perf_event prog
> > is executing twice.
> 
> What I mean here is that the trace_event_call->perf_events need to be
> checked on ALL cpus since bpf prog should be executed regardless of
> cpu affiliation. It is possible that the same perf_event in different
> per_cpu bucket and hence filtering is needed to avoid the same
> perf_event bpf_prog is executed twice.

An event will only ever be on a single CPU's list at any one time IIRC.

Now, hysterically perf_event_set_bpf_prog used the tracepoint crud
because that already had bpf bits in. But it might make sense to look at
unifying the bpf stuff across all the different event types. Have them
all use event->prog.

I suspect that would break a fair bunch of bpf proglets, since the data
access to the trace data would be completely different, but it would be
much nicer to not have this distinction based on event type.

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-21 11:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <22cde020-e13a-3635-512c-25532f754bda@itcare.pl>



W dniu 2017-09-21 o 13:12, Paweł Staszewski pisze:
>
>
> W dniu 2017-09-21 o 13:03, Eric Dumazet pisze:
>> On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
>>> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
>>>> On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
>>>>>> Thanks very much Pawel for the feedback.
>>>>>>
>>>>>> I was looking into the code (specifically IPv4 part) and found 
>>>>>> that in
>>>>>> free_fib_info_rcu(), we call free_nh_exceptions() without holding 
>>>>>> the
>>>>>> fnhe_lock. I am wondering if that could cause some race condition on
>>>>>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
>>>>>> same dst could be happening.
>>>>>>
>>>>>> But as we call free_fib_info_rcu() only after the grace period, and
>>>>>> the lookup code which could potentially modify
>>>>>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
>>>>>> fine...
>>>>>>
>>>>> Hi Pawel,
>>>>>
>>>>> Could you try the following debug patch on top of net-next branch and
>>>>> reproduce the issue check if there are warning msg showing?
>>>>>
>>>>> diff --git a/include/net/dst.h b/include/net/dst.h
>>>>> index 93568bd0a352..82aff41c6f63 100644
>>>>> --- a/include/net/dst.h
>>>>> +++ b/include/net/dst.h
>>>>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
>>>>> *dst, unsigned long time)
>>>>>    static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>>>>    {
>>>>>           if (dst)
>>>>> -               atomic_inc(&dst->__refcnt);
>>>>> +               dst_hold(dst);
>>>>>           return dst;
>>>>>    }
>>>>>
>>>>> Thanks.
>>>>> Wei
>>>>>
>>>> Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
>>>> unified  (to the 'safe' version)
>>>>
>>>> We no longer have gc to protect from 0 -> 1 transition of dst 
>>>> refcount.
>>>>
>>>>
>>>>
>>>>
>>> After adding patch from Wei
>>> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
>>>
>> OK we have two problems here
>>
>> 1) We need to unify skb_dst_force()  ( for net tree )
>>
>> 2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
>> lower device. This will considerably help your performance.
>>
>>
>> For 1), this is what I had in mind, can you try it ?
>>
>> Thanks a lot !
>>
>> diff --git a/include/net/dst.h b/include/net/dst.h
>> index 
>> 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 
>> 100644
>> --- a/include/net/dst.h
>> +++ b/include/net/dst.h
>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry 
>> *dst, unsigned long time)
>>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>   {
>>       if (dst)
>> -        atomic_inc(&dst->__refcnt);
>> +        dst_hold(dst);
>>       return dst;
>>   }
>>   @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff 
>> *nskb, const struct sk_buff *oskb
>>       __skb_dst_copy(nskb, oskb->_skb_refdst);
>>   }
>>   -/**
>> - * skb_dst_force - makes sure skb dst is refcounted
>> - * @skb: buffer
>> - *
>> - * If dst is not yet refcounted, let's do it
>> - */
>> -static inline void skb_dst_force(struct sk_buff *skb)
>> -{
>> -    if (skb_dst_is_noref(skb)) {
>> -        WARN_ON(!rcu_read_lock_held());
>> -        skb->_skb_refdst &= ~SKB_DST_NOREF;
>> -        dst_clone(skb_dst(skb));
>> -    }
>> -}
>> -
>>   /**
>>    * dst_hold_safe - Take a reference on a dst if possible
>>    * @dst: pointer to dst entry
>> @@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct 
>> sk_buff *skb)
>>       }
>>   }
>>   +/**
>> + * skb_dst_force - makes sure skb dst is refcounted
>> + * @skb: buffer
>> + *
>> + * If dst is not yet refcounted, let's do it
>> + */
>> +static inline void skb_dst_force(struct sk_buff *skb)
>> +{
>> +    if (skb_dst_is_noref(skb)) {
>> +        struct dst_entry *dst = skb_dst(skb);
>> +
>> +        WARN_ON(!rcu_read_lock_held());
>> +        if (!dst_hold_safe(dst))
>> +            dst = NULL;
>> +        skb->_skb_refdst = (unsigned long)dst;
>> +    }
>> +}
>>     /**
>>    *    __skb_tunnel_rx - prepare skb for rx reinsert
>>
>>
>>
> Thanks
>
> What is weird i have this part in my net-next from git:
> /**
>  * skb_dst_force_safe - makes sure skb dst is refcounted
>  * @skb: buffer
>  *
>  * If dst is not yet refcounted and not destroyed, grab a ref on it.
>  */
> static inline void skb_dst_force_safe(struct sk_buff *skb)
> {
>         if (skb_dst_is_noref(skb)) {
>                 struct dst_entry *dst = skb_dst(skb);
>
>                 if (!dst_hold_safe(dst))
>                         dst = NULL;
>
>                 skb->_skb_refdst = (unsigned long)dst;
>         }
> }
>
>
>
OK, the difference is that what I have is skb_dst_force_safe(), not skb_dst_force().

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-21 11:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <1505991826.29839.124.camel@edumazet-glaptop3.roam.corp.google.com>



W dniu 2017-09-21 o 13:03, Eric Dumazet pisze:
> On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
>> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
>>> On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
>>>>> Thanks very much Pawel for the feedback.
>>>>>
>>>>> I was looking into the code (specifically IPv4 part) and found that in
>>>>> free_fib_info_rcu(), we call free_nh_exceptions() without holding the
>>>>> fnhe_lock. I am wondering if that could cause some race condition on
>>>>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
>>>>> same dst could be happening.
>>>>>
>>>>> But as we call free_fib_info_rcu() only after the grace period, and
>>>>> the lookup code which could potentially modify
>>>>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
>>>>> fine...
>>>>>
>>>> Hi Pawel,
>>>>
>>>> Could you try the following debug patch on top of net-next branch and
>>>> reproduce the issue check if there are warning msg showing?
>>>>
>>>> diff --git a/include/net/dst.h b/include/net/dst.h
>>>> index 93568bd0a352..82aff41c6f63 100644
>>>> --- a/include/net/dst.h
>>>> +++ b/include/net/dst.h
>>>> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
>>>> *dst, unsigned long time)
>>>>    static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>>>>    {
>>>>           if (dst)
>>>> -               atomic_inc(&dst->__refcnt);
>>>> +               dst_hold(dst);
>>>>           return dst;
>>>>    }
>>>>
>>>> Thanks.
>>>> Wei
>>>>
>>> Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
>>> unified  (to the 'safe' version)
>>>
>>> We no longer have gc to protect from 0 -> 1 transition of dst refcount.
>>>
>>>
>>>
>>>
>> After adding patch from Wei
>> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
>>
> OK we have two problems here
>
> 1) We need to unify skb_dst_force()  ( for net tree )
>
> 2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
> lower device. This will considerably help your performance.
>
>
> For 1), this is what I had in mind, can you try it ?
>
> Thanks a lot !
>
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
>   {
>   	if (dst)
> -		atomic_inc(&dst->__refcnt);
> +		dst_hold(dst);
>   	return dst;
>   }
>   
> @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
>   	__skb_dst_copy(nskb, oskb->_skb_refdst);
>   }
>   
> -/**
> - * skb_dst_force - makes sure skb dst is refcounted
> - * @skb: buffer
> - *
> - * If dst is not yet refcounted, let's do it
> - */
> -static inline void skb_dst_force(struct sk_buff *skb)
> -{
> -	if (skb_dst_is_noref(skb)) {
> -		WARN_ON(!rcu_read_lock_held());
> -		skb->_skb_refdst &= ~SKB_DST_NOREF;
> -		dst_clone(skb_dst(skb));
> -	}
> -}
> -
>   /**
>    * dst_hold_safe - Take a reference on a dst if possible
>    * @dst: pointer to dst entry
> @@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct sk_buff *skb)
>   	}
>   }
>   
> +/**
> + * skb_dst_force - makes sure skb dst is refcounted
> + * @skb: buffer
> + *
> + * If dst is not yet refcounted, let's do it
> + */
> +static inline void skb_dst_force(struct sk_buff *skb)
> +{
> +	if (skb_dst_is_noref(skb)) {
> +		struct dst_entry *dst = skb_dst(skb);
> +
> +		WARN_ON(!rcu_read_lock_held());
> +		if (!dst_hold_safe(dst))
> +			dst = NULL;
> +		skb->_skb_refdst = (unsigned long)dst;
> +	}
> +}
>   
>   /**
>    *	__skb_tunnel_rx - prepare skb for rx reinsert
>
>
>
Thanks

What is weird is that I already have this part in my net-next from git:
/**
  * skb_dst_force_safe - makes sure skb dst is refcounted
  * @skb: buffer
  *
  * If dst is not yet refcounted and not destroyed, grab a ref on it.
  */
static inline void skb_dst_force_safe(struct sk_buff *skb)
{
         if (skb_dst_is_noref(skb)) {
                 struct dst_entry *dst = skb_dst(skb);

                 if (!dst_hold_safe(dst))
                         dst = NULL;

                 skb->_skb_refdst = (unsigned long)dst;
         }
}

^ permalink raw reply

* Re: Latest net-next from GIT panic
From: Eric Dumazet @ 2017-09-21 11:03 UTC (permalink / raw)
  To: Paweł Staszewski
  Cc: Wei Wang, Cong Wang, Linux Kernel Network Developers,
	Eric Dumazet
In-Reply-To: <a016d5bc-1cbb-44d7-9ebf-e7e5428e6f98@itcare.pl>

On Thu, 2017-09-21 at 11:06 +0200, Paweł Staszewski wrote:
> 
> W dniu 2017-09-21 o 03:17, Eric Dumazet pisze:
> > On Wed, 2017-09-20 at 18:09 -0700, Wei Wang wrote:
> >>> Thanks very much Pawel for the feedback.
> >>>
> >>> I was looking into the code (specifically IPv4 part) and found that in
> >>> free_fib_info_rcu(), we call free_nh_exceptions() without holding the
> >>> fnhe_lock. I am wondering if that could cause some race condition on
> >>> fnhe->fnhe_rth_input/output so a double call on dst_dev_put() on the
> >>> same dst could be happening.
> >>>
> >>> But as we call free_fib_info_rcu() only after the grace period, and
> >>> the lookup code which could potentially modify
> >>> fnhe->fnhe_rth_input/output all holds rcu_read_lock(), it seems
> >>> fine...
> >>>
> >> Hi Pawel,
> >>
> >> Could you try the following debug patch on top of net-next branch and
> >> reproduce the issue check if there are warning msg showing?
> >>
> >> diff --git a/include/net/dst.h b/include/net/dst.h
> >> index 93568bd0a352..82aff41c6f63 100644
> >> --- a/include/net/dst.h
> >> +++ b/include/net/dst.h
> >> @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry
> >> *dst, unsigned long time)
> >>   static inline struct dst_entry *dst_clone(struct dst_entry *dst)
> >>   {
> >>          if (dst)
> >> -               atomic_inc(&dst->__refcnt);
> >> +               dst_hold(dst);
> >>          return dst;
> >>   }
> >>
> >> Thanks.
> >> Wei
> >>
> >
> > Yes, we believe skb_dst_force() and skb_dst_force_safe() should be
> > unified  (to the 'safe' version)
> >
> > We no longer have gc to protect from 0 -> 1 transition of dst refcount.
> >
> >
> >
> >
> 
> After adding patch from Wei
> https://bugzilla.kernel.org/show_bug.cgi?id=197005#c14
> 

OK, we have two problems here:

1) We need to unify skb_dst_force()  ( for net tree )

2) Vlan devices should try to correctly handle IFF_XMIT_DST_RELEASE from
lower device. This will considerably help your performance.


For 1), this is what I had in mind, can you try it ?

Thanks a lot !

diff --git a/include/net/dst.h b/include/net/dst.h
index 93568bd0a3520bb7402f04d90cf04ac99c81cfbe..f23851eeaad917e8dafc06b58d23a2575405c894 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
 static inline struct dst_entry *dst_clone(struct dst_entry *dst)
 {
 	if (dst)
-		atomic_inc(&dst->__refcnt);
+		dst_hold(dst);
 	return dst;
 }
 
@@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
 	__skb_dst_copy(nskb, oskb->_skb_refdst);
 }
 
-/**
- * skb_dst_force - makes sure skb dst is refcounted
- * @skb: buffer
- *
- * If dst is not yet refcounted, let's do it
- */
-static inline void skb_dst_force(struct sk_buff *skb)
-{
-	if (skb_dst_is_noref(skb)) {
-		WARN_ON(!rcu_read_lock_held());
-		skb->_skb_refdst &= ~SKB_DST_NOREF;
-		dst_clone(skb_dst(skb));
-	}
-}
-
 /**
  * dst_hold_safe - Take a reference on a dst if possible
  * @dst: pointer to dst entry
@@ -356,6 +341,23 @@ static inline void skb_dst_force_safe(struct sk_buff *skb)
 	}
 }
 
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		struct dst_entry *dst = skb_dst(skb);
+
+		WARN_ON(!rcu_read_lock_held());
+		if (!dst_hold_safe(dst))
+			dst = NULL;
+		skb->_skb_refdst = (unsigned long)dst;
+	}
+}
 
 /**
  *	__skb_tunnel_rx - prepare skb for rx reinsert

^ permalink raw reply related

* [PATCH 1/1] net:nfc: use setup_timer
From: Allen Pais @ 2017-09-21 10:59 UTC (permalink / raw)
  To: linux-kernel; +Cc: sameo, netdev, Allen Pais

    Use the setup_timer() function instead of calling init_timer() and
    then assigning the timer's function and data fields by hand.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
---
 net/nfc/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/nfc/core.c b/net/nfc/core.c
index 5cf33df..e5e23c2 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops,
 	dev->targets_generation = 1;
 
 	if (ops->check_presence) {
-		init_timer(&dev->check_pres_timer);
-		dev->check_pres_timer.data = (unsigned long)dev;
-		dev->check_pres_timer.function = nfc_check_pres_timeout;
+		setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout,
+			    (unsigned long)dev);
 
 		INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
 	}
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 2/4] cxgb4: add basic tc flower offload support
From: Yunsheng Lin @ 2017-09-21 10:55 UTC (permalink / raw)
  To: Rahul Lakkireddy, netdev; +Cc: davem, kumaras, ganeshgr, nirranjan, indranil
In-Reply-To: <dc5f8e6419ad3439b14f39306245d98537be3306.1505977744.git.rahul.lakkireddy@chelsio.com>

Hi, Kumar

On 2017/9/21 15:33, Rahul Lakkireddy wrote:
> From: Kumar Sanghvi <kumaras@chelsio.com>
> 
> Add support to add/remove flows for offload.  Following match
> and action are supported for offloading a flow:
> 
> Match: ether-protocol, IPv4/IPv6 addresses, L4 ports (TCP/UDP)
> Action: drop, redirect to another port on the device.
> 
> The qualifying flows can have accompanying mask information.
> 
> Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
> Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> ---
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |   3 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  |  26 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |   2 +
>  .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 285 ++++++++++++++++++++-
>  .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  17 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h     |   1 +
>  6 files changed, 332 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> index ea72d2d2e1b4..26eac599ab2c 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
> @@ -904,6 +904,9 @@ struct adapter {
>  	/* TC u32 offload */
>  	struct cxgb4_tc_u32_table *tc_u32;
>  	struct chcr_stats_debug chcr_stats;
> +
> +	/* TC flower offload */
> +	DECLARE_HASHTABLE(flower_anymatch_tbl, 9);
>  };
>  
>  /* Support for "sched-class" command to allow a TX Scheduling Class to be
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> index 45b5853ca2f1..07a4619e2164 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
> @@ -148,6 +148,32 @@ static int get_filter_steerq(struct net_device *dev,
>  	return iq;
>  }
>  
> +int cxgb4_get_free_ftid(struct net_device *dev, int family)
> +{
> +	struct adapter *adap = netdev2adap(dev);
> +	struct tid_info *t = &adap->tids;
> +	int ftid;
> +
> +	spin_lock_bh(&t->ftid_lock);
> +	if (family == PF_INET) {
> +		ftid = find_first_zero_bit(t->ftid_bmap, t->nftids);
> +		if (ftid >= t->nftids)
> +			ftid = -1;
> +	} else {
> +		ftid = bitmap_find_free_region(t->ftid_bmap, t->nftids, 2);
> +		if (ftid < 0) {
> +			ftid = -1;

Is this ftid = -1 assignment needed? ftid is already negative in this branch.

> +			goto out_unlock;
> +		}
> +
> +		/* this is only a lookup, keep the found region unallocated */
> +		bitmap_release_region(t->ftid_bmap, ftid, 2);
> +	}
> +out_unlock:
> +	spin_unlock_bh(&t->ftid_lock);
> +	return ftid;
> +}
> +
>  static int cxgb4_set_ftid(struct tid_info *t, int fidx, int family)
>  {
>  	spin_lock_bh(&t->ftid_lock);
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index 8923affbdaf8..3ba4e1ff8486 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -5105,6 +5105,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
>  		if (!adapter->tc_u32)
>  			dev_warn(&pdev->dev,
>  				 "could not offload tc u32, continuing\n");
> +
> +		cxgb4_init_tc_flower(adapter);
>  	}
>  
>  	if (is_offload(adapter)) {
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> index 16dff71e4d02..1af01101faaf 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
> @@ -38,16 +38,292 @@
>  #include "cxgb4.h"
>  #include "cxgb4_tc_flower.h"
>  
> +static struct ch_tc_flower_entry *allocate_flower_entry(void)
> +{
> +	struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL);
> +	return new;
> +}
> +
> +/* Must be called with either RTNL or rcu_read_lock */
> +static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap,
> +						   unsigned long flower_cookie)
> +{
> +	struct ch_tc_flower_entry *flower_entry;
> +
> +	hash_for_each_possible_rcu(adap->flower_anymatch_tbl, flower_entry,
> +				   link, flower_cookie)
> +		if (flower_entry->tc_flower_cookie == flower_cookie)
> +			return flower_entry;
> +	return NULL;
> +}
> +
> +static void cxgb4_process_flow_match(struct net_device *dev,
> +				     struct tc_cls_flower_offload *cls,
> +				     struct ch_filter_specification *fs)
> +{
> +	u16 addr_type = 0;
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
> +		struct flow_dissector_key_control *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_CONTROL,
> +						  cls->key);
> +
> +		addr_type = key->addr_type;
> +	}
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
> +		struct flow_dissector_key_basic *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_BASIC,
> +						  cls->key);
> +		struct flow_dissector_key_basic *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_BASIC,
> +						  cls->mask);
> +		u16 ethtype_key = ntohs(key->n_proto);
> +		u16 ethtype_mask = ntohs(mask->n_proto);
> +
> +		if (ethtype_key == ETH_P_ALL) {
> +			ethtype_key = 0;
> +			ethtype_mask = 0;
> +		}
> +
> +		fs->val.ethtype = ethtype_key;
> +		fs->mask.ethtype = ethtype_mask;
> +		fs->val.proto = key->ip_proto;
> +		fs->mask.proto = mask->ip_proto;
> +	}
> +
> +	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
> +		struct flow_dissector_key_ipv4_addrs *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> +						  cls->key);
> +		struct flow_dissector_key_ipv4_addrs *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> +						  cls->mask);
> +		fs->type = 0;
> +		memcpy(&fs->val.lip[0], &key->dst, sizeof(key->dst));
> +		memcpy(&fs->val.fip[0], &key->src, sizeof(key->src));
> +		memcpy(&fs->mask.lip[0], &mask->dst, sizeof(mask->dst));
> +		memcpy(&fs->mask.fip[0], &mask->src, sizeof(mask->src));
> +	}
> +
> +	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
> +		struct flow_dissector_key_ipv6_addrs *key =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> +						  cls->key);
> +		struct flow_dissector_key_ipv6_addrs *mask =
> +			skb_flow_dissector_target(cls->dissector,
> +						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> +						  cls->mask);
> +
> +		fs->type = 1;
> +		memcpy(&fs->val.lip[0], key->dst.s6_addr, sizeof(key->dst));
> +		memcpy(&fs->val.fip[0], key->src.s6_addr, sizeof(key->src));
> +		memcpy(&fs->mask.lip[0], mask->dst.s6_addr, sizeof(mask->dst));
> +		memcpy(&fs->mask.fip[0], mask->src.s6_addr, sizeof(mask->src));
> +	}
> +
> +	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
> +		struct flow_dissector_key_ports *key, *mask;
> +
> +		key = skb_flow_dissector_target(cls->dissector,
> +						FLOW_DISSECTOR_KEY_PORTS,
> +						cls->key);
> +		mask = skb_flow_dissector_target(cls->dissector,
> +						 FLOW_DISSECTOR_KEY_PORTS,
> +						 cls->mask);
> +		fs->val.lport = cpu_to_be16(key->dst);
> +		fs->mask.lport = cpu_to_be16(mask->dst);
> +		fs->val.fport = cpu_to_be16(key->src);
> +		fs->mask.fport = cpu_to_be16(mask->src);
> +	}
> +
> +	/* Match only packets coming from the ingress port where this
> +	 * filter will be created.
> +	 */
> +	fs->val.iport = netdev2pinfo(dev)->port_id;
> +	fs->mask.iport = ~0;
> +}
> +
> +static int cxgb4_validate_flow_match(struct net_device *dev,
> +				     struct tc_cls_flower_offload *cls)
> +{
> +	if (cls->dissector->used_keys &
> +	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
> +	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
> +	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
> +	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
> +	      BIT(FLOW_DISSECTOR_KEY_PORTS))) {
> +		netdev_warn(dev, "Unsupported key used: 0x%x\n",
> +			    cls->dissector->used_keys);
> +		return -EOPNOTSUPP;
> +	}
> +	return 0;
> +}
> +
> +static void cxgb4_process_flow_actions(struct net_device *in,
> +				       struct tc_cls_flower_offload *cls,
> +				       struct ch_filter_specification *fs)
> +{
> +	const struct tc_action *a;
> +	LIST_HEAD(actions);
> +
> +	tcf_exts_to_list(cls->exts, &actions);
> +	list_for_each_entry(a, &actions, list) {
> +		if (is_tcf_gact_shot(a)) {
> +			fs->action = FILTER_DROP;
> +		} else if (is_tcf_mirred_egress_redirect(a)) {
> +			int ifindex = tcf_mirred_ifindex(a);
> +			struct net_device *out = __dev_get_by_index(dev_net(in),
> +								    ifindex);
> +			struct port_info *pi = netdev_priv(out);
> +
> +			fs->action = FILTER_SWITCH;
> +			fs->eport = pi->port_id;
> +		}
> +	}
> +}
> +
> +static int cxgb4_validate_flow_actions(struct net_device *dev,
> +				       struct tc_cls_flower_offload *cls)
> +{
> +	const struct tc_action *a;
> +	LIST_HEAD(actions);
> +
> +	tcf_exts_to_list(cls->exts, &actions);
> +	list_for_each_entry(a, &actions, list) {
> +		if (is_tcf_gact_shot(a)) {
> +			/* Do nothing */
> +		} else if (is_tcf_mirred_egress_redirect(a)) {
> +			struct adapter *adap = netdev2adap(dev);
> +			struct net_device *n_dev;
> +			unsigned int i, ifindex;
> +			bool found = false;
> +
> +			ifindex = tcf_mirred_ifindex(a);
> +			for_each_port(adap, i) {
> +				n_dev = adap->port[i];
> +				if (ifindex == n_dev->ifindex) {
> +					found = true;
> +					break;
> +				}
> +			}
> +
> +			/* If interface doesn't belong to our hw, then
> +			 * the provided output port is not valid
> +			 */
> +			if (!found) {
> +				netdev_err(dev, "%s: Out port invalid\n",
> +					   __func__);
> +				return -EINVAL;
> +			}
> +		} else {
> +			netdev_err(dev, "%s: Unsupported action\n", __func__);
> +			return -EOPNOTSUPP;
> +		}
> +	}
> +	return 0;
> +}
> +
>  int cxgb4_tc_flower_replace(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls)
>  {
> -	return -EOPNOTSUPP;
> +	struct adapter *adap = netdev2adap(dev);
> +	struct ch_tc_flower_entry *ch_flower;
> +	struct ch_filter_specification *fs;
> +	struct filter_ctx ctx;
> +	int fidx;
> +	int ret;
> +
> +	if (cxgb4_validate_flow_actions(dev, cls))
> +		return -EOPNOTSUPP;
> +
> +	if (cxgb4_validate_flow_match(dev, cls))
> +		return -EOPNOTSUPP;
> +
> +	ch_flower = allocate_flower_entry();
> +	if (!ch_flower) {
> +		netdev_err(dev, "%s: ch_flower alloc failed.\n", __func__);
> +		ret = -ENOMEM;
> +		goto err;

Why not just return -ENOMEM directly here? Is the err label needed at all?

> +	}
> +
> +	fs = &ch_flower->fs;
> +	fs->hitcnts = 1;
> +	cxgb4_process_flow_actions(dev, cls, fs);
> +	cxgb4_process_flow_match(dev, cls, fs);
> +
> +	fidx = cxgb4_get_free_ftid(dev, fs->type ? PF_INET6 : PF_INET);
> +	if (fidx < 0) {
> +		netdev_err(dev, "%s: No fidx for offload.\n", __func__);
> +		ret = -ENOMEM;
> +		goto free_entry;
> +	}
> +
> +	init_completion(&ctx.completion);
> +	ret = __cxgb4_set_filter(dev, fidx, fs, &ctx);
> +	if (ret) {
> +		netdev_err(dev, "%s: filter creation err %d\n",
> +			   __func__, ret);
> +		goto free_entry;
> +	}
> +
> +	/* Wait for reply */
> +	ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ);
> +	if (!ret) {
> +		ret = -ETIMEDOUT;
> +		goto free_entry;
> +	}
> +
> +	ret = ctx.result;
> +	/* Check if hw returned error for filter creation */
> +	if (ret) {
> +		netdev_err(dev, "%s: filter creation err %d\n",
> +			   __func__, ret);
> +		goto free_entry;
> +	}
> +
> +	INIT_HLIST_NODE(&ch_flower->link);
> +	ch_flower->tc_flower_cookie = cls->cookie;
> +	ch_flower->filter_id = ctx.tid;
> +	hash_add_rcu(adap->flower_anymatch_tbl, &ch_flower->link, cls->cookie);
> +
> +	return ret;
> +
> +free_entry:
> +	kfree(ch_flower);
> +err:
> +	return ret;
>  }
>  
>  int cxgb4_tc_flower_destroy(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls)
>  {
> -	return -EOPNOTSUPP;
> +	struct adapter *adap = netdev2adap(dev);
> +	struct ch_tc_flower_entry *ch_flower;
> +	int ret;
> +
> +	ch_flower = ch_flower_lookup(adap, cls->cookie);
> +	if (!ch_flower) {
> +		ret = -ENOENT;
> +		goto err;

Same as above: this could return directly instead of jumping to the err label.

> +	}
> +
> +	ret = cxgb4_del_filter(dev, ch_flower->filter_id);
> +	if (ret)
> +		goto err;
> +
> +	hash_del_rcu(&ch_flower->link);
> +	kfree_rcu(ch_flower, rcu);
> +	return ret;
> +
> +err:
> +	return ret;
>  }
>  
>  int cxgb4_tc_flower_stats(struct net_device *dev,
> @@ -55,3 +331,8 @@ int cxgb4_tc_flower_stats(struct net_device *dev,
>  {
>  	return -EOPNOTSUPP;
>  }
> +
> +void cxgb4_init_tc_flower(struct adapter *adap)
> +{
> +	hash_init(adap->flower_anymatch_tbl);
> +}
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> index b321fc205b5a..6145a9e056eb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
> @@ -37,10 +37,27 @@
>  
>  #include <net/pkt_cls.h>
>  
> +struct ch_tc_flower_stats {
> +	u64 packet_count;
> +	u64 byte_count;
> +	u64 last_used;
> +};
> +
> +struct ch_tc_flower_entry {
> +	struct ch_filter_specification fs;
> +	struct ch_tc_flower_stats stats;
> +	unsigned long tc_flower_cookie;
> +	struct hlist_node link;
> +	struct rcu_head rcu;
> +	u32 filter_id;
> +};
> +
>  int cxgb4_tc_flower_replace(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls);
>  int cxgb4_tc_flower_destroy(struct net_device *dev,
>  			    struct tc_cls_flower_offload *cls);
>  int cxgb4_tc_flower_stats(struct net_device *dev,
>  			  struct tc_cls_flower_offload *cls);
> +
> +void cxgb4_init_tc_flower(struct adapter *adap);
>  #endif /* __CXGB4_TC_FLOWER_H */
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> index 84541fce94c5..88487095d14f 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
> @@ -212,6 +212,7 @@ struct filter_ctx {
>  
>  struct ch_filter_specification;
>  
> +int cxgb4_get_free_ftid(struct net_device *dev, int family);
>  int __cxgb4_set_filter(struct net_device *dev, int filter_id,
>  		       struct ch_filter_specification *fs,
>  		       struct filter_ctx *ctx);
> 

^ permalink raw reply

* [PATCH net 4/4] net:ethernet:atlantic: fix iommu errors
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Pavel Belous, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

From: Pavel Belous <pavel.belous@aquantia.com>

Call skb_frag_dma_map multiple times if the tx length is greater than
the device max, and avoid processing the tx ring until the entire packet
has been sent.

Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: Pavel Belous <pavel.belous@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 43 ++++++++++++++----------
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 27 ++++++++++-----
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  6 ++--
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 24f573c..5b18ffc 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -474,6 +474,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 	unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
 	unsigned int frag_count = 0U;
 	unsigned int dx = ring->sw_tail;
+	struct aq_ring_buff_s *first = NULL;
 	struct aq_ring_buff_s *dx_buff = &ring->buff_ring[dx];
 
 	if (unlikely(skb_is_gso(skb))) {
@@ -484,6 +485,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 		dx_buff->len_l4 = tcp_hdrlen(skb);
 		dx_buff->mss = skb_shinfo(skb)->gso_size;
 		dx_buff->is_txc = 1U;
+		dx_buff->eop_index = 0xffffU;
 
 		dx_buff->is_ipv6 =
 			(ip_hdr(skb)->version == 6) ? 1U : 0U;
@@ -503,6 +505,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 	if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa)))
 		goto exit;
 
+	first = dx_buff;
 	dx_buff->len_pkt = skb->len;
 	dx_buff->is_sop = 1U;
 	dx_buff->is_mapped = 1U;
@@ -531,40 +534,46 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self,
 
 	for (; nr_frags--; ++frag_count) {
 		unsigned int frag_len = 0U;
+		unsigned int buff_offset = 0U;
+		unsigned int buff_size = 0U;
 		dma_addr_t frag_pa;
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_count];
 
 		frag_len = skb_frag_size(frag);
-		frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), frag, 0,
-					   frag_len, DMA_TO_DEVICE);
 
-		if (unlikely(dma_mapping_error(aq_nic_get_dev(self), frag_pa)))
-			goto mapping_error;
+		while (frag_len) {
+			if (frag_len > AQ_CFG_TX_FRAME_MAX)
+				buff_size = AQ_CFG_TX_FRAME_MAX;
+			else
+				buff_size = frag_len;
+
+			frag_pa = skb_frag_dma_map(aq_nic_get_dev(self),
+						   frag,
+						   buff_offset,
+						   buff_size,
+						   DMA_TO_DEVICE);
+
+			if (unlikely(dma_mapping_error(aq_nic_get_dev(self),
+						       frag_pa)))
+				goto mapping_error;
 
-		while (frag_len > AQ_CFG_TX_FRAME_MAX) {
 			dx = aq_ring_next_dx(ring, dx);
 			dx_buff = &ring->buff_ring[dx];
 
 			dx_buff->flags = 0U;
-			dx_buff->len = AQ_CFG_TX_FRAME_MAX;
+			dx_buff->len = buff_size;
 			dx_buff->pa = frag_pa;
 			dx_buff->is_mapped = 1U;
+			dx_buff->eop_index = 0xffffU;
+
+			frag_len -= buff_size;
+			buff_offset += buff_size;
 
-			frag_len -= AQ_CFG_TX_FRAME_MAX;
-			frag_pa += AQ_CFG_TX_FRAME_MAX;
 			++ret;
 		}
-
-		dx = aq_ring_next_dx(ring, dx);
-		dx_buff = &ring->buff_ring[dx];
-
-		dx_buff->flags = 0U;
-		dx_buff->len = frag_len;
-		dx_buff->pa = frag_pa;
-		dx_buff->is_mapped = 1U;
-		++ret;
 	}
 
+	first->eop_index = dx;
 	dx_buff->is_eop = 1U;
 	dx_buff->skb = skb;
 	goto exit;
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 02f79b0..0654e0c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -104,6 +104,12 @@ int aq_ring_init(struct aq_ring_s *self)
 	return 0;
 }
 
+static inline bool aq_ring_dx_in_range(unsigned int h, unsigned int i,
+				       unsigned int t)
+{
+	return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t));
+}
+
 void aq_ring_update_queue_state(struct aq_ring_s *ring)
 {
 	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
@@ -139,23 +145,28 @@ void aq_ring_tx_clean(struct aq_ring_s *self)
 		struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];
 
 		if (likely(buff->is_mapped)) {
-			if (unlikely(buff->is_sop))
+			if (unlikely(buff->is_sop)) {
+				if (!buff->is_eop &&
+				    buff->eop_index != 0xffffU &&
+				    (!aq_ring_dx_in_range(self->sw_head,
+						buff->eop_index,
+						self->hw_head)))
+					break;
+
 				dma_unmap_single(dev, buff->pa, buff->len,
 						 DMA_TO_DEVICE);
-			else
+			} else {
 				dma_unmap_page(dev, buff->pa, buff->len,
 					       DMA_TO_DEVICE);
+			}
 		}
 
 		if (unlikely(buff->is_eop))
 			dev_kfree_skb_any(buff->skb);
-	}
-}
 
-static inline unsigned int aq_ring_dx_in_range(unsigned int h, unsigned int i,
-					       unsigned int t)
-{
-	return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t));
+		buff->pa = 0U;
+		buff->eop_index = 0xffffU;
+	}
 }
 
 #define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
index 24523b5..5844078 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
@@ -65,7 +65,7 @@ struct __packed aq_ring_buff_s {
 	};
 	union {
 		struct {
-			u32 len:16;
+			u16 len;
 			u32 is_ip_cso:1;
 			u32 is_udp_cso:1;
 			u32 is_tcp_cso:1;
@@ -77,8 +77,10 @@ struct __packed aq_ring_buff_s {
 			u32 is_cleaned:1;
 			u32 is_error:1;
 			u32 rsvd3:6;
+			u16 eop_index;
+			u16 rsvd4;
 		};
-		u32 flags;
+		u64 flags;
 	};
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 3/4] net:ethernet:aquantia: Fix transient invalid link down/up indications
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Due to a bug in the aquantia atlantic card firmware, it sometimes reports
invalid link speed bits. That caused the driver to report link down events,
although the link itself was totally fine.

This patch ignores such out-of-the-blue readings.

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index 4f5ec9a..ab5d3cb 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -351,8 +351,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self)
 			break;
 
 		default:
-			link_status->mbps = 0U;
-			break;
+			return -1;
 		}
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 2/4] net:ethernet:aquantia: Fix Tx queue hangups
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

The driver did a poor job of managing its Tx queues: sometimes it could stop
the tx queues due to a link down condition in aq_nic_xmit, but it never woke
them up again. That led to the Tx path being suspended completely.
This patch fixes this and improves generic queue management:
- introduces queue restart counter
- uses generic netif_ interface to disable and enable tx path
- refactors link up/down condition and introduces dmesg log event when
  link changes.
- introduces new constant for minimum descriptors count required for queue
  wakeup

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h  |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c  | 91 +++++++++++-------------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h  |  2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 26 +++++++
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h |  4 ++
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c  |  8 +--
 6 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 2149864..0fdaaa6 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -51,6 +51,10 @@
 
 #define AQ_CFG_SKB_FRAGS_MAX   32U
 
+/* Number of descriptors available in one ring to resume this ring queue
+ */
+#define AQ_CFG_RESTART_DESC_THRES   (AQ_CFG_SKB_FRAGS_MAX * 2)
+
 #define AQ_CFG_NAPI_WEIGHT     64U
 
 #define AQ_CFG_MULTICAST_ADDRESS_MAX     32U
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index f281392..24f573c 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self)
 	return 0;
 }
 
+static int aq_nic_update_link_status(struct aq_nic_s *self)
+{
+	int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
+
+	if (err < 0)
+		return -1;
+
+	if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps)
+		pr_info("%s: link change old %d new %d\n",
+			AQ_CFG_DRV_NAME, self->link_status.mbps,
+			self->aq_hw->aq_link_status.mbps);
+
+	self->link_status = self->aq_hw->aq_link_status;
+	if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) {
+		aq_utils_obj_set(&self->header.flags,
+				 AQ_NIC_FLAG_STARTED);
+		aq_utils_obj_clear(&self->header.flags,
+				   AQ_NIC_LINK_DOWN);
+		netif_carrier_on(self->ndev);
+		netif_tx_wake_all_queues(self->ndev);
+	}
+	if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) {
+		netif_carrier_off(self->ndev);
+		netif_tx_disable(self->ndev);
+		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
+	}
+	return 0;
+}
+
 static void aq_nic_service_timer_cb(unsigned long param)
 {
 	struct aq_nic_s *self = (struct aq_nic_s *)param;
@@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param)
 	if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY))
 		goto err_exit;
 
-	err = self->aq_hw_ops.hw_get_link_status(self->aq_hw);
-	if (err < 0)
+	err = aq_nic_update_link_status(self);
+	if (err)
 		goto err_exit;
 
-	self->link_status = self->aq_hw->aq_link_status;
-
 	self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw,
 		    self->aq_nic_cfg.is_interrupt_moderation);
 
-	if (self->link_status.mbps) {
-		aq_utils_obj_set(&self->header.flags,
-				 AQ_NIC_FLAG_STARTED);
-		aq_utils_obj_clear(&self->header.flags,
-				   AQ_NIC_LINK_DOWN);
-		netif_carrier_on(self->ndev);
-	} else {
-		netif_carrier_off(self->ndev);
-		aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN);
-	}
-
 	memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s));
 	memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s));
 	for (i = AQ_DIMOF(self->aq_vec); i--;) {
@@ -240,7 +256,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 int aq_nic_ndev_register(struct aq_nic_s *self)
 {
 	int err = 0;
-	unsigned int i = 0U;
 
 	if (!self->ndev) {
 		err = -EINVAL;
@@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self)
 
 	netif_carrier_off(self->ndev);
 
-	for (i = AQ_CFG_VECS_MAX; i--;)
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	err = register_netdev(self->ndev);
 	if (err < 0)
@@ -319,12 +333,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev)
 		err = -EINVAL;
 		goto err_exit;
 	}
-	if (netif_running(ndev)) {
-		unsigned int i;
-
-		for (i = AQ_CFG_VECS_MAX; i--;)
-			netif_stop_subqueue(ndev, i);
-	}
+	if (netif_running(ndev))
+		netif_tx_disable(ndev);
 
 	for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs;
 		self->aq_vecs++) {
@@ -384,16 +394,6 @@ int aq_nic_init(struct aq_nic_s *self)
 	return err;
 }
 
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_start_subqueue(self->ndev, idx);
-}
-
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx)
-{
-	netif_stop_subqueue(self->ndev, idx);
-}
-
 int aq_nic_start(struct aq_nic_s *self)
 {
 	struct aq_vec_s *aq_vec = NULL;
@@ -452,10 +452,6 @@ int aq_nic_start(struct aq_nic_s *self)
 			goto err_exit;
 	}
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_start(self, i);
-
 	err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs);
 	if (err < 0)
 		goto err_exit;
@@ -464,6 +460,8 @@ int aq_nic_start(struct aq_nic_s *self)
 	if (err < 0)
 		goto err_exit;
 
+	netif_tx_start_all_queues(self->ndev);
+
 err_exit:
 	return err;
 }
@@ -603,7 +601,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 	unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs;
 	unsigned int tc = 0U;
 	int err = NETDEV_TX_OK;
-	bool is_nic_in_bad_state;
 
 	frags = skb_shinfo(skb)->nr_frags + 1;
 
@@ -614,13 +611,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 		goto err_exit;
 	}
 
-	is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags,
-						AQ_NIC_FLAGS_IS_NOT_TX_READY) ||
-						(aq_ring_avail_dx(ring) <
-						AQ_CFG_SKB_FRAGS_MAX);
+	aq_ring_update_queue_state(ring);
 
-	if (is_nic_in_bad_state) {
-		aq_nic_ndev_queue_stop(self, ring->idx);
+	/* Above status update may stop the queue. Check this. */
+	if (__netif_subqueue_stopped(self->ndev, ring->idx)) {
 		err = NETDEV_TX_BUSY;
 		goto err_exit;
 	}
@@ -632,9 +626,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb)
 						      ring,
 						      frags);
 		if (err >= 0) {
-			if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1)
-				aq_nic_ndev_queue_stop(self, ring->idx);
-
 			++ring->stats.tx.packets;
 			ring->stats.tx.bytes += skb->len;
 		}
@@ -906,9 +897,7 @@ int aq_nic_stop(struct aq_nic_s *self)
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 
-	for (i = 0U, aq_vec = self->aq_vec[0];
-		self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i])
-		aq_nic_ndev_queue_stop(self, i);
+	netif_tx_disable(self->ndev);
 
 	del_timer_sync(&self->service_timer);
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 7fc2a5e..0ddd556 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self);
 int aq_nic_init(struct aq_nic_s *self);
 int aq_nic_cfg_start(struct aq_nic_s *self);
 int aq_nic_ndev_register(struct aq_nic_s *self);
-void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx);
-void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx);
 void aq_nic_ndev_free(struct aq_nic_s *self);
 int aq_nic_start(struct aq_nic_s *self);
 int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 4eee199..02f79b0 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self)
 	return 0;
 }
 
+void aq_ring_update_queue_state(struct aq_ring_s *ring)
+{
+	if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX)
+		aq_ring_queue_stop(ring);
+	else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES)
+		aq_ring_queue_wake(ring);
+}
+
+void aq_ring_queue_wake(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (__netif_subqueue_stopped(ndev, ring->idx)) {
+		netif_wake_subqueue(ndev, ring->idx);
+		ring->stats.tx.queue_restarts++;
+	}
+}
+
+void aq_ring_queue_stop(struct aq_ring_s *ring)
+{
+	struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic);
+
+	if (!__netif_subqueue_stopped(ndev, ring->idx))
+		netif_stop_subqueue(ndev, ring->idx);
+}
+
 void aq_ring_tx_clean(struct aq_ring_s *self)
 {
 	struct device *dev = aq_nic_get_dev(self->aq_nic);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
index 782176c..24523b5 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h
@@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s {
 	u64 errors;
 	u64 packets;
 	u64 bytes;
+	u64 queue_restarts;
 };
 
 union aq_ring_stats_s {
@@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
 int aq_ring_init(struct aq_ring_s *self);
 void aq_ring_rx_deinit(struct aq_ring_s *self);
 void aq_ring_free(struct aq_ring_s *self);
+void aq_ring_update_queue_state(struct aq_ring_s *ring);
+void aq_ring_queue_wake(struct aq_ring_s *ring);
+void aq_ring_queue_stop(struct aq_ring_s *ring);
 void aq_ring_tx_clean(struct aq_ring_s *self);
 int aq_ring_rx_clean(struct aq_ring_s *self,
 		     struct napi_struct *napi,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index ebf5880..305ff8f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget)
 			if (ring[AQ_VEC_TX_ID].sw_head !=
 			    ring[AQ_VEC_TX_ID].hw_head) {
 				aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]);
-
-				if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) >
-				    AQ_CFG_SKB_FRAGS_MAX) {
-					aq_nic_ndev_queue_start(self->aq_nic,
-						ring[AQ_VEC_TX_ID].idx);
-				}
+				aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]);
 				was_tx_cleaned = true;
 			}
 
@@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self,
 		stats_tx->packets += tx->packets;
 		stats_tx->bytes += tx->bytes;
 		stats_tx->errors += tx->errors;
+		stats_tx->queue_restarts += tx->queue_restarts;
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 1/4] net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh
In-Reply-To: <cover.1505915085.git.igor.russkikh@aquantia.com>

Although the hardware is capable of almost 16K MTU, without the max_mtu field
correctly set it only allows the standard MTU to be used.
This patch enables max MTU, calculating it from hardware maximum frame size
of 16352 octets (including FCS).

Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu")

Signed-off-by: Pavel Belous <Pavel.Belous@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c                    | 5 +++--
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 6ac9e26..f281392 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -214,7 +214,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops,
 	SET_NETDEV_DEV(ndev, dev);
 
 	ndev->if_port = port;
-	ndev->min_mtu = ETH_MIN_MTU;
 	self->ndev = ndev;
 
 	self->aq_pci_func = aq_pci_func;
@@ -283,6 +282,8 @@ int aq_nic_ndev_init(struct aq_nic_s *self)
 	self->ndev->features = aq_hw_caps->hw_features;
 	self->ndev->priv_flags = aq_hw_caps->hw_priv_flags;
 	self->ndev->mtu = aq_nic_cfg->mtu - ETH_HLEN;
+	self->ndev->min_mtu = ETH_MIN_MTU;
+	self->ndev->max_mtu = self->aq_hw_caps.mtu - ETH_FCS_LEN - ETH_HLEN;
 
 	return 0;
 }
@@ -695,7 +696,7 @@ int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu)
 {
 	int err = 0;
 
-	if (new_mtu > self->aq_hw_caps.mtu) {
+	if (new_mtu + ETH_FCS_LEN > self->aq_hw_caps.mtu) {
 		err = -EINVAL;
 		goto err_exit;
 	}
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
index f3957e93..fcf89e2 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h
@@ -16,7 +16,7 @@
 
 #include "../aq_common.h"
 
-#define HW_ATL_B0_MTU_JUMBO (16000U)
+#define HW_ATL_B0_MTU_JUMBO  16352U
 #define HW_ATL_B0_MTU        1514U
 
 #define HW_ATL_B0_TX_RINGS 4U
-- 
2.7.4

^ permalink raw reply related

* [PATCH net 0/4] net:ethernet:aquantia: Atlantic driver bugfixes and improvements
From: Igor Russkikh @ 2017-09-21 10:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, David Arcari, Pavel Belous, Nadezhda Krupnina,
	Simon Edelhaus, Igor Russkikh

This series contains bugfixes for aQuantia Atlantic driver.

Igor Russkikh (3):
  net:ethernet:aquantia: Setup max_mtu in ndev to enable jumbo frames
  net:ethernet:aquantia: Fix Tx queue hangups
  net:ethernet:aquantia: Fix transient invalid link down/up indications

Pavel Belous (1):
  net:ethernet:atlantic: fix iommu errors

 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h    |   4 +
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c    | 139 ++++++++++-----------
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h    |   2 -
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c   |  53 ++++++--
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h   |  10 +-
 drivers/net/ethernet/aquantia/atlantic/aq_vec.c    |   8 +-
 .../aquantia/atlantic/hw_atl/hw_atl_b0_internal.h  |   2 +-
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c        |   3 +-
 8 files changed, 130 insertions(+), 91 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Eric Dumazet @ 2017-09-21 10:37 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: David Miller, netdev, pablo, fw, edumazet, hannes
In-Reply-To: <1505986931.2560.51.camel@redhat.com>

On Thu, 2017-09-21 at 11:42 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thanks for the feedback!
> 
> On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> > From: Paolo Abeni <pabeni@redhat.com>
> > Date: Wed, 20 Sep 2017 18:54:00 +0200
> > 
> > > This series introduces the infrastructure to store inside the skb a socket
> > > pointer without carrying a refcount to the socket.
> > > 
> > > Such infrastructure is then used in the network receive path - and
> > > specifically the early demux operation.
> > > 
> > > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > > with many benefits:
> > > 
> > > - the UDP early demux code is now much simpler
> > > - the early demux does not hit any performance penalties in case of UDP hash
> > >   table collision - previously the early demux performed a partial, unsuccessful,
> > >   lookup
> > > - early demux is now operational also for unconnected sockets.
> > > 
> > > This infrastructure will be used in follow-up series to allow dst caching for
> > > unconnected UDP sockets, and then to extend the same features to TCP listening
> > > sockets.
> > 
> > Like Eric, I find this series (while exciting) quite scary :-)
> > 
> > You really have to post some kind of performance numbers in your
> > header posting in order to justify something with these ramifications
> > and scale.
> 
> This is actually a preparatory work for the next series which will
> bring in the real gain. The next patches are still to be polished so we
>  posted this separately to get some early feedback. 
> 
> If that would help, I can post the follow-up soon as RFC. Overall -
> with the follow-up applied, too - when using a single rx ingress
> queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
> rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
> ingress queues, the gain is smaller but still measurable (roughly 5%). 
> 
> Please let me know if you prefer to see the full work early. 

I want to see the full work yes. Ipv6, and everything.

I do not want ~1000 lines of changed code in the stack for some corner
cases, where people do not properly use existing infra, like proper
SO_REUSEPORT with proper BPF filter to have as many clean siloes (proper
CPU/NUMA affinities to avoid QPI traffic)

The complexity of your patches reached a point where I am extremely
nervous.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 1/5] net: add support for noref skb->sk
From: Eric Dumazet @ 2017-09-21 10:35 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, David S. Miller, Pablo Neira Ayuso, Florian Westphal,
	Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <1505985297.2560.39.camel@redhat.com>

On Thu, 2017-09-21 at 11:14 +0200, Paolo Abeni wrote:
> Hi,
> 
> Thank you for looking at it!
> 
> On Wed, 2017-09-20 at 10:41 -0700, Eric Dumazet wrote:
> > On Wed, 2017-09-20 at 18:54 +0200, Paolo Abeni wrote:
> > > Noref sk do not carry a socket refcount, are valid
> > > only inside the current RCU section and must be
> > > explicitly cleared before exiting such section.
> > > 
> > > They will be used in a later patch to allow early demux
> > > without sock refcounting.
> > 
> > 
> > 
> > 
> > > +/* dummy destructor used by noref sockets */
> > > +void sock_dummyfree(struct sk_buff *skb)
> > > +{
> > 
> > BUG();
> > 
> > > +}
> > > +EXPORT_SYMBOL(sock_dummyfree);
> > > +
> 
> We can call sock_dummyfree() in legitimate paths, see below, but we can
> add a:
> 
> WARN_ON_ONCE(!rcu_read_lock_held());

This won't be enough; see below.

> 
> here and in  skb_clear_noref_sk(). That should help much to catch
> possible bugs.
> 
> > I do not see how you ensure we do not leave RCU section with an skb
> > destructor pointing to this sock_dummyfree()
> > 
> > This patch series looks quite dangerous to me.
> 
> The idea is to explicitly clear the sknoref references before leaving
> the RCU section. Quite alike what we currently do for dst noref, but
> here the only place where we get a noref socket is the socket early
> demux, thus the scope of this change is more limited to what we have
> with noref dst_entries.
> 
> The relevant code is in the next 2 patches; after the demux we preserve
> the sknoref only if the skb has a local destination. The UDP socket
> will then set the noref on early demux lookup, and the skb will either:
> 
> * land on the corresponding UDP socket, the receive function will steal
> the sknoref
> * be dropped by some nft/iptables target - the dummy destructor is
> called
> * forwarded by some nft/iptables target outside the input path; we
> clear the skref explicitly in such targets. 
> 
> Currently there are a handful of places affected, and we can simplify
> the code dropping the early demux result for locally terminated
> multicast sockets on a host acting as a multicast router, please see
> the comment on the next patch.
> 
> > Do we really have real applications using connected UDP sockets and
> > wanting very high pps throughput ?
> 
> The ultimate goal is to improve the unconnected UDP sockets scenario,
> we do actually have use cases for that - DNS servers and VoIP SBCs.

Unconnected UDP traffic does not use refcounting on sk _already_.

And SO_REUSEPORT already allows us to handle all the traffic we want
_already_.


Please take a look at 71563f3414e917c62acd8e0fb0edf8ed6af63e4b

This might tell you why I am so nervous about your changes.

Checking WARN_ON_ONCE(!rcu_read_lock_held());
is not enough.

rcu_read_lock()
skb->destructor = sock_dummyfree;

queue the packet into an intermediate queue.
rcu_read_unlock();

....

rcu_read_lock()
...
if (skb->sk && skb->sk->state == ...) // crash

Also you covered IPv4, but really we need to forget about IPv4 and focus
on IPv6 only. And _then_ take care of IPv4 compat.

^ permalink raw reply

* [net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl
From: Vincent Bernat @ 2017-09-21 10:05 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern, David Miller, bridge, netdev
  Cc: Vincent Bernat
In-Reply-To: <20170920162140.369bb198@xeon-e3>

Currently, there is a difference in netlink events received when an
interface is modified through bridge ioctl() or through netlink. This
patch generates additional events when an interface is added to or
removed from a bridge via ioctl().

When adding then removing an interface from a bridge with netlink, we
get:

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

When using ioctl():

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
    link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default
    link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

Without this patch, the last netlink notification is not sent.

Signed-off-by: Vincent Bernat <vincent@bernat.im>
---
 net/bridge/br_ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..66cd98772051 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 	else
 		ret = br_del_if(br, dev);
 
+	if (!ret)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
 	return ret;
 }
 
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
From: Vincent Bernat @ 2017-09-21 10:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Ahern, David Miller, bridge, netdev
In-Reply-To: <20170920162140.369bb198@xeon-e3>

 ❦ 20 septembre 2017 16:21 -0700, Stephen Hemminger <stephen@networkplumber.org> :

> The one concern is that ports added or removed through ioctl should
> cause same events as doing the same thing via netlink. Some users use
> brctl (ioctl) and others use newer bridge (netlink) API.

I'll make a third iteration to have the same notifications when using
ioctl() with details in the commit message.
-- 
When in doubt, tell the truth.
		-- Mark Twain

^ permalink raw reply

* Re: [PATCH net-next 0/5] net: introduce noref sk
From: Paolo Abeni @ 2017-09-21  9:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, pablo, fw, edumazet, hannes
In-Reply-To: <20170920.202022.2073272587145961156.davem@davemloft.net>

Hi,

Thanks for the feedback!

On Wed, 2017-09-20 at 20:20 -0700, David Miller wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> Date: Wed, 20 Sep 2017 18:54:00 +0200
> 
> > This series introduces the infrastructure to store inside the skb a socket
> > pointer without carrying a refcount to the socket.
> > 
> > Such infrastructure is then used in the network receive path - and
> > specifically the early demux operation.
> > 
> > This allows the UDP early demux to perform a full lookup for UDP sockets,
> > with many benefits:
> > 
> > - the UDP early demux code is now much simpler
> > - the early demux does not hit any performance penalties in case of UDP hash
> >   table collision - previously the early demux performed a partial, unsuccessful,
> >   lookup
> > - early demux is now operational also for unconnected sockets.
> > 
> > This infrastructure will be used in follow-up series to allow dst caching for
> > unconnected UDP sockets, and then to extend the same features to TCP listening
> > sockets.
> 
> Like Eric, I find this series (while exciting) quite scary :-)
> 
> You really have to post some kind of performance numbers in your
> header posting in order to justify something with these ramifications
> and scale.

This is actually a preparatory work for the next series which will
bring in the real gain. The next patches are still to be polished so we
 posted this separately to get some early feedback. 

If that would help, I can post the follow-up soon as RFC. Overall -
with the follow-up applied, too - when using a single rx ingress
queue, I measured ~20% tput gain for unconnected ipv4 sockets - with
rp_filter disabled - and ~30% for ipv6 sockets. In case of multiple
ingress queues, the gain is smaller but still measurable (roughly 5%). 

Please let me know if you prefer to see the full work early. 

Thanks,

Paolo

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox