Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v3 net-next 03/10] net: hns3: Add support for PFC setting in TM module
From: Yunsheng Lin @ 2017-09-27  1:45 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1506476732-128130-1-git-send-email-linyunsheng@huawei.com>

This patch add a pfc_pause_en cmd, and use it to configure
PFC option according to fc_mode in hdev->tm_info.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 68 ++++++++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  5 ++
 2 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 73a75d7..0b4b5d9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -124,6 +124,20 @@ static int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
+static int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap,
+				  u8 pfc_bitmap)
+{
+	struct hclge_desc desc;
+	struct hclge_pfc_en_cmd *pfc = (struct hclge_pfc_en_cmd *)&desc.data;
+
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PFC_PAUSE_EN, false);
+
+	pfc->tx_rx_en_bitmap = tx_rx_bitmap;
+	pfc->pri_en_bitmap = pfc_bitmap;
+
+	return hclge_cmd_send(&hdev->hw, &desc, 1);
+}
+
 static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id)
 {
 	u8 tc;
@@ -969,20 +983,64 @@ static int hclge_tm_schd_setup_hw(struct hclge_dev *hdev)
 	return hclge_tm_schd_mode_hw(hdev);
 }
 
+static int hclge_pfc_setup_hw(struct hclge_dev *hdev)
+{
+	u8 enable_bitmap = 0;
+
+	if (hdev->tm_info.fc_mode == HCLGE_FC_PFC)
+		enable_bitmap = HCLGE_TX_MAC_PAUSE_EN_MSK |
+				HCLGE_RX_MAC_PAUSE_EN_MSK;
+
+	return hclge_pfc_pause_en_cfg(hdev, enable_bitmap,
+				      hdev->tm_info.hw_pfc_map);
+}
+
+static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
+{
+	bool tx_en, rx_en;
+
+	switch (hdev->tm_info.fc_mode) {
+	case HCLGE_FC_NONE:
+		tx_en = false;
+		rx_en = false;
+		break;
+	case HCLGE_FC_RX_PAUSE:
+		tx_en = false;
+		rx_en = true;
+		break;
+	case HCLGE_FC_TX_PAUSE:
+		tx_en = true;
+		rx_en = false;
+		break;
+	case HCLGE_FC_FULL:
+		tx_en = true;
+		rx_en = true;
+		break;
+	default:
+		tx_en = true;
+		rx_en = true;
+	}
+
+	return hclge_mac_pause_en_cfg(hdev, tx_en, rx_en);
+}
+
 int hclge_pause_setup_hw(struct hclge_dev *hdev)
 {
-	bool en = hdev->tm_info.fc_mode != HCLGE_FC_PFC;
 	int ret;
 	u8 i;
 
-	ret = hclge_mac_pause_en_cfg(hdev, en, en);
-	if (ret)
-		return ret;
+	if (hdev->tm_info.fc_mode != HCLGE_FC_PFC)
+		return hclge_mac_pause_setup_hw(hdev);
 
-	/* Only DCB-supported dev supports qset back pressure setting */
+	/* Only DCB-supported dev supports qset back pressure and pfc cmd */
 	if (!hnae3_dev_dcb_supported(hdev))
 		return 0;
 
+	/* When MAC is GE Mode, hdev does not support pfc setting */
+	ret = hclge_pfc_setup_hw(hdev);
+	if (ret)
+		dev_warn(&hdev->pdev->dev, "set pfc pause failed:%d\n", ret);
+
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
 		ret = hclge_tm_qs_bp_cfg(hdev, i);
 		if (ret)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 85158b0..8ecd83c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -94,6 +94,11 @@ struct hclge_bp_to_qs_map_cmd {
 	u32 rsvd1;
 };
 
+struct hclge_pfc_en_cmd {
+	u8 tx_rx_en_bitmap;
+	u8 pri_en_bitmap;
+};
+
 #define hclge_tm_set_field(dest, string, val) \
 			hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \
 				       (HCLGE_TM_SHAP_##string##_LSH), val)
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 net-next 02/10] net: hns3: Add support for dynamically buffer reallocation
From: Yunsheng Lin @ 2017-09-27  1:45 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel
In-Reply-To: <1506476732-128130-1-git-send-email-linyunsheng@huawei.com>

Current buffer allocation can only happen at init, when
doing buffer reallocation after init, care must be taken
care of memory which priv_buf points to.
This patch fixes it by using a dynamic allocated temporary
memory. Because we only do buffer reallocation at init or
when setting up the DCB parameter, and priv_buf is only
used at buffer allocation process, so it is ok to use a
dynamic allocated temporary memory.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   5 +
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 150 +++++++++++----------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   2 -
 3 files changed, 87 insertions(+), 70 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index a81c6cb..6b6d28e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -322,6 +322,11 @@ struct hclge_shared_buf {
 	u32 buf_size;
 };
 
+struct hclge_pkt_buf_alloc {
+	struct hclge_priv_buf priv_buf[HCLGE_MAX_TC_NUM];
+	struct hclge_shared_buf s_buf;
+};
+
 #define HCLGE_RX_COM_WL_EN_B	15
 struct hclge_rx_com_wl_buf {
 	__le16 high_wl;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 02da3d5..b345070 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1324,7 +1324,8 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
+static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
@@ -1338,7 +1339,7 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
 	for (i = 0; i < HCLGE_TC_NUM; i++) {
-		u32 buf_size = hdev->priv_buf[i].tx_buf_size;
+		u32 buf_size = buf_alloc->priv_buf[i].tx_buf_size;
 
 		req->tx_pkt_buff[i] =
 			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
@@ -1355,9 +1356,10 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_tx_buffer_alloc(struct hclge_dev *hdev)
+static int hclge_tx_buffer_alloc(struct hclge_dev *hdev,
+				 struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	int ret = hclge_cmd_alloc_tx_buff(hdev);
+	int ret = hclge_cmd_alloc_tx_buff(hdev, buf_alloc);
 
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
@@ -1390,13 +1392,14 @@ static int hclge_get_pfc_enalbe_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc enabled TCs, which have private buffer */
-static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_pfc_priv_num(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if ((hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
 			cnt++;
@@ -1406,13 +1409,14 @@ static int hclge_get_pfc_priv_num(struct hclge_dev *hdev)
 }
 
 /* Get the number of pfc disabled TCs, which have private buffer */
-static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
+static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev,
+				     struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	int i, cnt = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i)) &&
 		    priv->enable)
@@ -1422,31 +1426,33 @@ static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev)
 	return cnt;
 }
 
-static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev)
+static u32 hclge_get_rx_priv_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_priv_buf *priv;
 	u32 rx_priv = 0;
 	int i;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (priv->enable)
 			rx_priv += priv->buf_size;
 	}
 	return rx_priv;
 }
 
-static u32 hclge_get_tx_buff_alloced(struct hclge_dev *hdev)
+static u32 hclge_get_tx_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	u32 i, total_tx_size = 0;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++)
-		total_tx_size += hdev->priv_buf[i].tx_buf_size;
+		total_tx_size += buf_alloc->priv_buf[i].tx_buf_size;
 
 	return total_tx_size;
 }
 
-static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
+static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc,
+				u32 rx_all)
 {
 	u32 shared_buf_min, shared_buf_tc, shared_std;
 	int tc_num, pfc_enable_num;
@@ -1467,30 +1473,31 @@ static bool  hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
 			hdev->mps;
 	shared_std = max_t(u32, shared_buf_min, shared_buf_tc);
 
-	rx_priv = hclge_get_rx_priv_buff_alloced(hdev);
+	rx_priv = hclge_get_rx_priv_buff_alloced(buf_alloc);
 	if (rx_all <= rx_priv + shared_std)
 		return false;
 
 	shared_buf = rx_all - rx_priv;
-	hdev->s_buf.buf_size = shared_buf;
-	hdev->s_buf.self.high = shared_buf;
-	hdev->s_buf.self.low =  2 * hdev->mps;
+	buf_alloc->s_buf.buf_size = shared_buf;
+	buf_alloc->s_buf.self.high = shared_buf;
+	buf_alloc->s_buf.self.low =  2 * hdev->mps;
 
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
 		if ((hdev->hw_tc_map & BIT(i)) &&
 		    (hdev->tm_info.hw_pfc_map & BIT(i))) {
-			hdev->s_buf.tc_thrd[i].low = hdev->mps;
-			hdev->s_buf.tc_thrd[i].high = 2 * hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].high = 2 * hdev->mps;
 		} else {
-			hdev->s_buf.tc_thrd[i].low = 0;
-			hdev->s_buf.tc_thrd[i].high = hdev->mps;
+			buf_alloc->s_buf.tc_thrd[i].low = 0;
+			buf_alloc->s_buf.tc_thrd[i].high = hdev->mps;
 		}
 	}
 
 	return true;
 }
 
-static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
+static int hclge_tx_buffer_calc(struct hclge_dev *hdev,
+				struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	u32 i, total_size;
 
@@ -1498,7 +1505,7 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
 
 	/* alloc tx buffer for all enabled tc */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		struct hclge_priv_buf *priv = &hdev->priv_buf[i];
+		struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i];
 
 		if (total_size < HCLGE_DEFAULT_TX_BUF)
 			return -ENOMEM;
@@ -1516,22 +1523,24 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev)
 
 /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs
  * @hdev: pointer to struct hclge_dev
+ * @buf_alloc: pointer to buffer calculation data
  * @return: 0: calculate sucessful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev)
+int hclge_rx_buffer_calc(struct hclge_dev *hdev,
+			 struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	u32 rx_all = hdev->pkt_buf_size;
 	int no_pfc_priv_num, pfc_priv_num;
 	struct hclge_priv_buf *priv;
 	int i;
 
-	rx_all -= hclge_get_tx_buff_alloced(hdev);
+	rx_all -= hclge_get_tx_buff_alloced(buf_alloc);
 
 	/* When DCB is not supported, rx private
 	 * buffer is not allocated.
 	 */
 	if (!hnae3_dev_dcb_supported(hdev)) {
-		if (!hclge_is_rx_buf_ok(hdev, rx_all))
+		if (!hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 			return -ENOMEM;
 
 		return 0;
@@ -1539,7 +1548,7 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 
 	/* step 1, try to alloc private buffer for all enabled tc */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 		if (hdev->hw_tc_map & BIT(i)) {
 			priv->enable = 1;
 			if (hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1560,14 +1569,14 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 2, try to decrease the buffer size of
 	 * no pfc TC's private buffer
 	 */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		priv->enable = 0;
 		priv->wl.low = 0;
@@ -1590,18 +1599,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 		}
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 3, try to reduce the number of pfc disabled TCs,
 	 * which have private buffer
 	 */
 	/* get the total no pfc enable TC number, which have private buffer */
-	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev);
+	no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    !(hdev->tm_info.hw_pfc_map & BIT(i))) {
@@ -1613,22 +1622,22 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 			no_pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    no_pfc_priv_num == 0)
 			break;
 	}
 
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	/* step 4, try to reduce the number of pfc enabled TCs
 	 * which have private buffer.
 	 */
-	pfc_priv_num = hclge_get_pfc_priv_num(hdev);
+	pfc_priv_num = hclge_get_pfc_priv_num(hdev, buf_alloc);
 
 	/* let the last to be cleared first */
 	for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) {
-		priv = &hdev->priv_buf[i];
+		priv = &buf_alloc->priv_buf[i];
 
 		if (hdev->hw_tc_map & BIT(i) &&
 		    hdev->tm_info.hw_pfc_map & BIT(i)) {
@@ -1640,17 +1649,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev)
 			pfc_priv_num--;
 		}
 
-		if (hclge_is_rx_buf_ok(hdev, rx_all) ||
+		if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) ||
 		    pfc_priv_num == 0)
 			break;
 	}
-	if (hclge_is_rx_buf_ok(hdev, rx_all))
+	if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all))
 		return 0;
 
 	return -ENOMEM;
 }
 
-static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
+static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_buff *req;
 	struct hclge_desc desc;
@@ -1662,7 +1672,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 	/* Alloc private buffer TCs */
 	for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-		struct hclge_priv_buf *priv = &hdev->priv_buf[i];
+		struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i];
 
 		req->buf_num[i] =
 			cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S);
@@ -1671,7 +1681,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 	}
 
 	req->shared_buf =
-		cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
+		cpu_to_le16((buf_alloc->s_buf.buf_size >> HCLGE_BUF_UNIT_S) |
 			    (1 << HCLGE_TC0_PRI_BUF_EN_B));
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -1686,7 +1696,8 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev)
 
 #define HCLGE_PRIV_ENABLE(a) ((a) > 0 ? 1 : 0)
 
-static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
+static int hclge_rx_priv_wl_config(struct hclge_dev *hdev,
+				   struct hclge_pkt_buf_alloc *buf_alloc)
 {
 	struct hclge_rx_priv_wl_buf *req;
 	struct hclge_priv_buf *priv;
@@ -1706,7 +1717,9 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 			desc[i].flag &= ~cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
 
 		for (j = 0; j < HCLGE_TC_NUM_ONE_DESC; j++) {
-			priv = &hdev->priv_buf[i * HCLGE_TC_NUM_ONE_DESC + j];
+			u32 idx = i * HCLGE_TC_NUM_ONE_DESC + j;
+
+			priv = &buf_alloc->priv_buf[idx];
 			req->tc_wl[j].high =
 				cpu_to_le16(priv->wl.high >> HCLGE_BUF_UNIT_S);
 			req->tc_wl[j].high |=
@@ -1731,9 +1744,10 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_thrd_config(struct hclge_dev *hdev)
+static int hclge_common_thrd_config(struct hclge_dev *hdev,
+				    struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *s_buf = &hdev->s_buf;
+	struct hclge_shared_buf *s_buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_thrd *req;
 	struct hclge_desc desc[2];
 	struct hclge_tc_thrd *tc;
@@ -1777,9 +1791,10 @@ static int hclge_common_thrd_config(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_common_wl_config(struct hclge_dev *hdev)
+static int hclge_common_wl_config(struct hclge_dev *hdev,
+				  struct hclge_pkt_buf_alloc *buf_alloc)
 {
-	struct hclge_shared_buf *buf = &hdev->s_buf;
+	struct hclge_shared_buf *buf = &buf_alloc->s_buf;
 	struct hclge_rx_com_wl *req;
 	struct hclge_desc desc;
 	int ret;
@@ -1809,69 +1824,68 @@ static int hclge_common_wl_config(struct hclge_dev *hdev)
 
 int hclge_buffer_alloc(struct hclge_dev *hdev)
 {
+	struct hclge_pkt_buf_alloc *pkt_buf;
 	int ret;
 
-	hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM,
-					    sizeof(struct hclge_priv_buf),
-					    GFP_KERNEL | __GFP_ZERO);
-	if (!hdev->priv_buf)
+	pkt_buf = kzalloc(sizeof(*pkt_buf), GFP_KERNEL);
+	if (!pkt_buf)
 		return -ENOMEM;
 
-	ret = hclge_tx_buffer_calc(hdev);
+	ret = hclge_tx_buffer_calc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc tx buffer size for all TCs %d\n", ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_tx_buffer_alloc(hdev);
+	ret = hclge_tx_buffer_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not alloc tx buffers %d\n", ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_buffer_calc(hdev);
+	ret = hclge_rx_buffer_calc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
 			"could not calc rx priv buffer size for all TCs %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
-	ret = hclge_rx_priv_buf_alloc(hdev);
+	ret = hclge_rx_priv_buf_alloc(hdev, pkt_buf);
 	if (ret) {
 		dev_err(&hdev->pdev->dev, "could not alloc rx priv buffer %d\n",
 			ret);
-		return ret;
+		goto out;
 	}
 
 	if (hnae3_dev_dcb_supported(hdev)) {
-		ret = hclge_rx_priv_wl_config(hdev);
+		ret = hclge_rx_priv_wl_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure rx private waterline %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 
-		ret = hclge_common_thrd_config(hdev);
+		ret = hclge_common_thrd_config(hdev, pkt_buf);
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"could not configure common threshold %d\n",
 				ret);
-			return ret;
+			goto out;
 		}
 	}
 
-	ret = hclge_common_wl_config(hdev);
-	if (ret) {
+	ret = hclge_common_wl_config(hdev, pkt_buf);
+	if (ret)
 		dev_err(&hdev->pdev->dev,
 			"could not configure common waterline %d\n", ret);
-		return ret;
-	}
 
-	return 0;
+out:
+	kfree(pkt_buf);
+	return ret;
 }
 
 static int hclge_init_roce_base_info(struct hclge_vport *vport)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 9fcfd93..4fc36f0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -463,8 +463,6 @@ struct hclge_dev {
 
 	u32 pkt_buf_size; /* Total pf buf size for tx/rx */
 	u32 mps; /* Max packet size */
-	struct hclge_priv_buf *priv_buf;
-	struct hclge_shared_buf s_buf;
 
 	enum hclge_mta_dmac_sel_type mta_mac_sel_type;
 	bool enable_mta; /* Mutilcast filter enable */
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 net-next 00/10] Add support for DCB feature in hns3 driver
From: Yunsheng Lin @ 2017-09-27  1:45 UTC (permalink / raw)
  To: davem
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
	netdev, linux-kernel

The patchset contains some enhancement related to DCB before
adding support for DCB feature.

This patchset depends on the following patchset:
https://patchwork.ozlabs.org/cover/815646/
https://patchwork.ozlabs.org/cover/816145/

High Level Architecture:

                   [ lldpad ]
                       |
                       |
                       |
                 [ hns3_dcbnl ]
                       |
                       |
                       |
                 [ hclge_dcb ]
                   /      \
                /            \
             /                  \
     [ hclge_main ]        [ hclge_tm ]

Current patch-set support following functionality:
   Use of lldptool to configure the tc schedule mode, tc
   bandwidth(if schedule mode is ETS), prio_tc_map and
   PFC parameter.

---
V3: Drop mqprio support

V2: Fix for not defining variables in local loop.

V1: Initial Submit.

Yunsheng Lin (10):
  net: hns3: Support for dynamically assigning tx buffer to TC
  net: hns3: Add support for dynamically buffer reallocation
  net: hns3: Add support for PFC setting in TM module
  net: hns3: Add support for port shaper setting in TM module
  net: hns3: Add tc-based TM support for sriov enabled port
  net: hns3: Add some interface for the support of DCB feature
  net: hns3: Add hclge_dcb module for the support of DCB feature
  net: hns3: Add dcb netlink interface for the support of DCB feature
  net: hns3: Setting for fc_mode and dcb enable flag in TM module
  net: hns3: Add DCB support when interacting with network stack

 drivers/net/ethernet/hisilicon/Kconfig             |   9 +
 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |  17 ++
 .../net/ethernet/hisilicon/hns3/hns3pf/Makefile    |   4 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   6 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 304 +++++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h |  21 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 214 ++++++++++-----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   8 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 231 ++++++++++++++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  15 +
 .../ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c    | 106 +++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 104 ++++++-
 .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h |   7 +
 13 files changed, 927 insertions(+), 119 deletions(-)
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h
 create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH net-next v9] openvswitch: enable NSH support
From: Yang, Yi @ 2017-09-27  1:39 UTC (permalink / raw)
  To: Jiri Benc
  Cc: netdev@vger.kernel.org, dev@openvswitch.org, e@erig.me,
	davem@davemloft.net, Pravin Shelar, jan.scheurich
In-Reply-To: <20170926124914.60101ca1@griffin>

On Tue, Sep 26, 2017 at 06:49:14PM +0800, Jiri Benc wrote:
> On Tue, 26 Sep 2017 12:55:39 +0800, Yang, Yi wrote:
> > After push_nsh, the packet won't be recirculated to flow pipeline, so
> > key->eth.type must be set explicitly here, but for pop_nsh, the packet
> > will be recirculated to flow pipeline, it will be reparsed, so
> > key->eth.type will be set in packet parse function, we needn't handle it
> > in pop_nsh.
> 
> This seems to be a very different approach than what we currently have.
> Looking at the code, the requirement after "destructive" actions such
> as pushing or popping headers is to recirculate.

This is optimization proposed by Jan Scheurich, recurculating after push_nsh
will impact on performance, recurculating after pop_nsh is unavoidable, So
also cc jan.scheurich@ericsson.com.

Actucally all the keys before push_nsh are still there after push_nsh,
push_nsh has updated all the nsh keys, so recirculating remains avoidable.

> 
> Setting key->eth.type to satisfy conditions in the output path without
> updating the rest of the key looks very hacky and fragile to me. There
> might be other conditions and dependencies that are not obvious.
> I don't think the code was written with such code path in mind.
> 
> I'd like to hear what Pravin thinks about this.
> 
>  Jiri

^ permalink raw reply

* Re: [ovs-dev] [PATCH net-next v9] openvswitch: enable NSH support
From: Yang, Yi @ 2017-09-27  1:09 UTC (permalink / raw)
  To: Eric Garver
  Cc: dev@openvswitch.org, netdev@vger.kernel.org, jbenc@redhat.com,
	davem@davemloft.net
In-Reply-To: <20170926205936.GE1786@dev-rhel7>

On Wed, Sep 27, 2017 at 04:59:36AM +0800, Eric Garver wrote:
> On Tue, Sep 26, 2017 at 01:02:15PM +0800, Yang, Yi wrote:
> > On Tue, Sep 26, 2017 at 03:28:42AM +0800, Eric Garver wrote:
> > > On Mon, Sep 25, 2017 at 10:16:09PM +0800, Yi Yang wrote:
> > > > +
> > > > +	length = nsh_hdr_len(nsh_hdr);
> > > > +	skb_pull(skb, length);
> > > 
> > > Do you need to verify you can actually pull length bytes? I don't see
> > > any guarantee.
> > 
> > I have added skb length check in pop_nsh, so that can verify this.
> 
> That doesn't help other code that may call skb_pop_nsh(). skb_vlan_pop()
> calls skb_ensure_writable() which seems like the right thing to do.

Make sense, I will move it to skp_pop_nsh, thanks.

^ permalink raw reply

* Re: [PATCH net-next v10] openvswitch: enable NSH support
From: Yang, Yi @ 2017-09-27  0:52 UTC (permalink / raw)
  To: Jiri Benc
  Cc: netdev@vger.kernel.org, dev@openvswitch.org, e@erig.me,
	davem@davemloft.net
In-Reply-To: <20170926164240.764a66ed@griffin>

On Tue, Sep 26, 2017 at 10:42:40PM +0800, Jiri Benc wrote:
> On Tue, 26 Sep 2017 21:52:41 +0800, Yang, Yi wrote:
> > > +	return ((ret != 0) ? false : true);
> > 
> > But I don't think this is a problematic line from my understanding,
> 
> Why not:
> 
> 	return ((ret != 0 == true) ? false : true) == true;
> 
> ?
> 
> Sigh. This is equal to:
> 
> 	return !ret;
> 
> which you should use.

Ok, got it, I'll use "return !ret;", real programming art :-), I also saw
!!(condition), personally its readability is not good, typical kernel
style :-)

^ permalink raw reply

* Re: [PATCH v4 2/3] ipv4: Namespaceify tcp_fastopen_key knob
From: 严海双 @ 2017-09-27  1:05 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, edumazet, weiwan, lucab, netdev, linux-kernel
In-Reply-To: <20170926.111851.1172660559080066162.davem@davemloft.net>



> On 2017年9月27日, at 上午2:18, David Miller <davem@davemloft.net> wrote:
> 
> From: 严海双 <yanhaishuang@cmss.chinamobile.com>
> Date: Tue, 26 Sep 2017 09:25:51 +0800
> 
>>> On 2017年9月26日, at 上午7:24, David Miller <davem@davemloft.net> wrote:
>>> 
>>> From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
>>> Date: Fri, 22 Sep 2017 21:48:43 +0800
>>> 
>>>> @@ -9,13 +9,18 @@
>>>> #include <net/inetpeer.h>
>>>> #include <net/tcp.h>
>>>> 
>>>> -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
>>>> -
>>>> -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
>>>> -
>>>> -void tcp_fastopen_init_key_once(bool publish)
>>>> +void tcp_fastopen_init_key_once(struct net *net)
>>> 
>>> Why did you remove the 'publish' logic from this function?
>>> 
>> 
>> I think this logic is not necessary now, in proc_tcp_fastopen_key, I have removed 
>> tcp_fastopen_init_key_once(false) where the ‘publish’ is false:
>> 
>> -		/* Generate a dummy secret but don't publish it. This
>> -		 * is needed so we don't regenerate a new key on the
>> -		 * first invocation of tcp_fastopen_cookie_gen
>> -		 */
>> -		tcp_fastopen_init_key_once(false);
>> -		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
>> +		tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH);
>> 
>> It said we don't regenerate a new key on first invocation of tcp_fastopen_cookie_gen, 
>> but in tcp_fastopen_cookie_gen，it didn’t  call tcp_fastopen_init_key_once since
>> from commit dfea2aa654243 (tcp: Do not call tcp_fastopen_reset_cipher from interrupt context)：
>> 
>> And in other places where call tcp_fastopen_init_key_once, the ‘publish’ is always true:
> 
> Ok, this simplification seems legitimate.
> 
> But it is unrelated to this namespacification.  So it should be in a separate patch,
> and should be documented well in the commit message using the great explanation you
> gave to me above.
> 
> Please respin this series, with this patch #2 split up into two changes.
> 
> Thank you.

Okay， thanks David for advise. I will split the patch #2 in next commit.

^ permalink raw reply

* [PATCH] net/ipv4: Update sk_for_each_entry_offset_rcu macro to  utilize rcu methods hlist_next_rcu. This fixes the warnings thrown by sparse  regarding net/ipv4/udp.c on line 1974.
From: Tim Hansen @ 2017-09-27  0:54 UTC (permalink / raw)
  Cc: Tim Hansen, David S. Miller, open list:NETWORKING [GENERAL],
	open list

Signed-off-by: Tim Hansen <devtimhansen@gmail.com>
---
 include/net/sock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index aeeec62992ca..516289f6404b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -732,10 +732,10 @@ static inline void sk_add_bind_node(struct sock *sk,
  *
  */
 #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)		       \
-	for (pos = rcu_dereference((head)->first);			       \
+	for (pos = rcu_dereference_raw(hlist_next_rcu((head)->first));	       \
 	     pos != NULL &&						       \
 		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
-	     pos = rcu_dereference(pos->next))
+	     pos = rcu_dereference_raw(hlist_next_rcu(pos->next)))
 
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
-- 
2.14.1

^ permalink raw reply related

* [PATCH net] net: Set sk_prot_creator when cloning sockets to the right proto
From: Christoph Paasch @ 2017-09-27  0:38 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

sk->sk_prot and sk->sk_prot_creator can differ when the app uses
IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one).
Which is why sk_prot_creator is there to make sure that sk_prot_free()
does the kmem_cache_free() on the right kmem_cache slab.

Now, if such a socket gets transformed back to a listening socket (using
connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through
sk_clone_lock() when a new connection comes in. But sk_prot_creator will
still point to the IPv6 kmem_cache (as everything got copied in
sk_clone_lock()). When freeing, we will thus put this
memory back into the IPv6 kmem_cache although it was allocated in the
IPv4 cache. I have seen memory corruption happening because of this.

With slub-debugging and MEMCG_KMEM enabled this gives the warning
	"cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP"

A C-program to trigger this:

void main(void)
{
        int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP);
        int new_fd, newest_fd, client_fd;
        struct sockaddr_in6 bind_addr;
        struct sockaddr_in bind_addr4, client_addr1, client_addr2;
        struct sockaddr unsp;
        int val;

        memset(&bind_addr, 0, sizeof(bind_addr));
        bind_addr.sin6_family = AF_INET6;
        bind_addr.sin6_port = ntohs(42424);

        memset(&client_addr1, 0, sizeof(client_addr1));
        client_addr1.sin_family = AF_INET;
        client_addr1.sin_port = ntohs(42424);
        client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1");

        memset(&client_addr2, 0, sizeof(client_addr2));
        client_addr2.sin_family = AF_INET;
        client_addr2.sin_port = ntohs(42421);
        client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1");

        memset(&unsp, 0, sizeof(unsp));
        unsp.sa_family = AF_UNSPEC;

        bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr));

        listen(fd, 5);

        client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
        connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1));
        new_fd = accept(fd, NULL, NULL);
        close(fd);

        val = AF_INET;
        setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val));

        connect(new_fd, &unsp, sizeof(unsp));

        memset(&bind_addr4, 0, sizeof(bind_addr4));
        bind_addr4.sin_family = AF_INET;
        bind_addr4.sin_port = ntohs(42421);
        bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4));

        listen(new_fd, 5);

        client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
        connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2));

        newest_fd = accept(new_fd, NULL, NULL);
        close(new_fd);

        close(client_fd);
        close(new_fd);
}

As far as I can see, this bug has been there since the beginning of the
git-days.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
---
 net/core/sock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index 9b7b6bbb2a23..7d55c05f449d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		sock_copy(newsk, sk);
 
+		newsk->sk_prot_creator = sk->sk_prot;
+
 		/* SANITY */
 		if (likely(newsk->sk_net_refcnt))
 			get_net(sock_net(newsk));
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH v2 net-next 10/10] net: hns3: Add mqprio support when interacting with network stack
From: Yunsheng Lin @ 2017-09-27  0:51 UTC (permalink / raw)
  To: Yuval Mintz
  Cc: huangdaode@hisilicon.com, xuwei5@hisilicon.com,
	liguozhu@hisilicon.com, Yisen.Zhuang@huawei.com,
	gabriele.paoloni@huawei.com, john.garry@huawei.com,
	linuxarm@huawei.com, salil.mehta@huawei.com, lipeng321@huawei.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	davem@davemloft.net
In-Reply-To: <AM0PR0502MB3683FFF227DA3D136AA63987BF7B0@AM0PR0502MB3683.eurprd05.prod.outlook.com>

Hi, Yuval

On 2017/9/26 20:29, Yuval Mintz wrote:
>> Hi, Yuval
>>
>> On 2017/9/26 14:43, Yuval Mintz wrote:
>>>> When using tc qdisc to configure DCB parameter, dcb_ops->setup_tc
>>>> is used to tell hclge_dcb module to do the setup.
>>>
>>> While this might be a step in the right direction, this causes an inconsistency
>>> in user experience - Some [well, most] vendors didn't allow the mqprio
>>> priority mapping to affect DCB, instead relying on the dcbnl functionality
>>> to control that configuration.
>>>
>>> A couple of options to consider:
>>>   - Perhaps said logic shouldn't be contained inside the driver but rather
>>>      in mqprio logic itself. I.e., rely on DCBNL functionality [if available] from
>>>      within mqprio and try changing the configuration.
>>
>> In net/dcb/dcbnl.c
>> dcbnl_ieee_set already call dcbnl_ieee_notify to notify the user space
>> configuration has changed, does this dcbnl_ieee_notify function do the
>> job for us? I am not sure if lldpad has registered for this notifition.
> 
> Not that familiar with the dcbnl calls; Shouldn't dcbnl_setall be called to
> make the configuration apply [or is that only for ieee]?

dcbnl_setall is for cee to make the configuration apply.
ieee does not have the apply operation.

> Regardless, don't know if it makes sense to assume user-application would
> fix the qdisc configuration by notification while dcbnl logic in kernel could have
> done that instead.
> 
>> As you suggested below, can we add a new TC_MQPRIO_HW_OFFLOAD_
>> value to
>> reflect that the configuration is needed to be changed by dcbnl_ieee_set
>> (perhaps some other function) in dcbnl?
>> Do you think it is feasible?
> 
> Either I'm miseading your answer or we think of it from 2 opposite end.
> I was thinking that the new offloaded flag would indicate to the underlying
> driver that it's expected to offload the prio mapping [as part of DCB].
> If the driver would be incapable of that it would refuse the offload.
> User would then have to explicitly request that the qdisc offload.


Adding a new offloaded flag to indicate that mqpri is using a hardware offload
shared by dcbnl seems a good idea.
As I do not know how the idea go with other, I will drop the mqprio support in
this patch, and try to add the mqprio support as you suggested in the next
patchset.

Thanks again for the lengthly reply.

> 
>>
>>
>>>   - Add a new TC_MQPRIO_HW_OFFLOAD_ value to explicitly reflect user
>>>      request to allow this configuration to affect DCB.
>>>
>>>> When using lldptool to configure DCB parameter, hclge_dcb module
>>>> call the client_ops->setup_tc to tell network stack which queue
>>>> and priority is using for specific tc.
>>>
>>> You're basically bypassing the mqprio logic.
>>> Since you're configuring the prio->queue mapping from DCB flow,
>>> you'll get an mqprio-like behavior [meaning a transmitted packet
>>> would reach a transmission queue associated with its priority] even
>>> if device wasn't grated with an mqprio qdisc.
>>> Why should your user even use mqprio? What benefit does he get from it?
>>>
>>> ...
>>>
>>>> +static int hns3_nic_set_real_num_queue(struct net_device *netdev)
>>>> +{
>>>> +	struct hns3_nic_priv *priv = netdev_priv(netdev);
>>>> +	struct hnae3_handle *h = priv->ae_handle;
>>>> +	struct hnae3_knic_private_info *kinfo = &h->kinfo;
>>>> +	unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
>>>> +	int ret;
>>>> +
>>>> +	ret = netif_set_real_num_tx_queues(netdev, queue_size);
>>>> +	if (ret) {
>>>> +		netdev_err(netdev,
>>>> +			   "netif_set_real_num_tx_queues fail, ret=%d!\n",
>>>> +			   ret);
>>>> +		return ret;
>>>> +	}
>>>> +
>>>> +	ret = netif_set_real_num_rx_queues(netdev, queue_size);
>>>
>>> I don't think you're changing the driver behavior, but why are you setting
>>> the real number of rx queues based on the number of TCs?
>>> Do you actually open (TC x RSS) Rx queues?
>>>
>>> .
>>>
> 

^ permalink raw reply

* Re: [PATCH net-next 0/2] tools: add bpftool
From: Jakub Kicinski @ 2017-09-27  0:44 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, daniel, alexei.starovoitov, davem, hannes, oss-drivers
In-Reply-To: <5522855a-937f-b2cb-4c74-3448d1680b10@gmail.com>

On Tue, 26 Sep 2017 17:32:31 -0600, David Ahern wrote:
> On 9/26/17 9:35 AM, Jakub Kicinski wrote:
> > I'm looking for a home for bpftool, Daniel suggested that 
> > tools/net could be a good place, since there are only BPF
> > utilities there already.
> > 
> > The tool should be complete for simple use cases and we
> > will continue extending it as we go along.  E.g. providing
> > disassembly of loaded programs directly using LLVM library
> > and JSON output are high on the priority list.  
> 
> I have found this to be a very useful tool. Thanks for working on it.
> Moving it into the kernel will make it easier to build since it relies
> on libbpf and other files from the kernel tree.
> 
> One change I have made locally is to link against libbpf.a. That way I
> only need to copy one file to a system to use it.

Thanks!  I made the same change here, this patchset will have bpftool
linked against libbpf statically.

^ permalink raw reply

* Re: [PATCH net-next RFC 3/5] vhost: introduce vhost_add_used_idx()
From: Jason Wang @ 2017-09-27  0:38 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <20170926170047-mutt-send-email-mst@kernel.org>



On 2017年09月27日 03:13, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:33PM +0800, Jason Wang wrote:
>> This patch introduces a helper which just increase the used idx. This
>> will be used in pair with vhost_prefetch_desc_indices() by batching
>> code.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 33 +++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  1 +
>>   2 files changed, 34 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 8424166d..6532cda 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2178,6 +2178,39 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_add_used);
>>   
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n)
>> +{
>> +	u16 old, new;
>> +
>> +	old = vq->last_used_idx;
>> +	new = (vq->last_used_idx += n);
>> +	/* If the driver never bothers to signal in a very long while,
>> +	 * used index might wrap around. If that happens, invalidate
>> +	 * signalled_used index we stored. TODO: make sure driver
>> +	 * signals at least once in 2^16 and remove this.
>> +	 */
>> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
>> +		vq->signalled_used_valid = false;
>> +
>> +	/* Make sure buffer is written before we update index. */
>> +	smp_wmb();
>> +	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>> +			   &vq->used->idx)) {
>> +		vq_err(vq, "Failed to increment used idx");
>> +		return -EFAULT;
>> +	}
>> +	if (unlikely(vq->log_used)) {
>> +		/* Log used index update. */
>> +		log_write(vq->log_base,
>> +			  vq->log_addr + offsetof(struct vring_used, idx),
>> +			  sizeof(vq->used->idx));
>> +		if (vq->log_ctx)
>> +			eventfd_signal(vq->log_ctx, 1);
>> +	}
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(vhost_add_used_idx);
>> +
>>   static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>>   			    struct vring_used_elem *heads,
>>   			    unsigned count)
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 16c2cb6..5dd6c05 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -199,6 +199,7 @@ int __vhost_get_vq_desc(struct vhost_virtqueue *vq,
>>   void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
>>   
>>   int vhost_vq_init_access(struct vhost_virtqueue *);
>> +int vhost_add_used_idx(struct vhost_virtqueue *vq, int n);
>>   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>>   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
>>   		     unsigned count);
> Please change the API to hide the fact that there's an index that needs
> to be updated.

In fact, an interesting optimization on top is just call 
vhost_add_used_idx(vq, n) instead of n vhost_add_used_idx(vq, 1). That's 
the reason I leave n in the API.

Thanks

>
>> -- 
>> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next RFC 2/5] vhost: introduce helper to prefetch desc index
From: Jason Wang @ 2017-09-27  0:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <20170926221435-mutt-send-email-mst@kernel.org>



On 2017年09月27日 03:19, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:32PM +0800, Jason Wang wrote:
>> This patch introduces vhost_prefetch_desc_indices() which could batch
>> descriptor indices fetching and used ring updating. This intends to
>> reduce the cache misses of indices fetching and updating and reduce
>> cache line bounce when virtqueue is almost full. copy_to_user() was
>> used in order to benefit from modern cpus that support fast string
>> copy. Batched virtqueue processing will be the first user.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   drivers/vhost/vhost.h |  3 +++
>>   2 files changed, 58 insertions(+)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index f87ec75..8424166d 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -2437,6 +2437,61 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>>   
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update)
> why do you need to combine used update with prefetch?

For better performance and I believe we don't care about the overhead 
when we meet errors in tx.

>
>> +{
>> +	int ret, ret2;
>> +	u16 last_avail_idx, last_used_idx, total, copied;
>> +	__virtio16 avail_idx;
>> +	struct vring_used_elem __user *used;
>> +	int i;
>> +
>> +	if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
>> +		vq_err(vq, "Failed to access avail idx at %p\n",
>> +		       &vq->avail->idx);
>> +		return -EFAULT;
>> +	}
>> +	last_avail_idx = vq->last_avail_idx & (vq->num - 1);
>> +	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
>> +	total = vq->avail_idx - vq->last_avail_idx;
>> +	ret = total = min(total, num);
>> +
>> +	for (i = 0; i < ret; i++) {
>> +		ret2 = vhost_get_avail(vq, heads[i].id,
>> +				      &vq->avail->ring[last_avail_idx]);
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to get descriptors\n");
>> +			return -EFAULT;
>> +		}
>> +		last_avail_idx = (last_avail_idx + 1) & (vq->num - 1);
>> +	}
>> +
>> +	if (!used_update)
>> +		return ret;
>> +
>> +	last_used_idx = vq->last_used_idx & (vq->num - 1);
>> +	while (total) {
>> +		copied = min((u16)(vq->num - last_used_idx), total);
>> +		ret2 = vhost_copy_to_user(vq,
>> +					  &vq->used->ring[last_used_idx],
>> +					  &heads[ret - total],
>> +					  copied * sizeof(*used));
>> +
>> +		if (unlikely(ret2)) {
>> +			vq_err(vq, "Failed to update used ring!\n");
>> +			return -EFAULT;
>> +		}
>> +
>> +		last_used_idx = 0;
>> +		total -= copied;
>> +	}
>> +
>> +	/* Only get avail ring entries after they have been exposed by guest. */
>> +	smp_rmb();
> Barrier before return is a very confusing API. I guess it's designed to
> be used in a specific way to make it necessary - but what is it?

Looks like a and we need do this after reading avail_idx.

Thanks

>
>
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(vhost_prefetch_desc_indices);
>>   
>>   static int __init vhost_init(void)
>>   {
>> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
>> index 39ff897..16c2cb6 100644
>> --- a/drivers/vhost/vhost.h
>> +++ b/drivers/vhost/vhost.h
>> @@ -228,6 +228,9 @@ ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
>>   ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
>>   			     struct iov_iter *from);
>>   int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
>> +int vhost_prefetch_desc_indices(struct vhost_virtqueue *vq,
>> +				struct vring_used_elem *heads,
>> +				u16 num, bool used_update);
>>   
>>   #define vq_err(vq, fmt, ...) do {                                  \
>>   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>> -- 
>> 2.7.4

^ permalink raw reply

* Re: [PATCH net-next RFC 0/5] batched tx processing in vhost_net
From: Jason Wang @ 2017-09-27  0:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization, netdev, linux-kernel, kvm
In-Reply-To: <20170926164055-mutt-send-email-mst@kernel.org>

On 2017年09月26日 21:45, Michael S. Tsirkin wrote:
> On Fri, Sep 22, 2017 at 04:02:30PM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series tries to implement basic tx batched processing. This is
>> done by prefetching descriptor indices and update used ring in a
>> batch. This intends to speed up used ring updating and improve the
>> cache utilization.
> Interesting, thanks for the patches. So IIUC most of the gain is really
> overcoming some of the shortcomings of virtio 1.0 wrt cache utilization?

Yes.

Actually, looks like batching in 1.1 is not as easy as in 1.0.

In 1.0, we could do something like:

batch update used ring by user copy_to_user()
smp_wmb()
update used_idx

In 1.1, we need more memory barriers, can't benefit from fast copy helpers?

for () {
     update desc.addr
     smp_wmb()
     update desc.flag
}

>
> Which is fair enough (1.0 is already deployed) but I would like to avoid
> making 1.1 support harder, and this patchset does this unfortunately,

I think the new APIs do not expose more internal data structure of 
virtio than before? (vq->heads has already been used by vhost_net for 
years). Consider the layout is re-designed completely, I don't see an 
easy method to reuse current 1.0 API for 1.1.

> see comments on individual patches. I'm sure it can be addressed though.
>
>> Test shows about ~22% improvement in tx pss.
> Is this with or without tx napi in guest?

MoonGen is used in guest for better numbers.

Thanks

>
>> Please review.
>>
>> Jason Wang (5):
>>    vhost: split out ring head fetching logic
>>    vhost: introduce helper to prefetch desc index
>>    vhost: introduce vhost_add_used_idx()
>>    vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
>>    vhost_net: basic tx virtqueue batched processing
>>
>>   drivers/vhost/net.c   | 221 ++++++++++++++++++++++++++++----------------------
>>   drivers/vhost/vhost.c | 165 +++++++++++++++++++++++++++++++------
>>   drivers/vhost/vhost.h |   9 ++
>>   3 files changed, 270 insertions(+), 125 deletions(-)
>>
>> -- 
>> 2.7.4

^ permalink raw reply

* Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c
From: Yuchung Cheng @ 2017-09-27  0:18 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Oleksandr Natalenko, Hideaki YOSHIFUJI, Alexey Kuznetsov, netdev,
	linux-kernel@vger.kernel.org
In-Reply-To: <CAK6E8=eBZ6XhRg7ihoQ_2=4bTk1RSdxT2zJ_Z7-4X-HzNeaiQQ@mail.gmail.com>

On Tue, Sep 26, 2017 at 5:12 PM, Yuchung Cheng <ycheng@google.com> wrote:
> On Tue, Sep 26, 2017 at 6:10 AM, Roman Gushchin <guro@fb.com> wrote:
>>> On Wed, Sep 20, 2017 at 6:46 PM, Roman Gushchin <guro@fb.com> wrote:
>>> >
>>> > > Hello.
>>> > >
>>> > > Since, IIRC, v4.11, there is some regression in TCP stack resulting in the
>>> > > warning shown below. Most of the time it is harmless, but rarely it just
>>> > > causes either freeze or (I believe, this is related too) panic in
>>> > > tcp_sacktag_walk() (because sk_buff passed to this function is NULL).
>>> > > Unfortunately, I still do not have proper stacktrace from panic, but will try
>>> > > to capture it if possible.
>>> > >
>>> > > Also, I have custom settings regarding TCP stack, shown below as well. ifb is
>>> > > used to shape traffic with tc.
>>> > >
>>> > > Please note this regression was already reported as BZ [1] and as a letter to
>>> > > ML [2], but got neither attention nor resolution. It is reproducible for (not
>>> > > only) me on my home router since v4.11 till v4.13.1 incl.
>>> > >
>>> > > Please advise on how to deal with it. I'll provide any additional info if
>>> > > necessary, also ready to test patches if any.
>>> > >
>>> > > Thanks.
>>> > >
>>> > > [1] https://bugzilla.kernel.org/show_bug.cgi?id=195835
>>> > > [2] https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_netdev_msg436158.html&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=jJYgtDM7QT-W-Fz_d29HYQ&m=MDDRfLG5DvdOeniMpaZDJI8ulKQ6PQ6OX_1YtRsiTMA&s=-n3dGZw-pQ95kMBUfq5G9nYZFcuWtbTDlYFkcvQPoKc&e=
>>> >
>>> > We're experiencing the same problems on some machines in our fleet.
>>> > Exactly the same symptoms: tcp_fastretrans_alert() warnings and
>>> > sometimes panics in tcp_sacktag_walk().
>>> >
>>> > Here is an example of a backtrace with the panic log:
>>
>> Hi Yuchung!
>>
>>> do you still see the panics if you disable RACK?
>>> sysctl net.ipv4.tcp_recovery=0?
>>
>> No, we haven't seen any crash since that.
> I am out of ideas how RACK can potentially cause tcp_sacktag_walk to
> take an empty skb :-( Do you have stack trace or any hint on which call
> to tcp-sacktag_walk triggered the panic? internally at Google we never
> see that.
hmm something just struck me: could you try
sysctl net.ipv4.tcp_recovery=1 net.ipv4.tcp_retrans_collapse=0
and see if kernel still panics on sack processing?

>
>
>>
>>>
>>> also have you experience any sack reneg? could you post the output of
>>> ' nstat |grep -i TCP' thanks
>>
>> hostname        TcpActiveOpens                  2289680            0.0
>> hostname        TcpPassiveOpens                 3592758            0.0
>> hostname        TcpAttemptFails                 746910             0.0
>> hostname        TcpEstabResets                  154988             0.0
>> hostname        TcpInSegs                       16258678255        0.0
>> hostname        TcpOutSegs                      46967011611        0.0
>> hostname        TcpRetransSegs                  13724310           0.0
>> hostname        TcpInErrs                       2                  0.0
>> hostname        TcpOutRsts                      9418798            0.0
>> hostname        TcpExtEmbryonicRsts             2303               0.0
>> hostname        TcpExtPruneCalled               90192              0.0
>> hostname        TcpExtOfoPruned                 57274              0.0
>> hostname        TcpExtOutOfWindowIcmps          3                  0.0
>> hostname        TcpExtTW                        1164705            0.0
>> hostname        TcpExtTWRecycled                2                  0.0
>> hostname        TcpExtPAWSEstab                 159                0.0
>> hostname        TcpExtDelayedACKs               209207209          0.0
>> hostname        TcpExtDelayedACKLocked          508571             0.0
>> hostname        TcpExtDelayedACKLost            1713248            0.0
>> hostname        TcpExtListenOverflows           625                0.0
>> hostname        TcpExtListenDrops               625                0.0
>> hostname        TcpExtTCPHPHits                 9341188489         0.0
>> hostname        TcpExtTCPPureAcks               1434646465         0.0
>> hostname        TcpExtTCPHPAcks                 5733614672         0.0
>> hostname        TcpExtTCPSackRecovery           3261698            0.0
>> hostname        TcpExtTCPSACKReneging           12203              0.0
>> hostname        TcpExtTCPSACKReorder            433189             0.0
>> hostname        TcpExtTCPTSReorder              22694              0.0
>> hostname        TcpExtTCPFullUndo               45092              0.0
>> hostname        TcpExtTCPPartialUndo            22016              0.0
>> hostname        TcpExtTCPLossUndo               2150040            0.0
>> hostname        TcpExtTCPLostRetransmit         60119              0.0
>> hostname        TcpExtTCPSackFailures           2626782            0.0
>> hostname        TcpExtTCPLossFailures           182999             0.0
>> hostname        TcpExtTCPFastRetrans            4334275            0.0
>> hostname        TcpExtTCPSlowStartRetrans       3453348            0.0
>> hostname        TcpExtTCPTimeouts               1070997            0.0
>> hostname        TcpExtTCPLossProbes             2633545            0.0
>> hostname        TcpExtTCPLossProbeRecovery      941647             0.0
>> hostname        TcpExtTCPSackRecoveryFail       336302             0.0
>> hostname        TcpExtTCPRcvCollapsed           461354             0.0
>> hostname        TcpExtTCPAbortOnData            349196             0.0
>> hostname        TcpExtTCPAbortOnClose           3395               0.0
>> hostname        TcpExtTCPAbortOnTimeout         51201              0.0
>> hostname        TcpExtTCPMemoryPressures        2                  0.0
>> hostname        TcpExtTCPSpuriousRTOs           2120503            0.0
>> hostname        TcpExtTCPSackShifted            2613736            0.0
>> hostname        TcpExtTCPSackMerged             21358743           0.0
>> hostname        TcpExtTCPSackShiftFallback      8769387            0.0
>> hostname        TcpExtTCPBacklogDrop            5                  0.0
>> hostname        TcpExtTCPRetransFail            843                0.0
>> hostname        TcpExtTCPRcvCoalesce            949068035          0.0
>> hostname        TcpExtTCPOFOQueue               470118             0.0
>> hostname        TcpExtTCPOFODrop                9915               0.0
>> hostname        TcpExtTCPOFOMerge               9                  0.0
>> hostname        TcpExtTCPChallengeACK           90                 0.0
>> hostname        TcpExtTCPSYNChallenge           3                  0.0
>> hostname        TcpExtTCPFastOpenActive         2089               0.0
>> hostname        TcpExtTCPSpuriousRtxHostQueues  896596             0.0
>> hostname        TcpExtTCPAutoCorking            547386735          0.0
>> hostname        TcpExtTCPFromZeroWindowAdv      28757              0.0
>> hostname        TcpExtTCPToZeroWindowAdv        28761              0.0
>> hostname        TcpExtTCPWantZeroWindowAdv      322431             0.0
>> hostname        TcpExtTCPSynRetrans             3026               0.0
>> hostname        TcpExtTCPOrigDataSent           40976870977        0.0
>> hostname        TcpExtTCPHystartTrainDetect     453920             0.0
>> hostname        TcpExtTCPHystartTrainCwnd       11586273           0.0
>> hostname        TcpExtTCPHystartDelayDetect     10943              0.0
>> hostname        TcpExtTCPHystartDelayCwnd       763554             0.0
>> hostname        TcpExtTCPACKSkippedPAWS         30                 0.0
>> hostname        TcpExtTCPACKSkippedSeq          218                0.0
>> hostname        TcpExtTCPWinProbe               2408               0.0
>> hostname        TcpExtTCPKeepAlive              213768             0.0
>> hostname        TcpExtTCPMTUPFail               69                 0.0
>> hostname        TcpExtTCPMTUPSuccess            8811               0.0
>>
>> Thanks!

^ permalink raw reply

* Re: [REGRESSION] Warning in tcp_fastretrans_alert() of net/ipv4/tcp_input.c
From: Yuchung Cheng @ 2017-09-27  0:12 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Oleksandr Natalenko, Hideaki YOSHIFUJI, Alexey Kuznetsov, netdev,
	linux-kernel@vger.kernel.org
In-Reply-To: <20170926131011.GB26395@castle.DHCP.thefacebook.com>

On Tue, Sep 26, 2017 at 6:10 AM, Roman Gushchin <guro@fb.com> wrote:
>> On Wed, Sep 20, 2017 at 6:46 PM, Roman Gushchin <guro@fb.com> wrote:
>> >
>> > > Hello.
>> > >
>> > > Since, IIRC, v4.11, there is some regression in TCP stack resulting in the
>> > > warning shown below. Most of the time it is harmless, but rarely it just
>> > > causes either freeze or (I believe, this is related too) panic in
>> > > tcp_sacktag_walk() (because sk_buff passed to this function is NULL).
>> > > Unfortunately, I still do not have proper stacktrace from panic, but will try
>> > > to capture it if possible.
>> > >
>> > > Also, I have custom settings regarding TCP stack, shown below as well. ifb is
>> > > used to shape traffic with tc.
>> > >
>> > > Please note this regression was already reported as BZ [1] and as a letter to
>> > > ML [2], but got neither attention nor resolution. It is reproducible for (not
>> > > only) me on my home router since v4.11 till v4.13.1 incl.
>> > >
>> > > Please advise on how to deal with it. I'll provide any additional info if
>> > > necessary, also ready to test patches if any.
>> > >
>> > > Thanks.
>> > >
>> > > [1] https://bugzilla.kernel.org/show_bug.cgi?id=195835
>> > > [2] https://urldefense.proofpoint.com/v2/url?u=https-3A__www.spinics.net_lists_netdev_msg436158.html&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=jJYgtDM7QT-W-Fz_d29HYQ&m=MDDRfLG5DvdOeniMpaZDJI8ulKQ6PQ6OX_1YtRsiTMA&s=-n3dGZw-pQ95kMBUfq5G9nYZFcuWtbTDlYFkcvQPoKc&e=
>> >
>> > We're experiencing the same problems on some machines in our fleet.
>> > Exactly the same symptoms: tcp_fastretrans_alert() warnings and
>> > sometimes panics in tcp_sacktag_walk().
>> >
>> > Here is an example of a backtrace with the panic log:
>
> Hi Yuchung!
>
>> do you still see the panics if you disable RACK?
>> sysctl net.ipv4.tcp_recovery=0?
>
> No, we haven't seen any crash since that.
I am out of ideas how RACK can potentially cause tcp_sacktag_walk to
take an empty skb :-( Do you have stack trace or any hint on which call
to tcp-sacktag_walk triggered the panic? internally at Google we never
see that.


>
>>
>> also have you experience any sack reneg? could you post the output of
>> ' nstat |grep -i TCP' thanks
>
> hostname        TcpActiveOpens                  2289680            0.0
> hostname        TcpPassiveOpens                 3592758            0.0
> hostname        TcpAttemptFails                 746910             0.0
> hostname        TcpEstabResets                  154988             0.0
> hostname        TcpInSegs                       16258678255        0.0
> hostname        TcpOutSegs                      46967011611        0.0
> hostname        TcpRetransSegs                  13724310           0.0
> hostname        TcpInErrs                       2                  0.0
> hostname        TcpOutRsts                      9418798            0.0
> hostname        TcpExtEmbryonicRsts             2303               0.0
> hostname        TcpExtPruneCalled               90192              0.0
> hostname        TcpExtOfoPruned                 57274              0.0
> hostname        TcpExtOutOfWindowIcmps          3                  0.0
> hostname        TcpExtTW                        1164705            0.0
> hostname        TcpExtTWRecycled                2                  0.0
> hostname        TcpExtPAWSEstab                 159                0.0
> hostname        TcpExtDelayedACKs               209207209          0.0
> hostname        TcpExtDelayedACKLocked          508571             0.0
> hostname        TcpExtDelayedACKLost            1713248            0.0
> hostname        TcpExtListenOverflows           625                0.0
> hostname        TcpExtListenDrops               625                0.0
> hostname        TcpExtTCPHPHits                 9341188489         0.0
> hostname        TcpExtTCPPureAcks               1434646465         0.0
> hostname        TcpExtTCPHPAcks                 5733614672         0.0
> hostname        TcpExtTCPSackRecovery           3261698            0.0
> hostname        TcpExtTCPSACKReneging           12203              0.0
> hostname        TcpExtTCPSACKReorder            433189             0.0
> hostname        TcpExtTCPTSReorder              22694              0.0
> hostname        TcpExtTCPFullUndo               45092              0.0
> hostname        TcpExtTCPPartialUndo            22016              0.0
> hostname        TcpExtTCPLossUndo               2150040            0.0
> hostname        TcpExtTCPLostRetransmit         60119              0.0
> hostname        TcpExtTCPSackFailures           2626782            0.0
> hostname        TcpExtTCPLossFailures           182999             0.0
> hostname        TcpExtTCPFastRetrans            4334275            0.0
> hostname        TcpExtTCPSlowStartRetrans       3453348            0.0
> hostname        TcpExtTCPTimeouts               1070997            0.0
> hostname        TcpExtTCPLossProbes             2633545            0.0
> hostname        TcpExtTCPLossProbeRecovery      941647             0.0
> hostname        TcpExtTCPSackRecoveryFail       336302             0.0
> hostname        TcpExtTCPRcvCollapsed           461354             0.0
> hostname        TcpExtTCPAbortOnData            349196             0.0
> hostname        TcpExtTCPAbortOnClose           3395               0.0
> hostname        TcpExtTCPAbortOnTimeout         51201              0.0
> hostname        TcpExtTCPMemoryPressures        2                  0.0
> hostname        TcpExtTCPSpuriousRTOs           2120503            0.0
> hostname        TcpExtTCPSackShifted            2613736            0.0
> hostname        TcpExtTCPSackMerged             21358743           0.0
> hostname        TcpExtTCPSackShiftFallback      8769387            0.0
> hostname        TcpExtTCPBacklogDrop            5                  0.0
> hostname        TcpExtTCPRetransFail            843                0.0
> hostname        TcpExtTCPRcvCoalesce            949068035          0.0
> hostname        TcpExtTCPOFOQueue               470118             0.0
> hostname        TcpExtTCPOFODrop                9915               0.0
> hostname        TcpExtTCPOFOMerge               9                  0.0
> hostname        TcpExtTCPChallengeACK           90                 0.0
> hostname        TcpExtTCPSYNChallenge           3                  0.0
> hostname        TcpExtTCPFastOpenActive         2089               0.0
> hostname        TcpExtTCPSpuriousRtxHostQueues  896596             0.0
> hostname        TcpExtTCPAutoCorking            547386735          0.0
> hostname        TcpExtTCPFromZeroWindowAdv      28757              0.0
> hostname        TcpExtTCPToZeroWindowAdv        28761              0.0
> hostname        TcpExtTCPWantZeroWindowAdv      322431             0.0
> hostname        TcpExtTCPSynRetrans             3026               0.0
> hostname        TcpExtTCPOrigDataSent           40976870977        0.0
> hostname        TcpExtTCPHystartTrainDetect     453920             0.0
> hostname        TcpExtTCPHystartTrainCwnd       11586273           0.0
> hostname        TcpExtTCPHystartDelayDetect     10943              0.0
> hostname        TcpExtTCPHystartDelayCwnd       763554             0.0
> hostname        TcpExtTCPACKSkippedPAWS         30                 0.0
> hostname        TcpExtTCPACKSkippedSeq          218                0.0
> hostname        TcpExtTCPWinProbe               2408               0.0
> hostname        TcpExtTCPKeepAlive              213768             0.0
> hostname        TcpExtTCPMTUPFail               69                 0.0
> hostname        TcpExtTCPMTUPSuccess            8811               0.0
>
> Thanks!

^ permalink raw reply

* [PATCH net 8/9] net/8390: Fix redundant code
From: Finn Thain @ 2017-09-27  0:07 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, linux-kernel, linux-arm-kernel, Russell King
In-Reply-To: <cover.1506470623.git.fthain@telegraphics.com.au>

The patch which introduced the 8390 core module parameter 'msg_enable'
failed to do anything useful with it: it merely causes an ancient
version string to be logged.

Remove the other code that logs the same string. Use the msg_enable
module parameter as the default value for ei_local->msg_enable.
Otherwise, some 8390 modules have no way to set ei_local->msg_enable.

Also fix two more issues arising from the same patch: indentation
mistakes and pointless static variables.

Fixes: c45f812f0280 ("8390 : Replace ei_debug with msg_enable/NETIF_MSG_* feature")
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/8390/ax88796.c   |  3 ---
 drivers/net/ethernet/8390/axnet_cs.c  |  2 --
 drivers/net/ethernet/8390/etherh.c    | 17 -----------------
 drivers/net/ethernet/8390/hydra.c     |  4 ----
 drivers/net/ethernet/8390/lib8390.c   |  2 ++
 drivers/net/ethernet/8390/mac8390.c   |  7 -------
 drivers/net/ethernet/8390/mcf8390.c   |  4 ----
 drivers/net/ethernet/8390/pcnet_cs.c  |  4 ----
 drivers/net/ethernet/8390/zorro8390.c |  5 -----
 9 files changed, 2 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 05d9d3e2e92e..28aa79d2f16c 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -77,8 +77,6 @@ static unsigned char version[] = "ax88796.c: Copyright 2005,2007 Simtec Electron
 
 #define AX_GPOC_PPDSET	BIT(6)
 
-static u32 ax_msg_enable;
-
 /* device private data */
 
 struct ax_device {
@@ -747,7 +745,6 @@ static int ax_init_dev(struct net_device *dev)
 	ei_local->block_output = &ax_block_output;
 	ei_local->get_8390_hdr = &ax_get_8390_hdr;
 	ei_local->priv = 0;
-	ei_local->msg_enable = ax_msg_enable;
 
 	dev->netdev_ops = &ax_netdev_ops;
 	dev->ethtool_ops = &ax_ethtool_ops;
diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c
index 3da1fc539ef9..91e76dc1e6e1 100644
--- a/drivers/net/ethernet/8390/axnet_cs.c
+++ b/drivers/net/ethernet/8390/axnet_cs.c
@@ -104,7 +104,6 @@ static void AX88190_init(struct net_device *dev, int startp);
 static int ax_open(struct net_device *dev);
 static int ax_close(struct net_device *dev);
 static irqreturn_t ax_interrupt(int irq, void *dev_id);
-static u32 axnet_msg_enable;
 
 /*====================================================================*/
 
@@ -151,7 +150,6 @@ static int axnet_probe(struct pcmcia_device *link)
 	return -ENOMEM;
 
     ei_local = netdev_priv(dev);
-    ei_local->msg_enable = axnet_msg_enable;
     spin_lock_init(&ei_local->page_lock);
 
     info = PRIV(dev);
diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c
index 11cbf22ad201..32e9627e3880 100644
--- a/drivers/net/ethernet/8390/etherh.c
+++ b/drivers/net/ethernet/8390/etherh.c
@@ -64,8 +64,6 @@ static char version[] =
 
 #include "lib8390.c"
 
-static u32 etherh_msg_enable;
-
 struct etherh_priv {
 	void __iomem	*ioc_fast;
 	void __iomem	*memc;
@@ -502,18 +500,6 @@ etherh_close(struct net_device *dev)
 }
 
 /*
- * Initialisation
- */
-
-static void __init etherh_banner(void)
-{
-	static int version_printed;
-
-	if ((etherh_msg_enable & NETIF_MSG_DRV) && (version_printed++ == 0))
-		pr_info("%s", version);
-}
-
-/*
  * Read the ethernet address string from the on board rom.
  * This is an ascii string...
  */
@@ -671,8 +657,6 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id)
 	struct etherh_priv *eh;
 	int ret;
 
-	etherh_banner();
-
 	ret = ecard_request_resources(ec);
 	if (ret)
 		goto out;
@@ -757,7 +741,6 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id)
 	ei_local->block_output  = etherh_block_output;
 	ei_local->get_8390_hdr  = etherh_get_header;
 	ei_local->interface_num = 0;
-	ei_local->msg_enable = etherh_msg_enable;
 
 	etherh_reset(dev);
 	__NS8390_init(dev, 0);
diff --git a/drivers/net/ethernet/8390/hydra.c b/drivers/net/ethernet/8390/hydra.c
index 8ae249195301..941754ea78ec 100644
--- a/drivers/net/ethernet/8390/hydra.c
+++ b/drivers/net/ethernet/8390/hydra.c
@@ -66,7 +66,6 @@ static void hydra_block_input(struct net_device *dev, int count,
 static void hydra_block_output(struct net_device *dev, int count,
 			       const unsigned char *buf, int start_page);
 static void hydra_remove_one(struct zorro_dev *z);
-static u32 hydra_msg_enable;
 
 static struct zorro_device_id hydra_zorro_tbl[] = {
     { ZORRO_PROD_HYDRA_SYSTEMS_AMIGANET },
@@ -119,7 +118,6 @@ static int hydra_init(struct zorro_dev *z)
     int start_page, stop_page;
     int j;
     int err;
-    struct ei_device *ei_local;
 
     static u32 hydra_offsets[16] = {
 	0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
@@ -138,8 +136,6 @@ static int hydra_init(struct zorro_dev *z)
     start_page = NESM_START_PG;
     stop_page = NESM_STOP_PG;
 
-    ei_local = netdev_priv(dev);
-    ei_local->msg_enable = hydra_msg_enable;
     dev->base_addr = ioaddr;
     dev->irq = IRQ_AMIGA_PORTS;
 
diff --git a/drivers/net/ethernet/8390/lib8390.c b/drivers/net/ethernet/8390/lib8390.c
index 60f8e2c8e726..5d9bbde9fe68 100644
--- a/drivers/net/ethernet/8390/lib8390.c
+++ b/drivers/net/ethernet/8390/lib8390.c
@@ -975,6 +975,8 @@ static void ethdev_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	spin_lock_init(&ei_local->page_lock);
+
+	ei_local->msg_enable = msg_enable;
 }
 
 /**
diff --git a/drivers/net/ethernet/8390/mac8390.c b/drivers/net/ethernet/8390/mac8390.c
index 9497f18eaba0..1bfc66f37971 100644
--- a/drivers/net/ethernet/8390/mac8390.c
+++ b/drivers/net/ethernet/8390/mac8390.c
@@ -167,7 +167,6 @@ static void slow_sane_block_output(struct net_device *dev, int count,
 				   const unsigned char *buf, int start_page);
 static void word_memcpy_tocard(unsigned long tp, const void *fp, int count);
 static void word_memcpy_fromcard(void *tp, unsigned long fp, int count);
-static u32 mac8390_msg_enable;
 
 static enum mac8390_type __init mac8390_ident(struct nubus_dev *dev)
 {
@@ -297,8 +296,6 @@ static bool __init mac8390_init(struct net_device *dev, struct nubus_dev *ndev,
 	int offset;
 	volatile unsigned short *i;
 
-	printk_once(KERN_INFO pr_fmt("%s"), version);
-
 	dev->irq = SLOT2IRQ(ndev->board->slot);
 	/* This is getting to be a habit */
 	dev->base_addr = (ndev->board->slot_addr |
@@ -396,7 +393,6 @@ struct net_device * __init mac8390_probe(int unit)
 	struct net_device *dev;
 	struct nubus_dev *ndev = NULL;
 	int err = -ENODEV;
-	struct ei_device *ei_local;
 
 	static unsigned int slots;
 
@@ -436,9 +432,6 @@ struct net_device * __init mac8390_probe(int unit)
 	if (!ndev)
 		goto out;
 
-	 ei_local = netdev_priv(dev);
-	 ei_local->msg_enable = mac8390_msg_enable;
-
 	err = register_netdev(dev);
 	if (err)
 		goto out;
diff --git a/drivers/net/ethernet/8390/mcf8390.c b/drivers/net/ethernet/8390/mcf8390.c
index 4bb967bc879e..4ad8031ab669 100644
--- a/drivers/net/ethernet/8390/mcf8390.c
+++ b/drivers/net/ethernet/8390/mcf8390.c
@@ -38,7 +38,6 @@ static const char version[] =
 
 #define NESM_START_PG	0x40	/* First page of TX buffer */
 #define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
-static u32 mcf8390_msg_enable;
 
 #ifdef NE2000_ODDOFFSET
 /*
@@ -407,7 +406,6 @@ static int mcf8390_init(struct net_device *dev)
 static int mcf8390_probe(struct platform_device *pdev)
 {
 	struct net_device *dev;
-	struct ei_device *ei_local;
 	struct resource *mem, *irq;
 	resource_size_t msize;
 	int ret;
@@ -435,8 +433,6 @@ static int mcf8390_probe(struct platform_device *pdev)
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
 	platform_set_drvdata(pdev, dev);
-	ei_local = netdev_priv(dev);
-	ei_local->msg_enable = mcf8390_msg_enable;
 
 	dev->irq = irq->start;
 	dev->base_addr = mem->start;
diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c
index bd0a2a14b649..a81ffe4874e1 100644
--- a/drivers/net/ethernet/8390/pcnet_cs.c
+++ b/drivers/net/ethernet/8390/pcnet_cs.c
@@ -66,7 +66,6 @@
 #define PCNET_RDC_TIMEOUT (2*HZ/100)	/* Max wait in jiffies for Tx RDC */
 
 static const char *if_names[] = { "auto", "10baseT", "10base2"};
-static u32 pcnet_msg_enable;
 
 /*====================================================================*/
 
@@ -556,7 +555,6 @@ static int pcnet_config(struct pcmcia_device *link)
     int start_pg, stop_pg, cm_offset;
     int has_shmem = 0;
     struct hw_info *local_hw_info;
-    struct ei_device *ei_local;
 
     dev_dbg(&link->dev, "pcnet_config\n");
 
@@ -606,8 +604,6 @@ static int pcnet_config(struct pcmcia_device *link)
 	mii_phy_probe(dev);
 
     SET_NETDEV_DEV(dev, &link->dev);
-    ei_local = netdev_priv(dev);
-    ei_local->msg_enable = pcnet_msg_enable;
 
     if (register_netdev(dev) != 0) {
 	pr_notice("register_netdev() failed\n");
diff --git a/drivers/net/ethernet/8390/zorro8390.c b/drivers/net/ethernet/8390/zorro8390.c
index 6d93956b293b..35a500a21521 100644
--- a/drivers/net/ethernet/8390/zorro8390.c
+++ b/drivers/net/ethernet/8390/zorro8390.c
@@ -44,8 +44,6 @@
 static const char version[] =
 	"8390.c:v1.10cvs 9/23/94 Donald Becker (becker@cesdis.gsfc.nasa.gov)\n";
 
-static u32 zorro8390_msg_enable;
-
 #include "lib8390.c"
 
 #define DRV_NAME	"zorro8390"
@@ -296,7 +294,6 @@ static int zorro8390_init(struct net_device *dev, unsigned long board,
 	int err;
 	unsigned char SA_prom[32];
 	int start_page, stop_page;
-	struct ei_device *ei_local = netdev_priv(dev);
 	static u32 zorro8390_offsets[16] = {
 		0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
 		0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
@@ -388,8 +385,6 @@ static int zorro8390_init(struct net_device *dev, unsigned long board,
 	dev->netdev_ops = &zorro8390_netdev_ops;
 	__NS8390_init(dev, 0);
 
-	ei_local->msg_enable = zorro8390_msg_enable;
-
 	err = register_netdev(dev);
 	if (err) {
 		free_irq(IRQ_AMIGA_PORTS, dev);
-- 
2.13.5

^ permalink raw reply related

* Re: [PATCH net-next 0/5] net: dsa: use generic slave phydev
From: Florian Fainelli @ 2017-09-26 23:55 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn
In-Reply-To: <20170926211535.21273-1-vivien.didelot@savoirfairelinux.com>

On 09/26/2017 02:15 PM, Vivien Didelot wrote:
> DSA currently stores a phy_device pointer in each slave private
> structure. This requires to implement our own ethtool ksettings
> accessors and such.
> 
> This patchset removes the private phy_device in favor of the one
> provided in the net_device structure, and thus allows us to use the
> generic phy_ethtool_* functions.

For this series:

Tested-by: Florian Fainelli <f.fainelli@gmail.com>

On bcm_sf2 (7445 and 7278) along with the externally attached BCM53125
switch that needs the special MDIO read/write divert. We properly attach
to the right PHY devices in all cases.

Also tested unbind/bind, working correctly.

Thanks!

> 
> Vivien Didelot (5):
>   net: dsa: return -ENODEV is there is no slave PHY
>   net: dsa: use slave device phydev
>   net: dsa: use phy_ethtool_get_link_ksettings
>   net: dsa: use phy_ethtool_set_link_ksettings
>   net: dsa: use phy_ethtool_nway_reset
> 
>  net/dsa/dsa_priv.h |   1 -
>  net/dsa/slave.c    | 143 +++++++++++++++++++----------------------------------
>  2 files changed, 52 insertions(+), 92 deletions(-)
> 

-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 2/5] net: dsa: use slave device phydev
From: Florian Fainelli @ 2017-09-26 23:54 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn
In-Reply-To: <20170926211535.21273-3-vivien.didelot@savoirfairelinux.com>

On 09/26/2017 02:15 PM, Vivien Didelot wrote:
> There is no need to store a phy_device in dsa_slave_priv since
> net_device already provides one. Simply s/p->phy/dev->phydev/.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH iproute2] Add information about COLORFGBG to ip.8 man page
From: Roland Hopferwieser @ 2017-09-26 23:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20170920175633.55ee67c9@xeon-e3>

[-- Attachment #1: Type: text/plain, Size: 103 bytes --]

> Your patch was damaged by the mailer you used.
> Please fix and resubmit.

Sorry, now as attachment.

[-- Attachment #2: iproute2-Add-information-about-COLORFGBG-to-ip.8-man-page.patch --]
[-- Type: text/x-patch, Size: 487 bytes --]

diff --git a/man/man8/ip.8 b/man/man8/ip.8
index ae018fdf..2a27a56e 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -187,7 +187,8 @@ executes specified command over all objects, it depends if command supports this
 
 .TP
 .BR "\-c" , " -color"
-Use color output.
+Use color output. The color palette is affected by the COLORFGBG environment variable, which typically has the form "fg;bg".
+If "bg" is set to 0-6 or 8, the dark color palette is used.
 
 .TP
 .BR "\-t" , " \-timestamp"

^ permalink raw reply related

* [PATCH iproute2 v3] lib: json_print: rework 'new_json_obj' drop FILE* argument
From: Julien Fortin @ 2017-09-26 23:45 UTC (permalink / raw)
  To: netdev; +Cc: roopa, nikolay, dsa, Julien Fortin

From: Julien Fortin <julien@cumulusnetworks.com>

As Stephen Hemminger mentioned on the last submission the new_json_obj
function is always called with fp == stdout, so right now, there's no
need of this extra argument.

The background for the rework is the following:
The ip monitor didn't call `new_json_obj` (even for in non json context),
so the static FILE* _fp variable wasn't initialized, thus raising a
SIGSEGV in ipaddress.c. This patch should fix this issue for good, new
paths won't have to call `new_json_obj`.

How to reproduce:

$ ip -t mon label link
(gdb) bt
.#0  _IO_vfprintf_internal (s=s@entry=0x0, format=format@entry=0x45460d “%d: “, ap=ap@entry=0x7fffffff7f18) at vfprintf.c:1278
.#1  0x0000000000451310 in color_fprintf (fp=0x0, attr=<optimized out>, fmt=0x45460d “%d: “) at color.c:108
.#2  0x000000000044a856 in print_color_int (t=t@entry=PRINT_ANY, color=color@entry=4294967295, key=key@entry=0x4545fc “ifindex”,
    fmt=fmt@entry=0x45460d “%d: “, value=<optimized out>) at ip_print.c:132
.#3  0x000000000040ccd2 in print_int (value=<optimized out>, fmt=0x45460d “%d: “, key=0x4545fc “ifindex”, t=PRINT_ANY) at ip_common.h:189
.#4  print_linkinfo (who=<optimized out>, n=0x7fffffffa380, arg=0x7ffff77a82a0 <_IO_2_1_stdout_>) at ipaddress.c:1107
.#5  0x0000000000422e13 in accept_msg (who=0x7fffffff8320, ctrl=0x7fffffff8310, n=0x7fffffffa380, arg=0x7ffff77a82a0 <_IO_2_1_stdout_>) at ipmonitor.c:89
.#6  0x000000000044c58f in rtnl_listen (rtnl=0x672160 <rth>, handler=handler@entry=0x422c70 <accept_msg>, jarg=0x7ffff77a82a0 <_IO_2_1_stdout_>)
    at libnetlink.c:761
.#7  0x00000000004233db in do_ipmonitor (argc=<optimized out>, argv=0x7fffffffe5a0) at ipmonitor.c:310
.#8  0x0000000000408f74 in do_cmd (argv0=0x7fffffffe7f5 “mon”, argc=3, argv=0x7fffffffe588) at ip.c:116
.#9  0x0000000000408a94 in main (argc=4, argv=0x7fffffffe580) at ip.c:311

Fixes: 6377572f ("ip: ip_print: add new API to print JSON or regular format output")
Reported-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>
---
 include/json_print.h |  4 +---
 ip/ipaddress.c       |  4 ++--
 lib/json_print.c     | 31 ++++++++++---------------------
 3 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/include/json_print.h b/include/json_print.h
index 44cf5ac5..b6ce1f9f 100644
--- a/include/json_print.h
+++ b/include/json_print.h
@@ -29,13 +29,11 @@ enum output_type {
 	PRINT_ANY = 4,
 };
 
-void new_json_obj(int json, FILE *fp);
+void new_json_obj(int json);
 void delete_json_obj(void);
 
 bool is_json_context(void);
 
-void set_current_fp(FILE *fp);
-
 void fflush_fp(void);
 
 void open_json_object(const char *str);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index b8bc387a..9e9a7e0a 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1815,7 +1815,7 @@ static int ipaddr_showdump(void)
 	if (ipadd_dump_check_magic())
 		exit(-1);
 
-	new_json_obj(json, stdout);
+	new_json_obj(json);
 	open_json_object(NULL);
 	open_json_array(PRINT_JSON, "addr_info");
 
@@ -2176,7 +2176,7 @@ static int ipaddr_list_flush_or_save(int argc, char **argv, int action)
 	 * Initialize a json_writer and open an array object
 	 * if -json was specified.
 	 */
-	new_json_obj(json, stdout);
+	new_json_obj(json);
 
 	/*
 	 * If only filter_dev present and none of the other
diff --git a/lib/json_print.c b/lib/json_print.c
index 93b4119d..aa527af6 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -16,15 +16,14 @@
 #include "json_print.h"
 
 static json_writer_t *_jw;
-static FILE *_fp;
 
 #define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && _jw)
 #define _IS_FP_CONTEXT(type) (!_jw && (type & PRINT_FP || type & PRINT_ANY))
 
-void new_json_obj(int json, FILE *fp)
+void new_json_obj(int json)
 {
 	if (json) {
-		_jw = jsonw_new(fp);
+		_jw = jsonw_new(stdout);
 		if (!_jw) {
 			perror("json object");
 			exit(1);
@@ -32,7 +31,6 @@ void new_json_obj(int json, FILE *fp)
 		jsonw_pretty(_jw, true);
 		jsonw_start_array(_jw);
 	}
-	set_current_fp(fp);
 }
 
 void delete_json_obj(void)
@@ -48,15 +46,6 @@ bool is_json_context(void)
 	return _jw != NULL;
 }
 
-void set_current_fp(FILE *fp)
-{
-	if (!fp) {
-		fprintf(stderr, "Error: invalid file pointer.\n");
-		exit(1);
-	}
-	_fp = fp;
-}
-
 json_writer_t *get_json_writer(void)
 {
 	return _jw;
@@ -89,7 +78,7 @@ void open_json_array(enum output_type type, const char *str)
 			jsonw_name(_jw, str);
 		jsonw_start_array(_jw);
 	} else if (_IS_FP_CONTEXT(type)) {
-		fprintf(_fp, "%s", str);
+		printf("%s", str);
 	}
 }
 
@@ -103,7 +92,7 @@ void close_json_array(enum output_type type, const char *str)
 		jsonw_end_array(_jw);
 		jsonw_pretty(_jw, true);
 	} else if (_IS_FP_CONTEXT(type)) {
-		fprintf(_fp, "%s", str);
+		printf("%s", str);
 	}
 }
 
@@ -124,7 +113,7 @@ void close_json_array(enum output_type type, const char *str)
 			else						\
 				jsonw_##type_name##_field(_jw, key, value); \
 		} else if (_IS_FP_CONTEXT(t)) {				\
-			color_fprintf(_fp, color, fmt, value);          \
+			color_fprintf(stdout, color, fmt, value);          \
 		}							\
 	}
 _PRINT_FUNC(int, int);
@@ -147,7 +136,7 @@ void print_color_string(enum output_type type,
 		else
 			jsonw_string_field(_jw, key, value);
 	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value);
+		color_fprintf(stdout, color, fmt, value);
 	}
 }
 
@@ -168,7 +157,7 @@ void print_color_bool(enum output_type type,
 		else
 			jsonw_bool(_jw, value);
 	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value ? "true" : "false");
+		color_fprintf(stdout, color, fmt, value ? "true" : "false");
 	}
 }
 
@@ -187,7 +176,7 @@ void print_color_0xhex(enum output_type type,
 		snprintf(b1, sizeof(b1), "%#x", hex);
 		print_string(PRINT_JSON, key, NULL, b1);
 	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, hex);
+		color_fprintf(stdout, color, fmt, hex);
 	}
 }
 
@@ -206,7 +195,7 @@ void print_color_hex(enum output_type type,
 		else
 			jsonw_string(_jw, b1);
 	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, hex);
+		color_fprintf(stdout, color, fmt, hex);
 	}
 }
 
@@ -226,6 +215,6 @@ void print_color_null(enum output_type type,
 		else
 			jsonw_null(_jw);
 	} else if (_IS_FP_CONTEXT(type)) {
-		color_fprintf(_fp, color, fmt, value);
+		color_fprintf(stdout, color, fmt, value);
 	}
 }
-- 
2.14.1

^ permalink raw reply related

* [iproute2 net-next 3/3] man: Add initial manpage for tc-cbs(8)
From: Vinicius Costa Gomes @ 2017-09-26 23:39 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik
In-Reply-To: <20170926233958.12027-1-vinicius.gomes@intel.com>

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 man/man8/tc-cbs.8 | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 man/man8/tc-cbs.8

diff --git a/man/man8/tc-cbs.8 b/man/man8/tc-cbs.8
new file mode 100644
index 00000000..e84c5495
--- /dev/null
+++ b/man/man8/tc-cbs.8
@@ -0,0 +1,100 @@
+.TH CBS 8 "18 Sept 2017" "iproute2" "Linux"
+.SH NAME
+CBS \- Credit Based Shaper (CBS) Qdisc
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B parent
+classid
+.B [ handle
+major:
+.B ] cbs idleslope
+idleslope
+.B sendslope
+sendslope
+.B hicredit
+hicredit
+.B locredit
+locredit
+
+.SH DESCRIPTION
+The CBS (Credit Based Shaper) qdisc implements the shaping algorithm
+defined by the IEEE 802.1Q-2014 Section 8.6.8.2, which applies a well
+defined rate limiting method to the traffic.
+
+This queueing discipline is intended to be used by TSN (Time Sensitive
+Networking) applications, the CBS parameters are derived directly by
+what is described by the Annex L of the IEEE 802.1Q-2014
+Sepcification. The algorithm and how it affects the latency are
+detailed there.
+
+CBS is meant to be installed under another qdisc that maps packet
+flows to traffic classes, one example is
+.BR mqprio(8).
+
+.SH PARAMETERS
+.TP
+idleslope
+Idleslope is the rate of credits that is accumulated (in kilobits per
+second) when there is at least one packet waiting for transmission.
+Packets are transmitted when the current value of credits is equal or
+greater than zero. When there is no packet to be transmitted the
+amount of credits is set to zero. This is the main tunable of the CBS
+algorithm.
+.TP
+sendslope
+Sendslope is the rate of credits that is depleted (it should be a
+negative number of kilobits per second) when a transmission is
+ocurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
+8.6.8.2 item g):
+
+sendslope = idleslope - port_transmit_rate
+
+.TP
+hicredit
+Hicredit defines the maximum amount of credits (in bytes) that can be
+accumulated. Hicredit depends on the characteristics of interfering
+traffic, 'max_interference_size' is the maximum size of any burst of
+traffic that can delay the transmission of a frame that is available
+for transmission for this traffic class, (IEEE 802.1Q-2014 Annex L,
+Equation L-3):
+
+hicredit = max_interference_size * (idleslope / port_transmit_rate)
+
+.TP
+locredit
+Locredit is the minimum amount of credits that can be reached. It is a
+function of the traffic flowing through this qdisc (IEEE 802.1Q-2014
+Annex L, Equation L-2):
+
+locredit = max_frame_size * (sendslope / port_transmit_rate)
+
+.SH EXAMPLES
+
+CBS is used to enforce a Quality of Service by limiting the data rate
+of a traffic class, to separate packets into traffic classes the user
+may choose
+.BR mqprio(8),
+and configure it like this:
+
+.EX
+# tc qdisc add dev eth0 handle 100: parent root mqprio num_tc 3 \\
+	map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\
+	queues 1@0 1@1 2@2 \\
+	hw 0
+.EE
+.P
+To replace the current queuing disciple by CBS in the current queueing
+discipline connected to traffic class number 0, issue:
+.P
+.EX
+# tc qdisc replace dev eth0 parent 100:4 cbs \\
+	locredit -1470 hicredit 30 sendslope -980000 idleslope 20000
+.EE
+
+These values are obtained from the following parameters, idleslope is
+20mbit/s, the transmission rate is 1Gbit/s and the maximum interfering
+frame size is 1500 bytes.
+
+.SH AUTHORS
+Vinicius Costa Gomes <vinicius.gomes@intel.com>
-- 
2.14.2

^ permalink raw reply related

* [iproute2 net-next 2/3] tc: Add support for the CBS qdisc
From: Vinicius Costa Gomes @ 2017-09-26 23:39 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik
In-Reply-To: <20170926233958.12027-1-vinicius.gomes@intel.com>

The Credit Based Shaper (CBS) queueing discipline allows bandwidth
reservation with sub-milisecond precision. It is defined by the
802.1Q-2014 specification (section 8.6.8.2 and Annex L).

The syntax is:

tc qdisc add dev DEV parent NODE cbs locredit <LOCREDIT>
   		hicredit <HICREDIT> sendslope <SENDSLOPE>
		idleslope <IDLESLOPE>

(The order is not important)

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 tc/Makefile |   1 +
 tc/q_cbs.c  | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tc/q_cbs.c

diff --git a/tc/Makefile b/tc/Makefile
index 777de5e6..24bd3e2e 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -69,6 +69,7 @@ TCMODULES += q_hhf.o
 TCMODULES += q_clsact.o
 TCMODULES += e_bpf.o
 TCMODULES += f_matchall.o
+TCMODULES += q_cbs.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_cbs.c b/tc/q_cbs.c
new file mode 100644
index 00000000..80dd599a
--- /dev/null
+++ b/tc/q_cbs.c
@@ -0,0 +1,134 @@
+/*
+ * q_cbs.c		CBS.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... cbs hicredit BYTES locredit BYTES sendslope BPS idleslope BPS\n");
+}
+
+static void explain1(const char *arg, const char *val)
+{
+	fprintf(stderr, "cbs: illegal value for \"%s\": \"%s\"\n", arg, val);
+}
+
+static int cbs_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	int ok = 0;
+	struct tc_cbs_qopt opt = {};
+	struct rtattr *tail;
+
+	while (argc > 0) {
+		if (matches(*argv, "hicredit") == 0) {
+			NEXT_ARG();
+			if (opt.hicredit) {
+				fprintf(stderr, "cbs: duplicate \"hicredit\" specification\n");
+				return -1;
+			}
+			if (get_s32(&opt.hicredit, *argv, 0)) {
+				explain1("hicredit", *argv);
+				return -1;
+			}
+			ok++;
+		} else if (matches(*argv, "locredit") == 0) {
+			NEXT_ARG();
+			if (opt.locredit) {
+				fprintf(stderr, "cbs: duplicate \"locredit\" specification\n");
+				return -1;
+			}
+			if (get_s32(&opt.locredit, *argv, 0)) {
+				explain1("locredit", *argv);
+				return -1;
+			}
+			ok++;
+		} else if (matches(*argv, "sendslope") == 0) {
+			NEXT_ARG();
+			if (opt.sendslope) {
+				fprintf(stderr, "cbs: duplicate \"sendslope\" specification\n");
+				return -1;
+			}
+			if (get_s32(&opt.sendslope, *argv, 0)) {
+				explain1("sendslope", *argv);
+				return -1;
+			}
+			ok++;
+		} else if (matches(*argv, "idleslope") == 0) {
+			NEXT_ARG();
+			if (opt.idleslope) {
+				fprintf(stderr, "cbs: duplicate \"idleslope\" specification\n");
+				return -1;
+			}
+			if (get_s32(&opt.idleslope, *argv, 0)) {
+				explain1("idleslope", *argv);
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "cbs: unknown parameter \"%s\"\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	addattr_l(n, 2024, TCA_CBS_PARMS, &opt, sizeof(opt));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int cbs_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_CBS_MAX+1];
+	struct tc_cbs_qopt *qopt;
+
+	if (opt == NULL)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_CBS_MAX, opt);
+
+	if (tb[TCA_CBS_PARMS] == NULL)
+		return -1;
+
+	qopt = RTA_DATA(tb[TCA_CBS_PARMS]);
+	if (RTA_PAYLOAD(tb[TCA_CBS_PARMS])  < sizeof(*qopt))
+		return -1;
+
+	fprintf(f, "hicredit %d ", qopt->hicredit);
+	fprintf(f, "locredit %d ", qopt->locredit);
+	fprintf(f, "sendslope %d ", qopt->sendslope);
+	fprintf(f, "idleslope %d ", qopt->idleslope);
+
+	return 0;
+}
+
+struct qdisc_util cbs_qdisc_util = {
+	.id		= "cbs",
+	.parse_qopt	= cbs_parse_opt,
+	.print_qopt	= cbs_print_opt,
+};
-- 
2.14.2

^ permalink raw reply related

* [iproute2 net-next 1/3] update headers with CBS API
From: Vinicius Costa Gomes @ 2017-09-26 23:39 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, andre.guedes,
	ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran, henrik

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 include/linux/pkt_sched.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 099bf552..27c849c0 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -871,4 +871,21 @@ struct tc_pie_xstats {
 	__u32 maxq;             /* maximum queue size */
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
+
+/* CBS */
+struct tc_cbs_qopt {
+	__s32 hicredit;
+	__s32 locredit;
+	__s32 idleslope;
+	__s32 sendslope;
+};
+
+enum {
+	TCA_CBS_UNSPEC,
+	TCA_CBS_PARMS,
+	__TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
-- 
2.14.2

^ permalink raw reply related

* [next-queue PATCH 3/3] igb: Add support for CBS offload
From: Vinicius Costa Gomes @ 2017-09-26 23:39 UTC (permalink / raw)
  To: netdev, intel-wired-lan
  Cc: Andre Guedes, jhs, xiyou.wangcong, jiri, ivan.briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran, henrik
In-Reply-To: <20170926233916.11774-1-vinicius.gomes@intel.com>

From: Andre Guedes <andre.guedes@intel.com>

This patch adds support for Credit-Based Shaper (CBS) qdisc offload
from Traffic Control system. This support enable us to leverage the
Forwarding and Queuing for Time-Sensitive Streams (FQTSS) features
from Intel i210 Ethernet Controller. FQTSS is the former 802.1Qav
standard which was merged into 802.1Q in 2014. It enables traffic
prioritization and bandwidth reservation via the Credit-Based Shaper
which is implemented in hardware by i210 controller.

The patch introduces the igb_setup_tc() function which implements the
support for CBS qdisc hardware offload in the IGB driver. CBS offload
is the only traffic control offload supported by the driver at the
moment.

FQTSS transmission mode from i210 controller is automatically enabled
by the IGB driver when the CBS is enabled for the first hardware
queue. Likewise, FQTSS mode is automatically disabled when CBS is
disabled for the last hardware queue. Changing FQTSS mode requires NIC
reset.

FQTSS feature is supported by i210 controller only.

Signed-off-by: Andre Guedes <andre.guedes@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h    |   8 +
 drivers/net/ethernet/intel/igb/igb.h           |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 347 +++++++++++++++++++++++++
 4 files changed, 384 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 1de82f247312..83cabff1e0ab 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -353,7 +353,18 @@
 #define E1000_RXPBS_CFG_TS_EN           0x80000000
 
 #define I210_RXPBSIZE_DEFAULT		0x000000A2 /* RXPBSIZE default */
+#define I210_RXPBSIZE_MASK		0x0000003F
+#define I210_RXPBSIZE_PB_32KB		0x00000020
 #define I210_TXPBSIZE_DEFAULT		0x04000014 /* TXPBSIZE default */
+#define I210_TXPBSIZE_MASK		0xC0FFFFFF
+#define I210_TXPBSIZE_PB0_8KB		(8 << 0)
+#define I210_TXPBSIZE_PB1_8KB		(8 << 6)
+#define I210_TXPBSIZE_PB2_4KB		(4 << 12)
+#define I210_TXPBSIZE_PB3_4KB		(4 << 18)
+
+#define I210_DTXMXPKTSZ_DEFAULT		0x00000098
+
+#define I210_SR_QUEUES_NUM		2
 
 /* SerDes Control */
 #define E1000_SCTL_DISABLE_SERDES_LOOPBACK 0x0400
@@ -1051,4 +1062,16 @@
 #define E1000_VLAPQF_P_VALID(_n)	(0x1 << (3 + (_n) * 4))
 #define E1000_VLAPQF_QUEUE_MASK	0x03
 
+/* TX Qav Control fields */
+#define E1000_TQAVCTRL_XMIT_MODE	BIT(0)
+#define E1000_TQAVCTRL_DATAFETCHARB	BIT(4)
+#define E1000_TQAVCTRL_DATATRANARB	BIT(8)
+
+/* TX Qav Credit Control fields */
+#define E1000_TQAVCC_IDLESLOPE_MASK	0xFFFF
+#define E1000_TQAVCC_QUEUEMODE		BIT(31)
+
+/* Transmit Descriptor Control fields */
+#define E1000_TXDCTL_PRIORITY		BIT(27)
+
 #endif
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index 58adbf234e07..8eee081d395f 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -421,6 +421,14 @@ do { \
 
 #define E1000_I210_FLA		0x1201C
 
+#define E1000_I210_DTXMXPKTSZ	0x355C
+
+#define E1000_I210_TXDCTL(_n)	(0x0E028 + ((_n) * 0x40))
+
+#define E1000_I210_TQAVCTRL	0x3570
+#define E1000_I210_TQAVCC(_n)	(0x3004 + ((_n) * 0x40))
+#define E1000_I210_TQAVHC(_n)	(0x300C + ((_n) * 0x40))
+
 #define E1000_INVM_DATA_REG(_n)	(0x12120 + 4*(_n))
 #define E1000_INVM_SIZE		64 /* Number of INVM Data Registers */
 
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 06ffb2bc713e..92845692087a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -281,6 +281,11 @@ struct igb_ring {
 	u16 count;			/* number of desc. in the ring */
 	u8 queue_index;			/* logical index of the ring*/
 	u8 reg_idx;			/* physical index of the ring */
+	bool cbs_enable;		/* indicates if CBS is enabled */
+	s32 idleslope;			/* idleSlope in kbps */
+	s32 sendslope;			/* sendSlope in kbps */
+	s32 hicredit;			/* hiCredit in bytes */
+	s32 locredit;			/* loCredit in bytes */
 
 	/* everything past this point are written often */
 	u16 next_to_clean;
@@ -621,6 +626,7 @@ struct igb_adapter {
 #define IGB_FLAG_EEE			BIT(14)
 #define IGB_FLAG_VLAN_PROMISC		BIT(15)
 #define IGB_FLAG_RX_LEGACY		BIT(16)
+#define IGB_FLAG_FQTSS			BIT(17)
 
 /* Media Auto Sense */
 #define IGB_MAS_ENABLE_0		0X0001
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..03b8d0f4acfd 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
+#include <net/pkt_sched.h>
 #include <linux/net_tstamp.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
@@ -62,6 +63,17 @@
 #define BUILD 0
 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \
 __stringify(BUILD) "-k"
+
+enum queue_mode {
+	QUEUE_MODE_STRICT_PRIORITY,
+	QUEUE_MODE_STREAM_RESERVATION,
+};
+
+enum tx_queue_prio {
+	TX_QUEUE_PRIO_HIGH,
+	TX_QUEUE_PRIO_LOW,
+};
+
 char igb_driver_name[] = "igb";
 char igb_driver_version[] = DRV_VERSION;
 static const char igb_driver_string[] =
@@ -1271,6 +1283,12 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter,
 		ring->count = adapter->tx_ring_count;
 		ring->queue_index = txr_idx;
 
+		ring->cbs_enable = false;
+		ring->idleslope = 0;
+		ring->sendslope = 0;
+		ring->hicredit = 0;
+		ring->locredit = 0;
+
 		u64_stats_init(&ring->tx_syncp);
 		u64_stats_init(&ring->tx_syncp2);
 
@@ -1598,6 +1616,284 @@ static void igb_get_hw_control(struct igb_adapter *adapter)
 			ctrl_ext | E1000_CTRL_EXT_DRV_LOAD);
 }
 
+static void enable_fqtss(struct igb_adapter *adapter, bool enable)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+
+	if (enable)
+		adapter->flags |= IGB_FLAG_FQTSS;
+	else
+		adapter->flags &= ~IGB_FLAG_FQTSS;
+
+	if (netif_running(netdev))
+		schedule_work(&adapter->reset_task);
+}
+
+static bool is_fqtss_enabled(struct igb_adapter *adapter)
+{
+	return (adapter->flags & IGB_FLAG_FQTSS) ? true : false;
+}
+
+static void set_tx_desc_fetch_prio(struct e1000_hw *hw, int queue,
+				   enum tx_queue_prio prio)
+{
+	u32 val;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 4);
+
+	val = rd32(E1000_I210_TXDCTL(queue));
+
+	if (prio == TX_QUEUE_PRIO_HIGH)
+		val |= E1000_TXDCTL_PRIORITY;
+	else
+		val &= ~E1000_TXDCTL_PRIORITY;
+
+	wr32(E1000_I210_TXDCTL(queue), val);
+}
+
+static void set_queue_mode(struct e1000_hw *hw, int queue, enum queue_mode mode)
+{
+	u32 val;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	val = rd32(E1000_I210_TQAVCC(queue));
+
+	if (mode == QUEUE_MODE_STREAM_RESERVATION)
+		val |= E1000_TQAVCC_QUEUEMODE;
+	else
+		val &= ~E1000_TQAVCC_QUEUEMODE;
+
+	wr32(E1000_I210_TQAVCC(queue), val);
+}
+
+/**
+ *  igb_configure_cbs - Configure Credit-Based Shaper (CBS)
+ *  @adapter: pointer to adapter struct
+ *  @queue: queue number
+ *  @enable: true = enable CBS, false = disable CBS
+ *  @idleslope: idleSlope in kbps
+ *  @sendslope: sendSlope in kbps
+ *  @hicredit: hiCredit in bytes
+ *  @locredit: loCredit in bytes
+ *
+ *  Configure CBS for a given hardware queue. When disabling, idleslope,
+ *  sendslope, hicredit, locredit arguments are ignored. Returns 0 if
+ *  success. Negative otherwise.
+ **/
+static void igb_configure_cbs(struct igb_adapter *adapter, int queue,
+			      bool enable, int idleslope, int sendslope,
+			      int hicredit, int locredit)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 tqavcc;
+	u16 value;
+
+	WARN_ON(hw->mac.type != e1000_i210);
+	WARN_ON(queue < 0 || queue > 1);
+
+	if (enable) {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
+		set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
+
+		/* According to i210 datasheet section 7.2.7.7, we should set
+		 * the 'idleSlope' field from TQAVCC register following the
+		 * equation:
+		 *
+		 * For 100 Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 0.2                          (E1)
+		 *
+		 * For 1000Mbps link speed:
+		 *
+		 *     value = BW * 0x7735 * 2                            (E2)
+		 *
+		 * E1 and E2 can be merged into one equation as shown below.
+		 * Note that 'link-speed' is in Mbps.
+		 *
+		 *     value = BW * 0x7735 * 2 * link-speed
+		 *                           --------------               (E3)
+		 *                                1000
+		 *
+		 * 'BW' is the percentage bandwidth out of full link speed
+		 * which can be found with the following equation. Note that
+		 * idleSlope here is the parameter from this function which
+		 * is in kbps.
+		 *
+		 *     BW =     idleSlope
+		 *          -----------------                             (E4)
+		 *          link-speed * 1000
+		 *
+		 * That said, we can come up with a generic equation to
+		 * calculate the value we should set it TQAVCC register by
+		 * replacing 'BW' in E3 by E4. The resulting equation is:
+		 *
+		 * value =     idleSlope     * 0x7735 * 2 * link-speed
+		 *         -----------------            --------------    (E5)
+		 *         link-speed * 1000                 1000
+		 *
+		 * 'link-speed' is present in both sides of the fraction so
+		 * it is canceled out. The final equation is the following:
+		 *
+		 *     value = idleSlope * 61034
+		 *             -----------------                          (E6)
+		 *                  1000000
+		 */
+		value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000);
+
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		tqavcc |= value;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735);
+	} else {
+		set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW);
+		set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY);
+
+		/* Set idleSlope to zero. */
+		tqavcc = rd32(E1000_I210_TQAVCC(queue));
+		tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK;
+		wr32(E1000_I210_TQAVCC(queue), tqavcc);
+
+		/* Set hiCredit to zero. */
+		wr32(E1000_I210_TQAVHC(queue), 0);
+	}
+
+	/* XXX: In i210 controller the sendSlope and loCredit parameters from
+	 * CBS are not configurable by software so we don't do any 'controller
+	 * configuration' in respect to these parameters.
+	 */
+
+	netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n",
+		   (enable) ? "enabled" : "disabled", queue,
+		   idleslope, sendslope, hicredit, locredit);
+}
+
+static int igb_save_cbs_params(struct igb_adapter *adapter, int queue,
+			       bool enable, int idleslope, int sendslope,
+			       int hicredit, int locredit)
+{
+	struct igb_ring *ring;
+
+	if (queue < 0 || queue > adapter->num_tx_queues)
+		return -EINVAL;
+
+	ring = adapter->tx_ring[queue];
+
+	ring->cbs_enable = enable;
+	ring->idleslope = idleslope;
+	ring->sendslope = sendslope;
+	ring->hicredit = hicredit;
+	ring->locredit = locredit;
+
+	return 0;
+}
+
+static bool is_any_cbs_enabled(struct igb_adapter *adapter)
+{
+	struct igb_ring *ring;
+	int i;
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		ring = adapter->tx_ring[i];
+
+		if (ring->cbs_enable)
+			return true;
+	}
+
+	return false;
+}
+
+static void igb_setup_tx_mode(struct igb_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct e1000_hw *hw = &adapter->hw;
+	u32 val;
+
+	/* Only i210 controller supports changing the transmission mode. */
+	if (hw->mac.type != e1000_i210)
+		return;
+
+	if (is_fqtss_enabled(adapter)) {
+		int i, max_queue;
+
+		/* Configure TQAVCTRL register: set transmit mode to 'Qav',
+		 * set data fetch arbitration to 'round robin' and set data
+		 * transfer arbitration to 'credit shaper algorithm.
+		 */
+		val = rd32(E1000_I210_TQAVCTRL);
+		val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB;
+		val &= ~E1000_TQAVCTRL_DATAFETCHARB;
+		wr32(E1000_I210_TQAVCTRL, val);
+
+		/* Configure Tx and Rx packet buffers sizes as described in
+		 * i210 datasheet section 7.2.7.7.
+		 */
+		val = rd32(E1000_TXPBS);
+		val &= ~I210_TXPBSIZE_MASK;
+		val |= I210_TXPBSIZE_PB0_8KB | I210_TXPBSIZE_PB1_8KB |
+			I210_TXPBSIZE_PB2_4KB | I210_TXPBSIZE_PB3_4KB;
+		wr32(E1000_TXPBS, val);
+
+		val = rd32(E1000_RXPBS);
+		val &= ~I210_RXPBSIZE_MASK;
+		val |= I210_RXPBSIZE_PB_32KB;
+		wr32(E1000_RXPBS, val);
+
+		/* Section 8.12.9 states that MAX_TPKT_SIZE from DTXMXPKTSZ
+		 * register should not exceed the buffer size programmed in
+		 * TXPBS. The smallest buffer size programmed in TXPBS is 4kB
+		 * so according to the datasheet we should set MAX_TPKT_SIZE to
+		 * 4kB / 64.
+		 *
+		 * However, when we do so, no frame from queue 2 and 3 are
+		 * transmitted.  It seems the MAX_TPKT_SIZE should not be great
+		 * or _equal_ to the buffer size programmed in TXPBS. For this
+		 * reason, we set set MAX_ TPKT_SIZE to (4kB - 1) / 64.
+		 */
+		val = (4096 - 1) / 64;
+		wr32(E1000_I210_DTXMXPKTSZ, val);
+
+		/* Since FQTSS mode is enabled, apply any CBS configuration
+		 * previously set. If no previous CBS configuration has been
+		 * done, then the initial configuration is applied, which means
+		 * CBS is disabled.
+		 */
+		max_queue = (adapter->num_tx_queues < I210_SR_QUEUES_NUM) ?
+			    adapter->num_tx_queues : I210_SR_QUEUES_NUM;
+
+		for (i = 0; i < max_queue; i++) {
+			struct igb_ring *ring = adapter->tx_ring[i];
+
+			igb_configure_cbs(adapter, i, ring->cbs_enable,
+					  ring->idleslope, ring->sendslope,
+					  ring->hicredit, ring->locredit);
+		}
+	} else {
+		wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT);
+		wr32(E1000_TXPBS, I210_TXPBSIZE_DEFAULT);
+		wr32(E1000_I210_DTXMXPKTSZ, I210_DTXMXPKTSZ_DEFAULT);
+
+		val = rd32(E1000_I210_TQAVCTRL);
+		/* According to Section 8.12.21, the other flags we've set when
+		 * enabling FQTSS are not relevant when disabling FQTSS so we
+		 * don't set they here.
+		 */
+		val &= ~E1000_TQAVCTRL_XMIT_MODE;
+		wr32(E1000_I210_TQAVCTRL, val);
+	}
+
+	netdev_dbg(netdev, "FQTSS %s\n", (is_fqtss_enabled(adapter)) ?
+		   "enabled" : "disabled");
+}
+
 /**
  *  igb_configure - configure the hardware for RX and TX
  *  @adapter: private board structure
@@ -1609,6 +1905,7 @@ static void igb_configure(struct igb_adapter *adapter)
 
 	igb_get_hw_control(adapter);
 	igb_set_rx_mode(netdev);
+	igb_setup_tx_mode(adapter);
 
 	igb_restore_vlan(adapter);
 
@@ -2150,6 +2447,55 @@ igb_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static int igb_offload_cbs(struct igb_adapter *adapter,
+			   struct tc_cbs_qopt_offload *qopt)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	int err;
+
+	/* CBS offloading is only supported by i210 controller. */
+	if (hw->mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	/* CBS offloading is only supported by queue 0 and queue 1. */
+	if (qopt->queue < 0 || qopt->queue > 1)
+		return -EINVAL;
+
+	err = igb_save_cbs_params(adapter, qopt->queue, qopt->enable,
+				  qopt->idleslope, qopt->sendslope,
+				  qopt->hicredit, qopt->locredit);
+	if (err)
+		return err;
+
+	if (is_fqtss_enabled(adapter)) {
+		igb_configure_cbs(adapter, qopt->queue, qopt->enable,
+				  qopt->idleslope, qopt->sendslope,
+				  qopt->hicredit, qopt->locredit);
+
+		if (!is_any_cbs_enabled(adapter))
+			enable_fqtss(adapter, false);
+
+	} else {
+		enable_fqtss(adapter, true);
+	}
+
+	return 0;
+}
+
+static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			void *type_data)
+{
+	struct igb_adapter *adapter = netdev_priv(dev);
+
+	switch (type) {
+	case TC_SETUP_CBS:
+		return igb_offload_cbs(adapter, type_data);
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open		= igb_open,
 	.ndo_stop		= igb_close,
@@ -2175,6 +2521,7 @@ static const struct net_device_ops igb_netdev_ops = {
 	.ndo_set_features	= igb_set_features,
 	.ndo_fdb_add		= igb_ndo_fdb_add,
 	.ndo_features_check	= igb_features_check,
+	.ndo_setup_tc		= igb_setup_tc,
 };
 
 /**
-- 
2.14.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox