Netdev List
 help / color / mirror / Atom feed
* [PATCH V4 net-next 06/10] net: hns3: add debug messages to identify eth down cause
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Yonglong Liu, Peng Li, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

From: Yonglong Liu <liuyonglong@huawei.com>

Sometimes dmesg only shows that the eth interface has gone
down/up, with no indication of why it went down. So add some
debug messages to identify the cause.

Signed-off-by: Yonglong Liu <liuyonglong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c       | 18 ++++++++++++++++++
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c    | 19 +++++++++++++++++++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c    | 11 +++++++++++
 3 files changed, 48 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 4d58c53..0cf9301 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -459,6 +459,9 @@ static int hns3_nic_net_open(struct net_device *netdev)
 		h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
 
 	hns3_config_xps(priv);
+
+	netif_dbg(h, drv, netdev, "net open\n");
+
 	return 0;
 }
 
@@ -519,6 +522,8 @@ static int hns3_nic_net_stop(struct net_device *netdev)
 	if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state))
 		return 0;
 
+	netif_dbg(h, drv, netdev, "net stop\n");
+
 	if (h->ae_algo->ops->set_timer_task)
 		h->ae_algo->ops->set_timer_task(priv->ae_handle, false);
 
@@ -1550,6 +1555,8 @@ static int hns3_setup_tc(struct net_device *netdev, void *type_data)
 	h = hns3_get_handle(netdev);
 	kinfo = &h->kinfo;
 
+	netif_dbg(h, drv, netdev, "setup tc: num_tc=%u\n", tc);
+
 	return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ?
 		kinfo->dcb_ops->setup_tc(h, tc, prio_tc) : -EOPNOTSUPP;
 }
@@ -1593,6 +1600,10 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
 	struct hnae3_handle *h = hns3_get_handle(netdev);
 	int ret = -EIO;
 
+	netif_dbg(h, drv, netdev,
+		  "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=%u\n",
+		  vf, vlan, qos, vlan_proto);
+
 	if (h->ae_algo->ops->set_vf_vlan_filter)
 		ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan,
 							  qos, vlan_proto);
@@ -1611,6 +1622,9 @@ static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu)
 	if (!h->ae_algo->ops->set_mtu)
 		return -EOPNOTSUPP;
 
+	netif_dbg(h, drv, netdev,
+		  "change mtu from %u to %d\n", netdev->mtu, new_mtu);
+
 	ret = h->ae_algo->ops->set_mtu(h, new_mtu);
 	if (ret)
 		netdev_err(netdev, "failed to change MTU in hardware %d\n",
@@ -4395,6 +4409,10 @@ int hns3_set_channels(struct net_device *netdev,
 	if (kinfo->rss_size == new_tqp_num)
 		return 0;
 
+	netif_dbg(h, drv, netdev,
+		  "set channels: tqp_num=%u, rxfh=%d\n",
+		  new_tqp_num, rxfh_configured);
+
 	ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index e71c92b..fe0f82a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -311,6 +311,8 @@ static void hns3_self_test(struct net_device *ndev,
 	if (eth_test->flags != ETH_TEST_FL_OFFLINE)
 		return;
 
+	netif_dbg(h, drv, ndev, "self test start");
+
 	st_param[HNAE3_LOOP_APP][0] = HNAE3_LOOP_APP;
 	st_param[HNAE3_LOOP_APP][1] =
 			h->flags & HNAE3_SUPPORT_APP_LOOPBACK;
@@ -374,6 +376,8 @@ static void hns3_self_test(struct net_device *ndev,
 
 	if (if_running)
 		ndev->netdev_ops->ndo_open(ndev);
+
+	netif_dbg(h, drv, ndev, "self test end\n");
 }
 
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
@@ -604,6 +608,10 @@ static int hns3_set_pauseparam(struct net_device *netdev,
 {
 	struct hnae3_handle *h = hns3_get_handle(netdev);
 
+	netif_dbg(h, drv, netdev,
+		  "set pauseparam: autoneg=%u, rx:%u, tx:%u\n",
+		  param->autoneg, param->rx_pause, param->tx_pause);
+
 	if (h->ae_algo->ops->set_pauseparam)
 		return h->ae_algo->ops->set_pauseparam(h, param->autoneg,
 						       param->rx_pause,
@@ -743,6 +751,11 @@ static int hns3_set_link_ksettings(struct net_device *netdev,
 	if (cmd->base.speed == SPEED_1000 && cmd->base.duplex == DUPLEX_HALF)
 		return -EINVAL;
 
+	netif_dbg(handle, drv, netdev,
+		  "set link(%s): autoneg=%u, speed=%u, duplex=%u\n",
+		  netdev->phydev ? "phy" : "mac",
+		  cmd->base.autoneg, cmd->base.speed, cmd->base.duplex);
+
 	/* Only support ksettings_set for netdev with phy attached for now */
 	if (netdev->phydev)
 		return phy_ethtool_ksettings_set(netdev->phydev, cmd);
@@ -984,6 +997,9 @@ static int hns3_nway_reset(struct net_device *netdev)
 		return -EINVAL;
 	}
 
+	netif_dbg(handle, drv, netdev,
+		  "nway reset (using %s)\n", phy ? "phy" : "mac");
+
 	if (phy)
 		return genphy_restart_aneg(phy);
 
@@ -1308,6 +1324,9 @@ static int hns3_set_fecparam(struct net_device *netdev,
 	if (!ops->set_fec)
 		return -EOPNOTSUPP;
 	fec_mode = eth_to_loc_fec(fec->fec);
+
+	netif_dbg(handle, drv, netdev, "set fecparam: mode=%u\n", fec_mode);
+
 	return ops->set_fec(handle, fec_mode);
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
index bac4ce1..814e0f0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
@@ -201,6 +201,7 @@ static int hclge_client_setup_tc(struct hclge_dev *hdev)
 static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets)
 {
 	struct hclge_vport *vport = hclge_get_vport(h);
+	struct net_device *netdev = h->kinfo.netdev;
 	struct hclge_dev *hdev = vport->back;
 	bool map_changed = false;
 	u8 num_tc = 0;
@@ -215,6 +216,8 @@ static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets)
 		return ret;
 
 	if (map_changed) {
+		netif_dbg(h, drv, netdev, "set ets\n");
+
 		ret = hclge_notify_client(hdev, HNAE3_DOWN_CLIENT);
 		if (ret)
 			return ret;
@@ -300,6 +303,7 @@ static int hclge_ieee_getpfc(struct hnae3_handle *h, struct ieee_pfc *pfc)
 static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc)
 {
 	struct hclge_vport *vport = hclge_get_vport(h);
+	struct net_device *netdev = h->kinfo.netdev;
 	struct hclge_dev *hdev = vport->back;
 	u8 i, j, pfc_map, *prio_tc;
 
@@ -325,6 +329,10 @@ static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc)
 	hdev->tm_info.hw_pfc_map = pfc_map;
 	hdev->tm_info.pfc_en = pfc->pfc_en;
 
+	netif_dbg(h, drv, netdev,
+		  "set pfc: pfc_en=%u, pfc_map=%u, num_tc=%u\n",
+		  pfc->pfc_en, pfc_map, hdev->tm_info.num_tc);
+
 	hclge_tm_pfc_info_update(hdev);
 
 	return hclge_pause_setup_hw(hdev, false);
@@ -345,8 +353,11 @@ static u8 hclge_getdcbx(struct hnae3_handle *h)
 static u8 hclge_setdcbx(struct hnae3_handle *h, u8 mode)
 {
 	struct hclge_vport *vport = hclge_get_vport(h);
+	struct net_device *netdev = h->kinfo.netdev;
 	struct hclge_dev *hdev = vport->back;
 
+	netif_dbg(h, drv, netdev, "set dcbx: mode=%u\n", mode);
+
 	/* No support for LLD_MANAGED modes or CEE */
 	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
 	    (mode & DCB_CAP_DCBX_VER_CEE) ||
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 00/10] net: hns3: some code optimizations & bugfixes & features
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Huazhong Tan

This patch-set includes code optimizations, bugfixes and features for
the HNS3 ethernet controller driver.

[patch 1/10] checks reset status before setting channel.

[patch 2/10] adds a NULL pointer checking.

[patch 3/10] removes reset level upgrading when current reset fails.

[patch 4/10] fixes a GFP flag error when holding spin_lock.

[patch 5/10] modifies firmware version format.

[patch 6/10] adds some print information which is off by default.

[patch 7/10 - 8/10] adds two code optimizations about interrupt handler
and work task.

[patch 9/10] adds support for using order 1 pages with a 4K buffer.

[patch 10/10] modifies messages prints with dev_info() instead of
pr_info().

Change log:
V3->V4: replace netif_info with netif_dbg in [patch 6/10]
V2->V3: fixes comments from Saeed Mahameed and Joe Perches.
V1->V2: fixes comments from Saeed Mahameed and
	removes previous [patch 4/11] and [patch 11/11]
	which needs further discussion, and adds a new
	patch [10/10] suggested by Saeed Mahameed.

Guangbin Huang (1):
  net: hns3: add a check for get_reset_level

Huazhong Tan (2):
  net: hns3: remove upgrade reset level when reset fail
  net: hns3: use dev_info() instead of pr_info()

Jian Shen (1):
  net: hns3: add reset checking before set channels

Yonglong Liu (1):
  net: hns3: add debug messages to identify eth down cause

Yufeng Mo (2):
  net: hns3: change GFP flag during lock period
  net: hns3: modify firmware version display format

Yunsheng Lin (3):
  net: hns3: make hclge_service use delayed workqueue
  net: hns3: add interrupt affinity support for misc interrupt
  net: hns3: Add support for using order 1 pages with a 4K buffer

 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |   9 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    |  33 ++++-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h    |  15 ++-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  34 +++++-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c |  10 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c |  11 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 135 ++++++++++++---------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   7 +-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c   |  10 +-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |   3 +-
 10 files changed, 195 insertions(+), 72 deletions(-)

-- 
2.7.4


^ permalink raw reply

* [PATCH V4 net-next 07/10] net: hns3: make hclge_service use delayed workqueue
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Yunsheng Lin, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>

Use delayed work instead of a timer to trigger the
hclge_service task.

This simplifies the code by removing one intermediate function,
and prepares for supporting misc irq affinity.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 52 +++++++++-------------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  3 +-
 2 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 14199c4..13c9697 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2513,8 +2513,12 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
 {
 	if (!test_bit(HCLGE_STATE_DOWN, &hdev->state) &&
 	    !test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
-	    !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state))
-		(void)schedule_work(&hdev->service_task);
+	    !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state)) {
+		hdev->hw_stats.stats_timer++;
+		hdev->fd_arfs_expire_timer++;
+		mod_delayed_work(system_wq, &hdev->service_task,
+				 round_jiffies_relative(HZ));
+	}
 }
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev)
@@ -2729,25 +2733,6 @@ static int hclge_get_status(struct hnae3_handle *handle)
 	return hdev->hw.mac.link;
 }
 
-static void hclge_service_timer(struct timer_list *t)
-{
-	struct hclge_dev *hdev = from_timer(hdev, t, service_timer);
-
-	mod_timer(&hdev->service_timer, jiffies + HZ);
-	hdev->hw_stats.stats_timer++;
-	hdev->fd_arfs_expire_timer++;
-	hclge_task_schedule(hdev);
-}
-
-static void hclge_service_complete(struct hclge_dev *hdev)
-{
-	WARN_ON(!test_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state));
-
-	/* Flush memory before next watchdog */
-	smp_mb__before_atomic();
-	clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
-}
-
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
 	u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
@@ -3594,7 +3579,9 @@ static void hclge_update_vport_alive(struct hclge_dev *hdev)
 static void hclge_service_task(struct work_struct *work)
 {
 	struct hclge_dev *hdev =
-		container_of(work, struct hclge_dev, service_task);
+		container_of(work, struct hclge_dev, service_task.work);
+
+	clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
 
 	if (hdev->hw_stats.stats_timer >= HCLGE_STATS_TIMER_INTERVAL) {
 		hclge_update_stats_for_all(hdev);
@@ -3609,7 +3596,8 @@ static void hclge_service_task(struct work_struct *work)
 		hclge_rfs_filter_expire(hdev);
 		hdev->fd_arfs_expire_timer = 0;
 	}
-	hclge_service_complete(hdev);
+
+	hclge_task_schedule(hdev);
 }
 
 struct hclge_vport *hclge_get_vport(struct hnae3_handle *handle)
@@ -6148,10 +6136,13 @@ static void hclge_set_timer_task(struct hnae3_handle *handle, bool enable)
 	struct hclge_dev *hdev = vport->back;
 
 	if (enable) {
-		mod_timer(&hdev->service_timer, jiffies + HZ);
+		hclge_task_schedule(hdev);
 	} else {
-		del_timer_sync(&hdev->service_timer);
-		cancel_work_sync(&hdev->service_task);
+		/* Set the DOWN flag here to disable the service to be
+		 * scheduled again
+		 */
+		set_bit(HCLGE_STATE_DOWN, &hdev->state);
+		cancel_delayed_work_sync(&hdev->service_task);
 		clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
 	}
 }
@@ -8590,12 +8581,10 @@ static void hclge_state_uninit(struct hclge_dev *hdev)
 	set_bit(HCLGE_STATE_DOWN, &hdev->state);
 	set_bit(HCLGE_STATE_REMOVING, &hdev->state);
 
-	if (hdev->service_timer.function)
-		del_timer_sync(&hdev->service_timer);
 	if (hdev->reset_timer.function)
 		del_timer_sync(&hdev->reset_timer);
-	if (hdev->service_task.func)
-		cancel_work_sync(&hdev->service_task);
+	if (hdev->service_task.work.func)
+		cancel_delayed_work_sync(&hdev->service_task);
 	if (hdev->rst_service_task.func)
 		cancel_work_sync(&hdev->rst_service_task);
 	if (hdev->mbx_service_task.func)
@@ -8800,9 +8789,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 	hclge_dcb_ops_set(hdev);
 
-	timer_setup(&hdev->service_timer, hclge_service_timer, 0);
 	timer_setup(&hdev->reset_timer, hclge_reset_timer, 0);
-	INIT_WORK(&hdev->service_task, hclge_service_task);
+	INIT_DELAYED_WORK(&hdev->service_task, hclge_service_task);
 	INIT_WORK(&hdev->rst_service_task, hclge_reset_service_task);
 	INIT_WORK(&hdev->mbx_service_task, hclge_mailbox_service_task);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 6a12285..dde8f22 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -806,9 +806,8 @@ struct hclge_dev {
 	u16 adminq_work_limit; /* Num of admin receive queue desc to process */
 	unsigned long service_timer_period;
 	unsigned long service_timer_previous;
-	struct timer_list service_timer;
 	struct timer_list reset_timer;
-	struct work_struct service_task;
+	struct delayed_work service_task;
 	struct work_struct rst_service_task;
 	struct work_struct mbx_service_task;
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 09/10] net: hns3: Add support for using order 1 pages with a 4K buffer
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Yunsheng Lin, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>

Hardware supports 0.5K, 1K, 2K and 4K RX buffer sizes, but the
RX buffer can not be reused because the hns3_page_order
returns 0 when page size and RX buffer size are both 4096.

So this patch changes hns3_page_order to return 1 when the
RX buffer is greater than half of the page size and the page size
is less than 8192; dev_alloc_pages has already been used
to allocate the compound page for the RX buffer.

This patch also changes hnae3_* to hns3_* for page order
and RX buffer size calculation because they are used in
hns3 module.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Reviewed-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 10 +++++-----
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 15 ++++++++++++---
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 0cf9301..d2df42d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2081,7 +2081,7 @@ static void hns3_set_default_feature(struct net_device *netdev)
 static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
 			     struct hns3_desc_cb *cb)
 {
-	unsigned int order = hnae3_page_order(ring);
+	unsigned int order = hns3_page_order(ring);
 	struct page *p;
 
 	p = dev_alloc_pages(order);
@@ -2092,7 +2092,7 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
 	cb->page_offset = 0;
 	cb->reuse_flag = 0;
 	cb->buf  = page_address(p);
-	cb->length = hnae3_page_size(ring);
+	cb->length = hns3_page_size(ring);
 	cb->type = DESC_TYPE_PAGE;
 
 	return 0;
@@ -2395,7 +2395,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
 {
 	struct hns3_desc *desc = &ring->desc[ring->next_to_clean];
 	int size = le16_to_cpu(desc->rx.size);
-	u32 truesize = hnae3_buf_size(ring);
+	u32 truesize = hns3_buf_size(ring);
 
 	skb_add_rx_frag(skb, i, desc_cb->priv, desc_cb->page_offset + pull_len,
 			size - pull_len, truesize);
@@ -2410,7 +2410,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
 	/* Move offset up to the next cache line */
 	desc_cb->page_offset += truesize;
 
-	if (desc_cb->page_offset + truesize <= hnae3_page_size(ring)) {
+	if (desc_cb->page_offset + truesize <= hns3_page_size(ring)) {
 		desc_cb->reuse_flag = 1;
 		/* Bump ref count on page before it is given */
 		get_page(desc_cb->priv);
@@ -2692,7 +2692,7 @@ static int hns3_add_frag(struct hns3_enet_ring *ring, struct hns3_desc *desc,
 		}
 
 		if (ring->tail_skb) {
-			head_skb->truesize += hnae3_buf_size(ring);
+			head_skb->truesize += hns3_buf_size(ring);
 			head_skb->data_len += le16_to_cpu(desc->rx.size);
 			head_skb->len += le16_to_cpu(desc->rx.size);
 			skb = ring->tail_skb;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 848b866..1a17856 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -608,9 +608,18 @@ static inline bool hns3_nic_resetting(struct net_device *netdev)
 
 #define tx_ring_data(priv, idx) ((priv)->ring_data[idx])
 
-#define hnae3_buf_size(_ring) ((_ring)->buf_size)
-#define hnae3_page_order(_ring) (get_order(hnae3_buf_size(_ring)))
-#define hnae3_page_size(_ring) (PAGE_SIZE << (u32)hnae3_page_order(_ring))
+#define hns3_buf_size(_ring) ((_ring)->buf_size)
+
+static inline unsigned int hns3_page_order(struct hns3_enet_ring *ring)
+{
+#if (PAGE_SIZE < 8192)
+	if (ring->buf_size > (PAGE_SIZE / 2))
+		return 1;
+#endif
+	return 0;
+}
+
+#define hns3_page_size(_ring) (PAGE_SIZE << hns3_page_order(_ring))
 
 /* iterator for handling rings in ring group */
 #define hns3_for_each_ring(pos, head) \
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 03/10] net: hns3: remove upgrade reset level when reset fail
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

Currently, hclge_reset_err_handle() will assert a global reset
when the failing count is smaller than MAX_RESET_FAIL_CNT, which
will affect other running functions.

So this patch removes this upgrade and re-schedules the reset
task instead.

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 28 +++++++---------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3fde5471..3c64d70 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3305,7 +3305,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev)
 	return ret;
 }
 
-static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout)
+static bool hclge_reset_err_handle(struct hclge_dev *hdev)
 {
 #define MAX_RESET_FAIL_CNT 5
 
@@ -3322,20 +3322,11 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout)
 		return false;
 	} else if (hdev->reset_fail_cnt < MAX_RESET_FAIL_CNT) {
 		hdev->reset_fail_cnt++;
-		if (is_timeout) {
-			set_bit(hdev->reset_type, &hdev->reset_pending);
-			dev_info(&hdev->pdev->dev,
-				 "re-schedule to wait for hw reset done\n");
-			return true;
-		}
-
-		dev_info(&hdev->pdev->dev, "Upgrade reset level\n");
-		hclge_clear_reset_cause(hdev);
-		set_bit(HNAE3_GLOBAL_RESET, &hdev->default_reset_request);
-		mod_timer(&hdev->reset_timer,
-			  jiffies + HCLGE_RESET_INTERVAL);
-
-		return false;
+		set_bit(hdev->reset_type, &hdev->reset_pending);
+		dev_info(&hdev->pdev->dev,
+			 "re-schedule reset task(%d)\n",
+			 hdev->reset_fail_cnt);
+		return true;
 	}
 
 	hclge_clear_reset_cause(hdev);
@@ -3382,7 +3373,6 @@ static int hclge_reset_stack(struct hclge_dev *hdev)
 static void hclge_reset(struct hclge_dev *hdev)
 {
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
-	bool is_timeout = false;
 	int ret;
 
 	/* Initialize ae_dev reset status as well, in case enet layer wants to
@@ -3410,10 +3400,8 @@ static void hclge_reset(struct hclge_dev *hdev)
 	if (ret)
 		goto err_reset;
 
-	if (hclge_reset_wait(hdev)) {
-		is_timeout = true;
+	if (hclge_reset_wait(hdev))
 		goto err_reset;
-	}
 
 	hdev->rst_stats.hw_reset_done_cnt++;
 
@@ -3465,7 +3453,7 @@ static void hclge_reset(struct hclge_dev *hdev)
 err_reset_lock:
 	rtnl_unlock();
 err_reset:
-	if (hclge_reset_err_handle(hdev, is_timeout))
+	if (hclge_reset_err_handle(hdev))
 		hclge_reset_task_schedule(hdev);
 }
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 10/10] net: hns3: use dev_info() instead of pr_info()
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

dev_info() is more appropriate for printing messages when driver
initialization is done, so switch to dev_info().

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   | 4 +++-
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 30a7074..4138780 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -8862,7 +8862,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hclge_state_init(hdev);
 	hdev->last_reset_time = jiffies;
 
-	pr_info("%s driver initialization finished.\n", HCLGE_DRIVER_NAME);
+	dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n",
+		 HCLGE_DRIVER_NAME);
+
 	return 0;
 
 err_mdiobus_unreg:
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index a13a0e1..ae0e6a6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -2695,7 +2695,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 	}
 
 	hdev->last_reset_time = jiffies;
-	pr_info("finished initializing %s driver\n", HCLGEVF_DRIVER_NAME);
+	dev_info(&hdev->pdev->dev, "finished initializing %s driver\n",
+		 HCLGEVF_DRIVER_NAME);
 
 	return 0;
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 08/10] net: hns3: add interrupt affinity support for misc interrupt
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Yunsheng Lin, Peng Li, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>

The misc interrupt is used to schedule the reset and mailbox
subtask, and service_task delayed_work is used to do periodic
management work each second.

This patch sets the affinity of the above three subtasks using the
misc interrupt's affinity.

This patch also sets up an affinity notifier for the misc interrupt to
allow the user to change the affinity of the above three subtasks.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 53 ++++++++++++++++++++--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  4 ++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 13c9697..30a7074 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1270,6 +1270,12 @@ static int hclge_configure(struct hclge_dev *hdev)
 
 	hclge_init_kdump_kernel_config(hdev);
 
+	/* Set the init affinity based on pci func number */
+	i = cpumask_weight(cpumask_of_node(dev_to_node(&hdev->pdev->dev)));
+	i = i ? PCI_FUNC(hdev->pdev->devfn) % i : 0;
+	cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(&hdev->pdev->dev)),
+			&hdev->affinity_mask);
+
 	return ret;
 }
 
@@ -2499,14 +2505,16 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev)
 {
 	if (!test_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state) &&
 	    !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state))
-		schedule_work(&hdev->mbx_service_task);
+		queue_work_on(cpumask_first(&hdev->affinity_mask), system_wq,
+			      &hdev->mbx_service_task);
 }
 
 static void hclge_reset_task_schedule(struct hclge_dev *hdev)
 {
 	if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
 	    !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state))
-		schedule_work(&hdev->rst_service_task);
+		queue_work_on(cpumask_first(&hdev->affinity_mask), system_wq,
+			      &hdev->rst_service_task);
 }
 
 static void hclge_task_schedule(struct hclge_dev *hdev)
@@ -2516,8 +2524,9 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
 	    !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state)) {
 		hdev->hw_stats.stats_timer++;
 		hdev->fd_arfs_expire_timer++;
-		mod_delayed_work(system_wq, &hdev->service_task,
-				 round_jiffies_relative(HZ));
+		mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
+				    system_wq, &hdev->service_task,
+				    round_jiffies_relative(HZ));
 	}
 }
 
@@ -2903,6 +2912,36 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev)
 	hdev->num_msi_used += 1;
 }
 
+static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify,
+				      const cpumask_t *mask)
+{
+	struct hclge_dev *hdev = container_of(notify, struct hclge_dev,
+					      affinity_notify);
+
+	cpumask_copy(&hdev->affinity_mask, mask);
+}
+
+static void hclge_irq_affinity_release(struct kref *ref)
+{
+}
+
+static void hclge_misc_affinity_setup(struct hclge_dev *hdev)
+{
+	irq_set_affinity_hint(hdev->misc_vector.vector_irq,
+			      &hdev->affinity_mask);
+
+	hdev->affinity_notify.notify = hclge_irq_affinity_notify;
+	hdev->affinity_notify.release = hclge_irq_affinity_release;
+	irq_set_affinity_notifier(hdev->misc_vector.vector_irq,
+				  &hdev->affinity_notify);
+}
+
+static void hclge_misc_affinity_teardown(struct hclge_dev *hdev)
+{
+	irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL);
+	irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL);
+}
+
 static int hclge_misc_irq_init(struct hclge_dev *hdev)
 {
 	int ret;
@@ -8794,6 +8833,11 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	INIT_WORK(&hdev->rst_service_task, hclge_reset_service_task);
 	INIT_WORK(&hdev->mbx_service_task, hclge_mailbox_service_task);
 
+	/* Setup affinity after service timer setup because add_timer_on
+	 * is called in affinity notify.
+	 */
+	hclge_misc_affinity_setup(hdev);
+
 	hclge_clear_all_event_cause(hdev);
 	hclge_clear_resetting_state(hdev);
 
@@ -8955,6 +8999,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
 	struct hclge_dev *hdev = ae_dev->priv;
 	struct hclge_mac *mac = &hdev->hw.mac;
 
+	hclge_misc_affinity_teardown(hdev);
 	hclge_state_uninit(hdev);
 
 	if (mac->phydev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index dde8f22..688e425 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -863,6 +863,10 @@ struct hclge_dev {
 
 	DECLARE_KFIFO(mac_tnl_log, struct hclge_mac_tnl_stats,
 		      HCLGE_MAC_TNL_LOG_SIZE);
+
+	/* affinity mask and notify for misc interrupt */
+	cpumask_t affinity_mask;
+	struct irq_affinity_notify affinity_notify;
 };
 
 /* VPort level vlan tag configuration for TX direction */
-- 
2.7.4


^ permalink raw reply related

* [PATCH V4 net-next 04/10] net: hns3: change GFP flag during lock period
From: Huazhong Tan @ 2019-07-29  2:53 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	Yufeng Mo, lipeng 00277521, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>

From: Yufeng Mo <moyufeng@huawei.com>

When allocating memory, the GFP_KERNEL cannot be used during the
spin_lock period. This is because it may cause scheduling when holding
spin_lock. This patch changes GFP flag to GFP_ATOMIC in this case.

Fixes: dd74f815dd41 ("net: hns3: Add support for rule add/delete for flow director")
Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: lipeng 00277521 <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3c64d70..14199c4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5796,7 +5796,7 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id,
 			return -ENOSPC;
 		}
 
-		rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+		rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
 		if (!rule) {
 			spin_unlock_bh(&hdev->fd_rule_lock);
 
-- 
2.7.4


^ permalink raw reply related

* [PATCH net] net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
From: Su Yanjun @ 2019-07-29  2:49 UTC (permalink / raw)
  To: davem, kuznet, yoshfuji; +Cc: netdev, linux-kernel, suyj.fnst

When we send MPLS packets and the interface only has a manually
configured global IPv6 address, the two hosts can't communicate.
I find that ndisc_send_ns only tries to get a link-local address.
In my case, the execution path is as below.
ip6_output
 ->ip6_finish_output
  ->lwtunnel_xmit
   ->mpls_xmit
    ->neigh_resolve_output
     ->neigh_probe
      ->ndisc_solicit
       ->ndisc_send_ns

In RFC4861, 7.2.2 says
"If the source address of the packet prompting the solicitation is the
same as one of the addresses assigned to the outgoing interface, that
address SHOULD be placed in the IP Source Address of the outgoing
solicitation.  Otherwise, any one of the addresses assigned to the
interface should be used."

In this patch we try to get a global address if getting a link-local
address fails.

Signed-off-by: Su Yanjun <suyj.fnst@cn.fujitsu.com>
---
 include/net/addrconf.h |  4 ++++
 net/ipv6/addrconf.c    | 34 ++++++++++++++++++++++++++++++++++
 net/ipv6/ndisc.c       |  8 ++++++--
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index becdad5..006db8e 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -107,6 +107,10 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
 		      u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    u32 banned_flags);
+int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr,
+		      u32 banned_flags);
+int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr,
+		    u32 banned_flags);
 bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 			  bool match_wildcard);
 bool inet_rcv_saddr_any(const struct sock *sk);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 521e320..4c0a43f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1870,6 +1870,40 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 	return err;
 }
 
+int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr,
+		    u32 banned_flags)
+{
+	struct inet6_ifaddr *ifp;
+	int err = -EADDRNOTAVAIL;
+
+	list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+		if (ifp->scope == 0 &&
+		    !(ifp->flags & banned_flags)) {
+			*addr = ifp->addr;
+			err = 0;
+			break;
+		}
+	}
+	return err;
+}
+
+int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr,
+		  u32 banned_flags)
+{
+	struct inet6_dev *idev;
+	int err = -EADDRNOTAVAIL;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (idev) {
+		read_lock_bh(&idev->lock);
+		err = __ipv6_get_addr(idev, addr, banned_flags);
+		read_unlock_bh(&idev->lock);
+	}
+	rcu_read_unlock();
+	return err;
+}
+
 static int ipv6_count_addresses(const struct inet6_dev *idev)
 {
 	const struct inet6_ifaddr *ifp;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 083cc1c..18ac2fb 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -606,8 +606,12 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 
 	if (!saddr) {
 		if (ipv6_get_lladdr(dev, &addr_buf,
-				   (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
-			return;
+				   (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) {
+			/* try global address */
+			if (ipv6_get_addr(dev, &addr_buf,
+					  (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
+				return;
+		}
 		saddr = &addr_buf;
 	}
 
-- 
2.7.4




^ permalink raw reply related

* Re: [PATCH net-next v4 2/3] flow_offload: Support get default block from tc immediately
From: wenxu @ 2019-07-29  2:43 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: pablo, fw, netfilter-devel, netdev
In-Reply-To: <20190728131653.6af72a87@cakuba.netronome.com>


On 7/29/2019 4:16 AM, Jakub Kicinski wrote:
> .
> The TC default block is there because the indirect registration may
> happen _after_ the block is installed and populated.  It's the device
> driver that usually does the indirect registration, the tunnel device
> and its rules may already be set when device driver is loaded or
> reloaded.
Yes, I know this scenario.
> I don't know the nft code, but it seems unlikely it wouldn't have the
> same problem/need..

nft doesn't have the same problem.  The offload rule can only be attached to an offload base chain.

The offload base chain is created after the device driver is loaded (the device exists).

>

^ permalink raw reply

* [PATCH] net: sched: Fix a possible null-pointer dereference in dequeue_func()
From: Jia-Ju Bai @ 2019-07-29  2:21 UTC (permalink / raw)
  To: jhs, xiyou.wangcong, jiri, davem; +Cc: netdev, linux-kernel, Jia-Ju Bai

In dequeue_func(), there is an if statement on line 74 to check whether
skb is NULL:
    if (skb)

When skb is NULL, it is used on line 77:
    prefetch(&skb->end);

Thus, a possible null-pointer dereference may occur.

To fix this bug, skb->end is used only when skb is not NULL.

This bug is found by a static analysis tool STCheck written by us.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
---
 net/sched/sch_codel.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 25ef172c23df..30169b3adbbb 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -71,10 +71,10 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
 	struct Qdisc *sch = ctx;
 	struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
 
-	if (skb)
+	if (skb) {
 		sch->qstats.backlog -= qdisc_pkt_len(skb);
-
-	prefetch(&skb->end); /* we'll need skb_shinfo() */
+		prefetch(&skb->end); /* we'll need skb_shinfo() */
+	}
 	return skb;
 }
 
-- 
2.17.0


^ permalink raw reply related

* Re: [PATCH 4.4 stable net] net: tcp: Fix use-after-free in tcp_write_xmit
From: maowenan @ 2019-07-29  1:26 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, davem, netdev, linux-kernel
In-Reply-To: <20190727114001.GA6685@kroah.com>



On 2019/7/27 19:40, Greg KH wrote:
> On Sat, Jul 27, 2019 at 07:22:30PM +0800, maowenan wrote:
>>
>>
>> On 2019/7/27 18:44, maowenan wrote:
>>>
>>>
>>> On 2019/7/24 20:13, maowenan wrote:
>>>>
>>>>
>>>> On 2019/7/24 19:05, Greg KH wrote:
>>>>> On Wed, Jul 24, 2019 at 05:17:15PM +0800, Mao Wenan wrote:
>>>>>> There is one report about tcp_write_xmit use-after-free with version 4.4.136:
>>>>>>
>>>>>> BUG: KASAN: use-after-free in tcp_skb_pcount include/net/tcp.h:796 [inline]
>>>>>> BUG: KASAN: use-after-free in tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
>>>>>> BUG: KASAN: use-after-free in tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
>>>>>> Read of size 2 at addr ffff8801d6fc87b0 by task syz-executor408/4195
>>>>>>
>>>>>> CPU: 0 PID: 4195 Comm: syz-executor408 Not tainted 4.4.136-gfb7e319 #59
>>>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>>>>>>  0000000000000000 7d8f38ecc03be946 ffff8801d73b7710 ffffffff81e0edad
>>>>>>  ffffea00075bf200 ffff8801d6fc87b0 0000000000000000 ffff8801d6fc87b0
>>>>>>  dffffc0000000000 ffff8801d73b7748 ffffffff815159b6 ffff8801d6fc87b0
>>>>>> Call Trace:
>>>>>>  [<ffffffff81e0edad>] __dump_stack lib/dump_stack.c:15 [inline]
>>>>>>  [<ffffffff81e0edad>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
>>>>>>  [<ffffffff815159b6>] print_address_description+0x6c/0x216 mm/kasan/report.c:252
>>>>>>  [<ffffffff81515cd5>] kasan_report_error mm/kasan/report.c:351 [inline]
>>>>>>  [<ffffffff81515cd5>] kasan_report.cold.7+0x175/0x2f7 mm/kasan/report.c:408
>>>>>>  [<ffffffff814f9784>] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:427
>>>>>>  [<ffffffff83286582>] tcp_skb_pcount include/net/tcp.h:796 [inline]
>>>>>>  [<ffffffff83286582>] tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
>>>>>>  [<ffffffff83286582>] tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
>>>>>>  [<ffffffff83287a40>] __tcp_push_pending_frames+0xa0/0x290 net/ipv4/tcp_output.c:2307
>>>>>>  [<ffffffff8328e966>] tcp_send_fin+0x176/0xab0 net/ipv4/tcp_output.c:2883
>>>>>>  [<ffffffff8324c0d0>] tcp_close+0xca0/0xf70 net/ipv4/tcp.c:2112
>>>>>>  [<ffffffff832f8d0f>] inet_release+0xff/0x1d0 net/ipv4/af_inet.c:435
>>>>>>  [<ffffffff82f1a156>] sock_release+0x96/0x1c0 net/socket.c:586
>>>>>>  [<ffffffff82f1a296>] sock_close+0x16/0x20 net/socket.c:1037
>>>>>>  [<ffffffff81522da5>] __fput+0x235/0x6f0 fs/file_table.c:208
>>>>>>  [<ffffffff815232e5>] ____fput+0x15/0x20 fs/file_table.c:244
>>>>>>  [<ffffffff8118bd7f>] task_work_run+0x10f/0x190 kernel/task_work.c:115
>>>>>>  [<ffffffff81135285>] exit_task_work include/linux/task_work.h:21 [inline]
>>>>>>  [<ffffffff81135285>] do_exit+0x9e5/0x26b0 kernel/exit.c:759
>>>>>>  [<ffffffff8113b1d1>] do_group_exit+0x111/0x330 kernel/exit.c:889
>>>>>>  [<ffffffff8115e5cc>] get_signal+0x4ec/0x14b0 kernel/signal.c:2321
>>>>>>  [<ffffffff8100e02b>] do_signal+0x8b/0x1d30 arch/x86/kernel/signal.c:712
>>>>>>  [<ffffffff8100360a>] exit_to_usermode_loop+0x11a/0x160 arch/x86/entry/common.c:248
>>>>>>  [<ffffffff81006535>] prepare_exit_to_usermode arch/x86/entry/common.c:283 [inline]
>>>>>>  [<ffffffff81006535>] syscall_return_slowpath+0x1b5/0x1f0 arch/x86/entry/common.c:348
>>>>>>  [<ffffffff838c29b5>] int_ret_from_sys_call+0x25/0xa3
>>>>>>
>>>>>> Allocated by task 4194:
>>>>>>  [<ffffffff810341d6>] save_stack_trace+0x26/0x50 arch/x86/kernel/stacktrace.c:63
>>>>>>  [<ffffffff814f8873>] save_stack+0x43/0xd0 mm/kasan/kasan.c:512
>>>>>>  [<ffffffff814f8b57>] set_track mm/kasan/kasan.c:524 [inline]
>>>>>>  [<ffffffff814f8b57>] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:616
>>>>>>  [<ffffffff814f9122>] kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:554
>>>>>>  [<ffffffff814f4c1e>] slab_post_alloc_hook mm/slub.c:1349 [inline]
>>>>>>  [<ffffffff814f4c1e>] slab_alloc_node mm/slub.c:2615 [inline]
>>>>>>  [<ffffffff814f4c1e>] slab_alloc mm/slub.c:2623 [inline]
>>>>>>  [<ffffffff814f4c1e>] kmem_cache_alloc+0xbe/0x2a0 mm/slub.c:2628
>>>>>>  [<ffffffff82f380a6>] kmem_cache_alloc_node include/linux/slab.h:350 [inline]
>>>>>>  [<ffffffff82f380a6>] __alloc_skb+0xe6/0x600 net/core/skbuff.c:218
>>>>>>  [<ffffffff832466c3>] alloc_skb_fclone include/linux/skbuff.h:856 [inline]
>>>>>>  [<ffffffff832466c3>] sk_stream_alloc_skb+0xa3/0x5d0 net/ipv4/tcp.c:833
>>>>>>  [<ffffffff83249164>] tcp_sendmsg+0xd34/0x2b00 net/ipv4/tcp.c:1178
>>>>>>  [<ffffffff83300ef3>] inet_sendmsg+0x203/0x4d0 net/ipv4/af_inet.c:755
>>>>>>  [<ffffffff82f1e1fc>] sock_sendmsg_nosec net/socket.c:625 [inline]
>>>>>>  [<ffffffff82f1e1fc>] sock_sendmsg+0xcc/0x110 net/socket.c:635
>>>>>>  [<ffffffff82f1eedc>] SYSC_sendto+0x21c/0x370 net/socket.c:1665
>>>>>>  [<ffffffff82f21560>] SyS_sendto+0x40/0x50 net/socket.c:1633
>>>>>>  [<ffffffff838c2825>] entry_SYSCALL_64_fastpath+0x22/0x9e
>>>>>>
>>>>>> Freed by task 4194:
>>>>>>  [<ffffffff810341d6>] save_stack_trace+0x26/0x50 arch/x86/kernel/stacktrace.c:63
>>>>>>  [<ffffffff814f8873>] save_stack+0x43/0xd0 mm/kasan/kasan.c:512
>>>>>>  [<ffffffff814f91a2>] set_track mm/kasan/kasan.c:524 [inline]
>>>>>>  [<ffffffff814f91a2>] kasan_slab_free+0x72/0xc0 mm/kasan/kasan.c:589
>>>>>>  [<ffffffff814f632e>] slab_free_hook mm/slub.c:1383 [inline]
>>>>>>  [<ffffffff814f632e>] slab_free_freelist_hook mm/slub.c:1405 [inline]
>>>>>>  [<ffffffff814f632e>] slab_free mm/slub.c:2859 [inline]
>>>>>>  [<ffffffff814f632e>] kmem_cache_free+0xbe/0x340 mm/slub.c:2881
>>>>>>  [<ffffffff82f3527f>] kfree_skbmem+0xcf/0x100 net/core/skbuff.c:635
>>>>>>  [<ffffffff82f372fd>] __kfree_skb+0x1d/0x20 net/core/skbuff.c:676
>>>>>>  [<ffffffff83288834>] sk_wmem_free_skb include/net/sock.h:1447 [inline]
>>>>>>  [<ffffffff83288834>] tcp_write_queue_purge include/net/tcp.h:1460 [inline]
>>>>>>  [<ffffffff83288834>] tcp_connect_init net/ipv4/tcp_output.c:3122 [inline]
>>>>>>  [<ffffffff83288834>] tcp_connect+0xb24/0x30c0 net/ipv4/tcp_output.c:3261
>>>>>>  [<ffffffff8329b991>] tcp_v4_connect+0xf31/0x1890 net/ipv4/tcp_ipv4.c:246
>>>>>>  [<ffffffff832f9ca9>] __inet_stream_connect+0x2a9/0xc30 net/ipv4/af_inet.c:615
>>>>>>  [<ffffffff832fa685>] inet_stream_connect+0x55/0xa0 net/ipv4/af_inet.c:676
>>>>>>  [<ffffffff82f1eb78>] SYSC_connect+0x1b8/0x300 net/socket.c:1557
>>>>>>  [<ffffffff82f214b4>] SyS_connect+0x24/0x30 net/socket.c:1538
>>>>>>  [<ffffffff838c2825>] entry_SYSCALL_64_fastpath+0x22/0x9e
>>>>>>
>>>>>> Syzkaller reproducer():
>>>>>> r0 = socket$packet(0x11, 0x3, 0x300)
>>>>>> r1 = socket$inet_tcp(0x2, 0x1, 0x0)
>>>>>> bind$inet(r1, &(0x7f0000000300)={0x2, 0x4e21, @multicast1}, 0x10)
>>>>>> connect$inet(r1, &(0x7f0000000140)={0x2, 0x1000004e21, @loopback}, 0x10)
>>>>>> recvmmsg(r1, &(0x7f0000001e40)=[{{0x0, 0x0, &(0x7f0000000100)=[{&(0x7f00000005c0)=""/88, 0x58}], 0x1}}], 0x1, 0x40000000, 0x0)
>>>>>> sendto$inet(r1, &(0x7f0000000000)="e2f7ad5b661c761edf", 0x9, 0x8080, 0x0, 0x0)
>>>>>> r2 = fcntl$dupfd(r1, 0x0, r0)
>>>>>> connect$unix(r2, &(0x7f00000001c0)=@file={0x0, './file0\x00'}, 0x6e)
>>>>>>
>>>>>> C repro link: https://syzkaller.appspot.com/text?tag=ReproC&x=14db474f800000
>>>>>>
>>>>>> This is because when tcp_connect_init call tcp_write_queue_purge, it will
>>>>>> kfree all the skb in the write_queue, but the sk->sk_send_head forget to set NULL,
>>>>>> then tcp_write_xmit try to send skb, which has freed in tcp_write_queue_purge, UAF happens.
>>>>>>
>>>>>> Signed-off-by: Mao Wenan <maowenan@huawei.com>
>>>>>> ---
>>>>>>  include/net/tcp.h | 1 +
>>>>>>  1 file changed, 1 insertion(+)
>>>>>>
>>>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>>>> index bf8a0dae977a..8f8aace28cf8 100644
>>>>>> --- a/include/net/tcp.h
>>>>>> +++ b/include/net/tcp.h
>>>>>> @@ -1457,6 +1457,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
>>>>>>  
>>>>>>  	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
>>>>>>  		sk_wmem_free_skb(sk, skb);
>>>>>> +	sk->sk_send_head = NULL;
>>>>>>  	sk_mem_reclaim(sk);
>>>>>>  	tcp_clear_all_retrans_hints(tcp_sk(sk));
>>>>>>  	inet_csk(sk)->icsk_backoff = 0;
>>>>>
>>>>> Does this corrispond with a specific commit that is already in Linus's
>>>>> tree?  If not, why, did we change/mess something up when doing
>>>>> backports, or is the code just that different?
>>>>>
>>>>> Also, is this needed in 4.9.y, 4.14.y, 4.19.y, and/or 5.2.y?  Why just
>>>>> 4.4.y?
>>>
>>> Greg,
>>>
>>> I have tested latest stable tree
>>> 4.4.186 oops
>>> 4.9.151 oops
>>> 4.14.106 NO oops
>>>
>>> This patch can simple fix them.
>>
>> I have checked 4.14.y it has already existed the same fix as mine, this is the reason why 4.14.106 is NO oops.
>> commit dbbf2d1e4077bab0c65ece2765d3fc69cf7d610f
>> Author: Soheil Hassas Yeganeh <soheil@google.com>
>> Date:   Thu Mar 15 12:09:13 2018 -0400
>>
>>     tcp: reset sk_send_head in tcp_write_queue_purge
>>
> 
> So if this patch is backported to 4.4.y and 4.9.y all will be fine?
> 
Yes, all will be fine, but the scenarios are different, so an additional description should be added when backporting.

4.4 and 4.9 don't have the commit abb4a8b870b5 ("tcp: purge write queue upon RST") which is referred to in dbbf2d1e4077:
tcp_connect_init calls tcp_write_queue_purge, which does not reset sk_send_head, leading to the UAF.

4.14 has the commit abb4a8b870b5 ("tcp: purge write queue upon RST"):
tcp_reset calls tcp_write_queue_purge(sk), which does not reset sk_send_head, leading to the UAF.

> thanks,
> 
> greg k-h
> 
> .
> 


^ permalink raw reply

* Re: [PATCH V3 net-next 06/10] net: hns3: add debug messages to identify eth down cause
From: tanhuazhong @ 2019-07-29  1:21 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
	liuyonglong, lipeng321
In-Reply-To: <20190727.190333.249806415176311786.davem@davemloft.net>



On 2019/7/28 10:03, David Miller wrote:
> From: Huazhong Tan <tanhuazhong@huawei.com>
> Date: Sat, 27 Jul 2019 13:46:08 +0800
> 
>> From: Yonglong Liu <liuyonglong@huawei.com>
>>
>> Some times just see the eth interface have been down/up via
>> dmesg, but can not know why the eth down. So adds some debug
>> messages to identify the cause for this.
>>
>> Signed-off-by: Yonglong Liu <liuyonglong@huawei.com>
>> Signed-off-by: Peng Li <lipeng321@huawei.com>
>> Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
>> ---
>>   drivers/net/ethernet/hisilicon/hns3/hns3_enet.c       | 18 ++++++++++++++++++
>>   drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c    | 19 +++++++++++++++++++
>>   .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c    | 11 +++++++++++
>>   3 files changed, 48 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> index 4d58c53..973c57b 100644
>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> @@ -459,6 +459,9 @@ static int hns3_nic_net_open(struct net_device *netdev)
>>   		h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
>>   
>>   	hns3_config_xps(priv);
>> +
>> +	netif_info(h, drv, netdev, "net open\n");
> 
> These will pollute everyone's kernel logs for normal operations.
> 
> This is not reasonable at all, sorry.
> 
> Furthermore, even if it was appropriate, "netif_info()" is not "debug".
> 

Will replace it with netif_dbg.
thanks.

> 
> .
> 


^ permalink raw reply

* Re: memory leak in bio_copy_user_iov
From: Bob Liu @ 2019-07-29  1:03 UTC (permalink / raw)
  To: syzbot, agk, axboe, coreteam, davem, dm-devel, hdanton, kaber,
	kadlec, linux-block, linux-kernel, linux-raid, netdev,
	netfilter-devel, pablo, shli, snitzer, syzkaller-bugs
In-Reply-To: <000000000000aec4ec058ec71a3d@google.com>

On 7/29/19 8:38 AM, syzbot wrote:
> syzbot has bisected this bug to:
> 
> commit 664820265d70a759dceca87b6eb200cd2b93cda8
> Author: Mike Snitzer <snitzer@redhat.com>
> Date:   Thu Feb 18 20:44:39 2016 +0000
> 
>     dm: do not return target from dm_get_live_table_for_ioctl()
> 

This (and the previous bisection) does not look related to the reported leak.


A possible reason may be that KASAN can't recognize the failure path of bio_alloc_bioset()
where mempool_free() is called but not kmalloc(p).

But it's not a real bug, because we have the condition if (nr_iovecs > inline_vecs).

The fix below may avoid the syzbot bug report.

diff --git a/block/bio.c b/block/bio.c
index 4db1008..04a7879 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -513,8 +513,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
                        bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
                }
 
-               if (unlikely(!bvl))
-                       goto err_free;
+               if (unlikely(!bvl)) {
+                       mempool_free(p, &bs->bio_pool);
+                       return NULL;
+               }
 
                bio->bi_flags |= idx << BVEC_POOL_OFFSET;
        } else if (nr_iovecs) {
@@ -525,10 +527,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
        bio->bi_max_vecs = nr_iovecs;
        bio->bi_io_vec = bvl;
        return bio;
-
-err_free:
-       mempool_free(p, &bs->bio_pool);
-       return NULL;
 }
 EXPORT_SYMBOL(bio_alloc_bioset);


Regards, -Bob

> bisection log:  https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_bisect.txt-3Fx-3D13f4eb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=MNjYy_nft_s0ErmK2n89p7y2yhKmeWlxWch0z7_dsm8&e=start commit:   0011572c Merge branch 'for-5.2-fixes' of git://git.kernel...
> git tree:       upstream
> final crash:    https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_report.txt-3Fx-3D100ceb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=iviPOQNPEIjkuqBma_VWEQ9l1Ve3eOiTwads42E4ZPo&e=console output: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_log.txt-3Fx-3D17f4eb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=MBwnFwjEcSQfYymfv8EYt_EawVdK9vD-OAqDMutO-YY&e=kernel config:  https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_.config-3Fx-3Dcb38d33cd06d8d48&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=SqmDUenNFS-961PGgiMW5mIUv0nIBrf0oBrzUxYZ8Do&e=dashboard link:
> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D03e5c8ebd22cc6c3a8cb&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=jKd2ocY5X94uyB8Or-OC3yffbOgClPQPlXqFnLzvvSY&e=syz repro:      https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.syz-3Fx-3D13244221a00000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=K-C39Kcd1oEOtJKwnby-s1EyEZZA10mr9bcXZ0J9Kh0&e=C reproducer:   https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.c-3Fx-3D117b2432a00000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=7J685CwQN6_FA2KgO3Vgy1msF0zi5O0OqZj_bgvEqBE&e=
> Reported-by: syzbot+03e5c8ebd22cc6c3a8cb@syzkaller.appspotmail.com
> Fixes: 664820265d70 ("dm: do not return target from dm_get_live_table_for_ioctl()")
> 
> For information about bisection process see: https://urldefense.proofpoint.com/v2/url?u=https-3A__goo.gl_tpsmEJ-23bisection&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=rs52TkiEQCrV4V8YQa2wT55HD8E-0AX9pn7MNIDcje4&e=


^ permalink raw reply related

* [PATCH] net: ehea: Mark expected switch fall-through
From: Gustavo A. R. Silva @ 2019-07-29  0:30 UTC (permalink / raw)
  To: Douglas Miller, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
	Kees Cook

Mark switch cases where we are expecting to fall through.

This patch fixes the following warning:

drivers/net/ethernet/ibm/ehea/ehea_main.c: In function 'ehea_mem_notifier':
include/linux/printk.h:311:2: warning: this statement may fall through [-Wimplicit-fallthrough=]
  printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/ibm/ehea/ehea_main.c:3253:3: note: in expansion of macro 'pr_info'
   pr_info("memory offlining canceled");
   ^~~~~~~
drivers/net/ethernet/ibm/ehea/ehea_main.c:3256:2: note: here
  case MEM_ONLINE:
  ^~~~

Notice that, in this particular case, the code comment is
modified in accordance with what GCC is expecting to find.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 4138a8480347..cca71ba7a74a 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -3251,7 +3251,7 @@ static int ehea_mem_notifier(struct notifier_block *nb,
 	switch (action) {
 	case MEM_CANCEL_OFFLINE:
 		pr_info("memory offlining canceled");
-		/* Fall through: re-add canceled memory block */
+		/* Fall through - re-add canceled memory block */
 
 	case MEM_ONLINE:
 		pr_info("memory is going online");
-- 
2.22.0


^ permalink raw reply related

* [PATCH] net: spider_net: Mark expected switch fall-through
From: Gustavo A. R. Silva @ 2019-07-29  0:32 UTC (permalink / raw)
  To: Ishizaki Kou, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
	Kees Cook

Mark switch cases where we are expecting to fall through.

This patch fixes the following warning:

drivers/net/ethernet/toshiba/spider_net.c: In function 'spider_net_release_tx_chain':
drivers/net/ethernet/toshiba/spider_net.c:783:7: warning: this statement may fall through [-Wimplicit-fallthrough=]
    if (!brutal) {
       ^
drivers/net/ethernet/toshiba/spider_net.c:792:3: note: here
   case SPIDER_NET_DESCR_RESPONSE_ERROR:
   ^~~~

Notice that, in this particular case, the code comment is
modified in accordance with what GCC is expecting to find.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/toshiba/spider_net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index 5b196ebfed49..0f346761a2b2 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -788,6 +788,7 @@ spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
 			/* fallthrough, if we release the descriptors
 			 * brutally (then we don't care about
 			 * SPIDER_NET_DESCR_CARDOWNED) */
+			/* Fall through */
 
 		case SPIDER_NET_DESCR_RESPONSE_ERROR:
 		case SPIDER_NET_DESCR_PROTECTION_ERROR:
-- 
2.22.0


^ permalink raw reply related

* Re: memory leak in bio_copy_user_iov
From: syzbot @ 2019-07-29  0:38 UTC (permalink / raw)
  To: agk, axboe, coreteam, davem, dm-devel, hdanton, kaber, kadlec,
	linux-block, linux-kernel, linux-raid, netdev, netfilter-devel,
	pablo, shli, snitzer, syzkaller-bugs
In-Reply-To: <000000000000c75fb7058ba0c0e4@google.com>

syzbot has bisected this bug to:

commit 664820265d70a759dceca87b6eb200cd2b93cda8
Author: Mike Snitzer <snitzer@redhat.com>
Date:   Thu Feb 18 20:44:39 2016 +0000

     dm: do not return target from dm_get_live_table_for_ioctl()

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=13f4eb64600000
start commit:   0011572c Merge branch 'for-5.2-fixes' of git://git.kernel...
git tree:       upstream
final crash:    https://syzkaller.appspot.com/x/report.txt?x=100ceb64600000
console output: https://syzkaller.appspot.com/x/log.txt?x=17f4eb64600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=cb38d33cd06d8d48
dashboard link: https://syzkaller.appspot.com/bug?extid=03e5c8ebd22cc6c3a8cb
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=13244221a00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=117b2432a00000

Reported-by: syzbot+03e5c8ebd22cc6c3a8cb@syzkaller.appspotmail.com
Fixes: 664820265d70 ("dm: do not return target from dm_get_live_table_for_ioctl()")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Andrew Lunn @ 2019-07-28 23:07 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Horatiu Vultur, Nikolay Aleksandrov, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <20190728191558.zuopgfqza2iz5d5b@lx-anielsen.microsemi.net>

> Trying to get back to the original problem:
> 
> We have a network which implements the ODVA/DLR ring protocol. This protocol
> sends out a beacon frame as often as every 3 us (as far as I recall, default I
> believe is 400 us) to this MAC address: 01:21:6C:00:00:01.
> 
> Try to take a quick look at slide 10 in [1].
> 
> If we assume that the SwitchDev driver is implemented such that all multicast
> traffic goes to the CPU, then we should really have a way to install a HW
> offload path in the silicon, such that these packets do not go to the CPU (as
> they are known not to be useful, and a frame every 3 us is a significant load
> on small DMA connections and CPU resources).
> 
> If we assume that the SwitchDev driver is implemented such that only "needed"
> multicast packets go to the CPU, then we need a way to get these packets in
> case we want to implement the DLR protocol.
> 
> I'm sure that both models can work, and I do not think that this is the main
> issue here.
> 
> Our initial attempt was to allow install static L2-MAC entries and append
> multiple ports to such an entry in the MAC table. This was rejected, for several
> good reasons it seems. But I'm not sure it was clear what we wanted to achieve,
> and why we find it to be important. Hopefully this is clear with a real world
> use-case.
> 
> Any hints or ideas on what would be a better way to solve this problem will be
> much appreciated.

I always try to think about how this would work if I had a bunch of
discrete network interfaces, not a switch. What APIs are involved in
configuring such a system? How does the Linux network stack perform
software DLR? How is the reception and blocking of the multicast group
performed?

Once you understand how it works in the software implementation, it
should then be more obvious which switchdev hooks should be used to
accelerate this using hardware.

	   Andrew

^ permalink raw reply

* Re: [PATCH v3] net: dsa: qca8k: enable port flow control
From: Andrew Lunn @ 2019-07-28 22:31 UTC (permalink / raw)
  To: xiaofeis
  Cc: davem, vkoul, netdev, linux-arm-msm, bjorn.andersson,
	vivien.didelot, f.fainelli, niklas.cassel, xiazha
In-Reply-To: <1564275470-52666-1-git-send-email-xiaofeis@codeaurora.org>

On Sun, Jul 28, 2019 at 08:57:50AM +0800, xiaofeis wrote:
> Set phy device advertising to enable MAC flow control.

Hi Xiaofei.

This is half of the needed change for MAC flow control.

phy_support_asym_pause(phy) is used by the MAC to tell the PHY layer
that the MAC supports flow control. The PHY will then advertise
this. When auto-negotiation is completed, the PHY layer will call
qca8k_adjust_link() with the results. It could be that the peer does
not support flow control, or only supports symmetric flow control.  So
in that function, you need to program the MAC with the results of the
auto-neg. This is currently missing. You need to look at phydev->pause
and phydev->asym_pause to decide how to configure the MAC.

       Andrew

^ permalink raw reply

* Re: [PATCH net] net: hns: fix LED configuration for marvell phy
From: Andrew Lunn @ 2019-07-28 22:14 UTC (permalink / raw)
  To: Pavel Machek
  Cc: liuyonglong, David Miller, netdev, linux-kernel, linuxarm,
	salil.mehta, yisen.zhuang, shiju.jose
In-Reply-To: <20190728132412.GC8718@xo-6d-61-c0.localdomain>

On Sun, Jul 28, 2019 at 03:24:12PM +0200, Pavel Machek wrote:
> On Thu 2019-07-25 06:28:29, Andrew Lunn wrote:
> > On Thu, Jul 25, 2019 at 11:00:08AM +0800, liuyonglong wrote:
> > > > Revert "net: hns: fix LED configuration for marvell phy"
> > > > This reverts commit f4e5f775db5a4631300dccd0de5eafb50a77c131.
> > > >
> > > > Andrew Lunn says this should be handled another way.
> > > >
> > > > Signed-off-by: David S. Miller <davem@davemloft.net>
> > > 
> > > 
> > > Hi Andrew:
> > > 
> > > I see this patch has been reverted, can you tell me the better way to do this?
> > > Thanks very much!
> > 
> > Please take a look at the work Matthias Kaehlcke is doing. It has not
> > got too far yet, but when it is complete, it should define a generic
> > way to configure PHY LEDs.
> 
> I don't remember PHY LED discussion from LED mailing list. Would you have a pointer?

Hi Pavel 

So far, it has not made it onto the generic LED list. And the current
implementation is unlikely to go as far as using the generic LED
code. But I would like the binding to be compatible with it, so that
some time in the future it could be migrated to being part of the
generic LED code. But that would also require extensions to the
generic LED code to support hardware offload of triggers.

	Andrew

^ permalink raw reply

* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Josh Hunt @ 2019-07-28 21:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller
In-Reply-To: <5a054ca5-4077-5e91-69d5-f1add8dc8bfa@akamai.com>

On 7/28/19 2:14 PM, Josh Hunt wrote:
> On 7/28/19 6:54 AM, Eric Dumazet wrote:
>> On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
>>>
>>> On 7/27/19 12:05 AM, Eric Dumazet wrote:
>>>> On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
>>>>>
>>>>> The current implementation of TCP MTU probing can considerably
>>>>> underestimate the MTU on lossy connections allowing the MSS to get 
>>>>> down to
>>>>> 48. We have found that in almost all of these cases on our networks 
>>>>> these
>>>>> paths can handle much larger MTUs meaning the connections are being
>>>>> artificially limited. Even though TCP MTU probing can raise the MSS 
>>>>> back up
>>>>> we have seen this not to be the case causing connections to be 
>>>>> "stuck" with
>>>>> an MSS of 48 when heavy loss is present.
>>>>>
>>>>> Prior to pushing out this change we could not keep TCP MTU probing 
>>>>> enabled
>>>>> b/c of the above reasons. Now with a reasonable floor set we've had it
>>>>> enabled for the past 6 months.
>>>>
>>>> And what reasonable value have you used ???
>>>
>>> Reasonable for some may not be reasonable for others hence the new
>>> sysctl :) We're currently running with a fairly high value based off of
>>> the v6 min MTU minus headers and options, etc. We went conservative with
>>> our setting initially as it seemed a reasonable first step when
>>> re-enabling TCP MTU probing since with no configurable floor we saw a #
>>> of cases where connections were using severely reduced mss b/c of loss
>>> and not b/c of actual path restriction. I plan to reevaluate the setting
>>> at some point, but since the probing method is still the same it means
>>> the same clients who got stuck with mss of 48 before will land at
>>> whatever floor we set. Looking forward we are interested in trying to
>>> improve TCP MTU probing so it does not penalize clients like this.
>>>
>>> A suggestion for a more reasonable floor default would be 512, which is
>>> the same as the min_pmtu. Given both mechanisms are trying to achieve
>>> the same goal it seems like they should have a similar min/floor.
>>>
>>>>
>>>>>
>>>>> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
>>>>> administrators the ability to control the floor of MSS probing.
>>>>>
>>>>> Signed-off-by: Josh Hunt <johunt@akamai.com>
>>>>> ---
>>>>>    Documentation/networking/ip-sysctl.txt | 6 ++++++
>>>>>    include/net/netns/ipv4.h               | 1 +
>>>>>    net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
>>>>>    net/ipv4/tcp_ipv4.c                    | 1 +
>>>>>    net/ipv4/tcp_timer.c                   | 2 +-
>>>>>    5 files changed, 18 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/Documentation/networking/ip-sysctl.txt 
>>>>> b/Documentation/networking/ip-sysctl.txt
>>>>> index df33674799b5..49e95f438ed7 100644
>>>>> --- a/Documentation/networking/ip-sysctl.txt
>>>>> +++ b/Documentation/networking/ip-sysctl.txt
>>>>> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
>>>>>           Path MTU discovery (MTU probing).  If MTU probing is 
>>>>> enabled,
>>>>>           this is the initial MSS used by the connection.
>>>>>
>>>>> +tcp_mtu_probe_floor - INTEGER
>>>>> +       If MTU probing is enabled this caps the minimum MSS used 
>>>>> for search_low
>>>>> +       for the connection.
>>>>> +
>>>>> +       Default : 48
>>>>> +
>>>>>    tcp_min_snd_mss - INTEGER
>>>>>           TCP SYN and SYNACK messages usually advertise an ADVMSS 
>>>>> option,
>>>>>           as described in RFC 1122 and RFC 6691.
>>>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>>>>> index bc24a8ec1ce5..c0c0791b1912 100644
>>>>> --- a/include/net/netns/ipv4.h
>>>>> +++ b/include/net/netns/ipv4.h
>>>>> @@ -116,6 +116,7 @@ struct netns_ipv4 {
>>>>>           int sysctl_tcp_l3mdev_accept;
>>>>>    #endif
>>>>>           int sysctl_tcp_mtu_probing;
>>>>> +       int sysctl_tcp_mtu_probe_floor;
>>>>>           int sysctl_tcp_base_mss;
>>>>>           int sysctl_tcp_min_snd_mss;
>>>>>           int sysctl_tcp_probe_threshold;
>>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>>> index 0b980e841927..59ded25acd04 100644
>>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>>> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
>>>>>                   .extra2         = &tcp_min_snd_mss_max,
>>>>>           },
>>>>>           {
>>>>> +               .procname       = "tcp_mtu_probe_floor",
>>>>> +               .data           = 
>>>>> &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
>>>>> +               .maxlen         = sizeof(int),
>>>>> +               .mode           = 0644,
>>>>> +               .proc_handler   = proc_dointvec_minmax,
>>>>> +               .extra1         = &tcp_min_snd_mss_min,
>>>>> +               .extra2         = &tcp_min_snd_mss_max,
>>>>> +       },
>>>>> +       {
>>>>>                   .procname       = "tcp_probe_threshold",
>>>>>                   .data           = 
>>>>> &init_net.ipv4.sysctl_tcp_probe_threshold,
>>>>>                   .maxlen         = sizeof(int),
>>>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>>>> index d57641cb3477..e0a372676329 100644
>>>>> --- a/net/ipv4/tcp_ipv4.c
>>>>> +++ b/net/ipv4/tcp_ipv4.c
>>>>> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net 
>>>>> *net)
>>>>>           net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>>>>>           net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>>>>>           net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>>>>> +       net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
>>>>>
>>>>>           net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>>>>>           net->ipv4.sysctl_tcp_keepalive_probes = 
>>>>> TCP_KEEPALIVE_PROBES;
>>>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>>>> index c801cd37cc2a..dbd9d2d0ee63 100644
>>>>> --- a/net/ipv4/tcp_timer.c
>>>>> +++ b/net/ipv4/tcp_timer.c
>>>>> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct 
>>>>> inet_connection_sock *icsk, struct sock *sk)
>>>>>           } else {
>>>>>                   mss = tcp_mtu_to_mss(sk, 
>>>>> icsk->icsk_mtup.search_low) >> 1;
>>>>>                   mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>>>> -               mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>>>>> +               mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
>>>>>                   mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>>>>>                   icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, 
>>>>> mss);
>>>>>           }
>>>>
>>>>
>>>> Existing sysctl should be enough ?
>>>
>>> I don't think so. Changing tcp_min_snd_mss could impact clients that
>>> really want/need a small mss. When you added the new sysctl I tried to
>>> analyze the mss values we're seeing to understand what we could possibly
>>> raise it to. While not a huge amount, we see more clients than I
>>> expected announcing mss values in the 180-512 range. Given that I would
>>> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
>>> above.
>>
>> If these clients need mss values in 180-512 ranges, how MTU probing
>> would work for them,
>> if you set a floor to 512 ?
> 
> First, we already seem to be fine with ignoring these paths with ICMP 
> based PMTU discovery b/c of our min_pmtu default of 512 and that is 
> configurable. Second by adding this sysctl we're giving administrators 
> the choice to decide if they'd like to attempt to support these very 
> very small # of paths which may be below 512 (MSS <= 512 does not mean 
> MTU <= 512) or cover themselves by being able to raise the floor to not 
> penalize clients who may be on very lossy networks.
> 
>>
>> Are we sure the intent of tcp_base_mss was not to act as a floor ?
> 
> My understanding is that tcp_base_mss is meant to be the initial value 
> of search_low (as per Docs). Then in RFC 4821 [1] Sections 7.2, shows 
> search_low should be configurable, and 7.7 we see that in response to 
> successive black hole detection search_low should be halved. So I don't 
> think it was meant to be a floor, but just the initial search_low param. 
> Also note that in that same section they suggest a floor of 68 for v4, 
> but a floor of 1280 for v6 which we do not adhere to currently.
> 

Clarification. We == Akamai in regards to setting tcp_base_mss to 
1400-overheads. Upstream default is 1024.

> We actually set tcp_base_mss to something close to the value suggested 
> towards the end of section 7.2 of the RFC of 1400 bytes minus IP and 
> Transport overheads and options. This way we have more realistic 
> searching based on the majority of clients that we see. The kernel winds 
> up using initial search_low/tcp_base_mss as initial eff_pmtu, so we see 
> something like:
> 
> 21:03:41.314612 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:41.670307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:42.030308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:42.534307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:43.198308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:44.478307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:47.742310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq 
> 1:1349, ack 1, win 229, length 1348: HTTP
> 21:03:56.702310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq 
> 1:675, ack 1, win 229, length 674: HTTP
> 
> For further evidence this is a real problem here's a sample of mss 
> values I found when originally investigating this problem for us:
> 
> I dug up some #s I found when originally investigating this problem:
> 
> # ss -emoitn | grep mss | sed "s/.*mss:\([0-9]*\).*/\1/" | sort -u | 
> sort -g | head -5
> 
> 36:11
> 64:7
> 72:1
> 128:13
> 144:4
> 
>  From what I could tell these connections were on paths much larger than 
> the mss they were being forced to use. I determined this by looking at 
> the mss used for other objects fetched from the same IPs.
> 
> Josh
> 
> [1] - https://www.ietf.org/rfc/rfc4821.txt
> 
>>
>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> index 
>> c801cd37cc2a9c11f2dd4b9681137755e501a538..6d15895e9dcfb2eff51bbcf3608c7e68c1970a9e 
>>
>> 100644
>> --- a/net/ipv4/tcp_timer.c
>> +++ b/net/ipv4/tcp_timer.c
>> @@ -153,7 +153,7 @@ static void tcp_mtu_probing(struct
>> inet_connection_sock *icsk, struct sock *sk)
>>                  icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
>>          } else {
>>                  mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) 
>> >> 1;
>> -               mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>> +               mss = max(net->ipv4.sysctl_tcp_base_mss, mss);
>>                  mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>>                  mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>>                  icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>>
>>
>>
>>>
>>>>
>>>> tcp_min_snd_mss  documentation could be slightly updated.
>>>>
>>>> And maybe its default value could be raised a bit.
>>>>
>>>
>>> Thanks
>>> Josh

^ permalink raw reply

* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Josh Hunt @ 2019-07-28 21:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David Miller
In-Reply-To: <CANn89iLqeixzZkop8tqOQka_9ZiKurZL9Vj05bgU99M5Pbenqw@mail.gmail.com>

On 7/28/19 6:54 AM, Eric Dumazet wrote:
> On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
>>
>> On 7/27/19 12:05 AM, Eric Dumazet wrote:
>>> On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
>>>>
>>>> The current implementation of TCP MTU probing can considerably
>>>> underestimate the MTU on lossy connections allowing the MSS to get down to
>>>> 48. We have found that in almost all of these cases on our networks these
>>>> paths can handle much larger MTUs meaning the connections are being
>>>> artificially limited. Even though TCP MTU probing can raise the MSS back up
>>>> we have seen this not to be the case causing connections to be "stuck" with
>>>> an MSS of 48 when heavy loss is present.
>>>>
>>>> Prior to pushing out this change we could not keep TCP MTU probing enabled
>>>> b/c of the above reasons. Now with a reasonable floor set we've had it
>>>> enabled for the past 6 months.
>>>
>>> And what reasonable value have you used ???
>>
>> Reasonable for some may not be reasonable for others hence the new
>> sysctl :) We're currently running with a fairly high value based off of
>> the v6 min MTU minus headers and options, etc. We went conservative with
>> our setting initially as it seemed a reasonable first step when
>> re-enabling TCP MTU probing since with no configurable floor we saw a #
>> of cases where connections were using severely reduced mss b/c of loss
>> and not b/c of actual path restriction. I plan to reevaluate the setting
>> at some point, but since the probing method is still the same it means
>> the same clients who got stuck with mss of 48 before will land at
>> whatever floor we set. Looking forward we are interested in trying to
>> improve TCP MTU probing so it does not penalize clients like this.
>>
>> A suggestion for a more reasonable floor default would be 512, which is
>> the same as the min_pmtu. Given both mechanisms are trying to achieve
>> the same goal it seems like they should have a similar min/floor.
>>
>>>
>>>>
>>>> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
>>>> administrators the ability to control the floor of MSS probing.
>>>>
>>>> Signed-off-by: Josh Hunt <johunt@akamai.com>
>>>> ---
>>>>    Documentation/networking/ip-sysctl.txt | 6 ++++++
>>>>    include/net/netns/ipv4.h               | 1 +
>>>>    net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
>>>>    net/ipv4/tcp_ipv4.c                    | 1 +
>>>>    net/ipv4/tcp_timer.c                   | 2 +-
>>>>    5 files changed, 18 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
>>>> index df33674799b5..49e95f438ed7 100644
>>>> --- a/Documentation/networking/ip-sysctl.txt
>>>> +++ b/Documentation/networking/ip-sysctl.txt
>>>> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
>>>>           Path MTU discovery (MTU probing).  If MTU probing is enabled,
>>>>           this is the initial MSS used by the connection.
>>>>
>>>> +tcp_mtu_probe_floor - INTEGER
>>>> +       If MTU probing is enabled this caps the minimum MSS used for search_low
>>>> +       for the connection.
>>>> +
>>>> +       Default : 48
>>>> +
>>>>    tcp_min_snd_mss - INTEGER
>>>>           TCP SYN and SYNACK messages usually advertise an ADVMSS option,
>>>>           as described in RFC 1122 and RFC 6691.
>>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>>>> index bc24a8ec1ce5..c0c0791b1912 100644
>>>> --- a/include/net/netns/ipv4.h
>>>> +++ b/include/net/netns/ipv4.h
>>>> @@ -116,6 +116,7 @@ struct netns_ipv4 {
>>>>           int sysctl_tcp_l3mdev_accept;
>>>>    #endif
>>>>           int sysctl_tcp_mtu_probing;
>>>> +       int sysctl_tcp_mtu_probe_floor;
>>>>           int sysctl_tcp_base_mss;
>>>>           int sysctl_tcp_min_snd_mss;
>>>>           int sysctl_tcp_probe_threshold;
>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>> index 0b980e841927..59ded25acd04 100644
>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
>>>>                   .extra2         = &tcp_min_snd_mss_max,
>>>>           },
>>>>           {
>>>> +               .procname       = "tcp_mtu_probe_floor",
>>>> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
>>>> +               .maxlen         = sizeof(int),
>>>> +               .mode           = 0644,
>>>> +               .proc_handler   = proc_dointvec_minmax,
>>>> +               .extra1         = &tcp_min_snd_mss_min,
>>>> +               .extra2         = &tcp_min_snd_mss_max,
>>>> +       },
>>>> +       {
>>>>                   .procname       = "tcp_probe_threshold",
>>>>                   .data           = &init_net.ipv4.sysctl_tcp_probe_threshold,
>>>>                   .maxlen         = sizeof(int),
>>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>>> index d57641cb3477..e0a372676329 100644
>>>> --- a/net/ipv4/tcp_ipv4.c
>>>> +++ b/net/ipv4/tcp_ipv4.c
>>>> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net *net)
>>>>           net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>>>>           net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>>>>           net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>>>> +       net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
>>>>
>>>>           net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>>>>           net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>>> index c801cd37cc2a..dbd9d2d0ee63 100644
>>>> --- a/net/ipv4/tcp_timer.c
>>>> +++ b/net/ipv4/tcp_timer.c
>>>> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
>>>>           } else {
>>>>                   mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
>>>>                   mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>>> -               mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>>>> +               mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
>>>>                   mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>>>>                   icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>>>>           }
>>>
>>>
>>> Existing sysctl should be enough ?
>>
>> I don't think so. Changing tcp_min_snd_mss could impact clients that
>> really want/need a small mss. When you added the new sysctl I tried to
>> analyze the mss values we're seeing to understand what we could possibly
>> raise it to. While not a huge amount, we see more clients than I
>> expected announcing mss values in the 180-512 range. Given that I would
>> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
>> above.
> 
> If these clients need mss values in 180-512 ranges, how MTU probing
> would work for them,
> if you set a floor to 512 ?

First, we already seem to be fine with ignoring these paths with ICMP 
based PMTU discovery b/c of our min_pmtu default of 512 and that is 
configurable. Second by adding this sysctl we're giving administrators 
the choice to decide if they'd like to attempt to support these very 
very small # of paths which may be below 512 (MSS <= 512 does not mean 
MTU <= 512) or cover themselves by being able to raise the floor to not 
penalize clients who may be on very lossy networks.

> 
> Are we sure the intent of tcp_base_mss was not to act as a floor ?

My understanding is that tcp_base_mss is meant to be the initial value
of search_low (as per the docs). RFC 4821 [1] Section 7.2 shows that
search_low should be configurable, and in Section 7.7 we see that in
response to successive black hole detection search_low should be halved.
So I don't think it was meant to be a floor, but just the initial
search_low param.
Also note that in that same section they suggest a floor of 68 for v4, 
but a floor of 1280 for v6 which we do not adhere to currently.

We actually set tcp_base_mss to something close to the value suggested 
towards the end of section 7.2 of the RFC of 1400 bytes minus IP and 
Transport overheads and options. This way we have more realistic 
searching based on the majority of clients that we see. The kernel winds 
up using initial search_low/tcp_base_mss as initial eff_pmtu, so we see 
something like:

21:03:41.314612 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:41.670307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:42.030308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:42.534307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:43.198308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:44.478307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq 
1:1461, ack 1, win 229, length 1460: HTTP
21:03:47.742310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq 
1:1349, ack 1, win 229, length 1348: HTTP
21:03:56.702310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq 
1:675, ack 1, win 229, length 674: HTTP

For further evidence this is a real problem here's a sample of mss 
values I found when originally investigating this problem for us:

I dug up some #s I found when originally investigating this problem:

# ss -emoitn | grep mss | sed "s/.*mss:\([0-9]*\).*/\1/" | sort -u | 
sort -g | head -5

36:11
64:7
72:1
128:13
144:4

 From what I could tell these connections were on paths much larger than 
the mss they were being forced to use. I determined this by looking at 
the mss used for other objects fetched from the same IPs.

Josh

[1] - https://www.ietf.org/rfc/rfc4821.txt

> 
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index c801cd37cc2a9c11f2dd4b9681137755e501a538..6d15895e9dcfb2eff51bbcf3608c7e68c1970a9e
> 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -153,7 +153,7 @@ static void tcp_mtu_probing(struct
> inet_connection_sock *icsk, struct sock *sk)
>                  icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
>          } else {
>                  mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
> -               mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
> +               mss = max(net->ipv4.sysctl_tcp_base_mss, mss);
>                  mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>                  mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>                  icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
> 
> 
> 
>>
>>>
>>> tcp_min_snd_mss  documentation could be slightly updated.
>>>
>>> And maybe its default value could be raised a bit.
>>>
>>
>> Thanks
>> Josh

^ permalink raw reply

* Re: Slowness forming TIPC cluster with explicit node addresses
From: Chris Packham @ 2019-07-28 21:04 UTC (permalink / raw)
  To: jon.maloy@ericsson.com, tipc-discussion@lists.sourceforge.net
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <CH2PR15MB35754D65AB240A74AE488E719AC00@CH2PR15MB3575.namprd15.prod.outlook.com>

On Fri, 2019-07-26 at 13:31 +0000, Jon Maloy wrote:
> 
> > 
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org>
> > On
> > Behalf Of Chris Packham
> > Sent: 25-Jul-19 19:37
> > To: tipc-discussion@lists.sourceforge.net
> > Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org
> > Subject: Slowness forming TIPC cluster with explicit node addresses
> > 
> > Hi,
> > 
> > I'm having problems forming a TIPC cluster between 2 nodes.
> > 
> > These are the basic steps I'm going through on each node.
> > 
> > modprobe tipc
> > ip link set eth2 up
> > tipc node set addr 1.1.5 # or 1.1.6
> > tipc bearer enable media eth dev eth0
> eth2, I assume...
> 

Yes, sorry. I keep switching between Ethernet ports for testing,
so I hand-edited the email.

> > 
> > 
> > Then to confirm if the cluster is formed I use tipc link list
> > 
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > ...
> > 
> > Looking at tcpdump the two nodes are sending packets
> > 
> > 22:30:05.782320 TIPC v2.0 1.1.5 > 0.0.0, headerlength 60 bytes,
> > MessageSize
> > 76 bytes, Neighbor Detection Protocol internal, messageType Link
> > request
> > 22:30:05.863555 TIPC v2.0 1.1.6 > 0.0.0, headerlength 60 bytes,
> > MessageSize
> > 76 bytes, Neighbor Detection Protocol internal, messageType Link
> > request
> > 
> > Eventually (after a few minutes) the link does come up
> > 
> > [root@node-6 ~]# tipc link list
> > broadcast-link: up
> > 1001006:eth2-1001005:eth2: up
> > 
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > 1001005:eth2-1001006:eth2: up
> > 
> > When I remove the "tipc node set addr" things seem to kick into
> > life straight
> > away
> > 
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > 0050b61bd2aa:eth2-0050b61e6dfa:eth2: up
> > 
> > So there appears to be some difference in behaviour between having
> > an
> > explicit node address and using the default. Unfortunately our
> > application
> > relies on setting the node addresses.
> I do this many times a day, without any problems. If there would be
> any time difference, I would expect the 'auto configurable' version
> to be slower, because it involves a DAD step.
> Are you sure you don't have any other nodes running in your system?
> 
> ///jon
> 

Nope the two nodes are connected back to back. Does the number of
Ethernet interfaces make a difference? As you can see I've got 3 on
each node. One is completely disconnected, one is for booting over TFTP
(only used by U-Boot) and the other is the USB Ethernet I'm using for
testing.

> 
> > 
> > 
> > [root@node-5 ~]# uname -a
> > Linux linuxbox 5.2.0-at1+ #8 SMP Thu Jul 25 23:22:41 UTC 2019 ppc
> > GNU/Linux
> > 
> > Any thoughts on the problem?

^ permalink raw reply

* Re: [PATCH net-next v4 2/3] flow_offload: Support get default block from tc immediately
From: Jakub Kicinski @ 2019-07-28 20:16 UTC (permalink / raw)
  To: wenxu; +Cc: pablo, fw, netfilter-devel, netdev
In-Reply-To: <1564296769-32294-3-git-send-email-wenxu@ucloud.cn>

On Sun, 28 Jul 2019 14:52:48 +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> When the indr device registers, it can get the default block
> from tc immediately if the block exists.
> 
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
> v3: no change
> v4: get tc default block without callback

Please stop reposting new versions of the patches while discussion is
ongoing, it makes it harder to follow.

The TC default block is there because the indirect registration may
happen _after_ the block is installed and populated.  It's the device
driver that usually does the indirect registration, the tunnel device
and its rules may already be set when device driver is loaded or
reloaded.

I don't know the nft code, but it seems unlikely it wouldn't have the
same problem/need..

Please explain.

^ permalink raw reply

* Linux Plumbers BPF micro-conference CFP (reminder)
From: Alexei Starovoitov @ 2019-07-28 19:24 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: bpf, Network Development
In-Reply-To: <CAADnVQJ0ATngyqo8xjXdDsyFuuov3KRtbHMR1LcV8VnEDUK8Fg@mail.gmail.com>

Hey Folks,

August 2nd deadline to submit a proposal for BPF uconf
is quickly approaching.
If you're attending LPC in Lisbon and interested
in awesome BPF uconf you need to submit a proposal.

Some of you already submitted them to lpc-bpf@vger
per instructions that were sent back on July 12.
Some proposals were sent via website.
We'd like all proposals to be seen on the website.
Could you please re-enter your proposal there?
Please go to:
https://www.linuxplumbersconf.org/event/4/abstracts/
click on 'submit new proposal'
and copy-paste what you've already sent to lpc-bpf@vger.
Much appreciated, and sorry for the confusion.

There is still room for a few new proposals,
but space is getting very limited.
Please don't delay.

Thanks!

> ---------- Forwarded message ---------
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Fri, Jul 12, 2019 at 7:26 AM
> Subject: Linux Plumbers BPF micro-conference CFP (reminder)
> To: <bpf@vger.kernel.org>
> Cc: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
> <xdp-newbies@vger.kernel.org>, <iovisor-dev@lists.iovisor.org>,
> <lpc-bpf@vger.kernel.org>, <alexei.starovoitov@gmail.com>
>
>
> This is a call for proposals for the BPF micro-conference at this
> year's Linux Plumbers Conference (LPC) 2019, which will be held in
> Lisbon, Portugal on September 9-11.
>
> The goal of the BPF micro-conference is to bring BPF developers
> together to discuss topics around Linux kernel work related to
> the BPF core infrastructure as well as its many subsystems under
> tracing, networking, security, and BPF user space tooling (LLVM,
> libbpf, bpftool and many others).
>
> The format of the micro-conference has a main focus on discussion,
> therefore each accepted topic will provide a short 1-2 slide
> introduction with subsequent discussion for the rest of the given
> time slot.
>
> The BPF micro-conference is a community-driven event and open to
> all LPC attendees, there is no additional registration required.
>
> Please submit your discussion proposals to the LPC BPF micro-conference
> organizers at:
>
>         lpc-bpf@vger.kernel.org
>
> Proposals must be submitted by August 2nd, and submitters will
> be notified of acceptance by August 9 at the latest. (Please note that
> proposals must not be sent as html mail as they are otherwise dropped
> by vger.)
>
> The format of the submission and many other details can be found at:
>
>         http://vger.kernel.org/lpc-bpf.html
>
> Looking forward to seeing you all in Lisbon in September!

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox