Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/2] hns: Clean up string operations
From: Ben Hutchings @ 2018-03-13 23:41 UTC (permalink / raw)
  To: Yisen Zhuang, Salil Mehta; +Cc: netdev
In-Reply-To: <20180313233948.GG8564@decadent.org.uk>

[-- Attachment #1: Type: text/plain, Size: 20972 bytes --]

The driver-internal string operations are only ever used for
statistics, so remove the stringset parameters and rename them
accordingly.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 drivers/net/ethernet/hisilicon/hns/hnae.h          | 13 ++++----
 drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c  | 37 +++++++++++-----------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c | 16 +++-------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c  |  9 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h  | 10 +++---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 28 ++++++----------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h |  6 ++--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c  | 11 +++----
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h  |  4 +--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c  | 20 ++++--------
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h  |  4 +--
 .../net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c    | 24 +++++---------
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c   |  9 +++---
 13 files changed, 78 insertions(+), 113 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 3e62692af011..480fa2b49be7 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -457,10 +457,10 @@ enum hnae_media_type {
  *   update Old network device statistics
  * get_ethtool_stats()
  *   get ethtool network device statistics
- * get_strings()
- *   get a set of strings that describe the requested objects
- * get_sset_count()
- *   get number of strings that @get_strings will write
+ * get_stats_strings()
+ *   get a set of strings that describe the statistics
+ * get_stats_count()
+ *   get number of strings that @get_stats_strings will write
  * update_led_status()
  *   update the led status
  * set_led_id()
@@ -522,9 +522,8 @@ struct hnae_ae_ops {
 	void (*update_stats)(struct hnae_handle *handle,
 			     struct net_device_stats *net_stats);
 	void (*get_stats)(struct hnae_handle *handle, u64 *data);
-	void (*get_strings)(struct hnae_handle *handle,
-			    u32 stringset, u8 *data);
-	int (*get_sset_count)(struct hnae_handle *handle, int stringset);
+	void (*get_stats_strings)(struct hnae_handle *handle, u8 *data);
+	int (*get_stats_count)(struct hnae_handle *handle);
 	void (*update_led_status)(struct hnae_handle *handle);
 	int (*set_led_id)(struct hnae_handle *handle,
 			  enum hnae_led_state status);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
index bd68379d2bea..638476aa6311 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c
@@ -679,21 +679,20 @@ void hns_ae_get_stats(struct hnae_handle *handle, u64 *data)
 
 	for (idx = 0; idx < handle->q_num; idx++) {
 		hns_rcb_get_stats(handle->qs[idx], p);
-		p += hns_rcb_get_ring_sset_count((int)ETH_SS_STATS);
+		p += hns_rcb_get_ring_stats_count();
 	}
 
 	hns_ppe_get_stats(ppe_cb, p);
-	p += hns_ppe_get_sset_count((int)ETH_SS_STATS);
+	p += hns_ppe_get_stats_count();
 
 	hns_mac_get_stats(mac_cb, p);
-	p += hns_mac_get_sset_count(mac_cb, (int)ETH_SS_STATS);
+	p += hns_mac_get_stats_count(mac_cb);
 
 	if (mac_cb->mac_type == HNAE_PORT_SERVICE)
 		hns_dsaf_get_stats(vf_cb->dsaf_dev, p, vf_cb->port_index);
 }
 
-void hns_ae_get_strings(struct hnae_handle *handle,
-			u32 stringset, u8 *data)
+void hns_ae_get_stats_strings(struct hnae_handle *handle, u8 *data)
 {
 	int port;
 	int idx;
@@ -711,21 +710,21 @@ void hns_ae_get_strings(struct hnae_handle *handle,
 	ppe_cb = hns_get_ppe_cb(handle);
 
 	for (idx = 0; idx < handle->q_num; idx++) {
-		hns_rcb_get_strings(stringset, p, idx);
-		p += ETH_GSTRING_LEN * hns_rcb_get_ring_sset_count(stringset);
+		hns_rcb_get_stats_strings(p, idx);
+		p += ETH_GSTRING_LEN * hns_rcb_get_ring_stats_count();
 	}
 
-	hns_ppe_get_strings(ppe_cb, stringset, p);
-	p += ETH_GSTRING_LEN * hns_ppe_get_sset_count(stringset);
+	hns_ppe_get_stats_strings(ppe_cb, p);
+	p += ETH_GSTRING_LEN * hns_ppe_get_stats_count();
 
-	hns_mac_get_strings(mac_cb, stringset, p);
-	p += ETH_GSTRING_LEN * hns_mac_get_sset_count(mac_cb, stringset);
+	hns_mac_get_stats_strings(mac_cb, p);
+	p += ETH_GSTRING_LEN * hns_mac_get_stats_count(mac_cb);
 
 	if (mac_cb->mac_type == HNAE_PORT_SERVICE)
-		hns_dsaf_get_strings(stringset, p, port, dsaf_dev);
+		hns_dsaf_get_stats_strings(p, port, dsaf_dev);
 }
 
-int hns_ae_get_sset_count(struct hnae_handle *handle, int stringset)
+int hns_ae_get_stats_count(struct hnae_handle *handle)
 {
 	u32 sset_count = 0;
 	struct hns_mac_cb *mac_cb;
@@ -735,12 +734,12 @@ int hns_ae_get_sset_count(struct hnae_handle *handle, int stringset)
 
 	mac_cb = hns_get_mac_cb(handle);
 
-	sset_count += hns_rcb_get_ring_sset_count(stringset) * handle->q_num;
-	sset_count += hns_ppe_get_sset_count(stringset);
-	sset_count += hns_mac_get_sset_count(mac_cb, stringset);
+	sset_count += hns_rcb_get_ring_stats_count() * handle->q_num;
+	sset_count += hns_ppe_get_stats_count();
+	sset_count += hns_mac_get_stats_count(mac_cb);
 
 	if (mac_cb->mac_type == HNAE_PORT_SERVICE)
-		sset_count += hns_dsaf_get_sset_count(dsaf_dev, stringset);
+		sset_count += hns_dsaf_get_stats_count(dsaf_dev);
 
 	return sset_count;
 }
@@ -923,8 +922,8 @@ static struct hnae_ae_ops hns_dsaf_ops = {
 	.update_stats = hns_ae_update_stats,
 	.set_tso_stats = hns_ae_set_tso_stats,
 	.get_stats = hns_ae_get_stats,
-	.get_strings = hns_ae_get_strings,
-	.get_sset_count = hns_ae_get_sset_count,
+	.get_stats_strings = hns_ae_get_stats_strings,
+	.get_stats_count = hns_ae_get_stats_count,
 	.update_led_status = hns_ae_update_led_status,
 	.set_led_id = hns_ae_cpld_set_led_id,
 	.get_regs = hns_ae_get_regs,
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
index 86944bc3b273..7525b03fd3d8 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
@@ -649,14 +649,11 @@ static void hns_gmac_get_stats(void *mac_drv, u64 *data)
 	}
 }
 
-static void hns_gmac_get_strings(u32 stringset, u8 *data)
+static void hns_gmac_get_stats_strings(u8 *data)
 {
 	char *buff = (char *)data;
 	u32 i;
 
-	if (stringset != ETH_SS_STATS)
-		return;
-
 	for (i = 0; i < ARRAY_SIZE(g_gmac_stats_string); i++) {
 		snprintf(buff, ETH_GSTRING_LEN, "%s",
 			 g_gmac_stats_string[i].desc);
@@ -664,12 +661,9 @@ static void hns_gmac_get_strings(u32 stringset, u8 *data)
 	}
 }
 
-static int hns_gmac_get_sset_count(int stringset)
+static int hns_gmac_get_stats_count(void)
 {
-	if (stringset == ETH_SS_STATS || stringset == ETH_SS_PRIV_FLAGS)
-		return ARRAY_SIZE(g_gmac_stats_string);
-
-	return 0;
+	return ARRAY_SIZE(g_gmac_stats_string);
 }
 
 static int hns_gmac_get_regs_count(void)
@@ -713,8 +707,8 @@ void *hns_gmac_config(struct hns_mac_cb *mac_cb, struct mac_params *mac_param)
 	mac_drv->get_regs = hns_gmac_get_regs;
 	mac_drv->get_regs_count = hns_gmac_get_regs_count;
 	mac_drv->get_ethtool_stats = hns_gmac_get_stats;
-	mac_drv->get_sset_count = hns_gmac_get_sset_count;
-	mac_drv->get_strings = hns_gmac_get_strings;
+	mac_drv->get_stats_count = hns_gmac_get_stats_count;
+	mac_drv->get_stats_strings = hns_gmac_get_stats_strings;
 	mac_drv->update_stats = hns_gmac_update_stats;
 	mac_drv->set_promiscuous = hns_gmac_set_promisc;
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
index cac86e9ae0dd..094476b27c00 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
@@ -1109,19 +1109,18 @@ void hns_mac_get_stats(struct hns_mac_cb *mac_cb, u64 *data)
 	mac_ctrl_drv->get_ethtool_stats(mac_ctrl_drv, data);
 }
 
-void hns_mac_get_strings(struct hns_mac_cb *mac_cb,
-			 int stringset, u8 *data)
+void hns_mac_get_stats_strings(struct hns_mac_cb *mac_cb, u8 *data)
 {
 	struct mac_driver *mac_ctrl_drv = hns_mac_get_drv(mac_cb);
 
-	mac_ctrl_drv->get_strings(stringset, data);
+	mac_ctrl_drv->get_stats_strings(data);
 }
 
-int hns_mac_get_sset_count(struct hns_mac_cb *mac_cb, int stringset)
+int hns_mac_get_stats_count(struct hns_mac_cb *mac_cb)
 {
 	struct mac_driver *mac_ctrl_drv = hns_mac_get_drv(mac_cb);
 
-	return mac_ctrl_drv->get_sset_count(stringset);
+	return mac_ctrl_drv->get_stats_count();
 }
 
 void hns_mac_set_promisc(struct hns_mac_cb *mac_cb, u8 en)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h
index bbc0a98e7ca3..495462d38cb2 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.h
@@ -383,9 +383,9 @@ struct mac_driver {
 	void (*get_regs)(void *mac_drv, void *data);
 	int (*get_regs_count)(void);
 	/* get strings name for ethtool statistic */
-	void (*get_strings)(u32 stringset, u8 *data);
-	/* get the number of strings*/
-	int (*get_sset_count)(int stringset);
+	void (*get_stats_strings)(u8 *data);
+	/* get the number of statistics */
+	int (*get_stats_count)(void);
 
 	/* get the statistic by ethtools*/
 	void (*get_ethtool_stats)(void *mac_drv, u64 *data);
@@ -448,8 +448,8 @@ int hns_mac_config_mac_loopback(struct hns_mac_cb *mac_cb,
 				enum hnae_loop loop, int en);
 void hns_mac_update_stats(struct hns_mac_cb *mac_cb);
 void hns_mac_get_stats(struct hns_mac_cb *mac_cb, u64 *data);
-void hns_mac_get_strings(struct hns_mac_cb *mac_cb, int stringset, u8 *data);
-int hns_mac_get_sset_count(struct hns_mac_cb *mac_cb, int stringset);
+void hns_mac_get_stats_strings(struct hns_mac_cb *mac_cb, u8 *data);
+int hns_mac_get_stats_count(struct hns_mac_cb *mac_cb);
 void hns_mac_get_regs(struct hns_mac_cb *mac_cb, void *data);
 int hns_mac_get_regs_count(struct hns_mac_cb *mac_cb);
 void hns_set_led_opt(struct hns_mac_cb *mac_cb);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
index e0bc79ea3d88..0ba28a29b738 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
@@ -2617,38 +2617,30 @@ void hns_dsaf_get_stats(struct dsaf_device *ddev, u64 *data, int port)
 }
 
 /**
- *hns_dsaf_get_sset_count - get dsaf string set count
- *@stringset: type of values in data
- *return dsaf string name count
+ *hns_dsaf_get_stats_count - get dsaf statistic count
+ *return dsaf statistic count
  */
-int hns_dsaf_get_sset_count(struct dsaf_device *dsaf_dev, int stringset)
+int hns_dsaf_get_stats_count(struct dsaf_device *dsaf_dev)
 {
 	bool is_ver1 = AE_IS_VER1(dsaf_dev->dsaf_ver);
 
-	if (stringset == ETH_SS_STATS) {
-		if (is_ver1)
-			return DSAF_STATIC_NUM;
-		else
-			return DSAF_V2_STATIC_NUM;
-	}
-	return 0;
+	if (is_ver1)
+		return DSAF_STATIC_NUM;
+	else
+		return DSAF_V2_STATIC_NUM;
 }
 
 /**
- *hns_dsaf_get_strings - get dsaf string set
- *@stringset:srting set index
+ *hns_dsaf_get_stats_strings - get dsaf statistic strings
  *@data:strings name value
  *@port:port index
  */
-void hns_dsaf_get_strings(int stringset, u8 *data, int port,
-			  struct dsaf_device *dsaf_dev)
+void hns_dsaf_get_stats_strings(u8 *data, int port,
+				struct dsaf_device *dsaf_dev)
 {
 	char *buff = (char *)data;
 	int node = port;
 
-	if (stringset != ETH_SS_STATS)
-		return;
-
 	/* for ge/xge node info */
 	buff = hns_dsaf_get_node_stats_strings(buff, node, dsaf_dev);
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h
index 4507e8222683..275dec595dc7 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h
@@ -442,10 +442,10 @@ void hns_dsaf_ae_uninit(struct dsaf_device *dsaf_dev);
 
 void hns_dsaf_update_stats(struct dsaf_device *dsaf_dev, u32 inode_num);
 
-int hns_dsaf_get_sset_count(struct dsaf_device *dsaf_dev, int stringset);
+int hns_dsaf_get_stats_count(struct dsaf_device *dsaf_dev);
 void hns_dsaf_get_stats(struct dsaf_device *ddev, u64 *data, int port);
-void hns_dsaf_get_strings(int stringset, u8 *data, int port,
-			  struct dsaf_device *dsaf_dev);
+void hns_dsaf_get_stats_strings(u8 *data, int port,
+				struct dsaf_device *dsaf_dev);
 
 void hns_dsaf_get_regs(struct dsaf_device *ddev, u32 port, void *data);
 int hns_dsaf_get_regs_count(void);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
index b62816c1574e..a08471441285 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
@@ -420,11 +420,9 @@ void hns_ppe_update_stats(struct hns_ppe_cb *ppe_cb)
 		+= dsaf_read_dev(ppe_cb, PPE_HIS_TX_PKT_CS_FAIL_CNT_REG);
 }
 
-int hns_ppe_get_sset_count(int stringset)
+int hns_ppe_get_stats_count(void)
 {
-	if (stringset == ETH_SS_STATS || stringset == ETH_SS_PRIV_FLAGS)
-		return ETH_PPE_STATIC_NUM;
-	return 0;
+	return ETH_PPE_STATIC_NUM;
 }
 
 int hns_ppe_get_regs_count(void)
@@ -433,12 +431,11 @@ int hns_ppe_get_regs_count(void)
 }
 
 /**
- * ppe_get_strings - get ppe srting
+ * ppe_get_strings - get ppe statistic strings
  * @ppe_device: ppe device
- * @stringset: string set type
  * @data: output string
  */
-void hns_ppe_get_strings(struct hns_ppe_cb *ppe_cb, int stringset, u8 *data)
+void hns_ppe_get_stats_strings(struct hns_ppe_cb *ppe_cb, u8 *data)
 {
 	char *buff = (char *)data;
 	int index = ppe_cb->index;
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h
index 9d8e643e8aa6..1baf04caf4bf 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.h
@@ -108,11 +108,11 @@ void hns_ppe_reset_common(struct dsaf_device *dsaf_dev, u8 ppe_common_index);
 
 void hns_ppe_update_stats(struct hns_ppe_cb *ppe_cb);
 
-int hns_ppe_get_sset_count(int stringset);
+int hns_ppe_get_stats_count(void);
 int hns_ppe_get_regs_count(void);
 void hns_ppe_get_regs(struct hns_ppe_cb *ppe_cb, void *data);
 
-void hns_ppe_get_strings(struct hns_ppe_cb *ppe_cb, int stringset, u8 *data);
+void hns_ppe_get_stats_strings(struct hns_ppe_cb *ppe_cb, u8 *data);
 void hns_ppe_get_stats(struct hns_ppe_cb *ppe_cb, u64 *data);
 void hns_ppe_set_tso_enable(struct hns_ppe_cb *ppe_cb, u32 value);
 void hns_ppe_set_rss_key(struct hns_ppe_cb *ppe_cb,
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
index 6f3570cfb501..7d4ecd8c8d6e 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
@@ -870,16 +870,12 @@ void hns_rcb_get_stats(struct hnae_queue *queue, u64 *data)
 }
 
 /**
- *hns_rcb_get_ring_sset_count - rcb string set count
- *@stringset:ethtool cmd
- *return rcb ring string set count
+ *hns_rcb_get_ring_stats_count - rcb statistic count
+ *return rcb ring statistic count
  */
-int hns_rcb_get_ring_sset_count(int stringset)
+int hns_rcb_get_ring_stats_count(void)
 {
-	if (stringset == ETH_SS_STATS || stringset == ETH_SS_PRIV_FLAGS)
-		return HNS_RING_STATIC_REG_NUM;
-
-	return 0;
+	return HNS_RING_STATIC_REG_NUM;
 }
 
 /**
@@ -901,18 +897,14 @@ int hns_rcb_get_ring_regs_count(void)
 }
 
 /**
- *hns_rcb_get_strings - get rcb string set
- *@stringset:string set index
+ *hns_rcb_get_strings - get rcb statistic strings
  *@data:strings name value
  *@index:queue index
  */
-void hns_rcb_get_strings(int stringset, u8 *data, int index)
+void hns_rcb_get_stats_strings(u8 *data, int index)
 {
 	char *buff = (char *)data;
 
-	if (stringset != ETH_SS_STATS)
-		return;
-
 	snprintf(buff, ETH_GSTRING_LEN, "tx_ring%d_rcb_pkt_num", index);
 	buff = buff + ETH_GSTRING_LEN;
 	snprintf(buff, ETH_GSTRING_LEN, "tx_ring%d_ppe_tx_pkt_num", index);
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
index 602816498c8d..95cd0111e802 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
@@ -154,13 +154,13 @@ void hns_rcb_get_stats(struct hnae_queue *queue, u64 *data);
 
 void hns_rcb_get_common_regs(struct rcb_common_cb *rcb_common, void *data);
 
-int hns_rcb_get_ring_sset_count(int stringset);
+int hns_rcb_get_ring_stats_count(void);
 int hns_rcb_get_common_regs_count(void);
 int hns_rcb_get_ring_regs_count(void);
 
 void hns_rcb_get_ring_regs(struct hnae_queue *queue, void *data);
 
-void hns_rcb_get_strings(int stringset, u8 *data, int index);
+void hns_rcb_get_stats_strings(u8 *data, int index);
 void hns_rcb_set_rx_ring_bs(struct hnae_queue *q, u32 buf_size);
 void hns_rcb_set_tx_ring_bs(struct hnae_queue *q, u32 buf_size);
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c
index 51e7e9f5af49..0e8fd285bfd7 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_xgmac.c
@@ -756,18 +756,14 @@ static void hns_xgmac_get_stats(void *mac_drv, u64 *data)
 }
 
 /**
- *hns_xgmac_get_strings - get xgmac strings name
- *@stringset: type of values in data
+ *hns_xgmac_get_stats_strings - get xgmac statistic strings
  *@data:data for value of string name
  */
-static void hns_xgmac_get_strings(u32 stringset, u8 *data)
+static void hns_xgmac_get_stats_strings(u8 *data)
 {
 	char *buff = (char *)data;
 	u32 i;
 
-	if (stringset != ETH_SS_STATS)
-		return;
-
 	for (i = 0; i < ARRAY_SIZE(g_xgmac_stats_string); i++) {
 		snprintf(buff, ETH_GSTRING_LEN, g_xgmac_stats_string[i].desc);
 		buff = buff + ETH_GSTRING_LEN;
@@ -775,16 +771,12 @@ static void hns_xgmac_get_strings(u32 stringset, u8 *data)
 }
 
 /**
- *hns_xgmac_get_sset_count - get xgmac string set count
- *@stringset: type of values in data
- *return xgmac string set count
+ *hns_xgmac_get_stats_count - get xgmac statistic count
+ *return xgmac statistic count
  */
-static int hns_xgmac_get_sset_count(int stringset)
+static int hns_xgmac_get_stats_count(void)
 {
-	if (stringset == ETH_SS_STATS || stringset == ETH_SS_PRIV_FLAGS)
-		return ARRAY_SIZE(g_xgmac_stats_string);
-
-	return 0;
+	return ARRAY_SIZE(g_xgmac_stats_string);
 }
 
 /**
@@ -832,9 +824,9 @@ void *hns_xgmac_config(struct hns_mac_cb *mac_cb, struct mac_params *mac_param)
 	mac_drv->get_link_status = hns_xgmac_get_link_status;
 	mac_drv->get_regs = hns_xgmac_get_regs;
 	mac_drv->get_ethtool_stats = hns_xgmac_get_stats;
-	mac_drv->get_sset_count = hns_xgmac_get_sset_count;
+	mac_drv->get_stats_count = hns_xgmac_get_stats_count;
 	mac_drv->get_regs_count = hns_xgmac_get_regs_count;
-	mac_drv->get_strings = hns_xgmac_get_strings;
+	mac_drv->get_stats_strings = hns_xgmac_get_stats_strings;
 	mac_drv->update_stats = hns_xgmac_update_stats;
 
 	return (void *)mac_drv;
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index b836742f7ab6..71075f69122c 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -902,7 +902,8 @@ void hns_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 			memcpy(buff, hns_nic_test_strs[MAC_INTERNALLOOP_PHY],
 			       ETH_GSTRING_LEN);
 
-	} else if (stringset == ETH_SS_STATS && h->dev->ops->get_strings) {
+	} else if (stringset == ETH_SS_STATS &&
+		   h->dev->ops->get_stats_strings) {
 		snprintf(buff, ETH_GSTRING_LEN, "rx_packets");
 		buff = buff + ETH_GSTRING_LEN;
 		snprintf(buff, ETH_GSTRING_LEN, "tx_packets");
@@ -957,7 +958,7 @@ void hns_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 		snprintf(buff, ETH_GSTRING_LEN, "netdev_tx_timeout");
 		buff = buff + ETH_GSTRING_LEN;
 
-		h->dev->ops->get_strings(h, stringset, (u8 *)buff);
+		h->dev->ops->get_stats_strings(h, (u8 *)buff);
 	}
 }
 
@@ -984,8 +985,8 @@ int hns_get_sset_count(struct net_device *netdev, int stringset)
 			cnt--;
 
 		return cnt;
-	} else if (stringset == ETH_SS_STATS && ops->get_sset_count) {
-		return HNS_NET_STATS_CNT + ops->get_sset_count(h, stringset);
+	} else if (stringset == ETH_SS_STATS && ops->get_stats_count) {
+		return HNS_NET_STATS_CNT + ops->get_stats_count(h);
 	} else {
 		return -EOPNOTSUPP;
 	}

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply related

* [PATCH net 1/2] hns: Fix string set validation in ethtool string operations
From: Ben Hutchings @ 2018-03-13 23:39 UTC (permalink / raw)
  To: Yisen Zhuang, Salil Mehta; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 3201 bytes --]

The hns driver used to assume that it would only ever be asked
about ETH_SS_TEST or ETH_SS_STATS, and for any other stringset
it would claim a count of 0 but if asked for names would copy
over all the statistic names.  This resulted in a potential
heap buffer overflow.

This was "fixed" some time ago by making it report the number of
statistics when asked about the ETH_SS_PRIV_FLAGS stringset.  This
doesn't make any sense, and it left the bug still exploitable with
other stringset numbers.

As a minimal but complete fix, stop calling the driver-internal
string operations for any stringset other than ETH_SS_STATS.

Fixes: 511e6bc071db ("net: add Hisilicon Network Subsystem DSAF support")
Fixes: 412b65d15a7f ("net: hns: fix ethtool_get_strings overflow ...")
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index 7ea7f8a4aa2a..b836742f7ab6 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -889,11 +889,6 @@ void hns_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 	struct hnae_handle *h = priv->ae_handle;
 	char *buff = (char *)data;
 
-	if (!h->dev->ops->get_strings) {
-		netdev_err(netdev, "h->dev->ops->get_strings is null!\n");
-		return;
-	}
-
 	if (stringset == ETH_SS_TEST) {
 		if (priv->ae_handle->phy_if != PHY_INTERFACE_MODE_XGMII) {
 			memcpy(buff, hns_nic_test_strs[MAC_INTERNALLOOP_MAC],
@@ -907,7 +902,7 @@ void hns_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 			memcpy(buff, hns_nic_test_strs[MAC_INTERNALLOOP_PHY],
 			       ETH_GSTRING_LEN);
 
-	} else {
+	} else if (stringset == ETH_SS_STATS && h->dev->ops->get_strings) {
 		snprintf(buff, ETH_GSTRING_LEN, "rx_packets");
 		buff = buff + ETH_GSTRING_LEN;
 		snprintf(buff, ETH_GSTRING_LEN, "tx_packets");
@@ -969,7 +964,7 @@ void hns_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 /**
  * nic_get_sset_count - get string set count witch returned by nic_get_strings.
  * @dev: net device
- * @stringset: string set index, 0: self test string; 1: statistics string.
+ * @stringset: string set index
  *
  * Return string set count.
  */
@@ -979,10 +974,6 @@ int hns_get_sset_count(struct net_device *netdev, int stringset)
 	struct hnae_handle *h = priv->ae_handle;
 	struct hnae_ae_ops *ops = h->dev->ops;
 
-	if (!ops->get_sset_count) {
-		netdev_err(netdev, "get_sset_count is null!\n");
-		return -EOPNOTSUPP;
-	}
 	if (stringset == ETH_SS_TEST) {
 		u32 cnt = (sizeof(hns_nic_test_strs) / ETH_GSTRING_LEN);
 
@@ -993,8 +984,10 @@ int hns_get_sset_count(struct net_device *netdev, int stringset)
 			cnt--;
 
 		return cnt;
+	} else if (stringset == ETH_SS_STATS && ops->get_sset_count) {
+		return HNS_NET_STATS_CNT + ops->get_sset_count(h, stringset);
 	} else {
-		return (HNS_NET_STATS_CNT + ops->get_sset_count(h, stringset));
+		return -EOPNOTSUPP;
 	}
 }
 


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply related

* Re: [PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018
From: Joe Perches @ 2018-03-13 23:32 UTC (permalink / raw)
  To: Subash Abhinov Kasiviswanathan, davem, netdev
In-Reply-To: <1520981428-29148-3-git-send-email-subashab@codeaurora.org>

On Tue, 2018-03-13 at 16:50 -0600, Subash Abhinov Kasiviswanathan wrote:
> Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
> ---
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c      | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h      | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c    | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h    | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h         | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c    | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h     | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c         | 2 +-
>  drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h         | 2 +-

Did any work actually occur on all these files in 2018?
$ git log --name-only --since=01-01-2018 drivers/net/ethernet/qualcomm/rmnet/ | \
  grep "^drivers" | sort | uniq
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Eric Dumazet @ 2018-03-13 23:27 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Dumazet, Yonghong Song, Steffen Klassert
  Cc: Daniel Borkmann, netdev, Martin Lau, saeedm, diptanu
In-Reply-To: <cce5fa43-eac6-1074-fe34-356c4b6a4321@fb.com>



On 03/13/2018 04:09 PM, Alexei Starovoitov wrote:

> we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
> It's not clear why it's not crashing there, but we cannot just
> reject changing proto in bpf programs now.
> We have to fix whatever needs to be fixed in skb_segment
> (if bug is there) or fix whatever necessary on mlx5 side.
> In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
> through virtio would do, so if skb_segment() needs to do something
> special with skb the SKB_GSO_DODGY flag is already there.

'Fixing' skb_segment(), I did that a long time ago and Herbert Xu was 
not happy with the fix and provided something else.

GSO_DODGY has nothing to do with the problem really.

Changing gso_size is breaking GRO since it ends up changing the number 
of segments on the wire. TCP is not going to be happy, so you'll also 
have to fix TCP eventually.

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Daniel Borkmann @ 2018-03-13 23:18 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Dumazet, Yonghong Song, Steffen Klassert
  Cc: netdev, Martin Lau, saeedm, diptanu
In-Reply-To: <cce5fa43-eac6-1074-fe34-356c4b6a4321@fb.com>

On 03/14/2018 12:09 AM, Alexei Starovoitov wrote:
> On 3/13/18 3:47 PM, Eric Dumazet wrote:
>>
>>
>> On 03/13/2018 03:37 PM, Yonghong Song wrote:
>>> Adding additional cc's:
>>>    Saeed Mahameed as this is most likely mlx5 driver related.
>>>    Diptanu Gon Choudhury who initially reported the issue.
>>>
>>>
>>> On 3/13/18 1:44 AM, Steffen Klassert wrote:
>>>> On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:
>>>>>
>>>>>
>>>>> On 03/12/2018 11:08 PM, Yonghong Song wrote:
>>>>>>
>>>>>>
>>>>>> On 3/12/18 11:04 PM, Eric Dumazet wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 03/12/2018 10:45 PM, Yonghong Song wrote:
>>>>>>>> ...
>>>>>>>> Setup:
>>>>>>>> =====
>>>>>>>>
>>>>>>>> The test will involve three machines:
>>>>>>>>     M_ipv6 <-> M_nat <-> M_ipv4
>>>>>>>>
>>>>>>>> The M_nat will do ipv4<->ipv6 address translation and then
>>>>>>>> forward packet
>>>>>>>> to proper destination. The control plane will configure M_nat
>>>>>>>> properly
>>>>>>>> will understand virtual ipv4 address for machine M_ipv6, and
>>>>>>>> virtual ipv6 address for machine M_ipv4.
>>>>>>>>
>>>>>>>> M_nat runs a bpf program, which is attached to clsact (ingress)
>>>>>>>> qdisc.
>>>>>>>> The program uses bpf_skb_change_proto to do protocol conversion.
>>>>>>>> bpf_skb_change_proto will adjust skb header_len and len properly
>>>>>>>> based on protocol change.
>>>>>>>> After the conversion, the program will make proper change on
>>>>>>>> ethhdr and ip4/6 header, recalculate checksum, and send the
>>>>>>>> packet out
>>>>>>>> through bpf_redirect.
>>>>>>>>
>>>>>>>> Experiment:
>>>>>>>> ===========
>>>>>>>>
>>>>>>>> MTU: 1500B for all three machines.
>>>>>>>>
>>>>>>>> The tso/lro/gro are enabled on the M_nat box.
>>>>>>>>
>>>>>>>> ping works on both ways of M_ipv6 <-> M_ipv4.
>>>>>>>> It works for transfering a small file (4KB) between M_ipv6 and
>>>>>>>> M_ipv4 (both ways).
>>>>>>>> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
>>>>>>>> failed with the above BUG_ON, really fast.
>>>>>>>> Did not really test from M_ipv4 to M_ipv6 with large file.
>>>>>>>>
>>>>>>>> The error path likely to be (also from the above call stack):
>>>>>>>>     nic -> lro/gro -> bpf_program -> gso (BUG_ON)
>>>>
>>>> Just out of curiosity, are these packets created with LRO or GRO?
>>>> Usually LRO is disabled if forwarding is enabled on a machine,
>>>> because segmented LGO packets are likely corrupt.
>>>
>>> In our experiments, LRO is disabled.
>>> On mlx5, when GRO is on, the BUG_ON will happen, and
>>> when GRO is off, the BUG_ON will not happen.
>>>
>>>>
>>>> These packets take an alternative redirect path, so not sure what
>>>> happens here.
>>>>
>>>>>>>>
>>>>>>>> In one of experiments, I explicitly printed the skb->len and
>>>>>>>> skb->data_len. The values are below:
>>>>>>>>     skb_segment: len 2856, data_len 2686
>>>>>>>> They should be equal to avoid BUG.
>>>>>>>>
>>>>>>>> In another experiment, I got:
>>>>>>>>     skb_segment: len 1428, data_len 1258
>>>>>>>>
>>>>>>>> In both cases, the difference is 170 bytes. Not sure whether
>>>>>>>> this is just a coincidence or not.
>>>>>>>>
>>>>>>>> Workaround:
>>>>>>>> ===========
>>>>>>>>
>>>>>>>> A workaround to avoid BUG_ON is to disable lro/gro. This way,
>>>>>>>> kernel will not receive big packets and hence gso is not really
>>>>>>>> called.
>>>>>>>>
>>>>>>>> I am not familiar with gso code. Does anybody hit this BUG_ON
>>>>>>>> before?
>>>>>>>> Any suggestion on how to debug this?
>>>>>>>>
>>>>>>>
>>>>>>> skb_segment() works if incoming GRO packet is not modified in its
>>>>>>> geometry.
>>>>>>>
>>>>>>> In your case it seems you had to adjust gso_size (calling
>>>>>>> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
>>>>>>> skb_segment() badly, because geometry changes, unless you had
>>>>>>> specific MTU/MSS restrictions.
>>>>>>>
>>>>>>> You will have to make skb_segment() more generic if you really
>>>>>>> want this.
>>>>>>
>>>>>> In net/core/filter.c function bpf_skb_change_proto, which is called
>>>>>> in the bpf program, does some GSO adjustment. Could you help check
>>>>>> whether it satisfies my above use case or not? Thanks!
>>>>>
>>>>> As I said this  helper ends up modifying gso_size by +/- 20
>>>>> (sizeof(ipv6
>>>>> header) - sizeof(ipv4 header))
>>>>>
>>>>> So it wont work if skb_segment() is called after this change.
>>>>
>>>> Even HW TSO use gso_size to segment the packets. Would'nt this
>>>> result in broken packets too, if gso_size is modified on a
>>>> forwarding path?
>>>>
>>>>>
>>>>> Not clear why the GRO packet is not sent as is (as a TSO packet) since
>>>>> mlx4/mlx5 NICs certainly support TSO.
>>>
>>> This is a very good observation.
>>> We did the same experiment on mlx4, the same kernel, the same
>>> userspace apps, the same bpf program. The only difference is mlx4 vs.
>>> mlx5.
>>> The mlx4 works fine with LRO=off and GRO=on, while
>>> mlx5 failed with the same LRO/GRO configuration.
>>>
>>> On mlx4 box, we are able to see TSO packets are increasing as the
>>> large file scp is in progress.
>>> # ethtool -S eth0 | grep tso
>>>       tso_packets: 45495
>>> # ethtool -S eth0 | grep tso
>>>       tso_packets: 45865
>>> # ethtool -S eth0 | grep tso
>>>       tso_packets: 46337
>>> # ethtool -S eth0 | grep tso
>>>       tso_packets: 46724
>>>
>>> And use bcc tool to track to func call count for skb_segment
>>> and find it is called 0 times. Clearly, mlx4 is able to take
>>> the packet as TSO and hence the packet will not go to
>>> the stack.
>>>
>>> # funccount.py -i 3 'skb_segment'
>>> Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.
>>>
>>> FUNC                                    COUNT
>>>
>>> FUNC                                    COUNT
>>> ...
>>>
>>> CC Saeed Mahameed who may help debug and provide some insights
>>> what is the problem.
>>
>> There are many reasons why a GRO packet might need to be segmented by
>> software (skb_segment())
>>
>> This is the step where you have crashes, so really mlx4/mlx5 difference
>> do not matter.
>>
>> gso_size should not be dynamically changed. This needs to be fixed in
>> eBPF helpers eventually.
> 
> we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
> It's not clear why it's not crashing there, but we cannot just
> reject changing proto in bpf programs now.

Right, used in Cilium since the helper was added. Back then I recall I tested
this back to back over ixgbe w/ the various options mixed on/off with BPF prog
implementing nat64 on each side, running over various netperf streams; and later
integrated the logic into Cilium itself. Perhaps something changed with more
recent kernels as well. I'll look closer into this issue tomorrow.

> We have to fix whatever needs to be fixed in skb_segment
> (if bug is there) or fix whatever necessary on mlx5 side.
> In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
> through virtio would do, so if skb_segment() needs to do something
> special with skb the SKB_GSO_DODGY flag is already there.

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Alexei Starovoitov @ 2018-03-13 23:09 UTC (permalink / raw)
  To: Eric Dumazet, Yonghong Song, Steffen Klassert
  Cc: Daniel Borkmann, netdev, Martin Lau, saeedm, diptanu
In-Reply-To: <4c0d599e-95a3-be7d-bfeb-e51a16648cc8@gmail.com>

On 3/13/18 3:47 PM, Eric Dumazet wrote:
>
>
> On 03/13/2018 03:37 PM, Yonghong Song wrote:
>> Adding additional cc's:
>>    Saeed Mahameed as this is most likely mlx5 driver related.
>>    Diptanu Gon Choudhury who initially reported the issue.
>>
>>
>> On 3/13/18 1:44 AM, Steffen Klassert wrote:
>>> On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:
>>>>
>>>>
>>>> On 03/12/2018 11:08 PM, Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 3/12/18 11:04 PM, Eric Dumazet wrote:
>>>>>>
>>>>>>
>>>>>> On 03/12/2018 10:45 PM, Yonghong Song wrote:
>>>>>>> ...
>>>>>>> Setup:
>>>>>>> =====
>>>>>>>
>>>>>>> The test will involve three machines:
>>>>>>>     M_ipv6 <-> M_nat <-> M_ipv4
>>>>>>>
>>>>>>> The M_nat will do ipv4<->ipv6 address translation and then
>>>>>>> forward packet
>>>>>>> to proper destination. The control plane will configure M_nat
>>>>>>> properly
>>>>>>> will understand virtual ipv4 address for machine M_ipv6, and
>>>>>>> virtual ipv6 address for machine M_ipv4.
>>>>>>>
>>>>>>> M_nat runs a bpf program, which is attached to clsact (ingress)
>>>>>>> qdisc.
>>>>>>> The program uses bpf_skb_change_proto to do protocol conversion.
>>>>>>> bpf_skb_change_proto will adjust skb header_len and len properly
>>>>>>> based on protocol change.
>>>>>>> After the conversion, the program will make proper change on
>>>>>>> ethhdr and ip4/6 header, recalculate checksum, and send the
>>>>>>> packet out
>>>>>>> through bpf_redirect.
>>>>>>>
>>>>>>> Experiment:
>>>>>>> ===========
>>>>>>>
>>>>>>> MTU: 1500B for all three machines.
>>>>>>>
>>>>>>> The tso/lro/gro are enabled on the M_nat box.
>>>>>>>
>>>>>>> ping works on both ways of M_ipv6 <-> M_ipv4.
>>>>>>> It works for transfering a small file (4KB) between M_ipv6 and
>>>>>>> M_ipv4 (both ways).
>>>>>>> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
>>>>>>> failed with the above BUG_ON, really fast.
>>>>>>> Did not really test from M_ipv4 to M_ipv6 with large file.
>>>>>>>
>>>>>>> The error path likely to be (also from the above call stack):
>>>>>>>     nic -> lro/gro -> bpf_program -> gso (BUG_ON)
>>>
>>> Just out of curiosity, are these packets created with LRO or GRO?
>>> Usually LRO is disabled if forwarding is enabled on a machine,
>>> because segmented LGO packets are likely corrupt.
>>
>> In our experiments, LRO is disabled.
>> On mlx5, when GRO is on, the BUG_ON will happen, and
>> when GRO is off, the BUG_ON will not happen.
>>
>>>
>>> These packets take an alternative redirect path, so not sure what
>>> happens here.
>>>
>>>>>>>
>>>>>>> In one of experiments, I explicitly printed the skb->len and
>>>>>>> skb->data_len. The values are below:
>>>>>>>     skb_segment: len 2856, data_len 2686
>>>>>>> They should be equal to avoid BUG.
>>>>>>>
>>>>>>> In another experiment, I got:
>>>>>>>     skb_segment: len 1428, data_len 1258
>>>>>>>
>>>>>>> In both cases, the difference is 170 bytes. Not sure whether
>>>>>>> this is just a coincidence or not.
>>>>>>>
>>>>>>> Workaround:
>>>>>>> ===========
>>>>>>>
>>>>>>> A workaround to avoid BUG_ON is to disable lro/gro. This way,
>>>>>>> kernel will not receive big packets and hence gso is not really
>>>>>>> called.
>>>>>>>
>>>>>>> I am not familiar with gso code. Does anybody hit this BUG_ON
>>>>>>> before?
>>>>>>> Any suggestion on how to debug this?
>>>>>>>
>>>>>>
>>>>>> skb_segment() works if incoming GRO packet is not modified in its
>>>>>> geometry.
>>>>>>
>>>>>> In your case it seems you had to adjust gso_size (calling
>>>>>> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
>>>>>> skb_segment() badly, because geometry changes, unless you had
>>>>>> specific MTU/MSS restrictions.
>>>>>>
>>>>>> You will have to make skb_segment() more generic if you really
>>>>>> want this.
>>>>>
>>>>> In net/core/filter.c function bpf_skb_change_proto, which is called
>>>>> in the bpf program, does some GSO adjustment. Could you help check
>>>>> whether it satisfies my above use case or not? Thanks!
>>>>
>>>> As I said this  helper ends up modifying gso_size by +/- 20
>>>> (sizeof(ipv6
>>>> header) - sizeof(ipv4 header))
>>>>
>>>> So it wont work if skb_segment() is called after this change.
>>>
>>> Even HW TSO use gso_size to segment the packets. Would'nt this
>>> result in broken packets too, if gso_size is modified on a
>>> forwarding path?
>>>
>>>>
>>>> Not clear why the GRO packet is not sent as is (as a TSO packet) since
>>>> mlx4/mlx5 NICs certainly support TSO.
>>
>> This is a very good observation.
>> We did the same experiment on mlx4, the same kernel, the same
>> userspace apps, the same bpf program. The only difference is mlx4 vs.
>> mlx5.
>> The mlx4 works fine with LRO=off and GRO=on, while
>> mlx5 failed with the same LRO/GRO configuration.
>>
>> On mlx4 box, we are able to see TSO packets are increasing as the
>> large file scp is in progress.
>> # ethtool -S eth0 | grep tso
>>       tso_packets: 45495
>> # ethtool -S eth0 | grep tso
>>       tso_packets: 45865
>> # ethtool -S eth0 | grep tso
>>       tso_packets: 46337
>> # ethtool -S eth0 | grep tso
>>       tso_packets: 46724
>>
>> And use bcc tool to track to func call count for skb_segment
>> and find it is called 0 times. Clearly, mlx4 is able to take
>> the packet as TSO and hence the packet will not go to
>> the stack.
>>
>> # funccount.py -i 3 'skb_segment'
>> Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.
>>
>> FUNC                                    COUNT
>>
>> FUNC                                    COUNT
>> ...
>>
>> CC Saeed Mahameed who may help debug and provide some insights
>> what is the problem.
>
> There are many reasons why a GRO packet might need to be segmented by
> software (skb_segment())
>
> This is the step where you have crashes, so really mlx4/mlx5 difference
> do not matter.
>
> gso_size should not be dynamically changed. This needs to be fixed in
> eBPF helpers eventually.

we have bpf_skb_proto_6_to_4() that was used by cilium for long time.
It's not clear why it's not crashing there, but we cannot just
reject changing proto in bpf programs now.
We have to fix whatever needs to be fixed in skb_segment
(if bug is there) or fix whatever necessary on mlx5 side.
In bpf helper we mark it as SKB_GSO_DODGY just like packets coming
through virtio would do, so if skb_segment() needs to do something
special with skb the SKB_GSO_DODGY flag is already there.

^ permalink raw reply

* Re: [PATCH net-next 1/4] net: qualcomm: rmnet: Fix casting issues
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20180313.104405.1978125419404641583.davem@davemloft.net>

> Ummm, if you change pkt_len to be a proper __be16, then you don't need 
> these
> casts when passing it to ntohs().

Hi David

I have fixed this now in v2.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply

* [PATCH net-next v2 2/4] net: qualcomm: rmnet: Update copyright year to 2018
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:50 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520981428-29148-1-git-send-email-subashab@codeaurora.org>

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c      | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h      | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h         | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c    | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h     | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c         | 2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h         | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index c494918..096301a 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 00e4634..0b5b5da 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2014, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 601edec..c758248 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
index 3537e4c..1bc6828 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 4f362df..884f1f5 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index b0dbca0..afa2b86 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index c74a6c5..49e420e 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
index de0143e..98365ef 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2014, 2016-2017 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2014, 2016-2018 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index 346d310..2ea16a0 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
index 71e4c32..daab75c 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next v2 4/4] net: qualcomm: rmnet: Implement fill_info
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:50 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520981428-29148-1-git-send-email-subashab@codeaurora.org>

This is needed to query the mux_id and flags of a rmnet device.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 096301a..d0f3e0f 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -331,6 +331,35 @@ static size_t rmnet_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(struct ifla_vlan_flags)); /* IFLA_VLAN_FLAGS */
 }
 
+static int rmnet_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct rmnet_priv *priv = netdev_priv(dev);
+	struct net_device *real_dev;
+	struct ifla_vlan_flags f;
+	struct rmnet_port *port;
+
+	real_dev = priv->real_dev;
+
+	if (!rmnet_is_real_dev_registered(real_dev))
+		return -ENODEV;
+
+	if (nla_put_u16(skb, IFLA_VLAN_ID, priv->mux_id))
+		goto nla_put_failure;
+
+	port = rmnet_get_port_rtnl(real_dev);
+
+	f.flags = port->data_format;
+	f.mask  = ~0;
+
+	if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 struct rtnl_link_ops rmnet_link_ops __read_mostly = {
 	.kind		= "rmnet",
 	.maxtype	= __IFLA_VLAN_MAX,
@@ -341,6 +370,7 @@ struct rtnl_link_ops rmnet_link_ops __read_mostly = {
 	.dellink	= rmnet_dellink,
 	.get_size	= rmnet_get_size,
 	.changelink     = rmnet_changelink,
+	.fill_info	= rmnet_fill_info,
 };
 
 /* Needs either rcu_read_lock() or rtnl lock */
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next v2 3/4] net: qualcomm: rmnet: Remove unnecessary device assignment
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:50 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520981428-29148-1-git-send-email-subashab@codeaurora.org>

Device of the de-aggregated skb is correctly assigned after inspecting
the mux_id, so remove the assignment in rmnet_map_deaggregate().

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 49e420e..e8f6c79 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -323,7 +323,6 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
 	if (!skbn)
 		return NULL;
 
-	skbn->dev = skb->dev;
 	skb_reserve(skbn, RMNET_MAP_DEAGGR_HEADROOM);
 	skb_put(skbn, packet_len);
 	memcpy(skbn->data, skb->data, packet_len);
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next v2 1/4] net: qualcomm: rmnet: Fix casting issues
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:50 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan
In-Reply-To: <1520981428-29148-1-git-send-email-subashab@codeaurora.org>

Fix warnings which were reported when running with sparse
(make C=1 CF=-D__CHECK_ENDIAN__)

drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c:81:15:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:271:37:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] <noident>
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:287:29:
warning: incorrect type in assignment (different base types)
expected unsigned short [unsigned] [usertype] pkt_len
got restricted __be16 [usertype] <noident>
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:310:22:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c:319:13:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:49:18:
warning: cast to restricted __be16
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:50:18:
warning: cast to restricted __be32
drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c:74:21:
warning: cast to restricted __be16

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
index 6ce31e2..4f362df 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -23,8 +23,8 @@ struct rmnet_map_control_command {
 		struct {
 			u16 ip_family:2;
 			u16 reserved:14;
-			u16 flow_control_seq_num;
-			u32 qos_id;
+			__be16 flow_control_seq_num;
+			__be32 qos_id;
 		} flow_control;
 		u8 data[0];
 	};
@@ -44,7 +44,7 @@ struct rmnet_map_header {
 	u8  reserved_bit:1;
 	u8  cd_bit:1;
 	u8  mux_id;
-	u16 pkt_len;
+	__be16 pkt_len;
 }  __aligned(1);
 
 struct rmnet_map_dl_csum_trailer {
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next v2 0/4] net: qualcomm: rmnet: Updates 2018-03-12
From: Subash Abhinov Kasiviswanathan @ 2018-03-13 22:50 UTC (permalink / raw)
  To: davem, netdev; +Cc: Subash Abhinov Kasiviswanathan

This series contains some minor updates for rmnet driver.

Patch 1 contains fixes for sparse warnings.
Patch 2 updates the copyright date to 2018.
Patch 3 is a cleanup in receive path.
Patch 4 has the implementation of the fill_info operation.

v1->v2: Remove the force casts since the data type is changed to __be types
as mentioned by David.

Subash Abhinov Kasiviswanathan (4):
  net: qualcomm: rmnet: Fix casting issues
  net: qualcomm: rmnet: Update copyright year to 2018
  net: qualcomm: rmnet: Remove unnecessary device assignment
  net: qualcomm: rmnet: Implement fill_info

 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 32 +++++++++++++++++++++-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h   |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h    |  8 +++---
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c    |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   |  3 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_private.h    |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c    |  2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h    |  2 +-
 10 files changed, 43 insertions(+), 14 deletions(-)

-- 
1.9.1

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Eric Dumazet @ 2018-03-13 22:47 UTC (permalink / raw)
  To: Yonghong Song, Steffen Klassert, Eric Dumazet
  Cc: Daniel Borkmann, Alexei Starovoitov, netdev, Martin Lau, saeedm,
	diptanu
In-Reply-To: <06bef27e-60f6-0fb4-e926-cb7ff3666a3c@fb.com>



On 03/13/2018 03:37 PM, Yonghong Song wrote:
> Adding additional cc's:
>    Saeed Mahameed as this is most likely mlx5 driver related.
>    Diptanu Gon Choudhury who initially reported the issue.
> 
> 
> On 3/13/18 1:44 AM, Steffen Klassert wrote:
>> On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:
>>>
>>>
>>> On 03/12/2018 11:08 PM, Yonghong Song wrote:
>>>>
>>>>
>>>> On 3/12/18 11:04 PM, Eric Dumazet wrote:
>>>>>
>>>>>
>>>>> On 03/12/2018 10:45 PM, Yonghong Song wrote:
>>>>>> ...
>>>>>> Setup:
>>>>>> =====
>>>>>>
>>>>>> The test will involve three machines:
>>>>>>     M_ipv6 <-> M_nat <-> M_ipv4
>>>>>>
>>>>>> The M_nat will do ipv4<->ipv6 address translation and then
>>>>>> forward packet
>>>>>> to proper destination. The control plane will configure M_nat 
>>>>>> properly
>>>>>> will understand virtual ipv4 address for machine M_ipv6, and
>>>>>> virtual ipv6 address for machine M_ipv4.
>>>>>>
>>>>>> M_nat runs a bpf program, which is attached to clsact (ingress) 
>>>>>> qdisc.
>>>>>> The program uses bpf_skb_change_proto to do protocol conversion.
>>>>>> bpf_skb_change_proto will adjust skb header_len and len properly
>>>>>> based on protocol change.
>>>>>> After the conversion, the program will make proper change on
>>>>>> ethhdr and ip4/6 header, recalculate checksum, and send the packet 
>>>>>> out
>>>>>> through bpf_redirect.
>>>>>>
>>>>>> Experiment:
>>>>>> ===========
>>>>>>
>>>>>> MTU: 1500B for all three machines.
>>>>>>
>>>>>> The tso/lro/gro are enabled on the M_nat box.
>>>>>>
>>>>>> ping works on both ways of M_ipv6 <-> M_ipv4.
>>>>>> It works for transfering a small file (4KB) between M_ipv6 and
>>>>>> M_ipv4 (both ways).
>>>>>> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
>>>>>> failed with the above BUG_ON, really fast.
>>>>>> Did not really test from M_ipv4 to M_ipv6 with large file.
>>>>>>
>>>>>> The error path likely to be (also from the above call stack):
>>>>>>     nic -> lro/gro -> bpf_program -> gso (BUG_ON)
>>
>> Just out of curiosity, are these packets created with LRO or GRO?
>> Usually LRO is disabled if forwarding is enabled on a machine,
>> because segmented LGO packets are likely corrupt.
> 
> In our experiments, LRO is disabled.
> On mlx5, when GRO is on, the BUG_ON will happen, and
> when GRO is off, the BUG_ON will not happen.
> 
>>
>> These packets take an alternative redirect path, so not sure what
>> happens here.
>>
>>>>>>
>>>>>> In one of experiments, I explicitly printed the skb->len and
>>>>>> skb->data_len. The values are below:
>>>>>>     skb_segment: len 2856, data_len 2686
>>>>>> They should be equal to avoid BUG.
>>>>>>
>>>>>> In another experiment, I got:
>>>>>>     skb_segment: len 1428, data_len 1258
>>>>>>
>>>>>> In both cases, the difference is 170 bytes. Not sure whether
>>>>>> this is just a coincidence or not.
>>>>>>
>>>>>> Workaround:
>>>>>> ===========
>>>>>>
>>>>>> A workaround to avoid BUG_ON is to disable lro/gro. This way,
>>>>>> kernel will not receive big packets and hence gso is not really 
>>>>>> called.
>>>>>>
>>>>>> I am not familiar with gso code. Does anybody hit this BUG_ON before?
>>>>>> Any suggestion on how to debug this?
>>>>>>
>>>>>
>>>>> skb_segment() works if incoming GRO packet is not modified in its
>>>>> geometry.
>>>>>
>>>>> In your case it seems you had to adjust gso_size (calling
>>>>> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
>>>>> skb_segment() badly, because geometry changes, unless you had
>>>>> specific MTU/MSS restrictions.
>>>>>
>>>>> You will have to make skb_segment() more generic if you really want 
>>>>> this.
>>>>
>>>> In net/core/filter.c function bpf_skb_change_proto, which is called
>>>> in the bpf program, does some GSO adjustment. Could you help check
>>>> whether it satisfies my above use case or not? Thanks!
>>>
>>> As I said this  helper ends up modifying gso_size by +/- 20 (sizeof(ipv6
>>> header) - sizeof(ipv4 header))
>>>
>>> So it wont work if skb_segment() is called after this change.
>>
>> Even HW TSO use gso_size to segment the packets. Would'nt this
>> result in broken packets too, if gso_size is modified on a
>> forwarding path?
>>
>>>
>>> Not clear why the GRO packet is not sent as is (as a TSO packet) since
>>> mlx4/mlx5 NICs certainly support TSO.
> 
> This is a very good observation.
> We did the same experiment on mlx4, the same kernel, the same userspace 
> apps, the same bpf program. The only difference is mlx4 vs. mlx5.
> The mlx4 works fine with LRO=off and GRO=on, while
> mlx5 failed with the same LRO/GRO configuration.
> 
> On mlx4 box, we are able to see TSO packets are increasing as the
> large file scp is in progress.
> # ethtool -S eth0 | grep tso
>       tso_packets: 45495
> # ethtool -S eth0 | grep tso
>       tso_packets: 45865
> # ethtool -S eth0 | grep tso
>       tso_packets: 46337
> # ethtool -S eth0 | grep tso
>       tso_packets: 46724
> 
> And use bcc tool to track to func call count for skb_segment
> and find it is called 0 times. Clearly, mlx4 is able to take
> the packet as TSO and hence the packet will not go to
> the stack.
> 
> # funccount.py -i 3 'skb_segment'
> Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.
> 
> FUNC                                    COUNT
> 
> FUNC                                    COUNT
> ...
> 
> CC Saeed Mahameed who may help debug and provide some insights
> what is the problem.

There are many reasons why a GRO packet might need to be segmented by 
software (skb_segment())

This is the step where you have crashes, so really mlx4/mlx5 difference 
do not matter.

gso_size should not be dynamically changed. This needs to be fixed in 
eBPF helpers eventually.

Traditionally, NAT might involve changing MSS option in TCP SYN and 
SYNACK ...

(This to avoid dropping too big packets, when the total size of a 
segment is increased by 20 bytes when converting IPV4 header to IPV6 one)

In the opposite direction, packet size is reduced by 20 bytes, so MSS 
can be left as is.

^ permalink raw reply

* Re: BUG_ON triggered in skb_segment
From: Yonghong Song @ 2018-03-13 22:37 UTC (permalink / raw)
  To: Steffen Klassert, Eric Dumazet
  Cc: Daniel Borkmann, Alexei Starovoitov, netdev, Martin Lau, saeedm,
	diptanu
In-Reply-To: <20180313084444.5hyu7kteswh6v5li@gauss3.secunet.de>

Adding additional cc's:
   Saeed Mahameed as this is most likely mlx5 driver related.
   Diptanu Gon Choudhury who initially reported the issue.


On 3/13/18 1:44 AM, Steffen Klassert wrote:
> On Mon, Mar 12, 2018 at 11:25:09PM -0700, Eric Dumazet wrote:
>>
>>
>> On 03/12/2018 11:08 PM, Yonghong Song wrote:
>>>
>>>
>>> On 3/12/18 11:04 PM, Eric Dumazet wrote:
>>>>
>>>>
>>>> On 03/12/2018 10:45 PM, Yonghong Song wrote:
>>>>> ...
>>>>> Setup:
>>>>> =====
>>>>>
>>>>> The test will involve three machines:
>>>>>     M_ipv6 <-> M_nat <-> M_ipv4
>>>>>
>>>>> The M_nat will do ipv4<->ipv6 address translation and then
>>>>> forward packet
>>>>> to proper destination. The control plane will configure M_nat properly
>>>>> will understand virtual ipv4 address for machine M_ipv6, and
>>>>> virtual ipv6 address for machine M_ipv4.
>>>>>
>>>>> M_nat runs a bpf program, which is attached to clsact (ingress) qdisc.
>>>>> The program uses bpf_skb_change_proto to do protocol conversion.
>>>>> bpf_skb_change_proto will adjust skb header_len and len properly
>>>>> based on protocol change.
>>>>> After the conversion, the program will make proper change on
>>>>> ethhdr and ip4/6 header, recalculate checksum, and send the packet out
>>>>> through bpf_redirect.
>>>>>
>>>>> Experiment:
>>>>> ===========
>>>>>
>>>>> MTU: 1500B for all three machines.
>>>>>
>>>>> The tso/lro/gro are enabled on the M_nat box.
>>>>>
>>>>> ping works on both ways of M_ipv6 <-> M_ipv4.
>>>>> It works for transfering a small file (4KB) between M_ipv6 and
>>>>> M_ipv4 (both ways).
>>>>> Transfering a large file (e.g., 4MB) from M_ipv6 to M_ipv4,
>>>>> failed with the above BUG_ON, really fast.
>>>>> Did not really test from M_ipv4 to M_ipv6 with large file.
>>>>>
>>>>> The error path likely to be (also from the above call stack):
>>>>>     nic -> lro/gro -> bpf_program -> gso (BUG_ON)
> 
> Just out of curiosity, are these packets created with LRO or GRO?
> Usually LRO is disabled if forwarding is enabled on a machine,
> because segmented LGO packets are likely corrupt.

In our experiments, LRO is disabled.
On mlx5, when GRO is on, the BUG_ON will happen, and
when GRO is off, the BUG_ON will not happen.

> 
> These packets take an alternative redirect path, so not sure what
> happens here.
> 
>>>>>
>>>>> In one of experiments, I explicitly printed the skb->len and
>>>>> skb->data_len. The values are below:
>>>>>     skb_segment: len 2856, data_len 2686
>>>>> They should be equal to avoid BUG.
>>>>>
>>>>> In another experiment, I got:
>>>>>     skb_segment: len 1428, data_len 1258
>>>>>
>>>>> In both cases, the difference is 170 bytes. Not sure whether
>>>>> this is just a coincidence or not.
>>>>>
>>>>> Workaround:
>>>>> ===========
>>>>>
>>>>> A workaround to avoid BUG_ON is to disable lro/gro. This way,
>>>>> kernel will not receive big packets and hence gso is not really called.
>>>>>
>>>>> I am not familiar with gso code. Does anybody hit this BUG_ON before?
>>>>> Any suggestion on how to debug this?
>>>>>
>>>>
>>>> skb_segment() works if incoming GRO packet is not modified in its
>>>> geometry.
>>>>
>>>> In your case it seems you had to adjust gso_size (calling
>>>> skb_decrease_gso_size() or skb_increase_gso_size()), and this breaks
>>>> skb_segment() badly, because geometry changes, unless you had
>>>> specific MTU/MSS restrictions.
>>>>
>>>> You will have to make skb_segment() more generic if you really want this.
>>>
>>> In net/core/filter.c function bpf_skb_change_proto, which is called
>>> in the bpf program, does some GSO adjustment. Could you help check
>>> whether it satisfies my above use case or not? Thanks!
>>
>> As I said this  helper ends up modifying gso_size by +/- 20 (sizeof(ipv6
>> header) - sizeof(ipv4 header))
>>
>> So it wont work if skb_segment() is called after this change.
> 
> Even HW TSO use gso_size to segment the packets. Would'nt this
> result in broken packets too, if gso_size is modified on a
> forwarding path?
> 
>>
>> Not clear why the GRO packet is not sent as is (as a TSO packet) since
>> mlx4/mlx5 NICs certainly support TSO.

This is a very good observation.
We did the same experiment on mlx4, the same kernel, the same userspace 
apps, the same bpf program. The only difference is mlx4 vs. mlx5.
The mlx4 works fine with LRO=off and GRO=on, while
mlx5 failed with the same LRO/GRO configuration.

On mlx4 box, we are able to see TSO packets are increasing as the
large file scp is in progress.
# ethtool -S eth0 | grep tso
      tso_packets: 45495
# ethtool -S eth0 | grep tso
      tso_packets: 45865
# ethtool -S eth0 | grep tso
      tso_packets: 46337
# ethtool -S eth0 | grep tso
      tso_packets: 46724

And use bcc tool to track to func call count for skb_segment
and find it is called 0 times. Clearly, mlx4 is able to take
the packet as TSO and hence the packet will not go to
the stack.

# funccount.py -i 3 'skb_segment'
Tracing 1 functions for "skb_segment"... Hit Ctrl-C to end.

FUNC                                    COUNT

FUNC                                    COUNT
...

CC Saeed Mahameed who may help debug and provide some insights
what is the problem.

> 
> If the packets are generated with GRO, there could be data chained
> at the frag_list pointer. Most NICs can't offload such skbs, so if
> skb_segment() can't split at the frag_list pointer, it will just
> segment the packets based on gso_size.
> 

^ permalink raw reply

* Re: [PATCH v3] kernel.h: Skip single-eval logic on literals in min()/max()
From: Kees Cook @ 2018-03-13 22:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, Linux Kernel Mailing List, Josh Poimboeuf,
	Rasmus Villemoes, Gustavo A. R. Silva, Tobin C. Harding,
	Steven Rostedt, Jonathan Corbet, Chris Mason, Josef Bacik,
	David Sterba, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Ingo Molnar, Peter Zijlstra, Thomas Gleixner,
	Masahiro Yamada, Borislav Petkov, Randy Dunlap <
In-Reply-To: <20180313140248.7fdb0d0cee044cd7c7fc7b93@linux-foundation.org>

On Tue, Mar 13, 2018 at 2:02 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Mon, 12 Mar 2018 21:28:57 -0700 Kees Cook <keescook@chromium.org> wrote:
>
>> On Mon, Mar 12, 2018 at 4:57 PM, Linus Torvalds
>> <torvalds@linux-foundation.org> wrote:
>> > On Mon, Mar 12, 2018 at 3:55 PM, Andrew Morton
>> > <akpm@linux-foundation.org> wrote:
>> >>
>> >> Replacing the __builtin_choose_expr() with ?: works of course.
>> >
>> > Hmm. That sounds like the right thing to do. We were so myopically
>> > staring at the __builtin_choose_expr() problem that we overlooked the
>> > obvious solution.
>> >
>> > Using __builtin_constant_p() together with a ?: is in fact our common
>> > pattern, so that should be fine. The only real reason to use
>> > __builtin_choose_expr() is if you want to get the *type* to vary
>> > depending on which side you choose, but that's not an issue for
>> > min/max.
>>
>> This doesn't solve it for -Wvla, unfortunately. That was the point of
>> Josh's original suggestion of __builtin_choose_expr().
>>
>> Try building with KCFLAGS=-Wval and checking net/ipv6/proc.c:
>>
>> net/ipv6/proc.c: In function ‘snmp6_seq_show_item’:
>> net/ipv6/proc.c:198:2: warning: ISO C90 forbids array ‘buff’ whose
>> size can’t be evaluated [-Wvla]
>>   unsigned long buff[SNMP_MIB_MAX];
>>   ^~~~~~~~
>
> PITA.  Didn't we once have a different way of detecting VLAs?  Some
> post-compilation asm parser, iirc.
>
> I suppose the world wouldn't end if we had a gcc version ifdef in
> kernel.h.  We'll get to remove it in, oh, ten years.

For fixing only 6 VLAs, we don't need all this effort. When it looked
like we could get away with just a "better" max(), sure. ;)

I'll send a "const_max()" which will refuse to work on
non-constant-values (so it doesn't get accidentally used on variables
that could be exposed to double-evaluation), and will work for stack
array declarations (to avoid the overly-sensitive -Wvla checks).

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: [PATCH] net: dsa: drop some VLAs in switch.c
From: Salvatore Mesoraca @ 2018-03-13 22:01 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: linux-kernel, Kernel Hardening, netdev, David S. Miller,
	Andrew Lunn, Florian Fainelli, Kees Cook
In-Reply-To: <87fu5321du.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

2018-03-13 20:58 GMT+01:00 Vivien Didelot <vivien.didelot@savoirfairelinux.com>:
> Hi Salvatore,

Hi Vivien,

> Salvatore Mesoraca <s.mesoraca16@gmail.com> writes:
>
>> dsa_switch's num_ports is currently fixed to DSA_MAX_PORTS. So we avoid
>> 2 VLAs[1] by using DSA_MAX_PORTS instead of ds->num_ports.
>>
>> [1] https://lkml.org/lkml/2018/3/7/621
>>
>> Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
>
> NAK.
>
> We are in the process to remove hardcoded limits such as DSA_MAX_PORTS
> and DSA_MAX_SWITCHES, so we have to stick with ds->num_ports.

I can rewrite the patch using kmalloc.
Although, if ds->num_ports will always be less than or equal to 12, it
should be better to
just use DSA_MAX_PORTS.

Thank you,

Salvatore

^ permalink raw reply

* [PATCH bpf-next v5 0/2] bpf stackmap with build_id+offset
From: Song Liu @ 2018-03-13 21:47 UTC (permalink / raw)
  To: netdev, ast, peterz, daniel; +Cc: kernel-team, hannes, qinteng, Song Liu

Changes v4 -> v5:

1. Only allow build_id lookup in non-nmi context. Added comment and
   commit message to highlight this limitation.
2. Minor fix reported by kbuild test robot.

Changes v3 -> v4:

1. Add fallback when build_id lookup failed. In this case, status is set
   to BPF_STACK_BUILD_ID_IP, and ip of this entry is saved.
2. Handle cases where vma is only part of the file (vma->vm_pgoff != 0).
   Thanks to Teng for helping me identify this issue!
3. Address feedbacks for previous versions.

Song Liu (2):
  bpf: extend stackmap to save binary_build_id+offset instead of address
  bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID

 include/uapi/linux/bpf.h                           |  22 ++
 kernel/bpf/stackmap.c                              | 257 +++++++++++++++++++--
 tools/include/uapi/linux/bpf.h                     |  22 ++
 tools/testing/selftests/bpf/Makefile               |  12 +-
 tools/testing/selftests/bpf/test_progs.c           | 164 ++++++++++++-
 .../selftests/bpf/test_stacktrace_build_id.c       |  60 +++++
 tools/testing/selftests/bpf/urandom_read.c         |  22 ++
 7 files changed, 535 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_build_id.c
 create mode 100644 tools/testing/selftests/bpf/urandom_read.c

--
2.9.5

^ permalink raw reply

* [PATCH bpf-next v5 2/2] bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID
From: Song Liu @ 2018-03-13 21:47 UTC (permalink / raw)
  To: netdev, ast, peterz, daniel; +Cc: kernel-team, hannes, qinteng, Song Liu
In-Reply-To: <20180313214747.3675581-1-songliubraving@fb.com>

test_stacktrace_build_id() is added. It accesses tracepoint urandom_read
with "dd" and "urandom_read" and gathers stack traces. Then it reads the
stack traces from the stackmap.

urandom_read is a statically link binary that reads from /dev/urandom.
test_stacktrace_build_id() calls readelf to read build ID of urandom_read
and compares it with build ID from the stackmap.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/include/uapi/linux/bpf.h                     |  22 +++
 tools/testing/selftests/bpf/Makefile               |  12 +-
 tools/testing/selftests/bpf/test_progs.c           | 164 ++++++++++++++++++++-
 .../selftests/bpf/test_stacktrace_build_id.c       |  60 ++++++++
 tools/testing/selftests/bpf/urandom_read.c         |  22 +++
 5 files changed, 278 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_build_id.c
 create mode 100644 tools/testing/selftests/bpf/urandom_read.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db6bdc3..1944d0a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY		(1U << 3)
 #define BPF_F_WRONLY		(1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID	(1U << 5)
+
+enum bpf_stack_build_id_status {
+	/* user space need an empty entry to identify end of a trace */
+	BPF_STACK_BUILD_ID_EMPTY = 0,
+	/* with valid build_id and offset */
+	BPF_STACK_BUILD_ID_VALID = 1,
+	/* couldn't get build_id, fallback to ip */
+	BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+	__s32		status;
+	unsigned char	build_id[BPF_BUILD_ID_SIZE];
+	union {
+		__u64	offset;
+		__u64	ip;
+	};
+};
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8567a858..b0d29fd 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,6 +13,14 @@ endif
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
+TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
+all: $(TEST_CUSTOM_PROGS)
+
+$(TEST_CUSTOM_PROGS): urandom_read
+
+urandom_read: urandom_read.c
+	$(CC) -o $(TEST_CUSTOM_PROGS) -static $<
+
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
@@ -21,7 +29,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o test_tcpbpf_kern.o
+	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -74,3 +82,5 @@ $(OUTPUT)/%.o: %.c
 	$(CLANG) $(CLANG_FLAGS) \
 		 -O2 -target bpf -emit-llvm -c $< -o - |      \
 	$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+
+EXTRA_CLEAN := $(TEST_CUSTOM_PROGS)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 27ad540..e9df48b 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -841,7 +841,8 @@ static void test_tp_attach_query(void)
 static int compare_map_keys(int map1_fd, int map2_fd)
 {
 	__u32 key, next_key;
-	char val_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)];
+	char val_buf[PERF_MAX_STACK_DEPTH *
+		     sizeof(struct bpf_stack_build_id)];
 	int err;
 
 	err = bpf_map_get_next_key(map1_fd, NULL, &key);
@@ -964,6 +965,166 @@ static void test_stacktrace_map()
 	return;
 }
 
+static int extract_build_id(char *build_id, size_t size)
+{
+	FILE *fp;
+	char *line = NULL;
+	size_t len = 0;
+
+	fp = popen("readelf -n ./urandom_read | grep 'Build ID'", "r");
+	if (fp == NULL)
+		return -1;
+
+	if (getline(&line, &len, fp) == -1)
+		goto err;
+	fclose(fp);
+
+	if (len > size)
+		len = size;
+	memcpy(build_id, line, len);
+	build_id[len] = '\0';
+	return 0;
+err:
+	fclose(fp);
+	return -1;
+}
+
+static void test_stacktrace_build_id(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	const char *file = "./test_stacktrace_build_id.o";
+	int bytes, efd, err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {};
+	__u32 key, previous_key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+	int i, j;
+	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+	int build_id_matches = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* Get the ID for the sched/sched_switch tracepoint */
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/random/urandom_read/id");
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+		  "read", "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	/* Open the perf event and attach bpf progrram */
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+	       == 0);
+	assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = extract_build_id(buf, 256);
+
+	if (CHECK(err, "get build_id with readelf",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+	if (CHECK(err, "get_next_key from stackmap",
+		  "err %d, errno %d\n", err, errno))
+		goto disable_pmu;
+
+	do {
+		char build_id[64];
+
+		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+		if (CHECK(err, "lookup_elem from stackmap",
+			  "err %d, errno %d\n", err, errno))
+			goto disable_pmu;
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+			    id_offs[i].offset != 0) {
+				for (j = 0; j < 20; ++j)
+					sprintf(build_id + 2 * j, "%02x",
+						id_offs[i].build_id[j] & 0xff);
+				if (strstr(buf, build_id) != NULL)
+					build_id_matches = 1;
+			}
+		previous_key = key;
+	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+	CHECK(build_id_matches < 1, "build id match",
+	      "Didn't find expected build ID from the map");
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+
+out:
+	return;
+}
+
 int main(void)
 {
 	test_pkt_access();
@@ -976,6 +1137,7 @@ int main(void)
 	test_obj_name();
 	test_tp_attach_query();
 	test_stacktrace_map();
+	test_stacktrace_build_id();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
new file mode 100644
index 0000000..b755bd7
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH         127
+#endif
+
+struct bpf_map_def SEC("maps") control_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") stackid_hmap = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct bpf_stack_build_id)
+		* PERF_MAX_STACK_DEPTH,
+	.max_entries = 128,
+	.map_flags = BPF_F_STACK_BUILD_ID,
+};
+
+/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct random_urandom_args {
+	unsigned long long pad;
+	int got_bits;
+	int pool_left;
+	int input_left;
+};
+
+SEC("tracepoint/random/urandom_read")
+int oncpu(struct random_urandom_args *args)
+{
+	__u32 key = 0, val = 0, *value_p;
+
+	value_p = bpf_map_lookup_elem(&control_map, &key);
+	if (value_p && *value_p)
+		return 0; /* skip if non-zero *value_p */
+
+	/* The size of stackmap and stackid_hmap should be the same */
+	key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
+	if ((int)key >= 0)
+		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
new file mode 100644
index 0000000..4acfdeb
--- /dev/null
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -0,0 +1,22 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#define BUF_SIZE 256
+int main(void)
+{
+	int fd = open("/dev/urandom", O_RDONLY);
+	int i;
+	char buf[BUF_SIZE];
+
+	if (fd < 0)
+		return 1;
+	for (i = 0; i < 4; ++i)
+		read(fd, buf, BUF_SIZE);
+
+	close(fd);
+	return 0;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
From: Song Liu @ 2018-03-13 21:47 UTC (permalink / raw)
  To: netdev, ast, peterz, daniel; +Cc: kernel-team, hannes, qinteng, Song Liu
In-Reply-To: <20180313214747.3675581-1-songliubraving@fb.com>

Currently, bpf stackmap store address for each entry in the call trace.
To map these addresses to user space files, it is necessary to maintain
the mapping from these virtual address to symbols in the binary. Usually,
the user space profiler (such as perf) has to scan /proc/pid/maps at the
beginning of profiling, and monitor mmap2() calls afterwards. Given the
cost of maintaining the address map, this solution is not practical for
system wide profiling that is always on.

This patch tries to solve this problem with a variation of stackmap. This
variation is enabled by flag BPF_F_STACK_BUILD_ID, and only works for
user stack. Instead of storing addresses, the variation stores ELF file
build_id + offset.

Build ID is a 20-byte unique identifier for ELF files. The following
command shows the Build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
    Build ID: XXXXXXXXXX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse Build ID
for each entry in the call trace, and translate it into the following
struct:

  struct bpf_stack_build_id_offset {
          __s32           status;
          unsigned char   build_id[BPF_BUILD_ID_SIZE];
          union {
                  __u64   offset;
                  __u64   ip;
          };
  };

The search of build_id is limited to the first page of the file, and this
page should be in page cache. Otherwise, we fallback to store ip for this
entry (ip field in struct bpf_stack_build_id_offset). This requires the
build_id to be stored in the first page. A quick survey of binary and
dynamic library files in a few different systems shows that almost all
binary and dynamic library files have build_id in the first page.

User space can access struct bpf_stack_build_id_offset with bpf
syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to
maintain mapping from build id to binary files. This mostly static
mapping is much easier to maintain than per process address maps.

Note: Stackmap with build_id only works in non-nmi context at this time.
This is because we need to take mm->mmap_sem for find_vma(). If this
changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/uapi/linux/bpf.h |  22 ++++
 kernel/bpf/stackmap.c    | 257 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 257 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a66769..1e15d17 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY		(1U << 3)
 #define BPF_F_WRONLY		(1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID	(1U << 5)
+
+enum bpf_stack_build_id_status {
+	/* user space need an empty entry to identify end of a trace */
+	BPF_STACK_BUILD_ID_EMPTY = 0,
+	/* with valid build_id and offset */
+	BPF_STACK_BUILD_ID_VALID = 1,
+	/* couldn't get build_id, fallback to ip */
+	BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+	__s32		status;
+	unsigned char	build_id[BPF_BUILD_ID_SIZE];
+	union {
+		__u64	offset;
+		__u64	ip;
+	};
+};
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43..7b134e9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
+	 BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
-	u64 ip[];
+	u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+	return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+	return stack_map_use_build_id(map) ?
+		sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > sysctl_perf_event_max_stack)
+	    value_size < 8 || value_size % 8)
+		return ERR_PTR(-EINVAL);
+
+	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+		if (value_size % sizeof(struct bpf_stack_build_id) ||
+		    value_size / sizeof(struct bpf_stack_build_id)
+		    > sysctl_perf_event_max_stack)
+			return ERR_PTR(-EINVAL);
+	} else if (value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -114,13 +136,181 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+					   unsigned char *build_id,
+					   void *note_start,
+					   Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BPF_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       BPF_BUILD_ID_SIZE);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)  /* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+				  unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage  */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = page_address(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = stack_map_get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+	put_page(page);
+	return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+					  struct stack_map_bucket *bucket,
+					  u64 *ips, u32 trace_nr)
+{
+	int i;
+	struct vm_area_struct *vma;
+	struct bpf_stack_build_id *id_offs;
+
+	bucket->nr = trace_nr;
+	id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+	/*
+	 * We cannot do up_read() in nmi context, so build_id lookup is
+	 * only supported for non-nmi events. If at some point, it is
+	 * possible to run find_vma() without taking the semaphore, we
+	 * would like to allow build_id lookup in nmi context.
+	 */
+	if (!current || !current->mm || in_nmi() ||
+	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+		/* cannot access current->mm, fall back to ips */
+		for (i = 0; i < trace_nr; i++) {
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+		}
+		return;
+	}
+
+	for (i = 0; i < trace_nr; i++) {
+		vma = find_vma(current->mm, ips[i]);
+		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+			/* per entry fall back to ips */
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+			continue;
+		}
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+			- vma->vm_start;
+		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+	}
+	up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / 8;
+	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,11 +318,16 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	bool user = flags & BPF_F_USER_STACK;
 	bool kernel = !user;
 	u64 *ips;
+	bool hash_matches;
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
+	/* build_id+offset stack map only supports user stack */
+	if (stack_map_use_build_id(map) && !user)
+		return -EINVAL;
+
 	trace = get_perf_callchain(regs, init_nr, kernel, user,
 				   sysctl_perf_event_max_stack, false, false);
 
@@ -156,24 +351,42 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
 
-	if (bucket && bucket->hash == hash) {
-		if (flags & BPF_F_FAST_STACK_CMP)
+	hash_matches = bucket && bucket->hash == hash;
+	/* fast cmp */
+	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+		return id;
+
+	if (stack_map_use_build_id(map)) {
+		/* for build_id+offset, pop a bucket before slow cmp */
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		stack_map_get_build_id_offset(map, new_bucket, ips, trace_nr);
+		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
 			return id;
-		if (bucket->nr == trace_nr &&
-		    memcmp(bucket->ip, ips, trace_len) == 0)
+		}
+		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+			return -EEXIST;
+		}
+	} else {
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, ips, trace_len) == 0)
 			return id;
+		if (bucket && !(flags & BPF_F_REUSE_STACKID))
+			return -EEXIST;
+
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		memcpy(new_bucket->data, ips, trace_len);
 	}
 
-	/* this call stack is not in the map, try to add it */
-	if (bucket && !(flags & BPF_F_REUSE_STACKID))
-		return -EEXIST;
-
-	new_bucket = (struct stack_map_bucket *)
-		pcpu_freelist_pop(&smap->freelist);
-	if (unlikely(!new_bucket))
-		return -ENOMEM;
-
-	memcpy(new_bucket->ip, ips, trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	if (!bucket)
 		return -ENOENT;
 
-	trace_len = bucket->nr * sizeof(u64);
-	memcpy(value, bucket->ip, trace_len);
+	trace_len = bucket->nr * stack_map_data_size(map);
+	memcpy(value, bucket->data, trace_len);
 	memset(value + trace_len, 0, map->value_size - trace_len);
 
 	old_bucket = xchg(&smap->buckets[id], bucket);
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH iproute2] treat "default" and "all"/"any" parameters differenty
From: Luca Boccassi @ 2018-03-13 21:45 UTC (permalink / raw)
  To: Alexander Zubkov, Stephen Hemminger; +Cc: netdev@vger.kernel.org
In-Reply-To: <3ded09a4-54f9-9d82-d952-33527a6a3e58@msu.ru>

[-- Attachment #1: Type: text/plain, Size: 1367 bytes --]

On Tue, 2018-03-13 at 21:19 +0100, Alexander Zubkov wrote:
> Debian maintainer found that basic command:
> 	# ip route flush all
> No longer worked as expected which breaks user scripts and
> expectations. It no longer flushed all IPv4 routes.
> 
> Recently behaviour of "default" prefix parameter was corrected. But
> at
> the same time behaviour of "all"/"any" was altered too, because they
> were the same branch of the code. As those parameters mean different,
> they need to be treated differently in code too. This patch reflects
> the difference.
> 
> Reported-by: Luca Boccassi <bluca@debian.org>
> Signed-off-by: Alexander Zubkov <green@msu.ru>
> ---
>   lib/utils.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/utils.c b/lib/utils.c
> index 9fa5220..b289d9c 100644
> --- a/lib/utils.c
> +++ b/lib/utils.c
> @@ -658,7 +658,8 @@ int get_prefix_1(inet_prefix *dst, char *arg,
> int 
> family)
>   		dst->family = family;
>   		dst->bytelen = 0;
>   		dst->bitlen = 0;
> -		dst->flags |= PREFIXLEN_SPECIFIED;
> +		if (strcmp(arg, "default") == 0)
> +			dst->flags |= PREFIXLEN_SPECIFIED;
>   		return 0;
>   	}
> 
> --
> 1.9.1

Tested-by: Luca Boccassi <bluca@debian.org>

Yep solves the problem, ls all/any now work as before. Thanks!

-- 
Kind regards,
Luca Boccassi

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net] net: systemport: Rewrite __bcm_sysport_tx_reclaim()
From: Florian Fainelli @ 2018-03-13 21:45 UTC (permalink / raw)
  To: netdev; +Cc: davem, Florian Fainelli

There is no need for complex checking between the last consumed index
and current consumed index, a simple subtraction will do.

This also eliminates the possibility of a permanent transmit queue stall
under the following conditions:

- one CPU bursts ring->size worth of traffic (up to 256 buffers), to the
  point where we run out of free descriptors, so we stop the transmit
  queue at the end of bcm_sysport_xmit()

- because of our locking, we have the transmit process disable
  interrupts which means we can be blocking the TX reclamation process

- when TX reclamation finally runs, we will be computing the difference
  between ring->c_index (last consumed index by SW) and what the HW
  reports through its register

- this register is masked with (ring->size - 1) = 0xff, which will lead
  to stripping the upper bits of the index (register is 16-bits wide)

- we will be computing last_tx_cn as 0, which means there is no work to
  be done, and we never wake-up the transmit queue, leaving it
  permanently disabled

A practical example is e.g: ring->c_index aka last_c_index = 12, we
pushed 256 entries, HW consumer index = 268, we mask it with 0xff = 12,
so last_tx_cn == 0, nothing happens.

Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 33 ++++++++++++++----------------
 drivers/net/ethernet/broadcom/bcmsysport.h |  2 +-
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index f15a8fc6dfc9..3fc549b88c43 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -855,10 +855,12 @@ static void bcm_sysport_tx_reclaim_one(struct bcm_sysport_tx_ring *ring,
 static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
 					     struct bcm_sysport_tx_ring *ring)
 {
-	unsigned int c_index, last_c_index, last_tx_cn, num_tx_cbs;
 	unsigned int pkts_compl = 0, bytes_compl = 0;
 	struct net_device *ndev = priv->netdev;
+	unsigned int txbds_processed = 0;
 	struct bcm_sysport_cb *cb;
+	unsigned int txbds_ready;
+	unsigned int c_index;
 	u32 hw_ind;
 
 	/* Clear status before servicing to reduce spurious interrupts */
@@ -871,29 +873,23 @@ static unsigned int __bcm_sysport_tx_reclaim(struct bcm_sysport_priv *priv,
 	/* Compute how many descriptors have been processed since last call */
 	hw_ind = tdma_readl(priv, TDMA_DESC_RING_PROD_CONS_INDEX(ring->index));
 	c_index = (hw_ind >> RING_CONS_INDEX_SHIFT) & RING_CONS_INDEX_MASK;
-	ring->p_index = (hw_ind & RING_PROD_INDEX_MASK);
-
-	last_c_index = ring->c_index;
-	num_tx_cbs = ring->size;
-
-	c_index &= (num_tx_cbs - 1);
-
-	if (c_index >= last_c_index)
-		last_tx_cn = c_index - last_c_index;
-	else
-		last_tx_cn = num_tx_cbs - last_c_index + c_index;
+	txbds_ready = (c_index - ring->c_index) & RING_CONS_INDEX_MASK;
 
 	netif_dbg(priv, tx_done, ndev,
-		  "ring=%d c_index=%d last_tx_cn=%d last_c_index=%d\n",
-		  ring->index, c_index, last_tx_cn, last_c_index);
+		  "ring=%d old_c_index=%u c_index=%u txbds_ready=%u\n",
+		  ring->index, ring->c_index, c_index, txbds_ready);
 
-	while (last_tx_cn-- > 0) {
-		cb = ring->cbs + last_c_index;
+	while (txbds_processed < txbds_ready) {
+		cb = &ring->cbs[ring->clean_index];
 		bcm_sysport_tx_reclaim_one(ring, cb, &bytes_compl, &pkts_compl);
 
 		ring->desc_count++;
-		last_c_index++;
-		last_c_index &= (num_tx_cbs - 1);
+		txbds_processed++;
+
+		if (likely(ring->clean_index < ring->size - 1))
+			ring->clean_index++;
+		else
+			ring->clean_index = 0;
 	}
 
 	u64_stats_update_begin(&priv->syncp);
@@ -1394,6 +1390,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv,
 	netif_tx_napi_add(priv->netdev, &ring->napi, bcm_sysport_tx_poll, 64);
 	ring->index = index;
 	ring->size = size;
+	ring->clean_index = 0;
 	ring->alloc_size = ring->size;
 	ring->desc_cpu = p;
 	ring->desc_count = ring->size;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index f5a984c1c986..19c91c76e327 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -706,7 +706,7 @@ struct bcm_sysport_tx_ring {
 	unsigned int	desc_count;	/* Number of descriptors */
 	unsigned int	curr_desc;	/* Current descriptor */
 	unsigned int	c_index;	/* Last consumer index */
-	unsigned int	p_index;	/* Current producer index */
+	unsigned int	clean_index;	/* Current clean index */
 	struct bcm_sysport_cb *cbs;	/* Transmit control blocks */
 	struct dma_desc	*desc_cpu;	/* CPU view of the descriptor */
 	struct bcm_sysport_priv *priv;	/* private context backpointer */
-- 
2.14.1

^ permalink raw reply related

* Re: Possible tcp regression in 4.14.13..26
From: Eric Dumazet @ 2018-03-13 21:45 UTC (permalink / raw)
  To: Timofey Titovets, netdev
In-Reply-To: <CAGqmi77b0HQAX-dg7JKAWEnu+eFtn8GS+5dsj2kO9Xnzcn_tLg@mail.gmail.com>



On 03/13/2018 02:41 PM, Timofey Titovets wrote:
> Hi list,
> We currently upgrade our servers from 4.14.13 to 4.14.26
> And get some problems with random kernel oops by:
> [  422.081094] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000038
> [  422.081254] IP: tcp_push+0x42/0x110
> [  422.081314] PGD 0 P4D 0
> [  422.081364] Oops: 0002 [#1] SMP PTI
> 
> Currently we assume that this caused by:
> net: tcp: close sock if net namespace is exiting
> [ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]
> 
> If that can helps we heavy use docker and ceph rbd.
> 
> I will write if we get more info or fix that.
> 
> Thanks.
> 

Can you provide a full stack trace ?

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH 12/15] ice: Add stats and ethtool support
From: Eric Dumazet @ 2018-03-13 21:44 UTC (permalink / raw)
  To: Jesse Brandeburg, davem
  Cc: Venkataramanan, Anirudh, kubakici@wp.pl, netdev@vger.kernel.org,
	intel-wired-lan@lists.osuosl.org
In-Reply-To: <20180313141414.00007d11@intel.com>



On 03/13/2018 02:14 PM, Jesse Brandeburg wrote:
> On Tue, 13 Mar 2018 12:17:10 -0700
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> Yes, this is a recurring mistake
>>
>> See commit
>> bf909456f6a89654cb65c01fe83a4f9b133bf978 Revert "net: hns3: Add packet
>> statistics of netdev"
> 
> Thanks for the pointer, that was a useful thread to review.  I
> understand the point that was made about not having the netdev stats
> shown in ethtool -S.  We definitely do provide per-queue stats as well
> as these regular stats in ethtool -S.
> 
> I do remember from the past discussions that it *is* useless for the
> driver to keep internally any stats that were already stored via the
> get_stats NDO, and we missed it in the internal review that this driver
> was doing that, so that will be fixed.
> 
> Maybe it's just that I've been doing this too long, but I regularly
> (and many other customers/users do as well) depend on the ethtool stats
> being atomically updated w.r.t. each other.  This means that if I'm
> getting the over rx_packets, as well as the per-queue rx_packets, and I
> read them all at once from the driver with ethtool, then I can check
> that things are working as expected.
> 
> If I have to gather the netdev stats from /proc/net/dev (which iproute2
> tool shows the /proc/net/dev stats?) and somehow atomically gather the
> ethtool -S stats.  What's the user to do in this brave new world where
> ethtool doesn't at least have rx/tx bytes and packets?

I do not believe ethtool -S provides an atomic snapshot of counters anyway.

Maybe some drivers/NIC implement such a feature, but it is not documented.

In any case, that could be implemented generically (by core networking 
stack), instead of being copied/pasted in many drivers, if this would be 
really needed.

core networking stack would append (or insert) ndo_stats.
(The atomicity rule if really needed would need to pass an extra struct 
rtnl_link_stats64 pointer to get_ethtool_stats() method)

^ permalink raw reply

* Possible tcp regression in 4.14.13..26
From: Timofey Titovets @ 2018-03-13 21:41 UTC (permalink / raw)
  To: netdev

Hi list,
We currently upgrade our servers from 4.14.13 to 4.14.26
And get some problems with random kernel oops by:
[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

Currently we assume that this caused by:
net: tcp: close sock if net namespace is exiting
[ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]

If that can helps we heavy use docker and ceph rbd.

I will write if we get more info or fix that.

Thanks.

-- 
Have a nice day,
Timofey.

^ permalink raw reply

* [PATCH v5 net-next 3/4] net: macb: Add phy-handle DT support
From: Brad Mouring @ 2018-03-13 21:32 UTC (permalink / raw)
  To: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Andrew Lunn
  Cc: Mark Rutland, netdev, Julia Cartwright, devicetree, Brad Mouring
In-Reply-To: <20180313213216.109153-1-brad.mouring@ni.com>

This optional binding (as described in the ethernet DT bindings doc)
directs the netdev to the phydev to use. This is useful for a phy
chip that has >1 phy in it, and two netdevs are using the same phy
chip (i.e. the second mac's phy lives on the first mac's MDIO bus)

The devicetree snippet would look something like this:

ethernet@feedf00d {
	...
	phy-handle = <&phy0> // the first netdev is physically wired to phy0
	...
	phy0: phy@0 {
		...
		reg = <0x0> // MDIO address 0
		...
	}
	phy1: phy@1 {
		...
		reg = <0x1> // MDIO address 1
		...
	}
...
}

ethernet@deadbeef {
	...
	phy-handle = <&phy1> // tells the driver to use phy1 on the
						 // first mac's mdio bus (it's wired thusly)
	...
}

The work done to add the phy_node in the first place (dacdbb4dfc1a1:
"net: macb: add fixed-link node support") will consume the
device_node (if found).

Signed-off-by: Brad Mouring <brad.mouring@ni.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index db1dc301bed3..d09bd43680b3 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -488,10 +488,12 @@ static int macb_mii_probe(struct net_device *dev)
 			}
 			bp->phy_node = of_node_get(np);
 		} else {
-			/* fallback to standard phy registration if no phy were
-			 * found during dt phy registration
+			bp->phy_node = of_parse_phandle(np, "phy-handle", 0);
+			/* fallback to standard phy registration if no
+			 * phy-handle was found nor any phy found during
+			 * dt phy registration
 			 */
-			if (!phy_find_first(bp->mii_bus)) {
+			if (!bp->phy_node && !phy_find_first(bp->mii_bus)) {
 				for (i = 0; i < PHY_MAX_ADDR; i++) {
 					struct phy_device *phydev;
 
-- 
2.16.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox