Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 07/14] net/mlx5: Use PTR_ERR_OR_ZERO rather than its implementation
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, zhong jiang, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: zhong jiang <zhongjiang@huawei.com>

PTR_ERR_OR_ZERO contains if(IS_ERR(...)) + PTR_ERR. It is better
to use it directly. hence just replace it.

Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 98d1f7a48304..da7555fdb4d5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -988,10 +988,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 					    &flow_act, dest, dest_ix);
 	mutex_unlock(&priv->fs.tc.t_lock);
 
-	if (IS_ERR(flow->rule[0]))
-		return PTR_ERR(flow->rule[0]);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(flow->rule[0]);
 }
 
 static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
-- 
2.21.0


^ permalink raw reply related

* [net-next 08/14] net/mlx5e: kTLS, Remove unused function parameter
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Tariq Toukan, Eran Ben Elisha,
	Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

SKB parameter is no longer used in tx_post_resync_dump(),
remove it.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index e5222d17df35..d195366461c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -256,8 +256,7 @@ struct mlx5e_dump_wqe {
 };
 
 static int
-tx_post_resync_dump(struct mlx5e_txqsq *sq, struct sk_buff *skb,
-		    skb_frag_t *frag, u32 tisn, bool first)
+tx_post_resync_dump(struct mlx5e_txqsq *sq, skb_frag_t *frag, u32 tisn, bool first)
 {
 	struct mlx5_wqe_ctrl_seg *cseg;
 	struct mlx5_wqe_data_seg *dseg;
@@ -371,8 +370,7 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx,
 	tx_post_resync_params(sq, priv_tx, info.rcd_sn);
 
 	for (i = 0; i < info.nr_frags; i++)
-		if (tx_post_resync_dump(sq, skb, info.frags[i],
-					priv_tx->tisn, !i))
+		if (tx_post_resync_dump(sq, info.frags[i], priv_tx->tisn, !i))
 			goto err_out;
 
 	/* If no dump WQE was sent, we need to have a fence NOP WQE before the
-- 
2.21.0


^ permalink raw reply related

* [net-next 10/14] net/mlx5: DR, Remove useless set memory to zero use memset()
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Wei Yongjun, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Wei Yongjun <weiyongjun1@huawei.com>

The memory return by kzalloc() has already be set to zero, so
remove useless memset(0).

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index ef0dea44f3b3..5df8436b2ae3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -899,7 +899,6 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
 		goto clean_qp;
 	}

-	memset(dmn->send_ring->buf, 0, size);
 	dmn->send_ring->buf_size = size;

 	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
-- 
2.21.0

^ permalink raw reply related

* [net-next 11/14] net/mlx5: DR, Fix error return code in dr_domain_init_resources()
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Wei Yongjun, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Wei Yongjun <weiyongjun1@huawei.com>

Fix to return negative error code -ENOMEM from the error handling
case instead of 0, as done elsewhere in this function.

Fixes: 4ec9e7b02697 ("net/mlx5: DR, Expose steering domain functionality")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
index 3b9cf0bccf4d..461cc2c30538 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
@@ -66,6 +66,7 @@ static int dr_domain_init_resources(struct mlx5dr_domain *dmn)
 	dmn->uar = mlx5_get_uars_page(dmn->mdev);
 	if (!dmn->uar) {
 		mlx5dr_err(dmn, "Couldn't allocate UAR\n");
+		ret = -ENOMEM;
 		goto clean_pd;
 	}
 
@@ -73,6 +74,7 @@ static int dr_domain_init_resources(struct mlx5dr_domain *dmn)
 	if (!dmn->ste_icm_pool) {
 		mlx5dr_err(dmn, "Couldn't get icm memory for %s\n",
 			   dev_name(dmn->mdev->device));
+		ret = -ENOMEM;
 		goto clean_uar;
 	}
 
@@ -80,6 +82,7 @@ static int dr_domain_init_resources(struct mlx5dr_domain *dmn)
 	if (!dmn->action_icm_pool) {
 		mlx5dr_err(dmn, "Couldn't get action icm memory for %s\n",
 			   dev_name(dmn->mdev->device));
+		ret = -ENOMEM;
 		goto free_ste_icm_pool;
 	}
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 09/14] net/mlx5e: Remove unnecessary clear_bit()s
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Maxim Mikityanskiy, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Maxim Mikityanskiy <maximmi@mellanox.com>

Don't clear MLX5E_SQ_STATE_ENABLED on error in mlx5e_open_txqsq and
mlx5e_open_icosq, because it's not set there, and is 0 by default.

Fixes: acc6c5953af1 ("net/mlx5e: Split open/close channels to stages")
Fixes: 9d18b5144a0a ("net/mlx5e: Split open/close ICOSQ into stages")
Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index dadadf221087..cd51cd56484b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1315,7 +1315,6 @@ static int mlx5e_open_txqsq(struct mlx5e_channel *c,
 	return 0;
 
 err_free_txqsq:
-	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
 	mlx5e_free_txqsq(sq);
 
 	return err;
@@ -1403,7 +1402,6 @@ int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
 	return 0;
 
 err_free_icosq:
-	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
 	mlx5e_free_icosq(sq);
 
 	return err;
-- 
2.21.0


^ permalink raw reply related

* [net-next 12/14] net/mlx5: DR, Remove redundant dev_name print from err log
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

mlx5_core_err already prints the name of the device.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../mellanox/mlx5/core/steering/dr_domain.c       | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
index 461cc2c30538..5b24732b18c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c
@@ -72,24 +72,21 @@ static int dr_domain_init_resources(struct mlx5dr_domain *dmn)
 
 	dmn->ste_icm_pool = mlx5dr_icm_pool_create(dmn, DR_ICM_TYPE_STE);
 	if (!dmn->ste_icm_pool) {
-		mlx5dr_err(dmn, "Couldn't get icm memory for %s\n",
-			   dev_name(dmn->mdev->device));
+		mlx5dr_err(dmn, "Couldn't get icm memory\n");
 		ret = -ENOMEM;
 		goto clean_uar;
 	}
 
 	dmn->action_icm_pool = mlx5dr_icm_pool_create(dmn, DR_ICM_TYPE_MODIFY_ACTION);
 	if (!dmn->action_icm_pool) {
-		mlx5dr_err(dmn, "Couldn't get action icm memory for %s\n",
-			   dev_name(dmn->mdev->device));
+		mlx5dr_err(dmn, "Couldn't get action icm memory\n");
 		ret = -ENOMEM;
 		goto free_ste_icm_pool;
 	}
 
 	ret = mlx5dr_send_ring_alloc(dmn);
 	if (ret) {
-		mlx5dr_err(dmn, "Couldn't create send-ring for %s\n",
-			   dev_name(dmn->mdev->device));
+		mlx5dr_err(dmn, "Couldn't create send-ring\n");
 		goto free_action_icm_pool;
 	}
 
@@ -312,16 +309,14 @@ mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type)
 					    dmn->info.caps.log_icm_size);
 
 	if (!dmn->info.supp_sw_steering) {
-		mlx5dr_err(dmn, "SW steering not supported for %s\n",
-			   dev_name(mdev->device));
+		mlx5dr_err(dmn, "SW steering is not supported\n");
 		goto uninit_caps;
 	}
 
 	/* Allocate resources */
 	ret = dr_domain_init_resources(dmn);
 	if (ret) {
-		mlx5dr_err(dmn, "Failed init domain resources for %s\n",
-			   dev_name(mdev->device));
+		mlx5dr_err(dmn, "Failed init domain resources\n");
 		goto uninit_caps;
 	}
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 13/14] net/mlx5: Expose HW capability bits for port buffer per priority congestion counters
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Aya Levin, Moshe Shemesh, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Map capability bit indicating that HCA supports port buffer's congestion
counters. Also map registers with the corresponding counters.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 29 ++++++++++++++++++++++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8dd081051a79..f3773e8536bb 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1316,6 +1316,7 @@ enum {
 	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
 	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11,
 	MLX5_PHYSICAL_LAYER_COUNTERS_GROUP    = 0x12,
+	MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP = 0x13,
 	MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16,
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7d65c0578ac9..a487b681b516 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1196,7 +1196,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         rts2rts_qp_counters_set_id[0x1];
 	u8         reserved_at_16a[0x2];
 	u8         vnic_env_int_rq_oob[0x1];
-	u8         reserved_at_16d[0x2];
+	u8         sbcam_reg[0x1];
+	u8         reserved_at_16e[0x1];
 	u8         qcam_reg[0x1];
 	u8         gid_table_size[0x10];
 
@@ -1960,12 +1961,28 @@ struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits {
 	u8         port_xmit_wait[0x20];
 };
 
-struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits {
+struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits {
 	u8         transmit_queue_high[0x20];
 
 	u8         transmit_queue_low[0x20];
 
-	u8         reserved_at_40[0x780];
+	u8         no_buffer_discard_uc_high[0x20];
+
+	u8         no_buffer_discard_uc_low[0x20];
+
+	u8         reserved_at_80[0x740];
+};
+
+struct mlx5_ifc_eth_per_tc_congest_prio_grp_data_layout_bits {
+	u8         wred_discard_high[0x20];
+
+	u8         wred_discard_low[0x20];
+
+	u8         ecn_marked_tc_high[0x20];
+
+	u8         ecn_marked_tc_low[0x20];
+
+	u8         reserved_at_80[0x740];
 };
 
 struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
@@ -3642,7 +3659,8 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout;
 	struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout;
 	struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout;
-	struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
+	struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits eth_per_tc_prio_grp_data_layout;
+	struct mlx5_ifc_eth_per_tc_congest_prio_grp_data_layout_bits eth_per_tc_congest_prio_grp_data_layout;
 	struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout;
 	struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
 	struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs;
@@ -9422,7 +9440,8 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout;
 	struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout;
 	struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout;
-	struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
+	struct mlx5_ifc_eth_per_tc_prio_grp_data_layout_bits eth_per_tc_prio_grp_data_layout;
+	struct mlx5_ifc_eth_per_tc_congest_prio_grp_data_layout_bits eth_per_tc_congest_prio_grp_data_layout;
 	struct mlx5_ifc_lane_2_module_mapping_bits lane_2_module_mapping;
 	struct mlx5_ifc_pamp_reg_bits pamp_reg;
 	struct mlx5_ifc_paos_reg_bits paos_reg;
-- 
2.21.0


^ permalink raw reply related

* [net-next 14/14] net/mlx5e: Add port buffer's congestion counters
From: Saeed Mahameed @ 2019-09-05 21:51 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Aya Levin, Moshe Shemesh, Saeed Mahameed
In-Reply-To: <20190905215034.22713-1-saeedm@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Add 3 counters per priority to ethtool using PPCNT:
1) rx_prio[p]_buf_discard - the number of packets discarded by device
   due to lack of per host receive buffers
2) rx_prio[p]_cong_discard - the number of packets discarded by device
   due to per host congestion
3) rx_prio[p]_marked - the number of packets ECN marked by device due
   to per host congestion

Signed-off-by: Aya Levin <ayal@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/en_stats.c    | 149 +++++++++++++++++-
 .../ethernet/mellanox/mlx5/core/en_stats.h    |   2 +
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index f1065e78086a..ac6fdcda7019 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -981,6 +981,147 @@ static void mlx5e_grp_pcie_update_stats(struct mlx5e_priv *priv)
 	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
 }
 
+#define PPORT_PER_TC_PRIO_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_per_tc_prio_grp_data_layout.c##_high)
+
+static const struct counter_desc pport_per_tc_prio_stats_desc[] = {
+	{ "rx_prio%d_buf_discard", PPORT_PER_TC_PRIO_OFF(no_buffer_discard_uc) },
+};
+
+#define NUM_PPORT_PER_TC_PRIO_COUNTERS	ARRAY_SIZE(pport_per_tc_prio_stats_desc)
+
+#define PPORT_PER_TC_CONGEST_PRIO_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_per_tc_congest_prio_grp_data_layout.c##_high)
+
+static const struct counter_desc pport_per_tc_congest_prio_stats_desc[] = {
+	{ "rx_prio%d_cong_discard", PPORT_PER_TC_CONGEST_PRIO_OFF(wred_discard) },
+	{ "rx_prio%d_marked", PPORT_PER_TC_CONGEST_PRIO_OFF(ecn_marked_tc) },
+};
+
+#define NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS \
+	ARRAY_SIZE(pport_per_tc_congest_prio_stats_desc)
+
+static int mlx5e_grp_per_tc_prio_get_num_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return 0;
+
+	return NUM_PPORT_PER_TC_PRIO_COUNTERS * NUM_PPORT_PRIO;
+}
+
+static int mlx5e_grp_per_port_buffer_congest_fill_strings(struct mlx5e_priv *priv,
+							  u8 *data, int idx)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int i, prio;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return idx;
+
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		for (i = 0; i < NUM_PPORT_PER_TC_PRIO_COUNTERS; i++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				pport_per_tc_prio_stats_desc[i].format, prio);
+		for (i = 0; i < NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS; i++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				pport_per_tc_congest_prio_stats_desc[i].format, prio);
+	}
+
+	return idx;
+}
+
+static int mlx5e_grp_per_port_buffer_congest_fill_stats(struct mlx5e_priv *priv,
+							u64 *data, int idx)
+{
+	struct mlx5e_pport_stats *pport = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int i, prio;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return idx;
+
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		for (i = 0; i < NUM_PPORT_PER_TC_PRIO_COUNTERS; i++)
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&pport->per_tc_prio_counters[prio],
+						    pport_per_tc_prio_stats_desc, i);
+		for (i = 0; i < NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS ; i++)
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&pport->per_tc_congest_prio_counters[prio],
+						    pport_per_tc_congest_prio_stats_desc, i);
+	}
+
+	return idx;
+}
+
+static void mlx5e_grp_per_tc_prio_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+	int prio;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return;
+
+	MLX5_SET(ppcnt_reg, in, pnat, 2);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP);
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		out = pstats->per_tc_prio_counters[prio];
+		MLX5_SET(ppcnt_reg, in, prio_tc, prio);
+		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+	}
+}
+
+static int mlx5e_grp_per_tc_congest_prio_get_num_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return 0;
+
+	return NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS * NUM_PPORT_PRIO;
+}
+
+static void mlx5e_grp_per_tc_congest_prio_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+	int prio;
+
+	if (!MLX5_CAP_GEN(mdev, sbcam_reg))
+		return;
+
+	MLX5_SET(ppcnt_reg, in, pnat, 2);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP);
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		out = pstats->per_tc_congest_prio_counters[prio];
+		MLX5_SET(ppcnt_reg, in, prio_tc, prio);
+		mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+	}
+}
+
+static int mlx5e_grp_per_port_buffer_congest_get_num_stats(struct mlx5e_priv *priv)
+{
+	return mlx5e_grp_per_tc_prio_get_num_stats(priv) +
+		mlx5e_grp_per_tc_congest_prio_get_num_stats(priv);
+}
+
+static void mlx5e_grp_per_port_buffer_congest_update_stats(struct mlx5e_priv *priv)
+{
+	mlx5e_grp_per_tc_prio_update_stats(priv);
+	mlx5e_grp_per_tc_congest_prio_update_stats(priv);
+}
+
 #define PPORT_PER_PRIO_OFF(c) \
 	MLX5_BYTE_OFF(ppcnt_reg, \
 		      counter_set.eth_per_prio_grp_data_layout.c##_high)
@@ -1610,7 +1751,13 @@ const struct mlx5e_stats_grp mlx5e_stats_grps[] = {
 		.get_num_stats = mlx5e_grp_channels_get_num_stats,
 		.fill_strings = mlx5e_grp_channels_fill_strings,
 		.fill_stats = mlx5e_grp_channels_fill_stats,
-	}
+	},
+	{
+		.get_num_stats = mlx5e_grp_per_port_buffer_congest_get_num_stats,
+		.fill_strings = mlx5e_grp_per_port_buffer_congest_fill_strings,
+		.fill_stats = mlx5e_grp_per_port_buffer_congest_fill_stats,
+		.update_stats = mlx5e_grp_per_port_buffer_congest_update_stats,
+	},
 };
 
 const int mlx5e_num_stats_grps = ARRAY_SIZE(mlx5e_stats_grps);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index c281e567711d..79f261bf86ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -207,6 +207,8 @@ struct mlx5e_pport_stats {
 	__be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 phy_statistical_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
 	__be64 eth_ext_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 per_tc_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 per_tc_congest_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
 };
 
 #define PCIE_PERF_GET(pcie_stats, c) \
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net] net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list
From: Willem de Bruijn @ 2019-09-05 21:51 UTC (permalink / raw)
  To: Shmulik Ladkani
  Cc: Daniel Borkmann, Eric Dumazet, eyal, netdev, Shmulik Ladkani,
	Alexander Duyck
In-Reply-To: <20190905183633.8144-1-shmulik.ladkani@gmail.com>

On Thu, Sep 5, 2019 at 2:36 PM Shmulik Ladkani <shmulik@metanetworks.com> wrote:
>
> Historically, support for frag_list packets entering skb_segment() was
> limited to frag_list members terminating on exact same gso_size
> boundaries. This is verified with a BUG_ON since commit 89319d3801d1
> ("net: Add frag_list support to skb_segment"), quote:
>
>     As such we require all frag_list members terminate on exact MSS
>     boundaries.  This is checked using BUG_ON.
>     As there should only be one producer in the kernel of such packets,
>     namely GRO, this requirement should not be difficult to maintain.
>
> However, since commit 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper"),
> the "exact MSS boundaries" assumption no longer holds:
> An eBPF program using bpf_skb_change_proto() DOES modify 'gso_size', but
> leaves the frag_list members as originally merged by GRO with the
> original 'gso_size'. Example of such programs are bpf-based NAT46 or
> NAT64.
>
> This lead to a kernel BUG_ON for flows involving:
>  - GRO generating a frag_list skb
>  - bpf program performing bpf_skb_change_proto() or bpf_skb_adjust_room()
>  - skb_segment() of the skb
>
> See example BUG_ON reports in [0].
>
> In commit 13acc94eff12 ("net: permit skb_segment on head_frag frag_list skb"),
> skb_segment() was modified to support the "gso_size mangling" case of
> a frag_list GRO'ed skb, but *only* for frag_list members having
> head_frag==true (having a page-fragment head).
>
> Alas, GRO packets having frag_list members with a linear kmalloced head
> (head_frag==false) still hit the BUG_ON.
>
> This commit adds support to skb_segment() for a 'head_skb' packet having
> a frag_list whose members are *non* head_frag, with gso_size mangled, by
> disabling SG and thus falling-back to copying the data from the given
> 'head_skb' into the generated segmented skbs - as suggested by Willem de
> Bruijn [1].
>
> Since this approach involves the penalty of skb_copy_and_csum_bits()
> when building the segments, care was taken in order to enable this
> solution only when required:
>  - untrusted gso_size, by testing SKB_GSO_DODGY is set
>    (SKB_GSO_DODGY is set by any gso_size mangling functions in
>     net/core/filter.c)
>  - the frag_list is non empty, its item is a non head_frag, *and* the
>    headlen of the given 'head_skb' does not match the gso_size.
>
> [0]
> https://lore.kernel.org/netdev/20190826170724.25ff616f@pixies/
> https://lore.kernel.org/netdev/9265b93f-253d-6b8c-f2b8-4b54eff1835c@fb.com/
>
> [1]
> https://lore.kernel.org/netdev/CA+FuTSfVsgNDi7c=GUU8nMg2hWxF2SjCNLXetHeVPdnxAW5K-w@mail.gmail.com/
>
> Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
> Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> ---
>  net/core/skbuff.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index ea8e8d332d85..c4bd1881acff 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3678,6 +3678,24 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>         sg = !!(features & NETIF_F_SG);
>         csum = !!can_checksum_protocol(features, proto);
>
> +       if (mss != GSO_BY_FRAGS &&
> +           (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
> +               /* gso_size is untrusted.
> +                *
> +                * If head_skb has a frag_list with a linear non head_frag
> +                * item, and head_skb's headlen does not fit requested
> +                * gso_size, fall back to copying the skbs - by disabling sg.
> +                *
> +                * We assume checking the first frag suffices, i.e if either of
> +                * the frags have non head_frag data, then the first frag is
> +                * too.
> +                */
> +               if (list_skb && skb_headlen(list_skb) && !list_skb->head_frag &&
> +                   (mss != skb_headlen(head_skb) - doffset)) {

I thought the idea was to check skb_headlen(list_skb), as that is the
cause of the problem. Is skb_headlen(head_skb) a good predictor of
that? I can certainly imagine that it is, just not sure.

Thanks for preparing the patch, and explaining the problem and
solution clearly in the commit message. I'm pretty sure I'll have
forgotten the finer details next time we have to look at this
function again.

^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-09-05 22:00 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, nicolas.dichtel@6wind.com, Alexei Starovoitov,
	luto@amacapital.net, davem@davemloft.net, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <acc09eaf-5289-9457-3ce1-f27efb6013b8@iogearbox.net>

On Thu, Sep 05, 2019 at 10:37:03AM +0200, Daniel Borkmann wrote:
> On 9/4/19 5:21 PM, Alexei Starovoitov wrote:
> > On 9/4/19 8:16 AM, Daniel Borkmann wrote:
> > > opening/creating BPF maps" error="Unable to create map
> > > /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted"
> > > subsys=daemon
> > > 2019-09-04T14:11:47.28178666Z level=fatal msg="Error while creating
> > > daemon" error="Unable to create map
> > > /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted"
> > > subsys=daemon
> > 
> > Ok. We have to include caps in both cap_sys_admin and cap_bpf then.
> > 
> > > And /same/ deployment with reverted patches, hence no CAP_BPF gets it up
> > > and running again:
> > > 
> > > # kubectl get pods --all-namespaces -o wide
> > 
> > Can you share what this magic commands do underneath?
> 
> What do you mean by magic commands? Latter is showing all pods in the cluster:
> 
> https://kubernetes.io/docs/reference/kubectl/cheatsheet/#viewing-finding-resources

"magic" in a sense that I have no idea how k8s "services" and "pods" map
to kernel namespaces.

> Checking moby go code, it seems to exec with GetAllCapabilities which returns
> all of the capabilities it is aware of, and afaics, they seem to be using
> the below go library to get the hard-coded list from where obviously CAP_BPF
> is unknown which might also explain the breakage I've been seeing:
> 
> https://github.com/syndtr/gocapability/blob/33e07d32887e1e06b7c025f27ce52f62c7990bc0/capability/enum_gen.go

thanks for the link.
That library is much better written than libcap.
capability_linux.go is reading cap_last_cap dynamically and it can understand
proposed CAP_BPF, CAP_TRACING without need o be recompiled (unlike libcap).
So passing new caps to k8s should be trivial. The library won't know
their names, but passing by number looks to be already supported.
I'm still not sure which part of k8s setup clears the caps and
why this v2 set doesn't work, but that doesn't matter any more.
I believe I addressed this compat issue in v3 set.
Could you please give a try just repeating your previous commands?


^ permalink raw reply

* RE: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC
From: Haiyang Zhang @ 2019-09-05 23:07 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: sashal@kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, KY Srinivasan, Stephen Hemminger,
	olaf@aepfle.de, vkuznets, davem@davemloft.net,
	linux-kernel@vger.kernel.org, Mark Bloch
In-Reply-To: <20190830160451.43a61cf9@cakuba.netronome.com>



> -----Original Message-----
> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> Sent: Friday, August 30, 2019 7:05 PM
> To: Haiyang Zhang <haiyangz@microsoft.com>
> Cc: sashal@kernel.org; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; KY Srinivasan <kys@microsoft.com>; Stephen
> Hemminger <sthemmin@microsoft.com>; olaf@aepfle.de; vkuznets
> <vkuznets@redhat.com>; davem@davemloft.net; linux-
> kernel@vger.kernel.org; Mark Bloch <markb@mellanox.com>
> Subject: Re: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF
> NIC
> 
> On Fri, 30 Aug 2019 03:45:38 +0000, Haiyang Zhang wrote:
> > VF NIC may go down then come up during host servicing events. This
> > causes the VF NIC offloading feature settings to roll back to the
> > defaults. This patch can synchronize features from synthetic NIC to
> > the VF NIC during ndo_set_features (ethtool -K), and
> > netvsc_register_vf when VF comes back after host events.
> >
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > Cc: Mark Bloch <markb@mellanox.com>
> 
> If we want to make this change in behaviour we should change net_failover
> at the same time.

After checking the net_failover, I found it's for virtio based SRIOV, and very 
different from what we did for Hyper-V based SRIOV.

We let the netvsc driver acts as both the synthetic (PV) driver and the transparent 
bonding master for the VF NIC. But net_failover acts as a master device on top 
of both virtio PV NIC, and VF NIC. And the net_failover doesn't implemented 
operations, like ndo_set_features.
So the code change for our netvsc driver cannot be applied to net_failover driver.

I will re-submit my two patches (fixing the extra tab in the 1st one as you pointed 
out). Thanks!

- Haiyang


^ permalink raw reply

* [PATCH net-next,v2, 0/2] Enable sg as tunable, sync offload settings to VF NIC
From: Haiyang Zhang @ 2019-09-05 23:22 UTC (permalink / raw)
  To: sashal@kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf@aepfle.de,
	vkuznets, davem@davemloft.net, linux-kernel@vger.kernel.org

This patch set fixes an issue in SG tuning, and sync
offload settings from synthetic NIC to VF NIC.

Haiyang Zhang (2):
  hv_netvsc: hv_netvsc: Allow scatter-gather feature to be tunable
  hv_netvsc: Sync offloading features to VF NIC

 drivers/net/hyperv/hyperv_net.h   |  2 +-
 drivers/net/hyperv/netvsc_drv.c   | 26 ++++++++++++++++++++++----
 drivers/net/hyperv/rndis_filter.c |  1 +
 3 files changed, 24 insertions(+), 5 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH net-next,v2, 1/2] hv_netvsc: Allow scatter-gather feature to be tunable
From: Haiyang Zhang @ 2019-09-05 23:23 UTC (permalink / raw)
  To: sashal@kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf@aepfle.de,
	vkuznets, davem@davemloft.net, linux-kernel@vger.kernel.org
In-Reply-To: <1567725722-33552-1-git-send-email-haiyangz@microsoft.com>

In a previous patch, the NETIF_F_SG was missing after the code changes.
That caused the SG feature to be "fixed". This patch includes it into
hw_features, so it is tunable again.

Fixes: 23312a3be999 ("netvsc: negotiate checksum and segmentation parameters")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/netvsc_drv.c   | 4 ++--
 drivers/net/hyperv/rndis_filter.c | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index ecc9af0..670ef68 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -822,7 +822,7 @@ struct nvsp_message {
 
 #define NETVSC_SUPPORTED_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | \
 				      NETIF_F_TSO | NETIF_F_IPV6_CSUM | \
-				      NETIF_F_TSO6 | NETIF_F_LRO)
+				      NETIF_F_TSO6 | NETIF_F_LRO | NETIF_F_SG)
 
 #define VRSS_SEND_TAB_SIZE 16  /* must be power of 2 */
 #define VRSS_CHANNEL_MAX 64
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 0a6cd2f..1f1192e 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2313,8 +2313,8 @@ static int netvsc_probe(struct hv_device *dev,
 
 	/* hw_features computed in rndis_netdev_set_hwcaps() */
 	net->features = net->hw_features |
-		NETIF_F_HIGHDMA | NETIF_F_SG |
-		NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+		NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX |
+		NETIF_F_HW_VLAN_CTAG_RX;
 	net->vlan_features = net->features;
 
 	netdev_lockdep_set_classes(net);
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 317dbe9..abaf815 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1207,6 +1207,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device,
 
 	/* Compute tx offload settings based on hw capabilities */
 	net->hw_features |= NETIF_F_RXCSUM;
+	net->hw_features |= NETIF_F_SG;
 
 	if ((hwcaps.csum.ip4_txcsum & NDIS_TXCSUM_ALL_TCP4) == NDIS_TXCSUM_ALL_TCP4) {
 		/* Can checksum TCP */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v2, 2/2] hv_netvsc: Sync offloading features to VF NIC
From: Haiyang Zhang @ 2019-09-05 23:23 UTC (permalink / raw)
  To: sashal@kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger, olaf@aepfle.de,
	vkuznets, davem@davemloft.net, linux-kernel@vger.kernel.org,
	Mark Bloch
In-Reply-To: <1567725722-33552-1-git-send-email-haiyangz@microsoft.com>

VF NIC may go down then come up during host servicing events. This
causes the VF NIC offloading feature settings to roll back to the
defaults. This patch can synchronize features from synthetic NIC to
the VF NIC during ndo_set_features (ethtool -K),
and netvsc_register_vf when VF comes back after host events.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Mark Bloch <markb@mellanox.com>
---
 drivers/net/hyperv/netvsc_drv.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 1f1192e..39dddcd 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1785,13 +1785,15 @@ static int netvsc_set_features(struct net_device *ndev,
 	netdev_features_t change = features ^ ndev->features;
 	struct net_device_context *ndevctx = netdev_priv(ndev);
 	struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
+	struct net_device *vf_netdev = rtnl_dereference(ndevctx->vf_netdev);
 	struct ndis_offload_params offloads;
+	int ret = 0;
 
 	if (!nvdev || nvdev->destroy)
 		return -ENODEV;
 
 	if (!(change & NETIF_F_LRO))
-		return 0;
+		goto syncvf;
 
 	memset(&offloads, 0, sizeof(struct ndis_offload_params));
 
@@ -1803,7 +1805,19 @@ static int netvsc_set_features(struct net_device *ndev,
 		offloads.rsc_ip_v6 = NDIS_OFFLOAD_PARAMETERS_RSC_DISABLED;
 	}
 
-	return rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+	ret = rndis_filter_set_offload_params(ndev, nvdev, &offloads);
+
+	if (ret)
+		features ^= NETIF_F_LRO;
+
+syncvf:
+	if (!vf_netdev)
+		return ret;
+
+	vf_netdev->wanted_features = features;
+	netdev_update_features(vf_netdev);
+
+	return ret;
 }
 
 static u32 netvsc_get_msglevel(struct net_device *ndev)
@@ -2181,6 +2195,10 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 
 	dev_hold(vf_netdev);
 	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
+
+	vf_netdev->wanted_features = ndev->features;
+	netdev_update_features(vf_netdev);
+
 	return NOTIFY_OK;
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH bpf-next] kbuild: replace BASH-specific ${@:2} with shift and ${@}
From: Yonghong Song @ 2019-09-05 23:59 UTC (permalink / raw)
  To: Andrii Nakryiko, bpf@vger.kernel.org, netdev@vger.kernel.org,
	Alexei Starovoitov, daniel@iogearbox.net
  Cc: andrii.nakryiko@gmail.com, Kernel Team, Stephen Rothwell,
	Masahiro Yamada
In-Reply-To: <20190905175938.599455-1-andriin@fb.com>



On 9/5/19 10:59 AM, Andrii Nakryiko wrote:
> ${@:2} is BASH-specific extension, which makes link-vmlinux.sh rely on
> BASH. Use shift and ${@} instead to fix this issue.
> 
> Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
> Fixes: 341dfcf8d78e ("btf: expose BTF info through sysfs")
> Cc: Stephen Rothwell <sfr@canb.auug.org.au>
> Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
> Signed-off-by: Andrii Nakryiko <andriin@fb.com>

Tested with bash/sh/csh, all works.
Acked-by: Yonghong Song <yhs@fb.com>

> ---
>   scripts/link-vmlinux.sh | 16 +++++++++++-----
>   1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
> index 0d8f41db8cd6..8c59970a09dc 100755
> --- a/scripts/link-vmlinux.sh
> +++ b/scripts/link-vmlinux.sh
> @@ -57,12 +57,16 @@ modpost_link()
>   
>   # Link of vmlinux
>   # ${1} - output file
> -# ${@:2} - optional extra .o files
> +# ${2}, ${3}, ... - optional extra .o files
>   vmlinux_link()
>   {
>   	local lds="${objtree}/${KBUILD_LDS}"
> +	local output=${1}
>   	local objects
>   
> +	# skip output file argument
> +	shift
> +
>   	if [ "${SRCARCH}" != "um" ]; then
>   		objects="--whole-archive			\
>   			${KBUILD_VMLINUX_OBJS}			\
> @@ -70,9 +74,10 @@ vmlinux_link()
>   			--start-group				\
>   			${KBUILD_VMLINUX_LIBS}			\
>   			--end-group				\
> -			${@:2}"
> +			${@}"
>   
> -		${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux} -o ${1}	\
> +		${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux}	\
> +			-o ${output}				\
>   			-T ${lds} ${objects}
>   	else
>   		objects="-Wl,--whole-archive			\
> @@ -81,9 +86,10 @@ vmlinux_link()
>   			-Wl,--start-group			\
>   			${KBUILD_VMLINUX_LIBS}			\
>   			-Wl,--end-group				\
> -			${@:2}"
> +			${@}"
>   
> -		${CC} ${CFLAGS_vmlinux} -o ${1}			\
> +		${CC} ${CFLAGS_vmlinux}				\
> +			-o ${output}				\
>   			-Wl,-T,${lds}				\
>   			${objects}				\
>   			-lutil -lrt -lpthread
> 

^ permalink raw reply

* [PATCH net-next,v3 0/4] flow_offload: update mangle action representation
From: Pablo Neira Ayuso @ 2019-09-06  0:03 UTC (permalink / raw)
  To: netfilter-devel
  Cc: davem, netdev, jakub.kicinski, jiri, saeedm, vishal, vladbu

This patch updates the mangle action representation:

Patch 1) Undo bitwise NOT operation on the mangle mask (coming from tc
         pedit userspace).

Patch 2) mangle value &= mask from the front-end side.

Patch 3) adjust offset, length and coalesce consecutive pedit keys into
         one single action.

Patch 4) add support for payload mangling for netfilter.

After this patchset:

* Offset to payload does not need to be on the 32-bits boundaries anymore.
  This patchset adds front-end code to adjust the offset and length coming
  from the tc pedit representation, so drivers get an exact header field
  offset and length.

* This new front-end code coalesces consecutive pedit actions into one
  single action, so drivers can mangle IPv6 and ethernet address fields
  in one go, instead once for each 32-bit word.

On the driver side, diffstat -t shows that drivers code to deal with
payload mangling gets simplified:

        INSERTED,DELETED,MODIFIED,FILENAME
        46,116,0,drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c (-70 LOC)
        12,28,0,drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h (-16 LOC)
	26,54,0,drivers/net/ethernet/mellanox/mlx5/core/en_tc.c (-27 LOC)
        89,111,0,drivers/net/ethernet/netronome/nfp/flower/action.c (-17 LOC)

While, on the front-end side the balance is the following:

        123,22,0,net/sched/cls_api.c (+101 LOC)

Changes since v2:

* Fix is_action_keys_supported() breakage in mlx5 reported by Vlad Buslov.

Pablo Neira Ayuso (4):
  net: flow_offload: flip mangle action mask
  net: flow_offload: bitwise AND on mangle action value field
  net: flow_offload: mangle action at byte level
  netfilter: nft_payload: packet mangling offload support

 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 163 +++++-----------
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  40 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  80 +++-----
 drivers/net/ethernet/netronome/nfp/flower/action.c | 204 ++++++++++-----------
 include/net/flow_offload.h                         |   7 +-
 net/netfilter/nft_payload.c                        |  73 ++++++++
 net/sched/cls_api.c                                | 144 ++++++++++++---
 7 files changed, 378 insertions(+), 333 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next,v3 1/4] net: flow_offload: flip mangle action mask
From: Pablo Neira Ayuso @ 2019-09-06  0:04 UTC (permalink / raw)
  To: netfilter-devel
  Cc: davem, netdev, jakub.kicinski, jiri, saeedm, vishal, vladbu
In-Reply-To: <20190906000403.3701-1-pablo@netfilter.org>

Userspace tc pedit action performs a bitwise NOT operation on the mask.
All of the existing drivers in the tree undo this operation. Prepare the
mangle mask in the way the drivers expect from the
tc_setup_flow_action() function.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 12 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c      |  6 +++---
 drivers/net/ethernet/netronome/nfp/flower/action.c   |  8 ++++----
 net/sched/cls_api.c                                  |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index e447976bdd3e..2d26dbca701d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -275,7 +275,7 @@ static int cxgb4_validate_flow_match(struct net_device *dev,
 static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask,
 			  u8 field)
 {
-	u32 set_val = val & ~mask;
+	u32 set_val = val & mask;
 	u32 offset = 0;
 	u8 size = 1;
 	int i;
@@ -301,7 +301,7 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
 			offload_pedit(fs, val, mask, ETH_DMAC_31_0);
 			break;
 		case PEDIT_ETH_DMAC_47_32_SMAC_15_0:
-			if (~mask & PEDIT_ETH_DMAC_MASK)
+			if (mask & PEDIT_ETH_DMAC_MASK)
 				offload_pedit(fs, val, mask, ETH_DMAC_47_32);
 			else
 				offload_pedit(fs, val >> 16, mask >> 16,
@@ -353,7 +353,7 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
 	case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
 		switch (offset) {
 		case PEDIT_TCP_SPORT_DPORT:
-			if (~mask & PEDIT_TCP_UDP_SPORT_MASK)
+			if (mask & PEDIT_TCP_UDP_SPORT_MASK)
 				offload_pedit(fs, cpu_to_be32(val) >> 16,
 					      cpu_to_be32(mask) >> 16,
 					      TCP_SPORT);
@@ -366,7 +366,7 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
 	case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
 		switch (offset) {
 		case PEDIT_UDP_SPORT_DPORT:
-			if (~mask & PEDIT_TCP_UDP_SPORT_MASK)
+			if (mask & PEDIT_TCP_UDP_SPORT_MASK)
 				offload_pedit(fs, cpu_to_be32(val) >> 16,
 					      cpu_to_be32(mask) >> 16,
 					      UDP_SPORT);
@@ -510,7 +510,7 @@ static bool valid_pedit_action(struct net_device *dev,
 	case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
 		switch (offset) {
 		case PEDIT_TCP_SPORT_DPORT:
-			if (!valid_l4_mask(~mask)) {
+			if (!valid_l4_mask(mask)) {
 				netdev_err(dev, "%s: Unsupported mask for TCP L4 ports\n",
 					   __func__);
 				return false;
@@ -525,7 +525,7 @@ static bool valid_pedit_action(struct net_device *dev,
 	case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
 		switch (offset) {
 		case PEDIT_UDP_SPORT_DPORT:
-			if (!valid_l4_mask(~mask)) {
+			if (!valid_l4_mask(mask)) {
 				netdev_err(dev, "%s: Unsupported mask for UDP L4 ports\n",
 					   __func__);
 				return false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 30d26eba75a3..3db63cf41ee5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2509,7 +2509,7 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 	val = act->mangle.val;
 	offset = act->mangle.offset;
 
-	err = set_pedit_val(htype, ~mask, val, offset, &hdrs[cmd]);
+	err = set_pedit_val(htype, mask, val, offset, &hdrs[cmd]);
 	if (err)
 		goto out_err;
 
@@ -2609,7 +2609,7 @@ static bool is_action_keys_supported(const struct flow_action_entry *act)
 
 	htype = act->mangle.htype;
 	offset = act->mangle.offset;
-	mask = ~act->mangle.mask;
+	mask = act->mangle.mask;
 	/* For IPv4 & IPv6 header check 4 byte word,
 	 * to determine that modified fields
 	 * are NOT ttl & hop_limit only.
@@ -2733,7 +2733,7 @@ static int add_vlan_rewrite_action(struct mlx5e_priv *priv, int namespace,
 		.id = FLOW_ACTION_MANGLE,
 		.mangle.htype = FLOW_ACT_MANGLE_HDR_TYPE_ETH,
 		.mangle.offset = offsetof(struct vlan_ethhdr, h_vlan_TCI),
-		.mangle.mask = ~(u32)be16_to_cpu(*(__be16 *)&mask16),
+		.mangle.mask = (u32)be16_to_cpu(*(__be16 *)&mask16),
 		.mangle.val = (u32)be16_to_cpu(*(__be16 *)&val16),
 	};
 	u8 match_prio_mask, match_prio_val;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 1b019fdfcd97..ee0066a7ba87 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -495,7 +495,7 @@ nfp_fl_set_eth(const struct flow_action_entry *act, u32 off,
 		return -EOPNOTSUPP;
 	}
 
-	mask = ~act->mangle.mask;
+	mask = act->mangle.mask;
 	exact = act->mangle.val;
 
 	if (exact & ~mask) {
@@ -532,7 +532,7 @@ nfp_fl_set_ip4(const struct flow_action_entry *act, u32 off,
 	__be32 exact, mask;
 
 	/* We are expecting tcf_pedit to return a big endian value */
-	mask = (__force __be32)~act->mangle.mask;
+	mask = (__force __be32)act->mangle.mask;
 	exact = (__force __be32)act->mangle.val;
 
 	if (exact & ~mask) {
@@ -673,7 +673,7 @@ nfp_fl_set_ip6(const struct flow_action_entry *act, u32 off,
 	u8 word;
 
 	/* We are expecting tcf_pedit to return a big endian value */
-	mask = (__force __be32)~act->mangle.mask;
+	mask = (__force __be32)act->mangle.mask;
 	exact = (__force __be32)act->mangle.val;
 
 	if (exact & ~mask) {
@@ -713,7 +713,7 @@ nfp_fl_set_tport(const struct flow_action_entry *act, u32 off,
 		return -EOPNOTSUPP;
 	}
 
-	mask = ~act->mangle.mask;
+	mask = act->mangle.mask;
 	exact = act->mangle.val;
 
 	if (exact & ~mask) {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 671ca905dbb5..fbab004d0075 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3379,7 +3379,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 					goto err_out;
 				}
 				entry->mangle.htype = tcf_pedit_htype(act, k);
-				entry->mangle.mask = tcf_pedit_mask(act, k);
+				entry->mangle.mask = ~tcf_pedit_mask(act, k);
 				entry->mangle.val = tcf_pedit_val(act, k);
 				entry->mangle.offset = tcf_pedit_offset(act, k);
 				entry = &flow_action->entries[++j];
-- 
2.11.0


^ permalink raw reply related

* [PATCH net-next,v3 2/4] net: flow_offload: bitwise AND on mangle action value field
From: Pablo Neira Ayuso @ 2019-09-06  0:04 UTC (permalink / raw)
  To: netfilter-devel
  Cc: davem, netdev, jakub.kicinski, jiri, saeedm, vishal, vladbu
In-Reply-To: <20190906000403.3701-1-pablo@netfilter.org>

Drivers perform a bitwise AND on the value and the mask. Update
tc_setup_flow_action() to perform this operation so drivers do not need
to do this.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c      | 2 +-
 drivers/net/ethernet/netronome/nfp/flower/action.c   | 9 ++++-----
 net/sched/cls_api.c                                  | 3 ++-
 4 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 2d26dbca701d..5afc15a60199 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -275,7 +275,6 @@ static int cxgb4_validate_flow_match(struct net_device *dev,
 static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask,
 			  u8 field)
 {
-	u32 set_val = val & mask;
 	u32 offset = 0;
 	u8 size = 1;
 	int i;
@@ -287,7 +286,7 @@ static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask,
 			break;
 		}
 	}
-	memcpy((u8 *)fs + offset, &set_val, size);
+	memcpy((u8 *)fs + offset, &val, size);
 }
 
 static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 3db63cf41ee5..ec47e994b7e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2214,7 +2214,7 @@ static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset,
 		goto out_err;
 
 	*curr_pmask |= mask;
-	*curr_pval  |= (val & mask);
+	*curr_pval  |= val;
 
 	return 0;
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index ee0066a7ba87..592c36ba9e3f 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -477,7 +477,6 @@ static void nfp_fl_set_helper32(u32 value, u32 mask, u8 *p_exact, u8 *p_mask)
 	u32 oldvalue = get_unaligned((u32 *)p_exact);
 	u32 oldmask = get_unaligned((u32 *)p_mask);
 
-	value &= mask;
 	value |= oldvalue & ~mask;
 
 	put_unaligned(oldmask | mask, (u32 *)p_mask);
@@ -544,7 +543,7 @@ nfp_fl_set_ip4(const struct flow_action_entry *act, u32 off,
 	case offsetof(struct iphdr, daddr):
 		set_ip_addr->ipv4_dst_mask |= mask;
 		set_ip_addr->ipv4_dst &= ~mask;
-		set_ip_addr->ipv4_dst |= exact & mask;
+		set_ip_addr->ipv4_dst |= exact;
 		set_ip_addr->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS;
 		set_ip_addr->head.len_lw = sizeof(*set_ip_addr) >>
 					   NFP_FL_LW_SIZ;
@@ -552,7 +551,7 @@ nfp_fl_set_ip4(const struct flow_action_entry *act, u32 off,
 	case offsetof(struct iphdr, saddr):
 		set_ip_addr->ipv4_src_mask |= mask;
 		set_ip_addr->ipv4_src &= ~mask;
-		set_ip_addr->ipv4_src |= exact & mask;
+		set_ip_addr->ipv4_src |= exact;
 		set_ip_addr->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS;
 		set_ip_addr->head.len_lw = sizeof(*set_ip_addr) >>
 					   NFP_FL_LW_SIZ;
@@ -606,7 +605,7 @@ nfp_fl_set_ip6_helper(int opcode_tag, u8 word, __be32 exact, __be32 mask,
 {
 	ip6->ipv6[word].mask |= mask;
 	ip6->ipv6[word].exact &= ~mask;
-	ip6->ipv6[word].exact |= exact & mask;
+	ip6->ipv6[word].exact |= exact;
 
 	ip6->reserved = cpu_to_be16(0);
 	ip6->head.jump_id = opcode_tag;
@@ -651,7 +650,7 @@ nfp_fl_set_ip6_hop_limit_flow_label(u32 off, __be32 exact, __be32 mask,
 
 		ip_hl_fl->ipv6_label_mask |= mask;
 		ip_hl_fl->ipv6_label &= ~mask;
-		ip_hl_fl->ipv6_label |= exact & mask;
+		ip_hl_fl->ipv6_label |= exact;
 		break;
 	}
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index fbab004d0075..e30a151d8527 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3380,7 +3380,8 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 				}
 				entry->mangle.htype = tcf_pedit_htype(act, k);
 				entry->mangle.mask = ~tcf_pedit_mask(act, k);
-				entry->mangle.val = tcf_pedit_val(act, k);
+				entry->mangle.val = tcf_pedit_val(act, k) &
+							entry->mangle.mask;
 				entry->mangle.offset = tcf_pedit_offset(act, k);
 				entry = &flow_action->entries[++j];
 			}
-- 
2.11.0


^ permalink raw reply related

* [PATCH net-next,v3 4/4] netfilter: nft_payload: packet mangling offload support
From: Pablo Neira Ayuso @ 2019-09-06  0:04 UTC (permalink / raw)
  To: netfilter-devel
  Cc: davem, netdev, jakub.kicinski, jiri, saeedm, vishal, vladbu
In-Reply-To: <20190906000403.3701-1-pablo@netfilter.org>

This patch allows for mangling packet fields using hardware offload
infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_payload.c | 73 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 22a80eb60222..0efa8bfd2b51 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -562,12 +562,85 @@ static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr
 	return -1;
 }
 
+static int nft_payload_offload_set_nh(struct nft_offload_ctx *ctx,
+				      struct nft_flow_rule *flow,
+				      const struct nft_payload_set *priv)
+{
+	int type = FLOW_ACT_MANGLE_UNSPEC;
+
+	switch (ctx->dep.l3num) {
+	case htons(ETH_P_IP):
+		type = FLOW_ACT_MANGLE_HDR_TYPE_IP4;
+		break;
+	case htons(ETH_P_IPV6):
+		type = FLOW_ACT_MANGLE_HDR_TYPE_IP6;
+		break;
+	}
+
+	return type;
+}
+
+static int nft_payload_offload_set_th(struct nft_offload_ctx *ctx,
+				      struct nft_flow_rule *flow,
+				      const struct nft_payload_set *priv)
+{
+	int type = FLOW_ACT_MANGLE_UNSPEC;
+
+	switch (ctx->dep.protonum) {
+	case IPPROTO_TCP:
+		type = FLOW_ACT_MANGLE_HDR_TYPE_TCP;
+		break;
+	case IPPROTO_UDP:
+		type = FLOW_ACT_MANGLE_HDR_TYPE_UDP;
+		break;
+	}
+
+	return type;
+}
+
+static int nft_payload_set_offload(struct nft_offload_ctx *ctx,
+				   struct nft_flow_rule *flow,
+				   const struct nft_expr *expr)
+{
+	const struct nft_payload_set *priv = nft_expr_priv(expr);
+	struct nft_offload_reg *sreg = &ctx->regs[priv->sreg];
+	int type = FLOW_ACT_MANGLE_UNSPEC;
+	struct flow_action_entry *entry;
+
+	switch (priv->base) {
+	case NFT_PAYLOAD_LL_HEADER:
+		type = FLOW_ACT_MANGLE_HDR_TYPE_ETH;
+		break;
+	case NFT_PAYLOAD_NETWORK_HEADER:
+		type = nft_payload_offload_set_nh(ctx, flow, priv);
+		break;
+	case NFT_PAYLOAD_TRANSPORT_HEADER:
+		type = nft_payload_offload_set_th(ctx, flow, priv);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	entry = &flow->rule->action.entries[ctx->num_actions++];
+	entry->id		= FLOW_ACTION_MANGLE;
+	entry->mangle.htype	= type;
+	entry->mangle.offset	= priv->offset;
+	entry->mangle.len	= priv->len;
+
+	memcpy(entry->mangle.val, sreg->data.data, priv->len);
+	memset(entry->mangle.mask, 0xff, priv->len);
+
+	return type != FLOW_ACT_MANGLE_UNSPEC ? 0 : -EOPNOTSUPP;
+}
+
 static const struct nft_expr_ops nft_payload_set_ops = {
 	.type		= &nft_payload_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
 	.eval		= nft_payload_set_eval,
 	.init		= nft_payload_set_init,
 	.dump		= nft_payload_set_dump,
+	.offload	= nft_payload_set_offload,
 };
 
 static const struct nft_expr_ops *
-- 
2.11.0


^ permalink raw reply related

* [PATCH net-next,v3 3/4] net: flow_offload: mangle action at byte level
From: Pablo Neira Ayuso @ 2019-09-06  0:04 UTC (permalink / raw)
  To: netfilter-devel
  Cc: davem, netdev, jakub.kicinski, jiri, saeedm, vishal, vladbu
In-Reply-To: <20190906000403.3701-1-pablo@netfilter.org>

The flow mangle action is originally modeled after the tc pedit action,
this has a number of shortcomings:

1) The tc pedit offset must be set on the 32-bits boundaries. Many
   protocol header field offsets are not aligned to 32-bits, eg. port
   destination, port source and ethernet destination. This patch adjusts
   the offset accordingly and trim off length in these case, so drivers get
   an exact offset and length to the header fields.

2) The maximum mangle length is one word of 32-bits, hence you need to
   up to four actions to mangle an IPv6 address. This patch coalesces
   consecutive tc pedit actions into one single action so drivers can
   configure the IPv6 mangling in one go. Ethernet address fields now
   require one single action instead of two too.

The following drivers have been updated accordingly to use this new
mangle action layout:

1) The cxgb4 driver does not need to split protocol field matching
   larger than one 32-bit words into multiple definitions. Instead one
   single definition per protocol field is enough. Checking for
   transport protocol ports is also simplified.

2) The mlx5 driver logic to disallow IPv4 ttl and IPv6 hoplimit fields
   becomes more simple too.

3) The nfp driver uses the nfp_fl_set_helper() function to configure the
   payload mangling. The memchr_inv() function is used to check for
   proper initialization of the value and mask. The driver has been
   updated to refer to the exact protocol header offsets too.

As a result, this patch reduces code complexity on the driver side at
the cost of adding ~100 LOC at the core to perform offset and length
adjustment; and to coalesce consecutive actions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 162 +++++-----------
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h   |  40 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  80 +++-----
 drivers/net/ethernet/netronome/nfp/flower/action.c | 203 ++++++++++-----------
 include/net/flow_offload.h                         |   7 +-
 net/sched/cls_api.c                                | 145 ++++++++++++---
 6 files changed, 305 insertions(+), 332 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 5afc15a60199..ba1ced08e41c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -44,20 +44,12 @@
 #define STATS_CHECK_PERIOD (HZ / 2)
 
 static struct ch_tc_pedit_fields pedits[] = {
-	PEDIT_FIELDS(ETH_, DMAC_31_0, 4, dmac, 0),
-	PEDIT_FIELDS(ETH_, DMAC_47_32, 2, dmac, 4),
-	PEDIT_FIELDS(ETH_, SMAC_15_0, 2, smac, 0),
-	PEDIT_FIELDS(ETH_, SMAC_47_16, 4, smac, 2),
+	PEDIT_FIELDS(ETH_, DMAC, 6, dmac, 0),
+	PEDIT_FIELDS(ETH_, SMAC, 6, smac, 0),
 	PEDIT_FIELDS(IP4_, SRC, 4, nat_fip, 0),
 	PEDIT_FIELDS(IP4_, DST, 4, nat_lip, 0),
-	PEDIT_FIELDS(IP6_, SRC_31_0, 4, nat_fip, 0),
-	PEDIT_FIELDS(IP6_, SRC_63_32, 4, nat_fip, 4),
-	PEDIT_FIELDS(IP6_, SRC_95_64, 4, nat_fip, 8),
-	PEDIT_FIELDS(IP6_, SRC_127_96, 4, nat_fip, 12),
-	PEDIT_FIELDS(IP6_, DST_31_0, 4, nat_lip, 0),
-	PEDIT_FIELDS(IP6_, DST_63_32, 4, nat_lip, 4),
-	PEDIT_FIELDS(IP6_, DST_95_64, 4, nat_lip, 8),
-	PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12),
+	PEDIT_FIELDS(IP6_, SRC, 16, nat_fip, 0),
+	PEDIT_FIELDS(IP6_, DST, 16, nat_lip, 0),
 	PEDIT_FIELDS(TCP_, SPORT, 2, nat_fport, 0),
 	PEDIT_FIELDS(TCP_, DPORT, 2, nat_lport, 0),
 	PEDIT_FIELDS(UDP_, SPORT, 2, nat_fport, 0),
@@ -272,8 +264,8 @@ static int cxgb4_validate_flow_match(struct net_device *dev,
 	return 0;
 }
 
-static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask,
-			  u8 field)
+static void offload_pedit(struct ch_filter_specification *fs,
+			  struct flow_action_entry *act, u8 field)
 {
 	u32 offset = 0;
 	u8 size = 1;
@@ -286,92 +278,68 @@ static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask,
 			break;
 		}
 	}
-	memcpy((u8 *)fs + offset, &val, size);
+	memcpy((u8 *)fs + offset, act->mangle.val, size);
 }
 
-static void process_pedit_field(struct ch_filter_specification *fs, u32 val,
-				u32 mask, u32 offset, u8 htype)
+static void process_pedit_field(struct ch_filter_specification *fs,
+				struct flow_action_entry *act)
 {
+	u32 offset = act->mangle.offset;
+	u8 htype = act->mangle.htype;
+
 	switch (htype) {
 	case FLOW_ACT_MANGLE_HDR_TYPE_ETH:
 		switch (offset) {
-		case PEDIT_ETH_DMAC_31_0:
+		case PEDIT_ETH_DMAC:
 			fs->newdmac = 1;
-			offload_pedit(fs, val, mask, ETH_DMAC_31_0);
-			break;
-		case PEDIT_ETH_DMAC_47_32_SMAC_15_0:
-			if (mask & PEDIT_ETH_DMAC_MASK)
-				offload_pedit(fs, val, mask, ETH_DMAC_47_32);
-			else
-				offload_pedit(fs, val >> 16, mask >> 16,
-					      ETH_SMAC_15_0);
+			offload_pedit(fs, act, ETH_DMAC);
 			break;
-		case PEDIT_ETH_SMAC_47_16:
+		case PEDIT_ETH_SMAC:
 			fs->newsmac = 1;
-			offload_pedit(fs, val, mask, ETH_SMAC_47_16);
+			offload_pedit(fs, act, ETH_SMAC);
+			break;
 		}
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_IP4:
 		switch (offset) {
 		case PEDIT_IP4_SRC:
-			offload_pedit(fs, val, mask, IP4_SRC);
+			offload_pedit(fs, act, IP4_SRC);
 			break;
 		case PEDIT_IP4_DST:
-			offload_pedit(fs, val, mask, IP4_DST);
+			offload_pedit(fs, act, IP4_DST);
 		}
 		fs->nat_mode = NAT_MODE_ALL;
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_IP6:
 		switch (offset) {
-		case PEDIT_IP6_SRC_31_0:
-			offload_pedit(fs, val, mask, IP6_SRC_31_0);
-			break;
-		case PEDIT_IP6_SRC_63_32:
-			offload_pedit(fs, val, mask, IP6_SRC_63_32);
-			break;
-		case PEDIT_IP6_SRC_95_64:
-			offload_pedit(fs, val, mask, IP6_SRC_95_64);
-			break;
-		case PEDIT_IP6_SRC_127_96:
-			offload_pedit(fs, val, mask, IP6_SRC_127_96);
-			break;
-		case PEDIT_IP6_DST_31_0:
-			offload_pedit(fs, val, mask, IP6_DST_31_0);
+		case PEDIT_IP6_SRC:
+			offload_pedit(fs, act, IP6_SRC);
 			break;
-		case PEDIT_IP6_DST_63_32:
-			offload_pedit(fs, val, mask, IP6_DST_63_32);
+		case PEDIT_IP6_DST:
+			offload_pedit(fs, act, IP6_DST);
 			break;
-		case PEDIT_IP6_DST_95_64:
-			offload_pedit(fs, val, mask, IP6_DST_95_64);
-			break;
-		case PEDIT_IP6_DST_127_96:
-			offload_pedit(fs, val, mask, IP6_DST_127_96);
 		}
 		fs->nat_mode = NAT_MODE_ALL;
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
 		switch (offset) {
-		case PEDIT_TCP_SPORT_DPORT:
-			if (mask & PEDIT_TCP_UDP_SPORT_MASK)
-				offload_pedit(fs, cpu_to_be32(val) >> 16,
-					      cpu_to_be32(mask) >> 16,
-					      TCP_SPORT);
-			else
-				offload_pedit(fs, cpu_to_be32(val),
-					      cpu_to_be32(mask), TCP_DPORT);
+		case PEDIT_TCP_SPORT:
+			offload_pedit(fs, act, TCP_SPORT);
+			break;
+		case PEDIT_TCP_DPORT:
+			offload_pedit(fs, act, TCP_DPORT);
+			break;
 		}
 		fs->nat_mode = NAT_MODE_ALL;
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
 		switch (offset) {
-		case PEDIT_UDP_SPORT_DPORT:
-			if (mask & PEDIT_TCP_UDP_SPORT_MASK)
-				offload_pedit(fs, cpu_to_be32(val) >> 16,
-					      cpu_to_be32(mask) >> 16,
-					      UDP_SPORT);
-			else
-				offload_pedit(fs, cpu_to_be32(val),
-					      cpu_to_be32(mask), UDP_DPORT);
+		case PEDIT_UDP_SPORT:
+			offload_pedit(fs, act, UDP_SPORT);
+			break;
+		case PEDIT_UDP_DPORT:
+			offload_pedit(fs, act, UDP_DPORT);
+			break;
 		}
 		fs->nat_mode = NAT_MODE_ALL;
 	}
@@ -424,17 +392,8 @@ static void cxgb4_process_flow_actions(struct net_device *in,
 			}
 			}
 			break;
-		case FLOW_ACTION_MANGLE: {
-			u32 mask, val, offset;
-			u8 htype;
-
-			htype = act->mangle.htype;
-			mask = act->mangle.mask;
-			val = act->mangle.val;
-			offset = act->mangle.offset;
-
-			process_pedit_field(fs, val, mask, offset, htype);
-			}
+		case FLOW_ACTION_MANGLE:
+			process_pedit_field(fs, act);
 			break;
 		default:
 			break;
@@ -442,35 +401,20 @@ static void cxgb4_process_flow_actions(struct net_device *in,
 	}
 }
 
-static bool valid_l4_mask(u32 mask)
-{
-	u16 hi, lo;
-
-	/* Either the upper 16-bits (SPORT) OR the lower
-	 * 16-bits (DPORT) can be set, but NOT BOTH.
-	 */
-	hi = (mask >> 16) & 0xFFFF;
-	lo = mask & 0xFFFF;
-
-	return hi && lo ? false : true;
-}
-
 static bool valid_pedit_action(struct net_device *dev,
 			       const struct flow_action_entry *act)
 {
-	u32 mask, offset;
+	u32 offset;
 	u8 htype;
 
 	htype = act->mangle.htype;
-	mask = act->mangle.mask;
 	offset = act->mangle.offset;
 
 	switch (htype) {
 	case FLOW_ACT_MANGLE_HDR_TYPE_ETH:
 		switch (offset) {
-		case PEDIT_ETH_DMAC_31_0:
-		case PEDIT_ETH_DMAC_47_32_SMAC_15_0:
-		case PEDIT_ETH_SMAC_47_16:
+		case PEDIT_ETH_DMAC:
+		case PEDIT_ETH_SMAC:
 			break;
 		default:
 			netdev_err(dev, "%s: Unsupported pedit field\n",
@@ -491,14 +435,8 @@ static bool valid_pedit_action(struct net_device *dev,
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_IP6:
 		switch (offset) {
-		case PEDIT_IP6_SRC_31_0:
-		case PEDIT_IP6_SRC_63_32:
-		case PEDIT_IP6_SRC_95_64:
-		case PEDIT_IP6_SRC_127_96:
-		case PEDIT_IP6_DST_31_0:
-		case PEDIT_IP6_DST_63_32:
-		case PEDIT_IP6_DST_95_64:
-		case PEDIT_IP6_DST_127_96:
+		case PEDIT_IP6_SRC:
+		case PEDIT_IP6_DST:
 			break;
 		default:
 			netdev_err(dev, "%s: Unsupported pedit field\n",
@@ -508,12 +446,8 @@ static bool valid_pedit_action(struct net_device *dev,
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
 		switch (offset) {
-		case PEDIT_TCP_SPORT_DPORT:
-			if (!valid_l4_mask(mask)) {
-				netdev_err(dev, "%s: Unsupported mask for TCP L4 ports\n",
-					   __func__);
-				return false;
-			}
+		case PEDIT_TCP_SPORT:
+		case PEDIT_TCP_DPORT:
 			break;
 		default:
 			netdev_err(dev, "%s: Unsupported pedit field\n",
@@ -523,12 +457,8 @@ static bool valid_pedit_action(struct net_device *dev,
 		break;
 	case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
 		switch (offset) {
-		case PEDIT_UDP_SPORT_DPORT:
-			if (!valid_l4_mask(mask)) {
-				netdev_err(dev, "%s: Unsupported mask for UDP L4 ports\n",
-					   __func__);
-				return false;
-			}
+		case PEDIT_UDP_SPORT:
+		case PEDIT_UDP_DPORT:
 			break;
 		default:
 			netdev_err(dev, "%s: Unsupported pedit field\n",
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
index eb4c95248baf..03892755a18f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h
@@ -55,23 +55,14 @@ struct ch_tc_flower_entry {
 };
 
 enum {
-	ETH_DMAC_31_0,	/* dmac bits 0.. 31 */
-	ETH_DMAC_47_32,	/* dmac bits 32..47 */
-	ETH_SMAC_15_0,	/* smac bits 0.. 15 */
-	ETH_SMAC_47_16,	/* smac bits 16..47 */
+	ETH_DMAC,	/* 48-bits dmac bits */
+	ETH_SMAC,	/* 48-bits smac bits */
 
 	IP4_SRC,	/* 32-bit IPv4 src  */
 	IP4_DST,	/* 32-bit IPv4 dst  */
 
-	IP6_SRC_31_0,	/* src bits 0..  31 */
-	IP6_SRC_63_32,	/* src bits 63.. 32 */
-	IP6_SRC_95_64,	/* src bits 95.. 64 */
-	IP6_SRC_127_96,	/* src bits 127..96 */
-
-	IP6_DST_31_0,	/* dst bits 0..  31 */
-	IP6_DST_63_32,	/* dst bits 63.. 32 */
-	IP6_DST_95_64,	/* dst bits 95.. 64 */
-	IP6_DST_127_96,	/* dst bits 127..96 */
+	IP6_SRC,	/* 128-bit IPv6 src */
+	IP6_DST,	/* 128-bit IPv6 dst */
 
 	TCP_SPORT,	/* 16-bit TCP sport */
 	TCP_DPORT,	/* 16-bit TCP dport */
@@ -90,23 +81,16 @@ struct ch_tc_pedit_fields {
 	{ type## field, size, \
 		offsetof(struct ch_filter_specification, fs_field) + (offset) }
 
-#define PEDIT_ETH_DMAC_MASK		0xffff
-#define PEDIT_TCP_UDP_SPORT_MASK	0xffff
-#define PEDIT_ETH_DMAC_31_0		0x0
-#define PEDIT_ETH_DMAC_47_32_SMAC_15_0	0x4
-#define PEDIT_ETH_SMAC_47_16		0x8
+#define PEDIT_ETH_DMAC			0x0
+#define PEDIT_ETH_SMAC			0x6
+#define PEDIT_IP6_SRC			0x8
+#define PEDIT_IP6_DST			0x18
 #define PEDIT_IP4_SRC			0xC
 #define PEDIT_IP4_DST			0x10
-#define PEDIT_IP6_SRC_31_0		0x8
-#define PEDIT_IP6_SRC_63_32		0xC
-#define PEDIT_IP6_SRC_95_64		0x10
-#define PEDIT_IP6_SRC_127_96		0x14
-#define PEDIT_IP6_DST_31_0		0x18
-#define PEDIT_IP6_DST_63_32		0x1C
-#define PEDIT_IP6_DST_95_64		0x20
-#define PEDIT_IP6_DST_127_96		0x24
-#define PEDIT_TCP_SPORT_DPORT		0x0
-#define PEDIT_UDP_SPORT_DPORT		0x0
+#define PEDIT_TCP_SPORT			0x0
+#define PEDIT_TCP_DPORT			0x2
+#define PEDIT_UDP_SPORT			0x0
+#define PEDIT_UDP_DPORT			0x2
 
 int cxgb4_tc_flower_replace(struct net_device *dev,
 			    struct flow_cls_offload *cls);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ec47e994b7e0..0b8e34fb8a0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2202,19 +2202,24 @@ static int pedit_header_offsets[] = {
 
 #define pedit_header(_ph, _htype) ((void *)(_ph) + pedit_header_offsets[_htype])
 
-static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset,
+static int set_pedit_val(u8 hdr_type, const struct flow_action_entry *act,
 			 struct pedit_headers_action *hdrs)
 {
-	u32 *curr_pmask, *curr_pval;
+	u32 offset = act->mangle.offset;
+	u8 *curr_pmask, *curr_pval;
+	int i;
 
-	curr_pmask = (u32 *)(pedit_header(&hdrs->masks, hdr_type) + offset);
-	curr_pval  = (u32 *)(pedit_header(&hdrs->vals, hdr_type) + offset);
+	curr_pmask = (u8 *)(pedit_header(&hdrs->masks, hdr_type) + offset);
+	curr_pval  = (u8 *)(pedit_header(&hdrs->vals, hdr_type) + offset);
 
-	if (*curr_pmask & mask)  /* disallow acting twice on the same location */
-		goto out_err;
+	for (i = 0; i < act->mangle.len; i++) {
+		/* disallow acting twice on the same location */
+		if (curr_pmask[i] & act->mangle.mask[i])
+			goto out_err;
 
-	*curr_pmask |= mask;
-	*curr_pval  |= val;
+		curr_pmask[i] |= act->mangle.mask[i];
+		curr_pval[i] |= act->mangle.val[i];
+	}
 
 	return 0;
 
@@ -2488,7 +2493,6 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 {
 	u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1;
 	int err = -EOPNOTSUPP;
-	u32 mask, val, offset;
 	u8 htype;
 
 	htype = act->mangle.htype;
@@ -2505,11 +2509,7 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 		goto out_err;
 	}
 
-	mask = act->mangle.mask;
-	val = act->mangle.val;
-	offset = act->mangle.offset;
-
-	err = set_pedit_val(htype, mask, val, offset, &hdrs[cmd]);
+	err = set_pedit_val(htype, act, &hdrs[cmd]);
 	if (err)
 		goto out_err;
 
@@ -2590,49 +2590,19 @@ static bool csum_offload_supported(struct mlx5e_priv *priv,
 	return true;
 }
 
-struct ip_ttl_word {
-	__u8	ttl;
-	__u8	protocol;
-	__sum16	check;
-};
-
-struct ipv6_hoplimit_word {
-	__be16	payload_len;
-	__u8	nexthdr;
-	__u8	hop_limit;
-};
-
 static bool is_action_keys_supported(const struct flow_action_entry *act)
 {
-	u32 mask, offset;
-	u8 htype;
+	u32 offset = act->mangle.offset;
+	u8 htype = act->mangle.htype;
 
-	htype = act->mangle.htype;
-	offset = act->mangle.offset;
-	mask = act->mangle.mask;
-	/* For IPv4 & IPv6 header check 4 byte word,
-	 * to determine that modified fields
-	 * are NOT ttl & hop_limit only.
-	 */
 	if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP4) {
-		struct ip_ttl_word *ttl_word =
-			(struct ip_ttl_word *)&mask;
-
-		if (offset != offsetof(struct iphdr, ttl) ||
-		    ttl_word->protocol ||
-		    ttl_word->check) {
+		if (offset != offsetof(struct iphdr, ttl))
 			return true;
-		}
 	} else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP6) {
-		struct ipv6_hoplimit_word *hoplimit_word =
-			(struct ipv6_hoplimit_word *)&mask;
-
-		if (offset != offsetof(struct ipv6hdr, payload_len) ||
-		    hoplimit_word->payload_len ||
-		    hoplimit_word->nexthdr) {
+		if (offset != offsetof(struct ipv6hdr, hop_limit))
 			return true;
-		}
 	}
+
 	return false;
 }
 
@@ -2727,19 +2697,21 @@ static int add_vlan_rewrite_action(struct mlx5e_priv *priv, int namespace,
 				   struct pedit_headers_action *hdrs,
 				   u32 *action, struct netlink_ext_ack *extack)
 {
-	u16 mask16 = VLAN_VID_MASK;
-	u16 val16 = act->vlan.vid & VLAN_VID_MASK;
-	const struct flow_action_entry pedit_act = {
+	__be16 mask16 = htons(VLAN_VID_MASK);
+	__be16 val16 = htons(act->vlan.vid & VLAN_VID_MASK);
+	struct flow_action_entry pedit_act = {
 		.id = FLOW_ACTION_MANGLE,
 		.mangle.htype = FLOW_ACT_MANGLE_HDR_TYPE_ETH,
 		.mangle.offset = offsetof(struct vlan_ethhdr, h_vlan_TCI),
-		.mangle.mask = (u32)be16_to_cpu(*(__be16 *)&mask16),
-		.mangle.val = (u32)be16_to_cpu(*(__be16 *)&val16),
+		.mangle.len = 2,
 	};
 	u8 match_prio_mask, match_prio_val;
 	void *headers_c, *headers_v;
 	int err;
 
+	memcpy(pedit_act.mangle.mask, &mask16, sizeof(__be16));
+	memcpy(pedit_act.mangle.val, &val16, sizeof(__be16));
+
 	headers_c = get_match_headers_criteria(*action, &parse_attr->spec);
 	headers_v = get_match_headers_value(*action, &parse_attr->spec);
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 592c36ba9e3f..a525ec244b79 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -472,38 +472,44 @@ nfp_fl_set_ipv4_tun(struct nfp_app *app, struct nfp_fl_set_ipv4_tun *set_tun,
 	return 0;
 }
 
-static void nfp_fl_set_helper32(u32 value, u32 mask, u8 *p_exact, u8 *p_mask)
+static void nfp_fl_set_helper(const u8 *value, const u8 *mask,
+			      u8 *p_exact, u8 *p_mask, int len)
 {
-	u32 oldvalue = get_unaligned((u32 *)p_exact);
-	u32 oldmask = get_unaligned((u32 *)p_mask);
+	int i;
 
-	value |= oldvalue & ~mask;
-
-	put_unaligned(oldmask | mask, (u32 *)p_mask);
-	put_unaligned(value, (u32 *)p_exact);
+	for (i = 0; i < len; i++) {
+		p_exact[i] = (p_exact[i] & ~mask[i]) | value[i];
+		p_mask[i] |= mask[i];
+	}
 }
 
 static int
 nfp_fl_set_eth(const struct flow_action_entry *act, u32 off,
 	       struct nfp_fl_set_eth *set_eth, struct netlink_ext_ack *extack)
 {
-	u32 exact, mask;
+	int i;
 
-	if (off + 4 > ETH_ALEN * 2) {
+	switch (off) {
+	case offsetof(struct ethhdr, h_dest):
+		i = 0;
+		break;
+	case offsetof(struct ethhdr, h_source):
+		i = ETH_ALEN;
+		break;
+	default:
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit ethernet action");
 		return -EOPNOTSUPP;
 	}
 
-	mask = act->mangle.mask;
-	exact = act->mangle.val;
-
-	if (exact & ~mask) {
+	if (memchr_inv(&act->mangle.val, 0, act->mangle.len) &&
+	    !memchr_inv(&act->mangle.mask, 0, act->mangle.len)) {
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit ethernet action");
 		return -EOPNOTSUPP;
 	}
 
-	nfp_fl_set_helper32(exact, mask, &set_eth->eth_addr_val[off],
-			    &set_eth->eth_addr_mask[off]);
+	nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+			  &set_eth->eth_addr_val[i],
+			  &set_eth->eth_addr_mask[i], act->mangle.len);
 
 	set_eth->reserved = cpu_to_be16(0);
 	set_eth->head.jump_id = NFP_FL_ACTION_OPCODE_SET_ETHERNET;
@@ -524,68 +530,46 @@ nfp_fl_set_ip4(const struct flow_action_entry *act, u32 off,
 	       struct nfp_fl_set_ip4_ttl_tos *set_ip_ttl_tos,
 	       struct netlink_ext_ack *extack)
 {
-	struct ipv4_ttl_word *ttl_word_mask;
-	struct ipv4_ttl_word *ttl_word;
-	struct iphdr *tos_word_mask;
-	struct iphdr *tos_word;
-	__be32 exact, mask;
-
-	/* We are expecting tcf_pedit to return a big endian value */
-	mask = (__force __be32)act->mangle.mask;
-	exact = (__force __be32)act->mangle.val;
-
-	if (exact & ~mask) {
+	if (memchr_inv(act->mangle.val, 0, act->mangle.len) &&
+	    !memchr_inv(act->mangle.mask, 0, act->mangle.len)) {
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv4 action");
 		return -EOPNOTSUPP;
 	}
 
 	switch (off) {
 	case offsetof(struct iphdr, daddr):
-		set_ip_addr->ipv4_dst_mask |= mask;
-		set_ip_addr->ipv4_dst &= ~mask;
-		set_ip_addr->ipv4_dst |= exact;
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  (u8 *)&set_ip_addr->ipv4_dst,
+				  (u8 *)&set_ip_addr->ipv4_dst_mask,
+				  act->mangle.len);
 		set_ip_addr->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS;
 		set_ip_addr->head.len_lw = sizeof(*set_ip_addr) >>
 					   NFP_FL_LW_SIZ;
 		break;
 	case offsetof(struct iphdr, saddr):
-		set_ip_addr->ipv4_src_mask |= mask;
-		set_ip_addr->ipv4_src &= ~mask;
-		set_ip_addr->ipv4_src |= exact;
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  (u8 *)&set_ip_addr->ipv4_src,
+				  (u8 *)&set_ip_addr->ipv4_src_mask,
+				  act->mangle.len);
 		set_ip_addr->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS;
 		set_ip_addr->head.len_lw = sizeof(*set_ip_addr) >>
 					   NFP_FL_LW_SIZ;
 		break;
 	case offsetof(struct iphdr, ttl):
-		ttl_word_mask = (struct ipv4_ttl_word *)&mask;
-		ttl_word = (struct ipv4_ttl_word *)&exact;
-
-		if (ttl_word_mask->protocol || ttl_word_mask->check) {
-			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv4 ttl action");
-			return -EOPNOTSUPP;
-		}
-
-		set_ip_ttl_tos->ipv4_ttl_mask |= ttl_word_mask->ttl;
-		set_ip_ttl_tos->ipv4_ttl &= ~ttl_word_mask->ttl;
-		set_ip_ttl_tos->ipv4_ttl |= ttl_word->ttl & ttl_word_mask->ttl;
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  (u8 *)&set_ip_ttl_tos->ipv4_ttl,
+				  (u8 *)&set_ip_ttl_tos->ipv4_ttl_mask,
+				  act->mangle.len);
 		set_ip_ttl_tos->head.jump_id =
 			NFP_FL_ACTION_OPCODE_SET_IPV4_TTL_TOS;
 		set_ip_ttl_tos->head.len_lw = sizeof(*set_ip_ttl_tos) >>
 					      NFP_FL_LW_SIZ;
 		break;
-	case round_down(offsetof(struct iphdr, tos), 4):
-		tos_word_mask = (struct iphdr *)&mask;
-		tos_word = (struct iphdr *)&exact;
-
-		if (tos_word_mask->version || tos_word_mask->ihl ||
-		    tos_word_mask->tot_len) {
-			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv4 tos action");
-			return -EOPNOTSUPP;
-		}
-
-		set_ip_ttl_tos->ipv4_tos_mask |= tos_word_mask->tos;
-		set_ip_ttl_tos->ipv4_tos &= ~tos_word_mask->tos;
-		set_ip_ttl_tos->ipv4_tos |= tos_word->tos & tos_word_mask->tos;
+	case offsetof(struct iphdr, tos):
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  (u8 *)&set_ip_ttl_tos->ipv4_tos,
+				  (u8 *)&set_ip_ttl_tos->ipv4_tos_mask,
+				  act->mangle.len);
 		set_ip_ttl_tos->head.jump_id =
 			NFP_FL_ACTION_OPCODE_SET_IPV4_TTL_TOS;
 		set_ip_ttl_tos->head.len_lw = sizeof(*set_ip_ttl_tos) >>
@@ -600,12 +584,17 @@ nfp_fl_set_ip4(const struct flow_action_entry *act, u32 off,
 }
 
 static void
-nfp_fl_set_ip6_helper(int opcode_tag, u8 word, __be32 exact, __be32 mask,
-		      struct nfp_fl_set_ipv6_addr *ip6)
+nfp_fl_set_ip6_helper(int opcode_tag, const struct flow_action_entry *act,
+		      struct nfp_fl_set_ipv6_addr *ip6,
+		      struct netlink_ext_ack *extack)
 {
-	ip6->ipv6[word].mask |= mask;
-	ip6->ipv6[word].exact &= ~mask;
-	ip6->ipv6[word].exact |= exact;
+	int i, j;
+
+	for (i = 0, j = 0; i < sizeof(struct in6_addr); i++, j += sizeof(u32)) {
+		nfp_fl_set_helper(&act->mangle.val[j], &act->mangle.mask[j],
+				  (u8 *)&ip6->ipv6[i].exact,
+				  (u8 *)&ip6->ipv6[i].mask, sizeof(u32));
+	}
 
 	ip6->reserved = cpu_to_be16(0);
 	ip6->head.jump_id = opcode_tag;
@@ -619,39 +608,34 @@ struct ipv6_hop_limit_word {
 };
 
 static int
-nfp_fl_set_ip6_hop_limit_flow_label(u32 off, __be32 exact, __be32 mask,
+nfp_fl_set_ip6_hop_limit_flow_label(u32 off,
+				    const struct flow_action_entry *act,
 				    struct nfp_fl_set_ipv6_tc_hl_fl *ip_hl_fl,
 				    struct netlink_ext_ack *extack)
 {
-	struct ipv6_hop_limit_word *fl_hl_mask;
-	struct ipv6_hop_limit_word *fl_hl;
-
 	switch (off) {
-	case offsetof(struct ipv6hdr, payload_len):
-		fl_hl_mask = (struct ipv6_hop_limit_word *)&mask;
-		fl_hl = (struct ipv6_hop_limit_word *)&exact;
-
-		if (fl_hl_mask->nexthdr || fl_hl_mask->payload_len) {
-			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv6 hop limit action");
-			return -EOPNOTSUPP;
-		}
-
-		ip_hl_fl->ipv6_hop_limit_mask |= fl_hl_mask->hop_limit;
-		ip_hl_fl->ipv6_hop_limit &= ~fl_hl_mask->hop_limit;
-		ip_hl_fl->ipv6_hop_limit |= fl_hl->hop_limit &
-					    fl_hl_mask->hop_limit;
+	case offsetof(struct ipv6hdr, hop_limit):
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  &ip_hl_fl->ipv6_hop_limit,
+				  &ip_hl_fl->ipv6_hop_limit_mask,
+				  act->mangle.len);
 		break;
-	case round_down(offsetof(struct ipv6hdr, flow_lbl), 4):
-		if (mask & ~IPV6_FLOW_LABEL_MASK ||
-		    exact & ~IPV6_FLOW_LABEL_MASK) {
+	case 1: /* offsetof(struct ipv6hdr, flow_lbl) */
+		/* The initial 4-bits are part of the traffic class field. */
+		if (act->mangle.val[0] & 0xf0 ||
+		    act->mangle.mask[0] & 0xf0) {
 			NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv6 flow label action");
 			return -EOPNOTSUPP;
 		}
 
-		ip_hl_fl->ipv6_label_mask |= mask;
-		ip_hl_fl->ipv6_label &= ~mask;
-		ip_hl_fl->ipv6_label |= exact;
+		nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+				  (u8 *)&ip_hl_fl->ipv6_label,
+				  (u8 *)&ip_hl_fl->ipv6_label_mask,
+				  act->mangle.len);
 		break;
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv6 action");
+		return -EOPNOTSUPP;
 	}
 
 	ip_hl_fl->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV6_TC_HL_FL;
@@ -667,31 +651,23 @@ nfp_fl_set_ip6(const struct flow_action_entry *act, u32 off,
 	       struct nfp_fl_set_ipv6_tc_hl_fl *ip_hl_fl,
 	       struct netlink_ext_ack *extack)
 {
-	__be32 exact, mask;
 	int err = 0;
-	u8 word;
 
-	/* We are expecting tcf_pedit to return a big endian value */
-	mask = (__force __be32)act->mangle.mask;
-	exact = (__force __be32)act->mangle.val;
-
-	if (exact & ~mask) {
+	if (memchr_inv(act->mangle.val, 0, act->mangle.len) &&
+	    !memchr_inv(act->mangle.mask, 0, act->mangle.len)) {
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit IPv6 action");
 		return -EOPNOTSUPP;
 	}
 
 	if (off < offsetof(struct ipv6hdr, saddr)) {
-		err = nfp_fl_set_ip6_hop_limit_flow_label(off, exact, mask,
-							  ip_hl_fl, extack);
-	} else if (off < offsetof(struct ipv6hdr, daddr)) {
-		word = (off - offsetof(struct ipv6hdr, saddr)) / sizeof(exact);
-		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, word,
-				      exact, mask, ip_src);
-	} else if (off < offsetof(struct ipv6hdr, daddr) +
-		       sizeof(struct in6_addr)) {
-		word = (off - offsetof(struct ipv6hdr, daddr)) / sizeof(exact);
-		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, word,
-				      exact, mask, ip_dst);
+		err = nfp_fl_set_ip6_hop_limit_flow_label(off, act, ip_hl_fl,
+							  extack);
+	} else if (off == offsetof(struct ipv6hdr, saddr)) {
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC,
+				      act, ip_src, extack);
+	} else if (off == offsetof(struct ipv6hdr, daddr)) {
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST,
+				      act, ip_dst, extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: pedit on unsupported section of IPv6 header");
 		return -EOPNOTSUPP;
@@ -705,23 +681,30 @@ nfp_fl_set_tport(const struct flow_action_entry *act, u32 off,
 		 struct nfp_fl_set_tport *set_tport, int opcode,
 		 struct netlink_ext_ack *extack)
 {
-	u32 exact, mask;
+	int i;
 
-	if (off) {
+	switch (off) {
+	case offsetof(struct tcphdr, source):
+		i = 0;
+		break;
+	case offsetof(struct tcphdr, dest):
+		i = sizeof(u16);
+		break;
+	default:
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: pedit on unsupported section of L4 header");
 		return -EOPNOTSUPP;
 	}
 
-	mask = act->mangle.mask;
-	exact = act->mangle.val;
-
-	if (exact & ~mask) {
+	if (memchr_inv(act->mangle.val, 0, act->mangle.len) &&
+	    !memchr_inv(act->mangle.mask, 0, act->mangle.len)) {
 		NL_SET_ERR_MSG_MOD(extack, "unsupported offload: invalid pedit L4 action");
 		return -EOPNOTSUPP;
 	}
 
-	nfp_fl_set_helper32(exact, mask, set_tport->tp_port_val,
-			    set_tport->tp_port_mask);
+	nfp_fl_set_helper(act->mangle.val, act->mangle.mask,
+			  &set_tport->tp_port_val[i],
+			  &set_tport->tp_port_mask[i],
+			  act->mangle.len);
 
 	set_tport->reserved = cpu_to_be16(0);
 	set_tport->head.jump_id = opcode;
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index fc881875f856..a2853b37dded 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -154,6 +154,8 @@ enum flow_action_mangle_base {
 	FLOW_ACT_MANGLE_HDR_TYPE_UDP,
 };
 
+#define FLOW_ACTION_MANGLE_MAXLEN	16
+
 struct flow_action_entry {
 	enum flow_action_id		id;
 	union {
@@ -167,8 +169,9 @@ struct flow_action_entry {
 		struct {				/* FLOW_ACTION_PACKET_EDIT */
 			enum flow_action_mangle_base htype;
 			u32		offset;
-			u32		mask;
-			u32		val;
+			u8		mask[FLOW_ACTION_MANGLE_MAXLEN];
+			u8		val[FLOW_ACTION_MANGLE_MAXLEN];
+			u8		len;
 		} mangle;
 		const struct ip_tunnel_info *tunnel;	/* FLOW_ACTION_TUNNEL_ENCAP */
 		u32			csum_flags;	/* FLOW_ACTION_CSUM */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e30a151d8527..468db33292a9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3289,11 +3289,128 @@ void tc_cleanup_flow_action(struct flow_action *flow_action)
 }
 EXPORT_SYMBOL(tc_cleanup_flow_action);
 
+static unsigned int flow_action_mangle_type(enum pedit_cmd cmd)
+{
+	switch (cmd) {
+	case TCA_PEDIT_KEY_EX_CMD_SET:
+		return FLOW_ACTION_MANGLE;
+	case TCA_PEDIT_KEY_EX_CMD_ADD:
+		return FLOW_ACTION_ADD;
+	default:
+		WARN_ON_ONCE(1);
+	}
+	return 0;
+}
+
+struct flow_action_mangle_ctx {
+	u8	cmd;
+	u8	offset;
+	u8	htype;
+	u8	idx;
+	u8	val[FLOW_ACTION_MANGLE_MAXLEN];
+	u8	mask[FLOW_ACTION_MANGLE_MAXLEN];
+	u32	first_word;
+	u32	last_word;
+};
+
+static void flow_action_mangle_entry(struct flow_action_entry *entry,
+				     const struct flow_action_mangle_ctx *ctx)
+{
+	u32 delta;
+
+	entry->id = ctx->cmd;
+	entry->mangle.htype = ctx->htype;
+	entry->mangle.offset = ctx->offset;
+	entry->mangle.len = ctx->idx;
+
+	/* Adjust offset. */
+	delta = sizeof(u32) - (fls(ctx->first_word) / BITS_PER_BYTE);
+	entry->mangle.offset += delta;
+
+	/* Adjust length. */
+	entry->mangle.len -= ((ffs(ctx->last_word) / BITS_PER_BYTE) + delta);
+
+	/* Copy value and mask from offset using the adjusted length. */
+	memcpy(entry->mangle.val, &ctx->val[delta], entry->mangle.len);
+	memcpy(entry->mangle.mask, &ctx->mask[delta], entry->mangle.len);
+}
+
+static void flow_action_mangle_ctx_update(struct flow_action_mangle_ctx *ctx,
+					  const struct tc_action *act, int k)
+{
+	u32 val, mask;
+
+	val = tcf_pedit_val(act, k);
+	mask = ~tcf_pedit_mask(act, k);
+
+	memcpy(&ctx->val[ctx->idx], &val, sizeof(u32));
+	memcpy(&ctx->mask[ctx->idx], &mask, sizeof(u32));
+	ctx->idx += sizeof(u32);
+}
+
+static void flow_action_mangle_ctx_init(struct flow_action_mangle_ctx *ctx,
+					const struct tc_action *act, int k)
+{
+	ctx->cmd = flow_action_mangle_type(tcf_pedit_cmd(act, k));
+	ctx->offset = tcf_pedit_offset(act, k);
+	ctx->htype = tcf_pedit_htype(act, k);
+	ctx->idx = 0;
+
+	ctx->first_word = ntohl(~tcf_pedit_mask(act, k));
+	ctx->last_word = ctx->first_word;
+
+	flow_action_mangle_ctx_update(ctx, act, k);
+}
+
+static int flow_action_mangle(struct flow_action *flow_action,
+			      struct flow_action_entry *entry,
+			      const struct tc_action *act, int j)
+{
+	struct flow_action_mangle_ctx ctx;
+	int k;
+
+	if (tcf_pedit_cmd(act, 0) > TCA_PEDIT_KEY_EX_CMD_ADD)
+		return -1;
+
+	flow_action_mangle_ctx_init(&ctx, act, 0);
+
+	/* Special case: one single 32-bits word. */
+	if (tcf_pedit_nkeys(act) == 1) {
+		flow_action_mangle_entry(entry, &ctx);
+		return j;
+	}
+
+	for (k = 1; k < tcf_pedit_nkeys(act); k++) {
+		if (tcf_pedit_cmd(act, k) > TCA_PEDIT_KEY_EX_CMD_ADD)
+			return -1;
+
+		/* Offset is contiguous and type is the same, coalesce. */
+		if (ctx.idx < FLOW_ACTION_MANGLE_MAXLEN &&
+		    ctx.offset + ctx.idx == tcf_pedit_offset(act, k) &&
+		    ctx.cmd == flow_action_mangle_type(tcf_pedit_cmd(act, k))) {
+			flow_action_mangle_ctx_update(&ctx, act, k);
+			continue;
+		}
+		ctx.last_word = ntohl(~tcf_pedit_mask(act, k - 1));
+
+		/* Cannot coalesce, set up this entry. */
+		flow_action_mangle_entry(entry, &ctx);
+
+		flow_action_mangle_ctx_init(&ctx, act, k);
+		entry = &flow_action->entries[++j];
+	}
+
+	ctx.last_word = ntohl(~tcf_pedit_mask(act, k - 1));
+	flow_action_mangle_entry(entry, &ctx);
+
+	return j;
+}
+
 int tc_setup_flow_action(struct flow_action *flow_action,
 			 const struct tcf_exts *exts, bool rtnl_held)
 {
 	const struct tc_action *act;
-	int i, j, k, err = 0;
+	int i, j, err = 0;
 
 	if (!exts)
 		return 0;
@@ -3366,25 +3483,9 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 		} else if (is_tcf_tunnel_release(act)) {
 			entry->id = FLOW_ACTION_TUNNEL_DECAP;
 		} else if (is_tcf_pedit(act)) {
-			for (k = 0; k < tcf_pedit_nkeys(act); k++) {
-				switch (tcf_pedit_cmd(act, k)) {
-				case TCA_PEDIT_KEY_EX_CMD_SET:
-					entry->id = FLOW_ACTION_MANGLE;
-					break;
-				case TCA_PEDIT_KEY_EX_CMD_ADD:
-					entry->id = FLOW_ACTION_ADD;
-					break;
-				default:
-					err = -EOPNOTSUPP;
-					goto err_out;
-				}
-				entry->mangle.htype = tcf_pedit_htype(act, k);
-				entry->mangle.mask = ~tcf_pedit_mask(act, k);
-				entry->mangle.val = tcf_pedit_val(act, k) &
-							entry->mangle.mask;
-				entry->mangle.offset = tcf_pedit_offset(act, k);
-				entry = &flow_action->entries[++j];
-			}
+			j = flow_action_mangle(flow_action, entry, act, j);
+			if (j < 0)
+				goto err_out;
 		} else if (is_tcf_csum(act)) {
 			entry->id = FLOW_ACTION_CSUM;
 			entry->csum_flags = tcf_csum_update_flags(act);
@@ -3439,9 +3540,9 @@ int tc_setup_flow_action(struct flow_action *flow_action,
 			goto err_out;
 		}
 
-		if (!is_tcf_pedit(act))
-			j++;
+		j++;
 	}
+	flow_action->num_entries = j;
 
 err_out:
 	if (!rtnl_held)
-- 
2.11.0


^ permalink raw reply related

* Re: [PATCH] kcm: use BPF_PROG_RUN
From: Yonghong Song @ 2019-09-06  0:07 UTC (permalink / raw)
  To: Sami Tolvanen, David S. Miller, Tom Herbert
  Cc: netdev@vger.kernel.org, bpf@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20190905211528.97828-1-samitolvanen@google.com>



On 9/5/19 2:15 PM, Sami Tolvanen wrote:
> Instead of invoking struct bpf_prog::bpf_func directly, use the
> BPF_PROG_RUN macro.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>

Acked-by: Yonghong Song <yhs@fb.com>

> ---
>   net/kcm/kcmsock.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
> index 5dbc0c48f8cb..f350c613bd7d 100644
> --- a/net/kcm/kcmsock.c
> +++ b/net/kcm/kcmsock.c
> @@ -379,7 +379,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
>   	struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
>   	struct bpf_prog *prog = psock->bpf_prog;
>   
> -	return (*prog->bpf_func)(skb, prog->insnsi);
> +	return BPF_PROG_RUN(prog, skb);
>   }
>   
>   static int kcm_read_sock_done(struct strparser *strp, int err)
> 

^ permalink raw reply

* Re: [PATCH] Revert "net: get rid of an signed integer overflow in ip_idents_reserve()"
From: Shaokun Zhang @ 2019-09-06  1:25 UTC (permalink / raw)
  To: Eric Dumazet, netdev, linux-kernel
  Cc: Yang Guo, David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jiri Pirko
In-Reply-To: <afa992f6-eb5e-8104-9a03-f992b184a6b6@gmail.com>

Hi Eric,

On 2019/7/26 17:58, Eric Dumazet wrote:
> 
> 
> On 7/26/19 11:17 AM, Shaokun Zhang wrote:
>> From: Yang Guo <guoyang2@huawei.com>
>>
>> There is an significant performance regression with the following
>> commit-id <adb03115f459>
>> ("net: get rid of an signed integer overflow in ip_idents_reserve()").
>>
>>
> 
> So, you jump around and took ownership of this issue, while some of us
> are already working on it ?
> 

Any update about this issue?

Thanks,
Shaokun

> Have you first checked that current UBSAN versions will not complain anymore ?
> 
> A revert adding back the original issue would be silly, performance of
> benchmarks is nice but secondary.
> 
> 
> 


^ permalink raw reply

* Re: WARNING in hso_free_net_device
From: Hui Peng @ 2019-09-06  2:05 UTC (permalink / raw)
  To: Andrey Konovalov
  Cc: Stephen Hemminger, syzbot+44d53c7255bb1aea22d2, alexios.zavras,
	David S. Miller, Greg Kroah-Hartman, LKML, USB list,
	Mathias Payer, netdev, rfontana, syzkaller-bugs, Thomas Gleixner,
	Oliver Neukum
In-Reply-To: <CAAeHK+y3eQ7bXvo1tiAkwLCsFkbSU5B+6hsKbdEzkSXP2_Jyzg@mail.gmail.com>



On 9/5/2019 7:24 AM, Andrey Konovalov wrote:
> On Thu, Sep 5, 2019 at 4:20 AM Hui Peng <benquike@gmail.com> wrote:
>>
>> Can you guys have  a look at the attached patch?
> 
> Let's try it:
> 
> #syz test: https://github.com/google/kasan.git eea39f24
> 
> FYI: there are two more reports coming from this driver, which might
> (or might not) have the same root cause. One of them has a suggested
> fix by Oliver.
> 
> https://syzkaller.appspot.com/bug?extid=67b2bd0e34f952d0321e
> https://syzkaller.appspot.com/bug?extid=93f2f45b19519b289613
> 

I think they are different, though similar.
This one is resulted from unregistering a network device.
These 2 are resulted from unregistering a tty device.

>>
>> On 9/4/19 6:41 PM, Stephen Hemminger wrote:
>>> On Wed, 4 Sep 2019 16:27:50 -0400
>>> Hui Peng <benquike@gmail.com> wrote:
>>>
>>>> Hi, all:
>>>>
>>>> I looked at the bug a little.
>>>>
>>>> The issue is that in the error handling code, hso_free_net_device
>>>> unregisters
>>>>
>>>> the net_device (hso_net->net)  by calling unregister_netdev. In the
>>>> error handling code path,
>>>>
>>>> hso_net->net has not been registered yet.
>>>>
>>>> I think there are two ways to solve the issue:
>>>>
>>>> 1. fix it in drivers/net/usb/hso.c to avoiding unregistering the
>>>> net_device when it is still not registered
>>>>
>>>> 2. fix it in unregister_netdev. We can add a field in net_device to
>>>> record whether it is registered, and make unregister_netdev return if
>>>> the net_device is not registered yet.
>>>>
>>>> What do you guys think ?
>>> #1

^ permalink raw reply

* RE: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self wakeup
From: Joakim Zhang @ 2019-09-06  2:11 UTC (permalink / raw)
  To: Sean Nyekjaer, mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
	Martin Hundebøll
In-Reply-To: <1655f342-7aaf-5e36-d141-d00eee84f3ec@geanix.com>

> -----Original Message-----
> From: Sean Nyekjaer <sean@geanix.com>
> Sent: 2019年9月5日 21:18
> To: Joakim Zhang <qiangqing.zhang@nxp.com>; mkl@pengutronix.de;
> linux-can@vger.kernel.org
> Cc: wg@grandegger.com; netdev@vger.kernel.org; dl-linux-imx
> <linux-imx@nxp.com>; Martin Hundebøll <martin@geanix.com>
> Subject: Re: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self
> wakeup
> 
> 
> 
> On 05/09/2019 09.10, Joakim Zhang wrote:
> > Hi Sean,
> >
> > Could you update lastest flexcan driver using linux-can-next/flexcan and then
> merge below two patches from linux-can/testing?
> > d0b53616716e (HEAD -> testing, origin/testing) can: flexcan: add LPSR
> > mode support for i.MX7D 803eb6bad65b can: flexcan: fix deadlock when
> > using self wakeup
> >
> > Best Regards,
> > Joakim Zhang
> 
> The testing branch have some UBI bugs, when suspending it crashes...
> So will have to leave this, until they are resolved :-)

I think you can find the base and cherry-pick all above flexcan related latest patches to your local stable tree.
I did this and test wakeup function on i.MX8QM/QXP before.

BTW, I think without CAN FD related patches, you still can test wakeup function on i.MX6/7 with this patch(may need fix merge conflicts)
This patch has been merged into our 4.19 kernel and passed the wakeup test for i.MX6/7/8. I went through the flexcan suspend/resume process again,
and not find logical issue. If you met wakeup failure, could you help to locate the problem more accurate. Failure log you provided last time, strange
and illogical.

Thanks a lot! 😊

Best Regards,
Joakim Zhang
> /Sean

^ permalink raw reply

* [PATCH net v2] isdn/capi: check message length in capi_write()
From: Eric Biggers @ 2019-09-06  2:36 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Karsten Keil, syzkaller-bugs
In-Reply-To: <20190901.114406.528788701327829265.davem@davemloft.net>

From: Eric Biggers <ebiggers@google.com>

syzbot reported:

    BUG: KMSAN: uninit-value in capi_write+0x791/0xa90 drivers/isdn/capi/capi.c:700
    CPU: 0 PID: 10025 Comm: syz-executor379 Not tainted 4.20.0-rc7+ #2
    Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
    Call Trace:
      __dump_stack lib/dump_stack.c:77 [inline]
      dump_stack+0x173/0x1d0 lib/dump_stack.c:113
      kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613
      __msan_warning+0x82/0xf0 mm/kmsan/kmsan_instr.c:313
      capi_write+0x791/0xa90 drivers/isdn/capi/capi.c:700
      do_loop_readv_writev fs/read_write.c:703 [inline]
      do_iter_write+0x83e/0xd80 fs/read_write.c:961
      vfs_writev fs/read_write.c:1004 [inline]
      do_writev+0x397/0x840 fs/read_write.c:1039
      __do_sys_writev fs/read_write.c:1112 [inline]
      __se_sys_writev+0x9b/0xb0 fs/read_write.c:1109
      __x64_sys_writev+0x4a/0x70 fs/read_write.c:1109
      do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291
      entry_SYSCALL_64_after_hwframe+0x63/0xe7
    [...]

The problem is that capi_write() is reading past the end of the message.
Fix it by checking the message's length in the needed places.

Reported-and-tested-by: syzbot+0849c524d9c634f5ae66@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
---

Changed v1 => v2:

- Use CAPI_DATA_B3_REQ_LEN, and define and use a constant for
  CAPI_DISCONNECT_B3_RESP_LEN.

 drivers/isdn/capi/capi.c          | 10 +++++++++-
 include/uapi/linux/isdn/capicmd.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 3c3ad42f22bf..c92b405b7646 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -688,6 +688,9 @@ capi_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos
 	if (!cdev->ap.applid)
 		return -ENODEV;
 
+	if (count < CAPIMSG_BASELEN)
+		return -EINVAL;
+
 	skb = alloc_skb(count, GFP_USER);
 	if (!skb)
 		return -ENOMEM;
@@ -698,7 +701,8 @@ capi_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos
 	}
 	mlen = CAPIMSG_LEN(skb->data);
 	if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_REQ) {
-		if ((size_t)(mlen + CAPIMSG_DATALEN(skb->data)) != count) {
+		if (count < CAPI_DATA_B3_REQ_LEN ||
+		    (size_t)(mlen + CAPIMSG_DATALEN(skb->data)) != count) {
 			kfree_skb(skb);
 			return -EINVAL;
 		}
@@ -711,6 +715,10 @@ capi_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos
 	CAPIMSG_SETAPPID(skb->data, cdev->ap.applid);
 
 	if (CAPIMSG_CMD(skb->data) == CAPI_DISCONNECT_B3_RESP) {
+		if (count < CAPI_DISCONNECT_B3_RESP_LEN) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
 		mutex_lock(&cdev->lock);
 		capincci_free(cdev, CAPIMSG_NCCI(skb->data));
 		mutex_unlock(&cdev->lock);
diff --git a/include/uapi/linux/isdn/capicmd.h b/include/uapi/linux/isdn/capicmd.h
index 4941628a4fb9..5ec88e7548a9 100644
--- a/include/uapi/linux/isdn/capicmd.h
+++ b/include/uapi/linux/isdn/capicmd.h
@@ -16,6 +16,7 @@
 #define CAPI_MSG_BASELEN		8
 #define CAPI_DATA_B3_REQ_LEN		(CAPI_MSG_BASELEN+4+4+2+2+2)
 #define CAPI_DATA_B3_RESP_LEN		(CAPI_MSG_BASELEN+4+2)
+#define CAPI_DISCONNECT_B3_RESP_LEN	(CAPI_MSG_BASELEN+4)
 
 /*----- CAPI commands -----*/
 #define CAPI_ALERT		    0x01
-- 
2.23.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox