Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net 09/14] net/mlx5e: Fix possible deadlock of VXLAN lock
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

mlx5e_vxlan_lookup_port is called both from mlx5e_add_vxlan_port (user
context) and mlx5e_features_check (softirq), but the lock acquired does
not disable bottom half and might result in deadlock. Fix it by simply
replacing spin_lock() with spin_lock_bh().
While at it, replace all unnecessary spin_lock_irq() to spin_lock_bh().

lockdep's WARNING: inconsistent lock state
[  654.028136] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  654.028229] swapper/5/0 [HC0[0]:SC1[9]:HE1:SE0] takes:
[  654.028321]  (&(&vxlan_db->lock)->rlock){+.?.}, at: [<ffffffffa06e7f0e>] mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
[  654.028528] {SOFTIRQ-ON-W} state was registered at:
[  654.028607]   _raw_spin_lock+0x3c/0x70
[  654.028689]   mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
[  654.028794]   mlx5e_vxlan_add_port+0x2e/0x120 [mlx5_core]
[  654.028878]   process_one_work+0x1e9/0x640
[  654.028942]   worker_thread+0x4a/0x3f0
[  654.029002]   kthread+0x141/0x180
[  654.029056]   ret_from_fork+0x24/0x30
[  654.029114] irq event stamp: 579088
[  654.029174] hardirqs last  enabled at (579088): [<ffffffff818f475a>] ip6_finish_output2+0x49a/0x8c0
[  654.029309] hardirqs last disabled at (579087): [<ffffffff818f470e>] ip6_finish_output2+0x44e/0x8c0
[  654.029446] softirqs last  enabled at (579030): [<ffffffff810b3b3d>] irq_enter+0x6d/0x80
[  654.029567] softirqs last disabled at (579031): [<ffffffff810b3c05>] irq_exit+0xb5/0xc0
[  654.029684] other info that might help us debug this:
[  654.029781]  Possible unsafe locking scenario:

[  654.029868]        CPU0
[  654.029908]        ----
[  654.029947]   lock(&(&vxlan_db->lock)->rlock);
[  654.030045]   <Interrupt>
[  654.030090]     lock(&(&vxlan_db->lock)->rlock);
[  654.030162]
 *** DEADLOCK ***

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
index 07a9ba6cfc70..f8238275759f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
@@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
 	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
 	struct mlx5e_vxlan *vxlan;
 
-	spin_lock(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	vxlan = radix_tree_lookup(&vxlan_db->tree, port);
-	spin_unlock(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 
 	return vxlan;
 }
@@ -100,9 +100,9 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
 
 	vxlan->udp_port = port;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 	if (err)
 		goto err_free;
 
@@ -121,9 +121,9 @@ static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
 	struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
 	struct mlx5e_vxlan *vxlan;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	vxlan = radix_tree_delete(&vxlan_db->tree, port);
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 
 	if (!vxlan)
 		return;
@@ -171,12 +171,12 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
 	struct mlx5e_vxlan *vxlan;
 	unsigned int port = 0;
 
-	spin_lock_irq(&vxlan_db->lock);
+	spin_lock_bh(&vxlan_db->lock);
 	while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
 		port = vxlan->udp_port;
-		spin_unlock_irq(&vxlan_db->lock);
+		spin_unlock_bh(&vxlan_db->lock);
 		__mlx5e_vxlan_core_del_port(priv, (u16)port);
-		spin_lock_irq(&vxlan_db->lock);
+		spin_lock_bh(&vxlan_db->lock);
 	}
-	spin_unlock_irq(&vxlan_db->lock);
+	spin_unlock_bh(&vxlan_db->lock);
 }
-- 
2.13.0

^ permalink raw reply related

* [net 08/14] net/mlx5: Fix error flow in CREATE_QP command
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Moni Shoua, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Moni Shoua <monis@mellanox.com>

In error flow, when DESTROY_QP command should be executed, the wrong
mailbox was set with data, not the one that is written to hardware,
Fix that.

Fixes: 09a7d9eca1a6 '{net,IB}/mlx5: QP/XRCD commands via mlx5 ifc'
Signed-off-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index db9e665ab104..889130edb715 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
 err_cmd:
 	memset(din, 0, sizeof(din));
 	memset(dout, 0, sizeof(dout));
-	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
-	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
 	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
 	return err;
 }
-- 
2.13.0

^ permalink raw reply related

* [net 07/14] net/mlx5: Fix misspelling in the error message and comment
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Eugenia Emantayev <eugenia@mellanox.com>

Fix misspelling in word syndrome.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c     | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 60771865c99c..0308a2b4823c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -466,7 +466,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			break;
 		case MLX5_EVENT_TYPE_CQ_ERROR:
 			cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
-			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n",
+			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n",
 				       cqn, eqe->data.cq_err.syndrome);
 			mlx5_cq_event(dev, cqn, eqe->type);
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 1a0e797ad001..21d29f7936f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -241,7 +241,7 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	u32 fw;
 	int i;
 
-	/* If the syndrom is 0, the device is OK and no need to print buffer */
+	/* If the syndrome is 0, the device is OK and no need to print buffer */
 	if (!ioread8(&h->synd))
 		return;
 
-- 
2.13.0

^ permalink raw reply related

* [net 01/14] net/mlx5: FPGA, return -EINVAL if size is zero
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Kamal Heib, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Kamal Heib <kamalh@mellanox.com>

Currently, if a size of zero is passed to
mlx5_fpga_mem_{read|write}_i2c()
the "err" return value will not be initialized, which triggers gcc
warnings:

[..]/mlx5/core/fpga/sdk.c:87 mlx5_fpga_mem_read_i2c() error:
uninitialized symbol 'err'.
[..]/mlx5/core/fpga/sdk.c:115 mlx5_fpga_mem_write_i2c() error:
uninitialized symbol 'err'.

fix that.

Fixes: a9956d35d199 ('net/mlx5: FPGA, Add SBU infrastructure')
Signed-off-by: Kamal Heib <kamalh@mellanox.com>
Reviewed-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
index 3c11d6e2160a..14962969c5ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
@@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
 	u8 actual_size;
 	int err;
 
+	if (!size)
+		return -EINVAL;
+
 	if (!fdev->mdev)
 		return -ENOTCONN;
 
@@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
 	u8 actual_size;
 	int err;
 
+	if (!size)
+		return -EINVAL;
+
 	if (!fdev->mdev)
 		return -ENOTCONN;
 
-- 
2.13.0

^ permalink raw reply related

* [net 06/14] net/mlx5e: Fix defaulting RX ring size when not needed
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eugenia Emantayev, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Eugenia Emantayev <eugenia@mellanox.com>

Fixes the bug when turning on/off CQE compression mechanism
resets the RX rings size to default value when it is not
needed.

Fixes: 2fc4bfb7250d ("net/mlx5e: Dynamic RQ type infrastructure")
Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h          |  8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c  | 10 ++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c     | 15 +++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c |  2 +-
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 43f9054830e5..543060c305a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -82,6 +82,9 @@
 	max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
 #define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
 #define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
+	(cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
+	MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
 
 #define MLX5_MPWRQ_LOG_WQE_SZ			18
 #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
@@ -936,8 +939,9 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
-void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
-			      struct mlx5e_params *params, u8 rq_type);
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params,
+			       u8 rq_type);
 
 static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 23425f028405..8f05efa5c829 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1523,8 +1523,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
 	new_channels.params = priv->channels.params;
 	MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val);
 
-	mlx5e_set_rq_type_params(priv->mdev, &new_channels.params,
-				 new_channels.params.rq_wq_type);
+	new_channels.params.mpwqe_log_stride_sz =
+		MLX5E_MPWQE_STRIDE_SZ(priv->mdev, new_val);
+	new_channels.params.mpwqe_log_num_strides =
+		MLX5_MPWRQ_LOG_WQE_SZ - new_channels.params.mpwqe_log_stride_sz;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
 		priv->channels.params = new_channels.params;
@@ -1536,6 +1538,10 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val
 		return err;
 
 	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n",
+		  MLX5E_GET_PFLAG(&priv->channels.params,
+				  MLX5E_PFLAG_RX_CQE_COMPRESS) ? "ON" : "OFF");
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c535a44ab8ac..d9d8227f195f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -78,8 +78,8 @@ static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 		MLX5_CAP_ETH(mdev, reg_umr_sq);
 }
 
-void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
-			      struct mlx5e_params *params, u8 rq_type)
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params, u8 rq_type)
 {
 	params->rq_wq_type = rq_type;
 	params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
@@ -88,10 +88,8 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
 		params->log_rq_size = is_kdump_kernel() ?
 			MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW :
 			MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
-		params->mpwqe_log_stride_sz =
-			MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
-			MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) :
-			MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
+		params->mpwqe_log_stride_sz = MLX5E_MPWQE_STRIDE_SZ(mdev,
+			MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
 		params->mpwqe_log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ -
 			params->mpwqe_log_stride_sz;
 		break;
@@ -115,13 +113,14 @@ void mlx5e_set_rq_type_params(struct mlx5_core_dev *mdev,
 		       MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
 }
 
-static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
+static void mlx5e_set_rq_params(struct mlx5_core_dev *mdev,
+				struct mlx5e_params *params)
 {
 	u8 rq_type = mlx5e_check_fragmented_striding_rq_cap(mdev) &&
 		    !params->xdp_prog && !MLX5_IPSEC_DEV(mdev) ?
 		    MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
 		    MLX5_WQ_TYPE_LINKED_LIST;
-	mlx5e_set_rq_type_params(mdev, params, rq_type);
+	mlx5e_init_rq_type_params(mdev, params, rq_type);
 }
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index d2a66dc4adc6..8812d7208e8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -57,7 +57,7 @@ static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev,
 				   struct mlx5e_params *params)
 {
 	/* Override RQ params as IPoIB supports only LINKED LIST RQ for now */
-	mlx5e_set_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
+	mlx5e_init_rq_type_params(mdev, params, MLX5_WQ_TYPE_LINKED_LIST);
 
 	/* RQ size in ipoib by default is 512 */
 	params->log_rq_size = is_kdump_kernel() ?
-- 
2.13.0

^ permalink raw reply related

* [net 04/14] net/mlx5e: Fix ETS BW check
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Huy Nguyen, Moshe Shemesh, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Huy Nguyen <huyn@mellanox.com>

Fix bug that allows ets bw sum to be 0% when ets tc type exists.

Fixes: 08fb1dacdd76 ('net/mlx5e: Support DCBNL IEEE ETS')
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index c6d90b6dd80e..9bcf38f4123b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -274,6 +274,7 @@ int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets)
 static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 				    struct ieee_ets *ets)
 {
+	bool have_ets_tc = false;
 	int bw_sum = 0;
 	int i;
 
@@ -288,11 +289,14 @@ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
 	}
 
 	/* Validate Bandwidth Sum */
-	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
-		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS)
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
+			have_ets_tc = true;
 			bw_sum += ets->tc_tx_bw[i];
+		}
+	}
 
-	if (bw_sum != 0 && bw_sum != 100) {
+	if (have_ets_tc && bw_sum != 100) {
 		netdev_err(netdev,
 			   "Failed to validate ETS: BW sum is illegal\n");
 		return -EINVAL;
-- 
2.13.0

^ permalink raw reply related

* [net 05/14] net/mlx5e: Fix features check of IPv6 traffic
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

The assumption that the next header field contains the transport
protocol is wrong for IPv6 packets with extension headers.
Instead, we should look the inner-most next header field in the buffer.
This will fix TSO offload for tunnels over IPv6 with extension headers.

Performance testing: 19.25x improvement, cool!
Measuring bandwidth of 16 threads TCP traffic over IPv6 GRE tap.
CPU: Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
NIC: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
TSO: Enabled
Before: 4,926.24  Mbps
Now   : 94,827.91 Mbps

Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cbec66bc82f1..c535a44ab8ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3678,6 +3678,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 						     struct sk_buff *skb,
 						     netdev_features_t features)
 {
+	unsigned int offset = 0;
 	struct udphdr *udph;
 	u8 proto;
 	u16 port;
@@ -3687,7 +3688,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
 		proto = ip_hdr(skb)->protocol;
 		break;
 	case htons(ETH_P_IPV6):
-		proto = ipv6_hdr(skb)->nexthdr;
+		proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
 		break;
 	default:
 		goto out;
-- 
2.13.0

^ permalink raw reply related

* [net 03/14] net/mlx5: Fix rate limit packet pacing naming and struct
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

In mlx5_ifc, struct size was not complete, and thus driver was sending
garbage after the last defined field. Fixed it by adding reserved field
to complete the struct size.

In addition, rename all set_rate_limit to set_pp_rate_limit to be
compliant with the Firmware <-> Driver definition.

Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/rl.c  | 22 +++++++++++-----------
 include/linux/mlx5/mlx5_ifc.h                 |  8 +++++---
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 1fffdebbc9e8..e9a1fbcc4adf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
 	case MLX5_CMD_OP_ALLOC_Q_COUNTER:
 	case MLX5_CMD_OP_QUERY_Q_COUNTER:
-	case MLX5_CMD_OP_SET_RATE_LIMIT:
+	case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
 	case MLX5_CMD_OP_QUERY_RATE_LIMIT:
 	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
 	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
@@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
 	MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
 	MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
-	MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
+	MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
 	MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
 	MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
 	MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
index e651e4c02867..d3c33e9eea72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
 	return ret_entry;
 }
 
-static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
+static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
 				   u32 rate, u16 index)
 {
-	u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
 
-	MLX5_SET(set_rate_limit_in, in, opcode,
-		 MLX5_CMD_OP_SET_RATE_LIMIT);
-	MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
-	MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
+	MLX5_SET(set_pp_rate_limit_in, in, opcode,
+		 MLX5_CMD_OP_SET_PP_RATE_LIMIT);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
@@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index)
 		entry->refcount++;
 	} else {
 		/* new rate limit */
-		err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
+		err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
 		if (err) {
 			mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
 				      rate, err);
@@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate)
 	entry->refcount--;
 	if (!entry->refcount) {
 		/* need to remove rate */
-		mlx5_set_rate_limit_cmd(dev, 0, entry->index);
+		mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
 		entry->rate = 0;
 	}
 
@@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
 	/* Clear all configured rates */
 	for (i = 0; i < table->max_size; i++)
 		if (table->rl_entry[i].rate)
-			mlx5_set_rate_limit_cmd(dev, 0,
-						table->rl_entry[i].index);
+			mlx5_set_pp_rate_limit_cmd(dev, 0,
+						   table->rl_entry[i].index);
 
 	kfree(dev->priv.rl_table.rl_entry);
 }
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 38a7577a9ce7..d44ec5f41d4a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -147,7 +147,7 @@ enum {
 	MLX5_CMD_OP_ALLOC_Q_COUNTER               = 0x771,
 	MLX5_CMD_OP_DEALLOC_Q_COUNTER             = 0x772,
 	MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
-	MLX5_CMD_OP_SET_RATE_LIMIT                = 0x780,
+	MLX5_CMD_OP_SET_PP_RATE_LIMIT             = 0x780,
 	MLX5_CMD_OP_QUERY_RATE_LIMIT              = 0x781,
 	MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT      = 0x782,
 	MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT     = 0x783,
@@ -7239,7 +7239,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
 	u8         vxlan_udp_port[0x10];
 };
 
-struct mlx5_ifc_set_rate_limit_out_bits {
+struct mlx5_ifc_set_pp_rate_limit_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
 
@@ -7248,7 +7248,7 @@ struct mlx5_ifc_set_rate_limit_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
-struct mlx5_ifc_set_rate_limit_in_bits {
+struct mlx5_ifc_set_pp_rate_limit_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
 
@@ -7261,6 +7261,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
 	u8         reserved_at_60[0x20];
 
 	u8         rate_limit[0x20];
+
+	u8         reserved_at_a0[0x160];
 };
 
 struct mlx5_ifc_access_register_out_bits {
-- 
2.13.0

^ permalink raw reply related

* [pull request][net 00/14] Mellanox, mlx5 fixes 2017-12-19
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Saeed Mahameed

Hi Dave,

The follwoing series includes some fixes for mlx5 core and etherent
driver.

Please pull and let me know if there is any problem.

This series doesn't introduce any conflict with the ongoing mlx5 for-next
submission.

For -stable:

kernels >= v4.7.y
    ("net/mlx5e: Fix possible deadlock of VXLAN lock")
    ("net/mlx5e: Add refcount to VXLAN structure")
    ("net/mlx5e: Prevent possible races in VXLAN control flow")
    ("net/mlx5e: Fix features check of IPv6 traffic")

kernels >= v4.9.y
    ("net/mlx5: Fix error flow in CREATE_QP command")
    ("net/mlx5: Fix rate limit packet pacing naming and struct")

kernels >= v4.13.y
    ("net/mlx5: FPGA, return -EINVAL if size is zero")

kernels >= v4.14.y
    ("Revert "mlx5: move affinity hints assignments to generic code")

All above patches apply and compile with no issues on corresponding -stable.

Thanks,
Saeed.

---

The following changes since commit d03a45572efa068fa64db211d6d45222660e76c5:

  ipv4: fib: Fix metrics match when deleting a route (2017-12-19 14:21:58 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-fixes-2017-12-19

for you to fetch changes up to a2fba188fd5eadd6061bef4f2f2577a43231ebf3:

  net/mlx5: Stay in polling mode when command EQ destroy fails (2017-12-19 23:24:05 +0200)

----------------------------------------------------------------
mlx5-fixes-2017-12-19

Misc fixes for mlx5 core and mlx5 netdev driver.

----------------------------------------------------------------
Eran Ben Elisha (1):
      net/mlx5: Fix rate limit packet pacing naming and struct

Eugenia Emantayev (2):
      net/mlx5e: Fix defaulting RX ring size when not needed
      net/mlx5: Fix misspelling in the error message and comment

Gal Pressman (4):
      net/mlx5e: Fix features check of IPv6 traffic
      net/mlx5e: Fix possible deadlock of VXLAN lock
      net/mlx5e: Add refcount to VXLAN structure
      net/mlx5e: Prevent possible races in VXLAN control flow

Huy Nguyen (1):
      net/mlx5e: Fix ETS BW check

Kamal Heib (1):
      net/mlx5: FPGA, return -EINVAL if size is zero

Maor Gottlieb (1):
      net/mlx5: Fix steering memory leak

Moni Shoua (1):
      net/mlx5: Fix error flow in CREATE_QP command

Moshe Shemesh (2):
      net/mlx5: Cleanup IRQs in case of unload failure
      net/mlx5: Stay in polling mode when command EQ destroy fails

Saeed Mahameed (1):
      Revert "mlx5: move affinity hints assignments to generic code"

 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  9 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 10 ++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 10 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 63 +++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 20 +++---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c |  6 ++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 16 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/health.c   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 75 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/rl.c       | 22 +++----
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c    | 64 ++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h    |  1 +
 include/linux/mlx5/driver.h                        |  3 +-
 include/linux/mlx5/mlx5_ifc.h                      |  8 ++-
 17 files changed, 215 insertions(+), 104 deletions(-)

^ permalink raw reply

* [net 02/14] Revert "mlx5: move affinity hints assignments to generic code"
From: Saeed Mahameed @ 2017-12-19 22:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Saeed Mahameed, Sagi Grimberg, Thomas Gleixner,
	Jes Sorensen
In-Reply-To: <20171219222456.29627-1-saeedm@mellanox.com>

Before the offending commit, mlx5 core did the IRQ affinity itself,
and it seems that the new generic code have some drawbacks and one
of them is the lack for user ability to modify irq affinity after
the initial affinity values got assigned.

The issue is still being discussed and a solution in the new generic code
is required, until then we need to revert this patch.

This fixes the following issue:
echo <new affinity> > /proc/irq/<x>/smp_affinity
fails with  -EIO

This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664.
Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since
it is used in mlx5_ib driver.

Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code")
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jes Sorensen <jsorensen@fb.com>
Reported-by: Jes Sorensen <jsorensen@fb.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 75 +++++++++++++++++++++--
 include/linux/mlx5/driver.h                       |  1 +
 4 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c0872b3284cb..43f9054830e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -590,6 +590,7 @@ struct mlx5e_channel {
 	struct mlx5_core_dev      *mdev;
 	struct hwtstamp_config    *tstamp;
 	int                        ix;
+	int                        cpu;
 };
 
 struct mlx5e_channels {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d2b057a3e512..cbec66bc82f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -71,11 +71,6 @@ struct mlx5e_channel_param {
 	struct mlx5e_cq_param      icosq_cq;
 };
 
-static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
-{
-	return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
-}
-
 static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 {
 	return MLX5_CAP_GEN(mdev, striding_rq) &&
@@ -444,17 +439,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 	int mtt_sz = mlx5e_get_wqe_mtt_sz();
 	int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
-	int node = mlx5e_get_node(c->priv, c->ix);
 	int i;
 
 	rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
-					GFP_KERNEL, node);
+				      GFP_KERNEL, cpu_to_node(c->cpu));
 	if (!rq->mpwqe.info)
 		goto err_out;
 
 	/* We allocate more than mtt_sz as we will align the pointer */
-	rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
-					GFP_KERNEL, node);
+	rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
+					cpu_to_node(c->cpu));
 	if (unlikely(!rq->mpwqe.mtt_no_align))
 		goto err_free_wqe_info;
 
@@ -562,7 +556,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	int err;
 	int i;
 
-	rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
 
 	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
 				&rq->wq_ctrl);
@@ -629,8 +623,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
-				     GFP_KERNEL,
-				     mlx5e_get_node(c->priv, c->ix));
+				     GFP_KERNEL, cpu_to_node(c->cpu));
 		if (!rq->wqe.frag_info) {
 			err = -ENOMEM;
 			goto err_rq_wq_destroy;
@@ -1000,13 +993,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1053,13 +1046,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 	sq->channel   = c;
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1126,13 +1119,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
 
-	param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
 	if (err)
 		return err;
 	sq->wq.db    = &sq->wq.db[MLX5_SND_DBR];
 
-	err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
+	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
@@ -1504,8 +1497,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = c->priv->mdev;
 	int err;
 
-	param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
-	param->wq.db_numa_node  = mlx5e_get_node(c->priv, c->ix);
+	param->wq.buf_numa_node = cpu_to_node(c->cpu);
+	param->wq.db_numa_node  = cpu_to_node(c->cpu);
 	param->eq_ix   = c->ix;
 
 	err = mlx5e_alloc_cq_common(mdev, param, cq);
@@ -1604,6 +1597,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
 	mlx5e_free_cq(cq);
 }
 
+static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
+{
+	return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
+}
+
 static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
 			     struct mlx5e_params *params,
 			     struct mlx5e_channel_param *cparam)
@@ -1752,12 +1750,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 {
 	struct mlx5e_cq_moder icocq_moder = {0, 0};
 	struct net_device *netdev = priv->netdev;
+	int cpu = mlx5e_get_cpu(priv, ix);
 	struct mlx5e_channel *c;
 	unsigned int irq;
 	int err;
 	int eqn;
 
-	c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
+	c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
 	if (!c)
 		return -ENOMEM;
 
@@ -1765,6 +1764,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->mdev     = priv->mdev;
 	c->tstamp   = &priv->tstamp;
 	c->ix       = ix;
+	c->cpu      = cpu;
 	c->pdev     = &priv->mdev->pdev->dev;
 	c->netdev   = priv->netdev;
 	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
@@ -1853,8 +1853,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
 	for (tc = 0; tc < c->num_tc; tc++)
 		mlx5e_activate_txqsq(&c->sq[tc]);
 	mlx5e_activate_rq(&c->rq);
-	netif_set_xps_queue(c->netdev,
-		mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
+	netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
 }
 
 static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 5f323442cc5a..8a89c7e8cd63 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -317,9 +317,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_eq_table *table = &priv->eq_table;
-	struct irq_affinity irqdesc = {
-		.pre_vectors = MLX5_EQ_VEC_COMP_BASE,
-	};
 	int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int nvec;
 
@@ -333,10 +330,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
 	if (!priv->irq_info)
 		goto err_free_msix;
 
-	nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
+	nvec = pci_alloc_irq_vectors(dev->pdev,
 			MLX5_EQ_VEC_COMP_BASE + 1, nvec,
-			PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
-			&irqdesc);
+			PCI_IRQ_MSIX);
 	if (nvec < 0)
 		return nvec;
 
@@ -622,6 +618,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
 	return (u64)timer_l | (u64)timer_h1 << 32;
 }
 
+static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
+
+	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
+		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
+		return -ENOMEM;
+	}
+
+	cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
+			priv->irq_info[i].mask);
+
+	if (IS_ENABLED(CONFIG_SMP) &&
+	    irq_set_affinity_hint(irq, priv->irq_info[i].mask))
+		mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
+
+	return 0;
+}
+
+static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
+
+	irq_set_affinity_hint(irq, NULL);
+	free_cpumask_var(priv->irq_info[i].mask);
+}
+
+static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
+		err = mlx5_irq_set_affinity_hint(mdev, i);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	for (i--; i >= 0; i--)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+
+	return err;
+}
+
+static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+}
+
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 		    unsigned int *irqn)
 {
@@ -1097,6 +1150,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_stop_eqs;
 	}
 
+	err = mlx5_irq_set_affinity_hints(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
+		goto err_affinity_hints;
+	}
+
 	err = mlx5_init_fs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to init flow steering\n");
@@ -1154,6 +1213,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_cleanup_fs(dev);
 
 err_fs:
+	mlx5_irq_clear_affinity_hints(dev);
+
+err_affinity_hints:
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -1222,6 +1284,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 
 	mlx5_sriov_detach(dev);
 	mlx5_cleanup_fs(dev);
+	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_put_uars_page(dev, priv->uar);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a886b51511ab..40a6f33c4cde 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -556,6 +556,7 @@ struct mlx5_core_sriov {
 };
 
 struct mlx5_irq_info {
+	cpumask_var_t mask;
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
-- 
2.13.0

^ permalink raw reply related

* Re: [PATCH net-next] net: Clarify dev_weight documentation for LRO and GRO_HW.
From: Marcelo Ricardo Leitner @ 2017-12-19 22:26 UTC (permalink / raw)
  To: Michael Chan; +Cc: davem, netdev
In-Reply-To: <1513717976-24263-1-git-send-email-michael.chan@broadcom.com>

On Tue, Dec 19, 2017 at 04:12:56PM -0500, Michael Chan wrote:
> Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> Signed-off-by: Michael Chan <michael.chan@broadcom.com>

Thanks!

> ---
>  Documentation/sysctl/net.txt | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
> index b67044a..35c62f5 100644
> --- a/Documentation/sysctl/net.txt
> +++ b/Documentation/sysctl/net.txt
> @@ -95,7 +95,9 @@ dev_weight
>  --------------
>  
>  The maximum number of packets that kernel can handle on a NAPI interrupt,
> -it's a Per-CPU variable.
> +it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware
> +aggregated packet is counted as one packet in this context.
> +
>  Default: 64
>  
>  dev_weight_rx_bias
> -- 
> 1.8.3.1
> 

^ permalink raw reply

* Re: [RFC] hv_netvsc: automatically name slave VF network device
From: Jakub Kicinski @ 2017-12-19 22:24 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Stephen Hemminger, Jiri Pirko
In-Reply-To: <20171219140659.39f6cc1c@xeon-e3>

On Tue, 19 Dec 2017 14:06:59 -0800, Stephen Hemminger wrote:
> > > > > I assume you mean the modern application is udev, and it works
> > > > > but the name is meaningless because it based of synthetic PCI
> > > > > information. The PCI host adapter is simulated for pass through
> > > > > devices. Names like enp12s0.
> > > > > 
> > > > > Since every passthrough VF device on Hyper-V/Azure has a matching
> > > > > synthetic network device with same mac address. It is best to
> > > > > have the relationship shown in the name.        
> > > > 
> > > > How about we make the VF drivers expose "vf" as phys_port_name?
> > > > Then systemd/udev should glue that onto the name regardless of
> > > > how the VF is used.      
> > > 
> > > One of the goals was not to modify in any way other drivers (like VF).    
> > 
> > Why?  Do you have out-of-tree drivers you can't change or some such?  
> 
> This needs to work on enterprise distributions; plus it is not good
> practice to introduce random changes into partners like Mellanox
> drivers.

Are we talking about Linux or Windows kernel here?  I don't think
this requires hypervisor changes?  The notion of a "partner" and
changing drivers by people who are not employed by a vendor being 
bad practice sounds entirely foreign to me.

If we agree that marking VF interfaces is useful (and I think 
it is) then we should mark them always, not only when they are 
enslaved to a magic bond.  And the natural way of doing that 
seems to be phys_port_name.

^ permalink raw reply

* Re: [PATCH v2 iproute2 net-next] erspan: add erspan version II support
From: David Ahern @ 2017-12-19 22:17 UTC (permalink / raw)
  To: William Tu, netdev
In-Reply-To: <1513386366-38095-1-git-send-email-u9012063@gmail.com>

On 12/15/17 6:06 PM, William Tu wrote:
> @@ -343,6 +355,22 @@ get_failed:
>  				invarg("invalid erspan index\n", *argv);
>  			if (erspan_idx & ~((1<<20) - 1) || erspan_idx == 0)
>  				invarg("erspan index must be > 0 and <= 20-bit\n", *argv);
> +		} else if (strcmp(*argv, "erspan_ver") == 0) {
> +			NEXT_ARG();
> +			if (get_u8(&erspan_ver, *argv, 0))
> +				invarg("invalid erspan version\n", *argv);
> +			if (erspan_ver != 1 && erspan_ver != 2)
> +				invarg("erspan version must be 1 or 2\n", *argv);
> +		} else if (strcmp(*argv, "erspan_dir") == 0) {
> +			NEXT_ARG();
> +			if (get_u8(&erspan_dir, *argv, 0))
> +				invarg("invalid erspan direction\n", *argv);
> +			if (erspan_dir != 0 && erspan_dir != 1)
> +				invarg("erspan direction must be 0(Ingress) or 1(Egress)\n", *argv);

Why not allow ingress and egress as strings and using matches()?
e.g.,  '... erspan_dir in[gress] ...' or '... erspan_dir e[gress] ...'

Seems much more intuitive than remembering "0" = ingress and "1" = egress.

> @@ -374,8 +402,15 @@ get_failed:
>  		addattr_l(n, 1024, IFLA_GRE_TTL, &ttl, 1);
>  		addattr_l(n, 1024, IFLA_GRE_TOS, &tos, 1);
>  		addattr32(n, 1024, IFLA_GRE_FWMARK, fwmark);
> -		if (erspan_idx != 0)
> -			addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx);
> +		if (erspan_ver) {
> +			addattr8(n, 1024, IFLA_GRE_ERSPAN_VER, erspan_ver);
> +			if (erspan_ver == 1 && erspan_idx != 0) {
> +				addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx);
> +			} else if (erspan_ver == 2) {
> +				addattr8(n, 1024, IFLA_GRE_ERSPAN_DIR, erspan_dir);
> +				addattr16(n, 1024, IFLA_GRE_ERSPAN_HWID, erspan_hwid);
> +			}
> +		}
>  	} else {
>  		addattr_l(n, 1024, IFLA_GRE_COLLECT_METADATA, NULL, 0);
>  	}
> @@ -514,7 +549,25 @@ static void gre_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
>  	if (tb[IFLA_GRE_ERSPAN_INDEX]) {
>  		__u32 erspan_idx = rta_getattr_u32(tb[IFLA_GRE_ERSPAN_INDEX]);
>  
> -		fprintf(f, "erspan_index %u ", erspan_idx);
> +		print_uint(PRINT_ANY, "erspan_index", "erspan_index %u", erspan_idx);
> +	}
> +
> +	if (tb[IFLA_GRE_ERSPAN_VER]) {
> +		__u8 erspan_ver = rta_getattr_u8(tb[IFLA_GRE_ERSPAN_VER]);
> +
> +		print_uint(PRINT_ANY, "erspan_ver", "erspan_ver %u", erspan_ver);
> +	}
> +
> +	if (tb[IFLA_GRE_ERSPAN_DIR]) {
> +		__u8 erspan_dir = rta_getattr_u8(tb[IFLA_GRE_ERSPAN_DIR]);
> +
> +		print_uint(PRINT_ANY, "erspan_dir", "erspan_dir %u", erspan_dir);

similarly, here can convert the 0/1 to a human string.

Same comments for the changes to link_gre6.c

^ permalink raw reply

* Re: [RFC] hv_netvsc: automatically name slave VF network device
From: Stephen Hemminger @ 2017-12-19 22:06 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, Stephen Hemminger, Jiri Pirko
In-Reply-To: <20171219135529.62800475@cakuba.netronome.com>

On Tue, 19 Dec 2017 13:55:29 -0800
Jakub Kicinski <kubakici@wp.pl> wrote:

> On Tue, 19 Dec 2017 13:29:49 -0800, Stephen Hemminger wrote:
> > > > biosdevname is dead, gone and wouldn't work on Azure (it dumpster
> > > > dives in /dev/mem).      
> > > 
> > > Hm, I haven't worked on biosdevname myself, but AFAIU it also falls 
> > > back to information from the PCI VPD, which could be populated by 
> > > the hypervisor.    
> > 
> > VPD never had any useful standard are info.
> > The rules used by udev come off sysfs, see:
> >   https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/  
> 
> Yes, the current VPD info looks quite limited, although it is
> extendable.
> 
> > > > I assume you mean the modern application is udev, and it works
> > > > but the name is meaningless because it based of synthetic PCI
> > > > information. The PCI host adapter is simulated for pass through
> > > > devices. Names like enp12s0.
> > > > 
> > > > Since every passthrough VF device on Hyper-V/Azure has a matching
> > > > synthetic network device with same mac address. It is best to
> > > > have the relationship shown in the name.      
> > > 
> > > How about we make the VF drivers expose "vf" as phys_port_name?
> > > Then systemd/udev should glue that onto the name regardless of
> > > how the VF is used.    
> > 
> > One of the goals was not to modify in any way other drivers (like VF).  
> 
> Why?  Do you have out-of-tree drivers you can't change or some such?

This needs to work on enterprise distributions; plus it is not good practice
to introduce random changes into partners like Mellanox drivers.

^ permalink raw reply

* Re: [PATCH v4 04/36] nds32: Kernel booting and initialization
From: Randy Dunlap @ 2017-12-19 22:01 UTC (permalink / raw)
  To: Greentime Hu, greentime-MUIXKm3Oiri1Z/+hSey0Gg,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, arnd-r2nGTMty4D4,
	linux-arch-u79uwXL29TY76Z2rM5mHXA, tglx-hfZtesqFncYOwBW4kG4KsQ,
	jason-NLaQJdtUoK4Be96aLqz0jA, marc.zyngier-5wv7dgnIgG8,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A, netdev-u79uwXL29TY76Z2rM5mHXA,
	deanbo422-Re5JQEeQqe8AvxtiuMwx3w,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
	dhowells-H+wXaHxf7aLQT0dZR+AlfA, will.deacon-5wv7dgnIgG8,
	daniel.lezcano-QSEj5FYQhm4dnm+yROfE0A,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	geert.uytterhoeven-Re5JQEeQqe8AvxtiuMwx3w,
	linus.walleij-QSEj5FYQhm4dnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	greg-U8xfFu+wG4EAvxtiuMwx3w, ren_guo-Y+KPrCd2zL4AvxtiuMwx3w,
	pombredanne-od1rfyK75/E
  Cc: Vincent Chen
In-Reply-To: <935ff034982f077b2e6f5eeccd6fe2110614fc9c.1513577007.git.green.hu-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On 12/17/2017 10:46 PM, Greentime Hu wrote:
> From: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
> 
> This patch includes the kernel startup code. It can get dtb pointer
> passed from bootloader. It will create a temp mapping by tlb
> instructions at beginning and goto start_kernel.
> 
> Signed-off-by: Vincent Chen <vincentc-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
> Signed-off-by: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
> ---
>  arch/nds32/kernel/head.S  |  189 ++++++++++++++++++++++
>  arch/nds32/kernel/setup.c |  383 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 572 insertions(+)
>  create mode 100644 arch/nds32/kernel/head.S
>  create mode 100644 arch/nds32/kernel/setup.c
> 

> diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c
> new file mode 100644
> index 0000000..7718c58
> --- /dev/null
> +++ b/arch/nds32/kernel/setup.c
> @@ -0,0 +1,383 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2005-2017 Andes Technology Corporation
> +

[snip]

> +struct cache_info L1_cache_info[2];
> +static void __init dump_cpu_info(int cpu)
> +{
> +	int i, p = 0;
> +	char str[sizeof(hwcap_str) + 16];
> +
> +	for (i = 0; hwcap_str[i]; i++) {
> +		if (elf_hwcap & (1 << i)) {
> +			sprintf(str + p, "%s ", hwcap_str[i]);
> +			p += strlen(hwcap_str[i]) + 1;
> +		}
> +	}
> +
> +	pr_info("CPU%d Featuretures: %s\n", cpu, str);

	               Features:


-- 
~Randy
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Linux ECN Handling
From: Steve Ibanez @ 2017-12-19 22:00 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: Eric Dumazet, Yuchung Cheng, Daniel Borkmann, Netdev,
	Florian Westphal, Mohammad Alizadeh, Lawrence Brakmo
In-Reply-To: <CADVnQynAqMYzi+Kt5RjbW+ma_99nGyDPubSd6bYamQGsK5f+7A@mail.gmail.com>

Hi Neal,

I managed to track down the code path that the unACKed CWR packet is
taking. The tcp_rcv_established() function calls tcp_ack_snd_check()
at the end of step5 and then the return statement indicated below is
invoked, which prevents the __tcp_ack_snd_check() function from
running.

static inline void tcp_ack_snd_check(struct sock *sk)
{
        if (!inet_csk_ack_scheduled(sk)) {
                /* We sent a data segment already. */
                return;   /* <=== here */
        }
        __tcp_ack_snd_check(sk, 1);
}

So somehow tcp_ack_snd_check() thinks that a data segment was already
sent when in fact it wasn't. Do you see a way around this issue?

Thanks,
-Steve

On Tue, Dec 19, 2017 at 7:28 AM, Neal Cardwell <ncardwell@google.com> wrote:
> On Tue, Dec 19, 2017 at 12:16 AM, Steve Ibanez <sibanez@stanford.edu> wrote:
>>
>> Hi Neal,
>>
>> I started looking into this receiver ACKing issue today. Strangely,
>> when I tried adding printk statements at the top of the
>> tcp_v4_do_rcv(), tcp_rcv_established(), __tcp_ack_snd_check() and
>> tcp_send_delayed_ack() functions they were never executed on the
>> machine running the iperf3 server (i.e. the destination of the flows).
>> Maybe the iperf3 server is using its own TCP stack?
>>
>> In any case, the ACKing problem is reproducible using just normal
>> iperf for which I do see my printk statements being executed. I can
>> now confirm that when the CWR marked packet (for which no ACK is sent)
>> arrives at the receiver, the __tcp_ack_snd_check() function is never
>> invoked; and hence neither is the tcp_send_delayed_ack() function.
>> Hopefully this helps narrow down where the issue might be? I started
>> adding some printk statements into the tcp_rcv_established() function,
>> but I'm not sure where the best places to look would be so I wanted to
>> ask your advice on this.
>
> Thanks for the detailed report!
>
> As a next step to narrow down why the CWR-marked packet is not acked,
> I would suggest adding printk statements at the bottom of
> tcp_rcv_established() in all the spots where we have a goto or return
> that would cause us to short-circuit and not reach the
> tcp_ack_snd_check() at the bottom of the function. This could be much
> like your existing nice debugging printks, that log any time we get to
> that spot and if (sk->sk_family == AF_INET && th->cwr). And these
> could be in the following spots (marked "here"):
>
> slow_path:
>         if (len < (th->doff << 2) || tcp_checksum_complete(skb))
>                 goto csum_error;                /* <=== here */
>
>         if (!th->ack && !th->rst && !th->syn)
>                 goto discard;                /* <=== here */
>
>         /*
>          *      Standard slow path.
>          */
>
>         if (!tcp_validate_incoming(sk, skb, th, 1))
>                 return;                /* <=== here */
>
> step5:
>         if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
>                 goto discard;                /* <=== here */
>
>
> thanks,
> neal

^ permalink raw reply

* Re: [RFC] hv_netvsc: automatically name slave VF network device
From: Jakub Kicinski @ 2017-12-19 21:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Stephen Hemminger, Jiri Pirko
In-Reply-To: <20171219132949.57926170@xeon-e3>

On Tue, 19 Dec 2017 13:29:49 -0800, Stephen Hemminger wrote:
> > > biosdevname is dead, gone and wouldn't work on Azure (it dumpster
> > > dives in /dev/mem).    
> > 
> > Hm, I haven't worked on biosdevname myself, but AFAIU it also falls 
> > back to information from the PCI VPD, which could be populated by 
> > the hypervisor.  
> 
> VPD never had any useful standard are info.
> The rules used by udev come off sysfs, see:
>   https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/

Yes, the current VPD info looks quite limited, although it is
extendable.

> > > I assume you mean the modern application is udev, and it works
> > > but the name is meaningless because it based of synthetic PCI
> > > information. The PCI host adapter is simulated for pass through
> > > devices. Names like enp12s0.
> > > 
> > > Since every passthrough VF device on Hyper-V/Azure has a matching
> > > synthetic network device with same mac address. It is best to
> > > have the relationship shown in the name.    
> > 
> > How about we make the VF drivers expose "vf" as phys_port_name?
> > Then systemd/udev should glue that onto the name regardless of
> > how the VF is used.  
> 
> One of the goals was not to modify in any way other drivers (like VF).

Why?  Do you have out-of-tree drivers you can't change or some such?

^ permalink raw reply

* Re: [pull request][for-next 00/11] Mellanox, mlx5 E-Switch updates 2017-12-19
From: Marcelo Ricardo Leitner @ 2017-12-19 21:54 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: David S. Miller, Doug Ledford, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Leon Romanovsky
In-Reply-To: <20171219203340.2600-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

On Tue, Dec 19, 2017 at 12:33:29PM -0800, Saeed Mahameed wrote:
> Hi Dave and Doug,
> 
> ==============
> This series includes updates for mlx5 E-Switch infrastructures,
> to be merged into net-next and rdma-next trees.
> 
> Mark's patches provide E-Switch refactoring that generalize the mlx5
> E-Switch vf representors interfaces and data structures. The serious is
> mainly focused on moving ethernet (netdev) specific representors logic out
> of E-Switch (eswitch.c) into mlx5e representor module (en_rep.c), which
> provides better separation and allows future support for other types of vf
> representors (e.g. RDMA).
> 
> Gal's patches at the end of this serious, provide a simple syntax fix and
> two other patches that handles vport ingress/egress ACL steering name
> spaces to be aligned with the Firmware/Hardware specs.

Patch 10 actually looks quite worrysome if a card would support only
one ingress or egress, but if all of them support both, then it should
be fine yes. Is that possible, to support only one direction?

  Marcelo
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v3 0/3] kallsyms: don't leak address
From: Tobin C. Harding @ 2017-12-19 21:39 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Tobin C. Harding, Steven Rostedt, Tycho Andersen, Linus Torvalds,
	Kees Cook, Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development,
	Joe Perches

This set plugs a kernel address leak that occurs if kallsyms symbol
look up fails. This set was prompted by a leaking address found using
scripts/leaking_addresses.pl on a PowerPC machine in the wild.

$ perl scripts/leaking_addresses.pl		[address sanitized]
...
/proc/8025/task/8025/stack: [<0000000000000000>] 0xc0000001XXXXXXXX

$ uname -r
4.4.0-79-powerpc64-smp

Patch set does not change behaviour when KALLSYMS is not defined
(suggested by Linus).

Comments on version 1 indicated that current behaviour may be useful for
debugging. This version adds a kernel command-line parameter in order to
be able to preserve current behaviour (print raw address if kallsyms
symbol look up fails). (Command-line parameter suggested by Steve.)

New command-line parameter is documented only in the kernel-doc for
kallsyms functions sprint_symbol() and sprint_symbol_no_offset(). Is
this sufficient? Perhaps an entry in printk-formats.txt also?

Patch 1 - return error code if symbol look up fails unless new
	  command-line parameter 'insecure_print_all_symbols' is enabled.
Patch 2 - print <symbol not found> to buffer if symbol look up returns
	  an error.
Patch 3 - maintain current behaviour in ftrace.

thanks,
Tobin.

v3:
 - Remove const string and use ternary operator (suggested by Joe
   Perches) 

v2:
 - Add kernel command-line parameter.
 - Remove unnecessary function.
 - Fix broken ftrace code (and actually build and test ftrace code).

All code tested.

Tobin C. Harding (3):
  kallsyms: don't leak address when symbol not found
  vsprintf: print <symbol not found> if symbol not found
  trace: print address if symbol not found

 kernel/kallsyms.c                | 31 +++++++++++++++++++++++++------
 kernel/trace/trace.h             | 24 ++++++++++++++++++++++++
 kernel/trace/trace_events_hist.c |  6 +++---
 kernel/trace/trace_output.c      |  2 +-
 lib/vsprintf.c                   |  9 +++++----
 5 files changed, 58 insertions(+), 14 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH v3 3/3] trace: print address if symbol not found
From: Tobin C. Harding @ 2017-12-19 21:39 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Tobin C. Harding, Steven Rostedt, Tycho Andersen, Linus Torvalds,
	Kees Cook, Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development,
	Joe Perches
In-Reply-To: <1513719564-13249-1-git-send-email-me@tobin.cc>

Fixes behaviour modified by: commit 40eee173a35e ("kallsyms: don't leak
address when symbol not found")

Previous patch changed behaviour of kallsyms function sprint_symbol() to
return an error code instead of printing the address if a symbol was not
found. Ftrace relies on the original behaviour. We should not break
tracing when applying the previous patch. We can maintain the original
behaviour by checking the return code on calls to sprint_symbol() and
friends.

Check return code and print actual address on error (i.e symbol not
found).

Signed-off-by: Tobin C. Harding <me@tobin.cc>
---
 kernel/trace/trace.h             | 24 ++++++++++++++++++++++++
 kernel/trace/trace_events_hist.c |  6 +++---
 kernel/trace/trace_output.c      |  2 +-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..881b1a577d75 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1814,4 +1814,28 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len)
 
 extern struct trace_iterator *tracepoint_print_iter;
 
+static inline int
+trace_sprint_symbol(char *buffer, unsigned long address)
+{
+	int ret;
+
+	ret = sprint_symbol(buffer, address);
+	if (ret == -1)
+		ret = sprintf(buffer, "0x%lx", address);
+
+	return ret;
+}
+
+static inline int
+trace_sprint_symbol_no_offset(char *buffer, unsigned long address)
+{
+	int ret;
+
+	ret = sprint_symbol_no_offset(buffer, address);
+	if (ret == -1)
+		ret = sprintf(buffer, "0x%lx", address);
+
+	return ret;
+}
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1e1558c99d56..ca523327c058 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -982,7 +982,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
 			return;
 
 		seq_printf(m, "%*c", 1 + spaces, ' ');
-		sprint_symbol(str, stacktrace_entries[i]);
+		trace_sprint_symbol(str, stacktrace_entries[i]);
 		seq_printf(m, "%s\n", str);
 	}
 }
@@ -1014,12 +1014,12 @@ hist_trigger_entry_print(struct seq_file *m,
 			seq_printf(m, "%s: %llx", field_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
 			uval = *(u64 *)(key + key_field->offset);
-			sprint_symbol_no_offset(str, uval);
+			trace_sprint_symbol_no_offset(str, uval);
 			seq_printf(m, "%s: [%llx] %-45s", field_name,
 				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
 			uval = *(u64 *)(key + key_field->offset);
-			sprint_symbol(str, uval);
+			trace_sprint_symbol(str, uval);
 			seq_printf(m, "%s: [%llx] %-55s", field_name,
 				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 90db994ac900..f3c3a0a60f72 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -365,7 +365,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 #ifdef CONFIG_KALLSYMS
 	const char *name;
 
-	sprint_symbol(str, address);
+	trace_sprint_symbol(str, address);
 	name = kretprobed(str);
 
 	if (name && strlen(name)) {
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 2/3] vsprintf: print <symbol not found> if symbol not found
From: Tobin C. Harding @ 2017-12-19 21:39 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Tobin C. Harding, Steven Rostedt, Tycho Andersen, Linus Torvalds,
	Kees Cook, Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development,
	Joe Perches
In-Reply-To: <1513719564-13249-1-git-send-email-me@tobin.cc>

Depends on: commit 40eee173a35e ("kallsyms: don't leak address when
symbol not found")

Currently vsprintf for specifiers %p[SsB] relies on the behaviour of
kallsyms (sprint_symbol()) and prints the actual address if a symbol is
not found. Previous patch changes this behaviour so that sprint_symbol()
returns an error if symbol not found. With this patch in place we can
print a sanitized message '<symbol not found>' instead of leaking the
address.

Print '<symbol not found>' for printk specifier %p[sSB] if symbol look
up fails.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
---
 lib/vsprintf.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 01c3957b2de6..503402a44ffe 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -674,6 +674,7 @@ char *symbol_string(char *buf, char *end, void *ptr,
 	unsigned long value;
 #ifdef CONFIG_KALLSYMS
 	char sym[KSYM_SYMBOL_LEN];
+	int ret;
 #endif
 
 	if (fmt[1] == 'R')
@@ -682,13 +683,13 @@ char *symbol_string(char *buf, char *end, void *ptr,
 
 #ifdef CONFIG_KALLSYMS
 	if (*fmt == 'B')
-		sprint_backtrace(sym, value);
+		ret = sprint_backtrace(sym, value);
 	else if (*fmt != 'f' && *fmt != 's')
-		sprint_symbol(sym, value);
+		ret = sprint_symbol(sym, value);
 	else
-		sprint_symbol_no_offset(sym, value);
+		ret = sprint_symbol_no_offset(sym, value);
 
-	return string(buf, end, sym, spec);
+	return string(buf, end, ret == -1 ? "<symbol not found>" : sym, spec);
 #else
 	return special_hex_number(buf, end, value, sizeof(void *));
 #endif
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 1/3] kallsyms: don't leak address when symbol not found
From: Tobin C. Harding @ 2017-12-19 21:39 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Tobin C. Harding, Steven Rostedt, Tycho Andersen, Linus Torvalds,
	Kees Cook, Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development,
	Joe Perches
In-Reply-To: <1513719564-13249-1-git-send-email-me@tobin.cc>

Currently if kallsyms_lookup() fails to find the symbol then the address
is printed. This potentially leaks sensitive information but is useful
for debugging. We would like to stop the leak but keep the current
behaviour when needed for debugging. To achieve this we can add a
command-line parameter that if enabled maintains the current
behaviour. If the command-line parameter is not enabled we can return an
error instead of printing the address giving the calling code the option
of how to handle the look up failure.

Add command-line parameter 'insecure_print_all_symbols'. If parameter is
not enabled return an error value instead of printing the raw address.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
---
 kernel/kallsyms.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index d5fa4116688a..2707cf751437 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -383,6 +383,16 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 	return lookup_module_symbol_attrs(addr, size, offset, modname, name);
 }
 
+/* Enables printing of raw address when symbol look up fails */
+static bool insecure_print_all_symbols;
+
+static int __init enable_insecure_print_all_symbols(char *unused)
+{
+	insecure_print_all_symbols = true;
+	return 0;
+}
+early_param("insecure_print_all_symbols", enable_insecure_print_all_symbols);
+
 /* Look up a kernel symbol and return it in a text buffer. */
 static int __sprint_symbol(char *buffer, unsigned long address,
 			   int symbol_offset, int add_offset)
@@ -394,8 +404,15 @@ static int __sprint_symbol(char *buffer, unsigned long address,
 
 	address += symbol_offset;
 	name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
-	if (!name)
-		return sprintf(buffer, "0x%lx", address - symbol_offset);
+	if (insecure_print_all_symbols) {
+		if (!name)
+			return sprintf(buffer, "0x%lx", address - symbol_offset);
+	} else {
+		if (!name) {
+			buffer[0] = '\0';
+			return -1;
+		}
+	}
 
 	if (name != buffer)
 		strcpy(buffer, name);
@@ -417,8 +434,9 @@ static int __sprint_symbol(char *buffer, unsigned long address,
  * @address: address to lookup
  *
  * This function looks up a kernel symbol with @address and stores its name,
- * offset, size and module name to @buffer if possible. If no symbol was found,
- * just saves its @address as is.
+ * offset, size and module name to @buffer if possible. If no symbol was found
+ * returns -1 unless kernel command-line parameter 'insecure_print_all_symbols'
+ * is enabled, in which case saves @address as is to buffer.
  *
  * This function returns the number of bytes stored in @buffer.
  */
@@ -434,8 +452,9 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
  * @address: address to lookup
  *
  * This function looks up a kernel symbol with @address and stores its name
- * and module name to @buffer if possible. If no symbol was found, just saves
- * its @address as is.
+ * and module name to @buffer if possible. If no symbol was found, returns -1
+ * unless kernel command-line parameter 'insecure_print_all_symbols' is enabled,
+ * in which case saves @address as is to buffer.
  *
  * This function returns the number of bytes stored in @buffer.
  */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next (after merge)] netdevsim: bpf: align to cls_bpf changes
From: Jakub Kicinski @ 2017-12-19 21:35 UTC (permalink / raw)
  To: netdev, davem; +Cc: oss-drivers, jiri, daniel, Jakub Kicinski

cls_bpf no longer takes care of offload tracking.  Make sure
netdevsim performs necessary checks.  This fixes a warning
caused by TC trying to remove a filter it has not added.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
Dave, this will have to be applied to net-next as soon as
the previous series comes in via net.  netdevsim doesn't
exist in net/master, so I can't fix it there :(  I'm not
sure how to handle this situation best...
---
 drivers/net/netdevsim/bpf.c                 | 25 ++++++++++++++-----------
 tools/testing/selftests/bpf/test_offload.py |  4 ++--
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 7799942ed349..c977fece64a3 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -107,6 +107,7 @@ int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	struct tc_cls_bpf_offload *cls_bpf = type_data;
 	struct bpf_prog *prog = cls_bpf->prog;
 	struct netdevsim *ns = cb_priv;
+	struct bpf_prog *oldprog;
 
 	if (type != TC_SETUP_CLSBPF ||
 	    !tc_can_offload(ns->netdev) ||
@@ -114,25 +115,27 @@ int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	    cls_bpf->common.chain_index)
 		return -EOPNOTSUPP;
 
-	if (nsim_xdp_offload_active(ns))
-		return -EBUSY;
-
 	if (!ns->bpf_tc_accept)
 		return -EOPNOTSUPP;
 	/* Note: progs without skip_sw will probably not be dev bound */
 	if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept)
 		return -EOPNOTSUPP;
 
-	switch (cls_bpf->command) {
-	case TC_CLSBPF_REPLACE:
-		return nsim_bpf_offload(ns, prog, true);
-	case TC_CLSBPF_ADD:
-		return nsim_bpf_offload(ns, prog, false);
-	case TC_CLSBPF_DESTROY:
-		return nsim_bpf_offload(ns, NULL, true);
-	default:
+	if (cls_bpf->command != TC_CLSBPF_OFFLOAD)
 		return -EOPNOTSUPP;
+
+	oldprog = cls_bpf->oldprog;
+
+	/* Don't remove if oldprog doesn't match driver's state */
+	if (ns->bpf_offloaded != oldprog) {
+		oldprog = NULL;
+		if (!cls_bpf->prog)
+			return 0;
+		if (ns->bpf_offloaded)
+			return -EBUSY;
 	}
+
+	return nsim_bpf_offload(ns, cls_bpf->prog, oldprog);
 }
 
 int nsim_bpf_disable_tc(struct netdevsim *ns)
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 3914f7a4585a..c940505c2978 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -647,8 +647,8 @@ samples = ["sample_ret0.o"]
 
     start_test("Test asking for TC offload of two filters...")
     sim.cls_bpf_add_filter(obj, da=True, skip_sw=True)
-    sim.cls_bpf_add_filter(obj, da=True, skip_sw=True)
-    # The above will trigger a splat until TC cls_bpf drivers are fixed
+    ret, _ = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True, fail=False)
+    fail(ret == 0, "Managed to offload two TC filters at the same time")
 
     sim.tc_flush_filters(bound=2, total=2)
 
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 2/2] nfp: bpf: keep track of the offloaded program
From: Jakub Kicinski @ 2017-12-19 21:32 UTC (permalink / raw)
  To: netdev; +Cc: daniel, jiri, oss-drivers, Jakub Kicinski
In-Reply-To: <20171219213214.1084-1-jakub.kicinski@netronome.com>

After TC offloads were converted to callbacks we have no choice
but keep track of the offloaded filter in the driver.

The check for nn->dp.bpf_offload_xdp was a stop gap solution
to make sure failed TC offload won't disable XDP, it's no longer
necessary.  nfp_net_bpf_offload() will return -EBUSY on
TC vs XDP conflicts.

Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
Dave, this will conflict on nfp_bpf_vnic_free() with net-next.
The function was removed since stack will stop XDP now, but
we need to add it back with just the WARN and free on @bv.
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 47 ++++++++++++++++++++++++---
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  8 +++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index a4cf62ba4604..13190aa09faf 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -82,10 +82,33 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn)
 	return nfp_net_ebpf_capable(nn) ? "BPF" : "";
 }
 
+static int
+nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
+{
+	int err;
+
+	nn->app_priv = kzalloc(sizeof(struct nfp_bpf_vnic), GFP_KERNEL);
+	if (!nn->app_priv)
+		return -ENOMEM;
+
+	err = nfp_app_nic_vnic_alloc(app, nn, id);
+	if (err)
+		goto err_free_priv;
+
+	return 0;
+err_free_priv:
+	kfree(nn->app_priv);
+	return err;
+}
+
 static void nfp_bpf_vnic_free(struct nfp_app *app, struct nfp_net *nn)
 {
+	struct nfp_bpf_vnic *bv = nn->app_priv;
+
 	if (nn->dp.bpf_offload_xdp)
 		nfp_bpf_xdp_offload(app, nn, NULL);
+	WARN_ON(bv->tc_prog);
+	kfree(bv);
 }
 
 static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
@@ -93,6 +116,9 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 {
 	struct tc_cls_bpf_offload *cls_bpf = type_data;
 	struct nfp_net *nn = cb_priv;
+	struct bpf_prog *oldprog;
+	struct nfp_bpf_vnic *bv;
+	int err;
 
 	if (type != TC_SETUP_CLSBPF ||
 	    !tc_can_offload(nn->dp.netdev) ||
@@ -100,8 +126,6 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	    cls_bpf->common.protocol != htons(ETH_P_ALL) ||
 	    cls_bpf->common.chain_index)
 		return -EOPNOTSUPP;
-	if (nn->dp.bpf_offload_xdp)
-		return -EBUSY;
 
 	/* Only support TC direct action */
 	if (!cls_bpf->exts_integrated ||
@@ -113,7 +137,22 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 	if (cls_bpf->command != TC_CLSBPF_OFFLOAD)
 		return -EOPNOTSUPP;
 
-	return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog);
+	bv = nn->app_priv;
+	oldprog = cls_bpf->oldprog;
+
+	/* Don't remove if oldprog doesn't match driver's state */
+	if (bv->tc_prog != oldprog) {
+		oldprog = NULL;
+		if (!cls_bpf->prog)
+			return 0;
+	}
+
+	err = nfp_net_bpf_offload(nn, cls_bpf->prog, oldprog);
+	if (err)
+		return err;
+
+	bv->tc_prog = cls_bpf->prog;
+	return 0;
 }
 
 static int nfp_bpf_setup_tc_block(struct net_device *netdev,
@@ -161,7 +200,7 @@ const struct nfp_app_type app_bpf = {
 
 	.extra_cap	= nfp_bpf_extra_cap,
 
-	.vnic_alloc	= nfp_app_nic_vnic_alloc,
+	.vnic_alloc	= nfp_bpf_vnic_alloc,
 	.vnic_free	= nfp_bpf_vnic_free,
 
 	.setup_tc	= nfp_bpf_setup_tc,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 082a15f6dfb5..57b6043177a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -172,6 +172,14 @@ struct nfp_prog {
 	struct list_head insns;
 };
 
+/**
+ * struct nfp_bpf_vnic - per-vNIC BPF priv structure
+ * @tc_prog:	currently loaded cls_bpf program
+ */
+struct nfp_bpf_vnic {
+	struct bpf_prog *tc_prog;
+};
+
 int nfp_bpf_jit(struct nfp_prog *prog);
 
 extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops;
-- 
2.15.1

^ permalink raw reply related

* [PATCH net 1/2] cls_bpf: fix offload assumptions after callback conversion
From: Jakub Kicinski @ 2017-12-19 21:32 UTC (permalink / raw)
  To: netdev; +Cc: daniel, jiri, oss-drivers, Jakub Kicinski
In-Reply-To: <20171219213214.1084-1-jakub.kicinski@netronome.com>

cls_bpf used to take care of tracking what offload state a filter
is in, i.e. it would track if offload request succeeded or not.
This information would then be used to issue correct requests to
the driver, e.g. requests for statistics only on offloaded filters,
removing only filters which were offloaded, using add instead of
replace if previous filter was not added etc.

This tracking of offload state no longer functions with the new
callback infrastructure.  There could be multiple entities trying
to offload the same filter.

Throw out all the tracking and corresponding commands and simply
pass to the drivers both old and new bpf program.  Drivers will
have to deal with offload state tracking by themselves.

Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 +---
 include/net/pkt_cls.h                         |  5 +-
 net/sched/cls_bpf.c                           | 93 +++++++++++----------------
 3 files changed, 43 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index e379b78e86ef..a4cf62ba4604 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -110,16 +110,10 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 		return -EOPNOTSUPP;
 	}
 
-	switch (cls_bpf->command) {
-	case TC_CLSBPF_REPLACE:
-		return nfp_net_bpf_offload(nn, cls_bpf->prog, true);
-	case TC_CLSBPF_ADD:
-		return nfp_net_bpf_offload(nn, cls_bpf->prog, false);
-	case TC_CLSBPF_DESTROY:
-		return nfp_net_bpf_offload(nn, NULL, true);
-	default:
+	if (cls_bpf->command != TC_CLSBPF_OFFLOAD)
 		return -EOPNOTSUPP;
-	}
+
+	return nfp_net_bpf_offload(nn, cls_bpf->prog, cls_bpf->oldprog);
 }
 
 static int nfp_bpf_setup_tc_block(struct net_device *netdev,
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..8e08b6da72f3 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -694,9 +694,7 @@ struct tc_cls_matchall_offload {
 };
 
 enum tc_clsbpf_command {
-	TC_CLSBPF_ADD,
-	TC_CLSBPF_REPLACE,
-	TC_CLSBPF_DESTROY,
+	TC_CLSBPF_OFFLOAD,
 	TC_CLSBPF_STATS,
 };
 
@@ -705,6 +703,7 @@ struct tc_cls_bpf_offload {
 	enum tc_clsbpf_command command;
 	struct tcf_exts *exts;
 	struct bpf_prog *prog;
+	struct bpf_prog *oldprog;
 	const char *name;
 	bool exts_integrated;
 	u32 gen_flags;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 6fe798c2df1a..8d78e7f4ecc3 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -42,7 +42,6 @@ struct cls_bpf_prog {
 	struct list_head link;
 	struct tcf_result res;
 	bool exts_integrated;
-	bool offloaded;
 	u32 gen_flags;
 	struct tcf_exts exts;
 	u32 handle;
@@ -148,33 +147,37 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog)
 }
 
 static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
-			       enum tc_clsbpf_command cmd)
+			       struct cls_bpf_prog *oldprog)
 {
-	bool addorrep = cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE;
 	struct tcf_block *block = tp->chain->block;
-	bool skip_sw = tc_skip_sw(prog->gen_flags);
 	struct tc_cls_bpf_offload cls_bpf = {};
+	struct cls_bpf_prog *obj;
+	bool skip_sw;
 	int err;
 
+	skip_sw = prog && tc_skip_sw(prog->gen_flags);
+	obj = prog ?: oldprog;
+
 	tc_cls_common_offload_init(&cls_bpf.common, tp);
-	cls_bpf.command = cmd;
-	cls_bpf.exts = &prog->exts;
-	cls_bpf.prog = prog->filter;
-	cls_bpf.name = prog->bpf_name;
-	cls_bpf.exts_integrated = prog->exts_integrated;
-	cls_bpf.gen_flags = prog->gen_flags;
+	cls_bpf.command = TC_CLSBPF_OFFLOAD;
+	cls_bpf.exts = &obj->exts;
+	cls_bpf.prog = prog ? prog->filter : NULL;
+	cls_bpf.oldprog = oldprog ? oldprog->filter : NULL;
+	cls_bpf.name = obj->bpf_name;
+	cls_bpf.exts_integrated = obj->exts_integrated;
+	cls_bpf.gen_flags = obj->gen_flags;
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
-	if (addorrep) {
+	if (prog) {
 		if (err < 0) {
-			cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
+			cls_bpf_offload_cmd(tp, oldprog, prog);
 			return err;
 		} else if (err > 0) {
 			prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
 		}
 	}
 
-	if (addorrep && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
+	if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
 		return -EINVAL;
 
 	return 0;
@@ -183,38 +186,17 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 			   struct cls_bpf_prog *oldprog)
 {
-	struct cls_bpf_prog *obj = prog;
-	enum tc_clsbpf_command cmd;
-	bool skip_sw;
-	int ret;
-
-	skip_sw = tc_skip_sw(prog->gen_flags) ||
-		(oldprog && tc_skip_sw(oldprog->gen_flags));
-
-	if (oldprog && oldprog->offloaded) {
-		if (!tc_skip_hw(prog->gen_flags)) {
-			cmd = TC_CLSBPF_REPLACE;
-		} else if (!tc_skip_sw(prog->gen_flags)) {
-			obj = oldprog;
-			cmd = TC_CLSBPF_DESTROY;
-		} else {
-			return -EINVAL;
-		}
-	} else {
-		if (tc_skip_hw(prog->gen_flags))
-			return skip_sw ? -EINVAL : 0;
-		cmd = TC_CLSBPF_ADD;
-	}
-
-	ret = cls_bpf_offload_cmd(tp, obj, cmd);
-	if (ret)
-		return ret;
+	if (prog && oldprog && prog->gen_flags != oldprog->gen_flags)
+		return -EINVAL;
 
-	obj->offloaded = true;
-	if (oldprog)
-		oldprog->offloaded = false;
+	if (prog && tc_skip_hw(prog->gen_flags))
+		prog = NULL;
+	if (oldprog && tc_skip_hw(oldprog->gen_flags))
+		oldprog = NULL;
+	if (!prog && !oldprog)
+		return 0;
 
-	return 0;
+	return cls_bpf_offload_cmd(tp, prog, oldprog);
 }
 
 static void cls_bpf_stop_offload(struct tcf_proto *tp,
@@ -222,25 +204,26 @@ static void cls_bpf_stop_offload(struct tcf_proto *tp,
 {
 	int err;
 
-	if (!prog->offloaded)
-		return;
-
-	err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY);
-	if (err) {
+	err = cls_bpf_offload_cmd(tp, NULL, prog);
+	if (err)
 		pr_err("Stopping hardware offload failed: %d\n", err);
-		return;
-	}
-
-	prog->offloaded = false;
 }
 
 static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
 					 struct cls_bpf_prog *prog)
 {
-	if (!prog->offloaded)
-		return;
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_bpf_offload cls_bpf = {};
+
+	tc_cls_common_offload_init(&cls_bpf.common, tp);
+	cls_bpf.command = TC_CLSBPF_STATS;
+	cls_bpf.exts = &prog->exts;
+	cls_bpf.prog = prog->filter;
+	cls_bpf.name = prog->bpf_name;
+	cls_bpf.exts_integrated = prog->exts_integrated;
+	cls_bpf.gen_flags = prog->gen_flags;
 
-	cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS);
+	tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, false);
 }
 
 static int cls_bpf_init(struct tcf_proto *tp)
-- 
2.15.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox