Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 5/9] net/mlx5: Put elements related to offloaded TC rule in one struct
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Put the representors related to the source and dest vports and the
action in struct mlx5_esw_flow_attr which is used while setting the FDB rule.

This patch doesn't change any functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 51 ++++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 10 ++++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  9 ++--
 3 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 783e122..3eb319b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -39,6 +39,7 @@
 #include <linux/rhashtable.h>
 #include <net/switchdev.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -47,6 +48,7 @@ struct mlx5e_tc_flow {
 	struct rhash_head	node;
 	u64			cookie;
 	struct mlx5_flow_rule	*rule;
+	struct mlx5_esw_flow_attr *attr;
 };
 
 #define MLX5E_TC_TABLE_NUM_ENTRIES 1024
@@ -114,15 +116,11 @@ err_create_ft:
 
 static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 						    struct mlx5_flow_spec *spec,
-						    u32 action, u32 dst_vport)
+						    struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct mlx5_eswitch_rep *rep = priv->ppriv;
-	u32 src_vport;
 
-	src_vport = rep->vport;
-
-	return mlx5_eswitch_add_offloaded_rule(esw, spec, action, src_vport, dst_vport);
+	return mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
 }
 
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
@@ -358,7 +356,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 }
 
 static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
-				u32 *action, u32 *dest_vport)
+				struct mlx5_esw_flow_attr *attr)
 {
 	const struct tc_action *a;
 	LIST_HEAD(actions);
@@ -366,17 +364,18 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	if (tc_no_actions(exts))
 		return -EINVAL;
 
-	*action = 0;
+	memset(attr, 0, sizeof(*attr));
+	attr->in_rep = priv->ppriv;
 
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
 		/* Only support a single action per rule */
-		if (*action)
+		if (attr->action)
 			return -EINVAL;
 
 		if (is_tcf_gact_shot(a)) {
-			*action = MLX5_FLOW_CONTEXT_ACTION_DROP |
-				  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			attr->action = MLX5_FLOW_CONTEXT_ACTION_DROP |
+				       MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			continue;
 		}
 
@@ -384,7 +383,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			int ifindex = tcf_mirred_ifindex(a);
 			struct net_device *out_dev;
 			struct mlx5e_priv *out_priv;
-			struct mlx5_eswitch_rep *out_rep;
 
 			out_dev = __dev_get_by_index(dev_net(priv->netdev), ifindex);
 
@@ -394,10 +392,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				return -EINVAL;
 			}
 
+			attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 			out_priv = netdev_priv(out_dev);
-			out_rep  = out_priv->ppriv;
-			*dest_vport = out_rep->vport;
-			*action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			attr->out_rep = out_priv->ppriv;
 			continue;
 		}
 
@@ -411,18 +408,27 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 {
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
 	int err = 0;
-	u32 flow_tag, action, dest_vport = 0;
+	bool fdb_flow = false;
+	u32 flow_tag, action;
 	struct mlx5e_tc_flow *flow;
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_rule *old = NULL;
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
+	if (esw && esw->mode == SRIOV_OFFLOADS)
+		fdb_flow = true;
+
 	flow = rhashtable_lookup_fast(&tc->ht, &f->cookie,
 				      tc->ht_params);
-	if (flow)
+	if (flow) {
 		old = flow->rule;
-	else
-		flow = kzalloc(sizeof(*flow), GFP_KERNEL);
+	} else {
+		if (fdb_flow)
+			flow = kzalloc(sizeof(*flow) + sizeof(struct mlx5_esw_flow_attr),
+				       GFP_KERNEL);
+		else
+			flow = kzalloc(sizeof(*flow), GFP_KERNEL);
+	}
 
 	spec = mlx5_vzalloc(sizeof(*spec));
 	if (!spec || !flow) {
@@ -436,11 +442,12 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	if (err < 0)
 		goto err_free;
 
-	if (esw && esw->mode == SRIOV_OFFLOADS) {
-		err = parse_tc_fdb_actions(priv, f->exts, &action, &dest_vport);
+	if (fdb_flow) {
+		flow->attr  = (struct mlx5_esw_flow_attr *)(flow + 1);
+		err = parse_tc_fdb_actions(priv, f->exts, flow->attr);
 		if (err < 0)
 			goto err_free;
-		flow->rule = mlx5e_tc_add_fdb_flow(priv, spec, action, dest_vport);
+		flow->rule = mlx5e_tc_add_fdb_flow(priv, spec, flow->attr);
 	} else {
 		err = parse_tc_nic_actions(priv, f->exts, &action, &flow_tag);
 		if (err < 0)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 4f5391a..eeeeadc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -238,11 +238,12 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 				 struct ifla_vf_stats *vf_stats);
 
 struct mlx5_flow_spec;
+struct mlx5_esw_flow_attr;
 
 struct mlx5_flow_rule *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
-				u32 action, u32 src_vport, u32 dst_vport);
+				struct mlx5_esw_flow_attr *attr);
 struct mlx5_flow_rule *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
 
@@ -251,6 +252,13 @@ enum {
 	SET_VLAN_INSERT	= BIT(1)
 };
 
+struct mlx5_esw_flow_attr {
+	struct mlx5_eswitch_rep *in_rep;
+	struct mlx5_eswitch_rep *out_rep;
+
+	int	action;
+};
+
 int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 				 struct mlx5_eswitch_rep *rep,
 				 u16 *sqns_array, int sqns_num);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b901cd4..c0d9d1a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -46,19 +46,22 @@ enum {
 struct mlx5_flow_rule *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
-				u32 action, u32 src_vport, u32 dst_vport)
+				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest = { 0 };
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_rule *rule;
 	void *misc;
+	int action;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
+	action = attr->action;
+
 	if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-		dest.vport_num = dst_vport;
+		dest.vport_num = attr->out_rep->vport;
 		action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	} else if (action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
 		counter = mlx5_fc_create(esw->dev, true);
@@ -69,7 +72,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	}
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
-	MLX5_SET(fte_match_set_misc, misc, source_port, src_vport);
+	MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport);
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 4/9] net/mlx5: E-Switch, Allow fine tuning of eswitch vport push/pop vlan
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

The HW can be programmed to push vlan, pop vlan or both.

A factorization step towards using the push/pop capabilties in the
eswitch offloads mode. This patch doesn't add new functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 33 +++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |  5 ++++
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 4927494..6605453 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -127,7 +127,7 @@ static int modify_esw_vport_context_cmd(struct mlx5_core_dev *dev, u16 vport,
 }
 
 static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport,
-				  u16 vlan, u8 qos, bool set)
+				  u16 vlan, u8 qos, u8 set_flags)
 {
 	u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {0};
 
@@ -135,14 +135,18 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport,
 	    !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist))
 		return -ENOTSUPP;
 
-	esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%d\n",
-		  vport, vlan, qos, set);
-	if (set) {
+	esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%x\n",
+		  vport, vlan, qos, set_flags);
+
+	if (set_flags & SET_VLAN_STRIP)
 		MLX5_SET(modify_esw_vport_context_in, in,
 			 esw_vport_context.vport_cvlan_strip, 1);
+
+	if (set_flags & SET_VLAN_INSERT) {
 		/* insert only if no vlan in packet */
 		MLX5_SET(modify_esw_vport_context_in, in,
 			 esw_vport_context.vport_cvlan_insert, 1);
+
 		MLX5_SET(modify_esw_vport_context_in, in,
 			 esw_vport_context.cvlan_pcp, qos);
 		MLX5_SET(modify_esw_vport_context_in, in,
@@ -1777,25 +1781,21 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
 	return 0;
 }
 
-int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
-				int vport, u16 vlan, u8 qos)
+int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				  int vport, u16 vlan, u8 qos, u8 set_flags)
 {
 	struct mlx5_vport *evport;
 	int err = 0;
-	int set = 0;
 
 	if (!ESW_ALLOWED(esw))
 		return -EPERM;
 	if (!LEGAL_VPORT(esw, vport) || (vlan > 4095) || (qos > 7))
 		return -EINVAL;
 
-	if (vlan || qos)
-		set = 1;
-
 	mutex_lock(&esw->state_lock);
 	evport = &esw->vports[vport];
 
-	err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set);
+	err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags);
 	if (err)
 		goto unlock;
 
@@ -1813,6 +1813,17 @@ unlock:
 	return err;
 }
 
+int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				int vport, u16 vlan, u8 qos)
+{
+	u8 set_flags = 0;
+
+	if (vlan || qos)
+		set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT;
+
+	return __mlx5_eswitch_set_vport_vlan(esw, vport, vlan, qos, set_flags);
+}
+
 int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
 				    int vport, bool spoofchk)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ebfcde0..4f5391a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -246,6 +246,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 struct mlx5_flow_rule *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
 
+enum {
+	SET_VLAN_STRIP	= BIT(0),
+	SET_VLAN_INSERT	= BIT(1)
+};
+
 int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 				 struct mlx5_eswitch_rep *rep,
 				 u16 *sqns_array, int sqns_num);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 8/9] net/mlx5e: Add TC vlan action for SRIOV offloads
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Parse TC vlan actions and set the required elements to allow offloading.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 43 ++++++++++++++++++-------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 3eb319b..e61bd52 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -119,17 +119,27 @@ static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 						    struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	int err;
+
+	err = mlx5_eswitch_add_vlan_action(esw, attr);
+	if (err)
+		return ERR_PTR(err);
 
 	return mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
 }
 
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
-			      struct mlx5_flow_rule *rule)
+			      struct mlx5_flow_rule *rule,
+			      struct mlx5_esw_flow_attr *attr)
 {
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_fc *counter = NULL;
 
 	counter = mlx5_flow_rule_counter(rule);
 
+	if (esw && esw->mode == SRIOV_OFFLOADS)
+		mlx5_eswitch_del_vlan_action(esw, attr);
+
 	mlx5_del_flow_rule(rule);
 
 	mlx5_fc_destroy(priv->mdev, counter);
@@ -369,13 +379,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
-		/* Only support a single action per rule */
-		if (attr->action)
-			return -EINVAL;
-
 		if (is_tcf_gact_shot(a)) {
-			attr->action = MLX5_FLOW_CONTEXT_ACTION_DROP |
-				       MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
+					MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			continue;
 		}
 
@@ -392,12 +398,25 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				return -EINVAL;
 			}
 
-			attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 			out_priv = netdev_priv(out_dev);
 			attr->out_rep = out_priv->ppriv;
 			continue;
 		}
 
+		if (is_tcf_vlan(a)) {
+			if (tcf_vlan_action(a) == VLAN_F_POP) {
+				attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
+			} else if (tcf_vlan_action(a) == VLAN_F_PUSH) {
+				if (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q))
+					return -EOPNOTSUPP;
+
+				attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH;
+				attr->vlan = tcf_vlan_push_vid(a);
+			}
+			continue;
+		}
+
 		return -EINVAL;
 	}
 	return 0;
@@ -413,6 +432,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	struct mlx5e_tc_flow *flow;
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_rule *old = NULL;
+	struct mlx5_esw_flow_attr *old_attr;
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
 	if (esw && esw->mode == SRIOV_OFFLOADS)
@@ -422,6 +442,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 				      tc->ht_params);
 	if (flow) {
 		old = flow->rule;
+		old_attr = flow->attr;
 	} else {
 		if (fdb_flow)
 			flow = kzalloc(sizeof(*flow) + sizeof(struct mlx5_esw_flow_attr),
@@ -466,7 +487,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 		goto err_del_rule;
 
 	if (old)
-		mlx5e_tc_del_flow(priv, old);
+		mlx5e_tc_del_flow(priv, old, old_attr);
 
 	goto out;
 
@@ -494,7 +515,7 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 
 	rhashtable_remove_fast(&tc->ht, &flow->node, tc->ht_params);
 
-	mlx5e_tc_del_flow(priv, flow->rule);
+	mlx5e_tc_del_flow(priv, flow->rule, flow->attr);
 
 	kfree(flow);
 
@@ -551,7 +572,7 @@ static void _mlx5e_tc_del_flow(void *ptr, void *arg)
 	struct mlx5e_tc_flow *flow = ptr;
 	struct mlx5e_priv *priv = arg;
 
-	mlx5e_tc_del_flow(priv, flow->rule);
+	mlx5e_tc_del_flow(priv, flow->rule, flow->attr);
 	kfree(flow);
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 2/9] net/mlx5: E-Switch, Set the vport when registering the uplink rep
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Set the vport value in the PF entry to be that of the uplink so
we can use it blindly over the tc / eswitch offload code without
translating it each time we deal with the uplink representor.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  6 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 10 ++------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  3 ++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 27 +++++++++++-----------
 4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a9fc9d4..b309e7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3726,9 +3726,9 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 		mlx5_query_nic_vport_mac_address(mdev, 0, rep.hw_id);
 		rep.load = mlx5e_nic_rep_load;
 		rep.unload = mlx5e_nic_rep_unload;
-		rep.vport = 0;
+		rep.vport = FDB_UPLINK_VPORT;
 		rep.priv_data = priv;
-		mlx5_eswitch_register_vport_rep(esw, &rep);
+		mlx5_eswitch_register_vport_rep(esw, 0, &rep);
 	}
 }
 
@@ -3867,7 +3867,7 @@ static void mlx5e_register_vport_rep(struct mlx5_core_dev *mdev)
 		rep.unload = mlx5e_vport_rep_unload;
 		rep.vport = vport;
 		ether_addr_copy(rep.hw_id, mac);
-		mlx5_eswitch_register_vport_rep(esw, &rep);
+		mlx5_eswitch_register_vport_rep(esw, vport, &rep);
 	}
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 22cfc4a..783e122 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -120,10 +120,7 @@ static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5_eswitch_rep *rep = priv->ppriv;
 	u32 src_vport;
 
-	if (rep->vport) /* set source vport for the flow */
-		src_vport = rep->vport;
-	else
-		src_vport = FDB_UPLINK_VPORT;
+	src_vport = rep->vport;
 
 	return mlx5_eswitch_add_offloaded_rule(esw, spec, action, src_vport, dst_vport);
 }
@@ -399,10 +396,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 			out_priv = netdev_priv(out_dev);
 			out_rep  = out_priv->ppriv;
-			if (out_rep->vport == 0)
-				*dest_vport = FDB_UPLINK_VPORT;
-			else
-				*dest_vport = out_rep->vport;
+			*dest_vport = out_rep->vport;
 			*action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 			continue;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b96e8c9..6d8c5a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -254,9 +254,10 @@ void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
 int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
 int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
+				     int vport_index,
 				     struct mlx5_eswitch_rep *rep);
 void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
-				       int vport);
+				       int vport_index);
 
 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 3dc83a9..a73721b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -144,16 +144,12 @@ int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 {
 	struct mlx5_flow_rule *flow_rule;
 	struct mlx5_esw_sq *esw_sq;
-	int vport;
 	int err;
 	int i;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return 0;
 
-	vport = rep->vport == 0 ?
-		FDB_UPLINK_VPORT : rep->vport;
-
 	for (i = 0; i < sqns_num; i++) {
 		esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
 		if (!esw_sq) {
@@ -163,7 +159,7 @@ int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 
 		/* Add re-inject rule to the PF/representor sqs */
 		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw,
-								vport,
+								rep->vport,
 								sqns_array[i]);
 		if (IS_ERR(flow_rule)) {
 			err = PTR_ERR(flow_rule);
@@ -612,27 +608,30 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 }
 
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
-				     struct mlx5_eswitch_rep *rep)
+				     int vport_index,
+				     struct mlx5_eswitch_rep *__rep)
 {
 	struct mlx5_esw_offload *offloads = &esw->offloads;
+	struct mlx5_eswitch_rep *rep;
+
+	rep = &offloads->vport_reps[vport_index];
 
-	memcpy(&offloads->vport_reps[rep->vport], rep,
-	       sizeof(struct mlx5_eswitch_rep));
+	memcpy(rep, __rep, sizeof(struct mlx5_eswitch_rep));
 
-	INIT_LIST_HEAD(&offloads->vport_reps[rep->vport].vport_sqs_list);
-	offloads->vport_reps[rep->vport].valid = true;
+	INIT_LIST_HEAD(&rep->vport_sqs_list);
+	rep->valid = true;
 }
 
 void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
-				       int vport)
+				       int vport_index)
 {
 	struct mlx5_esw_offload *offloads = &esw->offloads;
 	struct mlx5_eswitch_rep *rep;
 
-	rep = &offloads->vport_reps[vport];
+	rep = &offloads->vport_reps[vport_index];
 
-	if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport].enabled)
+	if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled)
 		rep->unload(esw, rep);
 
-	offloads->vport_reps[vport].valid = false;
+	rep->valid = false;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 3/9] net/mlx5: E-Switch, Set vport representor fields explicitly on registration
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

The structure we use for the eswitch vport representor (mlx5_eswitch_rep)
has some fields which are set from upper layers in the driver when they
register the rep. Use explicit setting on registration time for them and
avoid global memcpy. This patch doesn't add new functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 +++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 6d8c5a2..ebfcde0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -178,11 +178,12 @@ struct mlx5_eswitch_rep {
 	void		       (*unload)(struct mlx5_eswitch *esw,
 					 struct mlx5_eswitch_rep *rep);
 	u16		       vport;
-	struct mlx5_flow_rule *vport_rx_rule;
+	u8		       hw_id[ETH_ALEN];
 	void		      *priv_data;
+
+	struct mlx5_flow_rule *vport_rx_rule;
 	struct list_head       vport_sqs_list;
 	bool		       valid;
-	u8		       hw_id[ETH_ALEN];
 };
 
 struct mlx5_esw_offload {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a73721b..b901cd4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -616,7 +616,13 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
 
 	rep = &offloads->vport_reps[vport_index];
 
-	memcpy(rep, __rep, sizeof(struct mlx5_eswitch_rep));
+	memset(rep, 0, sizeof(*rep));
+
+	rep->load   = __rep->load;
+	rep->unload = __rep->unload;
+	rep->vport  = __rep->vport;
+	rep->priv_data = __rep->priv_data;
+	ether_addr_copy(rep->hw_id, __rep->hw_id);
 
 	INIT_LIST_HEAD(&rep->vport_sqs_list);
 	rep->valid = true;
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 9/9] net/mlx5e: Add TC vlan match parsing
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Enhance the parsing of offloaded TC rules matches to handle vlans.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index e61bd52..a350b71 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -164,6 +164,7 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
 	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
 	      BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_VLAN) |
 	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
 	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
 	      BIT(FLOW_DISSECTOR_KEY_PORTS))) {
@@ -227,6 +228,24 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 				key->src);
 	}
 
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
+		struct flow_dissector_key_vlan *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->key);
+		struct flow_dissector_key_vlan *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->mask);
+		if (mask->vlan_id) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, vlan_tag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, vlan_tag, 1);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, mask->vlan_id);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, key->vlan_id);
+		}
+	}
+
 	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
 		struct flow_dissector_key_ipv4_addrs *key =
 			skb_flow_dissector_target(f->dissector,
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 6/9] net/mlx5e: Refactor retrival of skb from rx completion element (cqe)
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Factor the relevant code into a static inline helper (skb_from_cqe)
doing that.

Move the call to napi_gro_receive to be carried out just
after mlx5e_complete_rx_cqe returns.

Both changes are to be used for the VF representor as well
in the next commit.

This patch doesn't change any functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 41 +++++++++++++++++--------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 0a81bd3..e836e47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -629,7 +629,6 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
 	rq->stats.packets++;
 	rq->stats.bytes += cqe_bcnt;
 	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
-	napi_gro_receive(rq->cq.napi, skb);
 }
 
 static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_sq *sq)
@@ -733,20 +732,15 @@ static inline bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
 	}
 }
 
-void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+static inline
+struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     u16 wqe_counter, u32 cqe_bcnt)
 {
 	struct bpf_prog *xdp_prog = READ_ONCE(rq->xdp_prog);
 	struct mlx5e_dma_info *di;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	void *va, *data;
-	u32 cqe_bcnt;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
 	di             = &rq->dma_info[wqe_counter];
 	va             = page_address(di->page);
 	data           = va + MLX5_RX_HEADROOM;
@@ -757,22 +751,21 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 				      rq->buff.wqe_sz,
 				      DMA_FROM_DEVICE);
 	prefetch(data);
-	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
 		rq->stats.wqe_err++;
 		mlx5e_page_release(rq, di, true);
-		goto wq_ll_pop;
+		return NULL;
 	}
 
 	if (mlx5e_xdp_handle(rq, xdp_prog, di, data, cqe_bcnt))
-		goto wq_ll_pop; /* page/packet was consumed by XDP */
+		return NULL; /* page/packet was consumed by XDP */
 
 	skb = build_skb(va, RQ_PAGE_SIZE(rq));
 	if (unlikely(!skb)) {
 		rq->stats.buff_alloc_err++;
 		mlx5e_page_release(rq, di, true);
-		goto wq_ll_pop;
+		return NULL;
 	}
 
 	/* queue up for recycling ..*/
@@ -782,7 +775,28 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	skb_reserve(skb, MLX5_RX_HEADROOM);
 	skb_put(skb, cqe_bcnt);
 
+	return skb;
+}
+
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5e_rx_wqe *wqe;
+	__be16 wqe_counter_be;
+	struct sk_buff *skb;
+	u16 wqe_counter;
+	u32 cqe_bcnt;
+
+	wqe_counter_be = cqe->wqe_counter;
+	wqe_counter    = be16_to_cpu(wqe_counter_be);
+	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+
+	skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
+	if (!skb)
+		goto wq_ll_pop;
+
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	napi_gro_receive(rq->cq.napi, skb);
 
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
@@ -861,6 +875,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	napi_gro_receive(rq->cq.napi, skb);
 
 mpwrq_cqe_out:
 	if (likely(wi->consumed_strides < rq->mpwqe_num_strides))
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 0/9] Mellanox 100G SRIOV offloads vlan push/pop
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed

Hi Dave,

>From Or Gerlitz:

This series further enhances the SRIOV TC offloads of mlx5 to handle
the TC vlan push and pop actions. This serves a common use-case in
virtualization systems where the virtual switch add (push) vlan tags
to packets sent from VMs and removes (pop) vlan tags before the packet
is received by the VM. We use the new E-Switch switchdev mode and the
TC vlan action to achieve that also in SW defined SRIOV environments by
offloading TC rules that contain this action along with forwarding
(TC mirred/redirect action) the packet.

In the first patch we add some helpers to access the TC vlan action info
by offloading drivers. The next five patches don't add any new functionality,
they do some refactoring and cleanups in the current code to be used next.

The seventh patch deals with supporting vlans by the mlx5 e-switch in switchdev
mode. The eighth patch does the vlan action offload from TC and the last patch
adds matching for vlans as typically required by TC flows that involve vlan
pop action.

The series was applied on top of commit 524605e "cxgb4: Convert to use simple_open()"

Thanks.

Or Gerlitz (9):
  net_sched: act_vlan: add helper inlines to access tcf_vlan info
  net/mlx5: E-Switch, Set the vport when registering the uplink rep
  net/mlx5: E-Switch, Set vport representor fields explicitly on
    registration
  net/mlx5: E-Switch, Allow fine tuning of eswitch vport push/pop vlan
  net/mlx5: Put elements related to offloaded TC rule in one struct
  net/mlx5e: Refactor retrival of skb from rx completion element (cqe)
  net/mlx5: E-Switch, Support VLAN actions in the offloads mode
  net/mlx5e: Add TC vlan action for SRIOV offloads
  net/mlx5e: Add TC vlan match parsing

 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  27 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  74 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 109 ++++++----
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  33 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  38 +++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 222 +++++++++++++++++++--
 include/net/tc_act/tc_vlan.h                       |  25 +++
 8 files changed, 446 insertions(+), 83 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next 1/9] net_sched: act_vlan: add helper inlines to access tcf_vlan info
From: Saeed Mahameed @ 2016-09-22 17:01 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
	Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
In-Reply-To: <1474563709-17943-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Needed e.g for offloading drivers to pick the relevant attributes.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/tc_act/tc_vlan.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 6b83588..48cca32 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -11,6 +11,7 @@
 #define __NET_TC_VLAN_H
 
 #include <net/act_api.h>
+#include <linux/tc_act/tc_vlan.h>
 
 #define VLAN_F_POP		0x1
 #define VLAN_F_PUSH		0x2
@@ -24,4 +25,28 @@ struct tcf_vlan {
 };
 #define to_vlan(a) ((struct tcf_vlan *)a)
 
+static inline bool is_tcf_vlan(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->type == TCA_ACT_VLAN)
+		return true;
+#endif
+	return false;
+}
+
+static inline u32 tcf_vlan_action(const struct tc_action *a)
+{
+	return to_vlan(a)->tcfv_action;
+}
+
+static inline u16 tcf_vlan_push_vid(const struct tc_action *a)
+{
+	return to_vlan(a)->tcfv_push_vid;
+}
+
+static inline __be16 tcf_vlan_push_proto(const struct tc_action *a)
+{
+	return to_vlan(a)->tcfv_push_proto;
+}
+
 #endif /* __NET_TC_VLAN_H */
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v2] fs/select: add vmalloc fallback for select(2)
From: Vlastimil Babka @ 2016-09-22 16:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexander Viro, Andrew Morton, linux-fsdevel, linux-kernel,
	linux-mm, Michal Hocko, netdev
In-Reply-To: <1474562982.23058.140.camel@edumazet-glaptop3.roam.corp.google.com>

On 09/22/2016 06:49 PM, Eric Dumazet wrote:
> On Thu, 2016-09-22 at 18:43 +0200, Vlastimil Babka wrote:
>> The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
>> with the number of fds passed. We had a customer report page allocation
>> failures of order-4 for this allocation. This is a costly order, so it might
>> easily fail, as the VM expects such allocation to have a lower-order fallback.
>>
>> Such trivial fallback is vmalloc(), as the memory doesn't have to be
>> physically contiguous. Also the allocation is temporary for the duration of the
>> syscall, so it's unlikely to stress vmalloc too much.
>
> vmalloc() uses a vmap_area_lock spinlock, and TLB flushes.
>
> So I guess allowing vmalloc() being called from an innocent application
> doing a select() might be dangerous, especially if this select() happens
> thousands of time per second.

Isn't seq_buf_alloc() similarly exposed? And ipc_alloc()?

^ permalink raw reply

* Re: [PATCH v3 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates
From: Jan Engelhardt @ 2016-09-22 16:53 UTC (permalink / raw)
  To: Vishwanath Pai
  Cc: pablo, kaber, kadlec, johunt, netfilter-devel, coreteam, netdev,
	pai.vishwain
In-Reply-To: <20160922164344.GB20423@akamai.com>


On Thursday 2016-09-22 18:43, Vishwanath Pai wrote:
>+struct hashlimit_cfg2 {
>+	__u32 mode;	  /* bitmask of XT_HASHLIMIT_HASH_* */
>+	__u64 avg;    /* Average secs between packets * scale */
>+	__u64 burst;  /* Period multiplier for upper limit. */

This would have different sizes between i386 and x86_64,
necessiting additional compat functions. It should be padded
or reordered instead.

>+
>+	/* user specified */
>+	__u32 size;		/* how many buckets */
>+	__u32 max;		/* max number of entries */
>+	__u32 gc_interval;	/* gc interval */
>+	__u32 expire;	/* when do entries expire? */
>+
>+	__u8 srcmask, dstmask;
>+};
>+
>+struct xt_hashlimit_mtinfo2 {
>+	char name[NAME_MAX];
>+	struct hashlimit_cfg2 cfg;
>+
>+	/* Used internally by the kernel */
>+	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
>+};


^ permalink raw reply

* Re: [PATCH v2] fs/select: add vmalloc fallback for select(2)
From: Eric Dumazet @ 2016-09-22 16:49 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Alexander Viro, Andrew Morton, linux-fsdevel, linux-kernel,
	linux-mm, Michal Hocko, netdev
In-Reply-To: <20160922164359.9035-1-vbabka@suse.cz>

On Thu, 2016-09-22 at 18:43 +0200, Vlastimil Babka wrote:
> The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
> with the number of fds passed. We had a customer report page allocation
> failures of order-4 for this allocation. This is a costly order, so it might
> easily fail, as the VM expects such allocation to have a lower-order fallback.
> 
> Such trivial fallback is vmalloc(), as the memory doesn't have to be
> physically contiguous. Also the allocation is temporary for the duration of the
> syscall, so it's unlikely to stress vmalloc too much.

vmalloc() uses a vmap_area_lock spinlock, and TLB flushes.

So I guess allowing vmalloc() being called from an innocent application
doing a select() might be dangerous, especially if this select() happens
thousands of time per second.





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-next] Documentation: devicetree: revise ethernet device-tree binding about TRGMII
From: Sergei Shtylyov @ 2016-09-22 16:48 UTC (permalink / raw)
  To: sean.wang, davem, john
  Cc: nbd, netdev, linux-kernel, linux-mediatek, devicetree, keyhaede,
	objelf
In-Reply-To: <1474560996-20863-1-git-send-email-sean.wang@mediatek.com>

On 09/22/2016 07:16 PM, sean.wang@mediatek.com wrote:

> From: Sean Wang <sean.wang@mediatek.com>
>
> fix typo in mediatek-net.txt and add phy-mode "trgmii" to ethernet.txt

    These changes are unrelated to each other, so there should be 2 separate 
patches. And have the patches I reviewed been merged already, why are you 
sending an incremental patch?

> Cc: devicetree@vger.kernel.org
> Reported-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> Signed-off-by: Sean Wang <sean.wang@mediatek.com>
[...]

MBR, Sergei

^ permalink raw reply

* [PATCH v2] fs/select: add vmalloc fallback for select(2)
From: Vlastimil Babka @ 2016-09-22 16:43 UTC (permalink / raw)
  To: Alexander Viro, Andrew Morton
  Cc: linux-fsdevel, linux-kernel, linux-mm, Michal Hocko, netdev,
	Eric Dumazet, Vlastimil Babka

The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
with the number of fds passed. We had a customer report page allocation
failures of order-4 for this allocation. This is a costly order, so it might
easily fail, as the VM expects such allocation to have a lower-order fallback.

Such trivial fallback is vmalloc(), as the memory doesn't have to be
physically contiguous. Also the allocation is temporary for the duration of the
syscall, so it's unlikely to stress vmalloc too much.

Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
it doesn't need this kind of fallback.

[eric.dumazet@gmail.com: fix failure path logic]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 fs/select.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 8ed9da50896a..b99e98524fde 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -29,6 +29,7 @@
 #include <linux/sched/rt.h>
 #include <linux/freezer.h>
 #include <net/busy_poll.h>
+#include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
 
@@ -558,6 +559,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	struct fdtable *fdt;
 	/* Allocate small arguments on the stack to save memory and be faster */
 	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+	unsigned long alloc_size;
 
 	ret = -EINVAL;
 	if (n < 0)
@@ -580,8 +582,12 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	bits = stack_fds;
 	if (size > sizeof(stack_fds) / 6) {
 		/* Not enough space in on-stack array; must use kmalloc */
+		alloc_size = 6 * size;
 		ret = -ENOMEM;
-		bits = kmalloc(6 * size, GFP_KERNEL);
+		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
+		if (!bits && alloc_size > PAGE_SIZE)
+			bits = vmalloc(alloc_size);
+
 		if (!bits)
 			goto out_nofds;
 	}
@@ -618,7 +624,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 
 out:
 	if (bits != stack_fds)
-		kfree(bits);
+		kvfree(bits);
 out_nofds:
 	return ret;
 }
-- 
2.10.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates
From: Vishwanath Pai @ 2016-09-22 16:43 UTC (permalink / raw)
  To: pablo, kaber, kadlec
  Cc: johunt, netfilter-devel, coreteam, netdev, pai.vishwain

V2:
Removed the call to BUG() in cfg_copy, we return -EINVAL
instead and all calls to cfg_copy check for this

V3:
change "revision" in the call to cfg_copy inside htable_create to 2
previously this would pass down revision from the function parameter
this is wrong since *cfg here is always hashlimit_cfg2

There is no change in userspace patch so V2 will work

--

netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Joshua Hunt <johunt@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  23 ++
 net/netfilter/xt_hashlimit.c                | 330 ++++++++++++++++++++++------
 2 files changed, 285 insertions(+), 68 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index ea8c1c0..be5d2e1 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE_v1 10000
+#define XT_HASHLIMIT_SCALE 1000000llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
 	__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+	__u32 mode;	  /* bitmask of XT_HASHLIMIT_HASH_* */
+	__u64 avg;    /* Average secs between packets * scale */
+	__u64 burst;  /* Period multiplier for upper limit. */
+
+	/* user specified */
+	__u32 size;		/* how many buckets */
+	__u32 max;		/* max number of entries */
+	__u32 gc_interval;	/* gc interval */
+	__u32 expire;	/* when do entries expire? */
+
+	__u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
 	char name[IFNAMSIZ];
 	struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
 	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+	char name[NAME_MAX];
+	struct hashlimit_cfg2 cfg;
+
+	/* Used internally by the kernel */
+	struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 6b0ad93..64579f3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
-		u_int32_t credit;
-		u_int32_t credit_cap, cost;
+		u_int64_t credit;
+		u_int64_t credit_cap, cost;
 	} rateinfo;
 	struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
 	u_int8_t family;
 	bool rnd_initialized;
 
-	struct hashlimit_cfg1 cfg;	/* config */
+	struct hashlimit_cfg2 cfg;	/* config */
 
 	/* used internally */
 	spinlock_t lock;		/* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
 	struct hlist_head hash[0];	/* hashtable itself */
 };
 
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+	if (revision == 1) {
+		struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+		to->mode = cfg->mode;
+		to->avg = cfg->avg;
+		to->burst = cfg->burst;
+		to->size = cfg->size;
+		to->max = cfg->max;
+		to->gc_interval = cfg->gc_interval;
+		to->expire = cfg->expire;
+		to->srcmask = cfg->srcmask;
+		to->dstmask = cfg->dstmask;
+	} else if (revision == 2) {
+		memcpy(to, from, sizeof(struct hashlimit_cfg2));
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static DEFINE_MUTEX(hashlimit_mutex);	/* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-			    u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+			 const char *name, u_int8_t family,
+			 struct xt_hashlimit_htable **out_hinfo,
+			 int revision)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
-	unsigned int size;
-	unsigned int i;
+	unsigned int size, i;
+	int ret;
 
-	if (minfo->cfg.size) {
-		size = minfo->cfg.size;
+	if (cfg->size) {
+		size = cfg->size;
 	} else {
 		size = (totalram_pages << PAGE_SHIFT) / 16384 /
 		       sizeof(struct list_head);
@@ -238,10 +265,14 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	                sizeof(struct list_head) * size);
 	if (hinfo == NULL)
 		return -ENOMEM;
-	minfo->hinfo = hinfo;
+	*out_hinfo = hinfo;
 
 	/* copy match config into hashtable config */
-	memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+	ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2);
+
+	if (ret)
+		return ret;
+
 	hinfo->cfg.size = size;
 	if (hinfo->cfg.max == 0)
 		hinfo->cfg.max = 8 * hinfo->cfg.size;
@@ -255,17 +286,18 @@ static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+	hinfo->name = kstrdup(name, GFP_KERNEL);
 	if (!hinfo->name) {
 		vfree(hinfo);
 		return -ENOMEM;
 	}
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde = proc_create_data(minfo->name, 0,
+	hinfo->pde = proc_create_data(name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		&dl_file_ops_v1, hinfo);
+		(revision == 1) ? &dl_file_ops_v1 : &dl_file_ops,
+		hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -399,6 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
 #define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -408,8 +441,11 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
 #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1)
 
+#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ)
 #define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
@@ -425,15 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 }
 
 /* Precision saver. */
-static u32 user2credits(u32 user)
+static u64 user2credits(u64 user, int revision)
 {
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
-		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE_v1) *\
-					HZ * CREDITS_PER_JIFFY_v1;
+	if (revision == 1) {
+		/* If multiplying would overflow... */
+		if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
+			/* Divide first. */
+			return (user / XT_HASHLIMIT_SCALE_v1) *\
+						HZ * CREDITS_PER_JIFFY_v1;
+
+		return (user * HZ * CREDITS_PER_JIFFY_v1) \
+						/ XT_HASHLIMIT_SCALE_v1;
+	} else {
+		if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+			return (user / XT_HASHLIMIT_SCALE) *\
+						HZ * CREDITS_PER_JIFFY;
 
-	return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
+		return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+	}
 }
 
 static u32 user2credits_byte(u32 user)
@@ -443,10 +488,11 @@ static u32 user2credits_byte(u32 user)
 	return (u32) (us >> 32);
 }
 
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+			    u32 mode, int revision)
 {
 	unsigned long delta = now - dh->rateinfo.prev;
-	u32 cap;
+	u64 cap, cpj;
 
 	if (delta == 0)
 		return;
@@ -454,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 	dh->rateinfo.prev = now;
 
 	if (mode & XT_HASHLIMIT_BYTES) {
-		u32 tmp = dh->rateinfo.credit;
+		u64 tmp = dh->rateinfo.credit;
 		dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
 		cap = CREDITS_PER_JIFFY_BYTES * HZ;
 		if (tmp >= dh->rateinfo.credit) {/* overflow */
@@ -462,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 			return;
 		}
 	} else {
-		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
+		cpj = (revision == 1) ?
+			CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY;
+		dh->rateinfo.credit += delta * cpj;
 		cap = dh->rateinfo.credit_cap;
 	}
 	if (dh->rateinfo.credit > cap)
@@ -470,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 }
 
 static void rateinfo_init(struct dsthash_ent *dh,
-			  struct xt_hashlimit_htable *hinfo)
+			  struct xt_hashlimit_htable *hinfo, int revision)
 {
 	dh->rateinfo.prev = jiffies;
 	if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
@@ -479,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
 		dh->rateinfo.credit_cap = hinfo->cfg.burst;
 	} else {
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-						   hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+						   hinfo->cfg.burst, revision);
+		dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision);
 		dh->rateinfo.credit_cap = dh->rateinfo.credit;
 	}
 }
@@ -604,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par,
+		    struct xt_hashlimit_htable *hinfo,
+		    const struct hashlimit_cfg2 *cfg, int revision)
 {
-	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
-	struct xt_hashlimit_htable *hinfo = info->hinfo;
 	unsigned long now = jiffies;
 	struct dsthash_ent *dh;
 	struct dsthash_dst dst;
 	bool race = false;
-	u32 cost;
+	u64 cost;
 
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
@@ -627,18 +675,18 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		} else if (race) {
 			/* Already got an entry, update expiration timeout */
 			dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_recalc(dh, now, hinfo->cfg.mode);
+			rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 		} else {
 			dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-			rateinfo_init(dh, hinfo);
+			rateinfo_init(dh, hinfo, revision);
 		}
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now, hinfo->cfg.mode);
+		rateinfo_recalc(dh, now, hinfo->cfg.mode, revision);
 	}
 
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+	if (cfg->mode & XT_HASHLIMIT_BYTES)
 		cost = hashlimit_byte_cost(skb->len, dh);
 	else
 		cost = dh->rateinfo.cost;
@@ -648,70 +696,126 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 		dh->rateinfo.credit -= cost;
 		spin_unlock(&dh->lock);
 		rcu_read_unlock_bh();
-		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+		return !(cfg->mode & XT_HASHLIMIT_INVERT);
 	}
 
 	spin_unlock(&dh->lock);
 	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
-	return info->cfg.mode & XT_HASHLIMIT_INVERT;
+	return cfg->mode & XT_HASHLIMIT_INVERT;
 
  hotdrop:
 	par->hotdrop = true;
 	return false;
 }
 
-static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+static bool
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+	struct hashlimit_cfg2 cfg = {};
+	int ret;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_common(skb, par, hinfo, &cfg, 1);
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+
+	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2);
+}
+
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
+				     struct xt_hashlimit_htable **hinfo,
+				     struct hashlimit_cfg2 *cfg,
+				     const char *name, int revision)
 {
 	struct net *net = par->net;
-	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	int ret;
 
-	if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
-		return -EINVAL;
-	if (info->name[sizeof(info->name)-1] != '\0')
+	if (cfg->gc_interval == 0 || cfg->expire == 0)
 		return -EINVAL;
 	if (par->family == NFPROTO_IPV4) {
-		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+		if (cfg->srcmask > 32 || cfg->dstmask > 32)
 			return -EINVAL;
 	} else {
-		if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+		if (cfg->srcmask > 128 || cfg->dstmask > 128)
 			return -EINVAL;
 	}
 
-	if (info->cfg.mode & ~XT_HASHLIMIT_ALL) {
+	if (cfg->mode & ~XT_HASHLIMIT_ALL) {
 		pr_info("Unknown mode mask %X, kernel too old?\n",
-						info->cfg.mode);
+						cfg->mode);
 		return -EINVAL;
 	}
 
 	/* Check for overflow. */
-	if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
-		if (user2credits_byte(info->cfg.avg) == 0) {
-			pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+	if (cfg->mode & XT_HASHLIMIT_BYTES) {
+		if (user2credits_byte(cfg->avg) == 0) {
+			pr_info("overflow, rate too high: %llu\n", cfg->avg);
 			return -EINVAL;
 		}
-	} else if (info->cfg.burst == 0 ||
-		    user2credits(info->cfg.avg * info->cfg.burst) <
-		    user2credits(info->cfg.avg)) {
-			pr_info("overflow, try lower: %u/%u\n",
-				info->cfg.avg, info->cfg.burst);
+	} else if (cfg->burst == 0 ||
+		    user2credits(cfg->avg * cfg->burst, revision) <
+		    user2credits(cfg->avg, revision)) {
+			pr_info("overflow, try lower: %llu/%llu\n",
+				cfg->avg, cfg->burst);
 			return -ERANGE;
 	}
 
 	mutex_lock(&hashlimit_mutex);
-	info->hinfo = htable_find_get(net, info->name, par->family);
-	if (info->hinfo == NULL) {
-		ret = htable_create_v1(net, info, par->family);
+	*hinfo = htable_find_get(net, name, par->family);
+	if (*hinfo == NULL) {
+		ret = htable_create(net, cfg, name, par->family,
+				    hinfo, revision);
 		if (ret < 0) {
 			mutex_unlock(&hashlimit_mutex);
 			return ret;
 		}
 	}
 	mutex_unlock(&hashlimit_mutex);
+
 	return 0;
 }
 
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct hashlimit_cfg2 cfg = {};
+	int ret;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	ret = cfg_copy(&cfg, (void *)&info->cfg, 1);
+
+	if (ret)
+		return ret;
+
+	return hashlimit_mt_check_common(par, &info->hinfo,
+					 &cfg, info->name, 1);
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	if (info->name[sizeof(info->name) - 1] != '\0')
+		return -EINVAL;
+
+	return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg,
+					 info->name, 2);
+}
+
 static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -719,6 +823,13 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 	htable_put(info->hinfo);
 }
 
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_hashlimit_mtinfo2 *info = par->matchinfo;
+
+	htable_put(info->hinfo);
+}
+
 static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 	{
 		.name           = "hashlimit",
@@ -730,6 +841,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV4,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	{
 		.name           = "hashlimit",
@@ -741,6 +862,16 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
+	{
+		.name           = "hashlimit",
+		.revision       = 2,
+		.family         = NFPROTO_IPV6,
+		.match          = hashlimit_mt,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo2),
+		.checkentry     = hashlimit_mt_check,
+		.destroy        = hashlimit_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 #endif
 };
 
@@ -787,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
-			       struct seq_file *s)
+static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
+			 struct seq_file *s)
 {
-	const struct xt_hashlimit_htable *ht = s->private;
-
-	spin_lock(&ent->lock);
-	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies, ht->cfg.mode);
-
 	switch (family) {
 	case NFPROTO_IPV4:
-		seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip.src,
 			   ntohs(ent->dst.src_port),
@@ -809,7 +934,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 		break;
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	case NFPROTO_IPV6:
-		seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n",
 			   (long)(ent->expires - jiffies)/HZ,
 			   &ent->dst.ip6.src,
 			   ntohs(ent->dst.src_port),
@@ -822,6 +947,34 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
 	default:
 		BUG();
 	}
+}
+
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1);
+
+	dl_seq_print(ent, family, s);
+
+	spin_unlock(&ent->lock);
+	return seq_has_overflowed(s);
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+			    struct seq_file *s)
+{
+	const struct xt_hashlimit_htable *ht = s->private;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2);
+
+	dl_seq_print(ent, family, s);
+
 	spin_unlock(&ent->lock);
 	return seq_has_overflowed(s);
 }
@@ -840,6 +993,20 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket = (unsigned int *)v;
+	struct dsthash_ent *ent;
+
+	if (!hlist_empty(&htable->hash[*bucket])) {
+		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
+			if (dl_seq_real_show(ent, htable->family, s))
+				return -1;
+	}
+	return 0;
+}
+
 static const struct seq_operations dl_seq_ops_v1 = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
@@ -847,6 +1014,13 @@ static const struct seq_operations dl_seq_ops_v1 = {
 	.show  = dl_seq_show_v1
 };
 
+static const struct seq_operations dl_seq_ops = {
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show
+};
+
 static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
 	int ret = seq_open(file, &dl_seq_ops_v1);
@@ -858,6 +1032,18 @@ static int dl_proc_open_v1(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &dl_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = PDE_DATA(inode);
+	}
+	return ret;
+}
+
 static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
 	.open    = dl_proc_open_v1,
@@ -866,6 +1052,14 @@ static const struct file_operations dl_file_ops_v1 = {
 	.release = seq_release
 };
 
+static const struct file_operations dl_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
 static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
-- 
1.9.1


^ permalink raw reply related

* [PATCH v3 1/2] netfilter: Prepare xt_hashlimit.c for revision 2
From: Vishwanath Pai @ 2016-09-22 16:42 UTC (permalink / raw)
  To: pablo, kaber, kadlec
  Cc: johunt, netfilter-devel, coreteam, netdev, pai.vishwain

I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the
end of the names.

Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Joshua Hunt <johunt@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  2 +-
 net/netfilter/xt_hashlimit.c                | 61 +++++++++++++++--------------
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db9037..ea8c1c0 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -5,7 +5,7 @@
 #include <linux/if.h>
 
 /* timings are in milliseconds. */
-#define XT_HASHLIMIT_SCALE 10000
+#define XT_HASHLIMIT_SCALE_v1 10000
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1786968..6b0ad93 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops;
+static const struct file_operations dl_file_ops_v1;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-			 u_int8_t family)
+static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
+			    u_int8_t family)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 	struct xt_hashlimit_htable *hinfo;
@@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-		&dl_file_ops, hinfo);
+		&dl_file_ops_v1, hinfo);
 	if (hinfo->pde == NULL) {
 		kfree(hinfo->name);
 		vfree(hinfo);
@@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
    (slowest userspace tool allows), which means
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
 
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
  * credit_cap is used as a counter that tells us how many times we can
@@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 static u32 user2credits(u32 user)
 {
 	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1))
 		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+		return (user / XT_HASHLIMIT_SCALE_v1) *\
+					HZ * CREDITS_PER_JIFFY_v1;
 
-	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+	return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
 }
 
 static u32 user2credits_byte(u32 user)
@@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
 			return;
 		}
 	} else {
-		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+		dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
 		cap = dh->rateinfo.credit_cap;
 	}
 	if (dh->rateinfo.credit > cap)
@@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 	struct xt_hashlimit_htable *hinfo = info->hinfo;
@@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return false;
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 {
 	struct net *net = par->net;
 	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -701,7 +702,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 	mutex_lock(&hashlimit_mutex);
 	info->hinfo = htable_find_get(net, info->name, par->family);
 	if (info->hinfo == NULL) {
-		ret = htable_create(net, info, par->family);
+		ret = htable_create_v1(net, info, par->family);
 		if (ret < 0) {
 			mutex_unlock(&hashlimit_mutex);
 			return ret;
@@ -711,7 +712,7 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par)
 {
 	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
 
@@ -723,10 +724,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 1,
 		.family         = NFPROTO_IPV4,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
-		.checkentry     = hashlimit_mt_check,
-		.destroy        = hashlimit_mt_destroy,
+		.checkentry     = hashlimit_mt_check_v1,
+		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -734,10 +735,10 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 		.name           = "hashlimit",
 		.revision       = 1,
 		.family         = NFPROTO_IPV6,
-		.match          = hashlimit_mt,
+		.match          = hashlimit_mt_v1,
 		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
-		.checkentry     = hashlimit_mt_check,
-		.destroy        = hashlimit_mt_destroy,
+		.checkentry     = hashlimit_mt_check_v1,
+		.destroy        = hashlimit_mt_destroy_v1,
 		.me             = THIS_MODULE,
 	},
 #endif
@@ -786,8 +787,8 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 	spin_unlock_bh(&htable->lock);
 }
 
-static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
-				   struct seq_file *s)
+static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
+			       struct seq_file *s)
 {
 	const struct xt_hashlimit_htable *ht = s->private;
 
@@ -825,7 +826,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 	return seq_has_overflowed(s);
 }
 
-static int dl_seq_show(struct seq_file *s, void *v)
+static int dl_seq_show_v1(struct seq_file *s, void *v)
 {
 	struct xt_hashlimit_htable *htable = s->private;
 	unsigned int *bucket = (unsigned int *)v;
@@ -833,22 +834,22 @@ static int dl_seq_show(struct seq_file *s, void *v)
 
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, &htable->hash[*bucket], node)
-			if (dl_seq_real_show(ent, htable->family, s))
+			if (dl_seq_real_show_v1(ent, htable->family, s))
 				return -1;
 	}
 	return 0;
 }
 
-static const struct seq_operations dl_seq_ops = {
+static const struct seq_operations dl_seq_ops_v1 = {
 	.start = dl_seq_start,
 	.next  = dl_seq_next,
 	.stop  = dl_seq_stop,
-	.show  = dl_seq_show
+	.show  = dl_seq_show_v1
 };
 
-static int dl_proc_open(struct inode *inode, struct file *file)
+static int dl_proc_open_v1(struct inode *inode, struct file *file)
 {
-	int ret = seq_open(file, &dl_seq_ops);
+	int ret = seq_open(file, &dl_seq_ops_v1);
 
 	if (!ret) {
 		struct seq_file *sf = file->private_data;
@@ -857,9 +858,9 @@ static int dl_proc_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static const struct file_operations dl_file_ops = {
+static const struct file_operations dl_file_ops_v1 = {
 	.owner   = THIS_MODULE,
-	.open    = dl_proc_open,
+	.open    = dl_proc_open_v1,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH] fs/select: add vmalloc fallback for select(2)
From: Vlastimil Babka @ 2016-09-22 16:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, linux-mm,
	Michal Hocko, netdev
In-Reply-To: <1474561478.23058.127.camel@edumazet-glaptop3.roam.corp.google.com>

On 09/22/2016 06:24 PM, Eric Dumazet wrote:

>> +		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
>> +		if (!bits && alloc_size > PAGE_SIZE) {
>> +			bits = vmalloc(alloc_size);
>> +
>> +			if (!bits)
>> +				goto out_nofds;
>
> Test should happen if alloc_size <= PAGE_SIZE
>
>> +		}
>
> if (!bits && alloc_size > PAGE_SIZE)
>     bits = vmalloc(alloc_size);
>
> if (!bits)
>       goto out_nofds;
>

Thanks... stupid last-minute changes.

^ permalink raw reply

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
From: Eric Dumazet @ 2016-09-22 16:30 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Edward Cree, netdev, David S. Miller, James Morris,
	Trond Myklebust, Alexander Duyck, Daniel Borkmann, Eric Dumazet,
	Tom Herbert, Hannes Frederic Sowa, linux-nfs
In-Reply-To: <1474560864.4845.78.camel@redhat.com>

On Thu, 2016-09-22 at 18:14 +0200, Paolo Abeni wrote:

> I think that the idea behind using atomic ops directly on
> sk_forward_alloc is to avoid adding other fields to the udp_socket. 
> 
> If we can add some fields to the udp_sock structure, the schema proposed
> in this patch should fit better (modulo bugs ;-), always requiring a
> single atomic operation at memory reclaiming time and at memory
> allocation time.

But do we want any additional atomic to begin with ?

Given typical number of UDP sockets on a host, we could reserve/forward
alloc at socket creation time, and when SO_RCVBUF is changed.

I totally understand that this schem does not work with TCP, because TCP
BDP mandates xx MB of buffers per socket, and we really handle millions
of TCP sockets per host,

But for UDP, most applications are working well with a default limit,
and we hardly have more than 1000 UDP sockets.

cat /proc/sys/net/core/rmem_default
212992

^ permalink raw reply

* [PATCH net 0/2] Fix tc-ife bugs
From: Yotam Gigi @ 2016-09-22 12:55 UTC (permalink / raw)
  To: jhs, davem, netdev; +Cc: Yotam Gigi

This patch-set contains two bugfixes in the tc-ife action, one fixing some
random behaviour in encode side, and one fixing the decode side packet
parsing logic.

Yotam Gigi (2):
  act_ife: Fix external mac header on encode
  act_ife: Fix false parsing on decode side

 net/sched/act_ife.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

-- 
2.4.11

^ permalink raw reply

* [PATCH net 2/2] act_ife: Fix false parsing on decode side
From: Yotam Gigi @ 2016-09-22 12:55 UTC (permalink / raw)
  To: jhs, davem, netdev; +Cc: Yotam Gigi
In-Reply-To: <1474548926-22815-1-git-send-email-yotamg@mellanox.com>

On ife decode side, the action iterates over the tlvs in the ife header
and parses them one by one, where in each iteration the current pointer is
advanced according to the tlv size.

Before, the pointer was advanced in a wrong way, as the tlv type and size
bytes were not taken into account. This led to false parsing of ife
header. In addition, due to the fact that the loop counter was unsigned,
it could lead to infinite parsing loop.

This fix changes the loop counter to be signed and fixes the parsing to
take into account the tlv type and size.

Fixes: ef6980b6becb ("net sched: introduce IFE action")
Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
---
 net/sched/act_ife.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 7f71a3d..4786b28 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -627,7 +627,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 	struct tcf_ife_info *ife = to_ife(a);
 	int action = ife->tcf_action;
 	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
-	u16 ifehdrln = ifehdr->metalen;
+	int ifehdrln = (int)ifehdr->metalen;
 	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
 
 	spin_lock(&ife->tcf_lock);
@@ -668,8 +668,8 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
 			ife->tcf_qstats.overlimits++;
 		}
 
-		tlvdata += alen;
-		ifehdrln -= alen;
+		tlvdata += alen + sizeof(struct meta_tlvhdr);
+		ifehdrln -= alen + sizeof(struct meta_tlvhdr);
 		tlv = (struct meta_tlvhdr *)tlvdata;
 	}
 
-- 
2.4.11

^ permalink raw reply related

* Re: [PATCH] fs/select: add vmalloc fallback for select(2)
From: Eric Dumazet @ 2016-09-22 16:24 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, linux-mm,
	Michal Hocko, netdev
In-Reply-To: <20160922152831.24165-1-vbabka@suse.cz>

On Thu, 2016-09-22 at 17:28 +0200, Vlastimil Babka wrote:
> The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
> with the number of fds passed. We had a customer report page allocation
> failures of order-4 for this allocation. This is a costly order, so it might
> easily fail, as the VM expects such allocation to have a lower-order fallback.
> 
> Such trivial fallback is vmalloc(), as the memory doesn't have to be
> physically contiguous. Also the allocation is temporary for the duration of the
> syscall, so it's unlikely to stress vmalloc too much.
> 
> Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
> it doesn't need this kind of fallback.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  fs/select.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/select.c b/fs/select.c
> index 8ed9da50896a..8fe5bddbe99b 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched/rt.h>
>  #include <linux/freezer.h>
>  #include <net/busy_poll.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/uaccess.h>
>  
> @@ -558,6 +559,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	struct fdtable *fdt;
>  	/* Allocate small arguments on the stack to save memory and be faster */
>  	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
> +	unsigned long alloc_size;
>  
>  	ret = -EINVAL;
>  	if (n < 0)
> @@ -580,10 +582,15 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  	bits = stack_fds;
>  	if (size > sizeof(stack_fds) / 6) {
>  		/* Not enough space in on-stack array; must use kmalloc */
> +		alloc_size = 6 * size;
>  		ret = -ENOMEM;
> -		bits = kmalloc(6 * size, GFP_KERNEL);
> -		if (!bits)
> -			goto out_nofds;
> +		bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
> +		if (!bits && alloc_size > PAGE_SIZE) {
> +			bits = vmalloc(alloc_size);
> +
> +			if (!bits)
> +				goto out_nofds;

Test should happen if alloc_size <= PAGE_SIZE

> +		}

if (!bits && alloc_size > PAGE_SIZE)
    bits = vmalloc(alloc_size);

if (!bits)
      goto out_nofds;



>  	}
>  	fds.in      = bits;
>  	fds.out     = bits +   size;
> @@ -618,7 +625,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
>  
>  out:
>  	if (bits != stack_fds)
> -		kfree(bits);
> +		kvfree(bits);
>  out_nofds:
>  	return ret;
>  }


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH net-next] Documentation: devicetree: revise ethernet device-tree binding about TRGMII
From: sean.wang @ 2016-09-22 16:16 UTC (permalink / raw)
  To: davem, sergei.shtylyov, john
  Cc: nbd, netdev, linux-kernel, linux-mediatek, devicetree, keyhaede,
	objelf, Sean Wang

From: Sean Wang <sean.wang@mediatek.com>

fix typo in mediatek-net.txt and add phy-mode "trgmii" to ethernet.txt

Cc: devicetree@vger.kernel.org
Reported-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
---
 Documentation/devicetree/bindings/net/ethernet.txt     | 4 ++--
 Documentation/devicetree/bindings/net/mediatek-net.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index 5d88f37..e1d7681 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -11,8 +11,8 @@ The following properties are common to the Ethernet controllers:
   the maximum frame size (there's contradiction in ePAPR).
 - phy-mode: string, operation mode of the PHY interface; supported values are
   "mii", "gmii", "sgmii", "qsgmii", "tbi", "rev-mii", "rmii", "rgmii", "rgmii-id",
-  "rgmii-rxid", "rgmii-txid", "rtbi", "smii", "xgmii"; this is now a de-facto
-  standard property;
+  "rgmii-rxid", "rgmii-txid", "rtbi", "smii", "xgmii", "trgmii"; this is now a
+  de-facto standard property;
 - phy-connection-type: the same as "phy-mode" property but described in ePAPR;
 - phy-handle: phandle, specifies a reference to a node representing a PHY
   device; this property is described in ePAPR and so preferred;
diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt
index bae848e..da05f8b 100644
--- a/Documentation/devicetree/bindings/net/mediatek-net.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
@@ -34,7 +34,7 @@ Required properties:
 - phy-handle: see ethernet.txt file in the same directory and
 	the phy-mode "trgmii" required being provided when reg
 	is equal to 0 and the MAC uses fixed-link to connect
-	with inernal switch such as MT7530.
+	with internal switch such as MT7530.
 
 Example:
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net-next 2/3] udp: implement memory accounting helpers
From: Paolo Abeni @ 2016-09-22 16:14 UTC (permalink / raw)
  To: Edward Cree
  Cc: Eric Dumazet, netdev-u79uwXL29TY76Z2rM5mHXA, David S. Miller,
	James Morris, Trond Myklebust, Alexander Duyck, Daniel Borkmann,
	Eric Dumazet, Tom Herbert, Hannes Frederic Sowa,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <589839b3-5930-2527-b0a3-315be254a175-s/n/eUQHGBpZroRs9YW3xA@public.gmane.org>

On Thu, 2016-09-22 at 16:21 +0100, Edward Cree wrote:
> On 22/09/16 11:33, Paolo Abeni wrote:
> > Hi Eric,
> >
> > On Wed, 2016-09-21 at 16:31 -0700, Eric Dumazet wrote:
> >> Also does inet_diag properly give the forward_alloc to user ?
> >>
> >> $ ss -mua
> >> State      Recv-Q Send-Q Local Address:Port                 Peer Addres
> >> s:Port
> >> UNCONN     51584  0          *:52460      *:*                    
> >> 	 skmem:(r51584,rb327680,t0,tb327680,f1664,w0,o0,bl0,d575)
> > Thank you very much for reviewing this! 
> >
> > My bad, there is still a race which leads to temporary negative values
> > of fwd. I feel the fix is trivial but it needs some investigation.
> >
> >> Couldn't we instead use an union of an atomic_t and int for
> >> sk->sk_forward_alloc ?
> > That was our first attempt, but we had some issue on mem scheduling; if
> > we use:
> >
> >    if (atomic_sub_return(size, &sk->sk_forward_alloc_atomic) < 0) {
> >         // fwd alloc
> >    }
> >
> > that leads to inescapable, temporary, negative value for
> > sk->sk_forward_alloc.
> >
> > Another option would be:
> >
> > again:
> >         fwd = atomic_read(&sk->sk_forward_alloc_atomic);
> >         if (fwd > size) {
> > 		if (atomic_cmpxchg(&sk->sk_forward_alloc_atomic, fwd, fwd - size) != fwd)
> > 			goto again;
> >         } else 
> >             // fwd alloc
> >
> > which would be bad under high contention.
> Apologies if I'm misunderstanding the problem, but couldn't you have two
> atomic_t fields, 'internal' and 'external' forward_alloc.  Then
>     if (atomic_sub_return(size, &sk->sk_forward_alloc_internal) < 0) {
>         atomic_sub(size, &sk->sk_forward_alloc);
>         // fwd alloc
>     } else {
>         atomic_add(size, &sk->sk_forward_alloc_internal);
>     }
> or something like that.  Then sk->sk_forward_alloc never sees a negative
> value, and is always >= sk->sk_forward_alloc_internal.  Of course places
> that go the other way would have to add to sk->sk_forward_alloc first,
> before adding to sk->sk_forward_alloc_internal, to maintain that invariant.

I think that the idea behind using atomic ops directly on
sk_forward_alloc is to avoid adding other fields to the udp_socket. 

If we can add some fields to the udp_sock structure, the schema proposed
in this patch should fit better (modulo bugs ;-), always requiring a
single atomic operation at memory reclaiming time and at memory
allocation time.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH iproute2 net-next] tc: m_vlan: Add vlan modify action
From: Stephen Hemminger @ 2016-09-22 16:05 UTC (permalink / raw)
  To: Shmulik Ladkani; +Cc: netdev, Jamal Hadi Salim, Jiri Pirko, Shmulik Ladkani
In-Reply-To: <1474536670-2495-1-git-send-email-shmulik.ladkani@gmail.com>

On Thu, 22 Sep 2016 12:31:10 +0300
Shmulik Ladkani <shmulik.ladkani@ravellosystems.com> wrote:

> +
> +static const char *action_name(int action)
> +{
> +	static const char * const names[] = {
> +		[TCA_VLAN_ACT_POP] = "pop",
> +		[TCA_VLAN_ACT_PUSH] = "push",
> +		[TCA_VLAN_ACT_MODIFY] = "modify",
> +	};
> +	return names[action];
> +}
> +

Why are you wrapping a simple array lookup in a function?

^ permalink raw reply

* [PATCH net-next] net_sched: sch_fq: account for schedule/timers drifts
From: Eric Dumazet @ 2016-09-22 15:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

It looks like the following patch can make FQ very precise, even in VM
or stressed hosts. It matters at high pacing rates.

We take into account the difference between the time that was programmed
when last packet was sent, and current time (a drift of tens of usecs is
often observed)

Add an EWMA of the unthrottle latency to help diagnostics.

This latency is the difference between current time and oldest packet in
delayed RB-tree. This accounts for the high resolution timer latency,
but can be different under stress, as fq_check_throttled() can be
opportunistically be called from a dequeue() called after an enqueue()
for a different flow. 


Tested:
// Start a 10Gbit flow
$ netperf --google-pacing-rate 1250000000 -H lpaa24 -l 10000 -- -K bbr &

Before patch :
$ sar -n DEV 10 5 | grep eth0 | grep Average
Average:         eth0  17106.04 756876.84   1102.75 1119049.02      0.00      0.00      0.52

After patch :
$ sar -n DEV 10 5 | grep eth0 | grep Average
Average:         eth0  17867.00 800245.90   1151.77 1183172.12      0.00      0.00      0.52

A new iproute2 tc can output the 'unthrottle latency' :

$ tc -s qd sh dev eth0 | grep latency
  0 gc, 0 highprio, 32490767 throttled, 2382 ns latency

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/uapi/linux/pkt_sched.h |    2 +-
 net/sched/sch_fq.c             |   21 ++++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f8e39dbaa781..df7451d35131 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -811,7 +811,7 @@ struct tc_fq_qd_stats {
 	__u32	flows;
 	__u32	inactive_flows;
 	__u32	throttled_flows;
-	__u32	pad;
+	__u32	unthrottle_latency_ns;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 5dd929cc1423..18e752439f6f 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -86,6 +86,7 @@ struct fq_sched_data {
 
 	struct rb_root	delayed;	/* for rate limited flows */
 	u64		time_next_delayed_flow;
+	unsigned long	unthrottle_latency_ns;
 
 	struct fq_flow	internal;	/* for non classified or high prio packets */
 	u32		quantum;
@@ -408,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
 {
+	unsigned long sample;
 	struct rb_node *p;
 
 	if (q->time_next_delayed_flow > now)
 		return;
 
+	/* Update unthrottle latency EWMA.
+	 * This is cheap and can help diagnosing timer/latency problems.
+	 */
+	sample = (unsigned long)(now - q->time_next_delayed_flow);
+	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
+	q->unthrottle_latency_ns += sample >> 3;
+
 	q->time_next_delayed_flow = ~0ULL;
 	while ((p = rb_first(&q->delayed)) != NULL) {
 		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
@@ -515,7 +524,12 @@ begin:
 			len = NSEC_PER_SEC;
 			q->stat_pkts_too_long++;
 		}
-
+		/* Account for schedule/timers drifts.
+		 * f->time_next_packet was set when prior packet was sent,
+		 * and current time (@now) can be too late by tens of us.
+		 */
+		if (f->time_next_packet)
+			len -= min(len/2, now - f->time_next_packet);
 		f->time_next_packet = now + len;
 	}
 out:
@@ -787,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
 	q->flow_refill_delay	= msecs_to_jiffies(40);
 	q->flow_max_rate	= ~0U;
+	q->time_next_delayed_flow = ~0ULL;
 	q->rate_enable		= 1;
 	q->new_flows.first	= NULL;
 	q->old_flows.first	= NULL;
@@ -854,8 +869,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 	st.flows		  = q->flows;
 	st.inactive_flows	  = q->inactive_flows;
 	st.throttled_flows	  = q->throttled_flows;
-	st.pad			  = 0;
-
+	st.unthrottle_latency_ns  = min_t(unsigned long,
+					  q->unthrottle_latency_ns, ~0U);
 	sch_tree_unlock(sch);
 
 	return gnet_stats_copy_app(d, &st, sizeof(st));

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox