Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next 3/6] bpf: update bpf.h uapi header for tools
From: Jesper Dangaard Brouer @ 2017-09-27  7:03 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: brouer, davem, alexei.starovoitov, john.fastabend,
	peter.waskiewicz.jr, jakub.kicinski, netdev
In-Reply-To: <4bdb49b4387d698443e32b8a6a85414e57ef4de0.1506297988.git.daniel@iogearbox.net>

On Mon, 25 Sep 2017 02:25:52 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> Looks like a couple of updates missed to get carried into tools/include/uapi/,
> so copy the bpf.h header as usual to pull in latest updates.
> 
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  tools/include/uapi/linux/bpf.h | 45 ++++++++++++++++++++++++++++++------------
>  1 file changed, 32 insertions(+), 13 deletions(-)
> 
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 461811e..e43491a 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -143,12 +143,6 @@ enum bpf_attach_type {
>  
>  #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
>  
> -enum bpf_sockmap_flags {
> -	BPF_SOCKMAP_UNSPEC,
> -	BPF_SOCKMAP_STRPARSER,
> -	__MAX_BPF_SOCKMAP_FLAG
> -};
> -

Why does "bpf_sockmap_flags" gets removed?

It got added in: commit 69e8cc134bcb ("bpf: sockmap sample program").
But I cannot find any users of these flags, so I guess it is okay.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH net] i40e: Fix limit imprecise of the number of MAC/VLAN that can be added for VFs
From: w00273186 @ 2017-09-27  6:58 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher; +Cc: netdev, intel-wired-lan, caihe, Yunjian Wang

From: Yunjian Wang <wangyunjian@huawei.com>

Now it don't limit the number of MAC/VLAN strictly. When there is more
elements in the virtchnl MAC/VLAN list, it can still add successfully.

Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 27 +++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 4d1e670..285b96a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2065,11 +2065,6 @@ static inline int i40e_check_vf_permission(struct i40e_vf *vf, u8 *macaddr)
 		dev_err(&pf->pdev->dev,
 			"VF attempting to override administratively set MAC address, reload the VF driver to resume normal operation\n");
 		ret = -EPERM;
-	} else if ((vf->num_mac >= I40E_VC_MAX_MAC_ADDR_PER_VF) &&
-		   !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
-		dev_err(&pf->pdev->dev,
-			"VF is not trusted, switch the VF to trusted to add more functionality\n");
-		ret = -EPERM;
 	}
 	return ret;
 }
@@ -2128,6 +2123,15 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 		} else {
 			vf->num_mac++;
 		}
+
+		if ((vf->num_mac >= I40E_VC_MAX_MAC_ADDR_PER_VF) &&
+		    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
+			dev_err(&pf->pdev->dev,
+				"VF is not trusted, switch the VF to trusted to add more functionality\n");
+			ret = -EPERM;
+			spin_unlock_bh(&vsi->mac_filter_hash_lock);
+			goto error_param;
+		}
 	}
 	spin_unlock_bh(&vsi->mac_filter_hash_lock);
 
@@ -2221,12 +2225,6 @@ static int i40e_vc_add_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 	i40e_status aq_ret = 0;
 	int i;
 
-	if ((vf->num_vlan >= I40E_VC_MAX_VLAN_PER_VF) &&
-	    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
-		dev_err(&pf->pdev->dev,
-			"VF is not trusted, switch the VF to trusted to add more VLAN addresses\n");
-		goto error_param;
-	}
 	if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states) ||
 	    !i40e_vc_isvalid_vsi_id(vf, vsi_id)) {
 		aq_ret = I40E_ERR_PARAM;
@@ -2269,6 +2267,13 @@ static int i40e_vc_add_vlan_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 			dev_err(&pf->pdev->dev,
 				"Unable to add VLAN filter %d for VF %d, error %d\n",
 				vfl->vlan_id[i], vf->vf_id, ret);
+		if ((vf->num_vlan >= I40E_VC_MAX_VLAN_PER_VF) &&
+		    !test_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps)) {
+			dev_err(&pf->pdev->dev,
+				"VF is not trusted, switch the VF to trusted to add more VLAN addresses\n");
+			aq_ret = -EPERM;
+			goto error_param;
+		}
 	}
 
 error_param:
-- 
1.8.3.1

^ permalink raw reply related

* Re: WARNING: kernel stack frame pointer at ffff880156a5fea0 in bash:2103 has bad value 00007ffec7d87e50
From: Richard Weinberger @ 2017-09-27  6:51 UTC (permalink / raw)
  To: Josh Poimboeuf
  Cc: Alexei Starovoitov, ast, daniel, netdev, linux-kernel, mingo
In-Reply-To: <20170926224246.mj6cxkkelc4sanda@treble>

Am Mittwoch, 27. September 2017, 00:42:46 CEST schrieb Josh Poimboeuf:
> > Here is another variant of the warning, it matches the attached .config:
> I can take a look at it.  Unfortunately, for these types of issues I
> often need the vmlinux file to be able to make sense of the unwinder
> dump.  So if you happen to have somewhere to copy the vmlinux to, that
> would be helpful.  Or if you give me your GCC version I can try to
> rebuild it locally.

There you go:
http://git.infradead.org/~rw/bpf_splat/vmlinux.xz

Thanks,
//richard

-- 
sigma star gmbh - Eduard-Bodem-Gasse 6 - 6020 Innsbruck - Austria
ATU66964118 - FN 374287y

^ permalink raw reply

* [patch net-next v3 12/12] mlxsw: spectrum: router: Don't ignore IPMR notifications
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Make the Spectrum router logic not ignore the RTNL_FAMILY_IPMR FIB
notifications.

Past commits added the IPMR VIF and MFC add/del notifications via the
fib_notifier chain. In addition, a code for handling these notifications in
the Spectrum router logic was added. Make the Spectrum router logic not
ignore these notifications and forward the requests to the Spectrum
multicast router offloading logic.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index dbd9c19..ef4b86b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5159,7 +5159,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct mlxsw_sp_router *router;
 
 	if (!net_eq(info->net, &init_net) ||
-	    (info->family != AF_INET && info->family != AF_INET6))
+	    (info->family != AF_INET && info->family != AF_INET6 &&
+	     info->family != RTNL_FAMILY_IPMR))
 		return NOTIFY_DONE;
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 10/12] mlxsw: spectrum_router: Add multicast routes notification handling functionality
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Add functionality for calling the multicast routing offloading logic upon
MFC and VIF add and delete notifications. In addition, call the multicast
routing upon RIF addition and deletion events.

As the multicast routing offload logic may sleep, the actual calls are done
in a deferred work. To ensure the MFC object is not freed in that interval,
a reference is held to it. In case of a failure, the abort mechanism is
used, which ejects all the routes from the hardware and triggers the
traffic to flow through the kernel.

Note: At that stage, the FIB notifications are still ignored, and will be
enabled in a further patch.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 187 ++++++++++++++++++++-
 1 file changed, 185 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 28c0c84..7758442 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -65,6 +65,8 @@
 #include "spectrum_cnt.h"
 #include "spectrum_dpipe.h"
 #include "spectrum_ipip.h"
+#include "spectrum_mr.h"
+#include "spectrum_mr_tcam.h"
 #include "spectrum_router.h"
 
 struct mlxsw_sp_vr;
@@ -459,6 +461,7 @@ struct mlxsw_sp_vr {
 	unsigned int rif_count;
 	struct mlxsw_sp_fib *fib4;
 	struct mlxsw_sp_fib *fib6;
+	struct mlxsw_sp_mr_table *mr4_table;
 };
 
 static const struct rhashtable_params mlxsw_sp_fib_ht_params;
@@ -653,7 +656,7 @@ static void mlxsw_sp_lpm_fini(struct mlxsw_sp *mlxsw_sp)
 
 static bool mlxsw_sp_vr_is_used(const struct mlxsw_sp_vr *vr)
 {
-	return !!vr->fib4 || !!vr->fib6;
+	return !!vr->fib4 || !!vr->fib6 || !!vr->mr4_table;
 }
 
 static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp)
@@ -744,9 +747,18 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
 		err = PTR_ERR(vr->fib6);
 		goto err_fib6_create;
 	}
+	vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
+						 MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(vr->mr4_table)) {
+		err = PTR_ERR(vr->mr4_table);
+		goto err_mr_table_create;
+	}
 	vr->tb_id = tb_id;
 	return vr;
 
+err_mr_table_create:
+	mlxsw_sp_fib_destroy(vr->fib6);
+	vr->fib6 = NULL;
 err_fib6_create:
 	mlxsw_sp_fib_destroy(vr->fib4);
 	vr->fib4 = NULL;
@@ -755,6 +767,8 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
 
 static void mlxsw_sp_vr_destroy(struct mlxsw_sp_vr *vr)
 {
+	mlxsw_sp_mr_table_destroy(vr->mr4_table);
+	vr->mr4_table = NULL;
 	mlxsw_sp_fib_destroy(vr->fib6);
 	vr->fib6 = NULL;
 	mlxsw_sp_fib_destroy(vr->fib4);
@@ -775,7 +789,8 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id)
 static void mlxsw_sp_vr_put(struct mlxsw_sp_vr *vr)
 {
 	if (!vr->rif_count && list_empty(&vr->fib4->node_list) &&
-	    list_empty(&vr->fib6->node_list))
+	    list_empty(&vr->fib6->node_list) &&
+	    mlxsw_sp_mr_table_empty(vr->mr4_table))
 		mlxsw_sp_vr_destroy(vr);
 }
 
@@ -4731,6 +4746,75 @@ static int __mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }
 
+static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp,
+				     struct mfc_entry_notifier_info *men_info,
+				     bool replace)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	return mlxsw_sp_mr_route4_add(vr->mr4_table, men_info->mfc, replace);
+}
+
+static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp,
+				      struct mfc_entry_notifier_info *men_info)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, men_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mlxsw_sp_mr_route4_del(vr->mr4_table, men_info->mfc);
+	mlxsw_sp_vr_put(vr);
+}
+
+static int
+mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, ven_info->dev);
+	return mlxsw_sp_mr_vif_add(vr->mr4_table, ven_info->dev,
+				   ven_info->vif_index,
+				   ven_info->vif_flags, rif);
+}
+
+static void
+mlxsw_sp_router_fibmr_vif_del(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, ven_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mlxsw_sp_mr_vif_del(vr->mr4_table, ven_info->vif_index);
+	mlxsw_sp_vr_put(vr);
+}
+
 static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
 {
 	enum mlxsw_reg_ralxx_protocol proto = MLXSW_REG_RALXX_PROTOCOL_IPV4;
@@ -4741,6 +4825,10 @@ static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
 	if (err)
 		return err;
 
+	/* The multicast router code does not need an abort trap as by default,
+	 * packets that don't match any routes are trapped to the CPU.
+	 */
+
 	proto = MLXSW_REG_RALXX_PROTOCOL_IPV6;
 	return __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto,
 						MLXSW_SP_LPM_TREE_MIN + 1);
@@ -4822,6 +4910,8 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp)
 
 		if (!mlxsw_sp_vr_is_used(vr))
 			continue;
+
+		mlxsw_sp_mr_table_flush(vr->mr4_table);
 		mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4);
 
 		/* If virtual router was only used for IPv4, then it's no
@@ -4854,6 +4944,8 @@ struct mlxsw_sp_fib_event_work {
 		struct fib_entry_notifier_info fen_info;
 		struct fib_rule_notifier_info fr_info;
 		struct fib_nh_notifier_info fnh_info;
+		struct mfc_entry_notifier_info men_info;
+		struct vif_entry_notifier_info ven_info;
 	};
 	struct mlxsw_sp *mlxsw_sp;
 	unsigned long event;
@@ -4940,6 +5032,55 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 	kfree(fib_work);
 }
 
+static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
+	struct fib_rule *rule;
+	bool replace;
+	int err;
+
+	rtnl_lock();
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD:
+		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+
+		err = mlxsw_sp_router_fibmr_add(mlxsw_sp, &fib_work->men_info,
+						replace);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		ipmr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_ENTRY_DEL:
+		mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info);
+		ipmr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD:
+		err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp,
+						    &fib_work->ven_info);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_VIF_DEL:
+		mlxsw_sp_router_fibmr_vif_del(mlxsw_sp,
+					      &fib_work->ven_info);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_RULE_ADD: /* fall through */
+	case FIB_EVENT_RULE_DEL:
+		rule = fib_work->fr_info.rule;
+		if (!ipmr_rule_default(rule) && !rule->l3mdev)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		fib_rule_put(rule);
+		break;
+	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
 static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work,
 				       struct fib_notifier_info *info)
 {
@@ -4985,6 +5126,30 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
 	}
 }
 
+static void
+mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work,
+			    struct fib_notifier_info *info)
+{
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info));
+		ipmr_cache_hold(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD: /* fall through */
+	case FIB_EVENT_VIF_DEL:
+		memcpy(&fib_work->ven_info, info, sizeof(fib_work->ven_info));
+		dev_hold(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_RULE_ADD: /* fall through */
+	case FIB_EVENT_RULE_DEL:
+		memcpy(&fib_work->fr_info, info, sizeof(fib_work->fr_info));
+		fib_rule_get(fib_work->fr_info.rule);
+		break;
+	}
+}
+
 /* Called with rcu_read_lock() */
 static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
@@ -5014,6 +5179,10 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 		INIT_WORK(&fib_work->work, mlxsw_sp_router_fib6_event_work);
 		mlxsw_sp_router_fib6_event(fib_work, info);
 		break;
+	case RTNL_FAMILY_IPMR:
+		INIT_WORK(&fib_work->work, mlxsw_sp_router_fibmr_event_work);
+		mlxsw_sp_router_fibmr_event(fib_work, info);
+		break;
 	}
 
 	mlxsw_core_schedule_work(&fib_work->work);
@@ -5227,12 +5396,18 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
 	if (err)
 		goto err_configure;
 
+	err = mlxsw_sp_mr_rif_add(vr->mr4_table, rif);
+	if (err)
+		goto err_mr_rif_add;
+
 	mlxsw_sp_rif_counters_alloc(rif);
 	mlxsw_sp->router->rifs[rif_index] = rif;
 	vr->rif_count++;
 
 	return rif;
 
+err_mr_rif_add:
+	ops->deconfigure(rif);
 err_configure:
 	if (fid)
 		mlxsw_sp_fid_put(fid);
@@ -5257,6 +5432,7 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
 	vr->rif_count--;
 	mlxsw_sp->router->rifs[rif->rif_index] = NULL;
 	mlxsw_sp_rif_counters_free(rif);
+	mlxsw_sp_mr_rif_del(vr->mr4_table, rif);
 	ops->deconfigure(rif);
 	if (fid)
 		/* Loopback RIFs are not associated with a FID. */
@@ -6120,6 +6296,10 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 	if (err)
 		goto err_lpm_init;
 
+	err = mlxsw_sp_mr_init(mlxsw_sp, &mlxsw_sp_mr_tcam_ops);
+	if (err)
+		goto err_mr_init;
+
 	err = mlxsw_sp_vrs_init(mlxsw_sp);
 	if (err)
 		goto err_vrs_init;
@@ -6141,6 +6321,8 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 err_neigh_init:
 	mlxsw_sp_vrs_fini(mlxsw_sp);
 err_vrs_init:
+	mlxsw_sp_mr_fini(mlxsw_sp);
+err_mr_init:
 	mlxsw_sp_lpm_fini(mlxsw_sp);
 err_lpm_init:
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
@@ -6162,6 +6344,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
 	unregister_fib_notifier(&mlxsw_sp->router->fib_nb);
 	mlxsw_sp_neigh_fini(mlxsw_sp);
 	mlxsw_sp_vrs_fini(mlxsw_sp);
+	mlxsw_sp_mr_fini(mlxsw_sp);
 	mlxsw_sp_lpm_fini(mlxsw_sp);
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
 	rhashtable_destroy(&mlxsw_sp->router->nexthop_ht);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 11/12] mlxsw: spectrum: Notify multicast router on RIF MTU changes
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Due to the fact that multicast routes hold the minimum MTU of all the
egress RIFs and trap packets that don't meet it, notify the mulitcast
router code on RIF MTU changes.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 7758442..dbd9c19 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5773,6 +5773,17 @@ int mlxsw_sp_netdevice_router_port_event(struct net_device *dev)
 	if (err)
 		goto err_rif_fdb_op;
 
+	if (rif->mtu != dev->mtu) {
+		struct mlxsw_sp_vr *vr;
+
+		/* The RIF is relevant only to its mr_table instance, as unlike
+		 * unicast routing, in multicast routing a RIF cannot be shared
+		 * between several multicast routing tables.
+		 */
+		vr = &mlxsw_sp->router->vrs[rif->vr_id];
+		mlxsw_sp_mr_rif_mtu_update(vr->mr4_table, rif, dev->mtu);
+	}
+
 	ether_addr_copy(rif->addr, dev->dev_addr);
 	rif->mtu = dev->mtu;
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 09/12] mlxsw: spectrum: router: Squash the default route table to main
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Currently, the mlxsw Spectrum driver offloads only either the RT_TABLE_MAIN
FIB table or the VRF tables, so the RT_TABLE_LOCAL table is squashed to the
RT_TABLE_MAIN table to allow local routes to be offloaded too.

By default, multicast MFC routes which are not assigned to any user
requested table are put in the RT_TABLE_DEFAULT table.

Due to the fact that offloading multicast MFC routes support in Spectrum
router logic is going to be introduced soon, squash the default table to
MAIN too.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 321f735..28c0c84 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -693,8 +693,8 @@ static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp,
 
 static u32 mlxsw_sp_fix_tb_id(u32 tb_id)
 {
-	/* For our purpose, squash main and local table into one */
-	if (tb_id == RT_TABLE_LOCAL)
+	/* For our purpose, squash main, default and local tables into one */
+	if (tb_id == RT_TABLE_LOCAL || tb_id == RT_TABLE_DEFAULT)
 		tb_id = RT_TABLE_MAIN;
 	return tb_id;
 }
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 08/12] mlxsw: spectrum: Add the multicast routing hardware logic
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Implement the multicast routing hardware API introduced in previous patch
for the specific spectrum hardware.

The spectrum hardware multicast routes are written using the RMFT2 register
and point to an ACL flexible action set. The actions used for multicast
routes are:
 - Counter action, which allows counting bytes and packets on multicast
   routes.
 - Multicast route action, which provide RPF check and do the actual packet
   duplication to a list of RIFs.
 - Trap action, in the case the route action specified by the called is
   trap.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/Makefile       |   2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |   1 +
 .../net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c | 828 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h |  43 ++
 4 files changed, 873 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h

diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
index 9b29764..4816504 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Makefile
+++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
@@ -18,7 +18,7 @@ mlxsw_spectrum-objs		:= spectrum.o spectrum_buffers.o \
 				   spectrum_acl.o spectrum_flower.o \
 				   spectrum_cnt.o spectrum_fid.o \
 				   spectrum_ipip.o spectrum_acl_flex_actions.o \
-				   spectrum_mr.o
+				   spectrum_mr.o spectrum_mr_tcam.o
 mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB)	+= spectrum_dcb.o
 mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o
 obj-$(CONFIG_MLXSW_MINIMAL)	+= mlxsw_minimal.o
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 44c5259..ae67e60 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -139,6 +139,7 @@ struct mlxsw_sp_port_mall_tc_entry {
 struct mlxsw_sp_sb;
 struct mlxsw_sp_bridge;
 struct mlxsw_sp_router;
+struct mlxsw_sp_mr;
 struct mlxsw_sp_acl;
 struct mlxsw_sp_counter_pool;
 struct mlxsw_sp_fid_core;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
new file mode 100644
index 0000000..cda9e9a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
@@ -0,0 +1,828 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/parman.h>
+
+#include "reg.h"
+#include "spectrum.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_mr.h"
+
+struct mlxsw_sp_mr_tcam_region {
+	struct mlxsw_sp *mlxsw_sp;
+	enum mlxsw_reg_rtar_key_type rtar_key_type;
+	struct parman *parman;
+	struct parman_prio *parman_prios;
+};
+
+struct mlxsw_sp_mr_tcam {
+	struct mlxsw_sp_mr_tcam_region ipv4_tcam_region;
+};
+
+/* This struct maps to one RIGR2 register entry */
+struct mlxsw_sp_mr_erif_sublist {
+	struct list_head list;
+	u32 rigr2_kvdl_index;
+	int num_erifs;
+	u16 erif_indices[MLXSW_REG_RIGR2_MAX_ERIFS];
+	bool synced;
+};
+
+struct mlxsw_sp_mr_tcam_erif_list {
+	struct list_head erif_sublists;
+	u32 kvdl_index;
+};
+
+static bool
+mlxsw_sp_mr_erif_sublist_full(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	int erif_list_entries = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						   MC_ERIF_LIST_ENTRIES);
+
+	return erif_sublist->num_erifs == erif_list_entries;
+}
+
+static void
+mlxsw_sp_mr_erif_list_init(struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	INIT_LIST_HEAD(&erif_list->erif_sublists);
+}
+
+#define MLXSW_SP_KVDL_RIGR2_SIZE 1
+
+static struct mlxsw_sp_mr_erif_sublist *
+mlxsw_sp_mr_erif_sublist_create(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	int err;
+
+	erif_sublist = kzalloc(sizeof(*erif_sublist), GFP_KERNEL);
+	if (!erif_sublist)
+		return ERR_PTR(-ENOMEM);
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_RIGR2_SIZE,
+				  &erif_sublist->rigr2_kvdl_index);
+	if (err) {
+		kfree(erif_sublist);
+		return ERR_PTR(err);
+	}
+
+	list_add_tail(&erif_sublist->list, &erif_list->erif_sublists);
+	return erif_sublist;
+}
+
+static void
+mlxsw_sp_mr_erif_sublist_destroy(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	list_del(&erif_sublist->list);
+	mlxsw_sp_kvdl_free(mlxsw_sp, erif_sublist->rigr2_kvdl_index);
+	kfree(erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_add(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			  u16 erif_index)
+{
+	struct mlxsw_sp_mr_erif_sublist *sublist;
+
+	/* If either there is no erif_entry or the last one is full, allocate a
+	 * new one.
+	 */
+	if (list_empty(&erif_list->erif_sublists)) {
+		sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, erif_list);
+		if (IS_ERR(sublist))
+			return PTR_ERR(sublist);
+		erif_list->kvdl_index = sublist->rigr2_kvdl_index;
+	} else {
+		sublist = list_last_entry(&erif_list->erif_sublists,
+					  struct mlxsw_sp_mr_erif_sublist,
+					  list);
+		sublist->synced = false;
+		if (mlxsw_sp_mr_erif_sublist_full(mlxsw_sp, sublist)) {
+			sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp,
+								  erif_list);
+			if (IS_ERR(sublist))
+				return PTR_ERR(sublist);
+		}
+	}
+
+	/* Add the eRIF to the last entry's last index */
+	sublist->erif_indices[sublist->num_erifs++] = erif_index;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_erif_list_flush(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist, *tmp;
+
+	list_for_each_entry_safe(erif_sublist, tmp, &erif_list->erif_sublists,
+				 list)
+		mlxsw_sp_mr_erif_sublist_destroy(mlxsw_sp, erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_commit(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *curr_sublist;
+	char rigr2_pl[MLXSW_REG_RIGR2_LEN];
+	int err;
+	int i;
+
+	list_for_each_entry(curr_sublist, &erif_list->erif_sublists, list) {
+		if (curr_sublist->synced)
+			continue;
+
+		/* If the sublist is not the last one, pack the next index */
+		if (list_is_last(&curr_sublist->list,
+				 &erif_list->erif_sublists)) {
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     false, 0);
+		} else {
+			struct mlxsw_sp_mr_erif_sublist *next_sublist;
+
+			next_sublist = list_next_entry(curr_sublist, list);
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     true,
+					     next_sublist->rigr2_kvdl_index);
+		}
+
+		/* Pack all the erifs */
+		for (i = 0; i < curr_sublist->num_erifs; i++) {
+			u16 erif_index = curr_sublist->erif_indices[i];
+
+			mlxsw_reg_rigr2_erif_entry_pack(rigr2_pl, i, true,
+							erif_index);
+		}
+
+		/* Write the entry */
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rigr2),
+				      rigr2_pl);
+		if (err)
+			/* No need of a rollback here because this
+			 * hardware entry should not be pointed yet.
+			 */
+			return err;
+		curr_sublist->synced = true;
+	}
+	return 0;
+}
+
+static void mlxsw_sp_mr_erif_list_move(struct mlxsw_sp_mr_tcam_erif_list *to,
+				       struct mlxsw_sp_mr_tcam_erif_list *from)
+{
+	list_splice(&from->erif_sublists, &to->erif_sublists);
+	to->kvdl_index = from->kvdl_index;
+}
+
+struct mlxsw_sp_mr_tcam_route {
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	u32 counter_index;
+	struct parman_item parman_item;
+	struct parman_prio *parman_prio;
+	enum mlxsw_sp_mr_route_action action;
+	struct mlxsw_sp_mr_route_key key;
+	u16 irif_index;
+	u16 min_mtu;
+};
+
+static struct mlxsw_afa_block *
+mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp,
+				  enum mlxsw_sp_mr_route_action route_action,
+				  u16 irif_index, u32 counter_index,
+				  u16 min_mtu,
+				  struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	afa_block = mlxsw_afa_block_create(mlxsw_sp->afa);
+	if (IS_ERR(afa_block))
+		return afa_block;
+
+	err = mlxsw_afa_block_append_counter(afa_block, counter_index);
+	if (err)
+		goto err;
+
+	switch (route_action) {
+	case MLXSW_SP_MR_ROUTE_ACTION_TRAP:
+		err = mlxsw_afa_block_append_trap(afa_block,
+						  MLXSW_TRAP_ID_ACL1);
+		if (err)
+			goto err;
+		break;
+	case MLXSW_SP_MR_ROUTE_ACTION_FORWARD:
+		/* If we are about to append a multicast router action, commit
+		 * the erif_list.
+		 */
+		err = mlxsw_sp_mr_erif_list_commit(mlxsw_sp, erif_list);
+		if (err)
+			goto err;
+
+		err = mlxsw_afa_block_append_mcrouter(afa_block, irif_index,
+						      min_mtu, false,
+						      erif_list->kvdl_index);
+		if (err)
+			goto err;
+		break;
+	default:
+		err = -EINVAL;
+		goto err;
+	}
+
+	err = mlxsw_afa_block_commit(afa_block);
+	if (err)
+		goto err;
+	return afa_block;
+err:
+	mlxsw_afa_block_destroy(afa_block);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_mr_tcam_afa_block_destroy(struct mlxsw_afa_block *afa_block)
+{
+	mlxsw_afa_block_destroy(afa_block);
+}
+
+static int mlxsw_sp_mr_tcam_route_replace(struct mlxsw_sp *mlxsw_sp,
+					  struct parman_item *parman_item,
+					  struct mlxsw_sp_mr_route_key *key,
+					  struct mlxsw_afa_block *afa_block)
+{
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	switch (key->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, true, parman_item->index,
+					  key->vrid,
+					  MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0,
+					  ntohl(key->group.addr4),
+					  ntohl(key->group_mask.addr4),
+					  ntohl(key->source.addr4),
+					  ntohl(key->source_mask.addr4),
+					  mlxsw_afa_block_first_set(afa_block));
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static int mlxsw_sp_mr_tcam_route_remove(struct mlxsw_sp *mlxsw_sp, int vrid,
+					 struct parman_item *parman_item)
+{
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, false, parman_item->index, vrid,
+				  0, 0, 0, 0, 0, 0, NULL);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static int
+mlxsw_sp_mr_tcam_erif_populate(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			       struct mlxsw_sp_mr_route_info *route_info)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < route_info->erif_num; i++) {
+		u16 erif_index = route_info->erif_indices[i];
+
+		err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, erif_list,
+						erif_index);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_parman_item_add(struct mlxsw_sp_mr_tcam *mr_tcam,
+				       struct mlxsw_sp_mr_tcam_route *route,
+				       enum mlxsw_sp_mr_route_prio prio)
+{
+	struct parman_prio *parman_prio = NULL;
+	int err;
+
+	switch (route->key.proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parman_prio = &mr_tcam->ipv4_tcam_region.parman_prios[prio];
+		err = parman_item_add(mr_tcam->ipv4_tcam_region.parman,
+				      parman_prio, &route->parman_item);
+		if (err)
+			return err;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+	route->parman_prio = parman_prio;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_tcam_route_parman_item_remove(struct mlxsw_sp_mr_tcam *mr_tcam,
+					  struct mlxsw_sp_mr_tcam_route *route)
+{
+	switch (route->key.proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parman_item_remove(mr_tcam->ipv4_tcam_region.parman,
+				   route->parman_prio, &route->parman_item);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static int
+mlxsw_sp_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv,
+			      struct mlxsw_sp_mr_route_params *route_params)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+	int err;
+
+	route->key = route_params->key;
+	route->irif_index = route_params->value.irif_index;
+	route->min_mtu = route_params->value.min_mtu;
+	route->action = route_params->value.route_action;
+
+	/* Create the egress RIFs list */
+	mlxsw_sp_mr_erif_list_init(&route->erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &route->erif_list,
+					     &route_params->value);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flow counter */
+	err = mlxsw_sp_flow_counter_alloc(mlxsw_sp, &route->counter_index);
+	if (err)
+		goto err_counter_alloc;
+
+	/* Create the flexible action block */
+	route->afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+							     route->action,
+							     route->irif_index,
+							     route->counter_index,
+							     route->min_mtu,
+							     &route->erif_list);
+	if (IS_ERR(route->afa_block)) {
+		err = PTR_ERR(route->afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Allocate place in the TCAM */
+	err = mlxsw_sp_mr_tcam_route_parman_item_add(mr_tcam, route,
+						     route_params->prio);
+	if (err)
+		goto err_parman_item_add;
+
+	/* Write the route to the TCAM */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, route->afa_block);
+	if (err)
+		goto err_route_replace;
+	return 0;
+
+err_route_replace:
+	mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route);
+err_parman_item_add:
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+err_afa_block_create:
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+err_erif_populate:
+err_counter_alloc:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	return err;
+}
+
+static void mlxsw_sp_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp,
+					   void *priv, void *route_priv)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	mlxsw_sp_mr_tcam_route_remove(mlxsw_sp, route->key.vrid,
+				      &route->parman_item);
+	mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route);
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+}
+
+static int mlxsw_sp_mr_tcam_route_stats(struct mlxsw_sp *mlxsw_sp,
+					void *route_priv, u64 *packets,
+					u64 *bytes)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, route->counter_index,
+					 packets, bytes);
+}
+
+static int
+mlxsw_sp_mr_tcam_route_action_update(struct mlxsw_sp *mlxsw_sp,
+				     void *route_priv,
+				     enum mlxsw_sp_mr_route_action route_action)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route_action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->action = route_action;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_min_mtu_update(struct mlxsw_sp *mlxsw_sp,
+						 void *route_priv, u16 min_mtu)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->min_mtu = min_mtu;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_irif_update(struct mlxsw_sp *mlxsw_sp,
+					      void *route_priv, u16 irif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return -EINVAL;
+	route->irif_index = irif_index;
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_add(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	int err;
+
+	err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &route->erif_list,
+					erif_index);
+	if (err)
+		return err;
+
+	/* Commit the action only if the route action is not TRAP */
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return mlxsw_sp_mr_erif_list_commit(mlxsw_sp,
+						    &route->erif_list);
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_del(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+	int i;
+
+	/* Create a copy of the original erif_list without the deleted entry */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	list_for_each_entry(erif_sublist, &route->erif_list.erif_sublists, list) {
+		for (i = 0; i < erif_sublist->num_erifs; i++) {
+			u16 curr_erif = erif_sublist->erif_indices[i];
+
+			if (curr_erif == erif_index)
+				continue;
+			err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &erif_list,
+							curr_erif);
+			if (err)
+				goto err_erif_list_add;
+		}
+	}
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_list_add:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new erif_list */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &erif_list, route_info);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route_info->route_action,
+						      route_info->irif_index,
+						      route->counter_index,
+						      route_info->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					     &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	route->action = route_info->route_action;
+	route->irif_index = route_info->irif_index;
+	route->min_mtu = route_info->min_mtu;
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_populate:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+#define MLXSW_SP_MR_TCAM_REGION_BASE_COUNT 16
+#define MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP 16
+
+static int
+mlxsw_sp_mr_tcam_region_alloc(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_ALLOCATE,
+			    mr_tcam_region->rtar_key_type,
+			    MLXSW_SP_MR_TCAM_REGION_BASE_COUNT);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void
+mlxsw_sp_mr_tcam_region_free(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_DEALLOCATE,
+			    mr_tcam_region->rtar_key_type, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static int mlxsw_sp_mr_tcam_region_parman_resize(void *priv,
+						 unsigned long new_count)
+{
+	struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+	u64 max_tcam_rules;
+
+	max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES);
+	if (new_count > max_tcam_rules)
+		return -EINVAL;
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_RESIZE,
+			    mr_tcam_region->rtar_key_type, new_count);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void mlxsw_sp_mr_tcam_region_parman_move(void *priv,
+						unsigned long from_index,
+						unsigned long to_index,
+						unsigned long count)
+{
+	struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rrcr_pl[MLXSW_REG_RRCR_LEN];
+
+	mlxsw_reg_rrcr_pack(rrcr_pl, MLXSW_REG_RRCR_OP_MOVE,
+			    from_index, count,
+			    mr_tcam_region->rtar_key_type, to_index);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rrcr), rrcr_pl);
+}
+
+static const struct parman_ops mlxsw_sp_mr_tcam_region_parman_ops = {
+	.base_count	= MLXSW_SP_MR_TCAM_REGION_BASE_COUNT,
+	.resize_step	= MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP,
+	.resize		= mlxsw_sp_mr_tcam_region_parman_resize,
+	.move		= mlxsw_sp_mr_tcam_region_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+static int
+mlxsw_sp_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_mr_tcam_region *mr_tcam_region,
+			     enum mlxsw_reg_rtar_key_type rtar_key_type)
+{
+	struct parman_prio *parman_prios;
+	struct parman *parman;
+	int err;
+	int i;
+
+	mr_tcam_region->rtar_key_type = rtar_key_type;
+	mr_tcam_region->mlxsw_sp = mlxsw_sp;
+
+	err = mlxsw_sp_mr_tcam_region_alloc(mr_tcam_region);
+	if (err)
+		return err;
+
+	parman = parman_create(&mlxsw_sp_mr_tcam_region_parman_ops,
+			       mr_tcam_region);
+	if (!parman) {
+		err = -ENOMEM;
+		goto err_parman_create;
+	}
+	mr_tcam_region->parman = parman;
+
+	parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1,
+				     sizeof(*parman_prios), GFP_KERNEL);
+	if (!parman_prios)
+		goto err_parman_prios_alloc;
+	mr_tcam_region->parman_prios = parman_prios;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_init(mr_tcam_region->parman,
+				 &mr_tcam_region->parman_prios[i], i);
+	return 0;
+
+err_parman_prios_alloc:
+	parman_destroy(parman);
+err_parman_create:
+	mlxsw_sp_mr_tcam_region_free(mr_tcam_region);
+	return err;
+}
+
+static void
+mlxsw_sp_mr_tcam_region_fini(struct mlxsw_sp_mr_tcam_region *mr_tcam_region)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_fini(&mr_tcam_region->parman_prios[i]);
+	kfree(mr_tcam_region->parman_prios);
+	parman_destroy(mr_tcam_region->parman);
+	mlxsw_sp_mr_tcam_region_free(mr_tcam_region);
+}
+
+static int mlxsw_sp_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MC_ERIF_LIST_ENTRIES) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_TCAM_RULES))
+		return -EIO;
+
+	return mlxsw_sp_mr_tcam_region_init(mlxsw_sp,
+					    &mr_tcam->ipv4_tcam_region,
+					    MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST);
+}
+
+static void mlxsw_sp_mr_tcam_fini(void *priv)
+{
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	mlxsw_sp_mr_tcam_region_fini(&mr_tcam->ipv4_tcam_region);
+}
+
+const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops = {
+	.priv_size = sizeof(struct mlxsw_sp_mr_tcam),
+	.route_priv_size = sizeof(struct mlxsw_sp_mr_tcam_route),
+	.init = mlxsw_sp_mr_tcam_init,
+	.route_create = mlxsw_sp_mr_tcam_route_create,
+	.route_update = mlxsw_sp_mr_tcam_route_update,
+	.route_stats = mlxsw_sp_mr_tcam_route_stats,
+	.route_action_update = mlxsw_sp_mr_tcam_route_action_update,
+	.route_min_mtu_update = mlxsw_sp_mr_tcam_route_min_mtu_update,
+	.route_irif_update = mlxsw_sp_mr_tcam_route_irif_update,
+	.route_erif_add = mlxsw_sp_mr_tcam_route_erif_add,
+	.route_erif_del = mlxsw_sp_mr_tcam_route_erif_del,
+	.route_destroy = mlxsw_sp_mr_tcam_route_destroy,
+	.fini = mlxsw_sp_mr_tcam_fini,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
new file mode 100644
index 0000000..f9b59ee
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
@@ -0,0 +1,43 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+#define _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+
+#include "spectrum.h"
+#include "spectrum_mr.h"
+
+extern const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops;
+
+#endif
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 07/12] mlxsw: spectrum: Add the multicast routing offloading logic
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Add the multicast router offloading logic, which is in charge of handling
the VIF and MFC notifications and translating it to the hardware logic API.

The offloading logic has to overcome several obstacles in order to safely
comply with the kernel multicast router user API:
 - It must keep track of the mapping between VIFs to netdevices. The user
   can add an MFC cache entry pointing to a VIF, delete the VIF and add
   re-add it with a different netdevice. The offloading logic has to handle
   this in order to be compatible with the kernel logic.
 - It must keep track of the mapping between netdevices to spectrum RIFs,
   as the current hardware implementation assume having a RIF for every
   port in a multicast router.
 - It must handle routes pointing to pimreg device to be trapped to the
   kernel, as the packet should be delivered to userspace.
 - It must handle routes pointing tunnel VIFs. The current implementation
   does not support multicast forwarding to tunnels, thus routes that point
   to a tunnel should be trapped to the kernel.
 - It must be aware of proxy multicast routes, which include both (*,*)
   routes and duplicate routes. Currently proxy routes are not offloaded
   and trigger the abort mechanism: removal of all routes from hardware and
   triggering the traffic to go through the kernel.

The multicast routing offloading logic also updates the counters of the
offloaded MFC routes in a periodic work.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v2->v3:
 - Remove double =0 initialization in mlxsw_sp_mr_route_valid_evifs_num.
 - Fix route4 allocation size.
v1->v2:
 - Update the lastuse MFC entry field too, in addition to packets an bytes.
---
 drivers/net/ethernet/mellanox/mlxsw/Makefile      |    3 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h    |    1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c | 1014 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h |  133 +++
 4 files changed, 1150 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h

diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
index 4b88158..9b29764 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Makefile
+++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
@@ -17,7 +17,8 @@ mlxsw_spectrum-objs		:= spectrum.o spectrum_buffers.o \
 				   spectrum_kvdl.o spectrum_acl_tcam.o \
 				   spectrum_acl.o spectrum_flower.o \
 				   spectrum_cnt.o spectrum_fid.o \
-				   spectrum_ipip.o spectrum_acl_flex_actions.o
+				   spectrum_ipip.o spectrum_acl_flex_actions.o \
+				   spectrum_mr.o
 mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB)	+= spectrum_dcb.o
 mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o
 obj-$(CONFIG_MLXSW_MINIMAL)	+= mlxsw_minimal.o
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 9355d91..44c5259 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -153,6 +153,7 @@ struct mlxsw_sp {
 	struct mlxsw_sp_sb *sb;
 	struct mlxsw_sp_bridge *bridge;
 	struct mlxsw_sp_router *router;
+	struct mlxsw_sp_mr *mr;
 	struct mlxsw_afa *afa;
 	struct mlxsw_sp_acl *acl;
 	struct mlxsw_sp_fid_core *fid_core;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
new file mode 100644
index 0000000..0912025
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -0,0 +1,1014 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/rhashtable.h>
+
+#include "spectrum_mr.h"
+#include "spectrum_router.h"
+
+struct mlxsw_sp_mr {
+	const struct mlxsw_sp_mr_ops *mr_ops;
+	void *catchall_route_priv;
+	struct delayed_work stats_update_dw;
+	struct list_head table_list;
+#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_vif {
+	struct net_device *dev;
+	const struct mlxsw_sp_rif *rif;
+	unsigned long vif_flags;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as one of the egress VIFs
+	 */
+	struct list_head route_evif_list;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as an ingress VIF
+	 */
+	struct list_head route_ivif_list;
+};
+
+struct mlxsw_sp_mr_route_vif_entry {
+	struct list_head vif_node;
+	struct list_head route_node;
+	struct mlxsw_sp_mr_vif *mr_vif;
+	struct mlxsw_sp_mr_route *mr_route;
+};
+
+struct mlxsw_sp_mr_table {
+	struct list_head node;
+	enum mlxsw_sp_l3proto proto;
+	struct mlxsw_sp *mlxsw_sp;
+	u32 vr_id;
+	struct mlxsw_sp_mr_vif vifs[MAXVIFS];
+	struct list_head route_list;
+	struct rhashtable route_ht;
+	char catchall_route_priv[0];
+	/* catchall_route_priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_route {
+	struct list_head node;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_mr_route_key key;
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 min_mtu;
+	struct mfc_cache *mfc4;
+	void *route_priv;
+	const struct mlxsw_sp_mr_table *mr_table;
+	/* A list of route_vif_entry structs that point to the egress VIFs */
+	struct list_head evif_list;
+	/* A route_vif_entry struct that point to the ingress VIF */
+	struct mlxsw_sp_mr_route_vif_entry ivif;
+};
+
+static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_mr_route_key),
+	.key_offset = offsetof(struct mlxsw_sp_mr_route, key),
+	.head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node),
+	.automatic_shrinking = true,
+};
+
+static bool mlxsw_sp_mr_vif_regular(const struct mlxsw_sp_mr_vif *vif)
+{
+	return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER));
+}
+
+static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif)
+{
+	return mlxsw_sp_mr_vif_regular(vif) && vif->dev && vif->rif;
+}
+
+static bool mlxsw_sp_mr_vif_rif_invalid(const struct mlxsw_sp_mr_vif *vif)
+{
+	return mlxsw_sp_mr_vif_regular(vif) && vif->dev && !vif->rif;
+}
+
+static bool
+mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route)
+{
+	vifi_t ivif;
+
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		ivif = mr_route->mfc4->mfc_parent;
+		return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+	return false;
+}
+
+static int
+mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	int valid_evifs;
+
+	valid_evifs = 0;
+	list_for_each_entry(rve, &mr_route->evif_list, route_node)
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+			valid_evifs++;
+	return valid_evifs;
+}
+
+static bool mlxsw_sp_mr_route_starg(const struct mlxsw_sp_mr_route *mr_route)
+{
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		return mr_route->key.source_mask.addr4 == INADDR_ANY;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+	return false;
+}
+
+static enum mlxsw_sp_mr_route_action
+mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* If the ingress port is not regular and resolved, trap the route */
+	if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* The kernel does not match a (*,G) route that the ingress interface is
+	 * not one of the egress interfaces, so trap these kind of routes.
+	 */
+	if (mlxsw_sp_mr_route_starg(mr_route) &&
+	    !mlxsw_sp_mr_route_ivif_in_evifs(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If the route has no valid eVIFs, trap it. */
+	if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If either one of the eVIFs is not regular (VIF of type pimreg or
+	 * tunnel) or one of the VIFs has no matching RIF, trap the packet.
+	 */
+	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
+		if (!mlxsw_sp_mr_vif_regular(rve->mr_vif) ||
+		    mlxsw_sp_mr_vif_rif_invalid(rve->mr_vif))
+			return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	}
+	return MLXSW_SP_MR_ROUTE_ACTION_FORWARD;
+}
+
+static enum mlxsw_sp_mr_route_prio
+mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route)
+{
+	return mlxsw_sp_mr_route_starg(mr_route) ?
+		MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG;
+}
+
+static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route_key *key,
+				   const struct mfc_cache *mfc)
+{
+	bool starg = (mfc->mfc_origin == INADDR_ANY);
+
+	memset(key, 0, sizeof(*key));
+	key->vrid = mr_table->vr_id;
+	key->proto = mr_table->proto;
+	key->group.addr4 = mfc->mfc_mcastgrp;
+	key->group_mask.addr4 = 0xffffffff;
+	key->source.addr4 = mfc->mfc_origin;
+	key->source_mask.addr4 = starg ? 0 : 0xffffffff;
+}
+
+static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route,
+				       struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	rve = kzalloc(sizeof(*rve), GFP_KERNEL);
+	if (!rve)
+		return -ENOMEM;
+	rve->mr_route = mr_route;
+	rve->mr_vif = mr_vif;
+	list_add_tail(&rve->route_node, &mr_route->evif_list);
+	list_add_tail(&rve->vif_node, &mr_vif->route_evif_list);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	list_del(&rve->route_node);
+	list_del(&rve->vif_node);
+	kfree(rve);
+}
+
+static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route,
+					struct mlxsw_sp_mr_vif *mr_vif)
+{
+	mr_route->ivif.mr_route = mr_route;
+	mr_route->ivif.mr_vif = mr_vif;
+	list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list);
+}
+
+static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route)
+{
+	list_del(&mr_route->ivif.vif_node);
+}
+
+static int
+mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table,
+			      struct mlxsw_sp_mr_route *mr_route,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	u16 *erif_indices;
+	u16 irif_index;
+	u16 erif = 0;
+
+	erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices),
+				     GFP_KERNEL);
+	if (!erif_indices)
+		return -ENOMEM;
+
+	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+			u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+
+			erif_indices[erif++] = rifi;
+		}
+	}
+
+	if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif);
+	else
+		irif_index = 0;
+
+	route_info->irif_index = irif_index;
+	route_info->erif_indices = erif_indices;
+	route_info->min_mtu = mr_route->min_mtu;
+	route_info->route_action = mr_route->route_action;
+	route_info->erif_num = erif;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info)
+{
+	kfree(route_info->erif_indices);
+}
+
+static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route *mr_route,
+				   bool replace)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_info route_info;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	int err;
+
+	err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info);
+	if (err)
+		return err;
+
+	if (!replace) {
+		struct mlxsw_sp_mr_route_params route_params;
+
+		mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size,
+					       GFP_KERNEL);
+		if (!mr_route->route_priv) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		route_params.key = mr_route->key;
+		route_params.value = route_info;
+		route_params.prio = mlxsw_sp_mr_route_prio(mr_route);
+		err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+					       mr_route->route_priv,
+					       &route_params);
+		if (err)
+			kfree(mr_route->route_priv);
+	} else {
+		err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv,
+					       &route_info);
+	}
+out:
+	mlxsw_sp_mr_route_info_destroy(&route_info);
+	return err;
+}
+
+static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv);
+	kfree(mr_route->route_priv);
+}
+
+static struct mlxsw_sp_mr_route *
+mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table,
+			  struct mfc_cache *mfc)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err;
+	int i;
+
+	/* Allocate and init a new route and fill it with parameters */
+	mr_route = kzalloc(sizeof(*mr_route), GFP_KERNEL);
+	if (!mr_route)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&mr_route->evif_list);
+	mlxsw_sp_mr_route4_key(mr_table, &mr_route->key, mfc);
+
+	/* Find min_mtu and link iVIF and eVIFs */
+	mr_route->min_mtu = ETH_MAX_MTU;
+	ipmr_cache_hold(mfc);
+	mr_route->mfc4 = mfc;
+	mr_route->mr_table = mr_table;
+	for (i = 0; i < MAXVIFS; i++) {
+		if (mfc->mfc_un.res.ttls[i] != 255) {
+			err = mlxsw_sp_mr_route_evif_link(mr_route,
+							  &mr_table->vifs[i]);
+			if (err)
+				goto err;
+			if (mr_table->vifs[i].dev &&
+			    mr_table->vifs[i].dev->mtu < mr_route->min_mtu)
+				mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
+		}
+	}
+	mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]);
+	if (err)
+		goto err;
+
+	mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
+	return mr_route;
+err:
+	ipmr_cache_put(mfc);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_mr_route4_destroy(struct mlxsw_sp_mr_table *mr_table,
+				       struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+
+	mlxsw_sp_mr_route_ivif_unlink(mr_route);
+	ipmr_cache_put(mr_route->mfc4);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+}
+
+static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table,
+				      struct mlxsw_sp_mr_route *mr_route)
+{
+	switch (mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route,
+					bool offload)
+{
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		if (offload)
+			mr_route->mfc4->mfc_flags |= MFC_OFFLOAD;
+		else
+			mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route)
+{
+	bool offload;
+
+	offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_set(mr_route, offload);
+}
+
+static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	mlxsw_sp_mr_mfc_offload_set(mr_route, false);
+	mlxsw_sp_mr_route_erase(mr_table, mr_route);
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+	list_del(&mr_route->node);
+	mlxsw_sp_mr_route_destroy(mr_table, mr_route);
+}
+
+int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
+			   struct mfc_cache *mfc, bool replace)
+{
+	struct mlxsw_sp_mr_route *mr_orig_route = NULL;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err;
+
+	/* If the route is a (*,*) route, abort, as these kind of routes are
+	 * used for proxy routes.
+	 */
+	if (mfc->mfc_origin == INADDR_ANY && mfc->mfc_mcastgrp == INADDR_ANY) {
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		return -EINVAL;
+	}
+
+	/* Create a new route */
+	mr_route = mlxsw_sp_mr_route4_create(mr_table, mfc);
+	if (IS_ERR(mr_route))
+		return PTR_ERR(mr_route);
+
+	/* Find any route with a matching key */
+	mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht,
+					       &mr_route->key,
+					       mlxsw_sp_mr_route_ht_params);
+	if (replace) {
+		/* On replace case, make the route point to the new route_priv.
+		 */
+		if (WARN_ON(!mr_orig_route)) {
+			err = -ENOENT;
+			goto err_no_orig_route;
+		}
+		mr_route->route_priv = mr_orig_route->route_priv;
+	} else if (mr_orig_route) {
+		/* On non replace case, if another route with the same key was
+		 * found, abort, as duplicate routes are used for proxy routes.
+		 */
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		err = -EINVAL;
+		goto err_duplicate_route;
+	}
+
+	/* Put it in the table data-structures */
+	list_add_tail(&mr_route->node, &mr_table->route_list);
+	err = rhashtable_insert_fast(&mr_table->route_ht,
+				     &mr_route->ht_node,
+				     mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	/* Write the route to the hardware */
+	err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace);
+	if (err)
+		goto err_mr_route_write;
+
+	/* Destroy the original route */
+	if (replace) {
+		rhashtable_remove_fast(&mr_table->route_ht,
+				       &mr_orig_route->ht_node,
+				       mlxsw_sp_mr_route_ht_params);
+		list_del(&mr_orig_route->node);
+		mlxsw_sp_mr_route4_destroy(mr_table, mr_orig_route);
+	}
+
+	mlxsw_sp_mr_mfc_offload_update(mr_route);
+	return 0;
+
+err_mr_route_write:
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+err_rhashtable_insert:
+	list_del(&mr_route->node);
+err_no_orig_route:
+err_duplicate_route:
+	mlxsw_sp_mr_route4_destroy(mr_table, mr_route);
+	return err;
+}
+
+void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
+			    struct mfc_cache *mfc)
+{
+	struct mlxsw_sp_mr_route *mr_route;
+	struct mlxsw_sp_mr_route_key key;
+
+	mlxsw_sp_mr_route4_key(mr_table, &key, mfc);
+	mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key,
+					  mlxsw_sp_mr_route_ht_params);
+	if (mr_route)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+}
+
+/* Should be called after the VIF struct is updated */
+static int
+mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 irif_index;
+	int err;
+
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return 0;
+
+	/* rve->mr_vif->rif is guaranteed to be valid at this stage */
+	irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv,
+					    irif_index);
+	if (err)
+		return err;
+
+	err = mr->mr_ops->route_action_update(mlxsw_sp,
+					      rve->mr_route->route_priv,
+					      route_action);
+	if (err)
+		/* No need to rollback here because the iRIF change only takes
+		 * place after the action has been updated.
+		 */
+		return err;
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv,
+					MLXSW_SP_MR_ROUTE_ACTION_TRAP);
+	rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+/* Should be called after the RIF struct is updated */
+static int
+mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 erif_index = 0;
+	int err;
+
+	/* Update the route action, as the new eVIF can be a tunnel or a pimreg
+	 * device which will require updating the action.
+	 */
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action) {
+		err = mr->mr_ops->route_action_update(mlxsw_sp,
+						      rve->mr_route->route_priv,
+						      route_action);
+		if (err)
+			return err;
+	}
+
+	/* Add the eRIF */
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+		erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+		err = mr->mr_ops->route_erif_add(mlxsw_sp,
+						 rve->mr_route->route_priv,
+						 erif_index);
+		if (err)
+			goto err_route_erif_add;
+	}
+
+	/* Update the minimum MTU */
+	if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) {
+		rve->mr_route->min_mtu = rve->mr_vif->dev->mtu;
+		err = mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+						       rve->mr_route->route_priv,
+						       rve->mr_route->min_mtu);
+		if (err)
+			goto err_route_min_mtu_update;
+	}
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+
+err_route_min_mtu_update:
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
+					   erif_index);
+err_route_erif_add:
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						rve->mr_route->route_action);
+	return err;
+}
+
+/* Should be called before the RIF struct is updated */
+static void
+mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 rifi;
+
+	/* If the unresolved RIF was not valid, no need to delete it */
+	if (!mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		return;
+
+	/* Update the route action: if there is only one valid eVIF in the
+	 * route, set the action to trap as the VIF deletion will lead to zero
+	 * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to
+	 * determine the route action.
+	 */
+	if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1)
+		route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	else
+		route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						route_action);
+
+	/* Delete the erif from the route */
+	rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi);
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table,
+				   struct net_device *dev,
+				   struct mlxsw_sp_mr_vif *mr_vif,
+				   unsigned long vif_flags,
+				   const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *irve, *erve;
+	int err;
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = rif;
+	mr_vif->vif_flags = vif_flags;
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) {
+		err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve);
+		if (err)
+			goto err_irif_unresolve;
+	}
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) {
+		err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve);
+		if (err)
+			goto err_erif_unresolve;
+	}
+	return 0;
+
+err_erif_unresolve:
+	list_for_each_entry_from_reverse(erve, &mr_vif->route_evif_list,
+					 vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, erve);
+err_irif_unresolve:
+	list_for_each_entry_from_reverse(irve, &mr_vif->route_ivif_list,
+					 vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve);
+	mr_vif->rif = NULL;
+	return err;
+}
+
+static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				      struct net_device *dev,
+				      struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, rve);
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve);
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = NULL;
+}
+
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags, const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return -EINVAL;
+	if (mr_vif->dev)
+		return -EEXIST;
+	return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif);
+}
+
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return;
+	if (WARN_ON(!mr_vif->dev))
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif);
+}
+
+struct mlxsw_sp_mr_vif *
+mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table,
+			   const struct net_device *dev)
+{
+	vifi_t vif_index;
+
+	for (vif_index = 0; vif_index < MAXVIFS; vif_index++)
+		if (mr_table->vifs[vif_index].dev == dev)
+			return &mr_table->vifs[vif_index];
+	return NULL;
+}
+
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return 0;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return 0;
+	return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif,
+				       mr_vif->vif_flags, rif);
+}
+
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif);
+}
+
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	/* Search for a VIF that use that RIF */
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+
+	/* Update all the routes that uses that VIF as eVIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) {
+		if (mtu < rve->mr_route->min_mtu) {
+			rve->mr_route->min_mtu = mtu;
+			mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+							 rve->mr_route->route_priv,
+							 mtu);
+		}
+	}
+}
+
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 vr_id,
+						   enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_mr_route_params catchall_route_params = {
+		.prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+		.key = {
+			.vrid = vr_id,
+		},
+		.value = {
+			.route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+		}
+	};
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_table *mr_table;
+	int err;
+	int i;
+
+	mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size,
+			   GFP_KERNEL);
+	if (!mr_table)
+		return ERR_PTR(-ENOMEM);
+
+	mr_table->vr_id = vr_id;
+	mr_table->mlxsw_sp = mlxsw_sp;
+	mr_table->proto = proto;
+	INIT_LIST_HEAD(&mr_table->route_list);
+
+	err = rhashtable_init(&mr_table->route_ht,
+			      &mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_route_rhashtable_init;
+
+	for (i = 0; i < MAXVIFS; i++) {
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list);
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list);
+	}
+
+	err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+				       mr_table->catchall_route_priv,
+				       &catchall_route_params);
+	if (err)
+		goto err_ops_route_create;
+	list_add_tail(&mr_table->node, &mr->table_list);
+	return mr_table;
+
+err_ops_route_create:
+	rhashtable_destroy(&mr_table->route_ht);
+err_route_rhashtable_init:
+	kfree(mr_table);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	WARN_ON(!mlxsw_sp_mr_table_empty(mr_table));
+	list_del(&mr_table->node);
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv,
+				  &mr_table->catchall_route_priv);
+	rhashtable_destroy(&mr_table->route_ht);
+	kfree(mr_table);
+}
+
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp_mr_route *mr_route, *tmp;
+	int i;
+
+	list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+
+	for (i = 0; i < MAXVIFS; i++) {
+		mr_table->vifs[i].dev = NULL;
+		mr_table->vifs[i].rif = NULL;
+	}
+}
+
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table)
+{
+	int i;
+
+	for (i = 0; i < MAXVIFS; i++)
+		if (mr_table->vifs[i].dev)
+			return false;
+	return list_empty(&mr_table->route_list);
+}
+
+static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp,
+					   struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u64 packets, bytes;
+
+	if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return;
+
+	mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets,
+				&bytes);
+
+	switch (mr_route->mr_table->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		if (mr_route->mfc4->mfc_un.res.pkt != packets)
+			mr_route->mfc4->mfc_un.res.lastuse = jiffies;
+		mr_route->mfc4->mfc_un.res.pkt = packets;
+		mr_route->mfc4->mfc_un.res.bytes = bytes;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		/* fall through */
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+
+static void mlxsw_sp_mr_stats_update(struct work_struct *work)
+{
+	struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr,
+					      stats_update_dw.work);
+	struct mlxsw_sp_mr_table *mr_table;
+	struct mlxsw_sp_mr_route *mr_route;
+	unsigned long interval;
+
+	rtnl_lock();
+	list_for_each_entry(mr_table, &mr->table_list, node)
+		list_for_each_entry(mr_route, &mr_table->route_list, node)
+			mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp,
+						       mr_route);
+	rtnl_unlock();
+
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+}
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops)
+{
+	struct mlxsw_sp_mr *mr;
+	unsigned long interval;
+	int err;
+
+	mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL);
+	if (!mr)
+		return -ENOMEM;
+	mr->mr_ops = mr_ops;
+	mlxsw_sp->mr = mr;
+	INIT_LIST_HEAD(&mr->table_list);
+
+	err = mr_ops->init(mlxsw_sp, mr->priv);
+	if (err)
+		goto err;
+
+	/* Create the delayed work for counter updates */
+	INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update);
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+	return 0;
+err:
+	kfree(mr);
+	return err;
+}
+
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	cancel_delayed_work_sync(&mr->stats_update_dw);
+	mr->mr_ops->fini(mr->priv);
+	kfree(mr);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
new file mode 100644
index 0000000..c851b23
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
@@ -0,0 +1,133 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_H
+#define _MLXSW_SPECTRUM_MCROUTER_H
+
+#include <linux/mroute.h>
+#include "spectrum_router.h"
+#include "spectrum.h"
+
+enum mlxsw_sp_mr_route_action {
+	MLXSW_SP_MR_ROUTE_ACTION_FORWARD,
+	MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+};
+
+enum mlxsw_sp_mr_route_prio {
+	MLXSW_SP_MR_ROUTE_PRIO_SG,
+	MLXSW_SP_MR_ROUTE_PRIO_STARG,
+	MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+	__MLXSW_SP_MR_ROUTE_PRIO_MAX
+};
+
+#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1)
+
+struct mlxsw_sp_mr_route_key {
+	int vrid;
+	enum mlxsw_sp_l3proto proto;
+	union mlxsw_sp_l3addr group;
+	union mlxsw_sp_l3addr group_mask;
+	union mlxsw_sp_l3addr source;
+	union mlxsw_sp_l3addr source_mask;
+};
+
+struct mlxsw_sp_mr_route_info {
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 irif_index;
+	u16 *erif_indices;
+	size_t erif_num;
+	u16 min_mtu;
+};
+
+struct mlxsw_sp_mr_route_params {
+	struct mlxsw_sp_mr_route_key key;
+	struct mlxsw_sp_mr_route_info value;
+	enum mlxsw_sp_mr_route_prio prio;
+};
+
+struct mlxsw_sp_mr_ops {
+	int priv_size;
+	int route_priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			    void *route_priv,
+			    struct mlxsw_sp_mr_route_params *route_params);
+	int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			    struct mlxsw_sp_mr_route_info *route_info);
+	int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			   u64 *packets, u64 *bytes);
+	int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				   enum mlxsw_sp_mr_route_action route_action);
+	int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				    u16 min_mtu);
+	int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				 u16 irif_index);
+	int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv);
+	void (*fini)(void *priv);
+};
+
+struct mlxsw_sp_mr;
+struct mlxsw_sp_mr_table;
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops);
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table,
+			   struct mfc_cache *mfc, bool replace);
+void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table,
+			    struct mfc_cache *mfc);
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index);
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu);
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 tb_id,
+						   enum mlxsw_sp_l3proto proto);
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table);
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table);
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table);
+
+#endif
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 06/12] net: mroute: Check if rule is a default rule
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

When the ipmr starts, it adds one default FIB rule that matches all packets
and sends them to the DEFAULT (multicast) FIB table. A more complex rule
can be added by user to specify that for a specific interface, a packet
should be look up at either an arbitrary table or according to the l3mdev
of the interface.

For drivers willing to offload the ipmr logic into a hardware but don't
want to offload all the FIB rules functionality, provide a function that
can indicate whether the FIB rule is the default multicast rule, thus only
one routing table is needed.

This way, a driver can register to the FIB notification chain, get
notifications about FIB rules added and trigger some kind of an internal
abort mechanism when a non default rule is added by the user.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v2->v3:
 - Use the already existing ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
v1->v2:
 - Update the lastuse MFC entry field too, in addition to packets an bytes.
---
 include/linux/mroute.h |  7 +++++++
 net/ipv4/ipmr.c        | 12 ++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 5566580..b072a84 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -5,6 +5,7 @@
 #include <linux/pim.h>
 #include <linux/rhashtable.h>
 #include <net/sock.h>
+#include <net/fib_rules.h>
 #include <net/fib_notifier.h>
 #include <uapi/linux/mroute.h>
 
@@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
 int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
 int ip_mr_init(void);
+bool ipmr_rule_default(const struct fib_rule *rule);
 #else
 static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
 				       char __user *optval, unsigned int optlen)
@@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt)
 {
 	return 0;
 }
+
+static inline bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
 #endif
 
 struct vif_device {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2a795d2..292a8e8 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -274,6 +274,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net)
 {
 	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
 }
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
 #else
 #define ipmr_for_each_table(mrt, net) \
 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
@@ -318,6 +324,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net)
 {
 	return 0;
 }
+
+bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
+EXPORT_SYMBOL(ipmr_rule_default);
 #endif
 
 static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 05/12] net: ipmr: Add MFC offload indication
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Allow drivers, registered to the fib notification chain indicate whether a
multicast MFC route is offloaded or not, similarly to unicast routes. The
indication of whether a route is offloaded is done using the mfc_flags
field on an mfc_cache struct, and the information is sent to the userspace
via the RTNetlink interface only.

Currently, MFC routes are either offloaded or not, thus there is no need to
add per-VIF offload indication.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v1->v2:
 - Add comment for the MFC_OFFLOAD flag
---
 include/linux/mroute.h | 2 ++
 net/ipv4/ipmr.c        | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 54c5cb8..5566580 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -90,9 +90,11 @@ struct mr_table {
 
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
  */
 enum {
 	MFC_STATIC = BIT(0),
+	MFC_OFFLOAD = BIT(1),
 };
 
 struct mfc_cache_cmp_arg {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ba71bc4..2a795d2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
 		return -EMSGSIZE;
 
+	if (c->mfc_flags & MFC_OFFLOAD)
+		rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
 	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
 		return -EMSGSIZE;
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 04/12] ipmr: Send FIB notifications on MFC and VIF entries
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Use the newly introduced notification chain to send events upon VIF and MFC
addition and deletion. The MFC notifications are sent only on resolved MFC
entries, as unresolved cannot be offloaded.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 net/ipv4/ipmr.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 49879c3..ba71bc4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -627,6 +627,27 @@ static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
 	return call_fib_notifier(nb, net, event_type, &info.info);
 }
 
+static int call_ipmr_vif_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct vif_device *vif,
+					 vifi_t vif_index, u32 tb_id)
+{
+	struct vif_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.dev = vif->dev,
+		.vif_index = vif_index,
+		.vif_flags = vif->flags,
+		.tb_id = tb_id,
+	};
+
+	ASSERT_RTNL();
+	net->ipv4.ipmr_seq++;
+	return call_fib_notifiers(net, event_type, &info.info);
+}
+
 static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
 					struct net *net,
 					enum fib_event_type event_type,
@@ -644,6 +665,24 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
 	return call_fib_notifier(nb, net, event_type, &info.info);
 }
 
+static int call_ipmr_mfc_entry_notifiers(struct net *net,
+					 enum fib_event_type event_type,
+					 struct mfc_cache *mfc, u32 tb_id)
+{
+	struct mfc_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.mfc = mfc,
+		.tb_id = tb_id
+	};
+
+	ASSERT_RTNL();
+	net->ipv4.ipmr_seq++;
+	return call_fib_notifiers(net, event_type, &info.info);
+}
+
 /**
  *	vif_delete - Delete a VIF entry
  *	@notify: Set to 1, if the caller is a notifier_call
@@ -651,6 +690,7 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
 static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 		      struct list_head *head)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct vif_device *v;
 	struct net_device *dev;
 	struct in_device *in_dev;
@@ -660,6 +700,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 	v = &mrt->vif_table[vifi];
 
+	if (VIF_EXISTS(mrt, vifi))
+		call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
+					      mrt->id);
+
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
 	v->dev = NULL;
@@ -909,6 +953,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	if (vifi+1 > mrt->maxvif)
 		mrt->maxvif = vifi+1;
 	write_unlock_bh(&mrt_lock);
+	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
 	return 0;
 }
 
@@ -1209,6 +1254,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct mfc_cache *c;
 
 	/* The entries are added/deleted only under RTNL */
@@ -1220,6 +1266,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 		return -ENOENT;
 	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 	list_del_rcu(&c->list);
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_DELROUTE);
 	ipmr_cache_put(c);
 
@@ -1248,6 +1295,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 		if (!mrtsock)
 			c->mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
+					      mrt->id);
 		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 		return 0;
 	}
@@ -1297,6 +1346,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 		ipmr_cache_resolve(net, mrt, uc, c);
 		ipmr_cache_free(uc);
 	}
+	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 	return 0;
 }
@@ -1304,6 +1354,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 /* Close the multicast socket, and clear the vif tables etc */
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
+	struct net *net = read_pnet(&mrt->net);
 	struct mfc_cache *c, *tmp;
 	LIST_HEAD(list);
 	int i;
@@ -1322,6 +1373,8 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 		list_del_rcu(&c->list);
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c,
+					      mrt->id);
 		mroute_netlink_event(mrt, c, RTM_DELROUTE);
 		ipmr_cache_put(c);
 	}
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 03/12] ipmr: Add FIB notification access functions
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Make the ipmr module register as a FIB notifier. To do that, implement both
the ipmr_seq_read and ipmr_dump ops.

The ipmr_seq_read op returns a sequence counter that is incremented on
every notification related operation done by the ipmr. To implement that,
add a sequence counter in the netns_ipv4 struct and increment it whenever a
new MFC route or VIF are added or deleted. The sequence operations are
protected by the RTNL lock.

The ipmr_dump iterates the list of MFC routes and the list of VIF entries
and sends notifications about them. The entries dump is done under RCU
where the VIF dump uses the mrt_lock too, as the vif->dev field can change
under RCU.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v1->v2:
 - Take the mrt_lock when dumping VIF entries.
---
 include/linux/mroute.h   |  15 ++++++
 include/net/netns/ipv4.h |   3 ++
 net/ipv4/ipmr.c          | 137 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 10028f2..54c5cb8 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -5,6 +5,7 @@
 #include <linux/pim.h>
 #include <linux/rhashtable.h>
 #include <net/sock.h>
+#include <net/fib_notifier.h>
 #include <uapi/linux/mroute.h>
 
 #ifdef CONFIG_IP_MROUTE
@@ -58,6 +59,14 @@ struct vif_device {
 	int		link;			/* Physical interface index	*/
 };
 
+struct vif_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct net_device *dev;
+	vifi_t vif_index;
+	unsigned short vif_flags;
+	u32 tb_id;
+};
+
 #define VIFF_STATIC 0x8000
 
 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
@@ -146,6 +155,12 @@ struct mfc_cache {
 	struct rcu_head	rcu;
 };
 
+struct mfc_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct mfc_cache *mfc;
+	u32 tb_id;
+};
+
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 8387f09..abc84d9 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -163,6 +163,9 @@ struct netns_ipv4 {
 	struct fib_notifier_ops	*notifier_ops;
 	unsigned int	fib_seq;	/* protected by rtnl_mutex */
 
+	struct fib_notifier_ops	*ipmr_notifier_ops;
+	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
+
 	atomic_t	rt_genid;
 };
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 86dc5f9..49879c3 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
 	rtnl_unlock();
 }
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
+}
 #else
 #define ipmr_for_each_table(mrt, net) \
 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
@@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	net->ipv4.mrt = NULL;
 	rtnl_unlock();
 }
+
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return 0;
+}
+
+static unsigned int ipmr_rules_seq_read(struct net *net)
+{
+	return 0;
+}
 #endif
 
 static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 }
 #endif
 
+static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
+					struct net *net,
+					enum fib_event_type event_type,
+					struct vif_device *vif,
+					vifi_t vif_index, u32 tb_id)
+{
+	struct vif_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.dev = vif->dev,
+		.vif_index = vif_index,
+		.vif_flags = vif->flags,
+		.tb_id = tb_id,
+	};
+
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
+					struct net *net,
+					enum fib_event_type event_type,
+					struct mfc_cache *mfc, u32 tb_id)
+{
+	struct mfc_entry_notifier_info info = {
+		.info = {
+			.family = RTNL_FAMILY_IPMR,
+			.net = net,
+		},
+		.mfc = mfc,
+		.tb_id = tb_id
+	};
+
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
 /**
  *	vif_delete - Delete a VIF entry
  *	@notify: Set to 1, if the caller is a notifier_call
@@ -3050,14 +3107,87 @@ static const struct net_protocol pim_protocol = {
 };
 #endif
 
+static unsigned int ipmr_seq_read(struct net *net)
+{
+	ASSERT_RTNL();
+
+	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+}
+
+static int ipmr_dump(struct net *net, struct notifier_block *nb)
+{
+	struct mr_table *mrt;
+	int err;
+
+	err = ipmr_rules_dump(net, nb);
+	if (err)
+		return err;
+
+	ipmr_for_each_table(mrt, net) {
+		struct vif_device *v = &mrt->vif_table[0];
+		struct mfc_cache *mfc;
+		int vifi;
+
+		/* Notifiy on table VIF entries */
+		read_lock(&mrt_lock);
+		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+			if (!v->dev)
+				continue;
+
+			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
+						     v, vifi, mrt->id);
+		}
+		read_unlock(&mrt_lock);
+
+		/* Notify on table MFC entries */
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+			call_ipmr_mfc_entry_notifier(nb, net,
+						     FIB_EVENT_ENTRY_ADD, mfc,
+						     mrt->id);
+	}
+
+	return 0;
+}
+
+static const struct fib_notifier_ops ipmr_notifier_ops_template = {
+	.family		= RTNL_FAMILY_IPMR,
+	.fib_seq_read	= ipmr_seq_read,
+	.fib_dump	= ipmr_dump,
+	.owner		= THIS_MODULE,
+};
+
+int __net_init ipmr_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *ops;
+
+	net->ipv4.ipmr_seq = 0;
+
+	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	net->ipv4.ipmr_notifier_ops = ops;
+
+	return 0;
+}
+
+static void __net_exit ipmr_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
+	net->ipv4.ipmr_notifier_ops = NULL;
+}
+
 /* Setup for IP multicast routing */
 static int __net_init ipmr_net_init(struct net *net)
 {
 	int err;
 
+	err = ipmr_notifier_init(net);
+	if (err)
+		goto ipmr_notifier_fail;
+
 	err = ipmr_rules_init(net);
 	if (err < 0)
-		goto fail;
+		goto ipmr_rules_fail;
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
@@ -3074,7 +3204,9 @@ static int __net_init ipmr_net_init(struct net *net)
 proc_vif_fail:
 	ipmr_rules_exit(net);
 #endif
-fail:
+ipmr_rules_fail:
+	ipmr_notifier_exit(net);
+ipmr_notifier_fail:
 	return err;
 }
 
@@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
 	remove_proc_entry("ip_mr_cache", net->proc_net);
 	remove_proc_entry("ip_mr_vif", net->proc_net);
 #endif
+	ipmr_notifier_exit(net);
 	ipmr_rules_exit(net);
 }
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 02/12] ipmr: Add reference count to MFC entries
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

Next commits will introduce MFC notifications through the atomic
fib_notification chain, thus allowing modules to be aware of MFC entries.

Due to the fact that modules may need to hold a reference to an MFC entry,
add reference count to MFC entries to prevent them from being freed while
these modules use them.

The reference counting is done only on resolved MFC entries currently.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
v1->v2:
 - Add comment for the mfc_cache.mfc_un.res.refcount field, similarly to
   all other fields in the struct
---
 include/linux/mroute.h | 21 +++++++++++++++++++++
 net/ipv4/ipmr.c        |  8 +++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index d7f6333..10028f2 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg {
  * @wrong_if: number of wrong source interface hits
  * @lastuse: time of last use of the group (traffic or update)
  * @ttls: OIF TTL threshold array
+ * @refcount: reference count for this entry
  * @list: global entry list
  * @rcu: used for entry destruction
  */
@@ -138,6 +139,7 @@ struct mfc_cache {
 			unsigned long wrong_if;
 			unsigned long lastuse;
 			unsigned char ttls[MAXVIFS];
+			refcount_t refcount;
 		} res;
 	} mfc_un;
 	struct list_head list;
@@ -148,4 +150,23 @@ struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
 		   struct rtmsg *rtm, u32 portid);
+
+#ifdef CONFIG_IP_MROUTE
+void ipmr_cache_free(struct mfc_cache *mfc_cache);
+#else
+static inline void ipmr_cache_free(struct mfc_cache *mfc_cache)
+{
+}
+#endif
+
+static inline void ipmr_cache_put(struct mfc_cache *c)
+{
+	if (refcount_dec_and_test(&c->mfc_un.res.refcount))
+		ipmr_cache_free(c);
+}
+static inline void ipmr_cache_hold(struct mfc_cache *c)
+{
+	refcount_inc(&c->mfc_un.res.refcount);
+}
+
 #endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c9b3e6e..86dc5f9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -652,10 +652,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head)
 	kmem_cache_free(mrt_cachep, c);
 }
 
-static inline void ipmr_cache_free(struct mfc_cache *c)
+void ipmr_cache_free(struct mfc_cache *c)
 {
 	call_rcu(&c->rcu, ipmr_cache_free_rcu);
 }
+EXPORT_SYMBOL(ipmr_cache_free);
 
 /* Destroy an unresolved cache entry, killing queued skbs
  * and reporting error to netlink readers.
@@ -949,6 +950,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
 	if (c) {
 		c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
 		c->mfc_un.res.minvif = MAXVIFS;
+		refcount_set(&c->mfc_un.res.refcount, 1);
 	}
 	return c;
 }
@@ -1162,7 +1164,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 	list_del_rcu(&c->list);
 	mroute_netlink_event(mrt, c, RTM_DELROUTE);
-	ipmr_cache_free(c);
+	ipmr_cache_put(c);
 
 	return 0;
 }
@@ -1264,7 +1266,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 		list_del_rcu(&c->list);
 		mroute_netlink_event(mrt, c, RTM_DELROUTE);
-		ipmr_cache_free(c);
+		ipmr_cache_put(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 01/12] fib: notifier: Add VIF add and delete event types
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Yotam Gigi <yotamg@mellanox.com>

In order for an interface to forward packets according to the kernel
multicast routing table, it must be configured with a VIF index according
to the mroute user API. The VIF index is then used to refer to that
interface in the mroute user API, for example, to set the iif and oifs of
an MFC entry.

In order to allow drivers to be aware and offload multicast routes, they
have to be aware of the VIF add and delete notifications.

Due to the fact that a specific VIF can be deleted and re-added pointing to
another netdevice, and the MFC routes that point to it will forward the
matching packets to the new netdevice, a driver willing to offload MFC
cache entries must be aware of the VIF add and delete events in addition to
MFC routes notifications.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 include/net/fib_notifier.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index 669b971..54cd6b8 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -20,6 +20,8 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 	FIB_EVENT_NH_ADD,
 	FIB_EVENT_NH_DEL,
+	FIB_EVENT_VIF_ADD,
+	FIB_EVENT_VIF_DEL,
 };

 struct fib_notifier_ops {
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v3 00/12] mlxsw: Add support for offloading IPv4 multicast routes
From: Jiri Pirko @ 2017-09-27  6:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng

From: Jiri Pirko <jiri@mellanox.com>

Yotam says:

This patch-set introduces offloading of the kernel IPv4 multicast router
logic in the Spectrum driver.

The first patch makes the Spectrum driver ignore FIB notifications that are
not of address family IPv4 or IPv6. This is needed in order to prevent
crashes while the next patches introduce the RTNL_FAMILY_IPMR FIB
notifications.

Patches 2-5 update ipmr to use the FIB notification chain for both MFC and
VIF notifications, and patches 8-12 update the Spectrum driver to register
to these notifications and offload the routes.

Similarly to IPv4 and IPv6, any failure will trigger the abort mechanism
which is updated in this patch-set to eject multicast route tables too.

At this stage, the following limitations apply:
 - A multicast MFC route will be offloaded by the driver if all the output
   interfaces are Spectrum router interfaces (RIFs). In any other case
   (which includes pimreg device, tunnel devices and management ports) the
   route will be trapped to the CPU and the packets will be forwarded by
   software.
 - ipmr proxy routes are not supported and will trigger the abort
   mechanism.
 - The MFC TTL values are currently treated as boolean: if the value is
   different than 255, the traffic is forwarded to the interface and if the
   value is 255 it is not forwarded. Dropping packets based on their TTL isn't
   currently supported.

To allow users to have visibility on which of the routes are offloaded and
which are not, patch 6 introduces a per-route offload indication similar to
IPv4 and IPv6 routes which is sent to the user via the RTNetlink interface.

The Spectrum driver multicast router offloading support, which is
introduced in patches 8 and 9, is divided into two parts:
 - The hardware logic which abstracts the Spectrum hardware and provides a
   simple API for the upper levels.
 - The offloading logic which gets the MFC and VIF notifications from the
   kernel and updates the hardware using the hardware logic part.

Finally, the last patch makes the Spectrum router logic not ignore the
multicast FIB notifications and call the corresponding functions in the
multicast router offloading logic.

---
v2->v3:
 - Move the ipmr_rule_default function definition to be inside the already
   existing CONFIG_IP_MROUTE_MULTIPLE_TABLES ifdef block (patch 6)
 - Remove double =0 initialization in spectrum_mr.c (patch 7)
 - Fix route4 allocation size (patch 7)
v1->v2:
 - Add comments for struct fields in mroute.h
 - Take the mrt_lock while dumping VIFs in the fib_notifier dump callback
 - Update the MFC lastuse field too

Yotam Gigi (12):
  fib: notifier: Add VIF add and delete event types
  ipmr: Add reference count to MFC entries
  ipmr: Add FIB notification access functions
  ipmr: Send FIB notifications on MFC and VIF entries
  net: ipmr: Add MFC offload indication
  net: mroute: Check if rule is a default rule
  mlxsw: spectrum: Add the multicast routing offloading logic
  mlxsw: spectrum: Add the multicast routing hardware logic
  mlxsw: spectrum: router: Squash the default route table to main
  mlxsw: spectrum_router: Add multicast routes notification handling
    functionality
  mlxsw: spectrum: Notify multicast router on RIF MTU changes
  mlxsw: spectrum: router: Don't ignore IPMR notifications

 drivers/net/ethernet/mellanox/mlxsw/Makefile       |    3 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |    2 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c  | 1014 ++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h  |  133 +++
 .../net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c |  828 ++++++++++++++++
 .../net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h |   43 +
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  205 +++-
 include/linux/mroute.h                             |   45 +
 include/net/fib_notifier.h                         |    2 +
 include/net/netns/ipv4.h                           |    3 +
 net/ipv4/ipmr.c                                    |  213 +++-
 11 files changed, 2480 insertions(+), 11 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h

-- 
2.9.5

^ permalink raw reply

* Re: [PATCH net] net: Set sk_prot_creator when cloning sockets to the right proto
From: Eric Dumazet @ 2017-09-27  6:14 UTC (permalink / raw)
  To: Christoph Paasch; +Cc: David Miller, netdev
In-Reply-To: <20170927003850.23731-1-cpaasch@apple.com>

On Tue, 2017-09-26 at 17:38 -0700, Christoph Paasch wrote:
> sk->sk_prot and sk->sk_prot_creator can differ when the app uses
> IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one).
> Which is why sk_prot_creator is there to make sure that sk_prot_free()
> does the kmem_cache_free() on the right kmem_cache slab.
> 
> Now, if such a socket gets transformed back to a listening socket (using
> connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through
> sk_clone_lock() when a new connection comes in. But sk_prot_creator will
> still point to the IPv6 kmem_cache (as everything got copied in
> sk_clone_lock()). When freeing, we will thus put this
> memory back into the IPv6 kmem_cache although it was allocated in the
> IPv4 cache. I have seen memory corruption happening because of this.
> 
> With slub-debugging and MEMCG_KMEM enabled this gives the warning
> 	"cache_from_obj: Wrong slab cache. TCPv6 but object is from TCP"

Nice catch, thanks !

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH] net: stmmac: Meet alignment requirements for DMA
From: Paul Burton @ 2017-09-27  5:23 UTC (permalink / raw)
  To: David Miller
  Cc: matt.redfearn, netdev, alexandre.torgue, peppe.cavallaro,
	linux-kernel, linux-mips, james.hogan
In-Reply-To: <20170926.215321.424825014223425519.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 2770 bytes --]

Hi David,

On Tuesday, 26 September 2017 21:53:21 PDT David Miller wrote:
> From: Paul Burton <paul.burton@imgtec.com>
> Date: Tue, 26 Sep 2017 21:30:56 -0700
> 
> > Nobody said that you are required to do anything, I suggested that
> > it would be beneficial if you were to suggest a change to the
> > documented DMA API such that it allows your usage where it currently
> > does not.
> 
> Documentation is often wrong and it is here.  What 200+ drivers
> actually do and depend upon trumps a simple text document.

Agreed - but if the documented API is wrong then it should change.

> The requirement is that the memory remains quiescent on the cpu side
> while the device messes with it.  And that this quiescence requirement
> may or may not be on a cache line basis.
> 
> There is absolutely no requirement that the buffers themselves are
> cache line aligned.
> 
> In fact, receive buffers for networking are intentionally 2-byte
> aligned in order for the ipv4 headers to be naturally 32-bit aligned.
> 
> Cache line aligning receive buffers will actually make some
> architectures trap because of the bad alignment.
> 
> So see, this cache line alignment requirement is pure madness from
> just about any perspective whatsoever.

Thank you - that's more constructive.

I understand that the network code doesn't suffer from a problem with using 
non-cacheline-aligned buffers, because you guarantee that the CPU will not be 
writing to anything on either side of the memory mapped for DMA up to at least 
a cache line boundary. That is all well & good (though still, I think that 
ought to be documented somewhere even if just by a comment somewhere in linux/
sk_buff.h).

There is still a problem though in other cases which do not provide such a 
guarantee - for example the MMC issue I pointed out previously - which it 
would be useful to be able to catch rather than allowing silent memory 
corruption which can be difficult to track down. Whilst the particular case of 
mapping a struct sk_buff's data for DMA might not trigger this issue the issue 
does still exist in other cases for which aligning data to a cache line 
boundary is not always "pure madness".

So whilst it sounds like you'd happily just change or remove the paragraph 
about cache-line alignment in Documentation/DMA-API.txt, and I agree that 
would be a good start, I wonder whether we could do something better. One 
option might be for the warning in the MIPS DMA code to be predicated on one 
of the cache lines only partially covered by a DMA mapping actually being 
dirty - though this would probably be a more expensive check than we'd want in 
the general case so might have to be conditional upon some new debug entry in 
Kconfig. I'll give this some thought.

Thanks,
    Paul

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH v4 net-next 12/12] gtp: Allow configuring GTP interface as standalone
From: Tom Herbert @ 2017-09-27  4:58 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Add new configuration of GTP interfaces that allow specifying a port to
listen on (as opposed to having to get sockets from a userspace control
plane). This allows GTP interfaces to be configured and the data path
tested without requiring a GTP-C daemon.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c        | 215 ++++++++++++++++++++++++++++++++++++-----------
 include/uapi/linux/gtp.h |   5 ++
 2 files changed, 169 insertions(+), 51 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 1c580df4cfc5..dc1fcd3034af 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -93,6 +93,9 @@ struct gtp_dev {
 	struct sock		*sk0;
 	struct sock		*sk1u;
 
+	struct socket		*sock0;
+	struct socket		*sock1u;
+
 	struct net_device	*dev;
 
 	unsigned int		role;
@@ -451,26 +454,33 @@ static void gtp_encap_destroy(struct sock *sk)
 	}
 }
 
-static void gtp_encap_disable_sock(struct sock *sk)
+static void gtp_encap_release(struct gtp_dev *gtp)
 {
-	if (!sk)
-		return;
+	if (gtp->sk0) {
+		if (gtp->sock0) {
+			udp_tunnel_sock_release(gtp->sock0);
+			gtp->sock0 = NULL;
+		} else {
+			gtp_encap_destroy(gtp->sk0);
+		}
 
-	gtp_encap_destroy(sk);
-}
+		gtp->sk0 = NULL;
+	}
 
-static void gtp_encap_disable(struct gtp_dev *gtp)
-{
-	gtp_encap_disable_sock(gtp->sk0);
-	gtp_encap_disable_sock(gtp->sk1u);
+	if (gtp->sk1u) {
+		if (gtp->sock1u) {
+			udp_tunnel_sock_release(gtp->sock1u);
+			gtp->sock1u = NULL;
+		} else {
+			gtp_encap_destroy(gtp->sk1u);
+		}
+
+		gtp->sk1u = NULL;
+	}
 }
 
 static int gtp_dev_init(struct net_device *dev)
 {
-	struct gtp_dev *gtp = netdev_priv(dev);
-
-	gtp->dev = dev;
-
 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
@@ -482,7 +492,8 @@ static void gtp_dev_uninit(struct net_device *dev)
 {
 	struct gtp_dev *gtp = netdev_priv(dev);
 
-	gtp_encap_disable(gtp);
+	gtp_encap_release(gtp);
+
 	free_percpu(dev->tstats);
 }
 
@@ -751,6 +762,8 @@ static void gtp_link_setup(struct net_device *dev)
 				  sizeof(struct udphdr) +
 				  sizeof(struct gtp0_header);
 
+	gtp->dev = dev;
+
 	gro_cells_init(&gtp->gro_cells, dev);
 }
 
@@ -764,13 +777,19 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		       struct netlink_ext_ack *extack)
 {
 	unsigned int role = GTP_ROLE_GGSN;
+	bool have_fd, have_ports;
 	bool is_ipv6 = false;
 	struct gtp_dev *gtp;
 	struct gtp_net *gn;
 	int hashsize, err;
 
-	if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1])
+	have_fd = !!data[IFLA_GTP_FD0] || !!data[IFLA_GTP_FD1];
+	have_ports = !!data[IFLA_GTP_PORT0] || !!data[IFLA_GTP_PORT1];
+
+	if (!(have_fd ^ have_ports)) {
+		/* Either got fd(s) or port(s) */
 		return -EINVAL;
+	}
 
 	if (data[IFLA_GTP_ROLE]) {
 		role = nla_get_u32(data[IFLA_GTP_ROLE]);
@@ -831,7 +850,7 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 out_hashtable:
 	gtp_hashtable_free(gtp);
 out_encap:
-	gtp_encap_disable(gtp);
+	gtp_encap_release(gtp);
 	return err;
 }
 
@@ -840,7 +859,7 @@ static void gtp_dellink(struct net_device *dev, struct list_head *head)
 	struct gtp_dev *gtp = netdev_priv(dev);
 
 	gro_cells_destroy(&gtp->gro_cells);
-	gtp_encap_disable(gtp);
+	gtp_encap_release(gtp);
 	gtp_hashtable_free(gtp);
 	list_del_rcu(&gtp->list);
 	unregister_netdevice_queue(dev, head);
@@ -851,6 +870,8 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = {
 	[IFLA_GTP_FD1]			= { .type = NLA_U32 },
 	[IFLA_GTP_PDP_HASHSIZE]		= { .type = NLA_U32 },
 	[IFLA_GTP_ROLE]			= { .type = NLA_U32 },
+	[IFLA_GTP_PORT0]		= { .type = NLA_U16 },
+	[IFLA_GTP_PORT1]		= { .type = NLA_U16 },
 };
 
 static int gtp_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -949,11 +970,35 @@ static void gtp_hashtable_free(struct gtp_dev *gtp)
 	kfree(gtp->tid_hash);
 }
 
-static struct sock *gtp_encap_enable_socket(int fd, int type,
-					    struct gtp_dev *gtp,
-					    bool is_ipv6)
+static int gtp_encap_enable_sock(struct socket *sock, int type,
+				 struct gtp_dev *gtp)
 {
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
+
+	switch (type) {
+	case UDP_ENCAP_GTP0:
+		tuncfg.encap_rcv = gtp0_udp_encap_recv;
+		break;
+	case UDP_ENCAP_GTP1U:
+		tuncfg.encap_rcv = gtp1u_udp_encap_recv;
+		break;
+	default:
+		pr_debug("Unknown encap type %u\n", type);
+		return -EINVAL;
+	}
+
+	tuncfg.sk_user_data = gtp;
+	tuncfg.encap_type = type;
+	tuncfg.encap_destroy = gtp_encap_destroy;
+
+	setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg);
+
+	return 0;
+}
+
+static struct sock *gtp_encap_enable_fd(int fd, int type, struct gtp_dev *gtp,
+					bool is_ipv6)
+{
 	struct socket *sock;
 	struct sock *sk;
 	int err;
@@ -986,60 +1031,128 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 	sk = sock->sk;
 	sock_hold(sk);
 
-	switch (type) {
-	case UDP_ENCAP_GTP0:
-		tuncfg.encap_rcv = gtp0_udp_encap_recv;
-		break;
-	case UDP_ENCAP_GTP1U:
-		tuncfg.encap_rcv = gtp1u_udp_encap_recv;
-		break;
-	default:
-		pr_debug("Unknown encap type %u\n", type);
-		sk = ERR_PTR(-EINVAL);
-		goto out_sock;
-	}
-
-	tuncfg.sk_user_data = gtp;
-	tuncfg.encap_type = type;
-	tuncfg.encap_destroy = gtp_encap_destroy;
-
-	setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg);
+	err = gtp_encap_enable_sock(sock, type, gtp);
+	if (err < 0)
+		sk = ERR_PTR(err);
 
 out_sock:
 	sockfd_put(sock);
 	return sk;
 }
 
+static struct socket *gtp_create_sock(struct net *net, bool ipv6,
+				      __be16 port, u32 flags)
+{
+	struct socket *sock;
+	struct udp_port_cfg udp_conf;
+	int err;
+
+	memset(&udp_conf, 0, sizeof(udp_conf));
+
+#if GTP_IPV6
+	if (ipv6) {
+		udp_conf.family = AF_INET6;
+		udp_conf.ipv6_v6only = 1;
+	} else {
+		udp_conf.family = AF_INET;
+	}
+#else
+	udp_conf.family = AF_INET;
+#endif
+
+	udp_conf.local_udp_port = port;
+
+	/* Open UDP socket */
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err)
+		return ERR_PTR(err);
+
+	return sock;
+}
+
 static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[],
 			    bool is_ipv6)
 {
+	int err;
+
+	struct socket *sock0 = NULL, *sock1u = NULL;
 	struct sock *sk0 = NULL, *sk1u = NULL;
 
 	if (data[IFLA_GTP_FD0]) {
 		u32 fd0 = nla_get_u32(data[IFLA_GTP_FD0]);
 
-		sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp,
-					      is_ipv6);
-		if (IS_ERR(sk0))
-			return PTR_ERR(sk0);
+		sk0 = gtp_encap_enable_fd(fd0, UDP_ENCAP_GTP0, gtp, is_ipv6);
+		if (IS_ERR(sk0)) {
+			err = PTR_ERR(sk0);
+			sk0 = NULL;
+			goto out_err;
+		}
+	} else if (data[IFLA_GTP_PORT0]) {
+		__be16 port = nla_get_u16(data[IFLA_GTP_PORT0]);
+
+		sock0 = gtp_create_sock(dev_net(gtp->dev), is_ipv6, port, 0);
+		if (IS_ERR(sock0)) {
+			err = PTR_ERR(sock0);
+			sock0 = NULL;
+			goto out_err;
+		}
+
+		err = gtp_encap_enable_sock(sock0, UDP_ENCAP_GTP0, gtp);
+		if (err)
+			goto out_err;
 	}
 
 	if (data[IFLA_GTP_FD1]) {
 		u32 fd1 = nla_get_u32(data[IFLA_GTP_FD1]);
 
-		sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp,
-					       is_ipv6);
+		sk1u = gtp_encap_enable_fd(fd1, UDP_ENCAP_GTP1U, gtp, is_ipv6);
 		if (IS_ERR(sk1u)) {
-			if (sk0)
-				gtp_encap_disable_sock(sk0);
-			return PTR_ERR(sk1u);
+			err = PTR_ERR(sk1u);
+			sk1u = NULL;
+			goto out_err;
+		}
+	} else if (data[IFLA_GTP_PORT1]) {
+		__be16 port = nla_get_u16(data[IFLA_GTP_PORT1]);
+
+		sock1u = gtp_create_sock(dev_net(gtp->dev), is_ipv6, port, 0);
+		if (IS_ERR(sock1u)) {
+			err = PTR_ERR(sock1u);
+			sock1u = NULL;
+			goto out_err;
 		}
+
+		err = gtp_encap_enable_sock(sock1u, UDP_ENCAP_GTP1U, gtp);
+		if (err)
+			goto out_err;
+	}
+
+	if (sock0) {
+		gtp->sock0 = sock0;
+		gtp->sk0 = sock0->sk;
+	} else {
+		gtp->sk0 = sk0;
 	}
 
-	gtp->sk0 = sk0;
-	gtp->sk1u = sk1u;
+	if (sock1u) {
+		gtp->sock1u = sock1u;
+		gtp->sk1u = sock1u->sk;
+	} else {
+		gtp->sk1u = sk1u;
+	}
 
 	return 0;
+
+out_err:
+	if (sk0)
+		gtp_encap_destroy(sk0);
+	if (sk1u)
+		gtp_encap_destroy(sk1u);
+	if (sock0)
+		udp_tunnel_sock_release(sock0);
+	if (sock1u)
+		udp_tunnel_sock_release(sock1u);
+
+	return err;
 }
 
 static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
@@ -1624,8 +1737,8 @@ static const struct genl_ops gtp_genl_ops[] = {
 };
 
 static struct genl_family gtp_genl_family __ro_after_init = {
-	.name		= "gtp",
-	.version	= 0,
+	.name		= GTP_GENL_NAME,
+	.version	= GTP_GENL_VERSION,
 	.hdrsize	= 0,
 	.maxattr	= GTPA_MAX,
 	.netnsok	= true,
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index 8eec519fa754..0da18aa88be8 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -9,6 +9,11 @@ enum gtp_genl_cmds {
 	GTP_CMD_MAX,
 };
 
+/* NETLINK_GENERIC related info
+ */
+#define GTP_GENL_NAME		"gtp"
+#define GTP_GENL_VERSION	0
+
 enum gtp_version {
 	GTP_V0 = 0,
 	GTP_V1,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 11/12] gtp: Experimental support encpasulating over IPv6
From: Tom Herbert @ 2017-09-27  4:58 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Allows using GTP datapath over IPv6. Remote peers are indicated by IPv6.

Note this is experimental, more work is needed to make this
compliant with 3GPP standard.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c            | 248 ++++++++++++++++++++++++++++++++++---------
 include/uapi/linux/gtp.h     |   1 +
 include/uapi/linux/if_link.h |   3 +
 3 files changed, 200 insertions(+), 52 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 919ec6e14973..1c580df4cfc5 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -28,6 +28,7 @@
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
+#include <net/ip6_tunnel.h>
 #include <net/udp.h>
 #include <net/udp_tunnel.h>
 #include <net/icmp.h>
@@ -59,16 +60,22 @@ struct pdp_ctx {
 	__be16			gtp_port;
 
 	u16			ms_af;
+	u16			peer_af;
 #if GTP_IPV6
 	union {
 		struct in_addr	ms_addr_ip4;
 		struct in6_addr	ms_addr_ip6;
 	};
+
+	union {
+		struct in_addr	peer_addr_ip4;
+		struct in6_addr	peer_addr_ip6;
+	};
 #else
 	struct in_addr	ms_addr_ip4;
+	struct in_addr	peer_addr_ip4;
 #endif
 
-	struct in_addr		peer_addr_ip4;
 
 	struct sock		*sk;
 	struct net_device       *dev;
@@ -93,8 +100,11 @@ struct gtp_dev {
 	struct hlist_head	*tid_hash;
 
 	struct hlist_head	*addr4_hash;
+
 #if GTP_IPV6
 	struct hlist_head	*addr6_hash;
+
+	unsigned int		is_ipv6:1;
 #endif
 
 	struct gro_cells	gro_cells;
@@ -534,8 +544,6 @@ static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
 {
 	struct iphdr *inner_iph = NULL;
 	struct sock *sk = pctx->sk;
-	__be32 saddr = inet_sk(sk)->inet_saddr;
-	struct rtable *rt;
 	int err = 0;
 
 	if (skb->protocol == ETH_P_IP)
@@ -548,38 +556,84 @@ static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
 
 	skb_reset_inner_headers(skb);
 
-	/* Source address returned by route lookup is ignored since
-	 * we get the address from a socket.
-	 */
-	rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
-				 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
-				 pctx->peer_addr_ip4.s_addr, &saddr,
-				 pctx->gtp_port, pctx->gtp_port,
-				 &pctx->dst_cache, NULL);
-
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		goto out_err;
-	}
+	if (pctx->peer_af == AF_INET) {
+		__be32 saddr = inet_sk(sk)->inet_saddr;
+		struct rtable *rt;
+
+		/* Source address returned by route lookup is ignored since
+		 * we get the address from a socket.
+		 */
+		rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
+					 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
+					 pctx->peer_addr_ip4.s_addr, &saddr,
+					 pctx->gtp_port, pctx->gtp_port,
+					 &pctx->dst_cache, NULL);
+
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			goto out_err;
+		}
+
+		skb_dst_drop(skb);
 
-	skb_dst_drop(skb);
+		gtp_push_header(skb, pctx);
 
-	gtp_push_header(skb, pctx);
+		if (inner_iph)
+			__iptunnel_update_pmtu(dev, skb, &rt->dst,
+					       !!inner_iph->frag_off,
+					       inner_iph, pctx->hlen,
+					       pctx->peer_addr_ip4.s_addr);
 
-	if (inner_iph)
-		__iptunnel_update_pmtu(dev, skb, &rt->dst,
-				       !!inner_iph->frag_off,
-				       inner_iph, pctx->hlen,
-				       pctx->peer_addr_ip4.s_addr);
+		udp_tunnel_xmit_skb(rt, sk, skb, saddr,
+				    pctx->peer_addr_ip4.s_addr,
+				    0, ip4_dst_hoplimit(&rt->dst), 0,
+				    pctx->gtp_port, pctx->gtp_port,
+				    false, false);
 
-	udp_tunnel_xmit_skb(rt, sk, skb, saddr,
-			    pctx->peer_addr_ip4.s_addr,
-			    0, ip4_dst_hoplimit(&rt->dst), 0,
-			    pctx->gtp_port, pctx->gtp_port,
-			    false, false);
+		netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
+			   &saddr, &pctx->peer_addr_ip4.s_addr);
 
-	netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
-		   &saddr, &pctx->peer_addr_ip4.s_addr);
+#if GTP_IPV6
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (pctx->peer_af == AF_INET6) {
+		struct in6_addr saddr = inet6_sk(sk)->saddr;
+		struct dst_entry *dst;
+
+		/* Source address returned by route lookup is ignored since
+		 * we get the address from a socket.
+		 */
+		dst = ip6_tnl_get_route(dev, skb, sk, sk->sk_protocol,
+					sk->sk_bound_dev_if, 0,
+					0, &pctx->peer_addr_ip6, &saddr,
+					pctx->gtp_port, pctx->gtp_port,
+					&pctx->dst_cache, NULL);
+
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto out_err;
+		}
+
+		skb_dst_drop(skb);
+
+		gtp_push_header(skb, pctx);
+
+		if (inner_iph)
+			__iptunnel_update_pmtu(dev, skb, dst,
+					       !!inner_iph->frag_off,
+					       inner_iph, pctx->hlen, 0);
+
+		udp_tunnel6_xmit_skb(dst, sk, skb, dev,
+				     &saddr, &pctx->peer_addr_ip6,
+				     0, ip6_dst_hoplimit(dst), 0,
+				     pctx->gtp_port, pctx->gtp_port,
+				     false);
+
+		netdev_dbg(dev, "gtp -> IP src: %pI6 dst: %pI6\n",
+			   &saddr, &pctx->peer_addr_ip6);
+
+#endif
+#endif
+	}
 
 	return 0;
 
@@ -688,7 +742,12 @@ static void gtp_link_setup(struct net_device *dev)
 
 	/* Assume largest header, ie. GTPv0. */
 	dev->needed_headroom	= LL_MAX_HEADER +
+#if GTP_IPV6
+				  max_t(int, sizeof(struct iphdr),
+					sizeof(struct ipv6hdr)) +
+#else
 				  sizeof(struct iphdr) +
+#endif
 				  sizeof(struct udphdr) +
 				  sizeof(struct gtp0_header);
 
@@ -697,12 +756,15 @@ static void gtp_link_setup(struct net_device *dev)
 
 static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize);
 static void gtp_hashtable_free(struct gtp_dev *gtp);
-static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]);
+static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[],
+			    bool is_ipv6);
 
 static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[],
 		       struct netlink_ext_ack *extack)
 {
+	unsigned int role = GTP_ROLE_GGSN;
+	bool is_ipv6 = false;
 	struct gtp_dev *gtp;
 	struct gtp_net *gn;
 	int hashsize, err;
@@ -710,9 +772,32 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 	if (!data[IFLA_GTP_FD0] && !data[IFLA_GTP_FD1])
 		return -EINVAL;
 
+	if (data[IFLA_GTP_ROLE]) {
+		role = nla_get_u32(data[IFLA_GTP_ROLE]);
+		if (role > GTP_ROLE_SGSN)
+			return -EINVAL;
+	}
+
+	if (data[IFLA_GTP_AF]) {
+		u16 af = nla_get_u16(data[IFLA_GTP_AF]);
+
+		switch (af) {
+		case AF_INET:
+			is_ipv6 = false;
+			break;
+#if GTP_IPV6
+		case AF_INET6:
+			is_ipv6 = true;
+			break;
+#endif
+		default:
+			return -EINVAL;
+		}
+	}
+
 	gtp = netdev_priv(dev);
 
-	err = gtp_encap_enable(gtp, data);
+	err = gtp_encap_enable(gtp, data, is_ipv6);
 	if (err < 0)
 		return err;
 
@@ -731,6 +816,11 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		goto out_hashtable;
 	}
 
+	gtp->role = role;
+#if GTP_IPV6
+	gtp->is_ipv6 = is_ipv6;
+#endif
+
 	gn = net_generic(dev_net(dev), gtp_net_id);
 	list_add_rcu(&gtp->list, &gn->gtp_dev_list);
 
@@ -860,7 +950,8 @@ static void gtp_hashtable_free(struct gtp_dev *gtp)
 }
 
 static struct sock *gtp_encap_enable_socket(int fd, int type,
-					    struct gtp_dev *gtp)
+					    struct gtp_dev *gtp,
+					    bool is_ipv6)
 {
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
 	struct socket *sock;
@@ -881,6 +972,12 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 		goto out_sock;
 	}
 
+	if (sock->sk->sk_family != (is_ipv6 ? AF_INET6 : AF_INET)) {
+		pr_debug("socket fd=%d not right family\n", fd);
+		sk = ERR_PTR(-EINVAL);
+		goto out_sock;
+	}
+
 	if (rcu_dereference_sk_user_data(sock->sk)) {
 		sk = ERR_PTR(-EBUSY);
 		goto out_sock;
@@ -913,16 +1010,16 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 	return sk;
 }
 
-static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
+static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[],
+			    bool is_ipv6)
 {
-	struct sock *sk1u = NULL;
-	struct sock *sk0 = NULL;
-	unsigned int role = GTP_ROLE_GGSN;
+	struct sock *sk0 = NULL, *sk1u = NULL;
 
 	if (data[IFLA_GTP_FD0]) {
 		u32 fd0 = nla_get_u32(data[IFLA_GTP_FD0]);
 
-		sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp);
+		sk0 = gtp_encap_enable_socket(fd0, UDP_ENCAP_GTP0, gtp,
+					      is_ipv6);
 		if (IS_ERR(sk0))
 			return PTR_ERR(sk0);
 	}
@@ -930,7 +1027,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
 	if (data[IFLA_GTP_FD1]) {
 		u32 fd1 = nla_get_u32(data[IFLA_GTP_FD1]);
 
-		sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp);
+		sk1u = gtp_encap_enable_socket(fd1, UDP_ENCAP_GTP1U, gtp,
+					       is_ipv6);
 		if (IS_ERR(sk1u)) {
 			if (sk0)
 				gtp_encap_disable_sock(sk0);
@@ -938,15 +1036,8 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
 		}
 	}
 
-	if (data[IFLA_GTP_ROLE]) {
-		role = nla_get_u32(data[IFLA_GTP_ROLE]);
-		if (role > GTP_ROLE_SGSN)
-			return -EINVAL;
-	}
-
 	gtp->sk0 = sk0;
 	gtp->sk1u = sk1u;
-	gtp->role = role;
 
 	return 0;
 }
@@ -982,8 +1073,18 @@ static void pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 	__be16 default_port = 0;
 
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
-	pctx->peer_addr_ip4.s_addr =
-		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
+
+	if (info->attrs[GTPA_PEER_ADDRESS]) {
+		pctx->peer_af = AF_INET;
+		pctx->peer_addr_ip4.s_addr =
+			nla_get_in_addr(info->attrs[GTPA_PEER_ADDRESS]);
+#if GTP_IPV6
+	} else if (info->attrs[GTPA_PEER6_ADDRESS]) {
+		pctx->peer_af = AF_INET6;
+		pctx->peer_addr_ip6 = nla_get_in6_addr(
+					info->attrs[GTPA_PEER6_ADDRESS]);
+#endif
+	}
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -1162,11 +1263,17 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	if (!info->attrs[GTPA_VERSION] ||
-	   !info->attrs[GTPA_LINK] ||
-	   !info->attrs[GTPA_PEER_ADDRESS])
+	    !info->attrs[GTPA_LINK])
 		return -EINVAL;
 
 #if GTP_IPV6
+	if (!(!!info->attrs[GTPA_PEER_ADDRESS] ^
+	      !!info->attrs[GTPA_PEER6_ADDRESS])) {
+		/* Either v4 or v6 peer address must be set */
+
+		return -EINVAL;
+	}
+
 	if (!(!!info->attrs[GTPA_MS_ADDRESS] ^
 	      !!info->attrs[GTPA_MS6_ADDRESS])) {
 		/* Either v4 or v6 mobile subscriber address must be set */
@@ -1174,6 +1281,12 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 #else
+	if (!info->attrs[GTPA_PEER_ADDRESS]) {
+		/* v4 peer address must be set */
+
+		return -EINVAL;
+	}
+
 	if (!info->attrs[GTPA_MS_ADDRESS]) {
 		/* v4 mobile subscriber address must be set */
 
@@ -1207,6 +1320,14 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		goto out_unlock;
 	}
 
+#if GTP_IPV6
+	if ((info->attrs[GTPA_PEER_ADDRESS] && gtp->is_ipv6) ||
+	    (info->attrs[GTPA_PEER6_ADDRESS] && !gtp->is_ipv6)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+#endif
+
 	if (version == GTP_V0)
 		sk = gtp->sk0;
 	else if (version == GTP_V1)
@@ -1315,10 +1436,31 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 	if (genlh == NULL)
 		goto nlmsg_failure;
 
-	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) ||
-	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr))
+	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex))
+		goto nla_put_failure;
+
+	switch (pctx->peer_af) {
+	case AF_INET:
+		if (nla_put_be32(skb, GTPA_PEER_ADDRESS,
+				 pctx->peer_addr_ip4.s_addr))
+			goto nla_put_failure;
+
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, GTPA_PEER6_ADDRESS,
+				     &pctx->peer_addr_ip6))
+			goto nla_put_failure;
+
+		break;
+#endif
+	default:
+		goto nla_put_failure;
+	}
+
 	switch (pctx->ms_af) {
 	case AF_INET:
 		if (nla_put_be32(skb, GTPA_MS_ADDRESS,
@@ -1448,6 +1590,8 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = {
 	[GTPA_PEER_ADDRESS]	= { .type = NLA_U32, },
 	[GTPA_MS_ADDRESS]	= { .type = NLA_U32, },
 #if GTP_IPV6
+	[GTPA_PEER6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
+							daddr) },
 	[GTPA_MS6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
 							daddr) },
 #endif
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index ae4e632c0360..8eec519fa754 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -29,6 +29,7 @@ enum gtp_attrs {
 	GTPA_PAD,
 	GTPA_PORT,
 	GTPA_MS6_ADDRESS,
+	GTPA_PEER6_ADDRESS,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d062c58d5cb..81c26864abeb 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -552,6 +552,9 @@ enum {
 	IFLA_GTP_FD1,
 	IFLA_GTP_PDP_HASHSIZE,
 	IFLA_GTP_ROLE,
+	IFLA_GTP_AF,
+	IFLA_GTP_PORT0,
+	IFLA_GTP_PORT1,
 	__IFLA_GTP_MAX,
 };
 #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 10/12] gtp: Experimental encapsulation of IPv6 packets
From: Tom Herbert @ 2017-09-27  4:58 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Allow IPv6 mobile subscriber packets. This entails adding an IPv6 mobile
subscriber address to pdp context and IPv6 specific variants to find pdp
contexts by address.

Note that this is experimental support of IPv6, more work is
necessary to make this compliant with 3GPP standard.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/Kconfig      |  12 +-
 drivers/net/gtp.c        | 324 +++++++++++++++++++++++++++++++++++++++--------
 include/uapi/linux/gtp.h |   1 +
 3 files changed, 280 insertions(+), 57 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index d4292d56bb02..21836f657e5a 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -226,7 +226,17 @@ config GTP
 	  3GPP TS 29.060 standards.
 
 	  To compile this drivers as a module, choose M here: the module
-	  wil be called gtp.
+	  will be called gtp.
+
+config GTP_IPV6_EXPERIMENTAL
+	bool "GTP IPv6 datapath (EXPERIMENTAL)"
+	default n
+	depends on GTP
+	---help---
+	  This is an experimental implementation that allows encapsulating
+	  IPv6 over GTP and using GTP over IPv6 for testing and development
+	  purpose. This is not a standards conformant implementation for
+	  IPv6 and GTP. More work is needed reach that level.
 
 config MACSEC
 	tristate "IEEE 802.1AE MAC-level encryption (MACsec)"
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 44844eba8df2..919ec6e14973 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -36,6 +36,8 @@
 #include <net/netns/generic.h>
 #include <net/gtp.h>
 
+#define GTP_IPV6 IS_ENABLED(CONFIG_GTP_IPV6_EXPERIMENTAL)
+
 /* An active session for the subscriber. */
 struct pdp_ctx {
 	struct hlist_node	hlist_tid;
@@ -55,9 +57,17 @@ struct pdp_ctx {
 	u8			gtp_version;
 	u8			hlen;
 	__be16			gtp_port;
-	u16			af;
 
-	struct in_addr		ms_addr_ip4;
+	u16			ms_af;
+#if GTP_IPV6
+	union {
+		struct in_addr	ms_addr_ip4;
+		struct in6_addr	ms_addr_ip6;
+	};
+#else
+	struct in_addr	ms_addr_ip4;
+#endif
+
 	struct in_addr		peer_addr_ip4;
 
 	struct sock		*sk;
@@ -81,7 +91,11 @@ struct gtp_dev {
 	unsigned int		role;
 	unsigned int		hash_size;
 	struct hlist_head	*tid_hash;
-	struct hlist_head	*addr_hash;
+
+	struct hlist_head	*addr4_hash;
+#if GTP_IPV6
+	struct hlist_head	*addr6_hash;
+#endif
 
 	struct gro_cells	gro_cells;
 };
@@ -99,6 +113,7 @@ static void pdp_context_delete(struct pdp_ctx *pctx);
 static inline u32 gtp0_hashfn(u64 tid)
 {
 	u32 *tid32 = (u32 *) &tid;
+
 	return jhash_2words(tid32[0], tid32[1], gtp_h_initval);
 }
 
@@ -107,11 +122,6 @@ static inline u32 gtp1u_hashfn(u32 tid)
 	return jhash_1word(tid, gtp_h_initval);
 }
 
-static inline u32 ipv4_hashfn(__be32 ip)
-{
-	return jhash_1word((__force u32)ip, gtp_h_initval);
-}
-
 /* Resolve a PDP context structure based on the 64bit TID. */
 static struct pdp_ctx *gtp0_pdp_find(struct gtp_dev *gtp, u64 tid)
 {
@@ -144,16 +154,21 @@ static struct pdp_ctx *gtp1_pdp_find(struct gtp_dev *gtp, u32 tid)
 	return NULL;
 }
 
+static inline u32 gtp_ipv4_hashfn(__be32 ip)
+{
+	return jhash_1word((__force u32)ip, gtp_h_initval);
+}
+
 /* Resolve a PDP context based on IPv4 address of MS. */
 static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr)
 {
 	struct hlist_head *head;
 	struct pdp_ctx *pdp;
 
-	head = &gtp->addr_hash[ipv4_hashfn(ms_addr) % gtp->hash_size];
+	head = &gtp->addr4_hash[gtp_ipv4_hashfn(ms_addr) % gtp->hash_size];
 
 	hlist_for_each_entry_rcu(pdp, head, hlist_addr) {
-		if (pdp->af == AF_INET &&
+		if (pdp->ms_af == AF_INET &&
 		    pdp->ms_addr_ip4.s_addr == ms_addr)
 			return pdp;
 	}
@@ -177,33 +192,109 @@ static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx,
 		return iph->saddr == pctx->ms_addr_ip4.s_addr;
 }
 
+#if GTP_IPV6
+
+static inline u32 gtp_ipv6_hashfn(const struct in6_addr *a)
+{
+	return __ipv6_addr_jhash(a, gtp_h_initval);
+}
+
+/* Resolve a PDP context based on IPv6 address of MS. */
+static struct pdp_ctx *ipv6_pdp_find(struct gtp_dev *gtp,
+				     const struct in6_addr *ms_addr)
+{
+	struct hlist_head *head;
+	struct pdp_ctx *pdp;
+
+	head = &gtp->addr6_hash[gtp_ipv6_hashfn(ms_addr) % gtp->hash_size];
+
+	hlist_for_each_entry_rcu(pdp, head, hlist_addr) {
+		if (pdp->ms_af == AF_INET6 &&
+		    ipv6_addr_equal(&pdp->ms_addr_ip6, ms_addr))
+			return pdp;
+	}
+
+	return NULL;
+}
+
+static bool gtp_check_ms_ipv6(struct sk_buff *skb, struct pdp_ctx *pctx,
+			      unsigned int hdrlen, unsigned int role)
+{
+	struct ipv6hdr *ipv6h;
+
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct ipv6hdr)))
+		return false;
+
+	ipv6h = (struct ipv6hdr *)(skb->data + hdrlen);
+
+	if (role == GTP_ROLE_SGSN)
+		return ipv6_addr_equal(&ipv6h->daddr, &pctx->ms_addr_ip6);
+	else
+		return ipv6_addr_equal(&ipv6h->saddr, &pctx->ms_addr_ip6);
+}
+
+#endif
+
 /* Check if the inner IP address in this packet is assigned to any
  * existing mobile subscriber.
  */
 static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx,
 			     unsigned int hdrlen, unsigned int role)
 {
-	switch (ntohs(skb->protocol)) {
-	case ETH_P_IP:
+	struct iphdr *iph;
+
+	/* Minimally there needs to be an IPv4 header */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
+		return false;
+
+	iph = (struct iphdr *)(skb->data + hdrlen);
+
+	switch (iph->version) {
+	case 4:
 		return gtp_check_ms_ipv4(skb, pctx, hdrlen, role);
+#if GTP_IPV6
+	case 6:
+		return gtp_check_ms_ipv6(skb, pctx, hdrlen, role);
+#endif
 	}
+
 	return false;
 }
 
+static u16 ipver_to_eth(struct iphdr *iph)
+{
+	switch (iph->version) {
+	case 4:
+		return htons(ETH_P_IP);
+#if GTP_IPV6
+	case 6:
+		return htons(ETH_P_IPV6);
+#endif
+	default:
+		return 0;
+	}
+}
+
 static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
-			unsigned int hdrlen, unsigned int role)
+		  unsigned int hdrlen, unsigned int role)
 {
 	struct gtp_dev *gtp = netdev_priv(pctx->dev);
 	struct pcpu_sw_netstats *stats;
+	u16 inner_protocol;
 
 	if (!gtp_check_ms(skb, pctx, hdrlen, role)) {
 		netdev_dbg(pctx->dev, "No PDP ctx for this MS\n");
 		return 1;
 	}
 
+	inner_protocol = ipver_to_eth((struct iphdr *)(skb->data + hdrlen));
+	if (!inner_protocol)
+		return -1;
+
 	/* Get rid of the GTP + UDP headers. */
-	if (iptunnel_pull_header(skb, hdrlen, skb->protocol,
-				 !net_eq(sock_net(pctx->sk), dev_net(pctx->dev))))
+	if (iptunnel_pull_header(skb, hdrlen, inner_protocol,
+				 !net_eq(sock_net(pctx->sk),
+					 dev_net(pctx->dev))))
 		return -1;
 
 	netdev_dbg(pctx->dev, "forwarding packet from GGSN to uplink\n");
@@ -241,7 +332,8 @@ static int gtp0_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!gtp)
 		goto pass;
 
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Pull through IP header since gtp_rx looks at IP version */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr));
@@ -287,7 +379,8 @@ static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!gtp)
 		goto pass;
 
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Pull through IP header since gtp_rx looks at IP version */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
@@ -309,8 +402,10 @@ static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (gtp1->flags & GTP1_F_MASK)
 		hdrlen += 4;
 
-	/* Make sure the header is larger enough, including extensions. */
-	if (!pskb_may_pull(skb, hdrlen))
+	/* Make sure the header is larger enough, including extensions and
+	 * also an IP header since gtp_rx looks at IP version
+	 */
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct iphdr)))
 		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
@@ -391,7 +486,8 @@ static inline void gtp0_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 	gtp0->flags	= 0x1e; /* v0, GTP-non-prime. */
 	gtp0->type	= GTP_TPDU;
 	gtp0->length	= htons(payload_len);
-	gtp0->seq	= htons((atomic_inc_return(&pctx->tx_seq) - 1) % 0xffff);
+	gtp0->seq	= htons((atomic_inc_return(&pctx->tx_seq) - 1) %
+				0xffff);
 	gtp0->flow	= htons(pctx->u.v0.flow);
 	gtp0->number	= 0xff;
 	gtp0->spare[0]	= gtp0->spare[1] = gtp0->spare[2] = 0xff;
@@ -523,6 +619,25 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		break;
 	}
+#if GTP_IPV6
+	case ETH_P_IPV6: {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+		if (gtp->role == GTP_ROLE_SGSN)
+			pctx = ipv6_pdp_find(gtp, &ipv6h->saddr);
+		else
+			pctx = ipv6_pdp_find(gtp, &ipv6h->daddr);
+
+		if (!pctx) {
+			netdev_dbg(dev, "no PDP ctx found for %pI6, skip\n",
+				   &ipv6h->daddr);
+			err = -ENOENT;
+			goto tx_err;
+		}
+
+		break;
+	}
+#endif
 	default:
 		err = -EOPNOTSUPP;
 		goto tx_err;
@@ -692,23 +807,38 @@ static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize)
 {
 	int i;
 
-	gtp->addr_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
-	if (gtp->addr_hash == NULL)
-		return -ENOMEM;
+	gtp->addr4_hash = kmalloc_array(hsize, sizeof(*gtp->addr4_hash),
+					GFP_KERNEL);
+	if (!gtp->addr4_hash)
+		goto err;
+
+#if GTP_IPV6
+	gtp->addr6_hash = kmalloc_array(hsize, sizeof(*gtp->addr6_hash),
+					GFP_KERNEL);
+	if (!gtp->addr6_hash)
+		goto err;
+#endif
 
-	gtp->tid_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
-	if (gtp->tid_hash == NULL)
-		goto err1;
+	gtp->tid_hash = kmalloc_array(hsize, sizeof(struct hlist_head),
+				      GFP_KERNEL);
+	if (!gtp->tid_hash)
+		goto err;
 
 	gtp->hash_size = hsize;
 
 	for (i = 0; i < hsize; i++) {
-		INIT_HLIST_HEAD(&gtp->addr_hash[i]);
+		INIT_HLIST_HEAD(&gtp->addr4_hash[i]);
+#if GTP_IPV6
+		INIT_HLIST_HEAD(&gtp->addr6_hash[i]);
+#endif
 		INIT_HLIST_HEAD(&gtp->tid_hash[i]);
 	}
 	return 0;
-err1:
-	kfree(gtp->addr_hash);
+err:
+	kfree(gtp->addr4_hash);
+#if GTP_IPV6
+	kfree(gtp->addr6_hash);
+#endif
 	return -ENOMEM;
 }
 
@@ -722,7 +852,10 @@ static void gtp_hashtable_free(struct gtp_dev *gtp)
 			pdp_context_delete(pctx);
 
 	synchronize_rcu();
-	kfree(gtp->addr_hash);
+	kfree(gtp->addr4_hash);
+#if GTP_IPV6
+	kfree(gtp->addr6_hash);
+#endif
 	kfree(gtp->tid_hash);
 }
 
@@ -844,16 +977,13 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
 	return gtp;
 }
 
-static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
+static void pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 {
 	__be16 default_port = 0;
 
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
-	pctx->af = AF_INET;
 	pctx->peer_addr_ip4.s_addr =
 		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
-	pctx->ms_addr_ip4.s_addr =
-		nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -882,33 +1012,59 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		pctx->gtp_port = default_port;
 }
 
-static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
-			struct genl_info *info)
+static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk,
+		       struct genl_info *info)
 {
 	struct net_device *dev = gtp->dev;
+	struct hlist_head *addr_list;
+	struct pdp_ctx *pctx = NULL;
 	u32 hash_ms, hash_tid = 0;
-	struct pdp_ctx *pctx;
-	bool found = false;
-	__be32 ms_addr;
+#if GTP_IPV6
+	struct in6_addr ms6_addr;
+#endif
+	__be32 ms_addr = 0;
+	int ms_af;
 	int err;
 
-	ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
-	hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size;
+#if GTP_IPV6
+	/* Caller ensures we have either v4 or v6 mobile subscriber address */
+	if (info->attrs[GTPA_MS_ADDRESS]) {
+		/* IPv4 mobile subscriber */
 
-	hlist_for_each_entry_rcu(pctx, &gtp->addr_hash[hash_ms], hlist_addr) {
-		if (pctx->ms_addr_ip4.s_addr == ms_addr) {
-			found = true;
-			break;
-		}
+		ms_addr = nla_get_in_addr(info->attrs[GTPA_MS_ADDRESS]);
+		hash_ms = gtp_ipv4_hashfn(ms_addr) % gtp->hash_size;
+		addr_list = &gtp->addr4_hash[hash_ms];
+		ms_af = AF_INET;
+
+		pctx = ipv4_pdp_find(gtp, ms_addr);
+	} else {
+		/* IPv6 mobile subscriber */
+
+		ms6_addr = nla_get_in6_addr(info->attrs[GTPA_MS6_ADDRESS]);
+		hash_ms = gtp_ipv6_hashfn(&ms6_addr) % gtp->hash_size;
+		addr_list = &gtp->addr6_hash[hash_ms];
+		ms_af = AF_INET6;
+
+		pctx = ipv6_pdp_find(gtp, &ms6_addr);
 	}
+#else
+	/* IPv4 mobile subscriber */
 
-	if (found) {
+	ms_addr = nla_get_in_addr(info->attrs[GTPA_MS_ADDRESS]);
+	hash_ms = gtp_ipv4_hashfn(ms_addr) % gtp->hash_size;
+	addr_list = &gtp->addr4_hash[hash_ms];
+	ms_af = AF_INET;
+
+	pctx = ipv4_pdp_find(gtp, ms_addr);
+#endif
+
+	if (pctx) {
 		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 		if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		ipv4_pdp_fill(pctx, info);
+		pdp_fill(pctx, info);
 
 		if (pctx->gtp_version == GTP_V0)
 			netdev_dbg(dev, "GTPv0-U: update tunnel id = %llx (pdp %p)\n",
@@ -934,7 +1090,20 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 	sock_hold(sk);
 	pctx->sk = sk;
 	pctx->dev = gtp->dev;
-	ipv4_pdp_fill(pctx, info);
+	pctx->ms_af = ms_af;
+
+	switch (ms_af) {
+	case AF_INET:
+		pctx->ms_addr_ip4.s_addr = ms_addr;
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		pctx->ms_addr_ip6 = ms6_addr;
+		break;
+#endif
+	}
+
+	pdp_fill(pctx, info);
 	atomic_set(&pctx->tx_seq, 0);
 
 	switch (pctx->gtp_version) {
@@ -951,7 +1120,7 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 		break;
 	}
 
-	hlist_add_head_rcu(&pctx->hlist_addr, &gtp->addr_hash[hash_ms]);
+	hlist_add_head_rcu(&pctx->hlist_addr, addr_list);
 	hlist_add_head_rcu(&pctx->hlist_tid, &gtp->tid_hash[hash_tid]);
 
 	switch (pctx->gtp_version) {
@@ -993,11 +1162,25 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	if (!info->attrs[GTPA_VERSION] ||
-	    !info->attrs[GTPA_LINK] ||
-	    !info->attrs[GTPA_PEER_ADDRESS] ||
-	    !info->attrs[GTPA_MS_ADDRESS])
+	   !info->attrs[GTPA_LINK] ||
+	   !info->attrs[GTPA_PEER_ADDRESS])
 		return -EINVAL;
 
+#if GTP_IPV6
+	if (!(!!info->attrs[GTPA_MS_ADDRESS] ^
+	      !!info->attrs[GTPA_MS6_ADDRESS])) {
+		/* Either v4 or v6 mobile subscriber address must be set */
+
+		return -EINVAL;
+	}
+#else
+	if (!info->attrs[GTPA_MS_ADDRESS]) {
+		/* v4 mobile subscriber address must be set */
+
+		return -EINVAL;
+	}
+#endif
+
 	version = nla_get_u32(info->attrs[GTPA_VERSION]);
 
 	switch (version) {
@@ -1036,7 +1219,7 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 		goto out_unlock;
 	}
 
-	err = ipv4_pdp_add(gtp, sk, info);
+	err = gtp_pdp_add(gtp, sk, info);
 
 out_unlock:
 	rcu_read_unlock();
@@ -1056,6 +1239,13 @@ static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net,
 		__be32 ip = nla_get_be32(nla[GTPA_MS_ADDRESS]);
 
 		return ipv4_pdp_find(gtp, ip);
+#if GTP_IPV6
+	} else if (nla[GTPA_MS6_ADDRESS]) {
+		struct in6_addr ip6 =
+		    nla_get_in6_addr(nla[GTPA_MS6_ADDRESS]);
+
+		return ipv6_pdp_find(gtp, &ip6);
+#endif
 	} else if (nla[GTPA_VERSION]) {
 		u32 gtp_version = nla_get_u32(nla[GTPA_VERSION]);
 
@@ -1126,9 +1316,27 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 		goto nlmsg_failure;
 
 	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) ||
-	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr) ||
-	    nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms_addr_ip4.s_addr))
+	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr))
+		goto nla_put_failure;
+
+	switch (pctx->ms_af) {
+	case AF_INET:
+		if (nla_put_be32(skb, GTPA_MS_ADDRESS,
+				 pctx->ms_addr_ip4.s_addr))
+			goto nla_put_failure;
+
+		break;
+#if GTP_IPV6
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, GTPA_MS6_ADDRESS,
+				     &pctx->ms_addr_ip6))
+			goto nla_put_failure;
+
+		break;
+#endif
+	default:
 		goto nla_put_failure;
+	}
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -1239,6 +1447,10 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = {
 	[GTPA_TID]		= { .type = NLA_U64, },
 	[GTPA_PEER_ADDRESS]	= { .type = NLA_U32, },
 	[GTPA_MS_ADDRESS]	= { .type = NLA_U32, },
+#if GTP_IPV6
+	[GTPA_MS6_ADDRESS]	= { .len = FIELD_SIZEOF(struct ipv6hdr,
+							daddr) },
+#endif
 	[GTPA_FLOW]		= { .type = NLA_U16, },
 	[GTPA_NET_NS_FD]	= { .type = NLA_U32, },
 	[GTPA_I_TEI]		= { .type = NLA_U32, },
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index b2283a5c6d7f..ae4e632c0360 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -28,6 +28,7 @@ enum gtp_attrs {
 	GTPA_O_TEI,	/* for GTPv1 only */
 	GTPA_PAD,
 	GTPA_PORT,
+	GTPA_MS6_ADDRESS,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 09/12] gtp: Eliminate pktinfo and add port configuration
From: Tom Herbert @ 2017-09-27  4:58 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

The gtp pktinfo structure is unnecessary and needs a lot of code to
manage it. Remove it. Also, add per pdp port configuration for transmit.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c        | 177 +++++++++++++++++++++--------------------------
 include/uapi/linux/gtp.h |   1 +
 2 files changed, 80 insertions(+), 98 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index bbb08f8849d3..44844eba8df2 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -54,6 +54,7 @@ struct pdp_ctx {
 	} u;
 	u8			gtp_version;
 	u8			hlen;
+	__be16			gtp_port;
 	u16			af;
 
 	struct in_addr		ms_addr_ip4;
@@ -420,73 +421,36 @@ static inline void gtp1_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 	 */
 }
 
-struct gtp_pktinfo {
-	struct sock		*sk;
-	struct iphdr		*iph;
-	struct flowi4		fl4;
-	struct rtable		*rt;
-	struct pdp_ctx		*pctx;
-	struct net_device	*dev;
-	__be16			gtph_port;
-};
-
-static void gtp_push_header(struct sk_buff *skb, struct gtp_pktinfo *pktinfo)
+static void gtp_push_header(struct sk_buff *skb, struct pdp_ctx *pctx)
 {
-	switch (pktinfo->pctx->gtp_version) {
+	switch (pctx->gtp_version) {
 	case GTP_V0:
-		pktinfo->gtph_port = htons(GTP0_PORT);
-		gtp0_push_header(skb, pktinfo->pctx);
+		gtp0_push_header(skb, pctx);
 		break;
 	case GTP_V1:
-		pktinfo->gtph_port = htons(GTP1U_PORT);
-		gtp1_push_header(skb, pktinfo->pctx);
+		gtp1_push_header(skb, pctx);
 		break;
 	}
 }
 
-static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo,
-					struct sock *sk, struct iphdr *iph,
-					struct pdp_ctx *pctx, struct rtable *rt,
-					struct flowi4 *fl4,
-					struct net_device *dev)
-{
-	pktinfo->sk	= sk;
-	pktinfo->iph	= iph;
-	pktinfo->pctx	= pctx;
-	pktinfo->rt	= rt;
-	pktinfo->fl4	= *fl4;
-	pktinfo->dev	= dev;
-}
-
-static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
-			     struct gtp_pktinfo *pktinfo)
+static int gtp_xmit(struct sk_buff *skb, struct net_device *dev,
+		    struct pdp_ctx *pctx)
 {
-	struct gtp_dev *gtp = netdev_priv(dev);
-	struct pdp_ctx *pctx;
+	struct iphdr *inner_iph = NULL;
+	struct sock *sk = pctx->sk;
+	__be32 saddr = inet_sk(sk)->inet_saddr;
 	struct rtable *rt;
-	struct flowi4 fl4;
-	struct iphdr *iph;
-	struct sock *sk;
-	__be32 saddr;
+	int err = 0;
 
-	/* Read the IP destination address and resolve the PDP context.
-	 * Prepend PDP header with TEI/TID from PDP ctx.
-	 */
-	iph = ip_hdr(skb);
-	if (gtp->role == GTP_ROLE_SGSN)
-		pctx = ipv4_pdp_find(gtp, iph->saddr);
-	else
-		pctx = ipv4_pdp_find(gtp, iph->daddr);
+	if (skb->protocol == ETH_P_IP)
+		inner_iph = ip_hdr(skb);
 
-	if (!pctx) {
-		netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n",
-			   &iph->daddr);
-		return -ENOENT;
-	}
-	netdev_dbg(dev, "found PDP context %p\n", pctx);
+	/* Ensure there is sufficient headroom. */
+	err = skb_cow_head(skb, dev->needed_headroom);
+	if (unlikely(err))
+		goto out_err;
 
-	sk = pctx->sk;
-	saddr = inet_sk(sk)->inet_saddr;
+	skb_reset_inner_headers(skb);
 
 	/* Source address returned by route lookup is ignored since
 	 * we get the address from a socket.
@@ -494,81 +458,89 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	rt = ip_tunnel_get_route(dev, skb, sk->sk_protocol,
 				 sk->sk_bound_dev_if, RT_CONN_FLAGS(sk),
 				 pctx->peer_addr_ip4.s_addr, &saddr,
-				 pktinfo->gtph_port, pktinfo->gtph_port,
+				 pctx->gtp_port, pctx->gtp_port,
 				 &pctx->dst_cache, NULL);
 
 	if (IS_ERR(rt)) {
-		if (rt == ERR_PTR(-ELOOP)) {
-			netdev_dbg(dev, "circular route to SSGN %pI4\n",
-				   &pctx->peer_addr_ip4.s_addr);
-			dev->stats.collisions++;
-			goto err_rt;
-		} else {
-			netdev_dbg(dev, "no route to SSGN %pI4\n",
-				   &pctx->peer_addr_ip4.s_addr);
-			dev->stats.tx_carrier_errors++;
-			goto err;
-		}
+		err = PTR_ERR(rt);
+		goto out_err;
 	}
 
 	skb_dst_drop(skb);
 
-	gtp_set_pktinfo_ipv4(pktinfo, sk, iph, pctx, rt, &fl4, dev);
-	gtp_push_header(skb, pktinfo);
+	gtp_push_header(skb, pctx);
+
+	if (inner_iph)
+		__iptunnel_update_pmtu(dev, skb, &rt->dst,
+				       !!inner_iph->frag_off,
+				       inner_iph, pctx->hlen,
+				       pctx->peer_addr_ip4.s_addr);
 
-	__iptunnel_update_pmtu(dev, skb, &rt->dst, !!iph->frag_off, iph,
-			       pctx->hlen, pctx->peer_addr_ip4.s_addr);
+	udp_tunnel_xmit_skb(rt, sk, skb, saddr,
+			    pctx->peer_addr_ip4.s_addr,
+			    0, ip4_dst_hoplimit(&rt->dst), 0,
+			    pctx->gtp_port, pctx->gtp_port,
+			    false, false);
+
+	netdev_dbg(dev, "gtp -> IP src: %pI4 dst: %pI4\n",
+		   &saddr, &pctx->peer_addr_ip4.s_addr);
 
 	return 0;
-err_rt:
-	ip_rt_put(rt);
-err:
-	return -EBADMSG;
+
+out_err:
+	if (err == -ELOOP)
+		dev->stats.collisions++;
+	else
+		dev->stats.tx_carrier_errors++;
+
+	return err;
 }
 
 static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned int proto = ntohs(skb->protocol);
-	struct gtp_pktinfo pktinfo;
+	struct gtp_dev *gtp = netdev_priv(dev);
+	struct pdp_ctx *pctx;
 	int err;
 
-	/* Ensure there is sufficient headroom. */
-	if (skb_cow_head(skb, dev->needed_headroom))
-		goto tx_err;
-
-	skb_reset_inner_headers(skb);
-
 	/* PDP context lookups in gtp_build_skb_*() need rcu read-side lock. */
 	rcu_read_lock();
 	switch (proto) {
-	case ETH_P_IP:
-		err = gtp_build_skb_ip4(skb, dev, &pktinfo);
+	case ETH_P_IP: {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (gtp->role == GTP_ROLE_SGSN)
+			pctx = ipv4_pdp_find(gtp, iph->saddr);
+		else
+			pctx = ipv4_pdp_find(gtp, iph->daddr);
+
+		if (!pctx) {
+			netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n",
+				   &iph->daddr);
+			err = -ENOENT;
+			goto tx_err;
+		}
+
 		break;
+	}
 	default:
 		err = -EOPNOTSUPP;
-		break;
+		goto tx_err;
 	}
-	rcu_read_unlock();
+
+	netdev_dbg(dev, "found PDP context %p\n", pctx);
+
+	err = gtp_xmit(skb, dev, pctx);
 
 	if (err < 0)
 		goto tx_err;
 
-	switch (proto) {
-	case ETH_P_IP:
-		netdev_dbg(pktinfo.dev, "gtp -> IP src: %pI4 dst: %pI4\n",
-			   &pktinfo.iph->saddr, &pktinfo.iph->daddr);
-		udp_tunnel_xmit_skb(pktinfo.rt, pktinfo.sk, skb,
-				    pktinfo.fl4.saddr, pktinfo.fl4.daddr,
-				    pktinfo.iph->tos,
-				    ip4_dst_hoplimit(&pktinfo.rt->dst),
-				    0,
-				    pktinfo.gtph_port, pktinfo.gtph_port,
-				    true, false);
-		break;
-	}
+	rcu_read_unlock();
 
 	return NETDEV_TX_OK;
+
 tx_err:
+	rcu_read_unlock();
 	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
@@ -874,6 +846,8 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
 
 static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 {
+	__be16 default_port = 0;
+
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
 	pctx->af = AF_INET;
 	pctx->peer_addr_ip4.s_addr =
@@ -890,15 +864,22 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]);
 		pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]);
 		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp0_header);
+		default_port = htons(GTP0_PORT);
 		break;
 	case GTP_V1:
 		pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]);
 		pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]);
 		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp1_header);
+		default_port = htons(GTP1U_PORT);
 		break;
 	default:
 		break;
 	}
+
+	if (info->attrs[GTPA_PORT])
+		pctx->gtp_port = nla_get_u16(info->attrs[GTPA_PORT]);
+	else
+		pctx->gtp_port = default_port;
 }
 
 static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk,
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index 57d1edb8efd9..b2283a5c6d7f 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -27,6 +27,7 @@ enum gtp_attrs {
 	GTPA_I_TEI,	/* for GTPv1 only */
 	GTPA_O_TEI,	/* for GTPv1 only */
 	GTPA_PAD,
+	GTPA_PORT,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX + 1)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 08/12] gtp: Call function to update path mtu
From: Tom Herbert @ 2017-09-27  4:57 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Replace mtu handling with call to __iptunnel_update_pmtu.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 36 ++++++------------------------------
 1 file changed, 6 insertions(+), 30 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index a6e2e0a1f424..bbb08f8849d3 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -53,6 +53,7 @@ struct pdp_ctx {
 		} v1;
 	} u;
 	u8			gtp_version;
+	u8			hlen;
 	u16			af;
 
 	struct in_addr		ms_addr_ip4;
@@ -467,8 +468,6 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 	struct iphdr *iph;
 	struct sock *sk;
 	__be32 saddr;
-	__be16 df;
-	int mtu;
 
 	/* Read the IP destination address and resolve the PDP context.
 	 * Prepend PDP header with TEI/TID from PDP ctx.
@@ -514,37 +513,12 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 
 	skb_dst_drop(skb);
 
-	/* This is similar to tnl_update_pmtu(). */
-	df = iph->frag_off;
-	if (df) {
-		mtu = dst_mtu(&rt->dst) - dev->hard_header_len -
-			sizeof(struct iphdr) - sizeof(struct udphdr);
-		switch (pctx->gtp_version) {
-		case GTP_V0:
-			mtu -= sizeof(struct gtp0_header);
-			break;
-		case GTP_V1:
-			mtu -= sizeof(struct gtp1_header);
-			break;
-		}
-	} else {
-		mtu = dst_mtu(&rt->dst);
-	}
-
-	rt->dst.ops->update_pmtu(&rt->dst, NULL, skb, mtu);
-
-	if (!skb_is_gso(skb) && (iph->frag_off & htons(IP_DF)) &&
-	    mtu < ntohs(iph->tot_len)) {
-		netdev_dbg(dev, "packet too big, fragmentation needed\n");
-		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(mtu));
-		goto err_rt;
-	}
-
 	gtp_set_pktinfo_ipv4(pktinfo, sk, iph, pctx, rt, &fl4, dev);
 	gtp_push_header(skb, pktinfo);
 
+	__iptunnel_update_pmtu(dev, skb, &rt->dst, !!iph->frag_off, iph,
+			       pctx->hlen, pctx->peer_addr_ip4.s_addr);
+
 	return 0;
 err_rt:
 	ip_rt_put(rt);
@@ -915,10 +889,12 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 		 */
 		pctx->u.v0.tid = nla_get_u64(info->attrs[GTPA_TID]);
 		pctx->u.v0.flow = nla_get_u16(info->attrs[GTPA_FLOW]);
+		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp0_header);
 		break;
 	case GTP_V1:
 		pctx->u.v1.i_tei = nla_get_u32(info->attrs[GTPA_I_TEI]);
 		pctx->u.v1.o_tei = nla_get_u32(info->attrs[GTPA_O_TEI]);
+		pctx->hlen = sizeof(struct udphdr) + sizeof(struct gtp1_header);
 		break;
 	default:
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 07/12] gtp: udp recv clean up
From: Tom Herbert @ 2017-09-27  4:57 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Create separate UDP receive functions for GTP version 0 and version 1.
Set encap_rcv appropriately when configuring a socket.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 100 ++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 51 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 00e5ea5cb935..a6e2e0a1f424 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -225,14 +225,20 @@ static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
 	return 0;
 }
 
-/* 1 means pass up to the stack, -1 means drop and 0 means decapsulated. */
-static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
+/* UDP encapsulation receive handler for GTPv0-U . See net/ipv4/udp.c.
+ * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
+ */
+static int gtp0_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct gtp_dev *gtp = rcu_dereference_sk_user_data(sk);
 	unsigned int hdrlen = sizeof(struct udphdr) +
 			      sizeof(struct gtp0_header);
 	struct gtp0_header *gtp0;
 	struct pdp_ctx *pctx;
 
+	if (!gtp)
+		goto pass;
+
 	if (!pskb_may_pull(skb, hdrlen))
 		goto drop;
 
@@ -244,26 +250,41 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	if (gtp0->type != GTP_TPDU)
 		goto pass;
 
+	netdev_dbg(gtp->dev, "received GTP0 packet\n");
+
 	pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
 		goto pass;
 	}
 
-	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+	if (!gtp_rx(pctx, skb, hdrlen, gtp->role)) {
+		/* Successfully received */
+		return 0;
+	}
+
 drop:
-	return -1;
+	kfree_skb(skb);
+	return 0;
+
 pass:
 	return 1;
 }
 
-static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
+/* UDP encapsulation receive handler for GTPv0-U . See net/ipv4/udp.c.
+ * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
+ */
+static int gtp1u_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct gtp_dev *gtp = rcu_dereference_sk_user_data(sk);
 	unsigned int hdrlen = sizeof(struct udphdr) +
 			      sizeof(struct gtp1_header);
 	struct gtp1_header *gtp1;
 	struct pdp_ctx *pctx;
 
+	if (!gtp)
+		goto pass;
+
 	if (!pskb_may_pull(skb, hdrlen))
 		goto drop;
 
@@ -275,6 +296,8 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	if (gtp1->type != GTP_TPDU)
 		goto pass;
 
+	netdev_dbg(gtp->dev, "received GTP1 packet\n");
+
 	/* From 29.060: "This field shall be present if and only if any one or
 	 * more of the S, PN and E flags are set.".
 	 *
@@ -296,9 +319,15 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 		goto drop;
 	}
 
-	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+	if (!gtp_rx(pctx, skb, hdrlen, gtp->role)) {
+		/* Successfully received */
+		return 0;
+	}
+
 drop:
-	return -1;
+	kfree_skb(skb);
+	return 0;
+
 pass:
 	return 1;
 }
@@ -329,49 +358,6 @@ static void gtp_encap_disable(struct gtp_dev *gtp)
 	gtp_encap_disable_sock(gtp->sk1u);
 }
 
-/* UDP encapsulation receive handler. See net/ipv4/udp.c.
- * Return codes: 0: success, <0: error, >0: pass up to userspace UDP socket.
- */
-static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb)
-{
-	struct gtp_dev *gtp;
-	int ret = 0;
-
-	gtp = rcu_dereference_sk_user_data(sk);
-	if (!gtp)
-		return 1;
-
-	netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk);
-
-	switch (udp_sk(sk)->encap_type) {
-	case UDP_ENCAP_GTP0:
-		netdev_dbg(gtp->dev, "received GTP0 packet\n");
-		ret = gtp0_udp_encap_recv(gtp, skb);
-		break;
-	case UDP_ENCAP_GTP1U:
-		netdev_dbg(gtp->dev, "received GTP1U packet\n");
-		ret = gtp1u_udp_encap_recv(gtp, skb);
-		break;
-	default:
-		ret = -1; /* Shouldn't happen. */
-	}
-
-	switch (ret) {
-	case 1:
-		netdev_dbg(gtp->dev, "pass up to the process\n");
-		break;
-	case 0:
-		break;
-	case -1:
-		netdev_dbg(gtp->dev, "GTP packet has been dropped\n");
-		kfree_skb(skb);
-		ret = 0;
-		break;
-	}
-
-	return ret;
-}
-
 static int gtp_dev_init(struct net_device *dev)
 {
 	struct gtp_dev *gtp = netdev_priv(dev);
@@ -824,9 +810,21 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 	sk = sock->sk;
 	sock_hold(sk);
 
+	switch (type) {
+	case UDP_ENCAP_GTP0:
+		tuncfg.encap_rcv = gtp0_udp_encap_recv;
+		break;
+	case UDP_ENCAP_GTP1U:
+		tuncfg.encap_rcv = gtp1u_udp_encap_recv;
+		break;
+	default:
+		pr_debug("Unknown encap type %u\n", type);
+		sk = ERR_PTR(-EINVAL);
+		goto out_sock;
+	}
+
 	tuncfg.sk_user_data = gtp;
 	tuncfg.encap_type = type;
-	tuncfg.encap_rcv = gtp_encap_recv;
 	tuncfg.encap_destroy = gtp_encap_destroy;
 
 	setup_udp_tunnel_sock(sock_net(sock->sk), sock, &tuncfg);
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 06/12] gtp: Use goto for exceptions in gtp_udp_encap_recv funcs
From: Tom Herbert @ 2017-09-27  4:57 UTC (permalink / raw)
  To: davem; +Cc: pablo, laforge, aschultz, netdev, rohit, Tom Herbert
In-Reply-To: <20170927045803.2477-1-tom@quantonium.net>

Consolidate return logic to make it easier to extend.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 drivers/net/gtp.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index f2aac5d01143..00e5ea5cb935 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -234,23 +234,27 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	struct pdp_ctx *pctx;
 
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp0 = (struct gtp0_header *)(skb->data + sizeof(struct udphdr));
 
 	if ((gtp0->flags >> 5) != GTP_V0)
-		return 1;
+		goto pass;
 
 	if (gtp0->type != GTP_TPDU)
-		return 1;
+		goto pass;
 
 	pctx = gtp0_pdp_find(gtp, be64_to_cpu(gtp0->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
-		return 1;
+		goto pass;
 	}
 
 	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+drop:
+	return -1;
+pass:
+	return 1;
 }
 
 static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
@@ -261,15 +265,15 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 	struct pdp_ctx *pctx;
 
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
 
 	if ((gtp1->flags >> 5) != GTP_V1)
-		return 1;
+		goto pass;
 
 	if (gtp1->type != GTP_TPDU)
-		return 1;
+		goto pass;
 
 	/* From 29.060: "This field shall be present if and only if any one or
 	 * more of the S, PN and E flags are set.".
@@ -282,17 +286,21 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 
 	/* Make sure the header is larger enough, including extensions. */
 	if (!pskb_may_pull(skb, hdrlen))
-		return -1;
+		goto drop;
 
 	gtp1 = (struct gtp1_header *)(skb->data + sizeof(struct udphdr));
 
 	pctx = gtp1_pdp_find(gtp, ntohl(gtp1->tid));
 	if (!pctx) {
 		netdev_dbg(gtp->dev, "No PDP ctx to decap skb=%p\n", skb);
-		return 1;
+		goto drop;
 	}
 
 	return gtp_rx(pctx, skb, hdrlen, gtp->role);
+drop:
+	return -1;
+pass:
+	return 1;
 }
 
 static void gtp_encap_destroy(struct sock *sk)
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox