Netdev List
 help / color / mirror / Atom feed
* [PATCH mlx5-next v2 11/20] IB/mlx5: Introduce DEVX
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Yishai Hadas <yishaih@mellanox.com>

Introduce DEVX to enable direct device commands in downstream patches
from this series.

In that mode of work the firmware manages the isolation between
processes' resources and as such a DEVX user id is created and assigned
to the given user context upon allocation request.

A capability check is done to make sure that this feature is really
supported by the firmware prior to creating the DEVX user id.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/Makefile           |  1 +
 drivers/infiniband/hw/mlx5/devx.c             | 58 +++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx5/main.c             | 24 +++++++++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h          | 13 ++++++
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c |  4 ++
 include/linux/mlx5/device.h                   |  3 ++
 include/linux/mlx5/mlx5_ifc.h                 | 15 ++++++-
 include/uapi/rdma/mlx5-abi.h                  |  3 ++
 8 files changed, 117 insertions(+), 4 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx5/devx.c

diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index d42b922bede8..577e4c418bae 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
 mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
+mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
new file mode 100644
index 000000000000..775448910ad1
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/ib_umem.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_ib.h"
+
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
+{
+	u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
+	u64 general_obj_types;
+	void *uctx;
+	void *hdr;
+	int err;
+
+	uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx);
+	hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr);
+
+	general_obj_types = MLX5_CAP_GEN_64(dev->mdev, general_obj_types);
+	if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) ||
+	    !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM))
+		return -EINVAL;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX);
+
+	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+	return 0;
+}
+
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
+			  struct mlx5_ib_ucontext *context)
+{
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid);
+
+	mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e52dd21519b4..430a9e36d392 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1676,8 +1676,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (err)
 		return ERR_PTR(err);
 
-	if (req.flags)
-		return ERR_PTR(-EINVAL);
+	if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
+		return ERR_PTR(-EOPNOTSUPP);
 
 	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
 		return ERR_PTR(-EOPNOTSUPP);
@@ -1761,6 +1761,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 			goto out_uars;
 	}
 
+	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
+		/* Block DEVX on Infiniband as of SELinux */
+		if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) {
+			err = -EPERM;
+			goto out_td;
+		}
+
+		err = mlx5_ib_devx_create(dev, context);
+		if (err)
+			goto out_td;
+	}
+
 	INIT_LIST_HEAD(&context->vma_private_list);
 	mutex_init(&context->vma_private_list_mutex);
 	INIT_LIST_HEAD(&context->db_page_list);
@@ -1821,7 +1833,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 
 	err = ib_copy_to_udata(udata, &resp, resp.response_length);
 	if (err)
-		goto out_td;
+		goto out_mdev;
 
 	bfregi->ver = ver;
 	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
@@ -1831,6 +1843,9 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 
 	return &context->ibucontext;
 
+out_mdev:
+	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
+		mlx5_ib_devx_destroy(dev, context);
 out_td:
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		mlx5_ib_dealloc_transport_domain(dev, context->tdn);
@@ -1856,6 +1871,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
 	struct mlx5_bfreg_info *bfregi;
 
+	if (context->devx_uid)
+		mlx5_ib_devx_destroy(dev, context);
+
 	bfregi = &context->bfregi;
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 		mlx5_ib_dealloc_transport_domain(dev, context->tdn);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d89c8fe626f6..5e8abd8bcd71 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -143,6 +143,7 @@ struct mlx5_ib_ucontext {
 
 	u64			lib_caps;
 	DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
+	u16			devx_uid;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -1217,6 +1218,18 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev,
 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
 				  u8 port_num);
 
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
+			struct mlx5_ib_ucontext *context);
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
+			  struct mlx5_ib_ucontext *context);
+#else
+static inline int
+mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
+		    struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; };
+static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev,
+					struct mlx5_ib_ucontext *context) {}
+#endif
 static inline void init_query_mad(struct ib_smp *mad)
 {
 	mad->base_version  = 1;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b99d6df3905b..d07f24de8fa3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -310,6 +310,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER:
 	case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT:
 	case MLX5_CMD_OP_FPGA_DESTROY_QP:
+	case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -427,6 +428,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_FPGA_MODIFY_QP:
 	case MLX5_CMD_OP_FPGA_QUERY_QP:
 	case MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS:
+	case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -599,6 +601,8 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP);
 	MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP_COUNTERS);
 	MLX5_COMMAND_STR_CASE(FPGA_DESTROY_QP);
+	MLX5_COMMAND_STR_CASE(CREATE_GENERAL_OBJECT);
+	MLX5_COMMAND_STR_CASE(DESTROY_GENERAL_OBJECT);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 02f72ebf31a7..f8671c0a43aa 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1071,6 +1071,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_GEN(mdev, cap) \
 	MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
 
+#define MLX5_CAP_GEN_64(mdev, cap) \
+	MLX5_GET64(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
+
 #define MLX5_CAP_GEN_MAX(mdev, cap) \
 	MLX5_GET(cmd_hca_cap, mdev->caps.hca_max[MLX5_CAP_GENERAL], cap)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 78d4993a2920..f810772e80c0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -75,6 +75,15 @@ enum {
 	MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
 };
 
+enum {
+	MLX5_GENERAL_OBJ_TYPES_CAP_UCTX = (1ULL << 4),
+	MLX5_GENERAL_OBJ_TYPES_CAP_UMEM = (1ULL << 5),
+};
+
+enum {
+	MLX5_OBJ_TYPE_UCTX = 0x0004,
+};
+
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP                 = 0x100,
 	MLX5_CMD_OP_QUERY_ADAPTER                 = 0x101,
@@ -242,6 +251,8 @@ enum {
 	MLX5_CMD_OP_FPGA_QUERY_QP                 = 0x962,
 	MLX5_CMD_OP_FPGA_DESTROY_QP               = 0x963,
 	MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS        = 0x964,
+	MLX5_CMD_OP_CREATE_GENERAL_OBJECT         = 0xa00,
+	MLX5_CMD_OP_DESTROY_GENERAL_OBJECT        = 0xa03,
 	MLX5_CMD_OP_MAX
 };
 
@@ -1113,7 +1124,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_3f8[0x3];
 	u8         log_max_current_uc_list[0x5];
 
-	u8         reserved_at_400[0x80];
+	u8         general_obj_types[0x40];
+
+	u8         reserved_at_440[0x40];
 
 	u8         reserved_at_480[0x3];
 	u8         log_max_l2_table[0x5];
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 8daec1fa49cf..5d591ff28139 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -76,6 +76,9 @@ enum mlx5_lib_caps {
 	MLX5_LIB_CAP_4K_UAR	= (__u64)1 << 0,
 };
 
+enum mlx5_ib_alloc_uctx_v2_flags {
+	MLX5_IB_ALLOC_UCTX_DEVX	= 1 << 0,
+};
 struct mlx5_ib_alloc_ucontext_req_v2 {
 	__u32	total_num_bfregs;
 	__u32	num_low_latency_bfregs;
-- 
2.14.4

^ permalink raw reply related

* [PATCH mlx5-next v2 10/20] net/mlx5: Expose DEVX ifc structures
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Yishai Hadas <yishaih@mellanox.com>

This patch updates the mlx5_ifc structures with the following:
- Expose general command header in/out format.
- Expose user context format.
- Expose umem format.

Downstream patches from this series will use this stuff.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 52 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 27134c4fcb76..78d4993a2920 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9115,4 +9115,56 @@ struct mlx5_ifc_dealloc_memic_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_general_obj_in_cmd_hdr_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         obj_type[0x10];
+
+	u8         obj_id[0x20];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         obj_id[0x20];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_umem_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x5b];
+	u8         log_page_size[0x5];
+
+	u8         page_offset[0x20];
+
+	u8         num_of_mtt[0x40];
+
+	struct mlx5_ifc_mtt_bits  mtt[0];
+};
+
+struct mlx5_ifc_uctx_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x1c0];
+};
+
+struct mlx5_ifc_create_umem_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits   hdr;
+	struct mlx5_ifc_umem_bits                     umem;
+};
+
+struct mlx5_ifc_create_uctx_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits   hdr;
+	struct mlx5_ifc_uctx_bits                     uctx;
+};
+
 #endif /* MLX5_IFC_H */
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 09/20] IB/core: Improve uverbs_cleanup_ucontext algorithm
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Yishai Hadas <yishaih@mellanox.com>

Improve uverbs_cleanup_ucontext algorithm to work properly even when
there are two objects from the same type and one points to the other.
For that case to work the 'destroy_order' is not used any more as it's
static per type and can't support this use case.

Instead, the new algorithm iterates over the objects in a LIFO mode as
was before, at the end of each loop if were left objects that couldn't
be destroyed it re-iterates over them give another chance to destroy them
successfully.

If was one iteration that didn't cleanup any object the last iteration
will force the cleanup as was done before this change, this is coming to
prevent memory leaks even in that fatal case.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c                | 96 ++++++++++++----------
 drivers/infiniband/core/uverbs_std_types.c         | 22 +++--
 .../infiniband/core/uverbs_std_types_counters.c    |  2 +-
 drivers/infiniband/core/uverbs_std_types_cq.c      |  2 +-
 drivers/infiniband/core/uverbs_std_types_dm.c      |  3 +-
 .../infiniband/core/uverbs_std_types_flow_action.c |  2 +-
 drivers/infiniband/core/uverbs_std_types_mr.c      |  3 +-
 include/rdma/ib_verbs.h                            |  2 +-
 include/rdma/uverbs_types.h                        | 11 +--
 9 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index df3c40533252..f4554de59659 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -645,61 +645,69 @@ void uverbs_close_fd(struct file *f)
 	kref_put(uverbs_file_ref, ib_uverbs_release_file);
 }
 
-void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
+static int __uverbs_cleanup_ucontext(struct ib_ucontext *ucontext,
+				    enum rdma_remove_reason reason)
 {
-	enum rdma_remove_reason reason = device_removed ?
-		RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE;
-	unsigned int cur_order = 0;
+	struct ib_uobject *obj, *next_obj;
+	int ret = -EINVAL;
+	int err = 0;
 
+	/*
+	 * This shouldn't run while executing other commands on this
+	 * context. Thus, the only thing we should take care of is
+	 * releasing a FD while traversing this list. The FD could be
+	 * closed and released from the _release fop of this FD.
+	 * In order to mitigate this, we add a lock.
+	 * We take and release the lock per traversal in order to let
+	 * other threads (which might still use the FDs) chance to run.
+	 */
+	mutex_lock(&ucontext->uobjects_lock);
 	ucontext->cleanup_reason = reason;
+	list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects, list) {
+		/*
+		 * if we hit this WARN_ON, that means we are
+		 * racing with a lookup_get.
+		 */
+		WARN_ON(uverbs_try_lock_object(obj, true));
+		err = obj->type->type_class->remove_commit(obj, reason);
+		if (err) {
+			if (reason == RDMA_REMOVE_DESTROY) {
+				pr_debug("ib_uverbs: failed to remove uobject id %d err %d\n",
+					 obj->id, err);
+				atomic_set(&obj->usecnt, 0);
+				continue;
+			}
+
+			pr_warn("ib_uverbs: unable to remove uobject id %d err %d\n",
+				obj->id, err);
+		}
+
+		list_del(&obj->list);
+		/* put the ref we took when we created the object */
+		uverbs_uobject_put(obj);
+		ret = 0;
+	}
+	mutex_unlock(&ucontext->uobjects_lock);
+	return ret;
+}
+
+void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed)
+{
 	/*
 	 * Waits for all remove_commit and alloc_commit to finish. Logically, We
 	 * want to hold this forever as the context is going to be destroyed,
 	 * but we'll release it since it causes a "held lock freed" BUG message.
 	 */
 	down_write(&ucontext->cleanup_rwsem);
+	while (!list_empty(&ucontext->uobjects))
+		if (__uverbs_cleanup_ucontext(ucontext, RDMA_REMOVE_DESTROY))
+			/* No entry was cleaned-up successfully during this iteration */
+			break;
 
-	while (!list_empty(&ucontext->uobjects)) {
-		struct ib_uobject *obj, *next_obj;
-		unsigned int next_order = UINT_MAX;
+	if (!list_empty(&ucontext->uobjects))
+		__uverbs_cleanup_ucontext(ucontext, device_removed ?
+			RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE);
 
-		/*
-		 * This shouldn't run while executing other commands on this
-		 * context. Thus, the only thing we should take care of is
-		 * releasing a FD while traversing this list. The FD could be
-		 * closed and released from the _release fop of this FD.
-		 * In order to mitigate this, we add a lock.
-		 * We take and release the lock per order traversal in order
-		 * to let other threads (which might still use the FDs) chance
-		 * to run.
-		 */
-		mutex_lock(&ucontext->uobjects_lock);
-		list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects,
-					 list) {
-			if (obj->type->destroy_order == cur_order) {
-				int ret;
-
-				/*
-				 * if we hit this WARN_ON, that means we are
-				 * racing with a lookup_get.
-				 */
-				WARN_ON(uverbs_try_lock_object(obj, true));
-				ret = obj->type->type_class->remove_commit(obj,
-									   reason);
-				list_del(&obj->list);
-				if (ret)
-					pr_warn("ib_uverbs: failed to remove uobject id %d order %u\n",
-						obj->id, cur_order);
-				/* put the ref we took when we created the object */
-				uverbs_uobject_put(obj);
-			} else {
-				next_order = min(next_order,
-						 obj->type->destroy_order);
-			}
-		}
-		mutex_unlock(&ucontext->uobjects_lock);
-		cur_order = next_order;
-	}
 	up_write(&ucontext->cleanup_rwsem);
 }
 
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index 0df0ac9c1de3..0901b7877408 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -246,44 +246,42 @@ void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata)
 }
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL,
-			    &UVERBS_TYPE_ALLOC_FD(0,
-						  sizeof(struct ib_uverbs_completion_event_file),
+			    &UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file),
 						  uverbs_hot_unplug_completion_event_file,
 						  &uverbs_event_fops,
 						  "[infinibandevent]", O_RDONLY));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP,
-			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0,
+			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object),
 						      uverbs_free_qp));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
-			    &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw));
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_SRQ,
-			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0,
+			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
 						      uverbs_free_srq));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
-			    &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah));
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW,
 			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
-						      0, uverbs_free_flow));
+						      uverbs_free_flow));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_WQ,
-			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0,
+			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object),
 						      uverbs_free_wq));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
-			    &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl));
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_XRCD,
-			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0,
+			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
 						      uverbs_free_xrcd));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
-			    /* 2 is used in order to free the PD after MRs */
-			    &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd));
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DEVICE, NULL);
 
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
index 03b182a684a6..e2d82b5ba732 100644
--- a/drivers/infiniband/core/uverbs_std_types_counters.c
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -150,7 +150,7 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_READ,
 			    UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS,
-			    &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_counters),
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_counters),
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
 			    &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ));
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 3d293d01afea..278528f56f1f 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -201,7 +201,7 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY,
 			     UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ,
-			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0,
+			    &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object),
 						      uverbs_free_cq),
 #if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI)
 			    &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
index 8b681575b615..8fd976d30895 100644
--- a/drivers/infiniband/core/uverbs_std_types_dm.c
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -102,7 +102,6 @@ static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_DM_FREE,
 			 UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM,
-			    /* 1 is used in order to free the DM after MRs */
-			    &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_dm),
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm),
 			    &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC),
 			    &UVERBS_METHOD(UVERBS_METHOD_DM_FREE));
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
index a7be51cf2e42..0cc15f8ee455 100644
--- a/drivers/infiniband/core/uverbs_std_types_flow_action.c
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -428,7 +428,7 @@ static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTRO
 			 UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW_ACTION,
-			    &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow_action),
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action),
 			    &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE),
 			    &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY),
 			    &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY));
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 68f7cadf088f..d7f7ba3802af 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -142,6 +142,5 @@ static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_MR_REG,
 			     UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)));
 
 DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR,
-			    /* 1 is used in order to free the MR after all the MWs */
-			    &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr),
+			    &UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
 			    &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG));
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4c6241bc2039..152b5907c864 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1473,7 +1473,7 @@ struct ib_fmr_attr {
 struct ib_umem;
 
 enum rdma_remove_reason {
-	/* Userspace requested uobject deletion. Call could fail */
+	/* Userspace requested uobject deletion or initial try to remove uobject via cleanup. Call could fail */
 	RDMA_REMOVE_DESTROY,
 	/* Context deletion. This call should delete the actual object itself */
 	RDMA_REMOVE_CLOSE,
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
index cc04ec65588d..175495d1b0b8 100644
--- a/include/rdma/uverbs_types.h
+++ b/include/rdma/uverbs_types.h
@@ -93,7 +93,6 @@ struct uverbs_obj_type_class {
 struct uverbs_obj_type {
 	const struct uverbs_obj_type_class * const type_class;
 	size_t	     obj_size;
-	unsigned int destroy_order;
 };
 
 /*
@@ -152,10 +151,9 @@ extern const struct uverbs_obj_type_class uverbs_fd_class;
 
 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) -	\
 				   sizeof(char))
-#define UVERBS_TYPE_ALLOC_FD(_order, _obj_size, _context_closed, _fops, _name, _flags)\
+#define UVERBS_TYPE_ALLOC_FD(_obj_size, _context_closed, _fops, _name, _flags)\
 	((&((const struct uverbs_obj_fd_type)				\
 	 {.type = {							\
-		.destroy_order = _order,				\
 		.type_class = &uverbs_fd_class,				\
 		.obj_size = (_obj_size) +				\
 			UVERBS_BUILD_BUG_ON((_obj_size) < sizeof(struct ib_uobject_file)), \
@@ -164,18 +162,17 @@ extern const struct uverbs_obj_type_class uverbs_fd_class;
 	 .fops = _fops,							\
 	 .name = _name,							\
 	 .flags = _flags}))->type)
-#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order, _destroy_object)	\
+#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _destroy_object)	\
 	((&((const struct uverbs_obj_idr_type)				\
 	 {.type = {							\
-		.destroy_order = _order,				\
 		.type_class = &uverbs_idr_class,			\
 		.obj_size = (_size) +					\
 			UVERBS_BUILD_BUG_ON((_size) <			\
 					    sizeof(struct ib_uobject))	\
 	 },								\
 	 .destroy_object = _destroy_object,}))->type)
-#define UVERBS_TYPE_ALLOC_IDR(_order, _destroy_object)			\
-	 UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order,	\
+#define UVERBS_TYPE_ALLOC_IDR(_destroy_object)			\
+	 UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject),	\
 				  _destroy_object)
 
 #endif
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 08/20] IB/uverbs: Allow an empty namespace in ioctl() framework
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

The ioctl parser framework wrongly assumed that each namespace is
populated. This could lead to NULL dereferences. Fix the parser to
always check that a given namespace indeed exists.

Fixes: fac9658cabb9 ("IB/core: Add new ioctl interface")
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/uverbs_ioctl.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index ee15c9ca788b..fb12e8ef7147 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -201,6 +201,9 @@ static int uverbs_finalize_attrs(struct uverbs_attr_bundle *attrs_bundle,
 			spec_hash[i];
 		unsigned int j;
 
+		if (!curr_spec_bucket)
+			continue;
+
 		for (j = 0; j < curr_bundle->num_attrs; j++) {
 			struct uverbs_attr *attr;
 			const struct uverbs_attr_spec *spec;
@@ -248,7 +251,7 @@ static int uverbs_uattrs_process(struct ib_device *ibdev,
 		struct uverbs_attr_spec_hash *attr_spec_bucket;
 
 		ret = uverbs_ns_idx(&attr_id, method->num_buckets);
-		if (ret < 0) {
+		if (ret < 0 || !method->attr_buckets[ret]) {
 			if (uattr->flags & UVERBS_ATTR_F_MANDATORY) {
 				uverbs_finalize_attrs(attr_bundle,
 						      method->attr_buckets,
@@ -291,6 +294,9 @@ static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *met
 		struct uverbs_attr_spec_hash *attr_spec_bucket =
 			method_spec->attr_buckets[i];
 
+		if (!attr_spec_bucket)
+			continue;
+
 		if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask,
 				   attr_bundle->hash[i].valid_bitmap,
 				   attr_spec_bucket->num_attrs))
@@ -404,7 +410,12 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev,
 	 * filled at a later stage (uverbs_process_attr)
 	 */
 	for (i = 0; i < method_spec->num_buckets; i++) {
-		unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs;
+		unsigned int curr_num_attrs;
+
+		if (!method_spec->attr_buckets[i])
+			continue;
+
+		curr_num_attrs = method_spec->attr_buckets[i]->num_attrs;
 
 		ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr;
 		curr_attr += curr_num_attrs;
-- 
2.14.4

^ permalink raw reply related

* [PATCH mlx5-next v2 01/20] net/mlx5_core: Prevent warns in dmesg upon firmware commands
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Yishai Hadas <yishaih@mellanox.com>

When DEVX is used application builds by itself the command mail box,
this patch prevents warns upon firmware commands as of invalid user
space usage.

In addition,
A failure in destroy_mkey command was changed to be printed only under
debug mode.

This prevents a redundant warn when a memory window was used
with rereg_mr and finally was some kernel cleanup as of reset
flow/process termination.

In that case this command might temporarily fails as part of the cleanup
but finally it expects to succeed.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 487388aed98f..b99d6df3905b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -677,7 +677,7 @@ struct mlx5_ifc_mbox_out_bits {
 
 struct mlx5_ifc_mbox_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
@@ -697,6 +697,7 @@ static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out)
 	u8  status;
 	u16 opcode;
 	u16 op_mod;
+	u16 uid;
 
 	mlx5_cmd_mbox_status(out, &status, &syndrome);
 	if (!status)
@@ -704,8 +705,18 @@ static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out)
 
 	opcode = MLX5_GET(mbox_in, in, opcode);
 	op_mod = MLX5_GET(mbox_in, in, op_mod);
+	uid    = MLX5_GET(mbox_in, in, uid);
 
-	mlx5_core_err(dev,
+	if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY)
+		mlx5_core_err(dev,
+		      "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
+		      mlx5_command_str(opcode),
+		      opcode, op_mod,
+		      cmd_status_str(status),
+		      status,
+		      syndrome);
+	else
+		mlx5_core_dbg(dev,
 		      "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
 		      mlx5_command_str(opcode),
 		      opcode, op_mod,
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 06/20] IB/uverbs: Add PTR_IN attributes that are allocated/copied automatically
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

Adding UVERBS_ATTR_SPEC_F_ALLOC_AND_COPY flag to PTR_IN attributes.
By using this flag, the parse automatically allocates and copies the
user-space data. This data is accessible by using uverbs_attr_get_len
and uverbs_attr_get_alloced_ptr inline accessor functions from the
handler.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/uverbs_ioctl.c | 25 ++++++++++++++++++++++++-
 include/rdma/uverbs_ioctl.h            | 25 +++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 8cc3e8dad9b5..ee15c9ca788b 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -114,7 +114,26 @@ static int uverbs_process_attr(struct ib_device *ibdev,
 		    uattr->attr_data.reserved)
 			return -EINVAL;
 
-		e->ptr_attr.data = uattr->data;
+		if (val_spec->flags & UVERBS_ATTR_SPEC_F_ALLOC_AND_COPY &&
+		    uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) {
+			int ret;
+			void *p;
+
+			p = kvmalloc(uattr->len, GFP_KERNEL);
+			if (!p)
+				return -ENOMEM;
+
+			e->ptr_attr.data = ptr_to_u64(p);
+
+			ret = copy_from_user(p, u64_to_user_ptr(uattr->data),
+					     uattr->len);
+			if (ret) {
+				kvfree(p);
+				return -EFAULT;
+			}
+		} else {
+			e->ptr_attr.data = uattr->data;
+		}
 		e->ptr_attr.len = uattr->len;
 		e->ptr_attr.flags = uattr->flags;
 		break;
@@ -201,6 +220,10 @@ static int uverbs_finalize_attrs(struct uverbs_attr_bundle *attrs_bundle,
 								     commit);
 				if (!ret)
 					ret = current_ret;
+			} else if (spec->type == UVERBS_ATTR_TYPE_PTR_IN &&
+				   spec->flags & UVERBS_ATTR_SPEC_F_ALLOC_AND_COPY &&
+				   !uverbs_attr_ptr_is_inline(attr)) {
+				kvfree(u64_to_ptr(void, attr->ptr_attr.data));
 			}
 		}
 	}
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index bd6bba3a6e04..0e6f782727bd 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -65,6 +65,8 @@ enum {
 	UVERBS_ATTR_SPEC_F_MANDATORY	= 1U << 0,
 	/* Support extending attributes by length, validate all unknown size == zero  */
 	UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO = 1U << 1,
+	/* Valid only for PTR_IN. Allocate and copy the data inside the parser */
+	UVERBS_ATTR_SPEC_F_ALLOC_AND_COPY = 1U << 2,
 };
 
 /* Specification of a single attribute inside the ioctl message */
@@ -431,6 +433,17 @@ static inline struct ib_uobject *uverbs_attr_get_uobject(const struct uverbs_att
 	return attr->obj_attr.uobject;
 }
 
+static inline int uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle,
+				      u16 idx)
+{
+	const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+	if (IS_ERR(attr))
+		return PTR_ERR(attr);
+
+	return attr->ptr_attr.len;
+}
+
 static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle,
 				 size_t idx, const void *from, size_t size)
 {
@@ -457,6 +470,18 @@ static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr)
 	return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data);
 }
 
+static inline void *uverbs_attr_get_alloced_ptr(const struct uverbs_attr_bundle *attrs_bundle,
+						u16 idx)
+{
+	const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+	if (IS_ERR(attr))
+		return (void *)attr;
+
+	return uverbs_attr_ptr_is_inline(attr) ? u64_to_ptr(void *, attr->ptr_attr.data) :
+		u64_to_ptr(void, attr->ptr_attr.data);
+}
+
 static inline int _uverbs_copy_from(void *to,
 				    const struct uverbs_attr_bundle *attrs_bundle,
 				    size_t idx,
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 05/20] IB/uverbs: Refactor uverbs_finalize_objects
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

uverbs_finalize_objects is currently used only to commit or abort
objects. Since we want to add automatic allocation/free of PTR_IN
attributes, moving it to uverbs_ioctl.c and renamit it to
uverbs_finalize_attrs.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c    | 40 ---------------------
 drivers/infiniband/core/rdma_core.h    | 10 ++----
 drivers/infiniband/core/uverbs_ioctl.c | 64 +++++++++++++++++++++++++++-------
 3 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 8035a0a7564c..df3c40533252 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -779,43 +779,3 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
 
 	return ret;
 }
-
-int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle,
-			    struct uverbs_attr_spec_hash * const *spec_hash,
-			    size_t num,
-			    bool commit)
-{
-	unsigned int i;
-	int ret = 0;
-
-	for (i = 0; i < num; i++) {
-		struct uverbs_attr_bundle_hash *curr_bundle =
-			&attrs_bundle->hash[i];
-		const struct uverbs_attr_spec_hash *curr_spec_bucket =
-			spec_hash[i];
-		unsigned int j;
-
-		for (j = 0; j < curr_bundle->num_attrs; j++) {
-			struct uverbs_attr *attr;
-			const struct uverbs_attr_spec *spec;
-
-			if (!uverbs_attr_is_valid_in_hash(curr_bundle, j))
-				continue;
-
-			attr = &curr_bundle->attrs[j];
-			spec = &curr_spec_bucket->attrs[j];
-
-			if (spec->type == UVERBS_ATTR_TYPE_IDR ||
-			    spec->type == UVERBS_ATTR_TYPE_FD) {
-				int current_ret;
-
-				current_ret = uverbs_finalize_object(attr->obj_attr.uobject,
-								     spec->obj.access,
-								     commit);
-				if (!ret)
-					ret = current_ret;
-			}
-		}
-	}
-	return ret;
-}
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 1efcf93238dd..a243cc2a59f7 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -94,9 +94,6 @@ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type
 						   struct ib_ucontext *ucontext,
 						   enum uverbs_obj_access access,
 						   int id);
-int uverbs_finalize_object(struct ib_uobject *uobj,
-			   enum uverbs_obj_access access,
-			   bool commit);
 /*
  * Note that certain finalize stages could return a status:
  *   (a) alloc_commit could return a failure if the object is committed at the
@@ -112,9 +109,8 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
  * function. For example, this could happen when we couldn't destroy an
  * object.
  */
-int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle,
-			    struct uverbs_attr_spec_hash * const *spec_hash,
-			    size_t num,
-			    bool commit);
+int uverbs_finalize_object(struct ib_uobject *uobj,
+			   enum uverbs_obj_access access,
+			   bool commit);
 
 #endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 8d32c4ae368c..8cc3e8dad9b5 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -167,6 +167,46 @@ static int uverbs_process_attr(struct ib_device *ibdev,
 	return 0;
 }
 
+static int uverbs_finalize_attrs(struct uverbs_attr_bundle *attrs_bundle,
+			  struct uverbs_attr_spec_hash * const *spec_hash,
+			  size_t num,
+			  bool commit)
+{
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < num; i++) {
+		struct uverbs_attr_bundle_hash *curr_bundle =
+			&attrs_bundle->hash[i];
+		const struct uverbs_attr_spec_hash *curr_spec_bucket =
+			spec_hash[i];
+		unsigned int j;
+
+		for (j = 0; j < curr_bundle->num_attrs; j++) {
+			struct uverbs_attr *attr;
+			const struct uverbs_attr_spec *spec;
+
+			if (!uverbs_attr_is_valid_in_hash(curr_bundle, j))
+				continue;
+
+			attr = &curr_bundle->attrs[j];
+			spec = &curr_spec_bucket->attrs[j];
+
+			if (spec->type == UVERBS_ATTR_TYPE_IDR ||
+			    spec->type == UVERBS_ATTR_TYPE_FD) {
+				int current_ret;
+
+				current_ret = uverbs_finalize_object(attr->obj_attr.uobject,
+								     spec->obj.access,
+								     commit);
+				if (!ret)
+					ret = current_ret;
+			}
+		}
+	}
+	return ret;
+}
+
 static int uverbs_uattrs_process(struct ib_device *ibdev,
 				 struct ib_ucontext *ucontext,
 				 const struct ib_uverbs_attr *uattrs,
@@ -187,10 +227,10 @@ static int uverbs_uattrs_process(struct ib_device *ibdev,
 		ret = uverbs_ns_idx(&attr_id, method->num_buckets);
 		if (ret < 0) {
 			if (uattr->flags & UVERBS_ATTR_F_MANDATORY) {
-				uverbs_finalize_objects(attr_bundle,
-							method->attr_buckets,
-							num_given_buckets,
-							false);
+				uverbs_finalize_attrs(attr_bundle,
+						      method->attr_buckets,
+						      num_given_buckets,
+						      false);
 				return ret;
 			}
 			continue;
@@ -208,10 +248,10 @@ static int uverbs_uattrs_process(struct ib_device *ibdev,
 					  attr_spec_bucket, &attr_bundle->hash[ret],
 					  uattr_ptr++);
 		if (ret) {
-			uverbs_finalize_objects(attr_bundle,
-						method->attr_buckets,
-						num_given_buckets,
-						false);
+			uverbs_finalize_attrs(attr_bundle,
+					      method->attr_buckets,
+					      num_given_buckets,
+					      false);
 			return ret;
 		}
 	}
@@ -271,10 +311,10 @@ static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr,
 
 	ret = method_spec->handler(ibdev, ufile, attr_bundle);
 cleanup:
-	finalize_ret = uverbs_finalize_objects(attr_bundle,
-					       method_spec->attr_buckets,
-					       attr_bundle->num_buckets,
-					       !ret);
+	finalize_ret = uverbs_finalize_attrs(attr_bundle,
+					     method_spec->attr_buckets,
+					     attr_bundle->num_buckets,
+					     !ret);
 
 	return ret ? ret : finalize_ret;
 }
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 04/20] IB/uverbs: Export uverbs idr and fd types
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

As provider drivers could use UVERBS_ATTR_FD and UVERBS_ATTR_IDR macros
need to export them.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index a6e904973ba8..8035a0a7564c 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -611,6 +611,7 @@ const struct uverbs_obj_type_class uverbs_idr_class = {
 	 */
 	.needs_kfree_rcu = true,
 };
+EXPORT_SYMBOL(uverbs_idr_class);
 
 static void _uverbs_close_fd(struct ib_uobject_file *uobj_file)
 {
@@ -719,6 +720,7 @@ const struct uverbs_obj_type_class uverbs_fd_class = {
 	.remove_commit = remove_commit_fd_uobject,
 	.needs_kfree_rcu = false,
 };
+EXPORT_SYMBOL(uverbs_fd_class);
 
 struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs,
 						   struct ib_ucontext *ucontext,
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 03/20] kernel.h: Reuse u64_to_ptr macro to cast __user pointers
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Redefine u64_to_user_ptr() macro to be implemented with u64_to_ptr().

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/kernel.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7d60a3472929..3012ae2f52f7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -83,12 +83,7 @@ static inline u64 ptr_to_u64(const void *ptr)
 }					\
 )
 
-#define u64_to_user_ptr(x) (		\
-{					\
-	typecheck(u64, x);		\
-	(void __user *)(uintptr_t)x;	\
-}					\
-)
+#define u64_to_user_ptr(x)	u64_to_ptr(void __user, x)
 
 /*
  * This looks more complex than it should be. But we need to
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 02/20] drm/i915: Move u64-to-ptr helpers to general header
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

The macro u64_to_ptr() and function ptr_to_u64() are useful enough
to be part of general header, so move them there and allow RDMA
subsystem reuse them.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/gpu/drm/i915/i915_utils.h | 12 ++----------
 include/linux/kernel.h            | 12 ++++++++++++
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 00165ad55fb3..2ffc43b97a5a 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -25,6 +25,8 @@
 #ifndef __I915_UTILS_H
 #define __I915_UTILS_H
 
+#include <linux/kernel.h>
+
 #undef WARN_ON
 /* Many gcc seem to no see through this and fall over :( */
 #if 0
@@ -102,16 +104,6 @@
 	__T;								\
 })
 
-static inline u64 ptr_to_u64(const void *ptr)
-{
-	return (uintptr_t)ptr;
-}
-
-#define u64_to_ptr(T, x) ({						\
-	typecheck(u64, x);						\
-	(T *)(uintptr_t)(x);						\
-})
-
 #define __mask_next_bit(mask) ({					\
 	int __idx = ffs(mask) - 1;					\
 	mask &= ~BIT(__idx);						\
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d23123238534..7d60a3472929 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -71,6 +71,18 @@
  */
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
+static inline u64 ptr_to_u64(const void *ptr)
+{
+	return (uintptr_t)ptr;
+}
+
+#define u64_to_ptr(T, x) (		\
+{					\
+	typecheck(u64, x);		\
+	(T *)(uintptr_t)(x);		\
+}					\
+)
+
 #define u64_to_user_ptr(x) (		\
 {					\
 	typecheck(u64, x);		\
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 00/20] Introduce mlx5 DEVX interface
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev

From: Leon Romanovsky <leonro@mellanox.com>

Changelog:
v1->v2:
 * Rebase on top of v4.18-rc1
v0 -> v1:
 * Dropped few validation/debug patches from the KABI part as of Jason's comments.
 * Use kvmalloc/kvfree instead of kmalloc/kfree to prevent higher order allocation under user control.
 * Cleaned up a dependency of CONFIG_INFINIBAND_USER_ACCESS from the uverbs layer.
 * Moved to white list command mode.
 * Serialize access to DEVX object - odify/read new methods were added
   with the appropriate IDR lock.
 * Block DEVX on IB port as of SELinux.
 * Improve uverbs_cleanup_ucontext algorithm to support two objects from the same type.
 * Fix mlx5_ifc.h to use reserved_at_xxx.
 * Enforce devx_uid existence upon DEVX methods.

>From Yishai:
----------------------------------------------------------------------
This DEVX series enables direct access from the user space area to the
mlx5 device driver by using the KABI mechanism.

The main purpose here is to make the user space driver as independent as
possible from the kernel so that future device functionality and commands
can be activated with minimal to none kernel changes.

The series keeps the same level of user process security/isolation as
provided today by the kernel IB verbs by using a user id returned from
the firmware upon context creation.

This user id is put by the kernel code in all firmware command from the
DEVX interface so that the isolation/security will be enforced by the
firmware based on that id.

Below kernel services are exposed to let the above work:

1) Expose a DEVX object that represent some underlay firmware object,
the input command to create it is some raw data given by the user
application which should match the device specification.

2) Expose a UMEM DEVX object for user memory registration for DMA:
The driver provides an API to register user memory, getting as input the
user address, length and access flags, and provides to the user as output
an ID (UMEM ID) returned by the firmware to this registered memory.

The user will use that UMEM ID in device direct commands that use this
memory instead of the physical addresses list.

3) Expose device general command API:
The driver provides an API to issue some general command except than
create and destroy.

4) UAR mapping:
Returns a device UAR ID for a given user index by using the kernel
context information.

This is safe from security point of view as described in details in the
relevant patch in the series.

5) EQ mapping:
Returns the matching device EQN for a given user vector number via the
DEVX interface.

6) Process resources cleanup:
This is achieved by the existing kernel KABI logic, upon DEVX object
creation the kernel builds the FW destroy command inbox in the KABI
object and uses it upon cleanup.

7) Future device commands:
This will be supported by some general command type (create, destroy)
that the DEVX code embeds as part of its code and is going to be
used from now on by the firmware.

At the beginning of this series there are some KABI infra-structures
improvements and fixes to enable the above DEVX functionality in a
cleaner way.

More details appear as part of the commit logs of the series and as part
of the comments in the code itself.

Thanks

Leon Romanovsky (2):
  drm/i915: Move u64-to-ptr helpers to general header
  kernel.h: Reuse u64_to_ptr macro to cast __user pointers

Matan Barak (5):
  IB/uverbs: Export uverbs idr and fd types
  IB/uverbs: Refactor uverbs_finalize_objects
  IB/uverbs: Add PTR_IN attributes that are allocated/copied
    automatically
  IB/uverbs: Add a macro to define a type with no kernel known size
  IB/uverbs: Allow an empty namespace in ioctl() framework

Yishai Hadas (13):
  net/mlx5_core: Prevent warns in dmesg upon firmware commands
  IB/core: Improve uverbs_cleanup_ucontext algorithm
  net/mlx5: Expose DEVX ifc structures
  IB/mlx5: Introduce DEVX
  IB/core: Introduce DECLARE_UVERBS_GLOBAL_METHODS
  IB: Expose ib_ucontext from a given ib_uverbs_file
  IB/mlx5: Add support for DEVX general command
  IB/mlx5: Add obj create and destroy functionality
  IB/mlx5: Add DEVX support for modify and query commands
  IB/mlx5: Add support for DEVX query UAR
  IB/mlx5: Add DEVX support for memory registration
  IB/mlx5: Add DEVX query EQN support
  IB/mlx5: Expose DEVX tree

 drivers/gpu/drm/i915/i915_utils.h                  |   12 +-
 drivers/infiniband/core/rdma_core.c                |  138 +--
 drivers/infiniband/core/rdma_core.h                |   10 +-
 drivers/infiniband/core/uverbs_ioctl.c             |  104 +-
 drivers/infiniband/core/uverbs_main.c              |    6 +
 drivers/infiniband/core/uverbs_std_types.c         |   26 +-
 .../infiniband/core/uverbs_std_types_counters.c    |    2 +-
 drivers/infiniband/core/uverbs_std_types_cq.c      |    2 +-
 drivers/infiniband/core/uverbs_std_types_dm.c      |    3 +-
 .../infiniband/core/uverbs_std_types_flow_action.c |    2 +-
 drivers/infiniband/core/uverbs_std_types_mr.c      |    3 +-
 drivers/infiniband/hw/mlx5/Makefile                |    1 +
 drivers/infiniband/hw/mlx5/devx.c                  | 1106 ++++++++++++++++++++
 drivers/infiniband/hw/mlx5/main.c                  |   31 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   19 +
 drivers/infiniband/hw/mlx5/qp.c                    |    9 +-
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   24 +-
 include/linux/kernel.h                             |   11 +-
 include/linux/mlx5/device.h                        |    3 +
 include/linux/mlx5/mlx5_ifc.h                      |   71 +-
 include/rdma/ib_verbs.h                            |    4 +-
 include/rdma/uverbs_ioctl.h                        |   27 +
 include/rdma/uverbs_named_ioctl.h                  |    4 +
 include/rdma/uverbs_std_types.h                    |    2 -
 include/rdma/uverbs_types.h                        |   11 +-
 include/uapi/rdma/mlx5-abi.h                       |    3 +
 include/uapi/rdma/mlx5_user_ioctl_cmds.h           |   73 ++
 27 files changed, 1548 insertions(+), 159 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx5/devx.c

^ permalink raw reply

* Re: [PATCH net-next,RFC 00/13] New fast forwarding path
From: Steffen Klassert @ 2018-06-17  9:23 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, pablo, netfilter-devel, netdev
In-Reply-To: <8fd4ce02-6bf0-6bee-f8de-92c551881465@iogearbox.net>

Hi Daniel,

On Fri, Jun 15, 2018 at 03:22:24PM +0200, Daniel Borkmann wrote:
> Hi Steffen,
> 
> On 06/15/2018 08:17 AM, Steffen Klassert wrote:
> > 
> > I started with this last year because I wanted to improve
> > the IPsec (and UDP) forwarding path. Batching packets
> > at layer2  and send them directly to the output path
> > seemed to be a good method to improve this.
> > 
> > In particular, we need to do only one IPsec lookup
> > for the whole packet chain. So it relaxes the pain
> > from reomoving the IPsec flowcache a bit. It can be
> > only a first step, but we need some improvements here
> > as people start to complain about that.
> 
> But did you also experiment with XDP on this? 

I've already tried to figure out what I have to to
do to get XDP with forwarding, but still don't realy
know how to set this up.

Maybe it is time to have a deeper look into BPF/XDP,
but for now I feel a bit lost with this.

> Would be curious about
> the numbers. You'd get implicit batching for the forwarding via devmap
> as well if you're required to flush it out via different device with
> XDP_REDIRECT; otherwise XDP_TX of course. Given we have recently
> integrated helpers for XDP to do a FIB and neighbor lookup from the
> kernel tables, where it's thus shared and integrated with the rest of
> the stack and tooling, it would be awesome to get to the same point
> with xfrm as well. Eyal recently did a start on that for xfrm for tc
> progs; would be nice to have integration on XDP as well, potentially
> it might also result in a bigger plus on the forwarding numbers.

It might make sense to intrgrate XDP with xfrm to
be able to compare numbers etc. But I need a working
XDP setup and some understanding about it first, what
could take some time.

^ permalink raw reply

* [PATCH] tc, bpf: add option to dump bpf verifier as C program fragment
From: Ophir Munk @ 2018-06-17  8:48 UTC (permalink / raw)
  To: netdev, Stephen Hemminger, David Ahern
  Cc: Thomas Monjalon, Olga Shern, Ophir Munk

Similar to cbpf used within tcpdump utility with a "-d" option to dump
the compiled packet-matching code in a human readable form - tc has the
"verbose" option to dump ebpf verifier output.
Another useful option of cbpf using tcpdump "-dd" option is to dump
packet-matching code a C program fragment. Similar to this - this commit
adds a new tc ebpf option named "code" to dump ebpf verifier as C program
fragment.

Existing "verbose" option sample output:

Verifier analysis:
0: (61) r2 = *(u32 *)(r1 +52)
1: (18) r3 = 0xdeadbeef
3: (63) *(u32 *)(r10 -4) = r3
.
.
11: (63) *(u32 *)(r1 +52) = r2
12: (18) r0 = 0xffffffff
14: (95) exit

New "code" option sample output:

/* struct bpf_insn cls_q_code[] = { */
{0x61,    2,    1,       52, 0x00000000},
{0x18,    3,    0,        0, 0xdeadbeef},
{0x00,    0,    0,        0, 0x00000000},
.
.
{0x63,    1,    2,       52, 0x00000000},
{0x18,    0,    0,        0, 0xffffffff},
{0x00,    0,    0,        0, 0x00000000},
{0x95,    0,    0,        0, 0x00000000},

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 include/bpf_util.h |  1 +
 lib/bpf.c          | 35 +++++++++++++++++++++++++++++------
 tc/m_bpf.c         |  3 ++-
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 219beb4..cf611c5 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -72,6 +72,7 @@ struct bpf_cfg_in {
 	enum bpf_mode mode;
 	__u32 ifindex;
 	bool verbose;
+	bool code;	
 	int argc;
 	char **argv;
 	struct sock_filter opcodes[BPF_MAXINSNS];
diff --git a/lib/bpf.c b/lib/bpf.c
index c38d92d..b13ec3f 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -113,10 +113,10 @@ const char *bpf_prog_to_default_section(enum bpf_prog_type type)
 
 #ifdef HAVE_ELF
 static int bpf_obj_open(const char *path, enum bpf_prog_type type,
-			const char *sec, __u32 ifindex, bool verbose);
+			const char *sec, __u32 ifindex, bool verbose, bool code);
 #else
 static int bpf_obj_open(const char *path, enum bpf_prog_type type,
-			const char *sec, __u32 ifindex, bool verbose)
+			const char *sec, __u32 ifindex, bool verbose, bool code)
 {
 	fprintf(stderr, "No ELF library support compiled in.\n");
 	errno = ENOSYS;
@@ -809,6 +809,7 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 {
 	const char *file, *section, *uds_name;
 	bool verbose = false;
+	bool code = false;	
 	int i, ret, argc;
 	char **argv;
 
@@ -890,6 +891,11 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 			NEXT_ARG_FWD();
 		}
 
+		if (argc > 0 && matches(*argv, "code") == 0) {
+			code = true;
+			NEXT_ARG_FWD();
+		}
+
 		PREV_ARG();
 	}
 
@@ -911,6 +917,7 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 	cfg->uds     = uds_name;
 	cfg->argc    = argc;
 	cfg->argv    = argv;
+	cfg->code    = code;
 	cfg->verbose = verbose;
 
 	return ret;
@@ -921,7 +928,7 @@ static int bpf_do_load(struct bpf_cfg_in *cfg)
 	if (cfg->mode == EBPF_OBJECT) {
 		cfg->prog_fd = bpf_obj_open(cfg->object, cfg->type,
 					    cfg->section, cfg->ifindex,
-					    cfg->verbose);
+					    cfg->verbose, cfg->code);
 		return cfg->prog_fd;
 	}
 	return 0;
@@ -1133,6 +1140,7 @@ struct bpf_elf_ctx {
 	enum bpf_prog_type	type;
 	__u32			ifindex;
 	bool			verbose;
+	bool			code;
 	struct bpf_elf_st	stat;
 	struct bpf_hash_entry	*ht[256];
 	char			*log;
@@ -1179,6 +1187,17 @@ bpf_dump_error(struct bpf_elf_ctx *ctx, const char *format, ...)
 	}
 }
 
+static void bpf_dump_code(const char *section, const struct bpf_insn *insns, unsigned int cnt)
+{
+	int i;
+	fprintf(stderr, "/* struct bpf_insn %s_code[] = { */\n", section);
+	for (i=0; i < cnt; i++) {
+		fprintf(stderr, "\t{0x%.2x, %4u, %4u, %8d, 0x%.8x},\n",
+		insns[i].code, insns[i].dst_reg, insns[i].src_reg, insns[i].off, insns[i].imm);
+	}
+	fprintf(stderr, "\n");
+}
+
 static int bpf_log_realloc(struct bpf_elf_ctx *ctx)
 {
 	const size_t log_max = UINT_MAX >> 8;
@@ -1526,6 +1545,9 @@ retry:
 		bpf_prog_report(fd, section, prog, ctx);
 	}
 
+	if (ctx->code)
+		bpf_dump_code(section, prog->insns, prog->size / sizeof(struct bpf_insn));
+
 	return fd;
 }
 
@@ -2439,7 +2461,7 @@ static void bpf_get_cfg(struct bpf_elf_ctx *ctx)
 
 static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
 			    enum bpf_prog_type type, __u32 ifindex,
-			    bool verbose)
+			    bool verbose, bool code)
 {
 	int ret = -EINVAL;
 
@@ -2450,6 +2472,7 @@ static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
 	memset(ctx, 0, sizeof(*ctx));
 	bpf_get_cfg(ctx);
 	ctx->verbose = verbose;
+	ctx->code    = code;
 	ctx->type    = type;
 	ctx->ifindex = ifindex;
 
@@ -2543,12 +2566,12 @@ static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure)
 static struct bpf_elf_ctx __ctx;
 
 static int bpf_obj_open(const char *pathname, enum bpf_prog_type type,
-			const char *section, __u32 ifindex, bool verbose)
+			const char *section, __u32 ifindex, bool verbose, bool code)
 {
 	struct bpf_elf_ctx *ctx = &__ctx;
 	int fd = 0, ret;
 
-	ret = bpf_elf_ctx_init(ctx, pathname, type, ifindex, verbose);
+	ret = bpf_elf_ctx_init(ctx, pathname, type, ifindex, verbose, code);
 	if (ret < 0) {
 		fprintf(stderr, "Cannot initialize ELF context!\n");
 		return ret;
diff --git a/tc/m_bpf.c b/tc/m_bpf.c
index 1c1f71c..9947113 100644
--- a/tc/m_bpf.c
+++ b/tc/m_bpf.c
@@ -33,7 +33,8 @@ static void explain(void)
 	fprintf(stderr, "\n");
 	fprintf(stderr, "eBPF use case:\n");
 	fprintf(stderr, " object-file FILE [ section ACT_NAME ] [ export UDS_FILE ]");
-	fprintf(stderr, " [ verbose ]\n");
+	fprintf(stderr, " [ verbose ]");
+	fprintf(stderr, " [ code ]\n");
 	fprintf(stderr, " object-pinned FILE\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n");
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH 1/3] net: dsa: Add DT bindings for Vitesse VSC73xx switches
From: Jiri Pirko @ 2018-06-17  7:40 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, netdev,
	openwrt-devel, LEDE Development List, Gabor Juhos, devicetree
In-Reply-To: <20180614123534.8063-2-linus.walleij@linaro.org>

Thu, Jun 14, 2018 at 02:35:32PM CEST, linus.walleij@linaro.org wrote:
>This adds the device tree bindings for the Vitesse VSC73xx
>switches. We also add the vendor name for Vitesse.
>
>Cc: devicetree@vger.kernel.org
>Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
>---
> .../bindings/net/dsa/vitesse,vsc73xx.txt      | 81 +++++++++++++++++++
> .../devicetree/bindings/vendor-prefixes.txt   |  1 +
> 2 files changed, 82 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>
>diff --git a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>new file mode 100644
>index 000000000000..474cdba5fa37
>--- /dev/null
>+++ b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>@@ -0,0 +1,81 @@
>+Vitess VSC73xx Switches

s/Vitess/Vitesse/

[...]

^ permalink raw reply

* Re: [PATCH COMMITTED] bluetooth: hci_nokia: Don't include linux/unaligned/le_struct.h directly.
From: Marcel Holtmann @ 2018-06-17  7:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: Johan Hedberg, linux-bluetooth, netdev
In-Reply-To: <20180616.164059.1116555189277956188.davem@davemloft.net>

Hi Dave,

> This breaks the build as this header is not meant to be used in this
> way.
> 
> ./include/linux/unaligned/access_ok.h:8:28: error: redefinition of ‘get_unaligned_le16’
> static __always_inline u16 get_unaligned_le16(const void *p)
>                            ^~~~~~~~~~~~~~~~~~
> In file included from drivers/bluetooth/hci_nokia.c:32:
> ./include/linux/unaligned/le_struct.h:7:19: note: previous definition of ‘get_unaligned_le16’ was here
> static inline u16 get_unaligned_le16(const void *p)
> 
> Use asm/unaligned.h instead.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
> drivers/bluetooth/hci_nokia.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: Marcel Holtmann <marcel@holtmann.org>

Regards

Marcel

^ permalink raw reply

* [PATCH] net: fix e1000.rst Documentation build errors
From: Randy Dunlap @ 2018-06-17  0:36 UTC (permalink / raw)
  To: netdev@vger.kernel.org, David Miller, linux-doc@vger.kernel.org
  Cc: LKML, Jeff Kirsher, aaron.f.brown

From: Randy Dunlap <rdunlap@infradead.org>

Fix Documentation build errors in e1000.rst.  Several section titles and
their underlines should not be indented.

Documentation/networking/e1000.rst:358: (SEVERE/4) Unexpected section title.

Fixes: 228046e76189 ("Documentation: e1000: Update kernel documentation")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Aaron Brown <aaron.f.brown@intel.com>
---
Is there a Sphinx version problem here?  Tested-by: should indicate
that there was no error like I am seeing.

 Documentation/networking/e1000.rst |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

--- lnx-418-rc1.orig/Documentation/networking/e1000.rst
+++ lnx-418-rc1/Documentation/networking/e1000.rst
@@ -354,8 +354,8 @@ previously mentioned to force the adapte
 Additional Configurations
 =========================
 
-  Jumbo Frames
-  ------------
+Jumbo Frames
+------------
   Jumbo Frames support is enabled by changing the MTU to a value larger than
   the default of 1500.  Use the ifconfig command to increase the MTU size.
   For example::
@@ -389,8 +389,8 @@ Additional Configurations
      Intel(R) PRO/1000 Gigabit Server Adapter
      Intel(R) PRO/1000 PM Network Connection
 
-  ethtool
-  -------
+ethtool
+-------
   The driver utilizes the ethtool interface for driver configuration and
   diagnostics, as well as displaying statistical information.  The ethtool
   version 1.6 or later is required for this functionality.
@@ -398,8 +398,8 @@ Additional Configurations
   The latest release of ethtool can be found from
   https://www.kernel.org/pub/software/network/ethtool/
 
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
+Enabling Wake on LAN* (WoL)
+---------------------------
   WoL is configured through the ethtool* utility.
 
   WoL will be enabled on the system during the next shut down or reboot.

^ permalink raw reply

* [PATCH] net: fix e100.rst Documentation build errors
From: Randy Dunlap @ 2018-06-17  0:36 UTC (permalink / raw)
  To: linux-doc@vger.kernel.org, netdev@vger.kernel.org, Jeff Kirsher,
	David Miller
  Cc: LKML, Aaron Brown

From: Randy Dunlap <rdunlap@infradead.org>

Fix Documentation build errors in e100.rst.  Several section titles
and the corresponding underlines should not be indented.

Documentation/networking/e100.rst:90: (SEVERE/4) Unexpected section title.
Documentation/networking/e100.rst:109: (SEVERE/4) Unexpected section title.

Fixes: 85d63445f411 ("Documentation: e100: Update the Intel 10/100 driver doc")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Aaron Brown <aaron.f.brown@intel.com>
---
Is there a Sphinx version problem here?  Tested-by: should indicate
that there was no error like I am seeing.

 Documentation/networking/e100.rst |   24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

--- lnx-418-rc1.orig/Documentation/networking/e100.rst
+++ lnx-418-rc1/Documentation/networking/e100.rst
@@ -86,8 +86,8 @@ Event Log Message Level:  The driver use
 Additional Configurations
 =========================
 
-  Configuring the Driver on Different Distributions
-  -------------------------------------------------
+Configuring the Driver on Different Distributions
+-------------------------------------------------
 
   Configuring a network driver to load properly when the system is started is
   distribution dependent. Typically, the configuration process involves adding
@@ -105,8 +105,8 @@ Additional Configurations
        alias eth0 e100
        alias eth1 e100
 
-  Viewing Link Messages
-  ---------------------
+Viewing Link Messages
+---------------------
   In order to see link messages and other Intel driver information on your
   console, you must set the dmesg level up to six. This can be done by
   entering the following on the command line before loading the e100 driver::
@@ -119,8 +119,8 @@ Additional Configurations
   NOTE: This setting is not saved across reboots.
 
 
-  ethtool
-  -------
+ethtool
+-------
 
   The driver utilizes the ethtool interface for driver configuration and
   diagnostics, as well as displaying statistical information.  The ethtool
@@ -129,8 +129,8 @@ Additional Configurations
   The latest release of ethtool can be found from
   https://www.kernel.org/pub/software/network/ethtool/
 
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
+Enabling Wake on LAN* (WoL)
+---------------------------
   WoL is provided through the ethtool* utility.  For instructions on enabling
   WoL with ethtool, refer to the ethtool man page.
 
@@ -138,16 +138,16 @@ Additional Configurations
   this driver version, in order to enable WoL, the e100 driver must be
   loaded when shutting down or rebooting the system.
 
-  NAPI
-  ----
+NAPI
+----
 
   NAPI (Rx polling mode) is supported in the e100 driver.
 
   See https://wiki.linuxfoundation.org/networking/napi for more information
   on NAPI.
 
-  Multiple Interfaces on Same Ethernet Broadcast Network
-  ------------------------------------------------------
+Multiple Interfaces on Same Ethernet Broadcast Network
+------------------------------------------------------
 
   Due to the default ARP behavior on Linux, it is not possible to have
   one system on two IP networks in the same Ethernet broadcast domain

^ permalink raw reply

* Re: [PATCH] net_sched: blackhole: tell upper qdisc about dropped packets
From: David Miller @ 2018-06-16 23:42 UTC (permalink / raw)
  To: khlebnikov; +Cc: netdev, xiyou.wangcong, jiri, jhs
In-Reply-To: <152905845136.88583.6197221349040585517.stgit@buzz>

From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 15 Jun 2018 13:27:31 +0300

> When blackhole is used on top of classful qdisc like hfsc it breaks
> qlen and backlog counters because packets are disappear without notice.
> 
> In HFSC non-zero qlen while all classes are inactive triggers warning:
> WARNING: ... at net/sched/sch_hfsc.c:1393 hfsc_dequeue+0xba4/0xe90 [sch_hfsc]
> and schedules watchdog work endlessly.
> 
> This patch return __NET_XMIT_BYPASS in addition to NET_XMIT_SUCCESS,
> this flag tells upper layer: this packet is gone and isn't queued.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Applied, thanks.

^ permalink raw reply

* [PATCH COMMITTED] bluetooth: hci_nokia: Don't include linux/unaligned/le_struct.h directly.
From: David Miller @ 2018-06-16 23:40 UTC (permalink / raw)
  To: marcel; +Cc: johan.hedberg, linux-bluetooth, netdev


This breaks the build as this header is not meant to be used in this
way.

./include/linux/unaligned/access_ok.h:8:28: error: redefinition of ‘get_unaligned_le16’
 static __always_inline u16 get_unaligned_le16(const void *p)
                            ^~~~~~~~~~~~~~~~~~
In file included from drivers/bluetooth/hci_nokia.c:32:
./include/linux/unaligned/le_struct.h:7:19: note: previous definition of ‘get_unaligned_le16’ was here
 static inline u16 get_unaligned_le16(const void *p)

Use asm/unaligned.h instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bluetooth/hci_nokia.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c
index 14d159e2042d..2dc33e65d2d0 100644
--- a/drivers/bluetooth/hci_nokia.c
+++ b/drivers/bluetooth/hci_nokia.c
@@ -29,7 +29,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/types.h>
-#include <linux/unaligned/le_struct.h>
+#include <asm/unaligned.h>
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Miller @ 2018-06-16 23:27 UTC (permalink / raw)
  To: dwmw2; +Cc: eric.dumazet, netdev, ldir
In-Reply-To: <d7644be784f3164bbc5f79154260160d.squirrel@twosheds.infradead.org>

From: "David Woodhouse" <dwmw2@infradead.org>
Date: Sat, 16 Jun 2018 20:52:33 -0000

>> This Fixes tag shoots the messenger really.
>>
>> I suggest to instead use :
>>
>> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
> 
> 
> Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)

Patch applied with adjusted Fixes: tag, and queued up for -stable.

Thanks.

^ permalink raw reply

* Re: pull-request: bpf 2018-06-16
From: David Miller @ 2018-06-16 22:54 UTC (permalink / raw)
  To: daniel; +Cc: ast, netdev
In-Reply-To: <20180615230630.12117-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 16 Jun 2018 01:06:30 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
 ...
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

^ permalink raw reply

* [PATCH] xfrm4: Remove export declaration from xfrm4_protocol_init
From: efremov @ 2018-06-16 20:58 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Denis Efremov, David S. Miller, netdev, linux-kernel, ldv-project

From: Denis Efremov <efremov@linux.com>

The function xfrm4_protocol_init is exported as GPL symbol and
annotated as __init. That is reasonable only in the case when
this function is called from another's module __init section.
Otherwise, we will face section mismatch error. xfrm4_protocol_init
is used in xfrm4_init along with xfrm4_state_init and xfrm4_policy_init.
The last two functions are not exported as GPL symbols. According to
this, it seem's like there is no reason to export xfrm4_protocol_init too.
Fix potential section mismatch by removing export declaration
from xfrm4_protocol_init.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Denis Efremov <efremov@linux.com>
---
 net/ipv4/xfrm4_protocol.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..0e1f5dc2766b 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -297,4 +297,3 @@ void __init xfrm4_protocol_init(void)
 {
 	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
 }
-EXPORT_SYMBOL(xfrm4_protocol_init);
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Woodhouse @ 2018-06-16 20:52 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Woodhouse, netdev, ldir
In-Reply-To: <62fbc51a-4738-6c21-8c53-d84fee7a9bbd@gmail.com>

> This Fixes tag shoots the messenger really.
>
> I suggest to instead use :
>
> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")


Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)


-- 
dwmw2

^ permalink raw reply

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Woodhouse @ 2018-06-16 15:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Woodhouse, netdev, ldir
In-Reply-To: <62fbc51a-4738-6c21-8c53-d84fee7a9bbd@gmail.com>


>> Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to
>> refcount_t") did exactly what it was intended to do, and turned this
>> mostly-theoretical problem into a real one, causing PPPoATM to fail
>> immediately as sk_wmem_alloc underflows and atm_may_send() *immediately*
>> starts refusing to allow new packets.

 ...

>> Fixes: 14afee4b ("net: convert sock.sk_wmem_alloc from atomic_t to
>> refcount_t")
>
> This Fixes tag shoots the messenger really.

A little bit, yes. The text hopefully made that clear.

> I suggest to instead use :
>
> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
>
> Because even without the conversion to refcount_t, we could have a LOCKDEP
> splat in :
>
> filter = rcu_dereference_check(sk->sk_filter,
>                                atomic_read(&sk->sk_wmem_alloc) == 0);
>
> Note that some places make a further check even when LOCKDEP is not used.
>
> net/ipv4/af_inet.c:154:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/iucv/af_iucv.c:405:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/key/af_key.c:112:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/netlink/af_netlink.c:410:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/packet/af_packet.c:1286:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/rxrpc/af_rxrpc.c:852:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/unix/af_unix.c:490:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
>
>
> We might factorize these checks into __sk_destruct()
>


How many of those were likely to trigger in practice on an ATM VCC though?
If we are taking the Fixes: tag as a hint about which stable kernels we
might want to backport to, rather than a moral assignment of blame, then
14afee4b is probably not the worst place to point it. But I don't mind
much...

-- 
dwmw2

^ permalink raw reply

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: Eric Dumazet @ 2018-06-16 12:57 UTC (permalink / raw)
  To: David Woodhouse, netdev, ldir
In-Reply-To: <20180616105544.246714-1-dwmw2@infradead.org>



On 06/16/2018 03:55 AM, David Woodhouse wrote:
> ATM accounts for in-flight TX packets in sk_wmem_alloc of the VCC on
> which they are to be sent. But it doesn't take ownership of those
> packets from the sock (if any) which originally owned them. They should
> remain owned by their actual sender until they've left the box.
> 
> There's a hack in pskb_expand_head() to avoid adjusting skb->truesize
> for certain skbs, precisely to avoid messing up sk_wmem_alloc
> accounting. Ideally that hack would cover the ATM use case too, but it
> doesn't — skbs which aren't owned by any sock, for example PPP control
> frames, still get their truesize adjusted when the low-level ATM driver
> adds headroom.
> 
> This has always been an issue, it seems. The truesize of a packet
> increases, and sk_wmem_alloc on the VCC goes negative. But this wasn't
> for normal traffic, only for control frames. So I think we just got away
> with it, and we probably needed to send 2GiB of LCP echo frames before
> the misaccounting would ever have caused a problem and caused
> atm_may_send() to start refusing packets.
> 
> Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to
> refcount_t") did exactly what it was intended to do, and turned this
> mostly-theoretical problem into a real one, causing PPPoATM to fail
> immediately as sk_wmem_alloc underflows and atm_may_send() *immediately*
> starts refusing to allow new packets.
> 
> The least intrusive solution to this problem is to stash the value of
> skb->truesize that was accounted to the VCC, in a new member of the
> ATM_SKB(skb) structure. Then in atm_pop_raw() subtract precisely that
> value instead of the then-current value of skb->truesize.
> 
> Fixes: 14afee4b ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t")

This Fixes tag shoots the messenger really.

I suggest to instead use :

Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")

Because even without the conversion to refcount_t, we could have a LOCKDEP
splat in :

filter = rcu_dereference_check(sk->sk_filter,
                               atomic_read(&sk->sk_wmem_alloc) == 0);

Note that some places make a further check even when LOCKDEP is not used.

net/ipv4/af_inet.c:154:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/iucv/af_iucv.c:405:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/key/af_key.c:112:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/netlink/af_netlink.c:410:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/packet/af_packet.c:1286:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/rxrpc/af_rxrpc.c:852:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/unix/af_unix.c:490:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));


We might factorize these checks into __sk_destruct()

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox