Netdev List
 help / color / mirror / Atom feed
* [PATCH rdma-next v2 05/20] IB/uverbs: Refactor uverbs_finalize_objects
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

uverbs_finalize_objects is currently used only to commit or abort
objects. Since we want to add automatic allocation/free of PTR_IN
attributes, moving it to uverbs_ioctl.c and renamit it to
uverbs_finalize_attrs.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c    | 40 ---------------------
 drivers/infiniband/core/rdma_core.h    | 10 ++----
 drivers/infiniband/core/uverbs_ioctl.c | 64 +++++++++++++++++++++++++++-------
 3 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 8035a0a7564c..df3c40533252 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -779,43 +779,3 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
 
 	return ret;
 }
-
-int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle,
-			    struct uverbs_attr_spec_hash * const *spec_hash,
-			    size_t num,
-			    bool commit)
-{
-	unsigned int i;
-	int ret = 0;
-
-	for (i = 0; i < num; i++) {
-		struct uverbs_attr_bundle_hash *curr_bundle =
-			&attrs_bundle->hash[i];
-		const struct uverbs_attr_spec_hash *curr_spec_bucket =
-			spec_hash[i];
-		unsigned int j;
-
-		for (j = 0; j < curr_bundle->num_attrs; j++) {
-			struct uverbs_attr *attr;
-			const struct uverbs_attr_spec *spec;
-
-			if (!uverbs_attr_is_valid_in_hash(curr_bundle, j))
-				continue;
-
-			attr = &curr_bundle->attrs[j];
-			spec = &curr_spec_bucket->attrs[j];
-
-			if (spec->type == UVERBS_ATTR_TYPE_IDR ||
-			    spec->type == UVERBS_ATTR_TYPE_FD) {
-				int current_ret;
-
-				current_ret = uverbs_finalize_object(attr->obj_attr.uobject,
-								     spec->obj.access,
-								     commit);
-				if (!ret)
-					ret = current_ret;
-			}
-		}
-	}
-	return ret;
-}
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 1efcf93238dd..a243cc2a59f7 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -94,9 +94,6 @@ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type
 						   struct ib_ucontext *ucontext,
 						   enum uverbs_obj_access access,
 						   int id);
-int uverbs_finalize_object(struct ib_uobject *uobj,
-			   enum uverbs_obj_access access,
-			   bool commit);
 /*
  * Note that certain finalize stages could return a status:
  *   (a) alloc_commit could return a failure if the object is committed at the
@@ -112,9 +109,8 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
  * function. For example, this could happen when we couldn't destroy an
  * object.
  */
-int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle,
-			    struct uverbs_attr_spec_hash * const *spec_hash,
-			    size_t num,
-			    bool commit);
+int uverbs_finalize_object(struct ib_uobject *uobj,
+			   enum uverbs_obj_access access,
+			   bool commit);
 
 #endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
index 8d32c4ae368c..8cc3e8dad9b5 100644
--- a/drivers/infiniband/core/uverbs_ioctl.c
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -167,6 +167,46 @@ static int uverbs_process_attr(struct ib_device *ibdev,
 	return 0;
 }
 
+static int uverbs_finalize_attrs(struct uverbs_attr_bundle *attrs_bundle,
+			  struct uverbs_attr_spec_hash * const *spec_hash,
+			  size_t num,
+			  bool commit)
+{
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < num; i++) {
+		struct uverbs_attr_bundle_hash *curr_bundle =
+			&attrs_bundle->hash[i];
+		const struct uverbs_attr_spec_hash *curr_spec_bucket =
+			spec_hash[i];
+		unsigned int j;
+
+		for (j = 0; j < curr_bundle->num_attrs; j++) {
+			struct uverbs_attr *attr;
+			const struct uverbs_attr_spec *spec;
+
+			if (!uverbs_attr_is_valid_in_hash(curr_bundle, j))
+				continue;
+
+			attr = &curr_bundle->attrs[j];
+			spec = &curr_spec_bucket->attrs[j];
+
+			if (spec->type == UVERBS_ATTR_TYPE_IDR ||
+			    spec->type == UVERBS_ATTR_TYPE_FD) {
+				int current_ret;
+
+				current_ret = uverbs_finalize_object(attr->obj_attr.uobject,
+								     spec->obj.access,
+								     commit);
+				if (!ret)
+					ret = current_ret;
+			}
+		}
+	}
+	return ret;
+}
+
 static int uverbs_uattrs_process(struct ib_device *ibdev,
 				 struct ib_ucontext *ucontext,
 				 const struct ib_uverbs_attr *uattrs,
@@ -187,10 +227,10 @@ static int uverbs_uattrs_process(struct ib_device *ibdev,
 		ret = uverbs_ns_idx(&attr_id, method->num_buckets);
 		if (ret < 0) {
 			if (uattr->flags & UVERBS_ATTR_F_MANDATORY) {
-				uverbs_finalize_objects(attr_bundle,
-							method->attr_buckets,
-							num_given_buckets,
-							false);
+				uverbs_finalize_attrs(attr_bundle,
+						      method->attr_buckets,
+						      num_given_buckets,
+						      false);
 				return ret;
 			}
 			continue;
@@ -208,10 +248,10 @@ static int uverbs_uattrs_process(struct ib_device *ibdev,
 					  attr_spec_bucket, &attr_bundle->hash[ret],
 					  uattr_ptr++);
 		if (ret) {
-			uverbs_finalize_objects(attr_bundle,
-						method->attr_buckets,
-						num_given_buckets,
-						false);
+			uverbs_finalize_attrs(attr_bundle,
+					      method->attr_buckets,
+					      num_given_buckets,
+					      false);
 			return ret;
 		}
 	}
@@ -271,10 +311,10 @@ static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr,
 
 	ret = method_spec->handler(ibdev, ufile, attr_bundle);
 cleanup:
-	finalize_ret = uverbs_finalize_objects(attr_bundle,
-					       method_spec->attr_buckets,
-					       attr_bundle->num_buckets,
-					       !ret);
+	finalize_ret = uverbs_finalize_attrs(attr_bundle,
+					     method_spec->attr_buckets,
+					     attr_bundle->num_buckets,
+					     !ret);
 
 	return ret ? ret : finalize_ret;
 }
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 04/20] IB/uverbs: Export uverbs idr and fd types
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Matan Barak <matanb@mellanox.com>

As provider drivers could use UVERBS_ATTR_FD and UVERBS_ATTR_IDR macros
need to export them.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/core/rdma_core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index a6e904973ba8..8035a0a7564c 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -611,6 +611,7 @@ const struct uverbs_obj_type_class uverbs_idr_class = {
 	 */
 	.needs_kfree_rcu = true,
 };
+EXPORT_SYMBOL(uverbs_idr_class);
 
 static void _uverbs_close_fd(struct ib_uobject_file *uobj_file)
 {
@@ -719,6 +720,7 @@ const struct uverbs_obj_type_class uverbs_fd_class = {
 	.remove_commit = remove_commit_fd_uobject,
 	.needs_kfree_rcu = false,
 };
+EXPORT_SYMBOL(uverbs_fd_class);
 
 struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs,
 						   struct ib_ucontext *ucontext,
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 03/20] kernel.h: Reuse u64_to_ptr macro to cast __user pointers
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

Redefine u64_to_user_ptr() macro to be implemented with u64_to_ptr().

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/kernel.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7d60a3472929..3012ae2f52f7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -83,12 +83,7 @@ static inline u64 ptr_to_u64(const void *ptr)
 }					\
 )
 
-#define u64_to_user_ptr(x) (		\
-{					\
-	typecheck(u64, x);		\
-	(void __user *)(uintptr_t)x;	\
-}					\
-)
+#define u64_to_user_ptr(x)	u64_to_ptr(void __user, x)
 
 /*
  * This looks more complex than it should be. But we need to
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 02/20] drm/i915: Move u64-to-ptr helpers to general header
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev
In-Reply-To: <20180617100006.30663-1-leon@kernel.org>

From: Leon Romanovsky <leonro@mellanox.com>

The macro u64_to_ptr() and function ptr_to_u64() are useful enough
to be part of general header, so move them there and allow RDMA
subsystem reuse them.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/gpu/drm/i915/i915_utils.h | 12 ++----------
 include/linux/kernel.h            | 12 ++++++++++++
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 00165ad55fb3..2ffc43b97a5a 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -25,6 +25,8 @@
 #ifndef __I915_UTILS_H
 #define __I915_UTILS_H
 
+#include <linux/kernel.h>
+
 #undef WARN_ON
 /* Many gcc seem to no see through this and fall over :( */
 #if 0
@@ -102,16 +104,6 @@
 	__T;								\
 })
 
-static inline u64 ptr_to_u64(const void *ptr)
-{
-	return (uintptr_t)ptr;
-}
-
-#define u64_to_ptr(T, x) ({						\
-	typecheck(u64, x);						\
-	(T *)(uintptr_t)(x);						\
-})
-
 #define __mask_next_bit(mask) ({					\
 	int __idx = ffs(mask) - 1;					\
 	mask &= ~BIT(__idx);						\
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d23123238534..7d60a3472929 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -71,6 +71,18 @@
  */
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
+static inline u64 ptr_to_u64(const void *ptr)
+{
+	return (uintptr_t)ptr;
+}
+
+#define u64_to_ptr(T, x) (		\
+{					\
+	typecheck(u64, x);		\
+	(T *)(uintptr_t)(x);		\
+}					\
+)
+
 #define u64_to_user_ptr(x) (		\
 {					\
 	typecheck(u64, x);		\
-- 
2.14.4

^ permalink raw reply related

* [PATCH rdma-next v2 00/20] Introduce mlx5 DEVX interface
From: Leon Romanovsky @ 2018-06-17  9:59 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Joonas Lahtinen, Matan Barak,
	Yishai Hadas, Saeed Mahameed, linux-netdev

From: Leon Romanovsky <leonro@mellanox.com>

Changelog:
v1->v2:
 * Rebase on top of v4.18-rc1
v0 -> v1:
 * Dropped few validation/debug patches from the KABI part as of Jason's comments.
 * Use kvmalloc/kvfree instead of kmalloc/kfree to prevent higher order allocation under user control.
 * Cleaned up a dependency of CONFIG_INFINIBAND_USER_ACCESS from the uverbs layer.
 * Moved to white list command mode.
 * Serialize access to DEVX object - odify/read new methods were added
   with the appropriate IDR lock.
 * Block DEVX on IB port as of SELinux.
 * Improve uverbs_cleanup_ucontext algorithm to support two objects from the same type.
 * Fix mlx5_ifc.h to use reserved_at_xxx.
 * Enforce devx_uid existence upon DEVX methods.

>From Yishai:
----------------------------------------------------------------------
This DEVX series enables direct access from the user space area to the
mlx5 device driver by using the KABI mechanism.

The main purpose here is to make the user space driver as independent as
possible from the kernel so that future device functionality and commands
can be activated with minimal to none kernel changes.

The series keeps the same level of user process security/isolation as
provided today by the kernel IB verbs by using a user id returned from
the firmware upon context creation.

This user id is put by the kernel code in all firmware command from the
DEVX interface so that the isolation/security will be enforced by the
firmware based on that id.

Below kernel services are exposed to let the above work:

1) Expose a DEVX object that represent some underlay firmware object,
the input command to create it is some raw data given by the user
application which should match the device specification.

2) Expose a UMEM DEVX object for user memory registration for DMA:
The driver provides an API to register user memory, getting as input the
user address, length and access flags, and provides to the user as output
an ID (UMEM ID) returned by the firmware to this registered memory.

The user will use that UMEM ID in device direct commands that use this
memory instead of the physical addresses list.

3) Expose device general command API:
The driver provides an API to issue some general command except than
create and destroy.

4) UAR mapping:
Returns a device UAR ID for a given user index by using the kernel
context information.

This is safe from security point of view as described in details in the
relevant patch in the series.

5) EQ mapping:
Returns the matching device EQN for a given user vector number via the
DEVX interface.

6) Process resources cleanup:
This is achieved by the existing kernel KABI logic, upon DEVX object
creation the kernel builds the FW destroy command inbox in the KABI
object and uses it upon cleanup.

7) Future device commands:
This will be supported by some general command type (create, destroy)
that the DEVX code embeds as part of its code and is going to be
used from now on by the firmware.

At the beginning of this series there are some KABI infra-structures
improvements and fixes to enable the above DEVX functionality in a
cleaner way.

More details appear as part of the commit logs of the series and as part
of the comments in the code itself.

Thanks

Leon Romanovsky (2):
  drm/i915: Move u64-to-ptr helpers to general header
  kernel.h: Reuse u64_to_ptr macro to cast __user pointers

Matan Barak (5):
  IB/uverbs: Export uverbs idr and fd types
  IB/uverbs: Refactor uverbs_finalize_objects
  IB/uverbs: Add PTR_IN attributes that are allocated/copied
    automatically
  IB/uverbs: Add a macro to define a type with no kernel known size
  IB/uverbs: Allow an empty namespace in ioctl() framework

Yishai Hadas (13):
  net/mlx5_core: Prevent warns in dmesg upon firmware commands
  IB/core: Improve uverbs_cleanup_ucontext algorithm
  net/mlx5: Expose DEVX ifc structures
  IB/mlx5: Introduce DEVX
  IB/core: Introduce DECLARE_UVERBS_GLOBAL_METHODS
  IB: Expose ib_ucontext from a given ib_uverbs_file
  IB/mlx5: Add support for DEVX general command
  IB/mlx5: Add obj create and destroy functionality
  IB/mlx5: Add DEVX support for modify and query commands
  IB/mlx5: Add support for DEVX query UAR
  IB/mlx5: Add DEVX support for memory registration
  IB/mlx5: Add DEVX query EQN support
  IB/mlx5: Expose DEVX tree

 drivers/gpu/drm/i915/i915_utils.h                  |   12 +-
 drivers/infiniband/core/rdma_core.c                |  138 +--
 drivers/infiniband/core/rdma_core.h                |   10 +-
 drivers/infiniband/core/uverbs_ioctl.c             |  104 +-
 drivers/infiniband/core/uverbs_main.c              |    6 +
 drivers/infiniband/core/uverbs_std_types.c         |   26 +-
 .../infiniband/core/uverbs_std_types_counters.c    |    2 +-
 drivers/infiniband/core/uverbs_std_types_cq.c      |    2 +-
 drivers/infiniband/core/uverbs_std_types_dm.c      |    3 +-
 .../infiniband/core/uverbs_std_types_flow_action.c |    2 +-
 drivers/infiniband/core/uverbs_std_types_mr.c      |    3 +-
 drivers/infiniband/hw/mlx5/Makefile                |    1 +
 drivers/infiniband/hw/mlx5/devx.c                  | 1106 ++++++++++++++++++++
 drivers/infiniband/hw/mlx5/main.c                  |   31 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   19 +
 drivers/infiniband/hw/mlx5/qp.c                    |    9 +-
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   24 +-
 include/linux/kernel.h                             |   11 +-
 include/linux/mlx5/device.h                        |    3 +
 include/linux/mlx5/mlx5_ifc.h                      |   71 +-
 include/rdma/ib_verbs.h                            |    4 +-
 include/rdma/uverbs_ioctl.h                        |   27 +
 include/rdma/uverbs_named_ioctl.h                  |    4 +
 include/rdma/uverbs_std_types.h                    |    2 -
 include/rdma/uverbs_types.h                        |   11 +-
 include/uapi/rdma/mlx5-abi.h                       |    3 +
 include/uapi/rdma/mlx5_user_ioctl_cmds.h           |   73 ++
 27 files changed, 1548 insertions(+), 159 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx5/devx.c

^ permalink raw reply

* Re: [PATCH net-next,RFC 00/13] New fast forwarding path
From: Steffen Klassert @ 2018-06-17  9:23 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, pablo, netfilter-devel, netdev
In-Reply-To: <8fd4ce02-6bf0-6bee-f8de-92c551881465@iogearbox.net>

Hi Daniel,

On Fri, Jun 15, 2018 at 03:22:24PM +0200, Daniel Borkmann wrote:
> Hi Steffen,
> 
> On 06/15/2018 08:17 AM, Steffen Klassert wrote:
> > 
> > I started with this last year because I wanted to improve
> > the IPsec (and UDP) forwarding path. Batching packets
> > at layer2  and send them directly to the output path
> > seemed to be a good method to improve this.
> > 
> > In particular, we need to do only one IPsec lookup
> > for the whole packet chain. So it relaxes the pain
> > from reomoving the IPsec flowcache a bit. It can be
> > only a first step, but we need some improvements here
> > as people start to complain about that.
> 
> But did you also experiment with XDP on this? 

I've already tried to figure out what I have to to
do to get XDP with forwarding, but still don't realy
know how to set this up.

Maybe it is time to have a deeper look into BPF/XDP,
but for now I feel a bit lost with this.

> Would be curious about
> the numbers. You'd get implicit batching for the forwarding via devmap
> as well if you're required to flush it out via different device with
> XDP_REDIRECT; otherwise XDP_TX of course. Given we have recently
> integrated helpers for XDP to do a FIB and neighbor lookup from the
> kernel tables, where it's thus shared and integrated with the rest of
> the stack and tooling, it would be awesome to get to the same point
> with xfrm as well. Eyal recently did a start on that for xfrm for tc
> progs; would be nice to have integration on XDP as well, potentially
> it might also result in a bigger plus on the forwarding numbers.

It might make sense to intrgrate XDP with xfrm to
be able to compare numbers etc. But I need a working
XDP setup and some understanding about it first, what
could take some time.

^ permalink raw reply

* [PATCH] tc, bpf: add option to dump bpf verifier as C program fragment
From: Ophir Munk @ 2018-06-17  8:48 UTC (permalink / raw)
  To: netdev, Stephen Hemminger, David Ahern
  Cc: Thomas Monjalon, Olga Shern, Ophir Munk

Similar to cbpf used within tcpdump utility with a "-d" option to dump
the compiled packet-matching code in a human readable form - tc has the
"verbose" option to dump ebpf verifier output.
Another useful option of cbpf using tcpdump "-dd" option is to dump
packet-matching code a C program fragment. Similar to this - this commit
adds a new tc ebpf option named "code" to dump ebpf verifier as C program
fragment.

Existing "verbose" option sample output:

Verifier analysis:
0: (61) r2 = *(u32 *)(r1 +52)
1: (18) r3 = 0xdeadbeef
3: (63) *(u32 *)(r10 -4) = r3
.
.
11: (63) *(u32 *)(r1 +52) = r2
12: (18) r0 = 0xffffffff
14: (95) exit

New "code" option sample output:

/* struct bpf_insn cls_q_code[] = { */
{0x61,    2,    1,       52, 0x00000000},
{0x18,    3,    0,        0, 0xdeadbeef},
{0x00,    0,    0,        0, 0x00000000},
.
.
{0x63,    1,    2,       52, 0x00000000},
{0x18,    0,    0,        0, 0xffffffff},
{0x00,    0,    0,        0, 0x00000000},
{0x95,    0,    0,        0, 0x00000000},

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 include/bpf_util.h |  1 +
 lib/bpf.c          | 35 +++++++++++++++++++++++++++++------
 tc/m_bpf.c         |  3 ++-
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 219beb4..cf611c5 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -72,6 +72,7 @@ struct bpf_cfg_in {
 	enum bpf_mode mode;
 	__u32 ifindex;
 	bool verbose;
+	bool code;	
 	int argc;
 	char **argv;
 	struct sock_filter opcodes[BPF_MAXINSNS];
diff --git a/lib/bpf.c b/lib/bpf.c
index c38d92d..b13ec3f 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -113,10 +113,10 @@ const char *bpf_prog_to_default_section(enum bpf_prog_type type)
 
 #ifdef HAVE_ELF
 static int bpf_obj_open(const char *path, enum bpf_prog_type type,
-			const char *sec, __u32 ifindex, bool verbose);
+			const char *sec, __u32 ifindex, bool verbose, bool code);
 #else
 static int bpf_obj_open(const char *path, enum bpf_prog_type type,
-			const char *sec, __u32 ifindex, bool verbose)
+			const char *sec, __u32 ifindex, bool verbose, bool code)
 {
 	fprintf(stderr, "No ELF library support compiled in.\n");
 	errno = ENOSYS;
@@ -809,6 +809,7 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 {
 	const char *file, *section, *uds_name;
 	bool verbose = false;
+	bool code = false;	
 	int i, ret, argc;
 	char **argv;
 
@@ -890,6 +891,11 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 			NEXT_ARG_FWD();
 		}
 
+		if (argc > 0 && matches(*argv, "code") == 0) {
+			code = true;
+			NEXT_ARG_FWD();
+		}
+
 		PREV_ARG();
 	}
 
@@ -911,6 +917,7 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 	cfg->uds     = uds_name;
 	cfg->argc    = argc;
 	cfg->argv    = argv;
+	cfg->code    = code;
 	cfg->verbose = verbose;
 
 	return ret;
@@ -921,7 +928,7 @@ static int bpf_do_load(struct bpf_cfg_in *cfg)
 	if (cfg->mode == EBPF_OBJECT) {
 		cfg->prog_fd = bpf_obj_open(cfg->object, cfg->type,
 					    cfg->section, cfg->ifindex,
-					    cfg->verbose);
+					    cfg->verbose, cfg->code);
 		return cfg->prog_fd;
 	}
 	return 0;
@@ -1133,6 +1140,7 @@ struct bpf_elf_ctx {
 	enum bpf_prog_type	type;
 	__u32			ifindex;
 	bool			verbose;
+	bool			code;
 	struct bpf_elf_st	stat;
 	struct bpf_hash_entry	*ht[256];
 	char			*log;
@@ -1179,6 +1187,17 @@ bpf_dump_error(struct bpf_elf_ctx *ctx, const char *format, ...)
 	}
 }
 
+static void bpf_dump_code(const char *section, const struct bpf_insn *insns, unsigned int cnt)
+{
+	int i;
+	fprintf(stderr, "/* struct bpf_insn %s_code[] = { */\n", section);
+	for (i=0; i < cnt; i++) {
+		fprintf(stderr, "\t{0x%.2x, %4u, %4u, %8d, 0x%.8x},\n",
+		insns[i].code, insns[i].dst_reg, insns[i].src_reg, insns[i].off, insns[i].imm);
+	}
+	fprintf(stderr, "\n");
+}
+
 static int bpf_log_realloc(struct bpf_elf_ctx *ctx)
 {
 	const size_t log_max = UINT_MAX >> 8;
@@ -1526,6 +1545,9 @@ retry:
 		bpf_prog_report(fd, section, prog, ctx);
 	}
 
+	if (ctx->code)
+		bpf_dump_code(section, prog->insns, prog->size / sizeof(struct bpf_insn));
+
 	return fd;
 }
 
@@ -2439,7 +2461,7 @@ static void bpf_get_cfg(struct bpf_elf_ctx *ctx)
 
 static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
 			    enum bpf_prog_type type, __u32 ifindex,
-			    bool verbose)
+			    bool verbose, bool code)
 {
 	int ret = -EINVAL;
 
@@ -2450,6 +2472,7 @@ static int bpf_elf_ctx_init(struct bpf_elf_ctx *ctx, const char *pathname,
 	memset(ctx, 0, sizeof(*ctx));
 	bpf_get_cfg(ctx);
 	ctx->verbose = verbose;
+	ctx->code    = code;
 	ctx->type    = type;
 	ctx->ifindex = ifindex;
 
@@ -2543,12 +2566,12 @@ static void bpf_elf_ctx_destroy(struct bpf_elf_ctx *ctx, bool failure)
 static struct bpf_elf_ctx __ctx;
 
 static int bpf_obj_open(const char *pathname, enum bpf_prog_type type,
-			const char *section, __u32 ifindex, bool verbose)
+			const char *section, __u32 ifindex, bool verbose, bool code)
 {
 	struct bpf_elf_ctx *ctx = &__ctx;
 	int fd = 0, ret;
 
-	ret = bpf_elf_ctx_init(ctx, pathname, type, ifindex, verbose);
+	ret = bpf_elf_ctx_init(ctx, pathname, type, ifindex, verbose, code);
 	if (ret < 0) {
 		fprintf(stderr, "Cannot initialize ELF context!\n");
 		return ret;
diff --git a/tc/m_bpf.c b/tc/m_bpf.c
index 1c1f71c..9947113 100644
--- a/tc/m_bpf.c
+++ b/tc/m_bpf.c
@@ -33,7 +33,8 @@ static void explain(void)
 	fprintf(stderr, "\n");
 	fprintf(stderr, "eBPF use case:\n");
 	fprintf(stderr, " object-file FILE [ section ACT_NAME ] [ export UDS_FILE ]");
-	fprintf(stderr, " [ verbose ]\n");
+	fprintf(stderr, " [ verbose ]");
+	fprintf(stderr, " [ code ]\n");
 	fprintf(stderr, " object-pinned FILE\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n");
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH 1/3] net: dsa: Add DT bindings for Vitesse VSC73xx switches
From: Jiri Pirko @ 2018-06-17  7:40 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, netdev,
	openwrt-devel, LEDE Development List, Gabor Juhos, devicetree
In-Reply-To: <20180614123534.8063-2-linus.walleij@linaro.org>

Thu, Jun 14, 2018 at 02:35:32PM CEST, linus.walleij@linaro.org wrote:
>This adds the device tree bindings for the Vitesse VSC73xx
>switches. We also add the vendor name for Vitesse.
>
>Cc: devicetree@vger.kernel.org
>Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
>---
> .../bindings/net/dsa/vitesse,vsc73xx.txt      | 81 +++++++++++++++++++
> .../devicetree/bindings/vendor-prefixes.txt   |  1 +
> 2 files changed, 82 insertions(+)
> create mode 100644 Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>
>diff --git a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>new file mode 100644
>index 000000000000..474cdba5fa37
>--- /dev/null
>+++ b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
>@@ -0,0 +1,81 @@
>+Vitess VSC73xx Switches

s/Vitess/Vitesse/

[...]

^ permalink raw reply

* Re: [PATCH COMMITTED] bluetooth: hci_nokia: Don't include linux/unaligned/le_struct.h directly.
From: Marcel Holtmann @ 2018-06-17  7:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: Johan Hedberg, linux-bluetooth, netdev
In-Reply-To: <20180616.164059.1116555189277956188.davem@davemloft.net>

Hi Dave,

> This breaks the build as this header is not meant to be used in this
> way.
> 
> ./include/linux/unaligned/access_ok.h:8:28: error: redefinition of ‘get_unaligned_le16’
> static __always_inline u16 get_unaligned_le16(const void *p)
>                            ^~~~~~~~~~~~~~~~~~
> In file included from drivers/bluetooth/hci_nokia.c:32:
> ./include/linux/unaligned/le_struct.h:7:19: note: previous definition of ‘get_unaligned_le16’ was here
> static inline u16 get_unaligned_le16(const void *p)
> 
> Use asm/unaligned.h instead.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
> drivers/bluetooth/hci_nokia.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: Marcel Holtmann <marcel@holtmann.org>

Regards

Marcel

^ permalink raw reply

* [PATCH] net: fix e1000.rst Documentation build errors
From: Randy Dunlap @ 2018-06-17  0:36 UTC (permalink / raw)
  To: netdev@vger.kernel.org, David Miller, linux-doc@vger.kernel.org
  Cc: LKML, Jeff Kirsher, aaron.f.brown

From: Randy Dunlap <rdunlap@infradead.org>

Fix Documentation build errors in e1000.rst.  Several section titles and
their underlines should not be indented.

Documentation/networking/e1000.rst:358: (SEVERE/4) Unexpected section title.

Fixes: 228046e76189 ("Documentation: e1000: Update kernel documentation")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Aaron Brown <aaron.f.brown@intel.com>
---
Is there a Sphinx version problem here?  Tested-by: should indicate
that there was no error like I am seeing.

 Documentation/networking/e1000.rst |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

--- lnx-418-rc1.orig/Documentation/networking/e1000.rst
+++ lnx-418-rc1/Documentation/networking/e1000.rst
@@ -354,8 +354,8 @@ previously mentioned to force the adapte
 Additional Configurations
 =========================
 
-  Jumbo Frames
-  ------------
+Jumbo Frames
+------------
   Jumbo Frames support is enabled by changing the MTU to a value larger than
   the default of 1500.  Use the ifconfig command to increase the MTU size.
   For example::
@@ -389,8 +389,8 @@ Additional Configurations
      Intel(R) PRO/1000 Gigabit Server Adapter
      Intel(R) PRO/1000 PM Network Connection
 
-  ethtool
-  -------
+ethtool
+-------
   The driver utilizes the ethtool interface for driver configuration and
   diagnostics, as well as displaying statistical information.  The ethtool
   version 1.6 or later is required for this functionality.
@@ -398,8 +398,8 @@ Additional Configurations
   The latest release of ethtool can be found from
   https://www.kernel.org/pub/software/network/ethtool/
 
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
+Enabling Wake on LAN* (WoL)
+---------------------------
   WoL is configured through the ethtool* utility.
 
   WoL will be enabled on the system during the next shut down or reboot.

^ permalink raw reply

* [PATCH] net: fix e100.rst Documentation build errors
From: Randy Dunlap @ 2018-06-17  0:36 UTC (permalink / raw)
  To: linux-doc@vger.kernel.org, netdev@vger.kernel.org, Jeff Kirsher,
	David Miller
  Cc: LKML, Aaron Brown

From: Randy Dunlap <rdunlap@infradead.org>

Fix Documentation build errors in e100.rst.  Several section titles
and the corresponding underlines should not be indented.

Documentation/networking/e100.rst:90: (SEVERE/4) Unexpected section title.
Documentation/networking/e100.rst:109: (SEVERE/4) Unexpected section title.

Fixes: 85d63445f411 ("Documentation: e100: Update the Intel 10/100 driver doc")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Aaron Brown <aaron.f.brown@intel.com>
---
Is there a Sphinx version problem here?  Tested-by: should indicate
that there was no error like I am seeing.

 Documentation/networking/e100.rst |   24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

--- lnx-418-rc1.orig/Documentation/networking/e100.rst
+++ lnx-418-rc1/Documentation/networking/e100.rst
@@ -86,8 +86,8 @@ Event Log Message Level:  The driver use
 Additional Configurations
 =========================
 
-  Configuring the Driver on Different Distributions
-  -------------------------------------------------
+Configuring the Driver on Different Distributions
+-------------------------------------------------
 
   Configuring a network driver to load properly when the system is started is
   distribution dependent. Typically, the configuration process involves adding
@@ -105,8 +105,8 @@ Additional Configurations
        alias eth0 e100
        alias eth1 e100
 
-  Viewing Link Messages
-  ---------------------
+Viewing Link Messages
+---------------------
   In order to see link messages and other Intel driver information on your
   console, you must set the dmesg level up to six. This can be done by
   entering the following on the command line before loading the e100 driver::
@@ -119,8 +119,8 @@ Additional Configurations
   NOTE: This setting is not saved across reboots.
 
 
-  ethtool
-  -------
+ethtool
+-------
 
   The driver utilizes the ethtool interface for driver configuration and
   diagnostics, as well as displaying statistical information.  The ethtool
@@ -129,8 +129,8 @@ Additional Configurations
   The latest release of ethtool can be found from
   https://www.kernel.org/pub/software/network/ethtool/
 
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
+Enabling Wake on LAN* (WoL)
+---------------------------
   WoL is provided through the ethtool* utility.  For instructions on enabling
   WoL with ethtool, refer to the ethtool man page.
 
@@ -138,16 +138,16 @@ Additional Configurations
   this driver version, in order to enable WoL, the e100 driver must be
   loaded when shutting down or rebooting the system.
 
-  NAPI
-  ----
+NAPI
+----
 
   NAPI (Rx polling mode) is supported in the e100 driver.
 
   See https://wiki.linuxfoundation.org/networking/napi for more information
   on NAPI.
 
-  Multiple Interfaces on Same Ethernet Broadcast Network
-  ------------------------------------------------------
+Multiple Interfaces on Same Ethernet Broadcast Network
+------------------------------------------------------
 
   Due to the default ARP behavior on Linux, it is not possible to have
   one system on two IP networks in the same Ethernet broadcast domain

^ permalink raw reply

* Re: [PATCH] net_sched: blackhole: tell upper qdisc about dropped packets
From: David Miller @ 2018-06-16 23:42 UTC (permalink / raw)
  To: khlebnikov; +Cc: netdev, xiyou.wangcong, jiri, jhs
In-Reply-To: <152905845136.88583.6197221349040585517.stgit@buzz>

From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 15 Jun 2018 13:27:31 +0300

> When blackhole is used on top of classful qdisc like hfsc it breaks
> qlen and backlog counters because packets are disappear without notice.
> 
> In HFSC non-zero qlen while all classes are inactive triggers warning:
> WARNING: ... at net/sched/sch_hfsc.c:1393 hfsc_dequeue+0xba4/0xe90 [sch_hfsc]
> and schedules watchdog work endlessly.
> 
> This patch return __NET_XMIT_BYPASS in addition to NET_XMIT_SUCCESS,
> this flag tells upper layer: this packet is gone and isn't queued.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Applied, thanks.

^ permalink raw reply

* [PATCH COMMITTED] bluetooth: hci_nokia: Don't include linux/unaligned/le_struct.h directly.
From: David Miller @ 2018-06-16 23:40 UTC (permalink / raw)
  To: marcel; +Cc: johan.hedberg, linux-bluetooth, netdev


This breaks the build as this header is not meant to be used in this
way.

./include/linux/unaligned/access_ok.h:8:28: error: redefinition of ‘get_unaligned_le16’
 static __always_inline u16 get_unaligned_le16(const void *p)
                            ^~~~~~~~~~~~~~~~~~
In file included from drivers/bluetooth/hci_nokia.c:32:
./include/linux/unaligned/le_struct.h:7:19: note: previous definition of ‘get_unaligned_le16’ was here
 static inline u16 get_unaligned_le16(const void *p)

Use asm/unaligned.h instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bluetooth/hci_nokia.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c
index 14d159e2042d..2dc33e65d2d0 100644
--- a/drivers/bluetooth/hci_nokia.c
+++ b/drivers/bluetooth/hci_nokia.c
@@ -29,7 +29,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/types.h>
-#include <linux/unaligned/le_struct.h>
+#include <asm/unaligned.h>
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Miller @ 2018-06-16 23:27 UTC (permalink / raw)
  To: dwmw2; +Cc: eric.dumazet, netdev, ldir
In-Reply-To: <d7644be784f3164bbc5f79154260160d.squirrel@twosheds.infradead.org>

From: "David Woodhouse" <dwmw2@infradead.org>
Date: Sat, 16 Jun 2018 20:52:33 -0000

>> This Fixes tag shoots the messenger really.
>>
>> I suggest to instead use :
>>
>> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
> 
> 
> Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)

Patch applied with adjusted Fixes: tag, and queued up for -stable.

Thanks.

^ permalink raw reply

* Re: pull-request: bpf 2018-06-16
From: David Miller @ 2018-06-16 22:54 UTC (permalink / raw)
  To: daniel; +Cc: ast, netdev
In-Reply-To: <20180615230630.12117-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 16 Jun 2018 01:06:30 +0200

> The following pull-request contains BPF updates for your *net* tree.
> 
> The main changes are:
 ...
> Please consider pulling these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

Pulled, thanks Daniel.

^ permalink raw reply

* [PATCH] xfrm4: Remove export declaration from xfrm4_protocol_init
From: efremov @ 2018-06-16 20:58 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Denis Efremov, David S. Miller, netdev, linux-kernel, ldv-project

From: Denis Efremov <efremov@linux.com>

The function xfrm4_protocol_init is exported as GPL symbol and
annotated as __init. That is reasonable only in the case when
this function is called from another's module __init section.
Otherwise, we will face section mismatch error. xfrm4_protocol_init
is used in xfrm4_init along with xfrm4_state_init and xfrm4_policy_init.
The last two functions are not exported as GPL symbols. According to
this, it seem's like there is no reason to export xfrm4_protocol_init too.
Fix potential section mismatch by removing export declaration
from xfrm4_protocol_init.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Denis Efremov <efremov@linux.com>
---
 net/ipv4/xfrm4_protocol.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..0e1f5dc2766b 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -297,4 +297,3 @@ void __init xfrm4_protocol_init(void)
 {
 	xfrm_input_register_afinfo(&xfrm4_input_afinfo);
 }
-EXPORT_SYMBOL(xfrm4_protocol_init);
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Woodhouse @ 2018-06-16 20:52 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Woodhouse, netdev, ldir
In-Reply-To: <62fbc51a-4738-6c21-8c53-d84fee7a9bbd@gmail.com>

> This Fixes tag shoots the messenger really.
>
> I suggest to instead use :
>
> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")


Oh, I hadn't realised how recent that was. Sure, let's blame you instead :)


-- 
dwmw2

^ permalink raw reply

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Woodhouse @ 2018-06-16 15:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Woodhouse, netdev, ldir
In-Reply-To: <62fbc51a-4738-6c21-8c53-d84fee7a9bbd@gmail.com>


>> Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to
>> refcount_t") did exactly what it was intended to do, and turned this
>> mostly-theoretical problem into a real one, causing PPPoATM to fail
>> immediately as sk_wmem_alloc underflows and atm_may_send() *immediately*
>> starts refusing to allow new packets.

 ...

>> Fixes: 14afee4b ("net: convert sock.sk_wmem_alloc from atomic_t to
>> refcount_t")
>
> This Fixes tag shoots the messenger really.

A little bit, yes. The text hopefully made that clear.

> I suggest to instead use :
>
> Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")
>
> Because even without the conversion to refcount_t, we could have a LOCKDEP
> splat in :
>
> filter = rcu_dereference_check(sk->sk_filter,
>                                atomic_read(&sk->sk_wmem_alloc) == 0);
>
> Note that some places make a further check even when LOCKDEP is not used.
>
> net/ipv4/af_inet.c:154:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/iucv/af_iucv.c:405:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/key/af_key.c:112:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/netlink/af_netlink.c:410:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/packet/af_packet.c:1286:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/rxrpc/af_rxrpc.c:852:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
> net/unix/af_unix.c:490:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
>
>
> We might factorize these checks into __sk_destruct()
>


How many of those were likely to trigger in practice on an ATM VCC though?
If we are taking the Fixes: tag as a hint about which stable kernels we
might want to backport to, rather than a moral assignment of blame, then
14afee4b is probably not the worst place to point it. But I don't mind
much...

-- 
dwmw2

^ permalink raw reply

* Re: [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: Eric Dumazet @ 2018-06-16 12:57 UTC (permalink / raw)
  To: David Woodhouse, netdev, ldir
In-Reply-To: <20180616105544.246714-1-dwmw2@infradead.org>



On 06/16/2018 03:55 AM, David Woodhouse wrote:
> ATM accounts for in-flight TX packets in sk_wmem_alloc of the VCC on
> which they are to be sent. But it doesn't take ownership of those
> packets from the sock (if any) which originally owned them. They should
> remain owned by their actual sender until they've left the box.
> 
> There's a hack in pskb_expand_head() to avoid adjusting skb->truesize
> for certain skbs, precisely to avoid messing up sk_wmem_alloc
> accounting. Ideally that hack would cover the ATM use case too, but it
> doesn't — skbs which aren't owned by any sock, for example PPP control
> frames, still get their truesize adjusted when the low-level ATM driver
> adds headroom.
> 
> This has always been an issue, it seems. The truesize of a packet
> increases, and sk_wmem_alloc on the VCC goes negative. But this wasn't
> for normal traffic, only for control frames. So I think we just got away
> with it, and we probably needed to send 2GiB of LCP echo frames before
> the misaccounting would ever have caused a problem and caused
> atm_may_send() to start refusing packets.
> 
> Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to
> refcount_t") did exactly what it was intended to do, and turned this
> mostly-theoretical problem into a real one, causing PPPoATM to fail
> immediately as sk_wmem_alloc underflows and atm_may_send() *immediately*
> starts refusing to allow new packets.
> 
> The least intrusive solution to this problem is to stash the value of
> skb->truesize that was accounted to the VCC, in a new member of the
> ATM_SKB(skb) structure. Then in atm_pop_raw() subtract precisely that
> value instead of the then-current value of skb->truesize.
> 
> Fixes: 14afee4b ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t")

This Fixes tag shoots the messenger really.

I suggest to instead use :

Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()")

Because even without the conversion to refcount_t, we could have a LOCKDEP
splat in :

filter = rcu_dereference_check(sk->sk_filter,
                               atomic_read(&sk->sk_wmem_alloc) == 0);

Note that some places make a further check even when LOCKDEP is not used.

net/ipv4/af_inet.c:154:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/iucv/af_iucv.c:405:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/key/af_key.c:112:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/netlink/af_netlink.c:410:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/packet/af_packet.c:1286:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/rxrpc/af_rxrpc.c:852:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
net/unix/af_unix.c:490:	WARN_ON(refcount_read(&sk->sk_wmem_alloc));


We might factorize these checks into __sk_destruct()

^ permalink raw reply

* Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"
From: Eric Dumazet @ 2018-06-16 12:45 UTC (permalink / raw)
  To: Mathieu Malaterre
  Cc: David S. Miller, Eric Dumazet, LKML, Christophe LEROY,
	Meelis Roos, schwab, netdev, linuxppc-dev
In-Reply-To: <CA+7wUsysaip70ARGQ7Byg1Zktfg0Y1esjEc9HxBGU4sKL8crhg@mail.gmail.com>



On 06/16/2018 12:14 AM, Mathieu Malaterre wrote:
> Hi Eric,
> 
> On Fri, Jun 15, 2018 at 9:14 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>>
>> On 06/15/2018 11:56 AM, Mathieu Malaterre wrote:
>>> This reverts commit 88078d98d1bb085d72af8437707279e203524fa5.
>>>
>>> It causes regressions for people using chips driven by the sungem
>>> driver. Suspicion is that the skb->csum value isn't being adjusted
>>> properly.
>>>
>>> Symptoms as seen on G4+sungem are:
>>>
>>> [   34.023281] eth0: hw csum failure
>>> [   34.023438] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #2
>>> [   34.023618] Call Trace:
>>> [   34.023707] [dffedbd0] [c069ddac] __skb_checksum_complete+0xf0/0x108 (unreliable)
>>> [   34.023948] [dffedbf0] [c0777a70] tcp_v4_rcv+0x604/0xe00
>>> [   34.024118] [dffedc70] [c0731624] ip_local_deliver_finish+0xa8/0x3c4
>>> [   34.024315] [dffedcb0] [c0732430] ip_local_deliver+0xf0/0x154
>>> [   34.024493] [dffedcf0] [c07328dc] ip_rcv+0x448/0x774
>>> [   34.024653] [dffedd50] [c06aeae0] __netif_receive_skb_core+0x5e8/0x1184
>>> [   34.024857] [dffedde0] [c06bba20] napi_gro_receive+0x160/0x22c
>>> [   34.025044] [dffede10] [e14b2590] gem_poll+0x7fc/0x1ac0 [sungem]
>>> [   34.025228] [dffedee0] [c06bacf0] net_rx_action+0x34c/0x618
>>> [   34.025402] [dffedf60] [c07fd27c] __do_softirq+0x16c/0x5f0
>>> [   34.025575] [dffedfd0] [c0064c7c] irq_exit+0x110/0x1a8
>>> [   34.025738] [dffedff0] [c0016170] call_do_irq+0x24/0x3c
>>> [   34.025903] [c0cf7e80] [c0009a84] do_IRQ+0x98/0x1a0
>>> [   34.026055] [c0cf7eb0] [c001b474] ret_from_except+0x0/0x14
>>> [   34.026225] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
>>>                    LR = arch_cpu_idle+0x30/0x78
>>> [   34.026510] [c0cf7f70] [c0cf6000] 0xc0cf6000 (unreliable)
>>> [   34.026682] [c0cf7f80] [c00a3868] do_idle+0xc4/0x158
>>> [   34.026835] [c0cf7fb0] [c00a3ab0] cpu_startup_entry+0x20/0x28
>>> [   34.027013] [c0cf7fc0] [c0998820] start_kernel+0x47c/0x490
>>> [   34.027181] [c0cf7ff0] [00003444] 0x3444
>>>
>>> See commit 7ce5a27f2ef8 ("Revert "net: Handle CHECKSUM_COMPLETE more
>>> adequately in pskb_trim_rcsum()."") for previous reference.
>>
>> This fix seems to hide a bug in csum functions on this architecture.
> 
> That's odd since it seems to only affect g4+sungem user. None of the
> ppc64 seems to be having it. And some ppc32 users are not even seeing
> it.
> 
>> Or a bug on this NIC when receiving a small packet (less than 60 bytes).
>> Maybe the padding bytes are not included in NIC provided csum, and not 0.
> 
> Ok in that case the bug is located in
> ./drivers/net/ethernet/sun/sungem.c that seems more likely. I'll try
> to understand that code, then.


I would try something like :

Basically do not bother using CHECKSUM_COMPLETE for small frames that might have been padded.

Since we need to bring one cache line in eth_type_trans() and further header processing,
computing the checksum in software will be almost free anyway.

diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index 7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..071039f211a8a33153e888bd4014314ba5e686a4 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -855,9 +855,11 @@ static int gem_rx(struct gem *gp, int work_to_do)
                        skb = copy_skb;
                }
 
-               csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff);
-               skb->csum = csum_unfold(csum);
-               skb->ip_summed = CHECKSUM_COMPLETE;
+               if (len > ETH_ZLEN) {
+                       csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff);
+                       skb->csum = csum_unfold(csum);
+                       skb->ip_summed = CHECKSUM_COMPLETE;
+               }
                skb->protocol = eth_type_trans(skb, gp->dev);
 
                napi_gro_receive(&gp->napi, skb);

^ permalink raw reply related

* Re: [PATCH 07/17] net: convert sock.sk_wmem_alloc from atomic_t to refcount_t
From: David Woodhouse @ 2018-06-16 11:30 UTC (permalink / raw)
  To: Kevin Darbyshire-Bryant
  Cc: Eric Dumazet, Elena Reshetova, netdev@vger.kernel.org,
	Krzysztof Mazur, 3chas3@gmail.com, Mathias Kresin
In-Reply-To: <60894922-D59A-4F6D-862B-9933DE06F1BD@darbyshire-bryant.me.uk>

[-- Attachment #1: Type: text/plain, Size: 621 bytes --]

On Sat, 2018-06-16 at 03:44 +0000, Kevin Darbyshire-Bryant wrote:
> 
> I can confirm that PPPoA with both vc & llc encapsulations work. 
> BR2684 with PPPoE and both vc & llc encapsulations also work.  No
> nasty messages noted in dmesg.

Thanks.

>   I’m actually gobsmacked at how tolerant TalkTalk/BT are of what
> I’ve thrown at them, they clearly just look for PPP frames :-)

We weren't saying that in the days when they stupidly assumed all PPP
data frames contained Legacy IP, and then dropped any short frames
which, if they *had* been Legacy IP, would have had zeroes in the
'length' field.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5213 bytes --]

^ permalink raw reply

* [PATCH] atm: Preserve value of skb->truesize when accounting to vcc
From: David Woodhouse @ 2018-06-16 10:55 UTC (permalink / raw)
  To: netdev, ldir

ATM accounts for in-flight TX packets in sk_wmem_alloc of the VCC on
which they are to be sent. But it doesn't take ownership of those
packets from the sock (if any) which originally owned them. They should
remain owned by their actual sender until they've left the box.

There's a hack in pskb_expand_head() to avoid adjusting skb->truesize
for certain skbs, precisely to avoid messing up sk_wmem_alloc
accounting. Ideally that hack would cover the ATM use case too, but it
doesn't — skbs which aren't owned by any sock, for example PPP control
frames, still get their truesize adjusted when the low-level ATM driver
adds headroom.

This has always been an issue, it seems. The truesize of a packet
increases, and sk_wmem_alloc on the VCC goes negative. But this wasn't
for normal traffic, only for control frames. So I think we just got away
with it, and we probably needed to send 2GiB of LCP echo frames before
the misaccounting would ever have caused a problem and caused
atm_may_send() to start refusing packets.

Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to
refcount_t") did exactly what it was intended to do, and turned this
mostly-theoretical problem into a real one, causing PPPoATM to fail
immediately as sk_wmem_alloc underflows and atm_may_send() *immediately*
starts refusing to allow new packets.

The least intrusive solution to this problem is to stash the value of
skb->truesize that was accounted to the VCC, in a new member of the
ATM_SKB(skb) structure. Then in atm_pop_raw() subtract precisely that
value instead of the then-current value of skb->truesize.

Fixes: 14afee4b ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t")
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Tested-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
---
 include/linux/atmdev.h | 15 +++++++++++++++
 net/atm/br2684.c       |  3 +--
 net/atm/clip.c         |  3 +--
 net/atm/common.c       |  3 +--
 net/atm/lec.c          |  3 +--
 net/atm/mpc.c          |  3 +--
 net/atm/pppoatm.c      |  3 +--
 net/atm/raw.c          |  4 ++--
 8 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index 0c27515d2cf6..8124815eb121 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -214,6 +214,7 @@ struct atmphy_ops {
 struct atm_skb_data {
 	struct atm_vcc	*vcc;		/* ATM VCC */
 	unsigned long	atm_options;	/* ATM layer options */
+	unsigned int	acct_truesize;  /* truesize accounted to vcc */
 };
 
 #define VCC_HTABLE_SIZE 32
@@ -241,6 +242,20 @@ void vcc_insert_socket(struct sock *sk);
 
 void atm_dev_release_vccs(struct atm_dev *dev);
 
+static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	/*
+	 * Because ATM skbs may not belong to a sock (and we don't
+	 * necessarily want to), skb->truesize may be adjusted,
+	 * escaping the hack in pskb_expand_head() which avoids
+	 * doing so for some cases. So stash the value of truesize
+	 * at the time we accounted it, and atm_pop_raw() can use
+	 * that value later, in case it changes.
+	 */
+	refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+	ATM_SKB(skb)->acct_truesize = skb->truesize;
+	ATM_SKB(skb)->atm_options = vcc->atm_options;
+}
 
 static inline void atm_force_charge(struct atm_vcc *vcc,int truesize)
 {
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 36b3adacc0dd..10462de734ea 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -252,8 +252,7 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev,
 
 	ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
 	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
-	refcount_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
-	ATM_SKB(skb)->atm_options = atmvcc->atm_options;
+	atm_account_tx(atmvcc, skb);
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;
 
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 66caa48a27c2..d795b9c5aea4 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -381,8 +381,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
 		memcpy(here, llc_oui, sizeof(llc_oui));
 		((__be16 *) here)[3] = skb->protocol;
 	}
-	refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
-	ATM_SKB(skb)->atm_options = vcc->atm_options;
+	atm_account_tx(vcc, skb);
 	entry->vccs->last_use = jiffies;
 	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev);
 	old = xchg(&entry->vccs->xoff, 1);	/* assume XOFF ... */
diff --git a/net/atm/common.c b/net/atm/common.c
index 1f2af59935db..ff5748b2190f 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -630,10 +630,9 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
 		goto out;
 	}
 	pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
-	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+	atm_account_tx(vcc, skb);
 
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
-	ATM_SKB(skb)->atm_options = vcc->atm_options;
 	if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
 		kfree_skb(skb);
 		error = -EFAULT;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5a95fcf6f9b6..d7f5cf5b7594 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -182,9 +182,8 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 
 	ATM_SKB(skb)->vcc = vcc;
-	ATM_SKB(skb)->atm_options = vcc->atm_options;
+	atm_account_tx(vcc, skb);
 
-	refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
 	if (vcc->send(vcc, skb) < 0) {
 		dev->stats.tx_dropped++;
 		return;
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 75620c2f2617..24b53c4c39c6 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -555,8 +555,7 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
 					sizeof(struct llc_snap_hdr));
 	}
 
-	refcount_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc);
-	ATM_SKB(skb)->atm_options = entry->shortcut->atm_options;
+	atm_account_tx(entry->shortcut, skb);
 	entry->shortcut->send(entry->shortcut, skb);
 	entry->packets_fwded++;
 	mpc->in_ops->put(entry);
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index 21d9d341a619..af8c4b38b746 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -350,8 +350,7 @@ static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb)
 		return 1;
 	}
 
-	refcount_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc);
-	ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options;
+	atm_account_tx(vcc, skb);
 	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n",
 		 skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev);
 	ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
diff --git a/net/atm/raw.c b/net/atm/raw.c
index ee10e8d46185..b3ba44aab0ee 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -35,8 +35,8 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
 	struct sock *sk = sk_atm(vcc);
 
 	pr_debug("(%d) %d -= %d\n",
-		 vcc->vci, sk_wmem_alloc_get(sk), skb->truesize);
-	WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
+		 vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize);
+	WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc));
 	dev_kfree_skb_any(skb);
 	sk->sk_write_space(sk);
 }
-- 
2.17.0

^ permalink raw reply related

* (unknown)
From: Mrs Mavis Wanczyk @ 2018-06-16  8:15 UTC (permalink / raw)




-- 
This is the second time i am sending you this mail.

I, Mavis Wanczyk donates $ 5 Million Dollars from part of my Powerball  
Jackpot Lottery of $ 758 Million Dollars, respond with your details  
for claims.

I await your earliest response and God Bless you

Good luck.
Mrs Mavis L. Wanczyk

^ permalink raw reply

* Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"
From: Mathieu Malaterre @ 2018-06-16  7:14 UTC (permalink / raw)
  To: eric.dumazet
  Cc: David S. Miller, Eric Dumazet, LKML, Christophe LEROY,
	Meelis Roos, schwab, netdev, linuxppc-dev
In-Reply-To: <fbb95c11-240c-1a11-0a62-0483908c577e@gmail.com>

Hi Eric,

On Fri, Jun 15, 2018 at 9:14 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 06/15/2018 11:56 AM, Mathieu Malaterre wrote:
> > This reverts commit 88078d98d1bb085d72af8437707279e203524fa5.
> >
> > It causes regressions for people using chips driven by the sungem
> > driver. Suspicion is that the skb->csum value isn't being adjusted
> > properly.
> >
> > Symptoms as seen on G4+sungem are:
> >
> > [   34.023281] eth0: hw csum failure
> > [   34.023438] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #2
> > [   34.023618] Call Trace:
> > [   34.023707] [dffedbd0] [c069ddac] __skb_checksum_complete+0xf0/0x108 (unreliable)
> > [   34.023948] [dffedbf0] [c0777a70] tcp_v4_rcv+0x604/0xe00
> > [   34.024118] [dffedc70] [c0731624] ip_local_deliver_finish+0xa8/0x3c4
> > [   34.024315] [dffedcb0] [c0732430] ip_local_deliver+0xf0/0x154
> > [   34.024493] [dffedcf0] [c07328dc] ip_rcv+0x448/0x774
> > [   34.024653] [dffedd50] [c06aeae0] __netif_receive_skb_core+0x5e8/0x1184
> > [   34.024857] [dffedde0] [c06bba20] napi_gro_receive+0x160/0x22c
> > [   34.025044] [dffede10] [e14b2590] gem_poll+0x7fc/0x1ac0 [sungem]
> > [   34.025228] [dffedee0] [c06bacf0] net_rx_action+0x34c/0x618
> > [   34.025402] [dffedf60] [c07fd27c] __do_softirq+0x16c/0x5f0
> > [   34.025575] [dffedfd0] [c0064c7c] irq_exit+0x110/0x1a8
> > [   34.025738] [dffedff0] [c0016170] call_do_irq+0x24/0x3c
> > [   34.025903] [c0cf7e80] [c0009a84] do_IRQ+0x98/0x1a0
> > [   34.026055] [c0cf7eb0] [c001b474] ret_from_except+0x0/0x14
> > [   34.026225] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
> >                    LR = arch_cpu_idle+0x30/0x78
> > [   34.026510] [c0cf7f70] [c0cf6000] 0xc0cf6000 (unreliable)
> > [   34.026682] [c0cf7f80] [c00a3868] do_idle+0xc4/0x158
> > [   34.026835] [c0cf7fb0] [c00a3ab0] cpu_startup_entry+0x20/0x28
> > [   34.027013] [c0cf7fc0] [c0998820] start_kernel+0x47c/0x490
> > [   34.027181] [c0cf7ff0] [00003444] 0x3444
> >
> > See commit 7ce5a27f2ef8 ("Revert "net: Handle CHECKSUM_COMPLETE more
> > adequately in pskb_trim_rcsum()."") for previous reference.
>
> This fix seems to hide a bug in csum functions on this architecture.

That's odd since it seems to only affect g4+sungem user. None of the
ppc64 seems to be having it. And some ppc32 users are not even seeing
it.

> Or a bug on this NIC when receiving a small packet (less than 60 bytes).
> Maybe the padding bytes are not included in NIC provided csum, and not 0.

Ok in that case the bug is located in
./drivers/net/ethernet/sun/sungem.c that seems more likely. I'll try
to understand that code, then.

Thanks

>
>
>

^ permalink raw reply

* Re: [PATCH] net_sched: blackhole: tell upper qdisc about dropped packets
From: Cong Wang @ 2018-06-16  4:03 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Eric Dumazet, Linux Kernel Network Developers, David S. Miller,
	Jiri Pirko, Jamal Hadi Salim
In-Reply-To: <CAM_iQpWtVQnfrj8KKE79ea9s9Goj+biF1LEahNYTcb1PgSff5g@mail.gmail.com>

On Fri, Jun 15, 2018 at 9:00 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Fri, Jun 15, 2018 at 6:21 AM, Konstantin Khlebnikov
> <khlebnikov@yandex-team.ru> wrote:
>> On 15.06.2018 16:13, Eric Dumazet wrote:
>>>
>>>
>>>
>>> On 06/15/2018 03:27 AM, Konstantin Khlebnikov wrote:
>>>>
>>>> When blackhole is used on top of classful qdisc like hfsc it breaks
>>>> qlen and backlog counters because packets are disappear without notice.
>>>>
>>>> In HFSC non-zero qlen while all classes are inactive triggers warning:
>>>> WARNING: ... at net/sched/sch_hfsc.c:1393 hfsc_dequeue+0xba4/0xe90
>>>> [sch_hfsc]
>>>> and schedules watchdog work endlessly.
>>>>
>>>> This patch return __NET_XMIT_BYPASS in addition to NET_XMIT_SUCCESS,
>>>> this flag tells upper layer: this packet is gone and isn't queued.
>>>>
>>>> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>>>> ---
>>>>   net/sched/sch_blackhole.c |    2 +-
>>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
>>>> index c98a61e980ba..9c4c2bb547d7 100644
>>>> --- a/net/sched/sch_blackhole.c
>>>> +++ b/net/sched/sch_blackhole.c
>>>> @@ -21,7 +21,7 @@ static int blackhole_enqueue(struct sk_buff *skb,
>>>> struct Qdisc *sch,
>>>>                              struct sk_buff **to_free)
>>>>   {
>>>>         qdisc_drop(skb, sch, to_free);
>>>> -       return NET_XMIT_SUCCESS;
>>>> +       return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
>>>
>>>
>>> Why do not we use instead :
>>>
>>>         return qdisc_drop(skb, sch, to_free);
>>>
>>> Although noop_enqueue() seems to use :
>>>
>>>         return NET_XMIT_CN;
>>>
>>> Oh well.
>>>
>>>
>>
>> I suppose "blackhole" should work like "successful" xmit, but counted as
>> drop.
>
> But anything !NET_XMIT_SUCCESS is basically same for upper
> layer:
>
>         err = qdisc_enqueue(skb, cl->qdisc, to_free);
>         if (unlikely(err != NET_XMIT_SUCCESS)) {
>                 if (net_xmit_drop_count(err)) {
>                         cl->qstats.drops++;
>                         qdisc_qstats_drop(sch);
>                 }
>                 return err;
>         }
>
> So using NET_XMIT_DROP is same in this case?

Not same for __dev_xmit_skb()... Yeah, we should keep
the NET_XMIT_SUCCESS bit.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox