Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next 2/3] bpf: Add support for writing to nf_conn:mark
From: Daniel Xu @ 2022-08-15 19:35 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, memxor
  Cc: Daniel Xu, pablo, fw, netfilter-devel, netdev, linux-kernel
In-Reply-To: <cover.1660592020.git.dxu@dxuuu.xyz>

Support direct writes to nf_conn:mark from TC and XDP prog types. This
is useful when applications want to store per-connection metadata. This
is also particularly useful for applications that run both bpf and
iptables/nftables because the latter can trivially access this metadata.

One example use case would be if a bpf prog is responsible for advanced
packet classification and iptables/nftables is later used for routing
due to pre-existing/legacy code.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 include/net/netfilter/nf_conntrack_bpf.h | 18 +++++++++
 net/core/filter.c                        | 34 ++++++++++++++++
 net/netfilter/nf_conntrack_bpf.c         | 50 ++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index a473b56842c5..0f584c2bd475 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -3,6 +3,7 @@
 #ifndef _NF_CONNTRACK_BPF_H
 #define _NF_CONNTRACK_BPF_H
 
+#include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/kconfig.h>
 
@@ -10,6 +11,12 @@
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
 
 extern int register_nf_conntrack_bpf(void);
+extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+					  const struct btf *btf,
+					  const struct btf_type *t, int off,
+					  int size, enum bpf_access_type atype,
+					  u32 *next_btf_id,
+					  enum bpf_type_flag *flag);
 
 #else
 
@@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void)
 	return 0;
 }
 
+static inline int
+nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+			       const struct btf *btf,
+			       const struct btf_type *t, int off,
+			       int size, enum bpf_access_type atype,
+			       u32 *next_btf_id,
+			       enum bpf_type_flag *flag)
+{
+	return -EACCES;
+}
+
 #endif
 
 #endif /* _NF_CONNTRACK_BPF_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index 5669248aff25..d7b768fe9de7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/mm.h>
@@ -55,6 +56,7 @@
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
 #include <net/tcp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/xfrm.h>
 #include <net/udp.h>
 #include <linux/bpf_trace.h>
@@ -8710,6 +8712,21 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+					const struct btf *btf,
+					const struct btf_type *t, int off,
+					int size, enum bpf_access_type atype,
+					u32 *next_btf_id,
+					enum bpf_type_flag *flag)
+{
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
+					      next_btf_id, flag);
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -8769,6 +8786,21 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+				 const struct btf *btf,
+				 const struct btf_type *t, int off,
+				 int size, enum bpf_access_type atype,
+				 u32 *next_btf_id,
+				 enum bpf_type_flag *flag)
+{
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
+					      next_btf_id, flag);
+}
+
 static bool sock_addr_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
 				      const struct bpf_prog *prog,
@@ -10663,6 +10695,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
+	.btf_struct_access	= tc_cls_act_btf_struct_access,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10674,6 +10707,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
 	.gen_prologue		= bpf_noop_prologue,
+	.btf_struct_access	= xdp_btf_struct_access,
 };
 
 const struct bpf_prog_ops xdp_prog_ops = {
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 1cd87b28c9b0..8010cc542d17 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -6,6 +6,7 @@
  * are exposed through to BPF programs is explicitly unstable.
  */
 
+#include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/types.h>
@@ -15,6 +16,8 @@
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_core.h>
 
+static const struct btf_type *nf_conn_type;
+
 /* bpf_ct_opts - Options for CT lookup helpers
  *
  * Members:
@@ -184,6 +187,53 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 	return ct;
 }
 
+/* Check writes into `struct nf_conn` */
+int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+				   const struct btf *btf,
+				   const struct btf_type *t, int off,
+				   int size, enum bpf_access_type atype,
+				   u32 *next_btf_id,
+				   enum bpf_type_flag *flag)
+{
+	const struct btf_type *nct = READ_ONCE(nf_conn_type);
+	s32 type_id;
+	size_t end;
+
+	if (!nct) {
+		type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
+		if (type_id < 0)
+			return -EINVAL;
+
+		nct = btf_type_by_id(btf, type_id);
+		WRITE_ONCE(nf_conn_type, nct);
+	}
+
+	if (t != nct) {
+		bpf_log(log, "only read is supported\n");
+		return -EACCES;
+	}
+
+	switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	case offsetof(struct nf_conn, mark):
+		end = offsetofend(struct nf_conn, mark);
+		break;
+#endif
+	default:
+		bpf_log(log, "no write support to nf_conn at off %d\n", off);
+		return -EACCES;
+	}
+
+	if (off + size > end) {
+		bpf_log(log,
+			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+			off, size, end);
+		return -EACCES;
+	}
+
+	return NOT_INIT;
+}
+
 __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");
-- 
2.37.1


^ permalink raw reply related

* [PATCH bpf-next 3/3] selftests/bpf: Add tests for writing to nf_conn:mark
From: Daniel Xu @ 2022-08-15 19:35 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, memxor
  Cc: Daniel Xu, pablo, fw, netfilter-devel, netdev, linux-kernel
In-Reply-To: <cover.1660592020.git.dxu@dxuuu.xyz>

Add a simple extension to the existing selftest to write to
nf_conn:mark. Also add a failure test for writing to unsupported field.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c    |  1 +
 tools/testing/selftests/bpf/progs/test_bpf_nf.c    |  6 ++++--
 .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 14 ++++++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index 544bf90ac2a7..45389c39f211 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -17,6 +17,7 @@ struct {
 	{ "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" },
 	{ "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" },
 	{ "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" },
+	{ "write_not_allowlisted_field", "no write support to nf_conn at off" },
 };
 
 enum {
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index 2722441850cc..638b6642d20f 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -175,8 +175,10 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 		       sizeof(opts_def));
 	if (ct) {
 		test_exist_lookup = 0;
-		if (ct->mark == 42)
-			test_exist_lookup_mark = 43;
+		if (ct->mark == 42) {
+			ct->mark++;
+			test_exist_lookup_mark = ct->mark;
+		}
 		bpf_ct_release(ct);
 	} else {
 		test_exist_lookup = opts_def.error;
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
index bf79af15c808..0e4759ab38ff 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
@@ -69,6 +69,20 @@ int lookup_insert(struct __sk_buff *ctx)
 	return 0;
 }
 
+SEC("?tc")
+int write_not_allowlisted_field(struct __sk_buff *ctx)
+{
+	struct bpf_ct_opts___local opts = {};
+	struct bpf_sock_tuple tup = {};
+	struct nf_conn *ct;
+
+	ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
+	if (!ct)
+		return 0;
+	ct->status = 0xF00;
+	return 0;
+}
+
 SEC("?tc")
 int set_timeout_after_insert(struct __sk_buff *ctx)
 {
-- 
2.37.1


^ permalink raw reply related

* [PATCH bpf-next 0/3] Support direct writes to nf_conn:mark
From: Daniel Xu @ 2022-08-15 19:35 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, memxor
  Cc: Daniel Xu, pablo, fw, netfilter-devel, netdev, linux-kernel

Support direct writes to nf_conn:mark from TC and XDP prog types. This
is useful when applications want to store per-connection metadata. This
is also particularly useful for applications that run both bpf and
iptables/nftables because the latter can trivially access this metadata.

One example use case would be if a bpf prog is responsible for advanced
packet classification and iptables/nftables is later used for routing
due to pre-existing/legacy code.

Daniel Xu (3):
  bpf: Remove duplicate PTR_TO_BTF_ID RO check
  bpf: Add support for writing to nf_conn:mark
  selftests/bpf: Add tests for writing to nf_conn:mark

 include/net/netfilter/nf_conntrack_bpf.h      | 18 +++++++
 kernel/bpf/verifier.c                         |  3 --
 net/core/filter.c                             | 34 +++++++++++++
 net/netfilter/nf_conntrack_bpf.c              | 50 +++++++++++++++++++
 .../testing/selftests/bpf/prog_tests/bpf_nf.c |  1 +
 .../testing/selftests/bpf/progs/test_bpf_nf.c |  6 ++-
 .../selftests/bpf/progs/test_bpf_nf_fail.c    | 14 ++++++
 7 files changed, 121 insertions(+), 5 deletions(-)

-- 
2.37.1

^ permalink raw reply

* [PATCH bpf-next 1/3] bpf: Remove duplicate PTR_TO_BTF_ID RO check
From: Daniel Xu @ 2022-08-15 19:35 UTC (permalink / raw)
  To: bpf, ast, daniel, andrii, memxor
  Cc: Daniel Xu, pablo, fw, netfilter-devel, netdev, linux-kernel
In-Reply-To: <cover.1660592020.git.dxu@dxuuu.xyz>

Since commit 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS")
there has existed bpf_verifier_ops:btf_struct_access. When
btf_struct_access is _unset_ for a prog type, the verifier runs the
default implementation, which is to enforce read only:

        if (env->ops->btf_struct_access) {
                [...]
        } else {
                if (atype != BPF_READ) {
                        verbose(env, "only read is supported\n");
                        return -EACCES;
                }

                [...]
        }

When btf_struct_access is _set_, the expectation is that
btf_struct_access has full control over accesses, including if writes
are allowed.

Rather than carve out an exception for each prog type that may write to
BTF ptrs, delete the redundant check and give full control to
btf_struct_access.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 kernel/bpf/verifier.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2c1f8069f7b7..ca2311bf0cfd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13474,9 +13474,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				insn->code = BPF_LDX | BPF_PROBE_MEM |
 					BPF_SIZE((insn)->code);
 				env->prog->aux->num_exentries++;
-			} else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
-				verbose(env, "Writes through BTF pointers are not allowed\n");
-				return -EINVAL;
 			}
 			continue;
 		default:
-- 
2.37.1


^ permalink raw reply related

* Re: [PATCH v2] wifi: cfg80211: Fix UAF in ieee80211_scan_rx()
From: Johannes Berg @ 2022-08-15 19:58 UTC (permalink / raw)
  To: Jakub Kicinski, Siddh Raman Pant
  Cc: greg kh, david s. miller, eric dumazet, paolo abeni, netdev,
	linux-kernel-mentees
In-Reply-To: <20220815094722.3c275087@kernel.org>

On Mon, 2022-08-15 at 09:47 -0700, Jakub Kicinski wrote:
> On Sat, 13 Aug 2022 21:49:52 +0530 Siddh Raman Pant wrote:
> > On Sat, 13 Aug 2022 00:55:09 +0530  Jakub Kicinski  wrote:
> > > Similarly to Greg, I'm not very familiar with the code base but one
> > > sure way to move things forward would be to point out a commit which
> > > broke things and put it in a Fixes tag. Much easier to validate a fix
> > > by looking at where things went wrong.  
> > 
> > Thanks, I now looked at some history.
> > 
> > The following commit on 28 Sep 2020 put the kfree call before NULLing:
> > c8cb5b854b40 ("nl80211/cfg80211: support 6 GHz scanning")
> > 
> > The following commit on 19 Nov 2014 introduces RCU:
> > 6ea0a69ca21b ("mac80211: rcu-ify scan and scheduled scan request pointers")
> > 
> > The kfree call wasn't "rcu-ified" in this commit, and neither were
> > RCU heads added.
> > 
> > The following commit on 18 Dec 2014 added RCU head for sched_scan_req:
> > 31a60ed1e95a ("nl80211: Convert sched_scan_req pointer to RCU pointer")
> > 
> > It seems a similar thing might not have been done for scan_req, but I
> > could have also missed commits.
> > 
> > So what should go into the fixes tag, if any? Probably 6ea0a69ca21b?
> 
> That'd be my instinct, too. But do add the full history analysis 
> to the commit message.
> 
> > Also, I probably should use RCU_INIT_POINTER in this patch. Or should
> > I make a patch somewhat like 31a60ed1e95a?
> 
> Yeah, IDK, I'm confused on what the difference between rdev and local
> is. The crash site reads the pointer from local, so other of clearing
> the pointer on rdev should not matter there. Hopefully wireless folks
> can chime in on v3.

Sorry everyone, I always thought "this looks odd" and then never got
around to taking a closer look.

So yeah, I still think this looks odd - cfg80211 doesn't really know
anything about how mac80211 might be doing something with RCU to protect
the pointer.

The patch also leaves the NULL-ing in mac80211 (that is how we reach it)
broken wrt. the kfree_rcu() since it doesn't happen _before_, and the
pointer in rdev isn't how this is reached through RCU (it's not even
__rcu annotated).

I think it might be conceptually better, though not faster, to do
something like https://p.sipsolutions.net/1d23837f455dc4c2.txt which
ensures that from mac80211's POV it can no longer be reached before we
call cfg80211_scan_done().

Yeah, that's slower, but scanning is still a relatively infrequent (and
slow anyway) operation, and this way we can stick to "this is not used
once you call cfg80211_scan_done()" which just makes much more sense?

johannes

^ permalink raw reply

* Re: [RFC net-next 3/4] ynl: add a sample python library
From: Johannes Berg @ 2022-08-15 20:00 UTC (permalink / raw)
  To: Jakub Kicinski, Stephen Hemminger
  Cc: netdev, davem, edumazet, pabeni, sdf, jacob.e.keller, vadfed,
	jiri, dsahern, fw, linux-doc
In-Reply-To: <20220812155308.520831bb@kernel.org>

On Fri, 2022-08-12 at 15:53 -0700, Jakub Kicinski wrote:
> On Thu, 11 Aug 2022 13:09:06 -0700 Stephen Hemminger wrote:
> > Looks interesting, you might want to consider running your code
> > through some of the existing Python checkers such as flake8 and pylint.
> > If you want this to be generally available in repos, best to follow the language conventions
> > 
> > For example flake8 noticed:
> >  $ flake8 --max-line-length=120 ./tools/net/ynl/samples/ynl.py 
> > ./tools/net/ynl/samples/ynl.py:251:55: F821 undefined name 'file_name'
> 
> Thanks! I'll make sure to check flake8 (pylint is too noisy for me :()

FWIW, I've come to really believe in also adding type annotations (and
checking them with mypy, of course, I even use --strict), which has
helped even my smaller projects a lot. YMMV, but it might be something
to look into.

johannes

^ permalink raw reply

* Re: [RFC net-next 2/4] ynl: add the schema for the schemas
From: Johannes Berg @ 2022-08-15 20:03 UTC (permalink / raw)
  To: Jakub Kicinski, netdev, davem, edumazet, pabeni
  Cc: sdf, jacob.e.keller, vadfed, jiri, dsahern, stephen, fw,
	linux-doc
In-Reply-To: <20220811022304.583300-3-kuba@kernel.org>

On Wed, 2022-08-10 at 19:23 -0700, Jakub Kicinski wrote:
> 
> +        subspace-of:
> +          description: |
> +            Name of another space which this is a logical part of. Sub-spaces can be used to define
> +            a limitted group of attributes which are used in a nest.

limited

johannes

^ permalink raw reply

* Re: [PATCH 2/6] vsock: return errors other than -ENOMEM to socket
From: kernel test robot @ 2022-08-15 20:01 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: kbuild-all, Bobby Eshleman, Cong Wang, Jiang Wang,
	Stefan Hajnoczi, Stefano Garzarella, Michael S. Tsirkin,
	Jason Wang, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	K. Y. Srinivasan, Haiyang Zhang, Stephen Hemminger, Wei Liu,
	Dexuan Cui, kvm, virtualization, netdev, linux-kernel,
	linux-hyperv
In-Reply-To: <d81818b868216c774613dd03641fcfe63cc55a45.1660362668.git.bobby.eshleman@bytedance.com>

Hi Bobby,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on mst-vhost/linux-next]
[also build test WARNING on linus/master v6.0-rc1 next-20220815]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Bobby-Eshleman/virtio-vsock-introduce-dgrams-sk_buff-and-qdisc/20220816-015812
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20220816/202208160300.HYFUSTbF-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/68c9c8216a573cdfe2170cad677854e2f4a34634
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Bobby-Eshleman/virtio-vsock-introduce-dgrams-sk_buff-and-qdisc/20220816-015812
        git checkout 68c9c8216a573cdfe2170cad677854e2f4a34634
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash net/vmw_vsock/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> net/vmw_vsock/virtio_transport.c:178: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
    * Merge the two most recent skbs together if possible.


vim +178 net/vmw_vsock/virtio_transport.c

93afaf2cdefaa9 Bobby Eshleman 2022-08-15  176  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  177  /**
93afaf2cdefaa9 Bobby Eshleman 2022-08-15 @178   * Merge the two most recent skbs together if possible.
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  179   *
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  180   * Caller must hold the queue lock.
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  181   */
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  182  static void
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  183  virtio_transport_add_to_queue(struct sk_buff_head *queue, struct sk_buff *new)
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  184  {
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  185  	struct sk_buff *old;
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  186  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  187  	spin_lock_bh(&queue->lock);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  188  	/* In order to reduce skb memory overhead, we merge new packets with
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  189  	 * older packets if they pass virtio_transport_skbs_can_merge().
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  190  	 */
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  191  	if (skb_queue_empty_lockless(queue)) {
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  192  		__skb_queue_tail(queue, new);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  193  		goto out;
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  194  	}
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  195  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  196  	old = skb_peek_tail(queue);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  197  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  198  	if (!virtio_transport_skbs_can_merge(old, new)) {
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  199  		__skb_queue_tail(queue, new);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  200  		goto out;
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  201  	}
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  202  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  203  	memcpy(skb_put(old, new->len), new->data, new->len);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  204  	vsock_hdr(old)->len = cpu_to_le32(old->len);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  205  	vsock_hdr(old)->buf_alloc = vsock_hdr(new)->buf_alloc;
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  206  	vsock_hdr(old)->fwd_cnt = vsock_hdr(new)->fwd_cnt;
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  207  	dev_kfree_skb_any(new);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  208  
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  209  out:
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  210  	spin_unlock_bh(&queue->lock);
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  211  }
93afaf2cdefaa9 Bobby Eshleman 2022-08-15  212  

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply

* [PATCH 5.18 1046/1095] batman-adv: tracing: Use the new __vstring() helper
From: Greg Kroah-Hartman @ 2022-08-15 18:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, stable, Marek Lindner, Ingo Molnar,
	Andrew Morton, Simon Wunderlich, Antonio Quartulli,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	b.a.t.m.a.n, netdev, Sven Eckelmann, Steven Rostedt (Google),
	Sasha Levin
In-Reply-To: <20220815180429.240518113@linuxfoundation.org>

From: Steven Rostedt (Google) <rostedt@goodmis.org>

[ Upstream commit 9abc291812d784bd4a26c01af4ebdbf9f2dbf0bb ]

Instead of open coding a __dynamic_array() with a fixed length (which
defeats the purpose of the dynamic array in the first place). Use the new
__vstring() helper that will use a va_list and only write enough of the
string into the ring buffer that is needed.

Link: https://lkml.kernel.org/r/20220724191650.236b1355@rorschach.local.home

Cc: Marek Lindner <mareklindner@neomailbox.ch>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Cc: Antonio Quartulli <a@unstable.cc>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: b.a.t.m.a.n@lists.open-mesh.org
Cc: netdev@vger.kernel.org
Acked-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 net/batman-adv/trace.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
index d673ebdd0426..31c8f922651d 100644
--- a/net/batman-adv/trace.h
+++ b/net/batman-adv/trace.h
@@ -28,8 +28,6 @@
 
 #endif /* CONFIG_BATMAN_ADV_TRACING */
 
-#define BATADV_MAX_MSG_LEN	256
-
 TRACE_EVENT(batadv_dbg,
 
 	    TP_PROTO(struct batadv_priv *bat_priv,
@@ -40,16 +38,13 @@ TRACE_EVENT(batadv_dbg,
 	    TP_STRUCT__entry(
 		    __string(device, bat_priv->soft_iface->name)
 		    __string(driver, KBUILD_MODNAME)
-		    __dynamic_array(char, msg, BATADV_MAX_MSG_LEN)
+		    __vstring(msg, vaf->fmt, vaf->va)
 	    ),
 
 	    TP_fast_assign(
 		    __assign_str(device, bat_priv->soft_iface->name);
 		    __assign_str(driver, KBUILD_MODNAME);
-		    WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
-					   BATADV_MAX_MSG_LEN,
-					   vaf->fmt,
-					   *vaf->va) >= BATADV_MAX_MSG_LEN);
+		    __assign_vstr(msg, vaf->fmt, vaf->va);
 	    ),
 
 	    TP_printk(
-- 
2.35.1




^ permalink raw reply related

* Re: [RFC net-next 2/4] ynl: add the schema for the schemas
From: Johannes Berg @ 2022-08-15 20:09 UTC (permalink / raw)
  To: Jakub Kicinski, netdev, davem, edumazet, pabeni
  Cc: sdf, jacob.e.keller, vadfed, jiri, dsahern, stephen, fw,
	linux-doc
In-Reply-To: <20220811022304.583300-3-kuba@kernel.org>

On Wed, 2022-08-10 at 19:23 -0700, Jakub Kicinski wrote:
> 
> +        attributes:
> +          description: List of attributes in the space.
> +          type: array
> +          items:
> +            type: object
> +            required: [ name, type ]
> +            additionalProperties: False
> +            properties:
> +              name:
> +                type: string
> +              type: &attr-type
> +                enum: [ unused, flag, binary, u8, u16, u32, u64, s32, s64,
> +                        nul-string, multi-attr, nest, array-nest, nest-type-value ]

nest-type-value?

> +              description:
> +                description: Documentation of the attribute.
> +                type: string
> +              type-value:
> +                description: Name of the value extracted from the type of a nest-type-value attribute.
> +                type: array
> +                items:
> +                  type: string
> +              len:
> +                oneOf: [ { type: string }, { type: integer }]
> +              sub-type: *attr-type
> +              nested-attributes:
> +                description: Name of the space (sub-space) used inside the attribute.
> +                type: string

Maybe expand that description a bit, it's not really accurate for
"array-nest"?

> +              enum:
> +                description: Name of the enum used for the atttribute.

typo - attribute

Do you mean the "name of the enumeration" or the "name of the
enumeration constant"? (per C99 concepts) I'm a bit confused? I guess
you mean the "name of the enumeration constant" though I agree most
people probably don't know the names from C99 (I had to look them up too
for the sake of being precise here ...)

johannes

^ permalink raw reply

* Re: [RFC net-next 1/4] ynl: add intro docs for the concept
From: Johannes Berg @ 2022-08-15 20:09 UTC (permalink / raw)
  To: Jakub Kicinski, netdev, davem, edumazet, pabeni
  Cc: sdf, jacob.e.keller, vadfed, jiri, dsahern, stephen, fw,
	linux-doc
In-Reply-To: <20220811022304.583300-2-kuba@kernel.org>

On Wed, 2022-08-10 at 19:23 -0700, Jakub Kicinski wrote:
> 
> +Note that attribute spaces do not themselves nest, nested attributes refer to their internal
> +space via a ``nested-attributes`` property, so the YAML spec does not resemble the format
> +of the netlink messages directly.

I find this a bit ... confusing.

I think reading the other patch I know what you mean, but if I think of
this I think more of the policy declarations than the message itself,
and there we do refer to another policy?

Maybe reword a bit and say

   Note that attribute spaces do not themselves nest, nested attributes
   refer to their internal space via a ``nested-attributes`` property
   (the name of another or the same attribute space).

or something?

johannes

^ permalink raw reply

* Re: upstream kernel crashes
From: Michael S. Tsirkin @ 2022-08-15 20:21 UTC (permalink / raw)
  To: Andres Freund
  Cc: Xuan Zhuo, Jason Wang, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, virtualization, netdev,
	Linus Torvalds, Jens Axboe, James Bottomley, Martin K. Petersen,
	Guenter Roeck, linux-kernel, Greg KH, c
In-Reply-To: <20220815174617.z4chnftzcbv6frqr@awork3.anarazel.de>

On Mon, Aug 15, 2022 at 10:46:17AM -0700, Andres Freund wrote:
> Hi,
> 
> On 2022-08-15 12:50:52 -0400, Michael S. Tsirkin wrote:
> > On Mon, Aug 15, 2022 at 09:45:03AM -0700, Andres Freund wrote:
> > > Hi,
> > > 
> > > On 2022-08-15 11:40:59 -0400, Michael S. Tsirkin wrote:
> > > > OK so this gives us a quick revert as a solution for now.
> > > > Next, I would appreciate it if you just try this simple hack.
> > > > If it crashes we either have a long standing problem in virtio
> > > > code or more likely a gcp bug where it can't handle smaller
> > > > rings than what device requestes.
> > > > Thanks!
> > > 
> > > I applied the below and the problem persists.
> > > 
> > > [...]
> >
> > Okay!
> 
> Just checking - I applied and tested this atop 6.0-rc1, correct? Or did you
> want me to test it with the 762faee5a267 reverted? I guess what you're trying
> to test if a smaller queue than what's requested you'd want to do so without
> the problematic patch applied...
> 
> 
> > And just to be 100% sure, can you try the following on top of 5.19:
> 
> > diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
> > index 623906b4996c..6f4e54a618bc 100644
> > --- a/drivers/virtio/virtio_pci_modern.c
> > +++ b/drivers/virtio/virtio_pci_modern.c
> > @@ -208,6 +208,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
> >  		return ERR_PTR(-EINVAL);
> >  	}
> >  
> > +	if (num > 1024)
> > +		num = 1024;
> > +
> >  	info->msix_vector = msix_vec;
> >  
> >  	/* create the vring */
> > 
> > -- 
> 
> Either way, I did this, and there are no issues that I could observe. No
> oopses, no broken networking. But:
> 
> To make sure it does something I added a debugging printk - which doesn't show
> up. I assume this is at a point at least earlyprintk should work (which I see
> getting enabled via serial)?
> 
> Greetings,
> 
> Andres Freund


Sorry if I was unclear.  I wanted to know whether the change somehow
exposes a driver bug or a GCP bug. So what I wanted to do is to test
this patch on top of *5.19*, not on top of the revert.
The idea is if we reduce the size and it starts crashing then
we know it's GCP fault, if not then GCP can handle smaller sizes
and it's one of the driver changes.

It will apply on top of the revert but won't do much.

Yes I think printk should work here.

-- 
MST


^ permalink raw reply

* [PATCH net-next] tcp: Make SYN ACK RTO tunable by BPF programs with TFO
From: Jie Meng @ 2022-08-15 20:29 UTC (permalink / raw)
  To: netdev; +Cc: kafai, kuba, edumazet, bpf, Jie Meng

Instead of the hardcoded TCP_TIMEOUT_INIT, this diff calls tcp_timeout_init
to initiate req->timeout like the non TFO SYN ACK case.

Tested using the following packetdrill script, on a host with a BPF
program that sets the initial connect timeout to 10ms.

`../../common/defaults.sh`

// Initialize connection
    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,FO TFO_COOKIE>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
   +.01 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
   +.02 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
   +.04 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK>
   +.01 < . 1:1(0) ack 1 win 32792

   +0 accept(3, ..., ...) = 4

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 net/ipv4/tcp_fastopen.c | 3 ++-
 net/ipv4/tcp_timer.c    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 825b216d11f5..45cc7f1ca296 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -272,8 +272,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	 * The request socket is not added to the ehash
 	 * because it's been added to the accept queue directly.
 	 */
+	req->timeout = tcp_timeout_init(child);
 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
-				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+				  req->timeout, TCP_RTO_MAX);
 
 	refcount_set(&req->rsk_refcnt, 2);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b4dfb82d6ecb..cb79127f45c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -428,7 +428,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	if (!tp->retrans_stamp)
 		tp->retrans_stamp = tcp_time_stamp(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-			  TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+			  req->timeout << req->num_timeout, TCP_RTO_MAX);
 }
 
 
-- 
2.30.2


^ permalink raw reply related

* Re: [PATCH] virtio_net: Revert "virtio_net: set the default max ring size by find_vqs()"
From: Guenter Roeck @ 2022-08-15 20:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Xuan Zhuo, Jason Wang, Andres Freund,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	virtualization, netdev, Linus Torvalds, Jens Axboe,
	James Bottomley, Martin K. Petersen, Greg KH, c
In-Reply-To: <20220815090521.127607-1-mst@redhat.com>

On Mon, Aug 15, 2022 at 05:16:50AM -0400, Michael S. Tsirkin wrote:
> This reverts commit 762faee5a2678559d3dc09d95f8f2c54cd0466a7.
> 
> This has been reported to trip up guests on GCP (Google Cloud).  Why is
> not yet clear - to be debugged, but the patch itself has several other
> issues:
> 
> - It treats unknown speed as < 10G
> - It leaves userspace no way to find out the ring size set by hypervisor
> - It tests speed when link is down
> - It ignores the virtio spec advice:
>         Both \field{speed} and \field{duplex} can change, thus the driver
>         is expected to re-read these values after receiving a
>         configuration change notification.
> - It is not clear the performance impact has been tested properly
> 
> Revert the patch for now.
> 
> Link: https://lore.kernel.org/r/20220814212610.GA3690074%40roeck-us.net
> Link: https://lore.kernel.org/r/20220815070203.plwjx7b3cyugpdt7%40awork3.anarazel.de
> Link: https://lore.kernel.org/r/3df6bb82-1951-455d-a768-e9e1513eb667%40www.fastmail.com
> Link: https://lore.kernel.org/r/FCDC5DDE-3CDD-4B8A-916F-CA7D87B547CE%40anarazel.de
> Fixes: 762faee5a267 ("virtio_net: set the default max ring size by find_vqs()")
> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> Tested-by: Andres Freund <andres@anarazel.de>

I ran this patch through a total of 14 syskaller tests, 2 test runs each on
7 different crashes reported by syzkaller (as reported to the linux-kernel
mailing list). No problems were reported. I also ran a single cross-check
with one of the syzkaller runs on top of v6.0-rc1, without this patch.
That test run failed.

Overall, I think we can call this fixed.

Guenter

---
syskaller reports:

Reported-and-tested-by: syzbot+2984d1b7aef6b51353f0@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=3b9175e0879a7749
dashboard link: https://syzkaller.appspot.com/bug?extid=2984d1b7aef6b51353f0
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
userspace arch: i386
patch:          https://syzkaller.appspot.com/x/patch.diff?x=11949fc3080000

Reported-and-tested-by: syzbot+2c35c4d66094ddfe198e@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=3cb39b084894e9a5
dashboard link: https://syzkaller.appspot.com/bug?extid=2c35c4d66094ddfe198e
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=163e20f3080000

Reported-and-tested-by: syzbot+97f830ad641de86d08c0@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=f267ed4fb258122a
dashboard link: https://syzkaller.appspot.com/bug?extid=97f830ad641de86d08c0
compiler:       Debian clang version 13.0.1-++20220126092033+75e33f71c2da-1~exp1~20220126212112.63, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=146c8e5b080000

Reported-and-tested-by: syzbot+005efde5e97744047fe4@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=3cb39b084894e9a5
dashboard link: https://syzkaller.appspot.com/bug?extid=005efde5e97744047fe4
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=106c8e5b080000

Reported-and-tested-by: syzbot+9ada839c852179f13999@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=3b9175e0879a7749
dashboard link: https://syzkaller.appspot.com/bug?extid=9ada839c852179f13999
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=118756f3080000

Reported-and-tested-by: syzbot+382af021ce115a936b1f@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=e656d8727a25e83b
dashboard link: https://syzkaller.appspot.com/bug?extid=382af021ce115a936b1f
compiler:       Debian clang version 13.0.1-++20220126092033+75e33f71c2da-1~exp1~20220126212112.63, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=135f650d080000

Reported-and-tested-by: syzbot+24df94a8d05d5a3e68f0@syzkaller.appspotmail.com

Tested on:

commit:         568035b0 Linux 6.0-rc1
git tree:       git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v6.0-rc1
kernel config:  https://syzkaller.appspot.com/x/.config?x=3b9175e0879a7749
dashboard link: https://syzkaller.appspot.com/bug?extid=24df94a8d05d5a3e68f0
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
patch:          https://syzkaller.appspot.com/x/patch.diff?x=12758a47080000

^ permalink raw reply

* Re: [PATCH 0/6] virtio/vsock: introduce dgrams, sk_buff, and qdisc
From: Michael S. Tsirkin @ 2022-08-15 20:39 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: Bobby Eshleman, Bobby Eshleman, Cong Wang, Jiang Wang,
	Stefan Hajnoczi, Stefano Garzarella, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, K. Y. Srinivasan,
	Haiyang Zhang, Stephen Hemminger, Wei Liu, Dexuan Cui, kvm,
	virtualization, netdev, linux-kernel, linux-hyperv
In-Reply-To: <cover.1660362668.git.bobby.eshleman@bytedance.com>

On Mon, Aug 15, 2022 at 10:56:03AM -0700, Bobby Eshleman wrote:
> Hey everybody,
> 
> This series introduces datagrams, packet scheduling, and sk_buff usage
> to virtio vsock.
> 
> The usage of struct sk_buff benefits users by a) preparing vsock to use
> other related systems that require sk_buff, such as sockmap and qdisc,
> b) supporting basic congestion control via sock_alloc_send_skb, and c)
> reducing copying when delivering packets to TAP.
> 
> The socket layer no longer forces errors to be -ENOMEM, as typically
> userspace expects -EAGAIN when the sk_sndbuf threshold is reached and
> messages are being sent with option MSG_DONTWAIT.
> 
> The datagram work is based off previous patches by Jiang Wang[1].
> 
> The introduction of datagrams creates a transport layer fairness issue
> where datagrams may freely starve streams of queue access. This happens
> because, unlike streams, datagrams lack the transactions necessary for
> calculating credits and throttling.
> 
> Previous proposals introduce changes to the spec to add an additional
> virtqueue pair for datagrams[1]. Although this solution works, using
> Linux's qdisc for packet scheduling leverages already existing systems,
> avoids the need to change the virtio specification, and gives additional
> capabilities. The usage of SFQ or fq_codel, for example, may solve the
> transport layer starvation problem. It is easy to imagine other use
> cases as well. For example, services of varying importance may be
> assigned different priorities, and qdisc will apply appropriate
> priority-based scheduling. By default, the system default pfifo qdisc is
> used. The qdisc may be bypassed and legacy queuing is resumed by simply
> setting the virtio-vsock%d network device to state DOWN. This technique
> still allows vsock to work with zero-configuration.
> 
> In summary, this series introduces these major changes to vsock:
> 
> - virtio vsock supports datagrams
> - virtio vsock uses struct sk_buff instead of virtio_vsock_pkt
>   - Because virtio vsock uses sk_buff, it also uses sock_alloc_send_skb,
>     which applies the throttling threshold sk_sndbuf.
> - The vsock socket layer supports returning errors other than -ENOMEM.
>   - This is used to return -EAGAIN when the sk_sndbuf threshold is
>     reached.
> - virtio vsock uses a net_device, through which qdisc may be used.
>  - qdisc allows scheduling policies to be applied to vsock flows.
>   - Some qdiscs, like SFQ, may allow vsock to avoid transport layer congestion. That is,
>     it may avoid datagrams from flooding out stream flows. The benefit
>     to this is that additional virtqueues are not needed for datagrams.
>   - The net_device and qdisc is bypassed by simply setting the
>     net_device state to DOWN.
> 
> [1]: https://lore.kernel.org/all/20210914055440.3121004-1-jiang.wang@bytedance.com/

Given this affects the driver/device interface I'd like to
ask you to please copy virtio-dev mailing list on these patches.
Subscriber only I'm afraid you will need to subscribe :(

> Bobby Eshleman (5):
>   vsock: replace virtio_vsock_pkt with sk_buff
>   vsock: return errors other than -ENOMEM to socket
>   vsock: add netdev to vhost/virtio vsock
>   virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit
>   virtio/vsock: add support for dgram
> 
> Jiang Wang (1):
>   vsock_test: add tests for vsock dgram
> 
>  drivers/vhost/vsock.c                   | 238 ++++----
>  include/linux/virtio_vsock.h            |  73 ++-
>  include/net/af_vsock.h                  |   2 +
>  include/uapi/linux/virtio_vsock.h       |   2 +
>  net/vmw_vsock/af_vsock.c                |  30 +-
>  net/vmw_vsock/hyperv_transport.c        |   2 +-
>  net/vmw_vsock/virtio_transport.c        | 237 +++++---
>  net/vmw_vsock/virtio_transport_common.c | 771 ++++++++++++++++--------
>  net/vmw_vsock/vmci_transport.c          |   9 +-
>  net/vmw_vsock/vsock_loopback.c          |  51 +-
>  tools/testing/vsock/util.c              | 105 ++++
>  tools/testing/vsock/util.h              |   4 +
>  tools/testing/vsock/vsock_test.c        | 195 ++++++
>  13 files changed, 1176 insertions(+), 543 deletions(-)
> 
> -- 
> 2.35.1


^ permalink raw reply

* Re: upstream kernel crashes
From: Guenter Roeck @ 2022-08-15 20:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Andres Freund, Xuan Zhuo, Jason Wang, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, virtualization, netdev,
	Linus Torvalds, Jens Axboe, James Bottomley, Martin K. Petersen,
	linux-kernel, Greg KH, c
In-Reply-To: <20220815124748-mutt-send-email-mst@kernel.org>

Michael,

On Mon, Aug 15, 2022 at 12:50:52PM -0400, Michael S. Tsirkin wrote:
[ ...]
> 
> Okay! And just to be 100% sure, can you try the following on top of 5.19:
> 

You should now be able to test any patches using the syzkaller
infrastructure. Pick any (or all) of the now-published syzkaller
reports from the linux-kernel mailing list, reply with:

#syz test git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git v5.19

and provide your patch as attachment.

Guenter

^ permalink raw reply

* Re: [PATCH] virtio_net: Revert "virtio_net: set the default max ring size by find_vqs()"
From: Guenter Roeck @ 2022-08-15 20:50 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Xuan Zhuo, Jason Wang, Andres Freund,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	virtualization, netdev, Linus Torvalds, Jens Axboe,
	James Bottomley, Martin K. Petersen, Greg KH, c
In-Reply-To: <20220815164013-mutt-send-email-mst@kernel.org>

On Mon, Aug 15, 2022 at 04:42:51PM -0400, Michael S. Tsirkin wrote:
> On Mon, Aug 15, 2022 at 01:34:26PM -0700, Guenter Roeck wrote:
> > On Mon, Aug 15, 2022 at 05:16:50AM -0400, Michael S. Tsirkin wrote:
> > > This reverts commit 762faee5a2678559d3dc09d95f8f2c54cd0466a7.
> > > 
> > > This has been reported to trip up guests on GCP (Google Cloud).  Why is
> > > not yet clear - to be debugged, but the patch itself has several other
> > > issues:
> > > 
> > > - It treats unknown speed as < 10G
> > > - It leaves userspace no way to find out the ring size set by hypervisor
> > > - It tests speed when link is down
> > > - It ignores the virtio spec advice:
> > >         Both \field{speed} and \field{duplex} can change, thus the driver
> > >         is expected to re-read these values after receiving a
> > >         configuration change notification.
> > > - It is not clear the performance impact has been tested properly
> > > 
> > > Revert the patch for now.
> > > 
> > > Link: https://lore.kernel.org/r/20220814212610.GA3690074%40roeck-us.net
> > > Link: https://lore.kernel.org/r/20220815070203.plwjx7b3cyugpdt7%40awork3.anarazel.de
> > > Link: https://lore.kernel.org/r/3df6bb82-1951-455d-a768-e9e1513eb667%40www.fastmail.com
> > > Link: https://lore.kernel.org/r/FCDC5DDE-3CDD-4B8A-916F-CA7D87B547CE%40anarazel.de
> > > Fixes: 762faee5a267 ("virtio_net: set the default max ring size by find_vqs()")
> > > Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > Cc: Jason Wang <jasowang@redhat.com>
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > Tested-by: Andres Freund <andres@anarazel.de>
> > 
> > I ran this patch through a total of 14 syskaller tests, 2 test runs each on
> > 7 different crashes reported by syzkaller (as reported to the linux-kernel
> > mailing list). No problems were reported. I also ran a single cross-check
> > with one of the syzkaller runs on top of v6.0-rc1, without this patch.
> > That test run failed.
> > 
> > Overall, I think we can call this fixed.
> > 
> > Guenter
> 
> It's more of a work around though since we don't yet have the root
> cause for this. I suspect a GCP hypervisor bug at the moment.
> This is excercising a path we previously only took on GFP_KERNEL
> allocation failures during probe, I don't think that happens a lot.
>

Even a hypervisor bug should not trigger crashes like this one,
though, or at least I think so. Any idea what to look for on the
hypervisor side, and/or what it might be doing wrong ?

Thanks,
Guenter

^ permalink raw reply

* Re: upstream kernel crashes
From: Andres Freund @ 2022-08-15 20:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Xuan Zhuo, Jason Wang, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, virtualization, netdev,
	Linus Torvalds, Jens Axboe, James Bottomley, Martin K. Petersen,
	Guenter Roeck, linux-kernel, Greg KH, c
In-Reply-To: <20220815161423-mutt-send-email-mst@kernel.org>

Hi,

On 2022-08-15 16:21:51 -0400, Michael S. Tsirkin wrote:
> On Mon, Aug 15, 2022 at 10:46:17AM -0700, Andres Freund wrote:
> > Hi,
> >
> > On 2022-08-15 12:50:52 -0400, Michael S. Tsirkin wrote:
> > > On Mon, Aug 15, 2022 at 09:45:03AM -0700, Andres Freund wrote:
> > > > Hi,
> > > >
> > > > On 2022-08-15 11:40:59 -0400, Michael S. Tsirkin wrote:
> > > > > OK so this gives us a quick revert as a solution for now.
> > > > > Next, I would appreciate it if you just try this simple hack.
> > > > > If it crashes we either have a long standing problem in virtio
> > > > > code or more likely a gcp bug where it can't handle smaller
> > > > > rings than what device requestes.
> > > > > Thanks!
> > > >
> > > > I applied the below and the problem persists.
> > > >
> > > > [...]
> > >
> > > Okay!
> >
> > Just checking - I applied and tested this atop 6.0-rc1, correct? Or did you
> > want me to test it with the 762faee5a267 reverted? I guess what you're trying
> > to test if a smaller queue than what's requested you'd want to do so without
> > the problematic patch applied...
> >
> >
> > Either way, I did this, and there are no issues that I could observe. No
> > oopses, no broken networking. But:
> >
> > To make sure it does something I added a debugging printk - which doesn't show
> > up. I assume this is at a point at least earlyprintk should work (which I see
> > getting enabled via serial)?
> >

> Sorry if I was unclear.  I wanted to know whether the change somehow
> exposes a driver bug or a GCP bug. So what I wanted to do is to test
> this patch on top of *5.19*, not on top of the revert.

Right, the 5.19 part was clear, just the earlier test:

> > > > On 2022-08-15 11:40:59 -0400, Michael S. Tsirkin wrote:
> > > > > OK so this gives us a quick revert as a solution for now.
> > > > > Next, I would appreciate it if you just try this simple hack.
> > > > > If it crashes we either have a long standing problem in virtio
> > > > > code or more likely a gcp bug where it can't handle smaller
> > > > > Thanks!

I wasn't sure about.

After I didn't see any effect on 5.19 + your patch, I grew a bit suspicious
and added the printks.


> Yes I think printk should work here.

The reason the debug patch didn't change anything, and that my debug printk
didn't show, is that gcp uses the legacy paths...

If there were a bug in the legacy path, it'd explain why the problem only
shows on gcp, and not in other situations.

I'll queue testing the legacy path with the equivalent change.

- Andres


Greetings,

Andres Freund

^ permalink raw reply

* Re: upstream kernel crashes
From: Andres Freund @ 2022-08-15 21:10 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Xuan Zhuo, Jason Wang, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, virtualization, netdev,
	Linus Torvalds, Jens Axboe, James Bottomley, Martin K. Petersen,
	Guenter Roeck, linux-kernel, Greg KH, c
In-Reply-To: <20220815210437.saptyw6clr7datun@awork3.anarazel.de>

Hi,

On 2022-08-15 14:04:37 -0700, Andres Freund wrote:
> Booting with the equivalent change, atop 5.19, in the legacy setup_vq()
> reliably causes boot to hang:

I don't know much virtio, so take this with a rock of salt:

Legacy setup_vq() doesn't tell the host about the queue size. The modern one
does:
	vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
but the legacy one doesn't.

I assume this means the host will assume the queue is of the size suggested by
vp_legacy_get_queue_size(). If the host continues to write into the space
after the "assumed end" of the queue, but the guest puts other stuff in that
space, well, I'd expect fun roughly like the stuff we've been seeing in this
and related threads.

Greetings,

Andres Freund

^ permalink raw reply

* [PATCH v2 0/2] unix: Add ioctl(SIOCUNIXGRABFDS) to grab files of receive queue skbs
From: Kirill Tkhai @ 2022-08-15 21:13 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: davem, edumazet, viro, tkhai

When a fd owning a counter of some critical resource, say, of a mount,
it's impossible to umount that mount and disconnect related block device.
That fd may be contained in some unix socket receive queue skb.

Despite we have an interface for detecting such the sockets queues
(/proc/[PID]/fdinfo/[fd] shows non-zero scm_fds count if so) and
it's possible to kill that process to release the counter, the problem is
that there may be several processes, and it's not a good thing to kill
each of them.

This patch adds a simple interface to grab files from receive queue,
so the caller may analyze them, and even do that recursively, if grabbed
file is unix socket itself. So, the described above problem may be solved
by this ioctl() in pair with pidfd_getfd().

Note, that the existing recvmsg(,,MSG_PEEK) is not suitable for that
purpose, since it modifies peek offset inside socket, and this results
in a problem in case of examined process uses peek offset itself.
Additional ptrace freezing of that task plus ioctl(SO_PEEK_OFF) won't help
too, since that socket may relate to several tasks, and there is no
reliable and non-racy way to detect that. Also, if the caller of such
trick will die, the examined task will remain frozen forever. The new
suggested ioctl(SIOCUNIXGRABFDS) does not have such problems.

The realization of ioctl(SIOCUNIXGRABFDS) is pretty simple. The only
interesting thing is protocol with userspace. Firstly, we let userspace
to know the number of all files in receive queue skbs. Then we receive
fds one by one starting from requested offset. We return number of
received fds if there is a successfully received fd, and this number
may be less in case of error or desired fds number lack. Userspace
may detect that situations by comparison of returned value and
out.nr_all minus in.nr_skip. Looking over different variant this one
looks the best for me (I considered returning error in case of error
and there is a received fd. Also I considered returning number of
received files as one more member in struct unix_ioc_grab_fds).

v2: Add "[PATCH 1/2] fs: Export __receive_fd()" to make receive_fd_user() be visible in [2/2] patch, when af_unix is module.
---

Kirill Tkhai (2):
      fs: Export __receive_fd()
      af_unix: Add ioctl(SIOCUNIXGRABFDS) to grab files of receive queue skbs

 fs/file.c               |    1 +
 include/uapi/linux/un.h |   12 ++++++++
 net/unix/af_unix.c      |   70 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

--
Signed-off-by: Kirill Tkhai <tkhai@ya.ru>

^ permalink raw reply

* [PATCH v2 1/2] fs: Export __receive_fd()
From: Kirill Tkhai @ 2022-08-15 21:15 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: davem, edumazet, viro
In-Reply-To: <0b07a55f-0713-7ba4-9b6b-88bc8cc6f1f5@ya.ru>

This is needed to make receive_fd_user() available in modules, and it will be used in next patch.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
v2: New
 fs/file.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..e45d45f1dd45 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1181,6 +1181,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 	__receive_sock(file);
 	return new_fd;
 }
+EXPORT_SYMBOL_GPL(__receive_fd);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 {



^ permalink raw reply related

* [PATCH v2 2/2] af_unix: Add ioctl(SIOCUNIXGRABFDS) to grab files of receive queue skbs
From: Kirill Tkhai @ 2022-08-15 21:22 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: davem, edumazet, viro
In-Reply-To: <0b07a55f-0713-7ba4-9b6b-88bc8cc6f1f5@ya.ru>

When a fd owning a counter of some critical resource, say, of a mount,
it's impossible to umount that mount and disconnect related block device.
That fd may be contained in some unix socket receive queue skb.

Despite we have an interface for detecting such the sockets queues
(/proc/[PID]/fdinfo/[fd] shows non-zero scm_fds count if so) and
it's possible to kill that process to release the counter, the problem is
that there may be several processes, and it's not a good thing to kill
each of them.

This patch adds a simple interface to grab files from receive queue,
so the caller may analyze them, and even do that recursively, if grabbed
file is unix socket itself. So, the described above problem may be solved
by this ioctl() in pair with pidfd_getfd().

Note, that the existing recvmsg(,,MSG_PEEK) is not suitable for that
purpose, since it modifies peek offset inside socket, and this results
in a problem in case of examined process uses peek offset itself.
Additional ptrace freezing of that task plus ioctl(SO_PEEK_OFF) won't help
too, since that socket may relate to several tasks, and there is no
reliable and non-racy way to detect that. Also, if the caller of such
trick will die, the examined task will remain frozen forever. The new
suggested ioctl(SIOCUNIXGRABFDS) does not have such problems.

The realization of ioctl(SIOCUNIXGRABFDS) is pretty simple. The only
interesting thing is protocol with userspace. Firstly, we let userspace
to know the number of all files in receive queue skbs. Then we receive
fds one by one starting from requested offset. We return number of
received fds if there is a successfully received fd, and this number
may be less in case of error or desired fds number lack. Userspace
may detect that situations by comparison of returned value and
out.nr_all minus in.nr_skip. Looking over different variant this one
looks the best for me (I considered returning error in case of error
and there is a received fd. Also I considered returning number of
received files as one more member in struct unix_ioc_grab_fds).

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
 include/uapi/linux/un.h |   12 ++++++++
 net/unix/af_unix.c      |   70 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/include/uapi/linux/un.h b/include/uapi/linux/un.h
index 0ad59dc8b686..995b358263dd 100644
--- a/include/uapi/linux/un.h
+++ b/include/uapi/linux/un.h
@@ -11,6 +11,18 @@ struct sockaddr_un {
 	char sun_path[UNIX_PATH_MAX];	/* pathname */
 };

+struct unix_ioc_grab_fds {
+	struct {
+		int nr_grab;
+		int nr_skip;
+		int *fds;
+	} in;
+	struct {
+		int nr_all;
+	} out;
+};
+
 #define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0) /* open a socket file with O_PATH */
+#define SIOCUNIXGRABFDS (SIOCPROTOPRIVATE + 1) /* grab files from recv queue */

 #endif /* _LINUX_UN_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..3c7e8049eba1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3079,6 +3079,73 @@ static int unix_open_file(struct sock *sk)
 	return fd;
 }

+static int unix_ioc_grab_fds(struct sock *sk, struct unix_ioc_grab_fds __user *uarg)
+{
+	int i, todo, skip, count, all, err, done = 0;
+	struct unix_sock *u = unix_sk(sk);
+	struct unix_ioc_grab_fds arg;
+	struct sk_buff *skb = NULL;
+	struct scm_fp_list *fp;
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	skip = arg.in.nr_skip;
+	todo = arg.in.nr_grab;
+
+	if (skip < 0 || todo <= 0)
+		return -EINVAL;
+	if (mutex_lock_interruptible(&u->iolock))
+		return -EINTR;
+
+	all = atomic_read(&u->scm_stat.nr_fds);
+	err = -EFAULT;
+	/* Set uarg->out.nr_all before the first file is received. */
+	if (put_user(all, &uarg->out.nr_all))
+		goto unlock;
+	err = 0;
+	if (all <= skip)
+		goto unlock;
+	if (all - skip < todo)
+		todo = all - skip;
+	while (todo) {
+		spin_lock(&sk->sk_receive_queue.lock);
+		if (!skb)
+			skb = skb_peek(&sk->sk_receive_queue);
+		else
+			skb = skb_peek_next(skb, &sk->sk_receive_queue);
+		spin_unlock(&sk->sk_receive_queue.lock);
+
+		if (!skb)
+			goto unlock;
+
+		fp = UNIXCB(skb).fp;
+		count = fp->count;
+		if (skip >= count) {
+			skip -= count;
+			continue;
+		}
+
+		for (i = skip; i < count && todo; i++) {
+			err = receive_fd_user(fp->fp[i], &arg.in.fds[done], 0);
+			if (err < 0)
+				goto unlock;
+			done++;
+			todo--;
+		}
+		skip = 0;
+	}
+unlock:
+	mutex_unlock(&u->iolock);
+
+	/* Return number of fds (non-error) if there is a received file. */
+	if (done)
+		return done;
+	if (err < 0)
+		return err;
+	return 0;
+}
+
 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
@@ -3113,6 +3180,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		}
 		break;
 #endif
+	case SIOCUNIXGRABFDS:
+		err = unix_ioc_grab_fds(sk, (struct unix_ioc_grab_fds __user *)arg);
+		break;
 	default:
 		err = -ENOIOCTLCMD;
 		break;

^ permalink raw reply related

* Re: [PATCH] virtio_net: Revert "virtio_net: set the default max ring size by find_vqs()"
From: Michael S. Tsirkin @ 2022-08-15 21:39 UTC (permalink / raw)
  To: Andres Freund
  Cc: Guenter Roeck, linux-kernel, Xuan Zhuo, Jason Wang,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	virtualization, netdev, Linus Torvalds, Jens Axboe,
	James Bottomley, Martin K. Petersen, Greg KH, c
In-Reply-To: <20220815212839.aop6wwx4fkngihbf@awork3.anarazel.de>

On Mon, Aug 15, 2022 at 02:28:39PM -0700, Andres Freund wrote:
> Hi,
> 
> On 2022-08-15 17:04:10 -0400, Michael S. Tsirkin wrote:
> > So virtio has a queue_size register. When read, it will give you
> > originally the maximum queue size. Normally we just read it and
> > use it as queue size.
> > 
> > However, when queue memory allocation fails, and unconditionally with a
> > network device with the problematic patch, driver is asking the
> > hypervisor to make the ring smaller by writing a smaller value into this
> > register.
> > 
> > I suspect that what happens is hypervisor still uses the original value
> > somewhere.
> 
> It looks more like the host is never told about the changed size for legacy
> devices...
> 
> Indeed, adding a vp_legacy_set_queue_size() & call to it to setup_vq(), makes
> 5.19 + restricting queue sizes to 1024 boot again.

Interesting, the register is RO in the legacy interface.
And to be frank I can't find where is vp_legacy_set_queue_size
even implemented. It's midnight here too ...

>  I'd bet that it also would
> fix 6.0rc1, but I'm running out of time to test that.
> 
> Greetings,
> 
> Andres Freund

Yes I figured this out too. And I was able to reproduce on qemu now.

Andres thanks a lot for the help!

I'm posting a new patchset reverting all the handing of resize
restrictions, I think we should rethink it for the next release.

Thanks everyone for the help!

-- 
MST


^ permalink raw reply

* Re: igc: missing HW timestamps at TX
From: Vinicius Costa Gomes @ 2022-08-15 21:39 UTC (permalink / raw)
  To: Vladimir Oltean, Ferenc Fejes
  Cc: netdev@vger.kernel.org, marton12050@gmail.com,
	peti.antal99@gmail.com
In-Reply-To: <20220812201654.qx7e37otu32pxnbk@skbuf>

Hi Vladimir,

Vladimir Oltean <vladimir.oltean@nxp.com> writes:

> Hi Ferenc,
>
> On Fri, Aug 12, 2022 at 02:13:52PM +0000, Ferenc Fejes wrote:
>> Ethtool after the measurement:
>> ethtool -S enp3s0 | grep hwtstamp
>>      tx_hwtstamp_timeouts: 1
>>      tx_hwtstamp_skipped: 419
>>      rx_hwtstamp_cleared: 0
>> 
>> Which is inline with what the isochron see.
>> 
>> But thats only happens if I forcingly put the affinity of the sender
>> different CPU core than the ptp worker of the igc. If those running on
>> the same core I doesnt lost any HW timestam even for 10 million
>> packets. Worth to mention actually I see many lost timestamp which
>> confused me a little bit but those are lost because of the small
>> MSG_ERRQUEUE. When I increased that from few kbytes to 20 mbytes I got
>> every timestamp successfully.
>
> I have zero knowledge of Intel hardware. That being said, I've looked at
> the driver for about 5 minutes, and the design seems to be that where
> the timestamp is not available in band from the TX completion NAPI as
> part of BD ring metadata, but rather, a TX timestamp complete is raised,
> and this results in igc_tsync_interrupt() being called. However there
> are 2 paths in the driver which call this, one is igc_msix_other() and
> the other is igc_intr_msi() - this latter one is also the interrupt that
> triggers the napi_schedule(). It would be interesting to see exactly
> which MSI-X interrupt is the one that triggers igc_tsync_interrupt().

Just some aditional information (note that I know very little about
interrupt internal workings), igc_intr_msi() is called when MSI-X is not
enabled (i.e. "MSI only" system), igc_msix_other() is called when MSI-X
is available. When MSI-X is available, i225/i226 sets up a separate
interrupt handler for "general" events, the TX timestamp being available
to be read from the registers is one those events.

>
> It's also interesting to understand what you mean precisely by affinity
> of isochron. It has a main thread (used for PTP monitoring and for TX
> timestamps) and a pthread for the sending process. The main thread's
> affinity is controlled via taskset; the sender thread via --cpu-mask.
> Is it the *sender* thread the one who makes the TX timestamps be
> available quicker to user space, rather than the main thread, who
> actually dequeues them from the error queue? If so, it might be because
> the TX packets will trigger the TX completion interrupt, and this will
> accelerate the processing of the TX timestamps. I'm unclear what happens
> when the sender thread runs on a different CPU core than the TX
> timestamp thread.
>
> Your need to increase the SO_RCVBUF is also interesting. Keep in mind
> that isochron at that scheduling priority and policy is a CPU hog, and
> that igc_tsync_interrupt() calls schedule_work() - which uses the system
> workqueue that runs at a very low priority (this begs the question, how
> do you know how to match the CPU on which isochron runs with the CPU of
> the system workqueue?). So isochron, high priority, competes for CPU
> time with igc_ptp_tx_work(), low priority. One produces data, one
> consumes it; queues are bound to get full at some point.
> On the other hand, other drivers use the ptp_aux_kworker() that the PTP
> core creates specifically for this purpose. It is a dedicated kthread
> whose scheduling policy and priority can be adjusted using chrt. I think
> it would be interesting to see how things behave when you replace
> schedule_work() with ptp_schedule_worker().

I was planning to do the conversion to use the PTP aux worker thread at
some point, perhaps this is the "excuse" I was looking for.


Cheers,
-- 
Vinicius

^ permalink raw reply

* [PATCH v2 1/2] fs: Export __receive_fd()
From: Kirill Tkhai @ 2022-08-15 21:42 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: davem, edumazet, viro
In-Reply-To: <0b07a55f-0713-7ba4-9b6b-88bc8cc6f1f5@ya.ru>

This is needed to make receive_fd_user() available in modules, and it will be used in next patch.

Signed-off-by: Kirill Tkhai <tkhai@ya.ru>
---
v2: New
 fs/file.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..e45d45f1dd45 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1181,6 +1181,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 	__receive_sock(file);
 	return new_fd;
 }
+EXPORT_SYMBOL_GPL(__receive_fd);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 {




^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox