Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v0 12/12] mlxsw: core: Add ports temperature measurement to thermal algorithm
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594915-20741-1-git-send-email-vadimp@mellanox.com>

Port temperatures have the most significant impact on the system thermal
state and should be considered by the thermal algorithm. The thermal zone
temperature reading is extended to read the port temperatures along with
the chip temperature. The temperature value provided to the core thermal
algorithm is the accumulated value of the chip and port temperature
readings, normalized according to the basic constant thresholds.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 66 ++++++++++++++++++++--
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 65962ed..23d6197 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -109,6 +109,8 @@ struct mlxsw_thermal {
 	u8 cooling_levels[MLXSW_THERMAL_MAX_STATE + 1];
 	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
 	enum thermal_device_mode mode;
+	int count;
+	int *ports_temp_cache;
 };
 
 static inline u8 mlxsw_state_to_duty(int state)
@@ -213,10 +215,11 @@ static int mlxsw_thermal_set_mode(struct thermal_zone_device *tzdev,
 	return 0;
 }
 
-static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
-				  int *p_temp)
+static int mlxsw_thermal_init_temp(struct mlxsw_thermal *thermal,
+				   struct mlxsw_env_temp_thresh *delta,
+				   struct mlxsw_env_temp_multi *multi,
+				   int *p_temp, bool *p_crit)
 {
-	struct mlxsw_thermal *thermal = tzdev->devdata;
 	struct device *dev = thermal->bus_info->dev;
 	char mtmp_pl[MLXSW_REG_MTMP_LEN];
 	unsigned int temp;
@@ -231,10 +234,58 @@ static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
 	}
 	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL);
 
-	*p_temp = (int) temp;
+	if (temp >= MLXSW_ENV_TEMP_CRIT) {
+		*p_crit = true;
+	} else if (temp < MLXSW_ENV_TEMP_NORM) {
+		multi->thresh.normal = temp;
+		delta->normal = MLXSW_ENV_TEMP_NORM - temp;
+	} else if (temp >= MLXSW_ENV_TEMP_HOT) {
+		multi->thresh.crit = temp;
+		delta->crit = temp - MLXSW_ENV_TEMP_HOT;
+		multi->mask |= MLXSW_ENV_CRIT_MASK;
+	} else {
+		multi->thresh.hot = temp;
+		delta->hot = temp - MLXSW_ENV_TEMP_NORM;
+		multi->mask |= MLXSW_ENV_HOT_MASK;
+	}
+	*p_temp = temp;
+
 	return 0;
 }
 
+static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
+				  int *p_temp)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	struct mlxsw_env_temp_multi multi;
+	struct mlxsw_env_temp_thresh delta;
+	bool crit = false;
+	int err;
+
+	memset(&multi, 0, sizeof(struct mlxsw_env_temp_multi));
+	memset(&delta, 0, sizeof(struct mlxsw_env_temp_thresh));
+	/* Read ASIC temperature */
+	err = mlxsw_thermal_init_temp(thermal, &delta, &multi,
+				      p_temp, &crit);
+	if (err) {
+		dev_err(dev, "Failed to query ASIC temp sensor\n");
+		return err;
+	}
+
+	/* No need to proceed ports temperature reading, since ASIC temperature
+	 * should be resulted in system shutdown.
+	 */
+	if (crit)
+		return 0;
+
+	/* Collect ports temperature */
+	return mlxsw_env_collect_port_temp(thermal->core,
+					   thermal->ports_temp_cache,
+					   thermal->count, &multi, &delta,
+					   NULL, p_temp);
+}
+
 static int mlxsw_thermal_get_trip_type(struct thermal_zone_device *tzdev,
 				       int trip,
 				       enum thermal_trip_type *p_type)
@@ -436,6 +487,7 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		       const struct mlxsw_bus_info *bus_info,
 		       struct mlxsw_thermal **p_thermal)
 {
+	unsigned int max_ports = mlxsw_core_max_ports(core);
 	char mfcr_pl[MLXSW_REG_MFCR_LEN] = { 0 };
 	enum mlxsw_reg_mfcr_pwm_frequency freq;
 	struct device *dev = bus_info->dev;
@@ -452,6 +504,12 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 	thermal->core = core;
 	thermal->bus_info = bus_info;
 	memcpy(thermal->trips, default_thermal_trips, sizeof(thermal->trips));
+	thermal->ports_temp_cache = devm_kmalloc_array(dev, max_ports,
+						       sizeof(int),
+						       GFP_KERNEL);
+	if (!thermal->ports_temp_cache)
+		return -ENOMEM;
+	thermal->count = max_ports;
 
 	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfcr), mfcr_pl);
 	if (err) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 11/12] mlxsw: core: Rename cooling device
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594915-20741-1-git-send-email-vadimp@mellanox.com>

The name "Fan" is too generic and can be misleading when it is
interpreted by the user.
For example, the name "Fan" could also be used by ACPI.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 53e4ef9..65962ed 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -484,7 +484,8 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		if (pwm_active & BIT(i)) {
 			struct thermal_cooling_device *cdev;
 
-			cdev = thermal_cooling_device_register("Fan", thermal,
+			cdev = thermal_cooling_device_register("mlxsw_fan",
+							thermal,
 							&mlxsw_cooling_ops);
 			if (IS_ERR(cdev)) {
 				err = PTR_ERR(cdev);
-- 
2.1.4

^ permalink raw reply related

* Re: [GIT] Networking
From: Matteo Croce @ 2018-06-21 13:40 UTC (permalink / raw)
  To: mingo
  Cc: David S . Miller, alexei.starovoitov, sfr, torvalds, akpm, netdev,
	linux-kernel, tglx
In-Reply-To: <20180621084510.GA22870@gmail.com>

On Thu, Jun 21, 2018 at 8:46 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * David Miller <davem@davemloft.net> wrote:
>
> > 1) Fix crash on bpf_prog_load() errors, from Daniel Borkmann.
>
> > Daniel Borkmann (4):
> >       Merge branch 'bpf-misc-fixes'
> >       bpf: fix panic in prog load calls cleanup
> >       bpf: reject any prog that failed read-only lock
> >       bpf, xdp, i40e: fix i40e_build_skb skb reserve and truesize
>
> JFYI, I'm still seeing this BPF build error upstream, on a 32-bit allyesconfig I'm
> getting:
>
>   LD      vmlinux.o
>   ld: i386:x86-64 architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
>   Makefile:1010: recipe for target 'vmlinux' failed
>   make: *** [vmlinux] Error 1
>
> A similar looking build bug was reported by sfr three weeks ago:
>
> > Subject: linux-next: build failure after merge of the net-next tree
> >
> > ...
> >
> > x86_64-linux-ld: unknown architecture of input file `net/bpfilter/bpfilter_umh.o'
> > is incompatible with i386:x86-64 output
> >
> > Caused by commit
> >
> >  d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
> >
> > In my builds, the host is PowerPC 64 LE ...
> >
> > I have reverted that commit along with
> >
> >  61a552eb487f ("bpfilter: fix build dependency")
> >  13405468f49d ("bpfilter: don't pass O_CREAT when opening console for debug")
> >
> > for today.
>
> Is there a fix I could try?
>
> Thanks,
>
>         Ingo

Hi Ingo,

are you compiling a 32 bit kernel on an x86_64 host? I tried to
compile an i386 kernel on an i386 host and I have no issue;
running objdump by hand produces correct output:

$ uname -a
Linux debian32 4.16.0-2-686-pae #1 SMP Debian 4.16.16-1 (2018-06-19)
i686 GNU/Linux
$ objdump -f net/bpfilter/bpfilter_umh |awk -F' |,' '/file
format/{print "-O",$NF} /^architecture:/{print "-B",$2}'
-O elf32-i386
-B i386

then I tried to compile an i386 kernel on an x86_64 host and I get the
same error:

$ make -j8 ARCH=i386
...
  LD      vmlinux.o
ld: i386:x86-64 architecture of input file
`net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
make: *** [Makefile:1015: vmlinux] Error 1

the problem seems to be that bpfilter_umh is compiled with host flags,
and so it's a 64 bit binary in my case:

gcc  -static -o net/bpfilter/bpfilter_umh net/bpfilter/main.o
objcopy -I binary `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh |awk
-F' |,' '/file format/{print "-O",$NF} /^architecture:/{print
"-B",$2}'` --rename-section .data=.init.rodata
net/bpfilter/bpfilter_umh net/bpfilter/bpfilter_umh.o
ld -m elf_i386 -r -o vmlinux.o --whole-archive built-in.a
--no-whole-archive --start-group lib/lib.a arch/x86/lib/lib.a
--end-group
ld: i386:x86-64 architecture of input file
`net/bpfilter/bpfilter_umh.o' is incompatible with i386 output

Any idea how to fix it without building it twice, for host and target?
-- 
Matteo Croce
per aspera ad upstream

^ permalink raw reply

* Re: [GIT] Networking
From: Stephen Rothwell @ 2018-06-21 13:46 UTC (permalink / raw)
  To: Matteo Croce
  Cc: mingo, David S . Miller, alexei.starovoitov, torvalds, akpm,
	netdev, linux-kernel, tglx
In-Reply-To: <CAGnkfhxGAYZNhJp7eyg+_j3LY31w7muFqerhQp7jGqQ02iFxkg@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3525 bytes --]

Hi Matteo,

On Thu, 21 Jun 2018 13:40:43 +0000 Matteo Croce <mcroce@redhat.com> wrote:
>
> On Thu, Jun 21, 2018 at 8:46 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * David Miller <davem@davemloft.net> wrote:
> >  
> > > 1) Fix crash on bpf_prog_load() errors, from Daniel Borkmann.  
> >  
> > > Daniel Borkmann (4):
> > >       Merge branch 'bpf-misc-fixes'
> > >       bpf: fix panic in prog load calls cleanup
> > >       bpf: reject any prog that failed read-only lock
> > >       bpf, xdp, i40e: fix i40e_build_skb skb reserve and truesize  
> >
> > JFYI, I'm still seeing this BPF build error upstream, on a 32-bit allyesconfig I'm
> > getting:
> >
> >   LD      vmlinux.o
> >   ld: i386:x86-64 architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> >   Makefile:1010: recipe for target 'vmlinux' failed
> >   make: *** [vmlinux] Error 1
> >
> > A similar looking build bug was reported by sfr three weeks ago:
> >  
> > > Subject: linux-next: build failure after merge of the net-next tree
> > >
> > > ...
> > >
> > > x86_64-linux-ld: unknown architecture of input file `net/bpfilter/bpfilter_umh.o'
> > > is incompatible with i386:x86-64 output
> > >
> > > Caused by commit
> > >
> > >  d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
> > >
> > > In my builds, the host is PowerPC 64 LE ...
> > >
> > > I have reverted that commit along with
> > >
> > >  61a552eb487f ("bpfilter: fix build dependency")
> > >  13405468f49d ("bpfilter: don't pass O_CREAT when opening console for debug")
> > >
> > > for today.  
> >
> > Is there a fix I could try?
> 
> are you compiling a 32 bit kernel on an x86_64 host? I tried to
> compile an i386 bit kernel on an i386 host and I have no issue,
> running objdump by hand produces correct output:
> 
> $ uname -a
> Linux debian32 4.16.0-2-686-pae #1 SMP Debian 4.16.16-1 (2018-06-19)
> i686 GNU/Linux
> $ objdump -f net/bpfilter/bpfilter_umh |awk -F' |,' '/file
> format/{print "-O",$NF} /^architecture:/{print "-B",$2}'
> -O elf32-i386
> -B i386
> 
> then I tried to compile an i386 kernel on an x86_64 host and I get the
> same error:
> 
> $ make -j8 ARCH=i386
> ...
>   LD      vmlinux.o
> ld: i386:x86-64 architecture of input file
> `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> make: *** [Makefile:1015: vmlinux] Error 1
> 
> the problem seems to be that bpfilter_umh is compiled with host flags,
> and so it's a 64 bit binary in my case:
> 
> gcc  -static -o net/bpfilter/bpfilter_umh net/bpfilter/main.o
> objcopy -I binary `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh |awk
> -F' |,' '/file format/{print "-O",$NF} /^architecture:/{print
> "-B",$2}'` --rename-section .data=.init.rodata
> net/bpfilter/bpfilter_umh net/bpfilter/bpfilter_umh.o
> ld -m elf_i386 -r -o vmlinux.o --whole-archive built-in.a
> --no-whole-archive --start-group lib/lib.a arch/x86/lib/lib.a
> --end-group
> ld: i386:x86-64 architecture of input file
> `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> 
> Any idea how to fix it without building it twice, for host and target?

This presumably has the same root cause that means I can't build a big
endian PowerPC version on a little endian host ...

Either I have to have CONFIG_BPFILTER turned off (or maybe just
CONFIG_BPFILTER_UMH) or build with a compiler that cannot link user
mode programs (which effectively does the same).
-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net V2 1/1] net/smc: coordinate wait queues for nonblocking connect
From: Ursula Braun @ 2018-06-21 14:23 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun,
	xiyou.wangcong, hch

The recent poll change may lead to stalls for non-blocking connecting
SMC sockets, since sock_poll_wait is no longer performed on the
internal CLC socket, but on the outer SMC socket.  kernel_connect() on
the internal CLC socket returns with -EINPROGRESS, but the wake up
logic does not work in all cases. If the internal CLC socket is still
in state TCP_SYN_SENT when polled, sock_poll_wait() from sock_poll()
does not sleep. It is supposed to sleep till the state of the internal
CLC socket switches to TCP_ESTABLISHED.

This patch temporarily propagates the wait queue from the internal
CLC sock to the SMC sock, till the non-blocking connect() is
finished.

In addition locking is reduced due to the removed poll waits.

Fixes: c0129a061442 ("smc: convert to ->poll_mask")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/af_smc.c | 15 +++++++++++----
 net/smc/smc.h    |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..d76331aae6e1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/in.h>
 #include <linux/sched/signal.h>
+#include <linux/rcupdate.h>
 
 #include <net/sock.h>
 #include <net/tcp.h>
@@ -605,6 +606,13 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 
 	smc_copy_sock_settings_to_clc(smc);
 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	if (flags & O_NONBLOCK) {
+		rcu_read_lock();
+		smc->smcwq = rcu_dereference(sk->sk_wq);
+		rcu_assign_pointer(sock->sk->sk_wq,
+				   rcu_dereference(smc->clcsock->sk->sk_wq));
+		rcu_read_unlock();
+	}
 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 	if (rc)
 		goto out;
@@ -1285,12 +1293,9 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 
 	smc = smc_sk(sock->sk);
 	sock_hold(sk);
-	lock_sock(sk);
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
-		release_sock(sk);
 		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
-		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
 			mask |= EPOLLERR;
@@ -1299,7 +1304,10 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			if (sk->sk_state == SMC_INIT &&
 			    mask & EPOLLOUT &&
 			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
+				lock_sock(sk);
+				rcu_assign_pointer(sock->sk->sk_wq, smc->smcwq);
 				rc = __smc_connect(smc);
+				release_sock(sk);
 				if (rc < 0)
 					mask |= EPOLLERR;
 				/* success cases including fallback */
@@ -1334,7 +1342,6 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			mask |= EPOLLPRI;
 
 	}
-	release_sock(sk);
 	sock_put(sk);
 
 	return mask;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 51ae1f10d81a..89d6d7ef973f 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -190,6 +190,7 @@ struct smc_connection {
 struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
+	struct socket_wq	*smcwq;		/* original smcsock wq */
 	struct smc_connection	conn;		/* smc connection */
 	struct smc_sock		*listen_smc;	/* listen parent */
 	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
-- 
2.16.4

^ permalink raw reply related

* Re: [PATCH bpf-next 3/3] bpf: btf: json print map dump with btf info
From: Okash Khawaja @ 2018-06-21 14:26 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Daniel Borkmann, Martin KaFai Lau, Alexei Starovoitov,
	Yonghong Song, Jakub Kicinski, David S. Miller, netdev,
	kernel-team, linux-kernel
In-Reply-To: <86ae5059-54c8-d078-4f6b-b212285dbfec@netronome.com>

Hi Quentin,

On Thu, Jun 21, 2018 at 11:24:59AM +0100, Quentin Monnet wrote:
> Hi Okash,
> 
> Thanks for the patch! Please find some nitpicks inline below.
Thanks for your feedback. All of it makes sense so I'll send v2 with
those changes. A couple of responses are inline below.

> 
> 2018-06-20 13:30 UTC-0700 ~ Okash Khawaja <osk@fb.com>
> > This patch modifies `bpftool map dump [-j|-p] id <map-id>` to json-
> > print and pretty-json-print map dump. It calls btf_dumper introduced in
> > previous patch to accomplish this.
> > 
> > The patch only prints debug info when -j or -p flags are supplied. Then
> > too, if the map has associated btf data loaded. Otherwise the usual
> > debug-less output is printed.
> > 
> > Signed-off-by: Okash Khawaja <osk@fb.com>
> > Acked-by: Martin KaFai Lau <kafai@fb.com>
> > 
> > ---
> >  tools/bpf/bpftool/map.c |   94 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 91 insertions(+), 3 deletions(-)
> > 
> > --- a/tools/bpf/bpftool/map.c
> > +++ b/tools/bpf/bpftool/map.c
> > @@ -43,9 +43,13 @@
> >  #include <unistd.h>
> >  #include <sys/types.h>
> >  #include <sys/stat.h>
> > +#include <linux/err.h>
> >  
> >  #include <bpf.h>
> >  
> > +#include "json_writer.h"
> > +#include "btf.h"
> > +#include "btf_dumper.h"
> >  #include "main.h"
> >  
> >  static const char * const map_type_name[] = {
> > @@ -508,6 +512,83 @@ static int do_show(int argc, char **argv
> >  	return errno == ENOENT ? 0 : -1;
> >  }
> >  
> > +
> > +static int do_dump_btf(struct btf *btf, struct bpf_map_info *map_info,
> > +		void *key, void *value)
> 
> Nit: Please align the second line on the opening parenthesis.
> 
> > +{
> > +	int ret;
> > +
> > +	jsonw_start_object(json_wtr);
> > +	jsonw_name(json_wtr, "key");
> > +
> > +	ret = btf_dumper_type(btf, json_wtr, map_info->btf_key_type_id, key);
> > +	if (ret)
> > +		goto out;
> > +
> > +	jsonw_end_object(json_wtr);
> > +
> > +	jsonw_start_object(json_wtr);
> > +	jsonw_name(json_wtr, "value");
> > +
> > +	ret = btf_dumper_type(btf, json_wtr, map_info->btf_value_type_id,
> > +			value);
> 
> Same comment.
> 
> > +
> > +out:
> > +	/* end of root object */
> > +	jsonw_end_object(json_wtr);
> 
> This is not the root JSON object, which is not produced in that
> function, so I find the comment misleading.
> 
> I also find it confusing that it closes the first JSON object of this
> function if there is an error, but the second if "btf_dumper_type()"
> succeeds. What about the following: closing the first object in all
> cases, before evaluating the value of "ret", and if "ret" is non-null
> returning immediately; and completely removing the "goto" from this
> function?
Code will be more intuitive that way so I'll re-organise it accordingly.

> 
> > +
> > +	return ret;
> > +}
> > +
> > +static struct btf *get_btf(struct bpf_map_info *map_info)
> > +{
> > +	int btf_fd = bpf_btf_get_fd_by_id(map_info->btf_id);
> > +	struct bpf_btf_info btf_info = { 0 };
> > +	__u32 len = sizeof(btf_info);
> > +	uint32_t last_size;
> > +	int err;
> > +	struct btf *btf = NULL;
> > +	void *ptr = NULL, *temp_ptr;
> 
> Nit: please sort declarations in reverse-Christmas-tree order.
> 
> > +
> > +	if (btf_fd < 0)
> > +		return NULL;
> > +
> > +	btf_info.btf_size = 4096;
> > +	do {
> > +		last_size = btf_info.btf_size;
> > +		temp_ptr = realloc(ptr, last_size);
> > +		if (!temp_ptr) {
> > +			p_err("unable allocate memory for debug info.");
> 
> "unable *to* allocate"?
> (Also most other error messages do not end with a period, but here this
> is just me being fussy.)
I think it makes sense to be consistent. I'll remove the full stop.

> 
> > +			goto exit_free;
> > +		}
> > +
> > +		ptr = temp_ptr;
> > +		bzero(ptr, last_size);
> > +		btf_info.btf = ptr_to_u64(ptr);
> > +		err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
> > +	} while (!err && btf_info.btf_size > last_size && last_size == 4096);
> 
> If I understand correctly, the first time you try to retrieve up to 4096
> bytes, and if the btf_info is larger than this, you try a second time
> with the size returned in btf_info.btf_size instead. I don't find it
> intuitive (but maybe this is just me?), do you think you could add a
> comment above this bloc maybe?
Yes that is what this code is doing. I'll add comments explaining it.

> 
> > +
> > +	if (err || btf_info.btf_size > last_size) {
> > +		p_info("can't get btf info. debug info won't be displayed. error: %s",
> > +				err ? strerror(errno) : "exceeds size retry");
> 
> Nit: Please align the second line on the opening parenthesis.
> 
> > +		goto exit_free;
> > +	}
> > +
> > +	btf = btf__new((uint8_t *) btf_info.btf,
> 
> Nit: No space between the cast and the name of the variable.
> 
> > +			btf_info.btf_size, NULL);
> 
> Same remark on parenthesis here...
> 
> > +	if (IS_ERR(btf)) {
> > +		printf("error when initialising btf: %s\n",
> > +				strerror(PTR_ERR(btf)));
> 
> ... and here.
> 
> > +		btf = NULL;
> > +	}
> > +
> > +exit_free:
> > +	close(btf_fd);
> > +	free(ptr);
> > +
> > +	return btf;
> > +}
> > +
> >  static int do_dump(int argc, char **argv)
> >  {
> >  	void *key, *value, *prev_key;
> > @@ -516,6 +597,7 @@ static int do_dump(int argc, char **argv
> >  	__u32 len = sizeof(info);
> >  	int err;
> >  	int fd;
> > +	struct btf *btf = NULL;
> 
> Reverse-Christmas-tree order, please.
> 
> >  
> >  	if (argc != 2)
> >  		usage();
> > @@ -538,6 +620,8 @@ static int do_dump(int argc, char **argv
> >  		goto exit_free;
> >  	}
> >  
> > +	btf = get_btf(&info);
> > +
> >  	prev_key = NULL;
> >  	if (json_output)
> >  		jsonw_start_array(json_wtr);
> > @@ -550,9 +634,12 @@ static int do_dump(int argc, char **argv
> >  		}
> >  
> >  		if (!bpf_map_lookup_elem(fd, key, value)) {
> > -			if (json_output)
> > -				print_entry_json(&info, key, value);
> > -			else
> > +			if (json_output) {
> > +				if (btf)
> > +					do_dump_btf(btf, &info, key, value);
> > +				else
> > +					print_entry_json(&info, key, value);
> > +			} else
> >  				print_entry_plain(&info, key, value);
> 
> Please add brackets around "print_entry_plain()" (to harmonise with the
> "if" of the same bloc).
> 
> >  		} else {
> >  			if (json_output) {
> > @@ -584,6 +671,7 @@ exit_free:
> >  	free(key);
> >  	free(value);
> >  	close(fd);
> > +	btf__free(btf);
> >  
> >  	return err;
> >  }
> > 
> 
> Thanks,
> Quentin

^ permalink raw reply

* [PATCH net-next 0/8] be2net: small structures clean-up
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur

The series:
- removes unused / unnecessary fields in several be2net structures
- re-orders fields in some structures to eliminate holes and cache-line
  crossings
- as a result, reduces the size of the main struct be_adapter by 4kB

Ivan Vecera (8):
  be2net: remove unused old AIC info
  be2net: remove unused old custom busy-poll fields
  be2net: remove desc field from be_eq_obj
  be2net: reorder fields in be_eq_obj structure
  be2net: move txcp field in be_tx_obj to eliminate holes in the struct
  be2net: remove unused tx_jiffies field from be_tx_stats
  be2net: re-order fields in be_error_recovery to avoid hole
  be2net: move rss_flags field in rss_info to ensure proper alignment

 drivers/net/ethernet/emulex/benet/be.h      | 39 +++++++----------------------
 drivers/net/ethernet/emulex/benet/be_main.c |  6 +++--
 2 files changed, 13 insertions(+), 32 deletions(-)

-- 
2.16.4

^ permalink raw reply

* [PATCH net-next 1/8] be2net: remove unused old AIC info
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

The commit 2632bafd74ae ("be2net: fix adaptive interrupt coalescing")
introduced a separate struct be_aic_obj to hold AIC information but
unfortunately left the old stuff in be_eq_obj. So remove it.

Fixes: 2632bafd74ae ("be2net: fix adaptive interrupt coalescing")
Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 382891f81e09..6cf9d106c989 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -187,13 +187,6 @@ struct be_eq_obj {
 	struct be_queue_info q;
 	char desc[32];
 
-	/* Adaptive interrupt coalescing (AIC) info */
-	bool enable_aic;
-	u32 min_eqd;		/* in usecs */
-	u32 max_eqd;		/* in usecs */
-	u32 eqd;		/* configured val when aic is off */
-	u32 cur_eqd;		/* in usecs */
-
 	u8 idx;			/* array index */
 	u8 msix_idx;
 	u16 spurious_intr;
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 2/8] be2net: remove unused old custom busy-poll fields
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

The commit fb6113e688e0 ("be2net: get rid of custom busy poll code")
replaced custom busy-poll code by the generic one but left several
macros and fields in struct be_eq_obj that are currently unused.
Remove this stuff.

Fixes: fb6113e688e0 ("be2net: get rid of custom busy poll code")
Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 6cf9d106c989..a4604dea4560 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -193,19 +193,6 @@ struct be_eq_obj {
 	struct napi_struct napi;
 	struct be_adapter *adapter;
 	cpumask_var_t  affinity_mask;
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-#define BE_EQ_IDLE		0
-#define BE_EQ_NAPI		1	/* napi owns this EQ */
-#define BE_EQ_POLL		2	/* poll owns this EQ */
-#define BE_EQ_LOCKED		(BE_EQ_NAPI | BE_EQ_POLL)
-#define BE_EQ_NAPI_YIELD	4	/* napi yielded this EQ */
-#define BE_EQ_POLL_YIELD	8	/* poll yielded this EQ */
-#define BE_EQ_YIELD		(BE_EQ_NAPI_YIELD | BE_EQ_POLL_YIELD)
-#define BE_EQ_USER_PEND		(BE_EQ_POLL | BE_EQ_POLL_YIELD)
-	unsigned int state;
-	spinlock_t lock;	/* lock to serialize napi and busy-poll */
-#endif  /* CONFIG_NET_RX_BUSY_POLL */
 } ____cacheline_aligned_in_smp;
 
 struct be_aic_obj {		/* Adaptive interrupt coalescing (AIC) info */
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 3/8] be2net: remove desc field from be_eq_obj
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

The event queue description (be_eq_obj.desc) field is used only to format
string for IRQ name and it is not really needed to hold this value.
Remove it and use local variable to format string for IRQ name.

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h      | 1 -
 drivers/net/ethernet/emulex/benet/be_main.c | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index a4604dea4560..e71e5e592626 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -185,7 +185,6 @@ static inline void queue_tail_inc(struct be_queue_info *q)
 
 struct be_eq_obj {
 	struct be_queue_info q;
-	char desc[32];
 
 	u8 idx;			/* array index */
 	u8 msix_idx;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 8f755009ff38..05e4c0bb25f4 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -3403,9 +3403,11 @@ static int be_msix_register(struct be_adapter *adapter)
 	int status, i, vec;
 
 	for_all_evt_queues(adapter, eqo, i) {
-		sprintf(eqo->desc, "%s-q%d", netdev->name, i);
+		char irq_name[IFNAMSIZ+4];
+
+		snprintf(irq_name, sizeof(irq_name), "%s-q%d", netdev->name, i);
 		vec = be_msix_vec_get(adapter, eqo);
-		status = request_irq(vec, be_msix, 0, eqo->desc, eqo);
+		status = request_irq(vec, be_msix, 0, irq_name, eqo);
 		if (status)
 			goto err_msix;
 
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 4/8] be2net: reorder fields in be_eq_obj structure
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

Re-order fields in struct be_eq_obj to ensure that the .napi field begins
at the start of a cache-line. Also, the .adapter field is moved to the
first cache-line next to the .q field, and the 3 fields (idx, msix_idx,
spurious_intr) and the 4-byte hole are moved to the 3rd cache-line.

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index e71e5e592626..716b4bc410f5 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -186,11 +186,11 @@ static inline void queue_tail_inc(struct be_queue_info *q)
 struct be_eq_obj {
 	struct be_queue_info q;
 
+	struct be_adapter *adapter;
+	struct napi_struct napi;
 	u8 idx;			/* array index */
 	u8 msix_idx;
 	u16 spurious_intr;
-	struct napi_struct napi;
-	struct be_adapter *adapter;
 	cpumask_var_t  affinity_mask;
 } ____cacheline_aligned_in_smp;
 
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 5/8] be2net: move txcp field in be_tx_obj to eliminate holes in the struct
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

Before patch:
struct be_tx_obj {
        u32                        db_offset;            /*     0     4 */

        /* XXX 4 bytes hole, try to pack */

        struct be_queue_info       q;                    /*     8    56 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct be_queue_info       cq;                   /*    64    56 */
        struct be_tx_compl_info    txcp;                 /*   120     4 */

        /* XXX 4 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
        struct sk_buff *           sent_skb_list[2048];  /*   128 16384 */
        ...
};

After patch:
struct be_tx_obj {
        u32                        db_offset;            /*     0     4 */
        struct be_tx_compl_info    txcp;                 /*     4     4 */
        struct be_queue_info       q;                    /*     8    56 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct be_queue_info       cq;                   /*    64    56 */
        struct sk_buff *           sent_skb_list[2048];  /*   120 16384 */
        ...
};

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 716b4bc410f5..91ca8d132e87 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -240,9 +240,9 @@ struct be_tx_compl_info {
 
 struct be_tx_obj {
 	u32 db_offset;
+	struct be_tx_compl_info txcp;
 	struct be_queue_info q;
 	struct be_queue_info cq;
-	struct be_tx_compl_info txcp;
 	/* Remember the skbs that were transmitted */
 	struct sk_buff *sent_skb_list[TX_Q_LEN];
 	struct be_tx_stats stats;
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 6/8] be2net: remove unused tx_jiffies field from be_tx_stats
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 91ca8d132e87..d521364e17cf 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -217,7 +217,6 @@ struct be_tx_stats {
 	u64 tx_vxlan_offload_pkts;
 	u64 tx_reqs;
 	u64 tx_compl;
-	ulong tx_jiffies;
 	u32 tx_stops;
 	u32 tx_drv_drops;	/* pkts dropped by driver */
 	/* the error counters are described in be_ethtool.c */
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 7/8] be2net: re-order fields in be_error_recovery to avoid hole
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

- Unionize two u8 fields where only one of them is used depending on NIC
chipset.
- Move recovery_supported field after that union

These changes eliminate a 7-byte hole in the struct and make it smaller
by 8 bytes.

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index d521364e17cf..4f805be43180 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -522,11 +522,13 @@ enum {
 };
 
 struct be_error_recovery {
-	/* Lancer error recovery variables */
-	u8 recovery_retries;
+	union {
+		u8 recovery_retries;	/* used for Lancer		*/
+		u8 recovery_state;	/* used for BEx and Skyhawk	*/
+	};
 
 	/* BEx/Skyhawk error recovery variables */
-	u8 recovery_state;
+	bool recovery_supported;
 	u16 ue_to_reset_time;		/* Time after UE, to soft reset
 					 * the chip - PF0 only
 					 */
@@ -534,7 +536,6 @@ struct be_error_recovery {
 					 * of SLIPORT_SEMAPHORE reg
 					 */
 	u16 last_err_code;
-	bool recovery_supported;
 	unsigned long probe_time;
 	unsigned long last_recovery_time;
 
-- 
2.16.4

^ permalink raw reply related

* [PATCH net-next 8/8] be2net: move rss_flags field in rss_info to ensure proper alignment
From: Ivan Vecera @ 2018-06-21 14:43 UTC (permalink / raw)
  To: netdev; +Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna, somnath.kotur
In-Reply-To: <20180621144330.12297-1-cera@cera.cz>

The current position of the .rss_flags field in struct rss_info causes
the fields .rsstable and .rss_queue (both 128 bytes long) to cross
cache-line boundaries. Moving it to the end properly aligns all fields.

Before patch:
struct rss_info {
        u64                        rss_flags;            /*     0     8 */
        u8                         rsstable[128];        /*     8   128 */
        /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
        u8                         rss_queue[128];       /*   136   128 */
        /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
        u8                         rss_hkey[40];         /*   264    40 */
};

After patch:
struct rss_info {
        u8                         rsstable[128];        /*     0   128 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        u8                         rss_queue[128];       /*   128   128 */
        /* --- cacheline 4 boundary (256 bytes) --- */
        u8                         rss_hkey[40];         /*   256    40 */
        u64                        rss_flags;            /*   296     8 */
};

Signed-off-by: Ivan Vecera <cera@cera.cz>
---
 drivers/net/ethernet/emulex/benet/be.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 4f805be43180..7005949dc17b 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -436,10 +436,10 @@ struct be_port_resources {
 #define be_is_os2bmc_enabled(adapter) (adapter->flags & BE_FLAGS_OS2BMC)
 
 struct rss_info {
-	u64 rss_flags;
 	u8 rsstable[RSS_INDIR_TABLE_LEN];
 	u8 rss_queue[RSS_INDIR_TABLE_LEN];
 	u8 rss_hkey[RSS_HASH_KEY_LEN];
+	u64 rss_flags;
 };
 
 #define BE_INVALID_DIE_TEMP	0xFF
-- 
2.16.4

^ permalink raw reply related

* Re: [PATCH 4/5] ceph: use timespec64 for r_mtime
From: Arnd Bergmann @ 2018-06-21 14:57 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: Zheng Yan, Sage Weil, Ilya Dryomov, Alex Elder,
	y2038 Mailman List, ceph-devel, Jens Axboe, David Miller,
	Martin K. Petersen, Jason Dillaman, daniel.m.jordan, Jan Kara,
	linux-block, Linux Kernel Mailing List, Networking
In-Reply-To: <CAAM7YAko88Bq0Nbh4EcYifEcnzHTJ500YvofWA-uMxrwqANzpA@mail.gmail.com>

On Thu, Jun 21, 2018 at 2:41 PM, Yan, Zheng <ukernel@gmail.com> wrote:
> On Wed, Jun 20, 2018 at 11:55 PM Arnd Bergmann <arnd@arndb.de> wrote:

>> @@ -1013,7 +1013,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>>                         truncate_inode_pages_range(inode->i_mapping, pos,
>>                                         (pos+len) | (PAGE_SIZE - 1));
>>
>> -                       req->r_mtime = mtime;
>> +                       req->r_mtime = current_time(inode);
> this change is not needed

Good catch, no idea how those two changes ended up in here, I'll
resend without them.

Thanks,

      Arnd

^ permalink raw reply

* Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Cornelia Huck @ 2018-06-21 14:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexander Duyck, virtio-dev, Jiri Pirko, konrad.wilk,
	Jakub Kicinski, Samudrala, Sridhar, qemu-devel, virtualization,
	Siwei Liu, Venu Busireddy, Netdev, boris.ostrovsky, aaron.f.brown,
	Joao Martins
In-Reply-To: <20180620224535-mutt-send-email-mst@kernel.org>

On Wed, 20 Jun 2018 22:48:58 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Jun 20, 2018 at 06:06:19PM +0200, Cornelia Huck wrote:
> > In any case, I'm not sure anymore why we'd want the extra uuid.  
> 
> It's mostly so we can have e.g. multiple devices with same MAC
> (which some people seem to want in order to then use
> them with different containers).
> 
> But it is also handy for when you assign a PF, since then you
> can't set the MAC.
> 

OK, so what about the following:

- introduce a new feature bit, VIRTIO_NET_F_STANDBY_UUID that indicates
  that we have a new uuid field in the virtio-net config space
- in QEMU, add a property for virtio-net that allows to specify a uuid,
  offer VIRTIO_NET_F_STANDBY_UUID if set
- when configuring, set the property to the group UUID of the vfio-pci
  device
- in the guest, use the uuid from the virtio-net device's config space
  if applicable; else, fall back to matching by MAC as done today

That should work for all virtio transports.

^ permalink raw reply

* Re: [PATCH] net: Fix device name resolving crash in default_device_exit()
From: David Ahern @ 2018-06-21 15:28 UTC (permalink / raw)
  To: Kirill Tkhai, netdev
  Cc: davem, daniel, jakub.kicinski, ast, linux, john.fastabend, brouer
In-Reply-To: <84f7019b-c1cb-9851-2ece-aa2e16d8d297@virtuozzo.com>

On 6/21/18 4:03 AM, Kirill Tkhai wrote:
>> This patch does not remove the BUG, so does not really solve the
>> problem. ie., it is fairly trivial to write a script (32k dev%d named
>> devices in init_net) that triggers it again, so your commit subject and
>> commit log are not correct with the references to 'fixing the problem'.
> 
> 1)I'm not agree with you and I don't think removing the BUG() is a good idea.
> This function is called from the place, where it must not fail. But it can
> fail, and the problem with name is not the only reason of this happens.
> We can't continue further pernet_operations in case of a problem happened
> in default_device_exit(), and we can't remove the BUG() before this function
> becomes of void type. But we are not going to make it of void type. So
> we can't remove the BUG().

You missed my point: that the function can still fail means you are not
"fixing" the problem, only delaying it.

> 
> 2)In case of the script is trivial, can't you just post it here to show
> what type of devices you mean? Is there real problem or this is
> a theoretical thinking?

Current code:

# ip li sh dev eth2
4: eth2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP
mode DEFAULT group default qlen 1000
    link/ether 02:e0:f9:46:64:80 brd ff:ff:ff:ff:ff:ff
# ip netns add fubar
# ip li set eth2 netns fubar
# ip li add eth2 type dummy
# ip li add dev4 type dummy
# ip netns del fubar
--> BUG
kernel:[78079.127748] default_device_exit: failed to move eth2 to
init_net: -17


With your patch:

# ip li sh dev eth2
4: eth2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP
mode DEFAULT group default qlen 1000
    link/ether 02:e0:f9:46:64:80 brd ff:ff:ff:ff:ff:ff
# ip netns add fubar
# ip li set eth2 netns fubar
# ip li add eth2 type dummy
# for n in $(seq 0 $((32*1024))); do
  echo "li add dev${n} type dummy"
  done > ip.batch
# ip -batch ip.batch
# ip netns del fubar
--> BUG
kernel:[   25.800024] default_device_exit: failed to move eth2 to
init_net: -17


> 
> All virtual devices I see have rtnl_link_ops, so that they just destroyed
> in default_device_exit_batch(). According to physical devices, it's difficult
> to imagine a node with 32k physical devices, and if someone tried to deploy
> them it may meet problems not only in this place.

Nothing says it has to be a physical device. It is only checking for a name.

> 
>> The change does provide more variability in naming and reduces the
>> likelihood of not being able to push a device back to init_net.
> 
> No, it provides. With the patch one may move real device to a container,
> and allow to do with the device anything including changing of device
> index. Then, the destruction of the container does not result in a kernel
> panic just because two devices have the same index.
> 
> Kirill
> 

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Arnd Bergmann @ 2018-06-21 15:31 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: Ivan Vecera, Florian Fainelli, Andrew Lunn, Networking,
	Grygorii Strashko, ivan.khoronzhuk, Sekhar Nori,
	Jiří Pírko, Francois Ozog, yogeshs, spatton,
	Jose.Abreu
In-Reply-To: <20180621124552.GA15208@apalos>

On Thu, Jun 21, 2018 at 2:45 PM, Ilias Apalodimas
<ilias.apalodimas@linaro.org> wrote:
> On Thu, Jun 21, 2018 at 02:19:55PM +0200, Ivan Vecera wrote:

> The driver is currently widely used and that's the reason we tried to avoid
> rewriting it. The current driver uses a DTS option to distinguish between two
> existing modes. This patch just adds a third one. So to my understanding we
> have the following options:
> 1. The driver already uses DTS to configure the hardware mode. Although this is
> technically wrong, we can add a third mode on DTS called 'switchdev;', get rid
> of the .config option and keep the configuration method common (although not
> optimal).
> 2. Keep the .config option which overrides the 2 existing modes.
> 3. Introduce a devlink option. If this is applied for all 3 modes, it will break
> backwards compatibility, so it's not an option. If it's introduced for
> configuring 'switchdev' mode only, we fall into the same pitfall as option 2),
> we have something that overrides our current config, slightly better though
> since it's not a compile time option.
> 4. rewrite the driver

As I understand it, the switchdev support can also be added without
becoming incompatible with the existing behavior, this is how I would
suggest it gets added in a way that keeps the existing DT binding and
user view while adding switchdev:

* In non-"dual-emac" mode, show only one network device that is
  configured as a transparent switch as today. Any users that today
  add TI's third-party ioctl interface with a non-upstreamable patch
  can keep using this mode and try to forward-port that patch.
* In "dual-emac" mode (as selected through DT), the hardware is
   configured to be viewed as two separate network devices as before,
   regardless of kernel configuration. Users can add the two device
   to a bridge device as before, and enabling switchdev support in
   the kernel configuration (based on your patch series) would change
   nothing else than using hardware support in the background to
   reconfigure the HW VLAN settings.

This does not require using devlink, adding a third mode, or changing
the DT binding or the user-visible behavior when switchdev is enabled,
but should get all the features you want.

> If it was a brand new driver, i'd definitely pick 4. Since it's a pre-existing
> driver though i can't rule out the rest of the options.

I think the suggestion was to have a new driver with a new binding
so that the DT could choose between the two drivers, one with
somewhat obscure behavior and the other with proper behavior.

However, from what I can tell, the only requirement to get a somewhat
reasonable behavior is that you enable "dual-emac" mode in DT
to get switchdev support. It would be trivial to add a new compatible
value that only allows that mode along with supporting switchdev,
but I don't think that's necessary here.

Writing a new driver might also be a good idea (depending on the
quality of the existing one, I haven't looked in detail), but again
I would see no reason for the new driver to be incompatible with
the existing binding, so a gradual cleanup seems like a better
approach.

       Arnd

^ permalink raw reply

* I am waiting to hear from you soon
From: Mrs Raymond.Mabel @ 2018-06-21 15:34 UTC (permalink / raw)


From
The Desk Of Mrs Mabel Raymond
The International Scammers Crime Worldwide Compensation Financial Unit
Burkina Faso In West Africa..

Attention  Beneficiary
My Name Is Mrs Mabel Raymond Staff Of  international Scammers Crime
Worldwide Compensation Financial Unit .
I  have discovered through our network E-mail  system   That  your
E-mail Address  Has been choosing For Compensation Due to  Your
communication Regarding on  Your Inheritance Claim  Fund  Which Was
trapped by Some Bank Officers and  who Refused To Release Your Fund To
you.
There fore I would appreciate to inform you that there  is hope for
you to recover  Your Inheritance Fund And all what you have lost .
 Your  Inheritance Fund  Has Deposited To  One Of The Security
Financier Company For Security Reason  Here In Burkina Faso.  You  are
Advised  To   Reply  to enable  Me  Notify The Finance Company To
Proceed on Transferring Your Inheritance Fund  ( $ 6.5M usd ) Six
Million Five Hundred Thousand American Dollars  to your Bank Account
In Your country Or Any Place Of your choice  This will be completed
within the  next few  days . Reply I have To Instruct You  On What to
Do To Avoid Any Delay Receiving Your Fund  Into Your Bank Account.

I am waiting to hear from you soon

Thank  you  .

Mrs Raymond  Mabel

^ permalink raw reply

* [PATCH v2 4/5] ceph: use timespec64 for r_mtime
From: Arnd Bergmann @ 2018-06-21 15:46 UTC (permalink / raw)
  To: Ilya Dryomov, Sage Weil, Yan, Zheng
  Cc: y2038, Yan Zheng, Arnd Bergmann, Alex Elder, Jens Axboe,
	David S. Miller, Chengguang Xu, ceph-devel, linux-block,
	linux-kernel, netdev

The request mtime field is used all over ceph, and is currently
represented as a 'timespec' structure in Linux. This changes it to
timespec64 to allow times beyond 2038, modifying all users at the
same time.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
v2: undo an unneeded change pointed out by Yan Zheng.
    Resending only this patch for now, let me know if you
    would like to see the entire series reposted instead.
---
 drivers/block/rbd.c             |  2 +-
 fs/ceph/addr.c                  | 12 ++++++------
 fs/ceph/file.c                  |  8 ++++----
 include/linux/ceph/osd_client.h |  6 +++---
 net/ceph/osd_client.c           |  8 ++++----
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fa0729c1e776..356936333cd9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1452,7 +1452,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
-	ktime_get_real_ts(&osd_req->r_mtime);
+	ktime_get_real_ts64(&osd_req->r_mtime);
 	osd_req->r_data_offset = obj_request->ex.oe_off;
 }
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 292b3d72d725..d44d51e69e76 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -574,7 +574,7 @@ static u64 get_writepages_data_length(struct inode *inode,
  */
 static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct ceph_fs_client *fsc;
@@ -625,7 +625,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
 	set_page_writeback(page);
-	ts = timespec64_to_timespec(inode->i_mtime);
+	ts = inode->i_mtime;
 	err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc, page_off, len,
 				   ceph_wbc.truncate_seq,
@@ -1134,7 +1134,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 			pages = NULL;
 		}
 
-		req->r_mtime = timespec64_to_timespec(inode->i_mtime);
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1734,7 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	req->r_mtime = timespec64_to_timespec(inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1776,7 +1776,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 			goto out_put;
 	}
 
-	req->r_mtime = timespec64_to_timespec(inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1937,7 +1937,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
 				     0, false, true);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	wr_req->r_mtime = timespec64_to_timespec(ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ad0bed99b1d5..2f3a30ca94bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -721,7 +721,7 @@ struct ceph_aio_request {
 	struct list_head osd_reqs;
 	unsigned num_reqs;
 	atomic_t pending_reqs;
-	struct timespec mtime;
+	struct timespec64 mtime;
 	struct ceph_cap_flush *prealloc_cf;
 };
 
@@ -923,7 +923,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	int num_pages = 0;
 	int flags;
 	int ret;
-	struct timespec mtime = timespec64_to_timespec(current_time(inode));
+	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
@@ -1131,7 +1131,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	int flags;
 	int ret;
 	bool check_caps = false;
-	struct timespec mtime = timespec64_to_timespec(current_time(inode));
+	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(from);
 
 	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -1663,7 +1663,7 @@ static int ceph_zero_partial_object(struct inode *inode,
 		goto out;
 	}
 
-	req->r_mtime = timespec64_to_timespec(inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!ret) {
 		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 0d6ee04b4c41..2e6611c1e9a0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -199,7 +199,7 @@ struct ceph_osd_request {
 	/* set by submitter */
 	u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
 	struct ceph_snap_context *r_snapc;    /* for writes */
-	struct timespec r_mtime;              /* ditto */
+	struct timespec64 r_mtime;            /* ditto */
 	u64 r_data_offset;                    /* ditto */
 	bool r_linger;                        /* don't resend on failure */
 
@@ -253,7 +253,7 @@ struct ceph_osd_linger_request {
 	struct ceph_osd_request_target t;
 	u32 map_dne_bound;
 
-	struct timespec mtime;
+	struct timespec64 mtime;
 
 	struct kref kref;
 	struct mutex lock;
@@ -508,7 +508,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
 				struct ceph_snap_context *sc,
 				u64 off, u64 len,
 				u32 truncate_seq, u64 truncate_size,
-				struct timespec *mtime,
+				struct timespec64 *mtime,
 				struct page **pages, int nr_pages);
 
 /* watch/notify */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a00c74f1154e..a87a021ca9d0 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1978,7 +1978,7 @@ static void encode_request_partial(struct ceph_osd_request *req,
 	p += sizeof(struct ceph_blkin_trace_info);
 
 	ceph_encode_32(&p, 0); /* client_inc, always 0 */
-	ceph_encode_timespec(p, &req->r_mtime);
+	ceph_encode_timespec64(p, &req->r_mtime);
 	p += sizeof(struct ceph_timespec);
 
 	encode_oloc(&p, end, &req->r_t.target_oloc);
@@ -4512,7 +4512,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 	ceph_oid_copy(&lreq->t.base_oid, oid);
 	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
 	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
-	ktime_get_real_ts(&lreq->mtime);
+	ktime_get_real_ts64(&lreq->mtime);
 
 	lreq->reg_req = alloc_linger_request(lreq);
 	if (!lreq->reg_req) {
@@ -4570,7 +4570,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
 	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
 	req->r_flags = CEPH_OSD_FLAG_WRITE;
-	ktime_get_real_ts(&req->r_mtime);
+	ktime_get_real_ts64(&req->r_mtime);
 	osd_req_op_watch_init(req, 0, lreq->linger_id,
 			      CEPH_OSD_WATCH_OP_UNWATCH);
 
@@ -5136,7 +5136,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 			 struct ceph_snap_context *snapc,
 			 u64 off, u64 len,
 			 u32 truncate_seq, u64 truncate_size,
-			 struct timespec *mtime,
+			 struct timespec64 *mtime,
 			 struct page **pages, int num_pages)
 {
 	struct ceph_osd_request *req;
-- 
2.9.0

^ permalink raw reply related

* Re: [PATCH v4 net-next] net:sched: add action inheritdsfield to skbedit
From: Fu, Qiaobin @ 2018-06-21 15:46 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: Marcelo Ricardo Leitner, Davide Caratti, Michel Machado,
	netdev@vger.kernel.org, jhs@mojatatu.com,
	xiyou.wangcong@gmail.com
In-Reply-To: <20180620184027.GA3446@localhost.localdomain>

The new action inheritdsfield copies the field DS of
IPv4 and IPv6 packets into skb->priority. This enables
later classification of packets based on the DS field.

v5:
*Update the drop counter for TC_ACT_SHOT.

v4:
*Not allow setting flags other than the expected ones.

*Allow dumping the pure flags.

v3:
*Use optional flags, so that it won't break old versions of tc.

*Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags.

v2:
*Fix the style issue

*Move the code from skbmod to skbedit

Original idea by Jamal Hadi Salim <jhs@mojatatu.com>

Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Note that the motivation for this patch is found in the following discussion:
https://www.spinics.net/lists/netdev/msg501061.html
---
diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index fbcfe27a4e6c..6de6071ebed6 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -30,6 +30,7 @@
 #define SKBEDIT_F_MARK			0x4
 #define SKBEDIT_F_PTYPE			0x8
 #define SKBEDIT_F_MASK			0x10
+#define SKBEDIT_F_INHERITDSFIELD	0x20
 
 struct tc_skbedit {
 	tc_gen;
@@ -45,6 +46,7 @@ enum {
 	TCA_SKBEDIT_PAD,
 	TCA_SKBEDIT_PTYPE,
 	TCA_SKBEDIT_MASK,
+	TCA_SKBEDIT_FLAGS,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..dfaf5d8028dd 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
 
 #include <linux/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_skbedit.h>
@@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD) {
+		int wlen = skb_network_offset(skb);
+
+		switch (tc_skb_protocol(skb)) {
+		case htons(ETH_P_IP):
+			wlen += sizeof(struct iphdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+			break;
+
+		case htons(ETH_P_IPV6):
+			wlen += sizeof(struct ipv6hdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+			break;
+		}
+	}
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
 	    skb->dev->real_num_tx_queues > d->queue_mapping)
 		skb_set_queue_mapping(skb, d->queue_mapping);
@@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
+
+err:
+	d->tcf_qstats.drops++;
+	spin_unlock(&d->tcf_lock);
+	return TC_ACT_SHOT;
 }
 
 static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_PTYPE]		= { .len = sizeof(u16) },
 	[TCA_SKBEDIT_MASK]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_FLAGS]		= { .len = sizeof(u64) },
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		mask = nla_data(tb[TCA_SKBEDIT_MASK]);
 	}
 
+	if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+		u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+		if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+			flags |= SKBEDIT_F_INHERITDSFIELD;
+	}
+
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
@@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 		.action  = d->tcf_action,
 	};
 	struct tcf_t t;
+	u64 pure_flags = 0;
 
 	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	if ((d->flags & SKBEDIT_F_MASK) &&
 	    nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
 		goto nla_put_failure;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD)
+		pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+	if (pure_flags != 0 &&
+	    nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
+		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))

^ permalink raw reply related

* [PATCH v5 net-next] net:sched: add action inheritdsfield to skbedit
From: Fu, Qiaobin @ 2018-06-21 15:50 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: Marcelo Ricardo Leitner, Davide Caratti, Michel Machado,
	netdev@vger.kernel.org, jhs@mojatatu.com,
	xiyou.wangcong@gmail.com
In-Reply-To: <8BAB2602-EBB4-4A8D-BBF4-5399CB486175@bu.edu>

The new action inheritdsfield copies the field DS of
IPv4 and IPv6 packets into skb->priority. This enables
later classification of packets based on the DS field.

v5:
*Update the drop counter for TC_ACT_SHOT

v4:
*Not allow setting flags other than the expected ones.

*Allow dumping the pure flags.

v3:
*Use optional flags, so that it won't break old versions of tc.

*Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags.

v2:
*Fix the style issue

*Move the code from skbmod to skbedit

Original idea by Jamal Hadi Salim <jhs@mojatatu.com>

Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---

Note that the motivation for this patch is found in the following discussion:
https://www.spinics.net/lists/netdev/msg501061.html
---
diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index fbcfe27a4e6c..6de6071ebed6 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -30,6 +30,7 @@
 #define SKBEDIT_F_MARK			0x4
 #define SKBEDIT_F_PTYPE			0x8
 #define SKBEDIT_F_MASK			0x10
+#define SKBEDIT_F_INHERITDSFIELD	0x20
 
 struct tc_skbedit {
 	tc_gen;
@@ -45,6 +46,7 @@ enum {
 	TCA_SKBEDIT_PAD,
 	TCA_SKBEDIT_PTYPE,
 	TCA_SKBEDIT_MASK,
+	TCA_SKBEDIT_FLAGS,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..dfaf5d8028dd 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
 
 #include <linux/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_skbedit.h>
@@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD) {
+		int wlen = skb_network_offset(skb);
+
+		switch (tc_skb_protocol(skb)) {
+		case htons(ETH_P_IP):
+			wlen += sizeof(struct iphdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+			break;
+
+		case htons(ETH_P_IPV6):
+			wlen += sizeof(struct ipv6hdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+			break;
+		}
+	}
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
 	    skb->dev->real_num_tx_queues > d->queue_mapping)
 		skb_set_queue_mapping(skb, d->queue_mapping);
@@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
+
+err:
+	d->tcf_qstats.drops++;
+	spin_unlock(&d->tcf_lock);
+	return TC_ACT_SHOT;
 }
 
 static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_PTYPE]		= { .len = sizeof(u16) },
 	[TCA_SKBEDIT_MASK]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_FLAGS]		= { .len = sizeof(u64) },
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		mask = nla_data(tb[TCA_SKBEDIT_MASK]);
 	}
 
+	if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+		u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+		if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+			flags |= SKBEDIT_F_INHERITDSFIELD;
+	}
+
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
@@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 		.action  = d->tcf_action,
 	};
 	struct tcf_t t;
+	u64 pure_flags = 0;
 
 	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	if ((d->flags & SKBEDIT_F_MASK) &&
 	    nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
 		goto nla_put_failure;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD)
+		pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+	if (pure_flags != 0 &&
+	    nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
+		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))

^ permalink raw reply related

* Re: [PATCH v1 1/1] VSOCK: fix loopback on big-endian systems
From: Stefan Hajnoczi @ 2018-06-21 16:07 UTC (permalink / raw)
  To: Claudio Imbrenda
  Cc: davem, jhansen, cavery, borntraeger, fiuczy, linux-kernel, netdev
In-Reply-To: <1529502711-8028-1-git-send-email-imbrenda@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 681 bytes --]

On Wed, Jun 20, 2018 at 03:51:51PM +0200, Claudio Imbrenda wrote:
> The dst_cid and src_cid are 64 bits, therefore 64-bit accessors should be
> used, and in fact in virtio_transport_common.c only 64-bit accessors are
> used. Using 32-bit accessors for 64-bit values breaks big-endian systems.
> 
> This patch fixes a wrong use of le32_to_cpu in virtio_transport_send_pkt.
> 
> Fixes: b9116823189e85ccf384 ("VSOCK: add loopback to virtio_transport")
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
> ---
>  net/vmw_vsock/virtio_transport.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* Re: [PATCH v5 net-next] net:sched: add action inheritdsfield to skbedit
From: Davide Caratti @ 2018-06-21 16:13 UTC (permalink / raw)
  To: Fu, Qiaobin, davem@davemloft.net
  Cc: Marcelo Ricardo Leitner, Michel Machado, netdev@vger.kernel.org,
	jhs@mojatatu.com, xiyou.wangcong@gmail.com
In-Reply-To: <B84B92F9-B872-4430-B7E2-FBF23E543632@bu.edu>

On Thu, 2018-06-21 at 15:50 +0000, Fu, Qiaobin wrote:
> The new action inheritdsfield copies the DS field of
> IPv4 and IPv6 packets into skb->priority. This enables
> later classification of packets based on the DS field.
> 
> v5:
> *Update the drop counter for TC_ACT_SHOT


Acked-by: Davide Caratti <dcaratti@redhat.com>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox