Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 2/5] bpf: Add map_name to bpf_map_info
From: Martin KaFai Lau @ 2017-09-27 21:37 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170927213756.1254938-1-kafai@fb.com>

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      | 1 +
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 7 ++++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33ccc474fb04..252f4bc9eb25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd6348269bf5..6d2137b4cf38 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -190,6 +190,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -829,6 +830,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 45970df3f820..11a7f82a55d1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
@@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
2.9.5

^ permalink raw reply related

* [PATCH net-next 1/5] bpf: Add name, load_time, uid and map_ids to bpf_prog_info
From: Martin KaFai Lau @ 2017-09-27 21:37 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170927213756.1254938-1-kafai@fb.com>

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject name that contains characters other than
isalnum() and '_'.  It will also reject name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On the input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On the output,
nr_map_ids tells the exact user_map_cnt and it will only
copy up to the userspace's map_ids buffer is allowed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  8 ++++++++
 kernel/bpf/syscall.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b672c50f160..33ccc474fb04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e43491ac4823..bd6348269bf5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -175,6 +175,8 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -210,6 +212,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -812,6 +815,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..45970df3f820 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	/* '\0' terminates dst */
+	*dst = 0;
+
+	return 0;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_name
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
-- 
2.9.5

^ permalink raw reply related

* [PATCH net-next 0/5] bpf: Extend bpf_{prog,map}_info
From: Martin KaFai Lau @ 2017-09-27 21:37 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team

This patch series adds more fields to bpf_prog_info and bpf_map_info.
Please see individual patch for details.

Martin KaFai Lau (5):
  bpf: Add name, load_time, uid and map_ids to bpf_prog_info
  bpf: Add map_name to bpf_map_info
  bpf: libbpf: Provide basic API support to specify BPF obj name
  bpf: Swap the order of checking prog_info and map_info
  bpf: Test new fields in bpf_attr and bpf_{prog,map}_info

 include/linux/bpf.h                         |   3 +
 include/uapi/linux/bpf.h                    |  10 ++
 kernel/bpf/syscall.c                        |  58 ++++++++-
 samples/bpf/bpf_load.c                      |   2 +
 samples/bpf/map_perf_test_user.c            |   1 +
 tools/include/uapi/linux/bpf.h              |  10 ++
 tools/lib/bpf/bpf.c                         |  57 ++++++--
 tools/lib/bpf/bpf.h                         |  23 +++-
 tools/lib/bpf/libbpf.c                      | 109 ++++++++++++----
 tools/testing/selftests/bpf/test_progs.c    | 195 +++++++++++++++++++++++-----
 tools/testing/selftests/bpf/test_verifier.c |   2 +-
 11 files changed, 385 insertions(+), 85 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH net-next 4/5] bpf: Swap the order of checking prog_info and map_info
From: Martin KaFai Lau @ 2017-09-27 21:37 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20170927213756.1254938-1-kafai@fb.com>

This patch swaps the checking order.  It now checks the map_info
first and then prog_info.  It is a prep work for adding
test to the newly added fields (the map_ids of prog_info field
in particular).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_progs.c | 58 +++++++++++++++++---------------
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 11ee25cea227..31ae27dc8d04 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -316,6 +316,36 @@ static void test_bpf_obj_id(void)
 			error_cnt++;
 		assert(!err);
 
+		/* Insert a magic value to the map */
+		map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
+		assert(map_fds[i] >= 0);
+		err = bpf_map_update_elem(map_fds[i], &array_key,
+					  &array_magic_value, 0);
+		assert(!err);
+
+		/* Check getting map info */
+		info_len = sizeof(struct bpf_map_info) * 2;
+		bzero(&map_infos[i], info_len);
+		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
+					     &info_len);
+		if (CHECK(err ||
+			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
+			  map_infos[i].key_size != sizeof(__u32) ||
+			  map_infos[i].value_size != sizeof(__u64) ||
+			  map_infos[i].max_entries != 1 ||
+			  map_infos[i].map_flags != 0 ||
+			  info_len != sizeof(struct bpf_map_info),
+			  "get-map-info(fd)",
+			  "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n",
+			  err, errno,
+			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
+			  info_len, sizeof(struct bpf_map_info),
+			  map_infos[i].key_size,
+			  map_infos[i].value_size,
+			  map_infos[i].max_entries,
+			  map_infos[i].map_flags))
+			goto done;
+
 		/* Check getting prog info */
 		info_len = sizeof(struct bpf_prog_info) * 2;
 		bzero(&prog_infos[i], info_len);
@@ -347,34 +377,6 @@ static void test_bpf_obj_id(void)
 			  !!memcmp(xlated_insns, zeros, sizeof(zeros))))
 			goto done;
 
-		map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
-		assert(map_fds[i] >= 0);
-		err = bpf_map_update_elem(map_fds[i], &array_key,
-					  &array_magic_value, 0);
-		assert(!err);
-
-		/* Check getting map info */
-		info_len = sizeof(struct bpf_map_info) * 2;
-		bzero(&map_infos[i], info_len);
-		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
-					     &info_len);
-		if (CHECK(err ||
-			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
-			  map_infos[i].key_size != sizeof(__u32) ||
-			  map_infos[i].value_size != sizeof(__u64) ||
-			  map_infos[i].max_entries != 1 ||
-			  map_infos[i].map_flags != 0 ||
-			  info_len != sizeof(struct bpf_map_info),
-			  "get-map-info(fd)",
-			  "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n",
-			  err, errno,
-			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
-			  info_len, sizeof(struct bpf_map_info),
-			  map_infos[i].key_size,
-			  map_infos[i].value_size,
-			  map_infos[i].max_entries,
-			  map_infos[i].map_flags))
-			goto done;
 	}
 
 	/* Check bpf_prog_get_next_id() */
-- 
2.9.5

^ permalink raw reply related

* [PATCH] rndis_host: support Novatel Verizon USB730L
From: Aleksander Morgado @ 2017-09-27 21:31 UTC (permalink / raw)
  To: oliver; +Cc: davem, linux-usb, netdev, Aleksander Morgado

Treat the ef/04/01 interface class/subclass/protocol combination used
by the Novatel Verizon USB730L (1410:9030) as a possible RNDIS
interface.

 T:  Bus=01 Lev=02 Prnt=02 Port=01 Cnt=02 Dev#= 17 Spd=480 MxCh= 0
 D:  Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs=  3
 P:  Vendor=1410 ProdID=9030 Rev=03.10
 S:  Manufacturer=Novatel Wireless
 S:  Product=MiFi USB730L
 S:  SerialNumber=0123456789ABCDEF
 C:  #Ifs= 3 Cfg#= 1 Atr=80 MxPwr=500mA
 I:  If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host
 I:  If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host
 I:  If#= 2 Alt= 0 #EPs= 1 Cls=03(HID  ) Sub=00 Prot=00 Driver=usbhid

Once the network interface is brought up, the user just needs to run a
DHCP client to get IP address and routing setup.

As a side note, other Novatel Verizon USB730L models with the same
vid:pid end up exposing a standard ECM interface which doesn't require
any other kernel update to make it work.

Signed-off-by: Aleksander Morgado <aleksander@aleksander.es>
---

Hey,

I'm not sure if binding this logic to a specific vid:pid (1410:9030) would be more appropriate here, or if it's ok to just bind class/subclass/protocol (as in the activesync case).
Let me know what you think.

---
 drivers/net/usb/cdc_ether.c  | 11 ++++++++++-
 drivers/net/usb/rndis_host.c |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index 8ab281b478f2..2df0bcc6d30b 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -54,11 +54,19 @@ static int is_wireless_rndis(struct usb_interface_descriptor *desc)
 		desc->bInterfaceProtocol == 3);
 }

+static int is_novatel_rndis(struct usb_interface_descriptor *desc)
+{
+	return (desc->bInterfaceClass == USB_CLASS_MISC &&
+		desc->bInterfaceSubClass == 4 &&
+		desc->bInterfaceProtocol == 1);
+}
+
 #else

 #define is_rndis(desc)		0
 #define is_activesync(desc)	0
 #define is_wireless_rndis(desc)	0
+#define is_novatel_rndis(desc)	0

 #endif

@@ -150,7 +158,8 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf)
 	 */
 	rndis = (is_rndis(&intf->cur_altsetting->desc) ||
 		 is_activesync(&intf->cur_altsetting->desc) ||
-		 is_wireless_rndis(&intf->cur_altsetting->desc));
+		 is_wireless_rndis(&intf->cur_altsetting->desc) ||
+		 is_novatel_rndis(&intf->cur_altsetting->desc));

 	memset(info, 0, sizeof(*info));
 	info->control = intf;
diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
index a151f267aebb..b807c91abe1d 100644
--- a/drivers/net/usb/rndis_host.c
+++ b/drivers/net/usb/rndis_host.c
@@ -632,6 +632,10 @@ static const struct usb_device_id	products [] = {
 	/* RNDIS for tethering */
 	USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3),
 	.driver_info = (unsigned long) &rndis_info,
+}, {
+	/* Novatel Verizon USB730L */
+	USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1),
+	.driver_info = (unsigned long) &rndis_info,
 },
 	{ },		// END
 };

^ permalink raw reply related

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Vinicius Costa Gomes @ 2017-09-27 21:14 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, intel-wired-lan,
	Jamal Hadi Salim, Jiri Pirko, andre.guedes, ivan.briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran, henrik
In-Reply-To: <CAM_iQpVrz+e=TYHYqaYyUWRP528GSZXx8EnN1yPhZK8eLvP4Yw@mail.gmail.com>

Hi,

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Tue, Sep 26, 2017 at 4:39 PM, Vinicius Costa Gomes
> <vinicius.gomes@intel.com> wrote:
>> +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
>> +{
>> +       struct cbs_sched_data *q = qdisc_priv(sch);
>> +       struct net_device *dev = qdisc_dev(sch);
>> +
>> +       if (!opt)
>> +               return -EINVAL;
>> +
>> +       /* FIXME: this means that we can only install this qdisc
>> +        * "under" mqprio. Do we need a more generic way to retrieve
>> +        * the queue, or do we pass the netdev_queue to the driver?
>> +        */
>> +       q->queue = TC_H_MIN(sch->parent) - 1 - netdev_get_num_tc(dev);
>> +
>> +       return cbs_change(sch, opt);
>> +}
>
> Yeah it is ugly to assume its parent is mqprio, at least you should
> error out if it is not the case.

Will add an error for this, for now.

>
> I am not sure how we can solve this elegantly, perhaps you should
> extend mqprio rather than add a new one?

Is the alternative hinted in the FIXME worse? Instead of passing the
index of the hardware queue to the driver we pass the pointer to a
netdev_queue to the driver and it "discovers" the HW queue from that.


Cheers,

^ permalink raw reply

* Re: [PATCH net] ipv6: remove incorrect WARN_ON() in fib6_del()
From: Wei Wang @ 2017-09-27 21:08 UTC (permalink / raw)
  To: Eric Dumazet, Martin KaFai Lau, David Miller
  Cc: Linux Kernel Network Developers
In-Reply-To: <CANn89i+uLBxzBCUP_TdyswBgAAhUMZajkLN7NGo2+9vuzVc_5A@mail.gmail.com>

On Tue, Sep 26, 2017 at 6:20 AM, Eric Dumazet <edumazet@google.com> wrote:
> On Mon, Sep 25, 2017 at 10:52 PM, Wei Wang <weiwan@google.com> wrote:
>> On Mon, Sep 25, 2017 at 7:23 PM, Eric Dumazet <edumazet@google.com> wrote:
>>> On Mon, Sep 25, 2017 at 7:07 PM, Martin KaFai Lau <kafai@fb.com> wrote:
>>>
>>>> I am probably still missing something.
>>>>
>>>> Considering the del operation should be under the writer lock,
>>>> if rt->rt6i_node should be NULL (for rt that has already been
>>>> removed from fib6), why this WARN_ON() is triggered?
>>>>
>>>> An example may help.
>>>>
>>>
>>> Look at the stack trace, you'll find the answers...
>>>
>>> ip6_link_failure() -> ip6_del_rt()
>>>
>>> Note that rt might have been deleted from the _tree_ already.
>>
>> Had a brief talk with Martin.
>> He has a valid point.
>> The current WARN_ON() code is as follows:
>> #if RT6_DEBUG >= 2
>>        if (rt->dst.obsolete > 0) {
>>                WARN_ON(fn);
>>                return -ENOENT;
>>        }
>> #endif
>>
>> The WARN_ON() only triggers when fn is not NULL. (I missed it before.)
>> In theory, fib6_del() calls fib6_del_route() which should set
>> rt->rt6i_node to NULL and rt->dst.obsolete to DST_OBSOLETE_DEAD within
>> the same write_lock session.
>> If those 2 values are inconsistent, it indicates something is wrong.
>> Will need more time to root cause the issue.
>>
>> Please ignore this patch. Sorry about the confusion.
>
> Oh well, for some reason I was seeing WARN_ON(1)  here, since this is
> a construct I often add in my tests ...

Just an update on this issue:
This WARNING issue should already be fixed by commit
7483cea79957312e9f8e9cf760a1bc5d6c507113:
Author: Ido Schimmel <idosch@mellanox.com>
Date:   Thu Aug 3 13:28:22 2017 +0200

    ipv6: fib: Unlink replaced routes from their nodes

    When a route is deleted its node pointer is set to NULL to indicate it's
    no longer linked to its node. Do the same for routes that are replaced.

    This will later allow us to test if a route is still in the FIB by
    checking its node pointer instead of its reference count.

    Signed-off-by: Ido Schimmel <idosch@mellanox.com>
    Signed-off-by: Jiri Pirko <jiri@mellanox.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

So no further action is needed on this.

Thanks.
Wei

^ permalink raw reply

* Re: [PATCH v2 2/2] ip_tunnel: add mpls over gre encapsulation
From: kbuild test robot @ 2017-09-27 20:45 UTC (permalink / raw)
  To: Amine Kherbouche
  Cc: kbuild-all, netdev, xeb, roopa, amine.kherbouche, equinox
In-Reply-To: <c40295d24ec5207f5be695a2f888bfa840e2ef2c.1506416988.git.amine.kherbouche@6wind.com>

[-- Attachment #1: Type: text/plain, Size: 753 bytes --]

Hi Amine,

[auto build test ERROR on net/master]
[also build test ERROR on v4.14-rc2 next-20170927]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Amine-Kherbouche/mpls-expose-stack-entry-function/20170928-012842
config: x86_64-rhel (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

>> ERROR: "mpls_forward" [net/ipv4/gre.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 40101 bytes --]

^ permalink raw reply

* RTL8169 vs low-latency (was: Re: Re: RTL 8169  linux driver question)
From: Kirill Smelkov @ 2017-09-27 20:30 UTC (permalink / raw)
  To: Francois Romieu, Hayes Wang
  Cc: David Laight, Stéphane ANCELOT, netdev, sancelot,
	Klaus Wölfel, Ivan Tyagov, Julien Muchembled,
	Vincent Pelletier, Rafael Monnerat, Hardik Juneja,
	Jean-Paul Smets
In-Reply-To: <20121128231822.GA10158@electric-eye.fr.zoreil.com>

+ klaus, ivan, ...

On Mon, Nov 26, 2012 at 09:15:19AM -0000, David Laight wrote:
> On Fri, Nov 23, 2012 at 08:14:37PM +0100, Stéphane ANCELOT wrote:
> > I had problem with it, my application sends a frame that is immediately
> > transmitted back by some slaves, there was abnormally 100us  lost
> > between the send and receive call.
> > 
> > Finally I found it was coming from the following register setup in the
> > driver :
> > 
> > RTL_W16(IntrMitigate, 0x5151);
> > 
> > Can you give me some details about it, since I do not have the RTL8169
> > programming guide.
> 
> That sounds like an 'interrupt mitigation' setting - which will cause
> RX interrupts to be delayed a short time in order to reduce the
> interrupt load on the kernel.
> 
> There is usually an 'ethtool' setting to disable interrupt mitigation.

On Wed, Nov 28, 2012 at 10:32:12AM +0800, hayeswang wrote:
> Francois Romieu [mailto:romieu@fr.zoreil.com] 
> [...]
> > Something like the patch below against net-next could help once I will
> > have tested it.
> > 
> > I completely guessed the Tx usec scale factor at gigabit 
> > speed (125 us, 
> > 100 us, disabled, who knows ?) and I have no idea which 
> > specific chipsets
> > it should work with.
> > 
> > Hayes, may I expect some hindsight regarding:
> > 1 - the availability of the IntrMitigate (0xe2) register through the
> >     8169, 8168 and 810x line of chipsets
> 
> 8169, 8168, and 8136(810x) serial chipsets support it.
> 
> > 2 - the Tx timer unit at gigabit speed
> 
> The unit of the timer depneds on both the speed and the setting of CPlusCmd
> (0xe0) bit 1 and bit 0.
> 
> For 8169
> bit[1:0] \ speed      1000M           100M            10M
> 0 0           320ns           2.56us          40.96us
> 0 1           2.56us          20.48us         327.7us
> 1 0           5.12us          40.96us         655.4us
> 1 1           10.24us         81.92us         1.31ms
> 
> For the other
> bit[1:0] \ speed      1000M           100M            10M
> 0 0           5us             2.56us          40.96us
> 0 1           40us            20.48us         327.7us
> 1 0           80us            40.96us         655.4us
> 1 1           160us           81.92us         1.31ms

On Thu, Nov 29, 2012 at 12:18:22AM +0100, Francois Romieu wrote:
> David Laight <David.Laight@ACULAB.COM> :
> [David's life]
> 
> 
> The version below fixes several bugs and refuses the frame or timing
> values it can't set. Hayes's Tx parameters still need to be pluged
> into rtl_coalesce_scale.
> 
> Rx delays seem lower than what I had expected when testing with a 8168b
> (XID 18000000).
> 
> diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
> index 248f883..d2594b1 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -349,6 +349,12 @@ enum rtl_registers {
>  	RxMaxSize	= 0xda,
>  	CPlusCmd	= 0xe0,
>  	IntrMitigate	= 0xe2,
> +
> +#define RTL_COALESCE_MASK	0x0f
> +#define RTL_COALESCE_SHIFT	4
> +#define RTL_COALESCE_T_MAX	(RTL_COALESCE_MASK)
> +#define RTL_COALESCE_FRAME_MAX	(RTL_COALESCE_MASK << 2)
> +
>
> [...]

Hello up there. Let me chime in into this a bit old thread.

Like Stéphane I care about timings. It is not real-time but in my case network
round-trip latency almost directly translates into client-server
database request/response time and thus it significantly affects
throughput for workloads with many serially performed requests.

We have many computers with gigabit Realtek NICs. For 2 such computers
connected to a gigabit store-and-forward switch the minimum round-trip
time for small pings (`ping -i 0 -w 3 -s 56 -q peer`) is ~ 30μs.

However it turned out that when Ethernet frame length transitions 127 ->
128 bytes (`ping -i 0 -w 3 -s {81 -> 82} -q peer`) the lowest RTT
transitions step-wise to ~ 270μs.

As David said this is RX interrupt mitigation done by NIC which creates
the latency. For workloads when low-latency is required with e.g. Intel,
BCM etc NIC drivers one just uses `ethtool -C rx-usecs ...` to reduce
the time NIC delays before interrupting CPU, but it turned out
`ethtool -C` is not supported by r8169 driver.

Like Stéphane I've traced the problem down to IntrMitigate being
hardcoded to != 0 for our chips (we have 8168 based NICs):

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/realtek/r8169.c#n5460
static void rtl_hw_start_8169(struct net_device *dev) {
	...
        /*
         * Undocumented corner. Supposedly:
         * (TxTimer << 12) | (TxPackets << 8) | (RxTimer << 4) | RxPackets
         */ 
        RTL_W16(IntrMitigate, 0x0000);

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/realtek/r8169.c#n6346
static void rtl_hw_start_8168(struct net_device *dev) {
	...
        RTL_W16(IntrMitigate, 0x5151);

and then I've also found this thread.

So could we please finally get support for tuning r8169 interrupt
coalescing in tree? (so that next poor soul who hits the problem does
not need to go all the way to dig into driver sources and internet
wildly and finally patch locally

        -RTL_W16(IntrMitigate, 0x5151);
	+RTL_W16(IntrMitigate, 0x5100);

guessing whether it is right or not and also having to care to deploy
the patch everywhere it needs to be used, etc...).

To do so I've took Francois's patch and reworked it a bit:

- updated to latest net-next.git;
- adjusted scaling setup based on feedback from Hayes to pick up scaling
  vector depending not only on link speed but also on CPlusCmd[0:1] and to
  adjust CPlusCmd[0:1] correspondingly when setting timings;
- improved a bit (I think so) error handling.

I've tested the patch on "RTL8168d/8111d" (XID 083000c0) and with it and
`ethtool -C rx-usecs 0 rx-frames 0` on both ends it improves:

- minimum RTT latency:

	~270μs ->  ~30μs (small packet),
	~330μs -> ~110μs (full 1.5K ethernet frame)

- average RTT latency:

	~480μs ->  ~50μs (small packet),
	~560μs -> ~125μs (full 1.5K ethernet frame)

( before:

	root@neo1:# ping -i 0 -w 3 -s 82 -q neo2
	PING neo2.kirr.nexedi.com (192.168.102.21) 82(110) bytes of data.
	
	--- neo2.kirr.nexedi.com ping statistics ---
	5906 packets transmitted, 5905 received, 0% packet loss, time 2999ms
	rtt min/avg/max/mdev = 0.274/0.485/0.607/0.026 ms, ipg/ewma 0.508/0.489 ms
	
	root@neo1:# ping -i 0 -w 3 -s 1472 -q neo2
	PING neo2.kirr.nexedi.com (192.168.102.21) 1472(1500) bytes of data.
	
	--- neo2.kirr.nexedi.com ping statistics ---
	5073 packets transmitted, 5073 received, 0% packet loss, time 2999ms
	rtt min/avg/max/mdev = 0.330/0.566/0.710/0.028 ms, ipg/ewma 0.591/0.544 ms

  after:

	root@neo1# ping -i 0 -w 3 -s 82 -q neo2
	PING neo2.kirr.nexedi.com (192.168.102.21) 82(110) bytes of data.
	
	--- neo2.kirr.nexedi.com ping statistics ---
	45815 packets transmitted, 45815 received, 0% packet loss, time 3000ms
	rtt min/avg/max/mdev = 0.036/0.051/0.368/0.010 ms, ipg/ewma 0.065/0.053 ms
	
	root@neo1:# ping -i 0 -w 3 -s 1472 -q neo2
	PING neo2.kirr.nexedi.com (192.168.102.21) 1472(1500) bytes of data.
	
	--- neo2.kirr.nexedi.com ping statistics ---
	21250 packets transmitted, 21250 received, 0% packet loss, time 3000ms
	rtt min/avg/max/mdev = 0.112/0.125/0.390/0.007 ms, ipg/ewma 0.141/0.125 ms

  the small -> 1.5K latency growth is understandable as it takes ~15μs
  to transmit 1.5K on 1Gbps on the wire and with 2 hosts and 1 switch
  and ICMP ECHO + ECHO reply the packet has to travel 4 ethernet
  segments which is already 60μs;
  
  probably something a bit else is also there as e.g. on Linux, even
  with `cpupower frequency-set -g performance`, on some computers I've
  noticed the kernel can be spending more time in software-only mode
  when incoming packets go in less frequently. E.g. this program can
  demonstrate the effect for ICMP ECHO processing:
  
  https://lab.nexedi.com/kirr/bcc/blob/43cfc13b/tools/pinglat.py )

Once again let's please work towards including the patch into mainline
kernel.

It remains to be clarified whether RX and TX timers use the same base.
For now I've set them equally, but Francois's origianl patch version
suggests it could be not the same.

I would appreciate feedback from Hayes on this and also on whether 128
raw length is the threshold below which packets are considered by NIC as
small and go in without interrupt moderation.

Thanks beforehand,
Kirill

---- 8< ----
From: Francois Romieu <romieu@fr.zoreil.com>
Subject: [PATCH] r8169: Add support for interrupt coalesce tuning (ethtool -C)

In particular with

	ethtool -C <ifname> rx-usecs 0 rx-frames 0

now it is possible to disable RX delays when NIC usage requires low-latency.

See this thread for example and background:

	https://www.spinics.net/lists/netdev/msg217665.html

( kirr:

  - adjusted scaling setup based on feedback from Hayes to pick up scaling
    vector depending not only on speed but also on CPlusCmd[0:1] and to
    adjust CPlusCmd[0:1] correspondingly when setting timings;
  - improved a bit (I think so) error handling. )

Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
---
 drivers/net/ethernet/realtek/r8169.c | 231 +++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index e03fcf914690..bebae6d8ea38 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -399,6 +399,12 @@ enum rtl_registers {
 	RxMaxSize	= 0xda,
 	CPlusCmd	= 0xe0,
 	IntrMitigate	= 0xe2,
+
+#define RTL_COALESCE_MASK	0x0f
+#define RTL_COALESCE_SHIFT	4
+#define RTL_COALESCE_T_MAX	(RTL_COALESCE_MASK)
+#define RTL_COALESCE_FRAME_MAX	(RTL_COALESCE_MASK << 2)
+
 	RxDescAddrLow	= 0xe4,
 	RxDescAddrHigh	= 0xe8,
 	EarlyTxThres	= 0xec,	/* 8169. Unit of 32 bytes. */
@@ -795,6 +801,7 @@ struct rtl8169_private {
 	u16 cp_cmd;
 
 	u16 event_slow;
+	const struct rtl_coalesce_info *coalesce_info;
 
 	struct mdio_ops {
 		void (*write)(struct rtl8169_private *, int, int);
@@ -2363,10 +2370,229 @@ static int rtl8169_nway_reset(struct net_device *dev)
 	return mii_nway_restart(&tp->mii);
 }
 
+/*
+ * Interrupt coalescing
+ *
+ * > 1 - the availability of the IntrMitigate (0xe2) register through the
+ * >     8169, 8168 and 810x line of chipsets
+ *
+ * 8169, 8168, and 8136(810x) serial chipsets support it.
+ *
+ * > 2 - the Tx timer unit at gigabit speed
+ *
+ * The unit of the timer depends on both the speed and the setting of CPlusCmd
+ * (0xe0) bit 1 and bit 0.
+ *
+ * For 8169
+ * bit[1:0] \ speed        1000M           100M            10M
+ * 0 0                     320ns           2.56us          40.96us
+ * 0 1                     2.56us          20.48us         327.7us
+ * 1 0                     5.12us          40.96us         655.4us
+ * 1 1                     10.24us         81.92us         1.31ms
+ *
+ * For the other
+ * bit[1:0] \ speed        1000M           100M            10M
+ * 0 0                     5us             2.56us          40.96us
+ * 0 1                     40us            20.48us         327.7us
+ * 1 0                     80us            40.96us         655.4us
+ * 1 1                     160us           81.92us         1.31ms
+ */
+
+/* rx/tx scale factors for one particular CPlusCmd[0:1] value */
+struct rtl_coalesce_scale {
+	/* Rx / Tx */
+	u32 nsecs[2];
+};
+
+/* rx/tx scale factors for all CPlusCmd[0:1] cases */
+struct rtl_coalesce_info {
+	u32 speed;
+	struct rtl_coalesce_scale scalev[4];	/* each CPlusCmd[0:1] case */
+};
+
+/* produce (r,t) pairs with each being in series of *1, *8, *8*2, *8*2*2 */
+#define rxtx_x1822(r, t) {		\
+	{{(r),		(t)}},		\
+	{{(r)*8,	(t)*8}},	\
+	{{(r)*8*2,	(t)*8*2}},	\
+	{{(r)*8*2*2,	(t)*8*2*2}},	\
+}
+static const struct rtl_coalesce_info rtl_coalesce_info_8169[] = {
+	/* speed	delays:     rx00   tx00	*/
+	{ SPEED_10,	rxtx_x1822(40960, 40960)	},
+	{ SPEED_100,	rxtx_x1822( 2560,  2560)	},
+	{ SPEED_1000,	rxtx_x1822(  320,   320)	},
+	{ 0 },
+};
+
+static const struct rtl_coalesce_info rtl_coalesce_info_8168_8136[] = {
+	/* speed	delays:     rx00   tx00	*/
+	{ SPEED_10,	rxtx_x1822(40960, 40960)	},
+	{ SPEED_100,	rxtx_x1822( 2560,  2560)	},
+	{ SPEED_1000,	rxtx_x1822( 5000,  5000)	},
+	{ 0 },
+};
+#undef rxtx_x1822
+
+/* get rx/tx scale vector corresponding to current speed */
+static const struct rtl_coalesce_info *rtl_coalesce_info(struct net_device *dev)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	struct ethtool_link_ksettings ecmd;
+	const struct rtl_coalesce_info *ci;
+	int rc;
+
+	rc = rtl8169_get_link_ksettings(dev, &ecmd);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	for (ci = tp->coalesce_info; ci->speed != 0; ci++) {
+		if (ecmd.base.speed == ci->speed) {
+			return ci;
+		}
+	}
+
+	return ERR_PTR(-ELNRNG);
+}
+
+static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	void __iomem *ioaddr = tp->mmio_addr;
+	const struct rtl_coalesce_info *ci;
+	const struct rtl_coalesce_scale *scale;
+	struct {
+		u32 *max_frames;
+		u32 *usecs;
+	} coal_settings [] = {
+		{ &ec->rx_max_coalesced_frames, &ec->rx_coalesce_usecs },
+		{ &ec->tx_max_coalesced_frames, &ec->tx_coalesce_usecs }
+	}, *p = coal_settings;
+	int i;
+	u16 w;
+
+	memset(ec, 0, sizeof(*ec));
+
+	/* get rx/tx scale corresponding to current speed and CPlusCmd[0:1] */
+	ci = rtl_coalesce_info(dev);
+	if (IS_ERR(ci))
+		return PTR_ERR(ci);
+
+	scale = &ci->scalev[RTL_R16(CPlusCmd) & 3];
+
+	/* read IntrMitigate and adjust according to scale */
+	for (w = RTL_R16(IntrMitigate); w; w >>= RTL_COALESCE_SHIFT, p++) {
+		*p->max_frames = (w & RTL_COALESCE_MASK) << 2;
+		w >>= RTL_COALESCE_SHIFT;
+		*p->usecs = w & RTL_COALESCE_MASK;
+	}
+
+	for (i = 0; i < 2; i++) {
+		p = coal_settings + i;
+		*p->usecs = (*p->usecs * scale->nsecs[i]) / 1000;
+
+		/*
+		 * ethtool_coalesce says it is illegal to set both usecs and
+		 * max_frames to 0.
+		 */
+		if (!*p->usecs && !*p->max_frames)
+			*p->max_frames = 1;
+	}
+
+	return 0;
+}
+
+/* choose appropriate scale factor and CPlusCmd[0:1] for (speed, nsec) */
+static const struct rtl_coalesce_scale *rtl_coalesce_choose_scale(
+			struct net_device *dev, u32 nsec, u16 *cp01)
+{
+	const struct rtl_coalesce_info *ci;
+	u16 i;
+
+	ci = rtl_coalesce_info(dev);
+	if (IS_ERR(ci))
+		return ERR_CAST(ci);
+
+	for (i = 0; i < 4; i++) {
+		u32 rxtx_maxscale = max(ci->scalev[i].nsecs[0],
+					ci->scalev[i].nsecs[1]);
+		if (nsec <= rxtx_maxscale * RTL_COALESCE_T_MAX) {
+			*cp01 = i;
+			return &ci->scalev[i];
+		}
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	void __iomem *ioaddr = tp->mmio_addr;
+	const struct rtl_coalesce_scale *scale;
+	struct {
+		u32 frames;
+		u32 usecs;
+	} coal_settings [] = {
+		{ ec->rx_max_coalesced_frames, ec->rx_coalesce_usecs },
+		{ ec->tx_max_coalesced_frames, ec->tx_coalesce_usecs }
+	}, *p = coal_settings;
+	u16 w = 0, cp01;
+	int i;
+
+	scale = rtl_coalesce_choose_scale(dev,
+			max(p[0].usecs, p[1].usecs) * 1000, &cp01);
+	if (IS_ERR(scale))
+		return PTR_ERR(scale);
+
+	for (i = 0; i < 2; i++, p++) {
+		u32 units;
+
+		/*
+		 * accept max_frames=1 we returned in rtl_get_coalesce.
+		 * accept it not only when usecs=0 because of e.g. the following scenario:
+		 *
+		 * - both rx_usecs=0 & rx_frames=0 in hardware (no delay on RX)
+		 * - rtl_get_coalesce returns rx_usecs=0, rx_frames=1
+		 * - then user does `ethtool -C eth0 rx-usecs 100`
+		 *
+		 * since ethtool sends to kernel whole ethtool_coalesce
+		 * settings, if we do not handle rx_usecs=!0, rx_frames=1
+		 * we'll reject it below in `frames % 4 != 0`.
+		 */
+		if (p->frames == 1) {
+			p->frames = 0;
+		}
+
+		units = p->usecs * 1000 / scale->nsecs[i];
+		if (p->frames > RTL_COALESCE_FRAME_MAX || p->frames % 4)
+			return -EINVAL;
+
+		w <<= RTL_COALESCE_SHIFT;
+		w |= units;
+		w <<= RTL_COALESCE_SHIFT;
+		w |= p->frames >> 2;
+	}
+
+	rtl_lock_work(tp);
+
+	RTL_W16(IntrMitigate, swab16(w));
+
+	tp->cp_cmd = (tp->cp_cmd & ~3) | cp01;
+	RTL_W16(CPlusCmd, tp->cp_cmd);
+	RTL_R16(CPlusCmd);
+
+	rtl_unlock_work(tp);
+
+	return 0;
+}
+
 static const struct ethtool_ops rtl8169_ethtool_ops = {
 	.get_drvinfo		= rtl8169_get_drvinfo,
 	.get_regs_len		= rtl8169_get_regs_len,
 	.get_link		= ethtool_op_get_link,
+	.get_coalesce		= rtl_get_coalesce,
+	.set_coalesce		= rtl_set_coalesce,
 	.set_settings		= rtl8169_set_settings,
 	.get_msglevel		= rtl8169_get_msglevel,
 	.set_msglevel		= rtl8169_set_msglevel,
@@ -8062,6 +8288,7 @@ static const struct rtl_cfg_info {
 	unsigned int align;
 	u16 event_slow;
 	unsigned features;
+	const struct rtl_coalesce_info *coalesce_info;
 	u8 default_ver;
 } rtl_cfg_infos [] = {
 	[RTL_CFG_0] = {
@@ -8070,6 +8297,7 @@ static const struct rtl_cfg_info {
 		.align		= 0,
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver,
 		.features	= RTL_FEATURE_GMII,
+		.coalesce_info	= rtl_coalesce_info_8169,
 		.default_ver	= RTL_GIGA_MAC_VER_01,
 	},
 	[RTL_CFG_1] = {
@@ -8078,6 +8306,7 @@ static const struct rtl_cfg_info {
 		.align		= 8,
 		.event_slow	= SYSErr | LinkChg | RxOverflow,
 		.features	= RTL_FEATURE_GMII | RTL_FEATURE_MSI,
+		.coalesce_info	= rtl_coalesce_info_8168_8136,
 		.default_ver	= RTL_GIGA_MAC_VER_11,
 	},
 	[RTL_CFG_2] = {
@@ -8087,6 +8316,7 @@ static const struct rtl_cfg_info {
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver |
 				  PCSTimeout,
 		.features	= RTL_FEATURE_MSI,
+		.coalesce_info	= rtl_coalesce_info_8168_8136,
 		.default_ver	= RTL_GIGA_MAC_VER_13,
 	}
 };
@@ -8450,6 +8680,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	tp->hw_start = cfg->hw_start;
 	tp->event_slow = cfg->event_slow;
+	tp->coalesce_info = cfg->coalesce_info;
 
 	tp->opts1_mask = (tp->mac_version != RTL_GIGA_MAC_VER_01) ?
 		~(RxBOVF | RxFOVF) : ~0;
-- 
2.14.1.581.gf28d330327

^ permalink raw reply related

* Re: [next-queue PATCH 2/3] net/sched: Introduce Credit Based Shaper (CBS) qdisc
From: Cong Wang @ 2017-09-27 20:23 UTC (permalink / raw)
  To: Vinicius Costa Gomes
  Cc: Linux Kernel Network Developers, intel-wired-lan,
	Jamal Hadi Salim, Jiri Pirko, andre.guedes, ivan.briano,
	jesus.sanchez-palencia, boon.leong.ong, richardcochran, henrik
In-Reply-To: <20170926233916.11774-3-vinicius.gomes@intel.com>

On Tue, Sep 26, 2017 at 4:39 PM, Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
> +static int cbs_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> +       struct cbs_sched_data *q = qdisc_priv(sch);
> +       struct net_device *dev = qdisc_dev(sch);
> +
> +       if (!opt)
> +               return -EINVAL;
> +
> +       /* FIXME: this means that we can only install this qdisc
> +        * "under" mqprio. Do we need a more generic way to retrieve
> +        * the queue, or do we pass the netdev_queue to the driver?
> +        */
> +       q->queue = TC_H_MIN(sch->parent) - 1 - netdev_get_num_tc(dev);
> +
> +       return cbs_change(sch, opt);
> +}

Yeah it is ugly to assume its parent is mqprio, at least you should
error out if it is not the case.

I am not sure how we can solve this elegantly, perhaps you should
extend mqprio rather than add a new one?

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Andrew Lunn @ 2017-09-27 20:18 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: Vivien Didelot, David Miller, netdev
In-Reply-To: <465be6d8-67d0-d176-1252-abb222bf0528@gmail.com>

On Wed, Sep 27, 2017 at 12:33:29PM -0700, Florian Fainelli wrote:
> On 09/27/2017 12:19 PM, Vivien Didelot wrote:
> > Hi Andrew, Florian,
> > 
> > Andrew Lunn <andrew@lunn.ch> writes:
> > 
> >> It took me a while to make this work with CONFIG_BRIDGE_VLAN_FILTERING
> >> enabled. Any change to enable hardware flooding needs careful testing
> >> for lots of different configurations. This is another reason i don't
> >> want to do it at the DSA level, until we have a good understanding
> >> what it means in each individual driver.
> > 
> > Then if we are worried about how broadcast flooding is handled on
> > different switches, we can provide a new .flood_broadcast(ds, vid)
> > switch operation for the drivers to implement.
> 
> We don't really have a good visibility on the number of switches
> requiring special configuration for broadcast addresses nor how this
> would have to happen so it would be a tad difficult to define an
> appropriate API with a single user.

Yes, i agree with this. We should wait before adding a generic
solution. I want to wait until a few drivers do whatever is needed for
hardware broadcast. We can then see what is common, and what is
different, find an API to suit and do some refactoring.

	   Andrew

^ permalink raw reply

* Re: [PATCH net-next 2/6] net: dsa: {e}dsa: set offload_fwd_mark on received packets
From: Andrew Lunn @ 2017-09-27 20:11 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: David Miller, Vivien Didelot, netdev
In-Reply-To: <1c4ba973-91c0-ae81-1c4b-d0281f5c7517@gmail.com>

On Wed, Sep 27, 2017 at 12:46:35PM -0700, Florian Fainelli wrote:
> On 09/26/2017 03:25 PM, Andrew Lunn wrote:
> > The software bridge needs to know if a packet has already been bridged
> > by hardware offload to ports in the same hardware offload, in order
> > that it does not re-flood them, causing duplicates. This is
> > particularly true for broadcast and multicast traffic which the host
> > has requested.
> > 
> > By setting offload_fwd_mark in the skb the bridge will only flood to
> > ports in other offloads and other netifs. Set this flag in the DSA and
> > EDSA tag driver.
> 
> Is not there some kind of forwarding code/reason code being provided in
> the EDSA/DSA tag that tell you why this packet was sent to the CPU in
> the first place?

Hi Florian

There are some codes, but nothing specific to broadcast, or ATU
misses. I'm also trying to keep the code generic so it could be a
template for other drivers. Many of the tagging schemes don't provide
a reason code. So i want that any frame that comes from the switch has
no need to go back to the switch. KISS.
 
> What is the impact on non-broadcast traffic, e.g: multicast and unicast?

The bridge uses this flag when flooding. unicast traffic from the
switch should not need flooding. Either it is known in the switch and
hence won't be forwarded to the host, or it is unknown in the switch,
so it probably is on some other interface.

My testing with multicast has not shown issues. The switch pushes down
mdb entries, which causes frames to be replicated out ports. So again,
there should not be a need to pass the frame back to the switch. But
it is possible i missed a corner case or two...
 
> Nit: I don't really have a solution on how to order patches, but until
> the next 4 patches get in, I suppose we temporarily have broadcast
> flooding by the bridge "broken"? Ordering in the opposite way would
> probably result in an equally bad situation so...

Yes, it is an issue. I could put this patch last. We then get
duplication of broadcast...

Which is the lesser of two evils?

      Andrew

^ permalink raw reply

* Re: [PATCH net-next 2/6] net: dsa: {e}dsa: set offload_fwd_mark on received packets
From: Florian Fainelli @ 2017-09-27 19:46 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: Vivien Didelot, netdev
In-Reply-To: <1506464764-12699-3-git-send-email-andrew@lunn.ch>

On 09/26/2017 03:25 PM, Andrew Lunn wrote:
> The software bridge needs to know if a packet has already been bridged
> by hardware offload to ports in the same hardware offload, in order
> that it does not re-flood them, causing duplicates. This is
> particularly true for broadcast and multicast traffic which the host
> has requested.
> 
> By setting offload_fwd_mark in the skb the bridge will only flood to
> ports in other offloads and other netifs. Set this flag in the DSA and
> EDSA tag driver.

Is not there some kind of forwarding code/reason code being provided in
the EDSA/DSA tag that tell you why this packet was sent to the CPU in
the first place?

What is the impact on non-broadcast traffic, e.g: multicast and unicast?

Nit: I don't really have a solution on how to order patches, but until
the next 4 patches get in, I suppose we temporarily have broadcast
flooding by the bridge "broken"? Ordering in the opposite way would
probably result in an equally bad situation so...

> 
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>
> ---
> 
> v2
> --
> For the moment, do this in the tag drivers, not the generic code.
> Once we get more test results from other switches, maybe move it back
> again.
> ---
>  net/dsa/tag_dsa.c  | 1 +
>  net/dsa/tag_edsa.c | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
> index fbf9ca954773..ea6ada9d5016 100644
> --- a/net/dsa/tag_dsa.c
> +++ b/net/dsa/tag_dsa.c
> @@ -154,6 +154,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
>  	}
>  
>  	skb->dev = ds->ports[source_port].netdev;
> +	skb->offload_fwd_mark = 1;
>  
>  	return skb;
>  }
> diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
> index 76367ba1b2e2..a961b22a7018 100644
> --- a/net/dsa/tag_edsa.c
> +++ b/net/dsa/tag_edsa.c
> @@ -173,6 +173,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
>  	}
>  
>  	skb->dev = ds->ports[source_port].netdev;
> +	skb->offload_fwd_mark = 1;
>  
>  	return skb;
>  }
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Florian Fainelli @ 2017-09-27 19:33 UTC (permalink / raw)
  To: Vivien Didelot, Andrew Lunn; +Cc: David Miller, netdev
In-Reply-To: <8760c4t0df.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

On 09/27/2017 12:19 PM, Vivien Didelot wrote:
> Hi Andrew, Florian,
> 
> Andrew Lunn <andrew@lunn.ch> writes:
> 
>> It took me a while to make this work with CONFIG_BRIDGE_VLAN_FILTERING
>> enabled. Any change to enable hardware flooding needs careful testing
>> for lots of different configurations. This is another reason i don't
>> want to do it at the DSA level, until we have a good understanding
>> what it means in each individual driver.
> 
> Then if we are worried about how broadcast flooding is handled on
> different switches, we can provide a new .flood_broadcast(ds, vid)
> switch operation for the drivers to implement.

We don't really have a good visibility on the number of switches
requiring special configuration for broadcast addresses nor how this
would have to happen so it would be a tad difficult to define an
appropriate API with a single user.

In general, single user "generic" facilities tend to be biased towards
their particular problem space (c.f: devlink) so a generic interface to
call into HW specific details does not usually sell well...
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Vivien Didelot @ 2017-09-27 19:19 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli; +Cc: David Miller, netdev
In-Reply-To: <20170927184636.GD12394@lunn.ch>

Hi Andrew, Florian,

Andrew Lunn <andrew@lunn.ch> writes:

> It took me a while to make this work with CONFIG_BRIDGE_VLAN_FILTERING
> enabled. Any change to enable hardware flooding needs careful testing
> for lots of different configurations. This is another reason i don't
> want to do it at the DSA level, until we have a good understanding
> what it means in each individual driver.

Then if we are worried about how broadcast flooding is handled on
different switches, we can provide a new .flood_broadcast(ds, vid)
switch operation for the drivers to implement.

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Vivien Didelot @ 2017-09-27 18:59 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: David Miller, netdev
In-Reply-To: <20170927183650.GC12394@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

>> Adding the broadcast address to an Ethernet switch's FDB is pretty
>> generic and mv88e6xxx mustn't be the only driver doing this.
>
> Actually, it is. All the others seem to do this in hardware without
> needing an FDB. Since mv88e6xxx is the only one requiring it, it has
> to be done in the mv88e6xxx driver.

Adding the broadcast address from the DSA layer wouldn't hurt and make
things pretty obvious. This would also avoid drivers to get
unnecessarily complex. A .port_vlan_add implementation must remain
simple and mustn't do more than adding a VLAN entry.

Don't forget that we want the DSA drivers to be dump and have the core
logic of Ethernet switch handling resides in DSA core itself.

If some switch chips can flood broadcast without an FDB entry, good for
them, they can skip it. We will have the same issue for special L2
Multicast destination addresses, some switches have special bits to
consider them as management, some others don't and require to load the
ATU with them.

Regarding Marvell, what value do you have for the global FloodBC bit
(Global 2, offset 0x05)?


Thanks,

        Vivien

^ permalink raw reply

* Re: [PATCH net-next] ipv6: do lazy dst->__use update when per cpu dst is available
From: Martin KaFai Lau @ 2017-09-27 19:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Wei Wang, Paolo Abeni, Linux Kernel Network Developers,
	David S. Miller, Hannes Frederic Sowa
In-Reply-To: <CANn89iJ0t8Vpukak3WEOA1x4M4hk-Z9hDkcSqG9+7qLsbuJQZg@mail.gmail.com>

On Wed, Sep 27, 2017 at 06:03:33PM +0000, Eric Dumazet wrote:
> >> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> 
> >
> > Hi Paolo,
> >
> > Eric and I discussed about this issue recently as well :).
> >
> > What about the following change:
> >
> > diff --git a/include/net/dst.h b/include/net/dst.h
> > index 93568bd0a352..33e1d86bcef6 100644
> > --- a/include/net/dst.h
> > +++ b/include/net/dst.h
> > @@ -258,14 +258,18 @@ static inline void dst_hold(struct dst_entry *dst)
> >  static inline void dst_use(struct dst_entry *dst, unsigned long time)
> >  {
> >         dst_hold(dst);
> > -       dst->__use++;
> > -       dst->lastuse = time;
> > +       if (dst->lastuse != time) {
> > +               dst->__use++;
> > +               dst->lastuse = time;
> > +       }
> >  }
> >
> >  static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
> >  {
> > -       dst->__use++;
> > -       dst->lastuse = time;
> > +       if (dst->lastuse != time) {
> > +               dst->__use++;
> > +               dst->lastuse = time;
> > +       }
> >  }
> >
> >  static inline struct dst_entry *dst_clone(struct dst_entry *dst)
> > diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> > index 26cc9f483b6d..e195f093add3 100644
> > --- a/net/ipv6/route.c
> > +++ b/net/ipv6/route.c
> > @@ -1170,8 +1170,7 @@ struct rt6_info *ip6_pol_route(struct net *net,
> > struct fib6_table *table,
> >
> >                 struct rt6_info *pcpu_rt;
> >
> > -               rt->dst.lastuse = jiffies;
> > -               rt->dst.__use++;
> > +               dst_use_noref(rt, jiffies);
> >                 pcpu_rt = rt6_get_pcpu_route(rt);
> >
> >                 if (pcpu_rt) {
> >
> >
> > This way we always only update dst->__use and dst->lastuse at most
> > once per jiffy. And we don't really need to update pcpu and then do
> > the copy over from pcpu_rt to rt operation.
> >
> > Another thing is that I don't really see any places making use of
> > dst->__use. So maybe we can also get rid of this dst->__use field?
> >
> > Thanks.
> > Wei
> 
> Paolo, given we are very close to send Wei awesome work about IPv6
> routing cache,
> could we ask you to wait few days before doing the same work from your side ?
> 
> Main issue is the rwlock, and we are converting it to full RCU.
+1

We can get a better picture of other optimizations once
the rwlock is removed.

> 
> You are sending patches that are making our job very difficult IMO.
> 
> We chose to mimic the change done in neighbour code years ago
> ( 0ed8ddf4045fcfcac36bad753dc4046118c603ec )
> 
> Thanks.

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Andrew Lunn @ 2017-09-27 18:46 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: Vivien Didelot, David Miller, netdev
In-Reply-To: <9733da02-0b69-33f0-de8a-63bc5cae6bb4@gmail.com>

> What if I don't have CONFIG_BRIDGE_VLAN_FILTERING enabled, what happens
> in that case, would not this result in not programming the broadcast
> address?

Hi Florian

It took me a while to make this work with CONFIG_BRIDGE_VLAN_FILTERING
enabled. Any change to enable hardware flooding needs careful testing
for lots of different configurations. This is another reason i don't
want to do it at the DSA level, until we have a good understanding
what it means in each individual driver.

     Andrew

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Florian Fainelli @ 2017-09-27 18:37 UTC (permalink / raw)
  To: Vivien Didelot, Andrew Lunn, David Miller; +Cc: netdev
In-Reply-To: <8737786lua.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

On 09/27/2017 11:24 AM, Vivien Didelot wrote:
> Hi Andrew,
> 
> Andrew Lunn <andrew@lunn.ch> writes:
> 
>> +static int mv88e6xxx_port_add_broadcast(struct mv88e6xxx_chip *chip, int port,
>> +					u16 vid)
>> +{
>> +	const char broadcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
>> +
>> +	return mv88e6xxx_port_db_load_purge(
>> +		chip, port, broadcast, vid,
>> +		MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC);
> 
> Please don't do this. This is not a valid coding style and has already
> shown to be a bad example for other DSA drivers copying mv88e6xxx.
> 
> Simply declare u8 state = MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC above...
> 
>> +}
>> +
>> +static int mv88e6xxx_broadcast_setup(struct mv88e6xxx_chip *chip, u16 vid)
>> +{
>> +	int port;
>> +	int err;
>> +
>> +	for (port = 0; port < mv88e6xxx_num_ports(chip); ++port) {
>> +		err = mv88e6xxx_port_add_broadcast(chip, port, vid);
>> +		if (err)
>> +			return err;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int _mv88e6xxx_port_vlan_add(struct mv88e6xxx_chip *chip, int port,
>>  				    u16 vid, u8 member)
>>  {
>> @@ -1247,7 +1271,11 @@ static int _mv88e6xxx_port_vlan_add(struct mv88e6xxx_chip *chip, int port,
>>  
>>  	vlan.member[port] = member;
>>  
>> -	return mv88e6xxx_vtu_loadpurge(chip, &vlan);
>> +	err = mv88e6xxx_vtu_loadpurge(chip, &vlan);
>> +	if (err)
>> +		return err;
>> +
>> +	return mv88e6xxx_broadcast_setup(chip, vid);
>>  }
>>  
>>  static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
>> @@ -2025,6 +2053,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
>>  	if (err)
>>  		goto unlock;
>>  
>> +	err = mv88e6xxx_broadcast_setup(chip, 0);
>> +	if (err)
>> +		goto unlock;
>> +
>>  	err = mv88e6xxx_pot_setup(chip);
>>  	if (err)
>>  		goto unlock;
> 
> 
> Adding the broadcast address to an Ethernet switch's FDB is pretty
> generic and mv88e6xxx mustn't be the only driver doing this.

I have not had time to test Andrew's IGMP patch set on bcm_sf2/b53, but
while I agree that adding a broadcast address using a FDB entry is
generic in premise, we don't know yet whether other switch drivers need
that or not, so until we do, it seems like Andrew's approach is
appropriate here by keeping this local to mv88e6xxx.

> 
> They do not have to care about the broadcast address, this is just a
> standard MDB entry for them. This must be moved up to the DSA layer.
> 
> Adding the broadcast address in dsa_port_vlan_add and dsa_port_enable
> like this should be enough: http://ix.io/AmS
> 

What if I don't have CONFIG_BRIDGE_VLAN_FILTERING enabled, what happens
in that case, would not this result in not programming the broadcast
address?
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Andrew Lunn @ 2017-09-27 18:36 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: David Miller, netdev
In-Reply-To: <8737786lua.fsf@weeman.i-did-not-set--mail-host-address--so-tickle-me>

> Adding the broadcast address to an Ethernet switch's FDB is pretty
> generic and mv88e6xxx mustn't be the only driver doing this.

Actually, it is. All the others seem to do this in hardware without
needing an FDB. Since mv88e6xxx is the only one requiring it, it has
to be done in the mv88e6xxx driver.

   Andrew

^ permalink raw reply

* Re: [patch net-next v3 00/12] mlxsw: Add support for offloading IPv4 multicast routes
From: David Miller @ 2017-09-27 18:33 UTC (permalink / raw)
  To: jiri; +Cc: netdev, yotamg, idosch, mlxsw, nikolay, andrew, linyunsheng
In-Reply-To: <20170927062322.5476-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 27 Sep 2017 08:23:10 +0200

> This patch-set introduces offloading of the kernel IPv4 multicast router
> logic in the Spectrum driver.

Series applied, thank you.

^ permalink raw reply

* Re: [PATCH net-next 0/6] mv88e6xxx broadcast flooding in hardware
From: Vivien Didelot @ 2017-09-27 18:28 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1506464764-12699-1-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> This patchset makes the mv88e6xxx driver perform flooding in hardware,
> rather than let the software bridge perform the flooding. This is a
> prerequisite for IGMP snooping on the bridge interface.
>
> In order to make hardware broadcasting work, a few other issues need
> fixing or improving. SWITCHDEV_ATTR_ID_PORT_PARENT_ID is broken, which
> is apparent when testing on the ZII devel board with multiple
> switches.
>
> Some of these patches are taken from a previous RFC patchset of IGMP
> support. Hence the v2 comments...

mlxsw and others return BR_FLOOD and BR_MCAST_FLOOD in
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT, is this something DSA needs
here?


Thanks,

        Vivien

^ permalink raw reply

* Re: [PATCH net-next 6/6] net: dsa: mv88e6xxx: Flood broadcast frames in hardware
From: Vivien Didelot @ 2017-09-27 18:24 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1506464764-12699-7-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> +static int mv88e6xxx_port_add_broadcast(struct mv88e6xxx_chip *chip, int port,
> +					u16 vid)
> +{
> +	const char broadcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
> +
> +	return mv88e6xxx_port_db_load_purge(
> +		chip, port, broadcast, vid,
> +		MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC);

Please don't do this. This is not a valid coding style and has already
shown to be a bad example for other DSA drivers copying mv88e6xxx.

Simply declare u8 state = MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC above...

> +}
> +
> +static int mv88e6xxx_broadcast_setup(struct mv88e6xxx_chip *chip, u16 vid)
> +{
> +	int port;
> +	int err;
> +
> +	for (port = 0; port < mv88e6xxx_num_ports(chip); ++port) {
> +		err = mv88e6xxx_port_add_broadcast(chip, port, vid);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +
>  static int _mv88e6xxx_port_vlan_add(struct mv88e6xxx_chip *chip, int port,
>  				    u16 vid, u8 member)
>  {
> @@ -1247,7 +1271,11 @@ static int _mv88e6xxx_port_vlan_add(struct mv88e6xxx_chip *chip, int port,
>  
>  	vlan.member[port] = member;
>  
> -	return mv88e6xxx_vtu_loadpurge(chip, &vlan);
> +	err = mv88e6xxx_vtu_loadpurge(chip, &vlan);
> +	if (err)
> +		return err;
> +
> +	return mv88e6xxx_broadcast_setup(chip, vid);
>  }
>  
>  static void mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port,
> @@ -2025,6 +2053,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
>  	if (err)
>  		goto unlock;
>  
> +	err = mv88e6xxx_broadcast_setup(chip, 0);
> +	if (err)
> +		goto unlock;
> +
>  	err = mv88e6xxx_pot_setup(chip);
>  	if (err)
>  		goto unlock;


Adding the broadcast address to an Ethernet switch's FDB is pretty
generic and mv88e6xxx mustn't be the only driver doing this.

They do not have to care about the broadcast address, this is just a
standard MDB entry for them. This must be moved up to the DSA layer.

Adding the broadcast address in dsa_port_vlan_add and dsa_port_enable
like this should be enough: http://ix.io/AmS


Thanks,

        Vivien

^ permalink raw reply

* Re: [PATCH net-next] ipv6: do lazy dst->__use update when per cpu dst is available
From: Cong Wang @ 2017-09-27 18:25 UTC (permalink / raw)
  To: Wei Wang
  Cc: Paolo Abeni, Eric Dumazet, Martin KaFai Lau,
	Linux Kernel Network Developers, David S. Miller,
	Hannes Frederic Sowa
In-Reply-To: <CAEA6p_DZxGesjymubWoWx7hQjaZ7Vdchh=cdOW9tAr4F7QvzXg@mail.gmail.com>

On Wed, Sep 27, 2017 at 10:44 AM, Wei Wang <weiwan@google.com> wrote:
> Another thing is that I don't really see any places making use of
> dst->__use. So maybe we can also get rid of this dst->__use field?

It is used in rtnl_put_cacheinfo():

        struct rta_cacheinfo ci = {
                .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
                .rta_used = dst->__use,
                .rta_clntref = atomic_read(&(dst->__refcnt)),
                .rta_error = error,
                .rta_id =  id,
        };

^ permalink raw reply

* Re: [PATCH v2 16/16] net: Add support for networking over Thunderbolt cable
From: David Miller @ 2017-09-27 18:23 UTC (permalink / raw)
  To: mika.westerberg
  Cc: gregkh, andreas.noever, michael.jamet, yehezkel.bernat,
	amir.jer.levy, Mario.Limonciello, lukas, andriy.shevchenko,
	andrew, linux-kernel, netdev
In-Reply-To: <20170927172702.GE4630@lahna.fi.intel.com>

From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Wed, 27 Sep 2017 20:27:02 +0300

> On Wed, Sep 27, 2017 at 09:27:09AM -0700, David Miller wrote:
>> From: Mika Westerberg <mika.westerberg@linux.intel.com>
>> Date: Wed, 27 Sep 2017 16:42:38 +0300
>> 
>> > Using build_skb() then would require to allocate larger buffer, that
>> > includes NET_SKB_PAD + SKB_DATA_ALIGN(skb_shared_info) and that exceeds
>> > page size. Is this something supported by build_skb()? It was not clear
>> > to me based on the code and other users of build_skb() but I may be
>> > missing something.
>> 
>> You need NET_SKB_PAD before and SKB_DATA_ALIGN(skb_shared_info) afterwards.
>> An order 1 page, if that's what you need, should work just fine.
> 
> I mean in order to fit a single ThunderboltIP frame, I would need to
> allocate NET_SKB_PAD+4096+SKB_DATA_ALIGN(skb_shared_info) size buffer.

Which would be an order 1 page or 8192 bytes.

> Is that still fine for build_skb()? Also can I use that with
> skb_add_rx_frag() which seem to take single page?

Again, an order 1 page should work fine.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox