Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 04/10] net: constify net_ns_type_operations
From: Stephen Hemminger @ 2017-08-18 20:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170818204628.17147-1-sthemmin@microsoft.com>

This can be const.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 include/linux/netdevice.h | 2 +-
 net/core/net-sysfs.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaa77bd9cb80..b0c928598dab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4028,7 +4028,7 @@ static inline void netdev_class_remove_file(const struct class_attribute *class_
 	netdev_class_remove_file_ns(class_attr, NULL);
 }
 
-extern struct kobj_ns_type_operations net_ns_type_operations;
+extern const struct kobj_ns_type_operations net_ns_type_operations;
 
 const char *netdev_drivername(const struct net_device *dev);
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 99061b0a1ebd..2de441692f28 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1461,7 +1461,7 @@ static const void *net_netlink_ns(struct sock *sk)
 	return sock_net(sk);
 }
 
-struct kobj_ns_type_operations net_ns_type_operations = {
+const struct kobj_ns_type_operations net_ns_type_operations = {
 	.type = KOBJ_NS_TYPE_NET,
 	.current_may_mount = net_current_may_mount,
 	.grab_current_ns = net_grab_current_ns,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 02/10] net: constify netdev_class_file
From: Stephen Hemminger @ 2017-08-18 20:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170818204628.17147-1-sthemmin@microsoft.com>

These functions are wrappers around class_create_file, which can take a
const attribute.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 include/linux/netdevice.h | 8 ++++----
 net/core/net-sysfs.c      | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0f1c4cb2441e..eaa77bd9cb80 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4013,17 +4013,17 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi
 	return rc;
 }
 
-int netdev_class_create_file_ns(struct class_attribute *class_attr,
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
 				const void *ns);
-void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
 				 const void *ns);
 
-static inline int netdev_class_create_file(struct class_attribute *class_attr)
+static inline int netdev_class_create_file(const struct class_attribute *class_attr)
 {
 	return netdev_class_create_file_ns(class_attr, NULL);
 }
 
-static inline void netdev_class_remove_file(struct class_attribute *class_attr)
+static inline void netdev_class_remove_file(const struct class_attribute *class_attr)
 {
 	netdev_class_remove_file_ns(class_attr, NULL);
 }
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 46ff41bf0210..40937ee63f14 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1618,14 +1618,14 @@ int netdev_register_kobject(struct net_device *ndev)
 	return error;
 }
 
-int netdev_class_create_file_ns(struct class_attribute *class_attr,
+int netdev_class_create_file_ns(const struct class_attribute *class_attr,
 				const void *ns)
 {
 	return class_create_file_ns(&net_class, class_attr, ns);
 }
 EXPORT_SYMBOL(netdev_class_create_file_ns);
 
-void netdev_class_remove_file_ns(struct class_attribute *class_attr,
+void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
 				 const void *ns)
 {
 	class_remove_file_ns(&net_class, class_attr, ns);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 03/10] net: make net_class ro_after_init
From: Stephen Hemminger @ 2017-08-18 20:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170818204628.17147-1-sthemmin@microsoft.com>

The net_class in sysfs is only modified on init.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 net/core/net-sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 40937ee63f14..99061b0a1ebd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1511,7 +1511,7 @@ static const void *net_namespace(struct device *d)
 	return dev_net(dev);
 }
 
-static struct class net_class = {
+static struct class net_class __ro_after_init = {
 	.name = "net",
 	.dev_release = netdev_release,
 	.dev_groups = net_class_groups,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 01/10] net: don't decrement kobj reference count on init failure
From: Stephen Hemminger @ 2017-08-18 20:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger
In-Reply-To: <20170818204628.17147-1-sthemmin@microsoft.com>

If kobject_init_and_add failed, then the failure path would
decrement the reference count of the queue kobject whose reference
count was already zero.

Fixes: 114cf5802165 ("bql: Byte queue limits")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 net/core/net-sysfs.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b4f9922b6f23..46ff41bf0210 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -917,21 +917,20 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
 	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
 	    "rx-%u", index);
 	if (error)
-		goto exit;
+		return error;
 
 	if (dev->sysfs_rx_queue_group) {
 		error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
-		if (error)
-			goto exit;
+		if (error) {
+			kobject_put(kobj);
+			return error;
+		}
 	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
 	dev_hold(queue->dev);
 
 	return error;
-exit:
-	kobject_put(kobj);
-	return error;
 }
 #endif /* CONFIG_SYSFS */
 
@@ -1339,21 +1338,20 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
 	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
 	    "tx-%u", index);
 	if (error)
-		goto exit;
+		return error;
 
 #ifdef CONFIG_BQL
 	error = sysfs_create_group(kobj, &dql_group);
-	if (error)
-		goto exit;
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
 #endif
 
 	kobject_uevent(kobj, KOBJ_ADD);
 	dev_hold(queue->dev);
 
 	return 0;
-exit:
-	kobject_put(kobj);
-	return error;
 }
 #endif /* CONFIG_SYSFS */
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 00/10] sysfs related cleanups
From: Stephen Hemminger @ 2017-08-18 20:46 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger

Network sysfs infrastructure changes. Mostly related to using ro_after_init
to make function tables immutable.

Stephen Hemminger (10):
  net: don't decrement kobj reference count on init failure
  net: constify netdev_class_file
  net: make net_class ro_after_init
  net: constify net_ns_type_operations
  net: make net sysfs attributes ro_after_init
  net: drop unused attribute argument from sysfs queue funcs
  net: make BQL sysfs attributes ro_after_init
  net: make queue attributes ro_after_init
  net: mark receive queue attributes ro_after_init
  net: style cleanups

 include/linux/netdevice.h |  15 ++--
 net/core/net-sysfs.c      | 222 ++++++++++++++++++++++------------------------
 2 files changed, 111 insertions(+), 126 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net] tun: handle register_netdevice() failures properly
From: Eric Dumazet @ 2017-08-18 20:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

syzkaller reported a double free [1], caused by the fact
that tun driver was not updated properly when priv_destructor
was added.

When/if register_netdevice() fails, priv_destructor() must have been
called already.

[1]
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023

CPU: 0 PID: 2919 Comm: syzkaller227220 Not tainted 4.13.0-rc4+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x7f/0x260 mm/kasan/report.c:252
 kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333
 kasan_slab_free+0xa0/0xc0 mm/kasan/kasan.c:514
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_set_iff drivers/net/tun.c:1884 [inline]
 __tun_chr_ioctl+0x2ce6/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x443ff9
RSP: 002b:00007ffc34271f68 EFLAGS: 00000217 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000443ff9
RDX: 0000000020533000 RSI: 00000000400454ca RDI: 0000000000000003
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401ce0
R13: 0000000000401d70 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:551
 kmem_cache_alloc_trace+0x101/0x6f0 mm/slab.c:3627
 kmalloc include/linux/slab.h:493 [inline]
 kzalloc include/linux/slab.h:666 [inline]
 selinux_tun_dev_alloc_security+0x49/0x170 security/selinux/hooks.c:5012
 security_tun_dev_alloc_security+0x6d/0xa0 security/security.c:1506
 tun_set_iff drivers/net/tun.c:1839 [inline]
 __tun_chr_ioctl+0x1730/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x6e/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_free_netdev+0x13b/0x1b0 drivers/net/tun.c:1563
 register_netdevice+0x8d0/0xee0 net/core/dev.c:7605
 tun_set_iff drivers/net/tun.c:1859 [inline]
 __tun_chr_ioctl+0x1caf/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801d2843b40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 0 bytes inside of
 32-byte region [ffff8801d2843b40, ffff8801d2843b60)
The buggy address belongs to the page:
page:ffffea000660cea8 count:1 mapcount:0 mapping:ffff8801d2843000 index:0xffff8801d2843fc1
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801d2843000 ffff8801d2843fc1 000000010000003f
raw: ffffea0006626a40 ffffea00066141a0 ffff8801dbc00100
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2843a00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843a80: 00 00 00 fc fc fc fc fc fb fb fb fb fc fc fc fc
>ffff8801d2843b00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc
                                           ^
 ffff8801d2843b80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843c00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
==================================================================

Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/tun.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 32ad87345f57..0a2c0a42283f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1879,6 +1879,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_detach:
 	tun_detach_all(dev);
+	/* register_netdevice() already called tun_free_netdev() */
+	goto err_free_dev;
+
 err_free_flow:
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);

^ permalink raw reply related

* [PATCH v5 4/4] can: m_can: Add call to of_can_transceiver
From: Franklin S Cooper Jr @ 2017-08-18 20:21 UTC (permalink / raw)
  To: linux-kernel, devicetree, netdev, linux-can, wg, mkl, robh+dt,
	quentin.schulz, dev.kurt, andrew, sergei.shtylyov, socketcan
  Cc: Franklin S Cooper Jr
In-Reply-To: <20170818202137.29816-1-fcooper@ti.com>

Add a call to the new generic function that uses a binding to apply
the arbitration rate and/or data rate limit imposed by the physical
transceiver connected to the MCAN peripheral.

Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
---
 drivers/net/can/m_can/m_can.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index f4947a7..f72116e 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1649,6 +1649,8 @@ static int m_can_plat_probe(struct platform_device *pdev)
 
 	devm_can_led_init(dev);
 
+	of_can_transceiver(dev);
+
 	dev_info(&pdev->dev, "%s device registered (irq=%d, version=%d)\n",
 		 KBUILD_MODNAME, dev->irq, priv->version);
 
-- 
2.9.4.dirty

^ permalink raw reply related

* [PATCH v5 3/4] dt-bindings: can: m_can: Document new can transceiver binding
From: Franklin S Cooper Jr @ 2017-08-18 20:21 UTC (permalink / raw)
  To: linux-kernel, devicetree, netdev, linux-can, wg, mkl, robh+dt,
	quentin.schulz, dev.kurt, andrew, sergei.shtylyov, socketcan
  Cc: Franklin S Cooper Jr
In-Reply-To: <20170818202137.29816-1-fcooper@ti.com>

Add information regarding can-transceiver binding. This is especially
important for MCAN since the IP allows CAN FD mode to run significantly
faster than what most transceivers are capable of.

Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
Acked-by: Rob Herring <robh@kernel.org>
---
Remove @ symbol from can-transceiver example

 Documentation/devicetree/bindings/net/can/m_can.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/can/m_can.txt b/Documentation/devicetree/bindings/net/can/m_can.txt
index 9e33177..0bdae2a 100644
--- a/Documentation/devicetree/bindings/net/can/m_can.txt
+++ b/Documentation/devicetree/bindings/net/can/m_can.txt
@@ -43,6 +43,11 @@ Required properties:
 			  Please refer to 2.4.1 Message RAM Configuration in
 			  Bosch M_CAN user manual for details.
 
+Optional Subnode:
+- can-transceiver	: Can-transceiver subnode describing maximum speed
+			  that can be used for CAN/CAN-FD modes. See
+			  Documentation/devicetree/bindings/net/can/can-transceiver.txt
+			  for details.
 Example:
 SoC dtsi:
 m_can1: can@020e8000 {
@@ -64,4 +69,8 @@ Board dts:
 	pinctrl-names = "default";
 	pinctrl-0 = <&pinctrl_m_can1>;
 	status = "enabled";
+
+	can-transceiver {
+		max-bitrate = <5000000>;
+	};
 };
-- 
2.9.4.dirty

^ permalink raw reply related

* [PATCH v5 2/4] dt-bindings: can: can-transceiver: Document new binding
From: Franklin S Cooper Jr @ 2017-08-18 20:21 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-can-u79uwXL29TY76Z2rM5mHXA, wg-5Yr1BZd7O62+XT7JhA+gdA,
	mkl-bIcnvbaLZ9MEGnE8C9+IrQ, robh+dt-DgEjT+Ai2ygdnm+yROfE0A,
	quentin.schulz-wi1+55ScJUtKEb57/3fJTNBPR1lH4CV8,
	dev.kurt-yI9piX4KPfawT/RRk36CISFp6vIno51x, andrew-g2DYL2Zd6BY,
	sergei.shtylyov-M4DtvfQ/ZS1MRgGoP+s0PdBPR1lH4CV8,
	socketcan-fJ+pQTUTwRTk1uMJSBkQmQ
  Cc: Franklin S Cooper Jr
In-Reply-To: <20170818202137.29816-1-fcooper-l0cyMroinI0@public.gmane.org>

Add documentation to describe usage of the new can-transceiver binding.
This new binding is applicable for any CAN device therefore it exists as
its own document.

Signed-off-by: Franklin S Cooper Jr <fcooper-l0cyMroinI0@public.gmane.org>
Acked-by: Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
Version 5 changes:
Remove @ symbol from can-transceiver example

 .../bindings/net/can/can-transceiver.txt           | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/can/can-transceiver.txt

diff --git a/Documentation/devicetree/bindings/net/can/can-transceiver.txt b/Documentation/devicetree/bindings/net/can/can-transceiver.txt
new file mode 100644
index 0000000..0011f53
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/can/can-transceiver.txt
@@ -0,0 +1,24 @@
+Generic CAN transceiver Device Tree binding
+------------------------------
+
+CAN transceiver typically limits the max speed in standard CAN and CAN FD
+modes. Typically these limitations are static and the transceivers themselves
+provide no way to detect this limitation at runtime. For this situation,
+the "can-transceiver" node can be used.
+
+Required Properties:
+ max-bitrate:	a positive non 0 value that determines the max
+		speed that CAN/CAN-FD can run. Any other value
+		will be ignored.
+
+Examples:
+
+Based on Texas Instrument's TCAN1042HGV CAN Transceiver
+
+m_can0 {
+	....
+	can-transceiver {
+		max-bitrate = <5000000>;
+	};
+	...
+};
-- 
2.9.4.dirty

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH v5 1/4] can: dev: Add support for limiting configured bitrate
From: Franklin S Cooper Jr @ 2017-08-18 20:21 UTC (permalink / raw)
  To: linux-kernel, devicetree, netdev, linux-can, wg, mkl, robh+dt,
	quentin.schulz, dev.kurt, andrew, sergei.shtylyov, socketcan
  Cc: Franklin S Cooper Jr
In-Reply-To: <20170818202137.29816-1-fcooper@ti.com>

Various CAN or CAN-FD IP may be able to run at a faster rate than
the transceiver the CAN node is connected to can support. This can lead
to unexpected errors. However, CAN transceivers typically have fixed
limitations and provide no means to discover these limitations at
runtime. Therefore, add support for a can-transceiver node that
can be reused by other CAN peripheral drivers to determine, for both
CAN and CAN-FD, the max bitrate that can be used. If the user
tries to configure CAN with a bitrate beyond this maximum, the
configuration is rejected with an error.

Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
---
Version 5 changes:
Set values for some variables at the very top.
Remove usage of is_bitrate_limited

 drivers/net/can/dev.c   | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h |  4 ++++
 2 files changed, 43 insertions(+)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 365a8cc..d0e5b46 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -27,6 +27,7 @@
 #include <linux/can/skb.h>
 #include <linux/can/netlink.h>
 #include <linux/can/led.h>
+#include <linux/of.h>
 #include <net/rtnetlink.h>
 
 #define MOD_DESC "CAN device driver interface"
@@ -814,6 +815,30 @@ int open_candev(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(open_candev);
 
+#ifdef CONFIG_OF
+/*
+ * Common function that can be used to understand the limitation of
+ * a transceiver when it provides no means to determine these limitations
+ * at runtime.
+ */
+void of_can_transceiver(struct net_device *dev)
+{
+	struct device_node *dn;
+	struct can_priv *priv = netdev_priv(dev);
+	struct device_node *np = dev->dev.parent->of_node;
+	int ret;
+
+	dn = of_get_child_by_name(np, "can-transceiver");
+	if (!dn)
+		return;
+
+	ret = of_property_read_u32(dn, "max-bitrate", &priv->max_bitrate);
+	if ((ret && ret != -EINVAL) || (!ret && !priv->max_bitrate))
+		netdev_warn(dev, "Invalid value for transceiver max bitrate. Ignoring bitrate limit.\n");
+}
+EXPORT_SYMBOL(of_can_transceiver);
+#endif
+
 /*
  * Common close function for cleanup before the device gets closed.
  *
@@ -913,6 +938,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 					priv->bitrate_const_cnt);
 		if (err)
 			return err;
+
+		if (priv->max_bitrate && bt.bitrate > priv->max_bitrate) {
+			netdev_err(dev, "arbitration bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->max_bitrate);
+			return -EINVAL;
+		}
+
 		memcpy(&priv->bittiming, &bt, sizeof(bt));
 
 		if (priv->do_set_bittiming) {
@@ -997,6 +1029,13 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 					priv->data_bitrate_const_cnt);
 		if (err)
 			return err;
+
+		if (priv->max_bitrate && dbt.bitrate > priv->max_bitrate) {
+			netdev_err(dev, "canfd data bitrate surpasses transceiver capabilities of %d bps\n",
+				   priv->max_bitrate);
+			return -EINVAL;
+		}
+
 		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
 
 		if (priv->do_set_data_bittiming) {
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 141b05a..0063c51 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -47,6 +47,8 @@ struct can_priv {
 	unsigned int data_bitrate_const_cnt;
 	struct can_clock clock;
 
+	unsigned int max_bitrate;
+
 	enum can_state state;
 
 	/* CAN controller features - see include/uapi/linux/can/netlink.h */
@@ -165,6 +167,8 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
 void can_free_echo_skb(struct net_device *dev, unsigned int idx);
 
+void of_can_transceiver(struct net_device *dev);
+
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
 struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 				struct canfd_frame **cfd);
-- 
2.9.4.dirty

^ permalink raw reply related

* [PATCH v5 0/4] can: Support transceiver based bit rate limit
From: Franklin S Cooper Jr @ 2017-08-18 20:21 UTC (permalink / raw)
  To: linux-kernel, devicetree, netdev, linux-can, wg, mkl, robh+dt,
	quentin.schulz, dev.kurt, andrew, sergei.shtylyov, socketcan
  Cc: Franklin S Cooper Jr

Add a new generic binding that CAN drivers can use to specify the max
bit rate supported by a transceiver. This is useful since in some instances
the maximum speeds may be limited by the transceiver used. However,
transceivers may not provide a means to determine this limitation at
runtime. Therefore, create a new binding that mimics "fixed-link" and
allows a user to hardcode the max speeds that can be used.

Also add support for this new binding in the MCAN driver.

Note this is an optional subnode so even if a driver adds support for
parsing can-transceiver the user does not have to define it in their
device tree.

Version 5 changes:
Got rid of is_bitrate_limited
Removed @ symbol from can-transceiver binding

Version 4 changes:
Switch from fixed-transceiver to can-transceiver
Drop unit address that snuck back in again.
Indicate that can-transceiver is a subnode and not a property in
documentation

Version 3 changes:
Switch from having two "max bitrates" to one universal bitrate.

Version 2 changes:
Rename function
Define proper variable default
Drop unit address
Move check to changelink function
Reword commit message
Reword documentation

Franklin S Cooper Jr (4):
  can: dev: Add support for limiting configured bitrate
  dt-bindings: can: can-transceiver: Document new binding
  dt-bindings: can: m_can: Document new can transceiver binding
  can: m_can: Add call to of_can_transceiver

 .../bindings/net/can/can-transceiver.txt           | 24 +++++++++++++
 .../devicetree/bindings/net/can/m_can.txt          |  9 +++++
 drivers/net/can/dev.c                              | 39 ++++++++++++++++++++++
 drivers/net/can/m_can/m_can.c                      |  2 ++
 include/linux/can/dev.h                            |  4 +++
 5 files changed, 78 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/can/can-transceiver.txt

-- 
2.9.4.dirty

^ permalink raw reply

* Re: [PATCH RESEND 0/2] enable hires timer to timeout datagram socket
From: Richard Cochran @ 2017-08-18 20:18 UTC (permalink / raw)
  To: Vallish Vaidyeshwara
  Cc: davem, shuah, netdev, linux-kernel, eduval, anchalag, tglx
In-Reply-To: <1503081850-10671-1-git-send-email-vallish@amazon.com>

On Fri, Aug 18, 2017 at 06:44:08PM +0000, Vallish Vaidyeshwara wrote:
> There has been a behavior change in 4.9 kernel with refactoring of Kernel
> timer wheel in 4.8. We have a use case wherein our datagram socket
> application is sensitive to socket timeout including long timeouts.
> 
> One of the test runs with a timeout value of 180 seconds timed out at
> 190 seconds.

So the whole premise of the new timer wheel is that long timeouts need
not be very accurate.

> Patch 1: Has core code change of enabling hires timer to timeout datagram
> 	 socket on AF_UNIX and AF_INET domain

Using hrtimers will hurt performance for most applications.  Can you
please explain your use case and why is it so important?

Thanks,
Richard

^ permalink raw reply

* [PATCH net-next] liquidio: fix Smatch error
From: Felix Manlunas @ 2017-08-18 20:07 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	intiyaz.basha

From: Intiyaz Basha <intiyaz.basha@cavium.com>

Fix Smatch error by not dereferencing iq pointer if it's NULL.

See http://marc.info/?l=kernel-janitors&m=150296723301129&w=2

Also, remove unnecessary parentheses.

Fixes: d314ac222829 ("liquidio: moved liquidio_napi_poll to lio_core.c")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index d4f0646..0e7896c 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -668,8 +668,8 @@ static int liquidio_napi_poll(struct napi_struct *napi, int budget)
 
 #define MAX_REG_CNT  2000000U
 	/* force enable interrupt if reg cnts are high to avoid wraparound */
-	if (((work_done < budget) && (tx_done)) ||
-	    (iq->pkt_in_done >= MAX_REG_CNT) ||
+	if ((work_done < budget && tx_done) ||
+	    (iq && iq->pkt_in_done >= MAX_REG_CNT) ||
 	    (droq->pkt_count >= MAX_REG_CNT)) {
 		tx_done = 1;
 		napi_complete_done(napi, work_done);
-- 
1.8.3.1

^ permalink raw reply related

* Re: [net-next PATCH] ipv6: fix false-postive maybe-uninitialized warning
From: Arnd Bergmann @ 2017-08-18 19:46 UTC (permalink / raw)
  To: David Miller
  Cc: kuznet, yoshfuji, Florian Westphal, dsahern, kafai, weiwan,
	xiyou.wangcong, Networking, Linux Kernel Mailing List
In-Reply-To: <20170818.104940.305624900632112023.davem@davemloft.net>

On Fri, Aug 18, 2017 at 7:49 PM, David Miller <davem@davemloft.net> wrote:
> From: Arnd Bergmann <arnd@arndb.de>
> Date: Fri, 18 Aug 2017 13:34:22 +0200

>>
>> This kind of warning involving an unlock between variable initialization
>> and use is relatively frequent for false-positives. I should try to
>> seek clarification from the gcc developers on whether this can be
>> improved.
>
> This will have to do for now I suppose.
>
> I guess the issue is that if the local variable ever sits on the stack
> then the memory barriers in the locks block the full dataflow
> analysis.
>
> But this makes no sense from a dataflow perspective.  Even if the
> local variable has a stack slot, there is no "escapability" of that
> memory address to foreign modifications.
>
> If I had a nickel for every uninitialized variable warning we had to
> work around....

Since this pattern has come up so often, I spent most of my working
day today on a reduced testcase, and ended up with this surprising
snippet:

int f(void);
static inline void rcu_read_unlock(void)
{
        static _Bool __warned;
        if (f() && !__warned && !f()) {
                __warned = 1;
        }
}
int inet6_rtm_getroute(void)
{
        int dst;
        int fibmatch = f();

        if (!fibmatch)
                dst = f();
        rcu_read_unlock();
        if (fibmatch)
                dst = 0;

        return dst;
}

So at least in this particular case, the culprit is not actually
a memory barrier, but RCU_LOCKDEP_WARN(). A related
problem is __branch_check__()/__trace_if().

While the maybe-uninitialized warnings are unreliable by
definition, I think that case really should be understood by gcc.

I looked through the gcc bug database which has countless
entries but doesn't seem to have this one yet, so I opened
a new bug:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81897

Unfortunately the basic behavior shows up in gcc-4.7 already,
so it has no chance of getting fixed on older compilers.

        Arnd

^ permalink raw reply

* [RFC PATCH] can: m_can: Support higher speed CAN-FD bitrates
From: Franklin S Cooper Jr @ 2017-08-18 19:39 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan, quentin.schulz, edumazet,
	linux-can, netdev, linux-kernel
  Cc: Franklin S Cooper Jr

During testing, transmitting using CAN-FD at high bitrates (4 Mbps) only
resulted in errors. Scoping the signals, I noticed that only a single bit
was being transmitted and, with a bit more investigation, realized the actual
MCAN IP would go back to initialization mode automatically.

It appears this issue is due to the MCAN needing to use the Transmitter
Delay Compensation Mode as defined in the MCAN User's Guide. When this
mode is used the User's Guide indicates that the Transmitter Delay
Compensation Offset register should be set. The document mentions that this
register should be set to (1/dbitrate)/2*(Func Clk Freq).

Additionally, CAN-CIA's "Bit Time Requirements for CAN FD" document indicates
that this TDC mode is only needed for data bit rates above 2.5 Mbps.
Therefore, only enable this mode and only set TDCO when the data bit rate
is above 2.5 Mbps.

Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
---
I'm pretty surprised that this hasn't been implemented already since
the primary purpose of CAN-FD is to go beyond 1 Mbps and the MCAN IP
supports up to 10 Mbps.

So it would be nice to get comments from users of this driver to understand
if they have been able to use CAN-FD beyond 2.5 Mbps without this patch.
If they haven't, what did they do to get around it if they needed higher
speeds?

Meanwhile, I plan on testing this using a more "realistic" CAN bus to ensure
everything still works at 5 Mbps, which is the max speed of my CAN
transceiver.

 drivers/net/can/m_can/m_can.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index f4947a7..720e073 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -126,6 +126,12 @@ enum m_can_mram_cfg {
 #define DBTP_DSJW_SHIFT		0
 #define DBTP_DSJW_MASK		(0xf << DBTP_DSJW_SHIFT)

+/* Transmitter Delay Compensation Register (TDCR) */
+#define TDCR_TDCO_SHIFT		8
+#define TDCR_TDCO_MASK		(0x7F << TDCR_TDCO_SHIFT)
+#define TDCR_TDCF_SHIFT		0
+#define TDCR_TDCF_MASK		(0x7F << TDCR_TDCO_SHIFT)
+
 /* Test Register (TEST) */
 #define TEST_LBCK		BIT(4)

@@ -977,6 +983,8 @@ static int m_can_set_bittiming(struct net_device *dev)
 	const struct can_bittiming *dbt = &priv->can.data_bittiming;
 	u16 brp, sjw, tseg1, tseg2;
 	u32 reg_btp;
+	u32 enable_tdc = 0;
+	u32 tdco;

 	brp = bt->brp - 1;
 	sjw = bt->sjw - 1;
@@ -991,9 +999,23 @@ static int m_can_set_bittiming(struct net_device *dev)
 		sjw = dbt->sjw - 1;
 		tseg1 = dbt->prop_seg + dbt->phase_seg1 - 1;
 		tseg2 = dbt->phase_seg2 - 1;
+
+		/* TDC is only needed for bitrates beyond 2.5 MBit/s
+		 * Specified in the "Bit Time Requirements for CAN FD" document
+		 */
+		if (dbt->bitrate > 2500000) {
+			enable_tdc = DBTP_TDC;
+			/* Equation based on Bosch's M_CAN User Manual's
+			 * Transmitter Delay Compensation Section
+			 */
+			tdco = priv->can.clock.freq / (dbt->bitrate * 2);
+			m_can_write(priv, M_CAN_TDCR, tdco << TDCR_TDCO_SHIFT);
+		}
+
 		reg_btp = (brp << DBTP_DBRP_SHIFT) | (sjw << DBTP_DSJW_SHIFT) |
 			(tseg1 << DBTP_DTSEG1_SHIFT) |
-			(tseg2 << DBTP_DTSEG2_SHIFT);
+			(tseg2 << DBTP_DTSEG2_SHIFT) | enable_tdc;
+
 		m_can_write(priv, M_CAN_DBTP, reg_btp);
 	}

-- 
2.9.4.dirty

^ permalink raw reply related

* Re: [iproute PATCH 21/51] lib/libnetlink: Don't pass NULL parameter to memcpy()
From: Lance Richardson @ 2017-08-18 19:13 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Phil Sutter
In-Reply-To: <20170815164255.GA10864@orbyte.nwl.cc>

> From: "Phil Sutter" <phil@nwl.cc>
> To: "Stephen Hemminger" <stephen@networkplumber.org>
> Cc: netdev@vger.kernel.org
> Sent: Tuesday, August 15, 2017 12:42:55 PM
> Subject: Re: [iproute PATCH 21/51] lib/libnetlink: Don't pass NULL parameter to memcpy()
> 
> On Tue, Aug 15, 2017 at 08:15:55AM -0700, Stephen Hemminger wrote:
> > On Sat, 12 Aug 2017 14:04:40 +0200
> > Phil Sutter <phil@nwl.cc> wrote:
> > 
> > > Both addattr_l() and rta_addattr_l() may be called with NULL data
> > > pointer and 0 alen parameters. Avoid calling memcpy() in that case.
> > > 
> > > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > 
> > What are you fixing. memcpy(dest, NULL, 0) should be harmless NOP
> 
> Yes, if that turns into a NOP this patch is not needed.
> 
> Thanks, Phil
> 

It is a NOP in this case, but it is also "undefined behavior" and can lead
to the compiler assuming that dest != NULL, which would be problematic
if dest were dereferenced later in the code (it isn't in this case, but
might be in general).

A small example with current gcc:

foo.c:
    #include <stdio.h>

    extern void foo(char *, size_t);

    /* Driver for the demonstration: calls foo() once with a real buffer and
     * once with a NULL pointer and a length of zero.
     */
    int main(int argc, char **argv)
    {
            char x[128];

            foo(x, sizeof x);
            /* NULL with length 0: foo() hands this straight to memset() */
            foo(NULL, 0);

            return 0;
    }

bar.c:
    #include <stdio.h>
    #include <string.h>

    /* Zeroes len bytes at ptr, then prints ptr if it tests as non-null. */
    void foo(char *ptr, size_t len)
    {
            /* Passing a null ptr to memset() is undefined behavior even when
             * len is 0, so the compiler may assume ptr != NULL from here on.
             */
            memset(ptr, 0, len);

            /* With -O2, gcc can drop this test based on that assumption. */
            if (ptr)
                    printf("ptr is non-null: %p\n", ptr);
    }

Compile the code:

    $ gcc -o foobar -O2 foo.c bar.c

Execute it (note second line of output, which might be surprising):

    $ ./foobar
    ptr is non-null: 0x7ffdc47daef0
    ptr is non-null: (nil)


Regards,

    Lance Richardson

^ permalink raw reply

* Re: [PATCH net-next v4] openvswitch: enable NSH support
From: Eric Garver @ 2017-08-18 19:09 UTC (permalink / raw)
  To: Yi Yang; +Cc: netdev, dev, blp, jbenc, jan.scheurich
In-Reply-To: <1503041071-68753-1-git-send-email-yi.y.yang@intel.com>

On Fri, Aug 18, 2017 at 03:24:31PM +0800, Yi Yang wrote:
> v3->v4
>  - Add new NSH match field ttl
>  - Update NSH header to the latest format
>    which will be final format and won't change
>    per its author's confirmation.
>  - Fix comments for v3.

Hi Yi,
Only a few comments below since Jiri already supplied lots of feedback.

> 
> v2->v3
>  - Change OVS_KEY_ATTR_NSH to nested key to handle
>    length-fixed attributes and length-variable
>    attribute more flexibly.
>  - Remove struct ovs_action_push_nsh completely
>  - Add code to handle nested attribute for SET_MASKED
>  - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
>    to transfer NSH header data.
>  - Fix comments and coding style issues by Jiri and Eric
> 
> v1->v2
>  - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
>  - Dynamically allocate struct ovs_action_push_nsh for
>    length-variable metadata.
> 
> OVS master and 2.8 branch has merged NSH userspace
> patch series, this patch is to enable NSH support
> in kernel data path in order that OVS can support
> NSH in 2.8 release in compat mode by porting this.
> 
> Signed-off-by: Yi Yang <yi.y.yang@intel.com>
> ---
[..]
> @@ -1210,6 +1373,20 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  		case OVS_ACTION_ATTR_POP_ETH:
>  			err = pop_eth(skb, key);
>  			break;
> +
> +		case OVS_ACTION_ATTR_PUSH_NSH: {
> +			u8 buffer[256];

Use NSH_M_TYPE2_MAX_LEN

> +			struct nsh_hdr *nsh_hdr = (struct nsh_hdr *)buffer;
> +			const struct nsh_hdr *nsh_src = nsh_hdr;
> +
> +			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr);
> +			err = push_nsh(skb, key, nsh_src);
> +			break;
> +		}
> +
> +		case OVS_ACTION_ATTR_POP_NSH:
> +			err = pop_nsh(skb, key);
> +			break;
>  		}
>  
>  		if (unlikely(err)) {
[..]
> +int nsh_key_from_nlattr(const struct nlattr *attr,
> +			struct ovs_key_nsh *nsh)
> +{
> +	struct nlattr *a;
> +	int rem;
> +	bool has_md1 = false;
> +	bool has_md2 = false;
> +
> +	nla_for_each_nested(a, attr, rem) {
> +		int type = nla_type(a);
> +
> +		if (type > OVS_NSH_KEY_ATTR_MAX) {
> +			OVS_NLERR(1, "nsh attr %d is out of range max %d",
> +				  type, OVS_NSH_KEY_ATTR_MAX);
> +			return -EINVAL;
> +		}
> +
> +		if (!check_attr_len(nla_len(a),
> +				    ovs_nsh_key_attr_lens[type].len)) {
> +			OVS_NLERR(
> +			    1,
> +			    "nsh attr %d has unexpected len %d expected %d",
> +			    type,
> +			    nla_len(a),
> +			    ovs_nsh_key_attr_lens[type].len
> +			);
> +			return -EINVAL;
> +		}
> +
> +		switch (type) {
> +		case OVS_NSH_KEY_ATTR_BASE: {
> +			const struct ovs_nsh_key_base *base =
> +				(struct ovs_nsh_key_base *)nla_data(a);
> +
> +			memcpy(nsh, base, sizeof(*base));
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD1: {
> +			const struct ovs_nsh_key_md1 *md1 =
> +				(struct ovs_nsh_key_md1 *)nla_data(a);
> +
> +			has_md1 = true;
> +			memcpy(nsh->context, md1->context, sizeof(*md1));
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD2:
> +			/* Not supported yet */

return -ENOTSUPP if it's not supported.

> +			has_md2 = true;
> +			break;
> +		default:
> +			OVS_NLERR(1, "Unknown nsh attribute %d",
> +				  type);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (rem > 0) {
> +		OVS_NLERR(1, "nsh attribute has %d unknown bytes.", rem);
> +		return -EINVAL;
> +	}
> +
> +	if ((has_md1 && nsh->mdtype != NSH_M_TYPE1) ||
> +	    (has_md2 && nsh->mdtype != NSH_M_TYPE2)) {
> +		OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
> +			  nsh->mdtype);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static int nsh_key_put_from_nlattr(const struct nlattr *attr,
> +				   struct sw_flow_match *match, bool is_mask,
> +				   bool log)
> +{
> +	struct nlattr *a;
> +	int rem;
> +	bool has_md1 = false;
> +	bool has_md2 = false;
> +	u8 mdtype = 0;
> +
> +	nla_for_each_nested(a, attr, rem) {
> +		int type = nla_type(a);
> +		int i;
> +
> +		if (type > OVS_NSH_KEY_ATTR_MAX) {
> +			OVS_NLERR(log, "nsh attr %d is out of range max %d",
> +				  type, OVS_NSH_KEY_ATTR_MAX);
> +			return -EINVAL;
> +		}
> +
> +		if (!check_attr_len(nla_len(a),
> +				    ovs_nsh_key_attr_lens[type].len)) {
> +			OVS_NLERR(
> +			    log,
> +			    "nsh attr %d has unexpected len %d expected %d",
> +			    type,
> +			    nla_len(a),
> +			    ovs_nsh_key_attr_lens[type].len
> +			);
> +			return -EINVAL;
> +		}
> +
> +		switch (type) {
> +		case OVS_NSH_KEY_ATTR_BASE: {
> +			const struct ovs_nsh_key_base *base =
> +				(struct ovs_nsh_key_base *)nla_data(a);
> +
> +			mdtype = base->mdtype;
> +			SW_FLOW_KEY_PUT(match, nsh.flags,
> +					base->flags, is_mask);
> +			SW_FLOW_KEY_PUT(match, nsh.ttl,
> +					base->ttl, is_mask);
> +			SW_FLOW_KEY_PUT(match, nsh.mdtype,
> +					base->mdtype, is_mask);
> +			SW_FLOW_KEY_PUT(match, nsh.np,
> +					base->np, is_mask);
> +			SW_FLOW_KEY_PUT(match, nsh.path_hdr,
> +					base->path_hdr, is_mask);
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD1: {
> +			const struct ovs_nsh_key_md1 *md1 =
> +				(struct ovs_nsh_key_md1 *)nla_data(a);
> +
> +			has_md1 = true;
> +			for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++)
> +				SW_FLOW_KEY_PUT(match, nsh.context[i],
> +						md1->context[i], is_mask);
> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD2:
> +			/* Not supported yet */

return -ENOTSUPP if it's not supported.

> +			has_md2 = true;
> +			break;
> +		default:
> +			OVS_NLERR(log, "Unknown nsh attribute %d",
> +				  type);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	if (rem > 0) {
> +		OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem);
> +		return -EINVAL;
> +	}
> +
> +	if (!is_mask) {
> +		if ((has_md1 && mdtype != NSH_M_TYPE1) ||
> +		    (has_md2 && mdtype != NSH_M_TYPE2)) {
> +			OVS_NLERR(1, "nsh attribute has unmatched MD type %d.",
> +				  mdtype);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
[..]
> @@ -2636,6 +2984,17 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
>  			mac_proto = MAC_PROTO_ETHERNET;
>  			break;
>  
> +		case OVS_ACTION_ATTR_PUSH_NSH:

You need to do some validation here, especially the metadata lengths.
Relying on action_lens is not enough because it's variable.

> +			mac_proto = MAC_PROTO_NONE;
> +			break;
> +
> +		case OVS_ACTION_ATTR_POP_NSH:
> +			if (key->nsh.np == NSH_P_ETHERNET)
> +				mac_proto = MAC_PROTO_ETHERNET;
> +			else
> +				mac_proto = MAC_PROTO_NONE;
> +			break;
> +
>  		default:
>  			OVS_NLERR(log, "Unknown Action type %d", type);
>  			return -EINVAL;
[..]

^ permalink raw reply

* [PATCH v3 net-next] ipv4: convert dst_metrics.refcnt from atomic_t to refcount_t
From: Eric Dumazet @ 2017-08-18 19:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Cong Wang
In-Reply-To: <CAM_iQpWbxs6aa-axQrwN5pipaCDX3ws_08LaQON8p4WfrJ7SDg@mail.gmail.com>

From: Eric Dumazet <edumazet@google.com>

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
v3: added the include <linux/refcount.h> to be safe (Cong Wang)
v2: fix a missing change in net/ipv4/fib_semantics.c

 include/net/dst.h        |    3 ++-
 net/core/dst.c           |    6 +++---
 net/ipv4/fib_semantics.c |    4 ++--
 net/ipv4/route.c         |    4 ++--
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index f73611ec401754d4f52b5310a24da53566dafce6..93568bd0a3520bb7402f04d90cf04ac99c81cfbe 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -14,6 +14,7 @@
 #include <linux/rcupdate.h>
 #include <linux/bug.h>
 #include <linux/jiffies.h>
+#include <linux/refcount.h>
 #include <net/neighbour.h>
 #include <asm/processor.h>
 
@@ -107,7 +108,7 @@ struct dst_entry {
 
 struct dst_metrics {
 	u32		metrics[RTAX_MAX];
-	atomic_t	refcnt;
+	refcount_t	refcnt;
 };
 extern const struct dst_metrics dst_default_metrics;
 
diff --git a/net/core/dst.c b/net/core/dst.c
index 00aa972ad1a1a451c24f3f8211243ad35c19433a..d6ead757c25895da01eb61bc9636c7e9b3cdfb3e 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -55,7 +55,7 @@ const struct dst_metrics dst_default_metrics = {
 	 * We really want to avoid false sharing on this variable, and catch
 	 * any writes on it.
 	 */
-	.refcnt = ATOMIC_INIT(1),
+	.refcnt = REFCOUNT_INIT(1),
 };
 
 void dst_init(struct dst_entry *dst, struct dst_ops *ops,
@@ -213,7 +213,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
 		struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
 		unsigned long prev, new;
 
-		atomic_set(&p->refcnt, 1);
+		refcount_set(&p->refcnt, 1);
 		memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));
 
 		new = (unsigned long) p;
@@ -225,7 +225,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
 			if (prev & DST_METRICS_READ_ONLY)
 				p = NULL;
 		} else if (prev & DST_METRICS_REFCOUNTED) {
-			if (atomic_dec_and_test(&old_p->refcnt))
+			if (refcount_dec_and_test(&old_p->refcnt))
 				kfree(old_p);
 		}
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d521caf57385fa05f76036708057b95052330cb1..394d800db50c77c21b65e14569eb4d8b5246406f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -220,7 +220,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	} endfor_nexthops(fi);
 
 	m = fi->fib_metrics;
-	if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
+	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
 		kfree(m);
 	kfree(fi);
 }
@@ -1090,7 +1090,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			kfree(fi);
 			return ERR_PTR(err);
 		}
-		atomic_set(&fi->fib_metrics->refcnt, 1);
+		refcount_set(&fi->fib_metrics->refcnt, 1);
 	} else {
 		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d400c05431063fc7bdd15b83ab540acc86decb3d..872b4cb136d3fa0cda403836cc83a156a65310a3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1398,7 +1398,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
 		kfree(p);
 
 	if (!list_empty(&rt->rt_uncached)) {
@@ -1456,7 +1456,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
 		if (fi->fib_metrics != &dst_default_metrics) {
 			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
-			atomic_inc(&fi->fib_metrics->refcnt);
+			refcount_inc(&fi->fib_metrics->refcnt);
 		}
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;

^ permalink raw reply related

* [PATCH net v3] datagram: When peeking datagrams with offset < 0 don't skip empty skbs
From: Matthew Dawson @ 2017-08-18 19:04 UTC (permalink / raw)
  To: netdev; +Cc: Matthew Dawson, Macieira, Thiago, willemdebruijn.kernel,
	Paolo Abeni

Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header.  However, when the offset is 0 and the next skb is
of length 0, it is only returned once.  The behaviour can be seen with
the following python script:

from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));

Where the expected output should be the empty string twice.

Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.

Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
greater than 0, and _off is greater than or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.

Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.

V2:
 - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
 - Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0

V3:
 - Marked new branch in __skb_try_recv_from_queue as unlikely.

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Acked-by: Willem de Bruijn <willemb@google.com>
---
 include/net/sock.h  |  4 +---
 net/core/datagram.c | 12 +++++++++---
 net/ipv4/udp.c      |  3 ++-
 net/ipv6/udp.c      |  3 ++-
 net/unix/af_unix.c  |  5 +----
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..aeeec62992ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val);
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
-		s32 off = READ_ONCE(sk->sk_peek_off);
-		if (off >= 0)
-			return off;
+		return READ_ONCE(sk->sk_peek_off);
 	}
 
 	return 0;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..a21ca8dee5ea 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 					  int *peeked, int *off, int *err,
 					  struct sk_buff **last)
 {
+	bool peek_at_off = false;
 	struct sk_buff *skb;
-	int _off = *off;
+	int _off = 0;
+
+	if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+		peek_at_off = true;
+		_off = *off;
+	}
 
 	*last = queue->prev;
 	skb_queue_walk(queue, skb) {
 		if (flags & MSG_PEEK) {
-			if (_off >= skb->len && (skb->len || _off ||
-						 skb->peeked)) {
+			if (peek_at_off && _off >= skb->len &&
+			    (_off || skb->peeked)) {
 				_off -= skb->len;
 				continue;
 			}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..cd1d044a7fa5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..20039c8501eb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..be8982b4f8c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	 */
 	mutex_lock(&u->iolock);
 
-	if (flags & MSG_PEEK)
-		skip = sk_peek_offset(sk, flags);
-	else
-		skip = 0;
+	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
 		int chunk;
-- 
2.13.0

^ permalink raw reply related

* [PATCH RESEND 2/2] selftests/net: add test to verify datagram socket timeout
From: Vallish Vaidyeshwara @ 2017-08-18 18:44 UTC (permalink / raw)
  To: davem, shuah, netdev, linux-kernel; +Cc: eduval, anchalag, vallish
In-Reply-To: <1503081850-10671-1-git-send-email-vallish@amazon.com>

AF_UNIX and AF_INET datagram sockets use high resolution timer to time
SO_RCVTIMEO value used with setsockopt(2). This test checks for the
accuracy of kernel notifying these sockets timeout to application. Test
program has code to check AF_UNIX socket, however the kernel function used
to timeout AF_INET socket is the same kernel function used by AF_UNIX as
well which is __skb_wait_for_more_packets().

Reported-by: Manjula Peiris <thelgep@amazon.com>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Reviewed-by: Anchal Agarwal <anchalag@amazon.com>
Signed-off-by: Vallish Vaidyeshwara <vallish@amazon.com>
---
 tools/testing/selftests/net/Makefile               |   3 +-
 .../testing/selftests/net/datagram_sock_timeout.c  | 119 +++++++++++++++++++++
 .../selftests/net/run_datagram_sock_timeout.sh     |  12 +++
 3 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/datagram_sock_timeout.c
 create mode 100755 tools/testing/selftests/net/run_datagram_sock_timeout.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index f6c9dbf..eb5a8c7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -3,11 +3,12 @@
 CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/
 
-TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh
+TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh run_datagram_sock_timeout.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket
 TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_FILES += reuseport_dualstack
+TEST_GEN_FILES += datagram_sock_timeout
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/net/datagram_sock_timeout.c b/tools/testing/selftests/net/datagram_sock_timeout.c
new file mode 100644
index 0000000..2549be5
--- /dev/null
+++ b/tools/testing/selftests/net/datagram_sock_timeout.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2017 Amazon.com, Inc.
+ * Author: Manjula Peiris <thelgep@amazon.com>
+ *         Vallish Vaidyeshwara <vallish@amazon.com>
+ *
+ * selftests/net: test to verify datagram socket timeout
+ *
+ * AF_UNIX and AF_INET datagram sockets use high resolution timer to time
+ * SO_RCVTIMEO value used with setsockopt(2). This test checks for the accuracy
+ * of kernel notifying these sockets timeout to application. Test program has
+ * code to check AF_UNIX socket, however the kernel function used to timeout
+ * AF_INET socket is the same kernel function used by AF_UNIX as well which is
+ * __skb_wait_for_more_packets().
+ *
+ * License (GPLv2):
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.
+ */
+
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <err.h>
+#include <errno.h>
+#include <sys/un.h>
+#include <time.h>
+#include <assert.h>
+
+#define BUF_SIZE 128
+#define KB 1024
+#define NUM_FD 2
+
+static int set_socket_timeout(int sockfd, unsigned int ms)
+{
+	int ret;
+	struct timeval timeout;
+	socklen_t cb = sizeof(timeout);
+
+	timeout.tv_sec = ms / 1000;
+	timeout.tv_usec = (ms % 1000) * 1000;
+	ret = setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, cb);
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	char err[BUF_SIZE];
+	int ret;
+	int fds[NUM_FD];
+	struct msghdr message;
+	char buffer[KB];
+	struct sockaddr_storage src_addr;
+	struct iovec iov[1];
+	time_t start, end;
+	unsigned int timeout;
+
+	iov[0].iov_base = buffer;
+	iov[0].iov_len = sizeof(buffer);
+	message.msg_name = &src_addr;
+	message.msg_namelen = sizeof(src_addr);
+	message.msg_iov = iov;
+	message.msg_iovlen = 1;
+	message.msg_control = 0;
+	message.msg_controllen = 0;
+
+	if (argc != 2) {
+		fprintf(stderr,
+			"datagram_sock_timeout failed: no timeout specified\n");
+		return -1;
+	}
+	timeout = (unsigned int)(atoi(argv[1]));
+
+	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds) != 0) {
+		strerror_r(errno, err, BUF_SIZE);
+		fprintf(stderr, "socketpair() call failed with %s\n", err);
+		return -1;
+	}
+
+	if (set_socket_timeout(fds[0], timeout) != 0) {
+		strerror_r(errno, err, BUF_SIZE);
+		fprintf(stderr, "setsockopt() call failed with %s\n", err);
+		return -1;
+	}
+
+	start = time(NULL);
+	ret = (int)recvmsg(fds[0], &message, 0);
+	end = time(NULL);
+	if (!(ret == -1 && errno == 11)) {
+		fprintf(stderr,
+			"datagram_sock_timeout failed: test was interrupted\n");
+		return -1;
+	}
+
+	if (((int)(end - start)) != (timeout / 1000)) {
+		fprintf(stderr,
+			"datagram_sock_timeout failed: took %.2f seconds\n",
+			(double)(end - start));
+		return -1;
+	}
+
+	close(fds[0]);
+	close(fds[1]);
+
+	fprintf(stderr, "datagram_sock_timeout passed\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/run_datagram_sock_timeout.sh b/tools/testing/selftests/net/run_datagram_sock_timeout.sh
new file mode 100755
index 0000000..d5f4f82
--- /dev/null
+++ b/tools/testing/selftests/net/run_datagram_sock_timeout.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Runs datagram socket timeout test
+
+echo "--------------------"
+echo "running run_datagram_sock_timeout test"
+echo "--------------------"
+./datagram_sock_timeout 180000
+if [ $? -ne 0 ]; then
+	echo "[FAIL]"
+else
+	echo "[PASS]"
+fi
-- 
2.7.3.AMZN

^ permalink raw reply related

* [PATCH RESEND 1/2] net: enable high resolution timer mode to timeout datagram sockets
From: Vallish Vaidyeshwara @ 2017-08-18 18:44 UTC (permalink / raw)
  To: davem, shuah, netdev, linux-kernel; +Cc: eduval, anchalag, vallish
In-Reply-To: <1503081850-10671-1-git-send-email-vallish@amazon.com>

Enable high resolution timer mode to time SO_RCVTIMEO value used with
setsockopt(2) on AF_UNIX and AF_INET datagram sockets. By default,
SO_RCVTIMEO uses low resolution timer which is good for most of socket
use cases.

Background:
Kernel timer wheel was refactored in 4.8 to avoid drawbacks with previous
implementation:
https://lwn.net/Articles/691064/
Unlike the previous "kernel timer wheel" implementation in 4.4 which aimed
for accuracy by paying cost for cascading tracked timers at the boundary of
256 jiffies, the new timer wheel implementation gets rid of cascading
latency by paying a price for being less accurate for far off timers.

Use Case:
New implementation is good for most of socket use cases. However we have a
use case where our application is sensitive to socket timeout including
long timeouts.  Please refer to test code as part of this patch series.
One of the test runs with a timeout value of 180 seconds timed out at
190 seconds.
[root@]# ./datagram_sock_timeout 180000
datagram_sock_timeout failed: took 190.00 seconds
[root@]#
The same program when run on a 4.4 kernel would timeout more accurately and
the kernel added slack was not noticeable to user application.

Interesting text:
a) Standards for setsockopt:
http://pubs.opengroup.org/onlinepubs/009695399/functions/setsockopt.html
<snip>
SO_RCVTIMEO
Sets the timeout value that specifies the maximum amount of time an input
function waits until it completes. It accepts a timeval structure with the
number of seconds and microseconds specifying the limit on how long to wait
for an input operation to complete. If a receive operation has blocked for
this much time without receiving additional data, it shall return with a
partial count or errno set to [EAGAIN] or [EWOULDBLOCK] if no data is
received. The default for this option is zero, which indicates that a
receive operation shall not time out. This option takes a timeval
structure. Note that not all implementations allow this option to be set.
<end snip>
This only talks about the maximum time and the current behavior indeed
follows this standard. System call does not return before the time
specified and it does return EAGAIN.
b) Man page for SETSOCKOPT(3P):
<snip>
The  option_name  argument  specifies  a  single  option to set. It can be
one of the socket-level options defined in <sys_socket.h> and described in
Section 2.10.16, Use of Options.  If option_name is equal to SO_RCVTIMEO
or SO_SNDTIMEO and the implementation supports setting the option, it is
unspecified whether the struct timeval  pointed  to by  option_value  is
stored  as  provided by this function or is rounded up to align with the
resolution of the clock being used. If setsockopt() is called with
option_name equal to SO_ACCEPTCONN, SO_ERROR, or SO_TYPE, the behavior is
unspecified.
<end snip>
Behavior is unspecified.
c) Man page for SELECT(2):
<snip>
Note  that  the  timeout  interval  will  be  rounded up to the system
clock granularity, and kernel scheduling delays mean that the blocking
interval may overrun by a small amount.  If both fields of the timeval
structure are zero, then select() returns immediately.  (This is useful
for polling.)  If timeout is NULL (no timeout),  select()  can block
indefinitely.
<end snip>
The select system call guarantees the timeout interval and in turn uses a
highres timer.

Reported-by: Manjula Peiris <thelgep@amazon.com>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Reviewed-by: Anchal Agarwal <anchalag@amazon.com>
Signed-off-by: Vallish Vaidyeshwara <vallish@amazon.com>
---
 net/core/datagram.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647b..c89a104 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -80,6 +80,7 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i
 		return 0;
 	return autoremove_wake_function(wait, mode, sync, key);
 }
+
 /*
  * Wait for the last received packet to be different from skb
  */
@@ -87,6 +88,8 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb)
 {
 	int error;
+	ktime_t expires;
+	unsigned long pre_sched_time;
 	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

 	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -116,7 +119,13 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 		goto interrupted;

 	error = 0;
-	*timeo_p = schedule_timeout(*timeo_p);
+	/* Wait using highres timer */
+	expires = ktime_add_ns(ktime_get(), jiffies_to_nsecs(*timeo_p));
+	pre_sched_time = jiffies;
+	if (schedule_hrtimeout(&expires, HRTIMER_MODE_ABS))
+		*timeo_p = jiffies - pre_sched_time;
+	else
+		*timeo_p = 0;
 out:
 	finish_wait(sk_sleep(sk), &wait);
 	return error;
-- 
2.7.3.AMZN

^ permalink raw reply related

* [PATCH RESEND 0/2] enable hires timer to timeout datagram socket
From: Vallish Vaidyeshwara @ 2017-08-18 18:44 UTC (permalink / raw)
  To: davem, shuah, netdev, linux-kernel; +Cc: eduval, anchalag, vallish

Hello Dave,

Resending the patch series to include netdev mailing list with a
cover letter.

I am submitting a 2-patch series to enable hires timer to timeout
datagram sockets (AF_UNIX & AF_INET domain) and test code to test
timeout accuracy on these sockets.

There has been a behavior change in 4.9 kernel with refactoring of Kernel
timer wheel in 4.8. We have a use case wherein our datagram socket
application is sensitive to socket timeout including long timeouts.

One of the test runs with a timeout value of 180 seconds timed out at
190 seconds.
[root@]# ./datagram_sock_timeout 180000
datagram_sock_timeout failed: took 190.00 seconds
[root@]#
The same program when run on a 4.4 kernel would timeout more accurately and
the kernel added slack was not noticeable to user application.

Patch 1: Has core code change of enabling hires timer to timeout datagram
	 socket on AF_UNIX and AF_INET domain
Patch 2: Test code to report regression in timeout behavior related to
	 patch 1

Vallish Vaidyeshwara (2):
  net: enable high resolution timer mode to timeout datagram sockets
  selftests/net: add test to verify datagram socket timeout

 net/core/datagram.c                                |  11 +-
 tools/testing/selftests/net/Makefile               |   3 +-
 .../testing/selftests/net/datagram_sock_timeout.c  | 119 +++++++++++++++++++++
 .../selftests/net/run_datagram_sock_timeout.sh     |  12 +++
 4 files changed, 143 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/datagram_sock_timeout.c
 create mode 100755 tools/testing/selftests/net/run_datagram_sock_timeout.sh

-- 
2.7.3.AMZN

^ permalink raw reply

* Re: Does the kernel have a function to parse a text IPv6 address?
From: Cong Wang @ 2017-08-18 18:37 UTC (permalink / raw)
  To: David Howells; +Cc: Linux Kernel Network Developers
In-Reply-To: <30054.1503081121@warthog.procyon.org.uk>

On Fri, Aug 18, 2017 at 11:32 AM, David Howells <dhowells@redhat.com> wrote:
> Does the kernel have a function to parse a text IPv6 address of the form
> "x:y:..::z" and put it into a struct sockaddr_in6?
>

in6_pton() in net/core/utils.c.

^ permalink raw reply

* [PATCH net-next 2/2] liquidio: make VF driver notify NIC firmware of MTU change
From: Felix Manlunas @ 2017-08-18 18:35 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	veerasenareddy.burru
In-Reply-To: <20170818183432.GA4487@felix-thinkpad.cavium.com>

Signed-off-by: Veerasenareddy Burru <veerasenareddy.burru@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 0402b18..e947783 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1545,13 +1545,27 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 static int liquidio_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	int ret = 0;
 
-	lio->mtu = new_mtu;
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
 
-	netif_info(lio, probe, lio->netdev, "MTU Changed from %d to %d\n",
-		   netdev->mtu, new_mtu);
+	nctrl.ncmd.u64 = 0;
+	nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MTU;
+	nctrl.ncmd.s.param1 = new_mtu;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.wait_time = LIO_CMD_WAIT_TM;
+	nctrl.netpndev = (u64)netdev;
+	nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
 
-	netdev->mtu = new_mtu;
+	ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+	if (ret < 0) {
+		dev_err(&oct->pci_dev->dev, "Failed to set MTU\n");
+		return -EIO;
+	}
+
+	lio->mtu = new_mtu;
 
 	return 0;
 }
-- 
2.9.0

^ permalink raw reply related

* [PATCH net-next 1/2] liquidio: move macro definition to a proper place
From: Felix Manlunas @ 2017-08-18 18:35 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	veerasenareddy.burru
In-Reply-To: <20170818183432.GA4487@felix-thinkpad.cavium.com>

The macro LIO_CMD_WAIT_TM is not specific to the PF driver; it can be used
by the VF driver too, so move its definition from a PF-specific header file
to one that's common to PF and VF.

Signed-off-by: Veerasenareddy Burru <veerasenareddy.burru@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h | 2 --
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h  | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index dee6046..2aba524 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -24,8 +24,6 @@
 
 #include "cn23xx_pf_regs.h"
 
-#define LIO_CMD_WAIT_TM 100
-
 /* Register address and configuration for a CN23XX devices.
  * If device specific changes need to be made then add a struct to include
  * device specific fields as shown in the commented section
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 18d2955..a2274e6 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -238,6 +238,8 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 #define   OCTNET_CMD_VLAN_FILTER_ENABLE 0x1
 #define   OCTNET_CMD_VLAN_FILTER_DISABLE 0x0
 
+#define   LIO_CMD_WAIT_TM 100
+
 /* RX(packets coming from wire) Checksum verification flags */
 /* TCP/UDP csum */
 #define   CNNIC_L4SUM_VERIFIED             0x1
-- 
2.9.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.