Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH -next 20/22] net: socket: add __compat_sys_getsockopt() helper; remove in-kernel call to compat syscall
From: Dominik Brodowski @ 2018-03-16 17:06 UTC (permalink / raw)
  To: linux-kernel, torvalds, davem; +Cc: netdev
In-Reply-To: <20180316170614.5392-1-linux@dominikbrodowski.net>

Using the net-internal helper __compat_sys_getsockopt() allows us to avoid
the internal calls to the compat_sys_getsockopt() syscall.

Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 net/compat.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index 75bfcbbb2e3e..cdf5b0c1b962 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -509,8 +509,9 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta
 }
 EXPORT_SYMBOL(compat_sock_get_timestampns);
 
-COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
-		       char __user *, optval, int __user *, optlen)
+static int __compat_sys_getsockopt(int fd, int level, int optname,
+				   char __user *optval,
+				   int __user *optlen)
 {
 	int err;
 	struct socket *sock = sockfd_lookup(fd, &err);
@@ -536,6 +537,12 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
 	return err;
 }
 
+COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
+		       char __user *, optval, int __user *, optlen)
+{
+	return __compat_sys_getsockopt(fd, level, optname, optval, optlen);
+}
+
 struct compat_group_req {
 	__u32				 gr_interface;
 	struct __kernel_sockaddr_storage gr_group
@@ -874,8 +881,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 					      compat_ptr(a[3]), a[4]);
 		break;
 	case SYS_GETSOCKOPT:
-		ret = compat_sys_getsockopt(a0, a1, a[2],
-				compat_ptr(a[3]), compat_ptr(a[4]));
+		ret = __compat_sys_getsockopt(a0, a1, a[2],
+					      compat_ptr(a[3]),
+					      compat_ptr(a[4]));
 		break;
 	case SYS_SENDMSG:
 		ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
-- 
2.16.2

^ permalink raw reply related

* [PATCH -next 21/22] net: socket: add __compat_sys_recvmmsg() helper; remove in-kernel call to compat syscall
From: Dominik Brodowski @ 2018-03-16 17:06 UTC (permalink / raw)
  To: linux-kernel, torvalds, davem; +Cc: netdev
In-Reply-To: <20180316170614.5392-1-linux@dominikbrodowski.net>

Using the net-internal helper __compat_sys_recvmmsg() allows us to avoid
the internal calls to the compat_sys_recvmmsg() syscall.

Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 net/compat.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index cdf5b0c1b962..7b2ae42a1598 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -787,9 +787,9 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
 	return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen);
 }
 
-COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
-		       unsigned int, vlen, unsigned int, flags,
-		       struct compat_timespec __user *, timeout)
+static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				 unsigned int vlen, unsigned int flags,
+				 struct compat_timespec __user *timeout)
 {
 	int datagrams;
 	struct timespec ktspec;
@@ -809,6 +809,13 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
 	return datagrams;
 }
 
+COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+		       unsigned int, vlen, unsigned int, flags,
+		       struct compat_timespec __user *, timeout)
+{
+	return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+}
+
 COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 {
 	u32 a[AUDITSC_ARGS];
@@ -895,8 +902,8 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
 	case SYS_RECVMMSG:
-		ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
-					  compat_ptr(a[4]));
+		ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+					    compat_ptr(a[4]));
 		break;
 	case SYS_ACCEPT4:
 		ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
-- 
2.16.2

^ permalink raw reply related

* [PATCH -next 22/22] net: socket: add __compat_sys_...msg() helpers; remove in-kernel calls to compat syscalls
From: Dominik Brodowski @ 2018-03-16 17:06 UTC (permalink / raw)
  To: linux-kernel, torvalds, davem; +Cc: netdev
In-Reply-To: <20180316170614.5392-1-linux@dominikbrodowski.net>

Using the net-internal helpers __compat_sys_...msg() allows us to avoid
the internal calls to the compat_sys_...msg() syscalls.
compat_sys_recvmmsg() is handled in a different patch.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 net/compat.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/net/compat.c b/net/compat.c
index 7b2ae42a1598..5ae7437d3853 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -747,25 +747,48 @@ static unsigned char nas[21] = {
 };
 #undef AL
 
-COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags)
+static inline long __compat_sys_sendmsg(int fd,
+					struct compat_msghdr __user *msg,
+					unsigned int flags)
 {
 	return __sys_sendmsg(fd, (struct user_msghdr __user *)msg,
 			     flags | MSG_CMSG_COMPAT, false);
 }
 
-COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
-		       unsigned int, vlen, unsigned int, flags)
+COMPAT_SYSCALL_DEFINE3(sendmsg, int, fd, struct compat_msghdr __user *, msg,
+		       unsigned int, flags)
+{
+	return __compat_sys_sendmsg(fd, msg, flags);
+}
+
+static inline long __compat_sys_sendmmsg(int fd,
+					 struct compat_mmsghdr __user *mmsg,
+					 unsigned int vlen, unsigned int flags)
 {
 	return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
 			      flags | MSG_CMSG_COMPAT, false);
 }
 
-COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg, unsigned int, flags)
+COMPAT_SYSCALL_DEFINE4(sendmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+		       unsigned int, vlen, unsigned int, flags)
+{
+	return __compat_sys_sendmmsg(fd, mmsg, vlen, flags);
+}
+
+static inline long __compat_sys_recvmsg(int fd,
+					struct compat_msghdr __user *msg,
+					unsigned int flags)
 {
 	return __sys_recvmsg(fd, (struct user_msghdr __user *)msg,
 			     flags | MSG_CMSG_COMPAT, false);
 }
 
+COMPAT_SYSCALL_DEFINE3(recvmsg, int, fd, struct compat_msghdr __user *, msg,
+		       unsigned int, flags)
+{
+	return __compat_sys_recvmsg(fd, msg, flags);
+}
+
 static inline long __compat_sys_recvfrom(int fd, void __user *buf,
 					 compat_size_t len, unsigned int flags,
 					 struct sockaddr __user *addr,
@@ -893,13 +916,13 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
 					      compat_ptr(a[4]));
 		break;
 	case SYS_SENDMSG:
-		ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
+		ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
 		break;
 	case SYS_SENDMMSG:
-		ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
+		ret = __compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
 		break;
 	case SYS_RECVMSG:
-		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
+		ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
 	case SYS_RECVMMSG:
 		ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
-- 
2.16.2

^ permalink raw reply related

* [PATCH net-next] liquidio: Simplified napi poll
From: Felix Manlunas @ 2018-03-16 17:21 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	felix.manlunas, intiyaz.basha

From: Intiyaz Basha <intiyaz.basha@cavium.com>

1) Moved interrupt enable related code from octeon_process_droq_poll_cmd()
   to separate function octeon_enable_irq().
2) Removed wrapper function octeon_process_droq_poll_cmd(), and directly
   using octeon_droq_process_poll_pkts().
3) Removed unused macros POLL_EVENT_XXX.
 
Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    |  7 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c | 83 ++++++++--------------
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 11 ++-
 3 files changed, 35 insertions(+), 66 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 8bb4cfb..84f6594 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -636,9 +636,7 @@ static int liquidio_napi_poll(struct napi_struct *napi, int budget)
 	iq_no = droq->q_no;
 
 	/* Handle Droq descriptors */
-	work_done = octeon_process_droq_poll_cmd(oct, droq->q_no,
-						 POLL_EVENT_PROCESS_PKTS,
-						 budget);
+	work_done = octeon_droq_process_poll_pkts(oct, droq, budget);
 
 	/* Flush the instruction queue */
 	iq = oct->instr_queue[iq_no];
@@ -669,8 +667,7 @@ static int liquidio_napi_poll(struct napi_struct *napi, int budget)
 		tx_done = 1;
 		napi_complete_done(napi, work_done);
 
-		octeon_process_droq_poll_cmd(droq->oct_dev, droq->q_no,
-					     POLL_EVENT_ENABLE_INTR, 0);
+		octeon_enable_irq(droq->oct_dev, droq->q_no);
 		return 0;
 	}
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
index 3461d65..f044718 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
@@ -788,7 +788,7 @@ static inline void octeon_droq_drop_packets(struct octeon_device *oct,
  * called before calling this routine.
  */
 
-static int
+int
 octeon_droq_process_poll_pkts(struct octeon_device *oct,
 			      struct octeon_droq *droq, u32 budget)
 {
@@ -835,71 +835,46 @@ static inline void octeon_droq_drop_packets(struct octeon_device *oct,
 	return total_pkts_processed;
 }
 
+/* Enable Pkt Interrupt */
 int
-octeon_process_droq_poll_cmd(struct octeon_device *oct, u32 q_no, int cmd,
-			     u32 arg)
+octeon_enable_irq(struct octeon_device *oct, u32 q_no)
 {
-	struct octeon_droq *droq;
-
-	droq = oct->droq[q_no];
+	switch (oct->chip_id) {
+	case OCTEON_CN66XX:
+	case OCTEON_CN68XX: {
+		struct octeon_cn6xxx *cn6xxx =
+			(struct octeon_cn6xxx *)oct->chip;
+		unsigned long flags;
+		u32 value;
 
-	if (cmd == POLL_EVENT_PROCESS_PKTS)
-		return octeon_droq_process_poll_pkts(oct, droq, arg);
+		spin_lock_irqsave
+			(&cn6xxx->lock_for_droq_int_enb_reg, flags);
+		value = octeon_read_csr(oct, CN6XXX_SLI_PKT_TIME_INT_ENB);
+		value |= (1 << q_no);
+		octeon_write_csr(oct, CN6XXX_SLI_PKT_TIME_INT_ENB, value);
+		value = octeon_read_csr(oct, CN6XXX_SLI_PKT_CNT_INT_ENB);
+		value |= (1 << q_no);
+		octeon_write_csr(oct, CN6XXX_SLI_PKT_CNT_INT_ENB, value);
 
-	if (cmd == POLL_EVENT_PENDING_PKTS) {
-		u32 pkt_cnt = atomic_read(&droq->pkts_pending);
+		/* don't bother flushing the enables */
 
-		return  octeon_droq_process_packets(oct, droq, pkt_cnt);
+		spin_unlock_irqrestore
+			(&cn6xxx->lock_for_droq_int_enb_reg, flags);
 	}
-
-	if (cmd == POLL_EVENT_ENABLE_INTR) {
-		u32 value;
-		unsigned long flags;
-
-		/* Enable Pkt Interrupt */
-		switch (oct->chip_id) {
-		case OCTEON_CN66XX:
-		case OCTEON_CN68XX: {
-			struct octeon_cn6xxx *cn6xxx =
-				(struct octeon_cn6xxx *)oct->chip;
-			spin_lock_irqsave
-				(&cn6xxx->lock_for_droq_int_enb_reg, flags);
-			value =
-				octeon_read_csr(oct,
-						CN6XXX_SLI_PKT_TIME_INT_ENB);
-			value |= (1 << q_no);
-			octeon_write_csr(oct,
-					 CN6XXX_SLI_PKT_TIME_INT_ENB,
-					 value);
-			value =
-				octeon_read_csr(oct,
-						CN6XXX_SLI_PKT_CNT_INT_ENB);
-			value |= (1 << q_no);
-			octeon_write_csr(oct,
-					 CN6XXX_SLI_PKT_CNT_INT_ENB,
-					 value);
-
-			/* don't bother flushing the enables */
-
-			spin_unlock_irqrestore
-				(&cn6xxx->lock_for_droq_int_enb_reg, flags);
-			return 0;
-		}
 		break;
-		case OCTEON_CN23XX_PF_VID: {
-			lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
-		}
+	case OCTEON_CN23XX_PF_VID:
+		lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
 		break;
 
-		case OCTEON_CN23XX_VF_VID:
-			lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
+	case OCTEON_CN23XX_VF_VID:
+		lio_enable_irq(oct->droq[q_no], oct->instr_queue[q_no]);
 		break;
-		}
-		return 0;
+	default:
+		dev_err(&oct->pci_dev->dev, "%s Unknown Chip\n", __func__);
+		return 1;
 	}
 
-	dev_err(&oct->pci_dev->dev, "%s Unknown command: %d\n", __func__, cmd);
-	return -EINVAL;
+	return 0;
 }
 
 int octeon_register_droq_ops(struct octeon_device *oct, u32 q_no,
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
index 815a9f5..f28f262 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
@@ -123,11 +123,6 @@ struct oct_droq_stats {
 
 };
 
-#define POLL_EVENT_INTR_ARRIVED  1
-#define POLL_EVENT_PROCESS_PKTS  2
-#define POLL_EVENT_PENDING_PKTS  3
-#define POLL_EVENT_ENABLE_INTR   4
-
 /* The maximum number of buffers that can be dispatched from the
  * output/dma queue. Set to 64 assuming 1K buffers in DROQ and the fact that
  * max packet size from DROQ is 64K.
@@ -414,8 +409,10 @@ int octeon_droq_process_packets(struct octeon_device *oct,
 				struct octeon_droq *droq,
 				u32 budget);
 
-int octeon_process_droq_poll_cmd(struct octeon_device *oct, u32 q_no,
-				 int cmd, u32 arg);
+int octeon_droq_process_poll_pkts(struct octeon_device *oct,
+				  struct octeon_droq *droq, u32 budget);
+
+int octeon_enable_irq(struct octeon_device *oct, u32 q_no);
 
 void octeon_droq_check_oom(struct octeon_droq *droq);
 
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Andrew Lunn @ 2018-03-16 17:22 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: David S. Miller, netdev, Florian Fainelli, Greg Kroah-Hartman,
	Sekhar Nori, linux-kernel, linux-omap
In-Reply-To: <20180314222624.12744-1-grygorii.strashko@ti.com>

On Wed, Mar 14, 2018 at 05:26:22PM -0500, Grygorii Strashko wrote:
> Some ethernet drivers (like TI CPSW) may connect and manage >1 Net PHYs per
> one netdevice, as result such drivers will produce warning during system
> boot and fail to connect second phy to netdevice when PHYLIB framework
> will try to create sysfs link netdev->phydev for second PHY
> in phy_attach_direct(), because sysfs link with the same name has been
> created already for the first PHY.
> As result, second CPSW external port will became unusable.
> This issue was introduced by commits:
> 5568363f0cb3 ("net: phy: Create sysfs reciprocal links for attached_dev/phydev"
> a3995460491d ("net: phy: Relax error checking on sysfs_create_link()"

I wonder if it would be better to add a flag to the phydev that
indicates it is the second PHY connected to a MAC? Add a bit to
phydrv->mdiodrv.flags. If that bit is set, don't create the sysfs
file.

For 99% of MAC drivers, having two PHYs is an error, so we want to aid
debug by reporting the sysfs error.

      Andrew

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Linus Torvalds @ 2018-03-16 17:29 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Kees Cook, Andrew Morton, Josh Poimboeuf, Rasmus Villemoes,
	Randy Dunlap, Miguel Ojeda, Ingo Molnar, David Laight, Ian Abbott,
	linux-input, linux-btrfs, Network Development,
	Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <ab2933a2-ca7b-eb3b-744d-bc037b1be688@redhat.com>

On Fri, Mar 16, 2018 at 4:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
>
> If you want to catch stack frames which have unbounded size,
> -Werror=stack-usage=1000 or -Werror=vla-larger-than=1000 (with the constant
> adjusted as needed) might be the better approach.

No, we want to catch *variable* stack sizes.

Does "-Werror=vla-larger-than=0" perhaps work for that? No, because
the stupid compiler says that is "meaningless".

And no, using "-Werror=vla-larger-than=1" doesn't work either, because
the moronic compiler continues to think that "vla" is about the
_type_, not the code:

   t.c: In function ‘test’:
   t.c:6:6: error: argument to variable-length array is too large
[-Werror=vla-larger-than=]
     int array[(1,100)];

Gcc people are crazy.

Is there really no way to just say "shut up about the stupid _syntax_
issue that is entirely irrelevant, and give us the _code_ issue".

                 Linus

^ permalink raw reply

* Re: [PATCH RFC RFC] rds: Use NETDEV_UNREGISTER in rds_tcp_dev_event() (then kill NETDEV_UNREGISTER_FINAL)
From: Sowmini Varadhan @ 2018-03-16 17:29 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: santosh.shilimkar, davem, netdev, linux-rdma, rds-devel, edumazet
In-Reply-To: <512c2483-aa62-b385-6079-1b6ff8ba4636@virtuozzo.com>

I had taken some of this offline, but it occurs to me
that some of these notes should be saved to the netdev archives, 
in case this question pops up again in a few years.

When I run your patch, I get a repeatable panic by doing
  modprobe rds-tcp
  ip netns create blue
the panic is because we are finding a null trn in rds_tcp_init_net.

I think there's something very disturbed about calling
register_pernet_operations() twice, once via 
register_pernet_device() and again via register_pernet_subsys().

I suspect this has everything to do with the panic but I have
not had time to debug every little detail here.

In general, rds_tcp is not a network device, it is a kernel
module.  That is the fundamental problem here. 

To repeat the comments from net_namespace.h:
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.

when the "blue" netns starts up, it creates at least one kernel listen
socket on *.16385. This socket, and any other child/client sockets 
created must be cleaned up before the cleanup_net can happen.

This is why I chose to call register_pernet_subsys. Again, as per
comments in net_namespace.h:

 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.

On (03/16/18 18:51), Kirill Tkhai wrote:
> > Let's find another approach. Could you tell what problem we have in 
> > case of rds_tcp_dev_ops is declared as pernet_device?

As above, rds-tcp is not a network device.

> One more question. Which time we take a reference of loopback device?
> Is it possible before we created a net completely?

We don't take a reference on the loopback device.
We make sure none of the kernel sockets does a get_net() so
that we don't block the cleanup_net, and then, when all
the network interfaces have been taken down (loopback is
the last one) we know there are no more packets coming in
and out, so it is safe to dismantle all kernel sockets 
created by rds-tcp.

Hope that helps.

--Sowmini

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Florian Weimer @ 2018-03-16 17:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Andrew Morton, Josh Poimboeuf, Rasmus Villemoes,
	Randy Dunlap, Miguel Ojeda, Ingo Molnar, David Laight, Ian Abbott,
	linux-input, linux-btrfs, Network Development,
	Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <CA+55aFx92dXg-qnrjGS2Rsna6TE5HSPBgGpSA3cv4n_n3RqBzA@mail.gmail.com>

On 03/16/2018 06:29 PM, Linus Torvalds wrote:

> Gcc people are crazy.

End of discussion from me.  This is not acceptable.

Florian

^ permalink raw reply

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Florian Fainelli @ 2018-03-16 17:34 UTC (permalink / raw)
  To: Andrew Lunn, Grygorii Strashko
  Cc: David S. Miller, netdev, Greg Kroah-Hartman, Sekhar Nori,
	linux-kernel, linux-omap
In-Reply-To: <20180316172234.GA4212@lunn.ch>



On 03/16/2018 10:22 AM, Andrew Lunn wrote:
> On Wed, Mar 14, 2018 at 05:26:22PM -0500, Grygorii Strashko wrote:
>> Some ethernet drivers (like TI CPSW) may connect and manage >1 Net PHYs per
>> one netdevice, as result such drivers will produce warning during system
>> boot and fail to connect second phy to netdevice when PHYLIB framework
>> will try to create sysfs link netdev->phydev for second PHY
>> in phy_attach_direct(), because sysfs link with the same name has been
>> created already for the first PHY.
>> As result, second CPSW external port will became unusable.
>> This issue was introduced by commits:
>> 5568363f0cb3 ("net: phy: Create sysfs reciprocal links for attached_dev/phydev"
>> a3995460491d ("net: phy: Relax error checking on sysfs_create_link()"
> 
> I wonder if it would be better to add a flag to the phydev that
> indicates it is the second PHY connected to a MAC? Add a bit to
> phydrv->mdiodrv.flags. If that bit is set, don't create the sysfs
> file.

We could indeed do that, I am fine with Grygorii's approach though in
making the creation more silent and non fatal.

> 
> For 99% of MAC drivers, having two PHYs is an error, so we want to aid
> debug by reporting the sysfs error.
That is true, either way is fine with me, really.
-- 
Florian

^ permalink raw reply

* Re: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address support for ethtool nftuple filters
From: Alexander Duyck @ 2018-03-16 17:38 UTC (permalink / raw)
  To: Brown, Aaron F
  Cc: Gomes, Vinicius, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, Sanchez-Palencia, Jesus
In-Reply-To: <309B89C4C689E141A5FF6A0C5FB2118B8C8032A0@ORSMSX101.amr.corp.intel.com>

On Tue, Mar 13, 2018 at 8:04 PM, Brown, Aaron F <aaron.f.brown@intel.com> wrote:
>> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@osuosl.org] On
>> Behalf Of Vinicius Costa Gomes
>> Sent: Wednesday, March 7, 2018 4:37 PM
>> To: intel-wired-lan@lists.osuosl.org
>> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
>> palencia@intel.com>
>> Subject: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address
>> support for ethtool nftuple filters
>>
>> This adds the capability of configuring the queue steering of arriving
>> packets based on their source and destination MAC addresses.
>>
>> In practical terms this adds support for the following use cases,
>> characterized by these examples:
>>
>> $ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
>> (this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
>> to the RX queue 0)
>>
>> $ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
>> (this will direct packets with source address "44:44:44:44:44:44" to
>> the RX queue 3)
>
> This seems to work fine on i210, and the patch series allows me to set the rx filters on the i350, i354 and i211, but it is not directing the packets to the queue I request.
>
> With the exception of i210 the rx_queues number does not seem to be effected by setting the filter.  In the case of i211 the rx packets stay on rx_queue 0 with or without an ether src or dst filter.  The first example one seems to work at first since it's directing to queue 0, but changing the filter to "action 1" does not change the behavior.  With the i350 and i354 ports the packets are spread across the rx_queues with or without the filter set.

Do any of the other parts actually support this functionality? I don't
think they do.

What we might look at doing instead of trying to add support for other
parts would be to explicitly limit this functionality to the i210
since if I am not mistaken this may be a feature only available in
that hardware.

Thanks.

- Alex

^ permalink raw reply

* [PATCH net-next v3 0/1] skbuff: Fix applications not being woken for errors
From: Vinicius Costa Gomes @ 2018-03-16 17:41 UTC (permalink / raw)
  To: netdev; +Cc: Vinicius Costa Gomes, randy.e.witt, davem, eric.dumazet

Hi,

Changes from v2:
 - As the skbuff fix got applied into the net tree, removing from this
 series (didn't change the subject to avoid causing any more confusion);

Changes from v1:
 - Fixed comments from Willem de Bruijn, about the order of the
 options passed to getopt();
 - Added Reviewed-by and Fixes tags to patch (2);

Changes from the RFC:
 - tweaked commit messages;

Original cover letter:

This is actually a "bug report"-RFC instead of the more usual "new
feature"-RFC.

We are developing an application that uses TX hardware timestamping to
make some measurements, and during development Randy Witt initially
reported that the application poll() never unblocked when TX hardware
timestamping was enabled.

After some investigation, it turned out the problem wasn't only
exclusive to hardware timestamping, and could be reproduced with
software timestamping.

Applying patch (1), and running txtimestamp like this, for example:

$ ./txtimestamp -u -4 192.168.1.71 -c 1000 -D -l 1000 -F

('-u' to use UDP only, '-4' for ipv4 only, '-c 1000' to send 1000
packets for each test, '-D' to remove the delay between packets, '-l
1000' to set the payload to 1000 bytes, '-F' for configuring poll() to
wait forever)

will cause the application to become stuck in the poll() call most
of the time. (Note: I couldn't reproduce the issue running against an
address that is routed through loopback.)

Another interesting fact is that if the POLLIN event is added to the
poll() .events, poll() no longer becomes stuck, and more interestingly
the returned event in .revents is only POLLERR.

After a few debugging sessions, we got to 'sock_queue_err_skb()' and
how it notifies applications of the error just enqueued. Changing it
to use 'sk->sk_error_report()', fixes the issue for hardware and
software timestamping. That is patch (2).

The "solution" proposed in patch (2) looks like too big a hammer. If
it's not, then it seems that this problem has existed for a long time
(pre git), and it was uncommon for folks to reach the necessary
conditions to trigger it (my hypothesis is that it only triggers when the
error is reported from a different task context than the application).

Am I missing something here?

Cheers,

Vinicius Costa Gomes (1):
  selftests/txtimestamp: Add more configurable parameters

 .../selftests/networking/timestamping/txtimestamp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

--
2.16.2

^ permalink raw reply

* [PATCH net-next v3 1/1] selftests/txtimestamp: Add more configurable parameters
From: Vinicius Costa Gomes @ 2018-03-16 17:41 UTC (permalink / raw)
  To: netdev; +Cc: Vinicius Costa Gomes, randy.e.witt, davem, eric.dumazet
In-Reply-To: <20180316174114.4575-1-vinicius.gomes@intel.com>

Add a way to configure whether poll() should wait forever for an event,
the number of packets that should be sent for each test, and whether
there should be any delay between packets.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
 .../selftests/networking/timestamping/txtimestamp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/networking/timestamping/txtimestamp.c b/tools/testing/selftests/networking/timestamping/txtimestamp.c
index 5df07047ca86..81a98a240456 100644
--- a/tools/testing/selftests/networking/timestamping/txtimestamp.c
+++ b/tools/testing/selftests/networking/timestamping/txtimestamp.c
@@ -68,9 +68,11 @@ static int cfg_num_pkts = 4;
 static int do_ipv4 = 1;
 static int do_ipv6 = 1;
 static int cfg_payload_len = 10;
+static int cfg_poll_timeout = 100;
 static bool cfg_show_payload;
 static bool cfg_do_pktinfo;
 static bool cfg_loop_nodata;
+static bool cfg_no_delay;
 static uint16_t dest_port = 9000;
 
 static struct sockaddr_in daddr;
@@ -171,7 +173,7 @@ static void __poll(int fd)
 
 	memset(&pollfd, 0, sizeof(pollfd));
 	pollfd.fd = fd;
-	ret = poll(&pollfd, 1, 100);
+	ret = poll(&pollfd, 1, cfg_poll_timeout);
 	if (ret != 1)
 		error(1, errno, "poll");
 }
@@ -371,7 +373,8 @@ static void do_test(int family, unsigned int opt)
 			error(1, errno, "send");
 
 		/* wait for all errors to be queued, else ACKs arrive OOO */
-		usleep(50 * 1000);
+		if (!cfg_no_delay)
+			usleep(50 * 1000);
 
 		__poll(fd);
 
@@ -392,6 +395,9 @@ static void __attribute__((noreturn)) usage(const char *filepath)
 			"  -4:   only IPv4\n"
 			"  -6:   only IPv6\n"
 			"  -h:   show this message\n"
+			"  -c N: number of packets for each test\n"
+			"  -D:   no delay between packets\n"
+			"  -F:   poll() waits forever for an event\n"
 			"  -I:   request PKTINFO\n"
 			"  -l N: send N bytes at a time\n"
 			"  -n:   set no-payload option\n"
@@ -409,7 +415,7 @@ static void parse_opt(int argc, char **argv)
 	int proto_count = 0;
 	char c;
 
-	while ((c = getopt(argc, argv, "46hIl:np:rRux")) != -1) {
+	while ((c = getopt(argc, argv, "46c:DFhIl:np:rRux")) != -1) {
 		switch (c) {
 		case '4':
 			do_ipv6 = 0;
@@ -417,6 +423,15 @@ static void parse_opt(int argc, char **argv)
 		case '6':
 			do_ipv4 = 0;
 			break;
+		case 'c':
+			cfg_num_pkts = strtoul(optarg, NULL, 10);
+			break;
+		case 'D':
+			cfg_no_delay = true;
+			break;
+		case 'F':
+			cfg_poll_timeout = -1;
+			break;
 		case 'I':
 			cfg_do_pktinfo = true;
 			break;
-- 
2.16.2

^ permalink raw reply related

* RE: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: David Laight @ 2018-03-16 17:44 UTC (permalink / raw)
  To: 'Linus Torvalds', Florian Weimer
  Cc: Kees Cook, Andrew Morton, Josh Poimboeuf, Rasmus Villemoes,
	Randy Dunlap, Miguel Ojeda, Ingo Molnar, Ian Abbott, linux-input,
	linux-btrfs, Network Development, Linux Kernel Mailing List,
	Kernel Hardening
In-Reply-To: <CA+55aFx92dXg-qnrjGS2Rsna6TE5HSPBgGpSA3cv4n_n3RqBzA@mail.gmail.com>

From: Linus Torvalds
> Sent: 16 March 2018 17:29
> On Fri, Mar 16, 2018 at 4:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
> >
> > If you want to catch stack frames which have unbounded size,
> > -Werror=stack-usage=1000 or -Werror=vla-larger-than=1000 (with the constant
> > adjusted as needed) might be the better approach.
> 
> No, we want to catch *variable* stack sizes.
> 
> Does "-Werror=vla-larger-than=0" perhaps work for that? No, because
> the stupid compiler says that is "meaningless".
> 
> And no, using "-Werror=vla-larger-than=1" doesn't work either, because
> the moronic compiler continues to think that "vla" is about the
> _type_, not the code:
> 
>    t.c: In function ‘test’:
>    t.c:6:6: error: argument to variable-length array is too large
> [-Werror=vla-larger-than=]
>      int array[(1,100)];
> 
> Gcc people are crazy.
> 
> Is there really no way to just say "shut up about the stupid _syntax_
> issue that is entirely irrelevant, and give us the _code_ issue".

I looked at the generated code for one of the constant sized VLA that
the compiler barfed at.
It seemed to subtract constants from %sp separately for the VLA.
So it looks like the compiler treats them as VLA even though it
knows the size.
That is probably a missing optimisation.

	David


^ permalink raw reply

* [PATCH net-next 1/2] tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS
From: Yousuk Seung @ 2018-03-16 17:51 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Yousuk Seung, Neal Cardwell, Priyaranjan Jha,
	Soheil Hassas Yeganeh, Yuchung Cheng

This patch adds TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS
that reports tcp_sock.snd_ssthresh.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp.c           | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 4c0ae0faf7ca..560374c978f9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -243,6 +243,7 @@ enum {
 	TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */
 	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
 	TCP_NLA_CA_STATE,	/* ca_state of socket */
+	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
 
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fb350f740f69..e553f84bde83 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	u32 rate;
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
-			  4 * nla_total_size(sizeof(u32)) +
+			  5 * nla_total_size(sizeof(u32)) +
 			  3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
@@ -3061,6 +3061,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
-- 
2.16.2.804.g6dcf76e118-goog

^ permalink raw reply related

* [PATCH net-next 2/2] net-tcp_bbr: set tp->snd_ssthresh to BDP upon STARTUP exit
From: Yousuk Seung @ 2018-03-16 17:51 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Yousuk Seung, Neal Cardwell, Priyaranjan Jha,
	Soheil Hassas Yeganeh, Yuchung Cheng

Set tp->snd_ssthresh to BDP upon STARTUP exit. This allows us
to check if a BBR flow exited STARTUP and the BDP at the
time of STARTUP exit with SCM_TIMESTAMPING_OPT_STATS. Since BBR does not
use snd_ssthresh this fix has no impact on BBR's behavior.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_bbr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index c92014cb1e16..158d105e76da 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -731,6 +731,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
 		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
 		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
+		tcp_sk(sk)->snd_ssthresh =
+				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
 	    tcp_packets_in_flight(tcp_sk(sk)) <=
@@ -834,6 +836,7 @@ static void bbr_init(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->prior_cwnd = 0;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	bbr->rtt_cnt = 0;
 	bbr->next_rtt_delivered = 0;
 	bbr->prev_ca_state = TCP_CA_Open;
@@ -886,7 +889,7 @@ static u32 bbr_undo_cwnd(struct sock *sk)
 static u32 bbr_ssthresh(struct sock *sk)
 {
 	bbr_save_cwnd(sk);
-	return TCP_INFINITE_SSTHRESH;	 /* BBR does not use ssthresh */
+	return tcp_sk(sk)->snd_ssthresh;
 }
 
 static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
-- 
2.16.2.804.g6dcf76e118-goog

^ permalink raw reply related

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Al Viro @ 2018-03-16 17:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Miguel Ojeda, Ingo Molnar,
	David Laight, Ian Abbott, linux-input, linux-btrfs,
	Network Development, Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <CA+55aFx92dXg-qnrjGS2Rsna6TE5HSPBgGpSA3cv4n_n3RqBzA@mail.gmail.com>

On Fri, Mar 16, 2018 at 10:29:16AM -0700, Linus Torvalds wrote:
>    t.c: In function ‘test’:
>    t.c:6:6: error: argument to variable-length array is too large
> [-Werror=vla-larger-than=]
>      int array[(1,100)];
> 
> Gcc people are crazy.

That's not them, that's C standard regarding ICE.  1,100 is *not* a
constant expression as far as the standard is concerned, and that
type is actually a VLA with the size that can be optimized into
a compiler-calculated value.

Would you argue that in
void foo(char c)
{
	int a[(c<<1) + 10 - c + 2 - c];

a is not a VLA?  Sure, compiler probably would be able to reduce
that expression to 12, but demanding that to be recognized means
that compiler must do a bunch of optimizations in the middle of
typechecking.

expr, constant_expression is not a constant_expression.  And in
this particular case the standard is not insane - the only reason
for using that is typechecking and _that_ can be achieved without
violating 6.6p6:
	sizeof(expr,0) * 0 + ICE
*is* an integer constant expression, and it gives you exact same
typechecking.  So if somebody wants to play odd games, they can
do that just fine, without complicating the logics for compilers...

^ permalink raw reply

* [PATCH net 1/1] qede: Fix barrier usage after tx doorbell write.
From: Manish Chopra @ 2018-03-16 17:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, ariel.elior, michal.kalderon

Since commit c5ad119fb6c09b0297446be05bd66602fa564758
("net: sched: pfifo_fast use skb_array") driver is exposed
to an issue where it is hitting NULL skbs while handling TX
completions. Driver uses mmiowb() to flush the writes to the
doorbell bar which is a write-combined bar, however on x86
mmiowb() does not flush the write combined buffer.

This patch fixes this problem by replacing mmiowb() with wmb()
after the write combined doorbell write so that writes are
flushed and synchronized from more than one processor.

Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_fp.c |   10 ++++------
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index dafc079..2e921ca 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -320,13 +320,11 @@ static inline void qede_update_tx_producer(struct qede_tx_queue *txq)
 	barrier();
 	writel(txq->tx_db.raw, txq->doorbell_addr);

-	/* mmiowb is needed to synchronize doorbell writes from more than one
-	 * processor. It guarantees that the write arrives to the device before
-	 * the queue lock is released and another start_xmit is called (possibly
-	 * on another CPU). Without this barrier, the next doorbell can bypass
-	 * this doorbell. This is applicable to IA64/Altix systems.
+	/* Fence required to flush the write combined buffer, since another
+	 * CPU may write to the same doorbell address and data may be lost
+	 * due to relaxed order nature of write combined bar.
 	 */
-	mmiowb();
+	wmb();
 }

 static int qede_xdp_xmit(struct qede_dev *edev, struct qede_fastpath *fp,
-- 
1.7.1

^ permalink raw reply related

* Re: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address support for ethtool nftuple filters
From: Vinicius Costa Gomes @ 2018-03-16 17:59 UTC (permalink / raw)
  To: Alexander Duyck, Brown, Aaron F
  Cc: intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	Sanchez-Palencia, Jesus
In-Reply-To: <CAKgT0Uf3MmMRELo0c11Aqah92+shLFASzPkZxE6OzGU5bjvzfA@mail.gmail.com>

Hi,

Alexander Duyck <alexander.duyck@gmail.com> writes:

> On Tue, Mar 13, 2018 at 8:04 PM, Brown, Aaron F <aaron.f.brown@intel.com> wrote:
>>> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@osuosl.org] On
>>> Behalf Of Vinicius Costa Gomes
>>> Sent: Wednesday, March 7, 2018 4:37 PM
>>> To: intel-wired-lan@lists.osuosl.org
>>> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
>>> palencia@intel.com>
>>> Subject: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address
>>> support for ethtool nftuple filters
>>>
>>> This adds the capability of configuring the queue steering of arriving
>>> packets based on their source and destination MAC addresses.
>>>
>>> In practical terms this adds support for the following use cases,
>>> characterized by these examples:
>>>
>>> $ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
>>> (this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
>>> to the RX queue 0)
>>>
>>> $ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
>>> (this will direct packets with source address "44:44:44:44:44:44" to
>>> the RX queue 3)
>>
>> This seems to work fine on i210, and the patch series allows me to set the rx filters on the i350, i354 and i211, but it is not directing the packets to the queue I request.
>>
>> With the exception of i210 the rx_queues number does not seem to be affected by setting the filter.  In the case of i211 the rx packets stay on rx_queue 0 with or without an ether src or dst filter.  The first example one seems to work at first since it's directing to queue 0, but changing the filter to "action 1" does not change the behavior.  With the i350 and i354 ports the packets are spread across the rx_queues with or without the filter set.
>
> Do any of the other parts actually support this functionality? I don't
> think they do.

From what I can see, the only other part that supports queue steering (by MAC
addresses) is the 82575. But as I don't have any of those handy, making
it work only for the i210 seems more reasonable, to avoid getting into
this situation again.

>
> What we might look at doing instead of trying to add support for other
> parts would be to explicitly limit this functionality to the i210
> since if I am not mistaken this may be a feature only available in
> that hardware.

Sounds good to me.

>
> Thanks.
>
> - Alex


Cheers,
--
Vinicius

^ permalink raw reply

* Re: [PATCH net-next 1/2] tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS
From: Eric Dumazet @ 2018-03-16 18:04 UTC (permalink / raw)
  To: Yousuk Seung, David Miller
  Cc: netdev, Neal Cardwell, Priyaranjan Jha, Soheil Hassas Yeganeh,
	Yuchung Cheng
In-Reply-To: <20180316175107.224147-1-ysseung@google.com>



On 03/16/2018 10:51 AM, Yousuk Seung wrote:
> This patch adds TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS
> that reports tcp_sock.snd_ssthresh.
> 

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH net-next 3/9] selftests: pmtu: Introduce support for multiple tests
From: David Ahern @ 2018-03-16 18:06 UTC (permalink / raw)
  To: Stefano Brivio, David S . Miller
  Cc: Sabrina Dubroca, Steffen Klassert, netdev
In-Reply-To: <2870b8f3ab9ce40d3dbcfc1f308aba580f59197f.1521129192.git.sbrivio@redhat.com>

On 3/15/18 9:18 AM, Stefano Brivio wrote:
>  trap cleanup EXIT
>  
> -test_pmtu_vti6_exception
> +exitcode=0
> +for name in ${tests}; do
> +	echo "${name}: START"
> +	eval test_${name}
> +	ret=$?
> +	cleanup
> +
> +	if [ $ret -eq 0 ]; then echo "${name}: FAIL"; exitcode=1

ret = 0 == failure is counterintuitive for Linux.

> +	elif [ $ret -eq 1 ]; then echo "${name}: PASS"
> +	elif [ $ret -eq 2 ]; then echo "${name}: SKIP"

I use printf in other scripts so that the pass/fail verdicts line up, e.g.,
printf "        %-60s  [PASS]\n" "${name}"

> +	fi
> +done
>  
> -exit 0
> +exit ${exitcode}
> 

^ permalink raw reply

* Re: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address support for ethtool nftuple filters
From: Alexander Duyck @ 2018-03-16 18:07 UTC (permalink / raw)
  To: Vinicius Costa Gomes
  Cc: Brown, Aaron F, intel-wired-lan@lists.osuosl.org,
	netdev@vger.kernel.org, Sanchez-Palencia, Jesus
In-Reply-To: <87muz7vr4n.fsf@intel.com>

On Fri, Mar 16, 2018 at 10:59 AM, Vinicius Costa Gomes
<vinicius.gomes@intel.com> wrote:
> Hi,
>
> Alexander Duyck <alexander.duyck@gmail.com> writes:
>
>> On Tue, Mar 13, 2018 at 8:04 PM, Brown, Aaron F <aaron.f.brown@intel.com> wrote:
>>>> From: Intel-wired-lan [mailto:intel-wired-lan-bounces@osuosl.org] On
>>>> Behalf Of Vinicius Costa Gomes
>>>> Sent: Wednesday, March 7, 2018 4:37 PM
>>>> To: intel-wired-lan@lists.osuosl.org
>>>> Cc: netdev@vger.kernel.org; Sanchez-Palencia, Jesus <jesus.sanchez-
>>>> palencia@intel.com>
>>>> Subject: [Intel-wired-lan] [next-queue PATCH v4 6/8] igb: Add MAC address
>>>> support for ethtool nftuple filters
>>>>
>>>> This adds the capability of configuring the queue steering of arriving
>>>> packets based on their source and destination MAC addresses.
>>>>
>>>> In practical terms this adds support for the following use cases,
>>>> characterized by these examples:
>>>>
>>>> $ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
>>>> (this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
>>>> to the RX queue 0)
>>>>
>>>> $ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 action 3
>>>> (this will direct packets with source address "44:44:44:44:44:44" to
>>>> the RX queue 3)
>>>
>>> This seems to work fine on i210, and the patch series allows me to set the rx filters on the i350, i354 and i211, but it is not directing the packets to the queue I request.
>>>
>>> With the exception of i210 the rx_queues number does not seem to be affected by setting the filter.  In the case of i211 the rx packets stay on rx_queue 0 with or without an ether src or dst filter.  The first example one seems to work at first since it's directing to queue 0, but changing the filter to "action 1" does not change the behavior.  With the i350 and i354 ports the packets are spread across the rx_queues with or without the filter set.
>>
>> Do any of the other parts actually support this functionality? I don't
>> think they do.
>
> From what I can see, the only other part that supports queue steering (by MAC
> addresses) is the 82575. But as I don't have any of those handy, making
> it work only for the i210 seems more reasonable, to avoid getting into
> this situation again.

That sounds good to me. What you might do is add a comment explaining
that this is only supported on 82575 and i210 wherever you put the
check that limits this. Then if we have time for the
development/validation efforts, or someone in the community does they
could take it upon themselves to enable and test it for 82575.

I have done similar things in the past. As long as it is clear that
the reason why we limited it to i210 is mostly because of
development/validation resources somebody else can come along and
enable it if they have the resources and time to invest in doing so.

^ permalink raw reply

* [next-queue v2 4/4] ixgbe: enable tso with ipsec offload
From: Shannon Nelson @ 2018-03-16 18:09 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521223747-12018-1-git-send-email-shannon.nelson@oracle.com>

Fix things up to support TSO offload in conjunction
with IPsec hw offload.  This raises throughput with
IPsec offload on to nearly line rate.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
v2 updates from Alex's comments:
 - changed feature add from variable to #define
 - fixed a reverse christmas tree miss
 - GSO_PARTIAL'd the ipv4 header checksum
 - dropped an extra blank line


 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  9 +++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 26 ++++++++++++++++++--------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 5ddea43..68af127 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -929,8 +929,13 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 	ixgbe_ipsec_clear_hw_tables(adapter);
 
 	adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops;
-	adapter->netdev->features |= NETIF_F_HW_ESP;
-	adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+#define IXGBE_ESP_FEATURES	(NETIF_F_HW_ESP | \
+				 NETIF_F_HW_ESP_TX_CSUM | \
+				 NETIF_F_GSO_ESP)
+
+	adapter->netdev->features |= IXGBE_ESP_FEATURES;
+	adapter->netdev->hw_enc_features |= IXGBE_ESP_FEATURES;
 
 	return;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a54f3d8..b947435 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7721,7 +7721,8 @@ static void ixgbe_service_task(struct work_struct *work)
 
 static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 		     struct ixgbe_tx_buffer *first,
-		     u8 *hdr_len)
+		     u8 *hdr_len,
+		     struct ixgbe_ipsec_tx_data *itd)
 {
 	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
 	struct sk_buff *skb = first->skb;
@@ -7735,6 +7736,7 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 		unsigned char *hdr;
 	} l4;
 	u32 paylen, l4_offset;
+	u32 fceof_saidx = 0;
 	int err;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -7760,13 +7762,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 	if (ip.v4->version == 4) {
 		unsigned char *csum_start = skb_checksum_start(skb);
 		unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
+		int len = csum_start - trans_start;
 
 		/* IP header will have to cancel out any data that
-		 * is not a part of the outer IP header
+		 * is not a part of the outer IP header, so set to
+		 * a reverse csum if needed, else init check to 0.
 		 */
-		ip.v4->check = csum_fold(csum_partial(trans_start,
-						      csum_start - trans_start,
-						      0));
+		ip.v4->check = (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) ?
+					   csum_fold(csum_partial(trans_start,
+								  len, 0)) : 0;
 		type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
 
 		ip.v4->tot_len = 0;
@@ -7797,12 +7801,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 	mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
 	mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
+	fceof_saidx |= itd->sa_idx;
+	type_tucmd |= itd->flags | itd->trailer_len;
+
 	/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
 	vlan_macip_lens = l4.hdr - ip.hdr;
 	vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
+	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
 			  mss_l4len_idx);
 
 	return 1;
@@ -8493,7 +8500,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 	if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx))
 		goto out_drop;
 #endif
-	tso = ixgbe_tso(tx_ring, first, &hdr_len);
+	tso = ixgbe_tso(tx_ring, first, &hdr_len, &ipsec_tx);
 	if (tso < 0)
 		goto out_drop;
 	else if (!tso)
@@ -9902,8 +9909,11 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 
 	/* We can only support IPV4 TSO in tunnels if we can mangle the
 	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 * IPsec offload sets skb->encapsulation but still can handle
+	 * the TSO, so it's the exception.
 	 */
-	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
+	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID) &&
+	    !skb->sp)
 		features &= ~NETIF_F_TSO;
 
 	return features;
-- 
2.7.4

^ permalink raw reply related

* [next-queue v2 2/4] ixgbe: remove unneeded ipsec test in TX path
From: Shannon Nelson @ 2018-03-16 18:09 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521223747-12018-1-git-send-email-shannon.nelson@oracle.com>

Since the ipsec data fields will be zero anyway in the non-ipsec
case, we can remove the conditional jump.

Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 153cd9e..a54f3d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7864,10 +7864,8 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
 	vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-	if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC) {
-		fceof_saidx |= itd->sa_idx;
-		type_tucmd |= itd->flags | itd->trailer_len;
-	}
+	fceof_saidx |= itd->sa_idx;
+	type_tucmd |= itd->flags | itd->trailer_len;
 
 	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd, 0);
 }
-- 
2.7.4

^ permalink raw reply related

* [next-queue v2 1/4] ixgbe: no need for ipsec csum feature check
From: Shannon Nelson @ 2018-03-16 18:09 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521223747-12018-1-git-send-email-shannon.nelson@oracle.com>

With the patch
commit f8aa2696b4af ("esp: check the NETIF_F_HW_ESP_TX_CSUM bit before segmenting")
we no longer need to protect ourselves from checksum
offload requests on IPsec packets, so we can remove
the check in our .ndo_features_check callback.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8536942..153cd9e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9908,12 +9908,6 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
 		features &= ~NETIF_F_TSO;
 
-#ifdef CONFIG_XFRM_OFFLOAD
-	/* IPsec offload doesn't get along well with others *yet* */
-	if (skb->sp)
-		features &= ~(NETIF_F_TSO | NETIF_F_HW_CSUM);
-#endif
-
 	return features;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [next-queue v2 0/4] ixgbe: Enable tso and checksum offload with ipsec
From: Shannon Nelson @ 2018-03-16 18:09 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert

This patchset fixes up the bits for supporting TSO and checksum
offload in conjunction with IPsec offload.  This brings the
throughput of a simple iperf test back up to nearly line rate.

v2: fixes in patch 4 from Alex's comments

Shannon Nelson (4):
  ixgbe: no need for ipsec csum feature check
  ixgbe: remove unneeded ipsec test in Tx path
  ixgbe: no need for esp trailer if gso
  ixgbe: enable tso with ipsec offload

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 45 +++++++++++++++-----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 30 ++++++++---------
 2 files changed, 42 insertions(+), 33 deletions(-)

-- 
2.7.4

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox