Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2] net: phy: marvell10g: add thermal hwmon device
From: Russell King @ 2018-04-03  9:31 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, Guenter Roeck; +Cc: netdev

Add a thermal monitoring device for the Marvell 88x3310, which updates
once a second.  We also need to hook into the suspend/resume mechanism
to ensure that the thermal monitoring is reconfigured when we resume.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
v2: update to apply to net-next

 drivers/net/phy/marvell10g.c | 184 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 8a0bd98fdec7..db9d66781da6 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -21,8 +21,10 @@
  * If both the fiber and copper ports are connected, the first to gain
  * link takes priority and the other port is completely locked out.
  */
-#include <linux/phy.h>
+#include <linux/ctype.h>
+#include <linux/hwmon.h>
 #include <linux/marvell_phy.h>
+#include <linux/phy.h>
 
 enum {
 	MV_PCS_BASE_T		= 0x0000,
@@ -40,6 +42,19 @@ enum {
 	 */
 	MV_AN_CTRL1000		= 0x8000, /* 1000base-T control register */
 	MV_AN_STAT1000		= 0x8001, /* 1000base-T status register */
+
+	/* Vendor2 MMD registers */
+	MV_V2_TEMP_CTRL		= 0xf08a,
+	MV_V2_TEMP_CTRL_MASK	= 0xc000,
+	MV_V2_TEMP_CTRL_SAMPLE	= 0x0000,
+	MV_V2_TEMP_CTRL_DISABLE	= 0xc000,
+	MV_V2_TEMP		= 0xf08c,
+	MV_V2_TEMP_UNKNOWN	= 0x9600, /* unknown function */
+};
+
+struct mv3310_priv {
+	struct device *hwmon_dev;
+	char *hwmon_name;
 };
 
 static int mv3310_modify(struct phy_device *phydev, int devad, u16 reg,
@@ -60,17 +75,180 @@ static int mv3310_modify(struct phy_device *phydev, int devad, u16 reg,
 	return ret < 0 ? ret : 1;
 }
 
+#ifdef CONFIG_HWMON
+static umode_t mv3310_hwmon_is_visible(const void *data,
+				       enum hwmon_sensor_types type,
+				       u32 attr, int channel)
+{
+	if (type == hwmon_chip && attr == hwmon_chip_update_interval)
+		return 0444;
+	if (type == hwmon_temp && attr == hwmon_temp_input)
+		return 0444;
+	return 0;
+}
+
+static int mv3310_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			     u32 attr, int channel, long *value)
+{
+	struct phy_device *phydev = dev_get_drvdata(dev);
+	int temp;
+
+	if (type == hwmon_chip && attr == hwmon_chip_update_interval) {
+		*value = MSEC_PER_SEC;
+		return 0;
+	}
+
+	if (type == hwmon_temp && attr == hwmon_temp_input) {
+		temp = phy_read_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP);
+		if (temp < 0)
+			return temp;
+
+		*value = ((temp & 0xff) - 75) * 1000;
+
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct hwmon_ops mv3310_hwmon_ops = {
+	.is_visible = mv3310_hwmon_is_visible,
+	.read = mv3310_hwmon_read,
+};
+
+static u32 mv3310_hwmon_chip_config[] = {
+	HWMON_C_REGISTER_TZ | HWMON_C_UPDATE_INTERVAL,
+	0,
+};
+
+static const struct hwmon_channel_info mv3310_hwmon_chip = {
+	.type = hwmon_chip,
+	.config = mv3310_hwmon_chip_config,
+};
+
+static u32 mv3310_hwmon_temp_config[] = {
+	HWMON_T_INPUT,
+	0,
+};
+
+static const struct hwmon_channel_info mv3310_hwmon_temp = {
+	.type = hwmon_temp,
+	.config = mv3310_hwmon_temp_config,
+};
+
+static const struct hwmon_channel_info *mv3310_hwmon_info[] = {
+	&mv3310_hwmon_chip,
+	&mv3310_hwmon_temp,
+	NULL,
+};
+
+static const struct hwmon_chip_info mv3310_hwmon_chip_info = {
+	.ops = &mv3310_hwmon_ops,
+	.info = mv3310_hwmon_info,
+};
+
+static int mv3310_hwmon_config(struct phy_device *phydev, bool enable)
+{
+	u16 val;
+	int ret;
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP,
+			    MV_V2_TEMP_UNKNOWN);
+	if (ret < 0)
+		return ret;
+
+	val = enable ? MV_V2_TEMP_CTRL_SAMPLE : MV_V2_TEMP_CTRL_DISABLE;
+	ret = mv3310_modify(phydev, MDIO_MMD_VEND2, MV_V2_TEMP_CTRL,
+			    MV_V2_TEMP_CTRL_MASK, val);
+
+	return ret < 0 ? ret : 0;
+}
+
+static void mv3310_hwmon_disable(void *data)
+{
+	struct phy_device *phydev = data;
+
+	mv3310_hwmon_config(phydev, false);
+}
+
+static int mv3310_hwmon_probe(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev);
+	int i, j, ret;
+
+	priv->hwmon_name = devm_kstrdup(dev, dev_name(dev), GFP_KERNEL);
+	if (!priv->hwmon_name)
+		return -ENODEV;
+
+	for (i = j = 0; priv->hwmon_name[i]; i++) {
+		if (isalnum(priv->hwmon_name[i])) {
+			if (i != j)
+				priv->hwmon_name[j] = priv->hwmon_name[i];
+			j++;
+		}
+	}
+	priv->hwmon_name[j] = '\0';
+
+	ret = mv3310_hwmon_config(phydev, true);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action_or_reset(dev, mv3310_hwmon_disable, phydev);
+	if (ret)
+		return ret;
+
+	priv->hwmon_dev = devm_hwmon_device_register_with_info(dev,
+				priv->hwmon_name, phydev,
+				&mv3310_hwmon_chip_info, NULL);
+
+	return PTR_ERR_OR_ZERO(priv->hwmon_dev);
+}
+#else
+static inline int mv3310_hwmon_config(struct phy_device *phydev, bool enable)
+{
+	return 0;
+}
+
+static int mv3310_hwmon_probe(struct phy_device *phydev)
+{
+	return 0;
+}
+#endif
+
 static int mv3310_probe(struct phy_device *phydev)
 {
+	struct mv3310_priv *priv;
 	u32 mmd_mask = MDIO_DEVS_PMAPMD | MDIO_DEVS_AN;
+	int ret;
 
 	if (!phydev->is_c45 ||
 	    (phydev->c45_ids.devices_in_package & mmd_mask) != mmd_mask)
 		return -ENODEV;
 
+	priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	dev_set_drvdata(&phydev->mdio.dev, priv);
+
+	ret = mv3310_hwmon_probe(phydev);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int mv3310_suspend(struct phy_device *phydev)
+{
 	return 0;
 }
 
+static int mv3310_resume(struct phy_device *phydev)
+{
+	return mv3310_hwmon_config(phydev, true);
+}
+
 static int mv3310_config_init(struct phy_device *phydev)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, };
@@ -367,9 +545,11 @@ static struct phy_driver mv3310_drivers[] = {
 				  SUPPORTED_FIBRE |
 				  SUPPORTED_10000baseT_Full |
 				  SUPPORTED_Backplane,
-		.probe		= mv3310_probe,
 		.soft_reset	= gen10g_no_soft_reset,
 		.config_init	= mv3310_config_init,
+		.probe		= mv3310_probe,
+		.suspend	= mv3310_suspend,
+		.resume		= mv3310_resume,
 		.config_aneg	= mv3310_config_aneg,
 		.aneg_done	= mv3310_aneg_done,
 		.read_status	= mv3310_read_status,
-- 
2.7.4

^ permalink raw reply related

* Re: linux-next: build failure after merge of the tip tree
From: Peter Zijlstra @ 2018-04-03  9:30 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, David Miller,
	Networking, Linux-Next Mailing List, Linux Kernel Mailing List,
	David Howells
In-Reply-To: <20180403154122.00d76d61@canb.auug.org.au>

On Tue, Apr 03, 2018 at 03:41:22PM +1000, Stephen Rothwell wrote:

> Caused by commit
> 
>   9b8cce52c4b5 ("sched/wait: Remove the wait_on_atomic_t() API")
> 
> interacting with commits
> 
>   d3be4d244330 ("xrpc: Fix potential call vs socket/net destruction race")
>   31f5f9a1691e ("rxrpc: Fix apparent leak of rxrpc_local objects")
> 
> from the net-next tree.
> 
> Haven't we figured out how to remove/change APIs yet? :-(

I figured that since there were only a handful of users it wasn't a
popular API, also David very much knew of those patches changing it so
could easily have pulled in the special tip/sched/wait branch :/

^ permalink raw reply

* Re: [Patch net] af_unix: remove redundant lockdep class
From: Paolo Abeni @ 2018-04-03  9:07 UTC (permalink / raw)
  To: Cong Wang, netdev
In-Reply-To: <20180402180127.4047-1-xiyou.wangcong@gmail.com>

On Mon, 2018-04-02 at 11:01 -0700, Cong Wang wrote:
> After commit 581319c58600 ("net/socket: use per af lockdep classes for sk queues")
> sock queue locks now have per-af lockdep classes, including unix socket.
> It is no longer necessary to workaround it.
> 
> I noticed this while looking at a syzbot deadlock report, this patch
> itself doesn't fix it (this is why I don't add Reported-by).
> 
> Fixes: 581319c58600 ("net/socket: use per af lockdep classes for sk queues")
> Cc: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>  net/unix/af_unix.c | 10 ----------
>  1 file changed, 10 deletions(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 2d465bdeccbc..45971e173924 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -745,14 +745,6 @@ static struct proto unix_proto = {
>  	.obj_size		= sizeof(struct unix_sock),
>  };
>  
> -/*
> - * AF_UNIX sockets do not interact with hardware, hence they
> - * dont trigger interrupts - so it's safe for them to have
> - * bh-unsafe locking for their sk_receive_queue.lock. Split off
> - * this special lock-class by reinitializing the spinlock key:
> - */
> -static struct lock_class_key af_unix_sk_receive_queue_lock_key;
> -
>  static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
>  {
>  	struct sock *sk = NULL;
> @@ -767,8 +759,6 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
>  		goto out;
>  
>  	sock_init_data(sock, sk);
> -	lockdep_set_class(&sk->sk_receive_queue.lock,
> -				&af_unix_sk_receive_queue_lock_key);
>  
>  	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
>  	sk->sk_write_space	= unix_write_space;

LGTM

Acked-by: Paolo Abeni <pabeni@redhat.com>

^ permalink raw reply

* linux-next on x60: hangs when I request suspend was Re: linux-next on x60: network manager often complains "network is disabled" after resume
From: Pavel Machek @ 2018-04-03  8:51 UTC (permalink / raw)
  To: Dan Williams
  Cc: Woody Suwalski, Rafael J. Wysocki, kernel list,
	Linux-pm mailing list, Netdev list
In-Reply-To: <95efbba35c3389015d4919a59f8d01bc2d375a19.camel@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2655 bytes --]

Hi!

I wanted to re-test next (4.16.0-rc7-next-20180329), but that one does
not suspend at all.

I normally suspend by pressing power button in MATE, but that action
currently results in machine hanging.

								Pavel


On Mon 2018-03-26 10:33:55, Dan Williams wrote:
> On Sun, 2018-03-25 at 08:19 +0200, Pavel Machek wrote:
> > > > > Ok, what does 'nmcli dev' and 'nmcli radio' show?
> > > > 
> > > > Broken state.
> > > > 
> > > > pavel@amd:~$ nmcli dev
> > > > DEVICE  TYPE      STATE        CONNECTION
> > > > eth1    ethernet  unavailable  --
> > > > lo      loopback  unmanaged    --
> > > > wlan0   wifi      unmanaged    --
> > > 
> > > If the state is "unmanaged" on resume, that would indicate a
> > > problem
> > > with sleep/wake and likely not a kernel network device issue.
> > > 
> > > We should probably move this discussion to the NM lists to debug
> > > further.  Before you suspend, run "nmcli gen log level trace" to
> > > turn
> > > on full debug logging, then reproduce the issue, and send a pointer
> > > to
> > > those logs (scrubbed for anything you consider sensitive) to the NM
> > > mailing list.
> > 
> > Hmm :-)
> > 
> > root@amd:/data/pavel# nmcli gen log level trace
> > Error: Unknown log level 'trace'
> 
> What NM version?  'trace' is pretty old (since 1.0 from December 2014)
> so unless you're using a really, really old version of Debian I'd
> expect you'd have it.  Anyway, debug would do.
> 
> > root@amd:/data/pavel# nmcli gen log level help
> > Error: Unknown log level 'help'
> 
> nmcli gen help
> 
> > root@amd:/data/pavel# nmcli gen log level
> > Error: value for 'level' argument is required.
> > root@amd:/data/pavel# nmcli gen log level debug
> 
> This should be OK.
> 
> > root@amd:/data/pavel# cat /var/log/sys/log
> 
> It routes it to whatever the syslog 'daemon' facility logs to (however
> that's configured on your system).  Usually /var/log/messages or
> /var/log/daemon.log or sometimes your distro configures it to
> /var/log/NetworkManager.log.
> 
> Or if you're using a systemd-based distro, it would probably be in the
> systemd journal so "journalctl -b -u NetworkManager"
> 
> > Where do I get the logs? I don't see much in the syslog...
> 
> > And.. It seems that it is "every other suspend". One resume results
> > in
> > broken network, one in working one, one in broken one...
> 
> Does your distro use pm-utils, upower, or systemd for suspend/resume
> handling?
> 
> Dan

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
From: Pavel Machek @ 2018-04-03  8:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, David Laight, 'Rahul Lakkireddy',
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu
In-Reply-To: <20180320105427.bm4od7cpessbraag@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2512 bytes --]

Hi!

> > On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > > 
> > > > > So I do think we could do more in this area to improve driver performance, if the 
> > > > > code is correct and if there's actual benchmarks that are showing real benefits.
> > > > 
> > > > If it's about hotpath performance I'm all for it, but the use case here is
> > > > a debug facility...
> > > > 
> > > > And if we go down that road then we want a AVX based memcpy()
> > > > implementation which is runtime conditional on the feature bit(s) and
> > > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > > not make any sense.
> > > 
> > > Yeah, so generic memcpy() replacement is only feasible I think if the most 
> > > optimistic implementation is actually correct:
> > > 
> > >  - if no preempt disable()/enable() is required
> > > 
> > >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
> > >    any fashion
> > > 
> > >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > >    weird behavior if the FPU control word is modified to non-standard values by 
> > >    untrusted user-space
> > > 
> > > If we have to touch the FPU tag or control words then it's probably only good for 
> > > a specialized API.
> > 
> > I did not mean to have a general memcpy replacement. Rather something like
> > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > length does not justify the AVX stuff at all.
> 
> OK, fair enough.
> 
> Note that a generic version might still be worth trying out, if and only if it's 
> safe to access those vector registers directly: modern x86 CPUs will do their 
> non-constant memcpy()s via the common memcpy_erms() function - which could in 
> theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if 
> size (and alignment, perhaps) is a multiple of 32 bytes or so.

How is AVX2 supposed to help the memcpy speed?

If the copy is small, constant overhead will dominate, and I don't
think AVX2 is going to be win there.

If the copy is big, well, the copy loop will likely run out of L1 and
maybe even out of L2, and at that point speed of the loop does not
matter because memory is slow...?

Best regards,
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* [PATCH] vhost-net: add limitation of sent packets for tx polling
From: haibinzhang(张海斌) @ 2018-04-03  8:08 UTC (permalink / raw)
  To: mst@redhat.com, Jason Wang, kvm@vger.kernel.org,
	virtualization@lists.linux-foundation.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
  Cc: lidongchen(陈立东),
	yunfangtai(台运方)

handle_tx will delay rx for a long time when tx busy polling udp packets
with small length(e.g. 1byte udp payload), because setting VHOST_NET_WEIGHT
takes into account only sent-bytes but no single packet length.

Tests were done between two Virtual Machines using netperf(UDP_STREAM, len=1),
then another machine pinged the client. Result shows as follow:

Packet#       Ping-Latency(ms)
              min     avg     max
Origin      3.319  18.489  57.503
64          1.643   2.021   2.552
128         1.825   2.600   3.224
256         1.997   2.710   4.295
512*        1.860   3.171   4.631
1024        2.002   4.173   9.056
2048        2.257   5.650   9.688
4096        2.093   8.508  15.943

512 is selected, which is multi-VRING_SIZE and close to VHOST_NET_WEIGHT/MTU.
To evaluate this change, another tests were done using netperf(RR, TX) between
two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz. Result as follow
does not show obvious changes:

TCP_RR

size/sessions/+thu%/+normalize%
   1/       1/  -7%/        -2%
   1/       4/  +1%/         0%
   1/       8/  +1%/        -2%
  64/       1/  -6%/         0%
  64/       4/   0%/        +2%
  64/       8/   0%/         0%
 256/       1/  -3%/        -4%
 256/       4/  +3%/        +4%
 256/       8/  +2%/         0%

UDP_RR

size/sessions/+thu%/+normalize%
   1/       1/  -5%/        +1%
   1/       4/  +4%/        +1%
   1/       8/  -1%/        -1%
  64/       1/  -2%/        -3%
  64/       4/  -5%/        -1%
  64/       8/   0%/        -1%
 256/       1/  +7%/        +1%
 256/       4/  +1%/        +1%
 256/       8/  +2%/        +2%

TCP_STREAM

size/sessions/+thu%/+normalize%
  64/       1/   0%/        -3%
  64/       4/  +3%/        -1%
  64/       8/  +9%/        -4%
 256/       1/  +1%/        -4%
 256/       4/  -1%/        -1%
 256/       8/  +7%/        +5%
 512/       1/  +1%/         0%
 512/       4/  +1%/        -1%
 512/       8/  +7%/        -5%
1024/       1/   0%/        -1%
1024/       4/  +3%/         0%
1024/       8/  +8%/        +5%
2048/       1/  +2%/        +2%
2048/       4/  +1%/         0%
2048/       8/  -2%/         0%
4096/       1/  -2%/         0%
4096/       4/  +2%/         0%
4096/       8/  +9%/        -2%

Signed-off-by: Haibin Zhang <haibinzhang@tencent.com>
Signed-off-by: Yunfang Tai <yunfangtai@tencent.com>
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
---
 drivers/vhost/net.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8139bc70ad7d..13a23f3f3ea4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* Max number of packets transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving rx. */
+#define VHOST_NET_PKT_WEIGHT 512
+
 /* MAX number of TX used buffers for outstanding zerocopy */
 #define VHOST_MAX_PEND 128
 #define VHOST_GOODCOPY_LEN 256
@@ -473,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
 	struct socket *sock;
 	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
 	bool zcopy, zcopy_used;
+	int sent_pkts = 0;
 
 	mutex_lock(&vq->mutex);
 	sock = vq->private_data;
@@ -580,7 +585,8 @@ static void handle_tx(struct vhost_net *net)
 		else
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
-- 
2.12.3


^ permalink raw reply related

* Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
From: Siwei Liu @ 2018-04-03  7:40 UTC (permalink / raw)
  To: David Ahern
  Cc: Si-Wei Liu, Michael S. Tsirkin, Jiri Pirko, Stephen Hemminger,
	Alexander Duyck, David Miller, Brandeburg, Jesse, Jakub Kicinski,
	Jason Wang, Samudrala, Sridhar, Netdev, virtualization,
	virtio-dev
In-Reply-To: <8b589cd2-1abc-59c2-99f1-96df8174bb6b@gmail.com>

On Sun, Apr 1, 2018 at 9:11 AM, David Ahern <dsahern@gmail.com> wrote:
> On 4/1/18 3:13 AM, Si-Wei Liu wrote:
>> Hidden netdevice is not visible to userspace such that
>> typical network utilities e.g. ip, ifconfig and et al,
>> cannot sense its existence or configure it. Internally
>> hidden netdev may associate with an upper level netdev
>> that userspace has access to. Although userspace cannot
>> manipulate the lower netdev directly, user may control
>> or configure the underlying hidden device through the
>> upper-level netdev. For identification purpose, the
>> kobject for hidden netdev still presents in the sysfs
>> hierarchy, however, no uevent message will be generated
>> when the sysfs entry is created, modified or destroyed.
>>
>> For that end, a separate namescope needs to be carved
>> out for IFF_HIDDEN netdevs. As of now netdev name that
>> starts with colon i.e. ':' is invalid in userspace,
>> since socket ioctls such as SIOCGIFCONF use ':' as the
>> separator for ifname. The absence of namescope started
>> with ':' can rightly be used as the namescope for
>> the kernel-only IFF_HIDDEN netdevs.
>>
>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>> ---
>>  include/linux/netdevice.h   |  12 ++
>>  include/net/net_namespace.h |   2 +
>>  net/core/dev.c              | 281 ++++++++++++++++++++++++++++++++++++++------
>>  net/core/net_namespace.c    |   1 +
>>  4 files changed, 263 insertions(+), 33 deletions(-)
>>
>
> There are other use cases that want to hide a device from userspace.

Can you elaborate your case in more details? Looking at the links
below I realize that the purpose of hiding devices in your case is
quite different from the our migration case. Particularly, I don't
like the part of elaborately allowing user to manipulate the link's
visibility - things fall apart easily while live migration is on
going. And, why doing additional check for invisible links in every
for_each_netdev() and its friends. This is effectively changing
semantics of internal APIs that exist for decades.

> I would prefer a better solution than playing games with name prefixes and

This part is intentionally left to be that way and I would anticipate
feedback before going too far. The idea in my mind was that I need a
completely new device namespace underneath (or namescope, which is !=
netns) for all netdevs: kernel-only IFF_HIDDEN network devices and
those not. The current namespace for devname is already exposed to
userspace and visible in the sysfs hierarchy, but for backwards
compatibility reasons it's necessary to keep the old udevd still able
to reference the entry of IFF_HIDDEN netdev under the /sys/net
directory. By using the ':' prefix it has the benefit of minimal
changes without introducing another namespace or the accompanied
complexity in managing these two separate namespaces.

Technically, I can create a separate sysfs directory for the new
namescope, say /sys/net-kernel, which includes all netdev entries like
':eth0' and 'ens3', and which can be referenced from /sys/net. It
would make the /sys/net consistent with the view of userspace
utilities, but I am not even sure if that's an overkill, and would
like to gather more feedback before moving to that model.

> one that includes an API for users to list all devices -- even ones

What kind of API you would like to query for hidden devices?
rtnetlink? a private socket API? or something else?

For our case, the sysfs interface is what we need and is sufficient,
since udev is the main target we'd like to support to make the naming
of virtio_bypass consistent and compatible.

> hidden by default.
>
> https://github.com/dsahern/linux/commit/48a80a00eac284e58bae04af10a5a932dd7aee00
>
> https://github.com/dsahern/iproute2/commit/7563f5b26f5539960e99066e34a995d22ea908ed
>
> Also, why are you suggesting that the device should still be visible via
> /sysfs? That leads to inconsistent views of networking state - /sys
> shows a device but a link dump does not.

See my clarifications above. I don't mind kernel-only netdevs being
visible via sysfs, as that way we get a good trade-off between
backwards compatibility and visibility. There's still kobject created
there right. Bottom line is that all kernel devices and its life-cycle
uevents are made invisible to userpace network utilities, and I think
it simply gets to the goal of not breaking existing apps while being
able to add new features.

-Siwei

^ permalink raw reply

* Re: [PATCH net-next 09/11] devlink: convert occ_get op to separate registration
From: Jiri Pirko @ 2018-04-03  7:32 UTC (permalink / raw)
  To: David Ahern; +Cc: Ido Schimmel, netdev, davem, jiri, petrm, mlxsw
In-Reply-To: <5f2dc834-0d8a-77ef-3d33-2228e7bd530c@gmail.com>

Fri, Mar 30, 2018 at 04:45:50PM CEST, dsahern@gmail.com wrote:
>On 3/29/18 2:33 PM, Ido Schimmel wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> This resolves race during initialization where the resources with
>> ops are registered before driver and the structures used by occ_get
>> op is initialized. So keep occ_get callbacks registered only when
>> all structs are initialized.
>
>Why can't the occ_get handler look at some flag in an mlxsw struct to
>know if the system has initialized?
>
>Separate registration here is awkward. You register a resource and then
>register its op later.

The separation is exactly why this patch is made. Note that devlink
resouce is registered by core way before the initialization is done and
the driver is actually able to perform the op. Also consider "reload"
case, when the resource is still registered and the driver unloads and
loads again. For that makes perfect sense to have that separated.
Flag would just make things odd. Also, the priv could not be used in
that case.



>
>Also, this should be a standalone patch rather than embedded in a
>'mlxsw: Various cleanups' set.
>
>
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>> ---
>>  drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 24 ++-----
>>  drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  1 -
>>  .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    | 67 ++++++++++++--------
>>  include/net/devlink.h                              | 40 ++++++++----
>>  net/core/devlink.c                                 | 74 +++++++++++++++++++---
>>  5 files changed, 134 insertions(+), 72 deletions(-)
>> 
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> index b831af38e0a1..0d95d2cb73e3 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
>> @@ -3805,18 +3805,6 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
>>  	},
>>  };
>>  
>> -static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
>> -{
>> -	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> -	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
>> -
>> -	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
>> -}
>> -
>> -static const struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
>> -	.occ_get = mlxsw_sp_resource_kvd_linear_occ_get,
>> -};
>> -
>>  static void
>>  mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core,
>>  				      struct devlink_resource_size_params *kvd_size_params,
>> @@ -3877,8 +3865,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
>>  	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
>>  					kvd_size, MLXSW_SP_RESOURCE_KVD,
>>  					DEVLINK_RESOURCE_ID_PARENT_TOP,
>> -					&kvd_size_params,
>> -					NULL);
>> +					&kvd_size_params);
>>  	if (err)
>>  		return err;
>>  
>> @@ -3887,8 +3874,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
>>  					linear_size,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR,
>>  					MLXSW_SP_RESOURCE_KVD,
>> -					&linear_size_params,
>> -					&mlxsw_sp_resource_kvd_linear_ops);
>> +					&linear_size_params);
>>  	if (err)
>>  		return err;
>>  
>> @@ -3905,8 +3891,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
>>  					double_size,
>>  					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
>>  					MLXSW_SP_RESOURCE_KVD,
>> -					&hash_double_size_params,
>> -					NULL);
>> +					&hash_double_size_params);
>>  	if (err)
>>  		return err;
>>  
>> @@ -3915,8 +3900,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
>>  					single_size,
>>  					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
>>  					MLXSW_SP_RESOURCE_KVD,
>> -					&hash_single_size_params,
>> -					NULL);
>> +					&hash_single_size_params);
>>  	if (err)
>>  		return err;
>>  
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
>> index 21bee8f19894..c59a0d7d81d5 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
>> @@ -442,7 +442,6 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index);
>>  int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
>>  				   unsigned int entry_count,
>>  				   unsigned int *p_alloc_size);
>> -u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp);
>>  int mlxsw_sp_kvdl_resources_register(struct devlink *devlink);
>>  
>>  struct mlxsw_sp_acl_rule_info {
>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>> index 7b28f65d6407..1b7280168e6b 100644
>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
>> @@ -315,8 +315,9 @@ static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
>>  	return occ;
>>  }
>>  
>> -u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
>> +static u64 mlxsw_sp_kvdl_occ_get(void *priv)
>>  {
>> +	const struct mlxsw_sp *mlxsw_sp = priv;
>>  	u64 occ = 0;
>>  	int i;
>>  
>> @@ -326,48 +327,33 @@ u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
>>  	return occ;
>>  }
>>  
>> -static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink)
>> +static u64 mlxsw_sp_kvdl_single_occ_get(void *priv)
>>  {
>> -	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> -	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
>> +	const struct mlxsw_sp *mlxsw_sp = priv;
>>  	struct mlxsw_sp_kvdl_part *part;
>>  
>>  	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE];
>>  	return mlxsw_sp_kvdl_part_occ(part);
>>  }
>>  
>> -static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink)
>> +static u64 mlxsw_sp_kvdl_chunks_occ_get(void *priv)
>>  {
>> -	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> -	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
>> +	const struct mlxsw_sp *mlxsw_sp = priv;
>>  	struct mlxsw_sp_kvdl_part *part;
>>  
>>  	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS];
>>  	return mlxsw_sp_kvdl_part_occ(part);
>>  }
>>  
>> -static u64 mlxsw_sp_kvdl_large_chunks_occ_get(struct devlink *devlink)
>> +static u64 mlxsw_sp_kvdl_large_chunks_occ_get(void *priv)
>>  {
>> -	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> -	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
>> +	const struct mlxsw_sp *mlxsw_sp = priv;
>>  	struct mlxsw_sp_kvdl_part *part;
>>  
>>  	part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS];
>>  	return mlxsw_sp_kvdl_part_occ(part);
>>  }
>>  
>> -static const struct devlink_resource_ops mlxsw_sp_kvdl_single_ops = {
>> -	.occ_get = mlxsw_sp_kvdl_single_occ_get,
>> -};
>> -
>> -static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_ops = {
>> -	.occ_get = mlxsw_sp_kvdl_chunks_occ_get,
>> -};
>> -
>> -static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_large_ops = {
>> -	.occ_get = mlxsw_sp_kvdl_large_chunks_occ_get,
>> -};
>> -
>>  int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
>>  {
>>  	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
>> @@ -386,8 +372,7 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
>>  					MLXSW_SP_KVDL_SINGLE_SIZE,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR,
>> -					&size_params,
>> -					&mlxsw_sp_kvdl_single_ops);
>> +					&size_params);
>>  	if (err)
>>  		return err;
>>  
>> @@ -398,8 +383,7 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
>>  					MLXSW_SP_KVDL_CHUNKS_SIZE,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR,
>> -					&size_params,
>> -					&mlxsw_sp_kvdl_chunks_ops);
>> +					&size_params);
>>  	if (err)
>>  		return err;
>>  
>> @@ -410,13 +394,13 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink)
>>  					MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
>>  					MLXSW_SP_RESOURCE_KVD_LINEAR,
>> -					&size_params,
>> -					&mlxsw_sp_kvdl_chunks_large_ops);
>> +					&size_params);
>>  	return err;
>>  }
>>  
>>  int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
>>  {
>> +	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
>>  	struct mlxsw_sp_kvdl *kvdl;
>>  	int err;
>>  
>> @@ -429,6 +413,23 @@ int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
>>  	if (err)
>>  		goto err_kvdl_parts_init;
>>  
>> +	devlink_resource_occ_get_register(devlink,
>> +					  MLXSW_SP_RESOURCE_KVD_LINEAR,
>> +					  mlxsw_sp_kvdl_occ_get,
>> +					  mlxsw_sp);
>> +	devlink_resource_occ_get_register(devlink,
>> +					  MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
>> +					  mlxsw_sp_kvdl_single_occ_get,
>> +					  mlxsw_sp);
>> +	devlink_resource_occ_get_register(devlink,
>> +					  MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
>> +					  mlxsw_sp_kvdl_chunks_occ_get,
>> +					  mlxsw_sp);
>> +	devlink_resource_occ_get_register(devlink,
>> +					  MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
>> +					  mlxsw_sp_kvdl_large_chunks_occ_get,
>> +					  mlxsw_sp);
>> +
>>  	return 0;
>>  
>>  err_kvdl_parts_init:
>> @@ -438,6 +439,16 @@ int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
>>  
>>  void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp)
>>  {
>> +	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
>> +
>> +	devlink_resource_occ_get_unregister(devlink,
>> +					    MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS);
>> +	devlink_resource_occ_get_unregister(devlink,
>> +					    MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS);
>> +	devlink_resource_occ_get_unregister(devlink,
>> +					    MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE);
>> +	devlink_resource_occ_get_unregister(devlink,
>> +					    MLXSW_SP_RESOURCE_KVD_LINEAR);
>>  	mlxsw_sp_kvdl_parts_fini(mlxsw_sp);
>>  	kfree(mlxsw_sp->kvdl);
>>  }
>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>> index e21d8cadd480..2e4f71e16e95 100644
>> --- a/include/net/devlink.h
>> +++ b/include/net/devlink.h
>> @@ -231,14 +231,6 @@ struct devlink_dpipe_headers {
>>  	unsigned int headers_count;
>>  };
>>  
>> -/**
>> - * struct devlink_resource_ops - resource ops
>> - * @occ_get: get the occupied size
>> - */
>> -struct devlink_resource_ops {
>> -	u64 (*occ_get)(struct devlink *devlink);
>> -};
>> -
>>  /**
>>   * struct devlink_resource_size_params - resource's size parameters
>>   * @size_min: minimum size which can be set
>> @@ -265,6 +257,8 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para
>>  	size_params->unit = unit;
>>  }
>>  
>> +typedef u64 devlink_resource_occ_get_t(void *priv);
>> +
>>  /**
>>   * struct devlink_resource - devlink resource
>>   * @name: name of the resource
>> @@ -277,7 +271,6 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para
>>   * @size_params: size parameters
>>   * @list: parent list
>>   * @resource_list: list of child resources
>> - * @resource_ops: resource ops
>>   */
>>  struct devlink_resource {
>>  	const char *name;
>> @@ -289,7 +282,8 @@ struct devlink_resource {
>>  	struct devlink_resource_size_params size_params;
>>  	struct list_head list;
>>  	struct list_head resource_list;
>> -	const struct devlink_resource_ops *resource_ops;
>> +	devlink_resource_occ_get_t *occ_get;
>> +	void *occ_get_priv;
>>  };
>>  
>>  #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
>> @@ -409,8 +403,7 @@ int devlink_resource_register(struct devlink *devlink,
>>  			      u64 resource_size,
>>  			      u64 resource_id,
>>  			      u64 parent_resource_id,
>> -			      const struct devlink_resource_size_params *size_params,
>> -			      const struct devlink_resource_ops *resource_ops);
>> +			      const struct devlink_resource_size_params *size_params);
>>  void devlink_resources_unregister(struct devlink *devlink,
>>  				  struct devlink_resource *resource);
>>  int devlink_resource_size_get(struct devlink *devlink,
>> @@ -419,6 +412,12 @@ int devlink_resource_size_get(struct devlink *devlink,
>>  int devlink_dpipe_table_resource_set(struct devlink *devlink,
>>  				     const char *table_name, u64 resource_id,
>>  				     u64 resource_units);
>> +void devlink_resource_occ_get_register(struct devlink *devlink,
>> +				       u64 resource_id,
>> +				       devlink_resource_occ_get_t *occ_get,
>> +				       void *occ_get_priv);
>> +void devlink_resource_occ_get_unregister(struct devlink *devlink,
>> +					 u64 resource_id);
>>  
>>  #else
>>  
>> @@ -562,8 +561,7 @@ devlink_resource_register(struct devlink *devlink,
>>  			  u64 resource_size,
>>  			  u64 resource_id,
>>  			  u64 parent_resource_id,
>> -			  const struct devlink_resource_size_params *size_params,
>> -			  const struct devlink_resource_ops *resource_ops)
>> +			  const struct devlink_resource_size_params *size_params)
>>  {
>>  	return 0;
>>  }
>> @@ -589,6 +587,20 @@ devlink_dpipe_table_resource_set(struct devlink *devlink,
>>  	return -EOPNOTSUPP;
>>  }
>>  
>> +static inline void
>> +devlink_resource_occ_get_register(struct devlink *devlink,
>> +				  u64 resource_id,
>> +				  devlink_resource_occ_get_t *occ_get,
>> +				  void *occ_get_priv)
>> +{
>> +}
>> +
>> +static inline void
>> +devlink_resource_occ_get_unregister(struct devlink *devlink,
>> +				    u64 resource_id)
>> +{
>> +}
>> +
>>  #endif
>>  
>>  #endif /* _NET_DEVLINK_H_ */
>> diff --git a/net/core/devlink.c b/net/core/devlink.c
>> index 9236e421bd62..ad1317376798 100644
>> --- a/net/core/devlink.c
>> +++ b/net/core/devlink.c
>> @@ -2405,6 +2405,16 @@ devlink_resource_size_params_put(struct devlink_resource *resource,
>>  	return 0;
>>  }
>>  
>> +static int devlink_resource_occ_put(struct devlink_resource *resource,
>> +				    struct sk_buff *skb)
>> +{
>> +	if (!resource->occ_get)
>> +		return 0;
>> +	return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
>> +				 resource->occ_get(resource->occ_get_priv),
>> +				 DEVLINK_ATTR_PAD);
>> +}
>> +
>>  static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
>>  				struct devlink_resource *resource)
>>  {
>> @@ -2425,11 +2435,8 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
>>  	if (resource->size != resource->size_new)
>>  		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
>>  				  resource->size_new, DEVLINK_ATTR_PAD);
>> -	if (resource->resource_ops && resource->resource_ops->occ_get)
>> -		if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
>> -				      resource->resource_ops->occ_get(devlink),
>> -				      DEVLINK_ATTR_PAD))
>> -			goto nla_put_failure;
>> +	if (devlink_resource_occ_put(resource, skb))
>> +		goto nla_put_failure;
>>  	if (devlink_resource_size_params_put(resource, skb))
>>  		goto nla_put_failure;
>>  	if (list_empty(&resource->resource_list))
>> @@ -3162,15 +3169,13 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
>>   *	@resource_id: resource's id
>>   *	@parent_reosurce_id: resource's parent id
>>   *	@size params: size parameters
>> - *	@resource_ops: resource ops
>>   */
>>  int devlink_resource_register(struct devlink *devlink,
>>  			      const char *resource_name,
>>  			      u64 resource_size,
>>  			      u64 resource_id,
>>  			      u64 parent_resource_id,
>> -			      const struct devlink_resource_size_params *size_params,
>> -			      const struct devlink_resource_ops *resource_ops)
>> +			      const struct devlink_resource_size_params *size_params)
>>  {
>>  	struct devlink_resource *resource;
>>  	struct list_head *resource_list;
>> @@ -3213,7 +3218,6 @@ int devlink_resource_register(struct devlink *devlink,
>>  	resource->size = resource_size;
>>  	resource->size_new = resource_size;
>>  	resource->id = resource_id;
>> -	resource->resource_ops = resource_ops;
>>  	resource->size_valid = true;
>>  	memcpy(&resource->size_params, size_params,
>>  	       sizeof(resource->size_params));
>> @@ -3315,6 +3319,58 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
>>  }
>>  EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
>>  
>> +/**
>> + *	devlink_resource_occ_get_register - register occupancy getter
>> + *
>> + *	@devlink: devlink
>> + *	@resource_id: resource id
>> + *	@occ_get: occupancy getter callback
>> + *	@occ_get_priv: occupancy getter callback priv
>> + */
>> +void devlink_resource_occ_get_register(struct devlink *devlink,
>> +				       u64 resource_id,
>> +				       devlink_resource_occ_get_t *occ_get,
>> +				       void *occ_get_priv)
>> +{
>> +	struct devlink_resource *resource;
>> +
>> +	mutex_lock(&devlink->lock);
>> +	resource = devlink_resource_find(devlink, NULL, resource_id);
>> +	if (WARN_ON(!resource))
>> +		goto out;
>> +	WARN_ON(resource->occ_get);
>> +
>> +	resource->occ_get = occ_get;
>> +	resource->occ_get_priv = occ_get_priv;
>> +out:
>> +	mutex_unlock(&devlink->lock);
>> +}
>> +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
>> +
>> +/**
>> + *	devlink_resource_occ_get_unregister - unregister occupancy getter
>> + *
>> + *	@devlink: devlink
>> + *	@resource_id: resource id
>> + */
>> +void devlink_resource_occ_get_unregister(struct devlink *devlink,
>> +					 u64 resource_id)
>> +{
>> +	struct devlink_resource *resource;
>> +
>> +	mutex_lock(&devlink->lock);
>> +	resource = devlink_resource_find(devlink, NULL, resource_id);
>> +	if (WARN_ON(!resource))
>> +		goto out;
>> +	WARN_ON(!resource->occ_get);
>> +
>> +	resource->occ_get = NULL;
>> +	resource->occ_get_priv = NULL;
>> +out:
>> +	mutex_unlock(&devlink->lock);
>> +}
>> +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
>> +
>>  static int __init devlink_module_init(void)
>>  {
>>  	return genl_register_family(&devlink_nl_family);
>> 
>

^ permalink raw reply

* [PATCH iproute2 rdma: Ignore unknown netlink attributes
From: Leon Romanovsky @ 2018-04-03  7:28 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Leon Romanovsky, netdev, RDMA mailing list, David Ahern,
	Steve Wise

From: Leon Romanovsky <leonro@mellanox.com>

The check if netlink attributes supplied more than maximum supported
is to strict and may lead to backward compatibility issues with old
application with a newer kernel that supports new attribute.

CC: Steve Wise <swise@opengridcomputing.com>
Fixes: 74bd75c2b68d ("rdma: Add basic infrastructure for RDMA tool")
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 rdma/utils.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rdma/utils.c b/rdma/utils.c
index a2e08e91..5c1e736a 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -399,7 +399,8 @@ int rd_attr_cb(const struct nlattr *attr, void *data)
 	int type;

 	if (mnl_attr_type_valid(attr, RDMA_NLDEV_ATTR_MAX) < 0)
-		return MNL_CB_ERROR;
+		/* We received uknown attribute */
+		return MNL_CB_OK;

 	type = mnl_attr_get_type(attr);

^ permalink raw reply related

* Re: [PATCH 00/15] ARM: pxa: switch to DMA slave maps
From: Arnd Bergmann @ 2018-04-03  7:15 UTC (permalink / raw)
  To: Robert Jarzmik
  Cc: Ulf Hansson, alsa-devel, Jaroslav Kysela, IDE-ML, Networking,
	linux-mtd, devel, Boris Brezillon, Vinod Koul, Richard Weinberger,
	Takashi Iwai, Marek Vasut, Ezequiel Garcia,
	Linux Media Mailing List, Samuel Ortiz, Bartlomiej Zolnierkiewicz,
	Haojian Zhuang, dmaengine, Mark Brown, Mauro Carvalho Chehab,
	Linux ARM, Nicolas Pitre, Greg 
In-Reply-To: <20180402142656.26815-1-robert.jarzmik@free.fr>

On Mon, Apr 2, 2018 at 4:26 PM, Robert Jarzmik <robert.jarzmik@free.fr> wrote:
> Hi,
>
> This serie is aimed at removing the dmaengine slave compat use, and transfer
> knowledge of the DMA requestors into architecture code.
>
> This was discussed/advised by Arnd a couple of years back, it's almost time.
>
> The serie is divided in 3 phasees :
>  - phase 1 : patch 1/15 and patch 2/15
>    => this is the preparation work
>  - phase 2 : patches 3/15 .. 10/15
>    => this is the switch of all the drivers
>    => this one will require either an Ack of the maintainers or be taken by them
>       once phase 1 is merged
>  - phase 3 : patches 11/15
>    => this is the last part, cleanup and removal of export of the DMA filter
>       function
>
> As this looks like a patch bomb, each maintainer expressing for his tree either
> an Ack or "I want to take through my tree" will be spared in the next iterations
> of this serie.
>
> Several of these changes have been tested on actual hardware, including :
>  - pxamci
>  - pxa_camera
>  - smc*
>  - ASoC and SSP
>
> Happy review.

This looks really great overall, thanks for cleaning this up!

The SSP part is still a bit weird, as I commented, but I'm sure we can
figure something out there.

    Arnd

^ permalink raw reply

* Re: [PATCH 12/15] dmaengine: pxa: make the filter function internal
From: Arnd Bergmann @ 2018-04-03  7:13 UTC (permalink / raw)
  To: kbuild test robot
  Cc: Robert Jarzmik, kbuild-all, Daniel Mack, Haojian Zhuang,
	Bartlomiej Zolnierkiewicz, Tejun Heo, Vinod Koul,
	Mauro Carvalho Chehab, Ulf Hansson, Ezequiel Garcia,
	Boris Brezillon, David Woodhouse, Brian Norris, Marek Vasut,
	Richard Weinberger, Cyrille Pitchen, Nicolas Pitre, Samuel Ortiz,
	Greg Kroah-Hartman
In-Reply-To: <201804030025.FmWPyArN%fengguang.wu@intel.com>

On Mon, Apr 2, 2018 at 6:35 PM, kbuild test robot <lkp@intel.com> wrote:

>
>    drivers/mtd/nand/marvell_nand.c:2621:17: sparse: undefined identifier 'pxad_filter_fn'
>>> drivers/mtd/nand/marvell_nand.c:2621:17: sparse: call with no type!
>    In file included from drivers/mtd/nand/marvell_nand.c:21:0:
>    drivers/mtd/nand/marvell_nand.c: In function 'marvell_nfc_init_dma':
>    drivers/mtd/nand/marvell_nand.c:2621:42: error: 'pxad_filter_fn' undeclared (first use in this function); did you mean 'dma_filter_fn'?
>       dma_request_slave_channel_compat(mask, pxad_filter_fn,
>                                              ^
>    include/linux/dmaengine.h:1408:46: note: in definition of macro 'dma_request_slave_channel_compat'
>      __dma_request_slave_channel_compat(&(mask), x, y, dev, name)
>                                                  ^
>    drivers/mtd/nand/marvell_nand.c:2621:42: note: each undeclared identifier is reported only once for each function it appears in
>       dma_request_slave_channel_compat(mask, pxad_filter_fn,
>                                              ^
>    include/linux/dmaengine.h:1408:46: note: in definition of macro 'dma_request_slave_channel_compat'
>      __dma_request_slave_channel_compat(&(mask), x, y, dev, name)

The driver is a replacement for the pxa3xx nand driver, so it now has
to get changed as well.

       Arnd

^ permalink raw reply

* Re: [PATCH 14/15] ARM: pxa: change SSP devices allocation
From: Arnd Bergmann @ 2018-04-03  7:06 UTC (permalink / raw)
  To: Robert Jarzmik
  Cc: Kate Stewart, Ulf Hansson, alsa-devel, Jaroslav Kysela, IDE-ML,
	Networking, linux-mtd, devel, Boris Brezillon, Vinod Koul,
	Richard Weinberger, Takashi Iwai, Russell King, Marek Vasut,
	Ezequiel Garcia, Linux Media Mailing List, Samuel Ortiz,
	Bartlomiej Zolnierkiewicz, Philippe Ombredanne, Haojian Zhuang,
	dmaengine, Mark Brown, Thomas 
In-Reply-To: <20180402142656.26815-15-robert.jarzmik@free.fr>

On Mon, Apr 2, 2018 at 4:26 PM, Robert Jarzmik <robert.jarzmik@free.fr> wrote:

>
> +static struct pxa_ssp_info pxa_ssp_infos[] = {
> +       { .dma_chan_rx_name = "ssp1_rx", .dma_chan_tx_name = "ssp1_tx", },
> +       { .dma_chan_rx_name = "ssp1_rx", .dma_chan_tx_name = "ssp1_tx", },
> +       { .dma_chan_rx_name = "ssp2_rx", .dma_chan_tx_name = "ssp2_tx", },
> +       { .dma_chan_rx_name = "ssp2_rx", .dma_chan_tx_name = "ssp2_tx", },
> +       { .dma_chan_rx_name = "ssp3_rx", .dma_chan_tx_name = "ssp3_tx", },
> +       { .dma_chan_rx_name = "ssp3_rx", .dma_chan_tx_name = "ssp3_tx", },
> +       { .dma_chan_rx_name = "ssp4_rx", .dma_chan_tx_name = "ssp4_tx", },
> +       { .dma_chan_rx_name = "ssp4_rx", .dma_chan_tx_name = "ssp4_tx", },
> +};

This part looks odd to me, you're adding an extra level of indirection to
do two stages of lookups in some form of platform data.

Why can't you just always use "rx" and "tx" as the names here?

(also, I don't see why each line is duplicated, but I'm sure there's
an easy answer for that).

       Arnd

^ permalink raw reply

* Re: [PATCH net-next v2 1/2] fs/crashdd: add API to collect hardware dump in second kernel
From: Jiri Pirko @ 2018-04-03  7:04 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: Eric W. Biederman, netdev@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, kexec@lists.infradead.org,
	linux-kernel@vger.kernel.org, davem@davemloft.net,
	viro@zeniv.linux.org.uk, stephen@networkplumber.org,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	Ganesh GR, Nirranjan Kirubaharan, Indranil Choudhury
In-Reply-To: <20180402123044.GA31231@chelsio.com>

Mon, Apr 02, 2018 at 02:30:45PM CEST, rahul.lakkireddy@chelsio.com wrote:
>On Monday, April 04/02/18, 2018 at 14:41:43 +0530, Jiri Pirko wrote:
>> Fri, Mar 30, 2018 at 08:42:00PM CEST, ebiederm@xmission.com wrote:
>> >Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:
>> >
>> >> On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
>> >>> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
>> >>> >Add a new module crashdd that exports the /sys/kernel/crashdd/
>> >>> >directory in second kernel, containing collected hardware/firmware
>> >>> >dumps.
>> >>> >
>> >>> >The sequence of actions done by device drivers to append their device
>> >>> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
>> >>> >as follows:
>> >>> >
>> >>> >1. During probe (before hardware is initialized), device drivers
>> >>> >register to the crashdd module (via crashdd_add_dump()), with
>> >>> >callback function, along with buffer size and log name needed for
>> >>> >firmware/hardware log collection.
>> >>> >
>> >>> >2. Crashdd creates a driver's directory under
>> >>> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
>> >>> 
>> >>> This smells. I need to identify the exact ASIC instance that produced
>> >>> the dump. To identify by driver name does not help me if I have multiple
>> >>> instances of the same driver. This looks wrong to me. This looks like
>> >>> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
>> >>> 
>> >>> Please see:
>> >>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>> >>> 
>> >>> I bevieve that the solution in the patchset could be used for
>> >>> your usecase too.
>> >>> 
>> >>> 
>> >>
>> >> The sysfs approach proposed here had been dropped in favour exporting
>> >> the dumps as ELF notes in /proc/vmcore.
>> >>
>> >> Will be posting the new patches soon.
>> >
>> >The concern was actually how you identify which device that came from.
>> >Where you read the identifier changes but sysfs or /proc/vmcore the
>> >change remains valid.
>> 
>> Yeah. I still don't see how you link the dump and the device.
>
>In our case, the dump and the device are being identified by the
>driver’s name followed by its corresponding pci bus id.  I’ve posted an
>example in my v3 series:
>
>https://www.spinics.net/lists/netdev/msg493781.html
>
>Here’s an extract from the link above:
>
># readelf -n /proc/vmcore
>
>Displaying notes found at file offset 0x00001000 with length 0x04003288:
>Owner                 Data size     Description
>VMCOREDD_cxgb4_0000:02:00.4 0x02000fd8      Unknown note type:(0x00000700)
>VMCOREDD_cxgb4_0000:04:00.4 0x02000fd8      Unknown note type:(0x00000700)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>VMCOREINFO           0x0000074f     Unknown note type: (0x00000000)
>
>Here, for my two devices, the dump’s names are
>VMCOREDD_cxgb4_0000:02:00.4 and VMCOREDD_cxgb4_0000:04:00.4.
>
>It’s really up to the callers to write their own unique name for the
>dump.  The name is appended to “VMCOREDD_” string.
>
>> Rahul, did you look at the patchset I pointed out?
>
>For devlink, I think the dump name would be identified by
>bus_type/device_name; i.e. “pci/0000:02:00.4” for my example.
>Is my understanding correct?

Yes.


>
>Thanks,
>Rahul

^ permalink raw reply

* Re: [PATCH 02/15] ARM: pxa: add dma slave map
From: Arnd Bergmann @ 2018-04-03  6:51 UTC (permalink / raw)
  To: Robert Jarzmik
  Cc: Ulf Hansson, alsa-devel, Jaroslav Kysela, IDE-ML, Networking,
	linux-mtd, devel, Boris Brezillon, Vinod Koul, Richard Weinberger,
	Takashi Iwai, Russell King, Marek Vasut, Ezequiel Garcia,
	Linux Media Mailing List, Samuel Ortiz, Bartlomiej Zolnierkiewicz,
	Haojian Zhuang, dmaengine, Mark Brown, Mauro Carvalho Chehab,
	Linux ARM, Nicolas 
In-Reply-To: <20180402142656.26815-3-robert.jarzmik@free.fr>

On Mon, Apr 2, 2018 at 4:26 PM, Robert Jarzmik <robert.jarzmik@free.fr> wrote:
> +
> +static const struct dma_slave_map pxa_slave_map[] = {
> +       /* PXA25x, PXA27x and PXA3xx common entries */
> +       { "pxa-pcm-audio", "ac97_mic_mono", PDMA_FILTER_PARAM(LOWEST, 8) },
> +       { "pxa-pcm-audio", "ac97_aux_mono_in", PDMA_FILTER_PARAM(LOWEST, 9) },
> +       { "pxa-pcm-audio", "ac97_aux_mono_out", PDMA_FILTER_PARAM(LOWEST, 10) },
> +       { "pxa-pcm-audio", "ac97_stereo_in", PDMA_FILTER_PARAM(LOWEST, 11) },
> +       { "pxa-pcm-audio", "ac97_stereo_out", PDMA_FILTER_PARAM(LOWEST, 12) },
> +       { "pxa-pcm-audio", "ssp1_rx", PDMA_FILTER_PARAM(LOWEST, 13) },
> +       { "pxa-pcm-audio", "ssp1_tx", PDMA_FILTER_PARAM(LOWEST, 14) },
> +       { "pxa-pcm-audio", "ssp2_rx", PDMA_FILTER_PARAM(LOWEST, 15) },
> +       { "pxa-pcm-audio", "ssp2_tx", PDMA_FILTER_PARAM(LOWEST, 16) },
> +       { "pxa2xx-ir", "rx", PDMA_FILTER_PARAM(LOWEST, 17) },
> +       { "pxa2xx-ir", "tx", PDMA_FILTER_PARAM(LOWEST, 18) },
> +       { "pxa2xx-mci.0", "rx", PDMA_FILTER_PARAM(LOWEST, 21) },
> +       { "pxa2xx-mci.0", "tx", PDMA_FILTER_PARAM(LOWEST, 22) },


> +       { "smc911x.0", "rx", PDMA_FILTER_PARAM(LOWEST, -1) },
> +       { "smc911x.0", "tx", PDMA_FILTER_PARAM(LOWEST, -1) },
> +       { "smc91x.0", "data", PDMA_FILTER_PARAM(LOWEST, -1) },

This one is interesting, as you are dealing with an off-chip device,
and the channel number is '-'1. How does this even work? Does it
mean

> +       /* PXA25x specific map */
> +       { "pxa25x-ssp.0", "rx", PDMA_FILTER_PARAM(LOWEST, 13) },
> +       { "pxa25x-ssp.0", "tx", PDMA_FILTER_PARAM(LOWEST, 14) },
> +       { "pxa25x-nssp.1", "rx", PDMA_FILTER_PARAM(LOWEST, 15) },
> +       { "pxa25x-nssp.1", "tx", PDMA_FILTER_PARAM(LOWEST, 16) },
> +       { "pxa25x-nssp.2", "rx", PDMA_FILTER_PARAM(LOWEST, 23) },
> +       { "pxa25x-nssp.2", "tx", PDMA_FILTER_PARAM(LOWEST, 24) },
> +       { "pxa-pcm-audio", "nssp2_rx", PDMA_FILTER_PARAM(LOWEST, 15) },
> +       { "pxa-pcm-audio", "nssp2_tx", PDMA_FILTER_PARAM(LOWEST, 16) },
> +       { "pxa-pcm-audio", "nssp3_rx", PDMA_FILTER_PARAM(LOWEST, 23) },
> +       { "pxa-pcm-audio", "nssp3_tx", PDMA_FILTER_PARAM(LOWEST, 24) },
> +
> +       /* PXA27x specific map */
> +       { "pxa-pcm-audio", "ssp3_rx", PDMA_FILTER_PARAM(LOWEST, 66) },
> +       { "pxa-pcm-audio", "ssp3_tx", PDMA_FILTER_PARAM(LOWEST, 67) },
> +       { "pxa27x-camera.0", "CI_Y", PDMA_FILTER_PARAM(HIGHEST, 68) },
> +       { "pxa27x-camera.0", "CI_U", PDMA_FILTER_PARAM(HIGHEST, 69) },
> +       { "pxa27x-camera.0", "CI_V", PDMA_FILTER_PARAM(HIGHEST, 70) },
> +
> +       /* PXA3xx specific map */
> +       { "pxa-pcm-audio", "ssp4_rx", PDMA_FILTER_PARAM(LOWEST, 2) },
> +       { "pxa-pcm-audio", "ssp4_tx", PDMA_FILTER_PARAM(LOWEST, 3) },
> +       { "pxa2xx-mci.1", "rx", PDMA_FILTER_PARAM(LOWEST, 93) },
> +       { "pxa2xx-mci.1", "tx", PDMA_FILTER_PARAM(LOWEST, 94) },
> +       { "pxa3xx-nand", "data", PDMA_FILTER_PARAM(LOWEST, 97) },
> +       { "pxa2xx-mci.2", "rx", PDMA_FILTER_PARAM(LOWEST, 100) },
> +       { "pxa2xx-mci.2", "tx", PDMA_FILTER_PARAM(LOWEST, 101) },
> +};

Since more than half the entries in here are chip specific, maybe it would be
better to split that table into three and have a copy for each one in
arch/arm/mach-pxa/pxa{25x.27x.3xx}.c? Does that mean it's actually
a memory-to-memory transfer with a device being on the external
SRAM interface?

       Arnd

^ permalink raw reply

* Re: [PATCH net-next] bridge: Allow max MTU when multiple VLANs present
From: Roopa Prabhu @ 2018-04-03  6:13 UTC (permalink / raw)
  To: Chas Williams
  Cc: Toshiaki Makita, David Miller, netdev, Stephen Hemminger,
	Nikolay Aleksandrov
In-Reply-To: <CAG2-Gkm3TsN3YPsbckkmBBRRcgpwHe5sC0aUJ+BX6tcg_NHUcw@mail.gmail.com>

On Mon, Apr 2, 2018 at 8:26 AM, Chas Williams <3chas3@gmail.com> wrote:
> On Mon, Apr 2, 2018 at 11:08 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
>>

[snip]

>> they are popular...in-fact they are the default bridge mode on our
>> network switches.
>> And they have been around for some time now to ignore its users.
>> Plus it is not right to change default mtu behavior for one mode of the bridge
>> and not the others (bridge mtu handling from user-space is complex enough today
>> due to dynamic mtu changes on port enslave/deslave).
>
> I don't see the issue with one mode of bridge behaving differently
> from another mode.
> The VLAN behavior between the two bridge modes is completely different so having
> a different MTU behavior doesn't seem that surprising.
>
> You are potentially mixing different sized VLAN on a same bridge.  The only sane
> choice is to pick the largest MTU for the bridge.  This lets you have
> whatever MTU
> is appropriate on the child VLAN interfaces of the bridge.  If you
> attempt to forward
> from a port with a larger MTU to a smaller MTU, you get the expected behavior.


you mean larger MTU on the vlan device on the bridge to a smaller MTU
on the bridge port ?.
this will result in dropping the packet. how is this supposed to be
expected default behavior ?.


>
> Forcing the end user to configure all the ports to the maximum MTU of
> all the VLANs
> on the bridge is wrong IMHO.
> You then risk attempting to forward
> oversize packets
> on a network that can't support that.

I am a bit confused: Are you trying to solve the config problem by
implicitly making it the default and there by creating the oversize
packet drop issue by default ?


>
>>
>>>
>>> I don't think those drops are unexpected.  If a user has misconfigured
>>> the bridge
>>> we can't be expected to fix that for them.  It is the user's
>>> responsbility to ensure
>>> that the ports on the VLAN have a size consistent with the traffic
>>> they expect to
>>> pass.
>>>
>>
>> By default they are not expected today. The problem is changing the bridge
>> to max mtu changes 'all' the vlan devices on top of the vlan aware bridge to
>> max mtu by default which makes drops at the bridge driver more common if the
>> user had mixed mtu on its ports.
>
> That's not been my experience.  The MTU on the vlan devices is only
> limited by the
> bridges's MTU.  Setting the bridge MTU doesn't change the children
> VLAN devices MTUs.

It does not, but it now allows vlan devices on the bridge to have a
larger MTU if they need to (some or all of them).
This is consistent with vxlan driver as well: picks default mtu to be
lower or equal to the default dst dev mtu and allows user to override
it with a larger MTU.

^ permalink raw reply

* Re: [PATCH 00/47] Netfilter/IPVS updates for net-next
From: Rafał Miłecki @ 2018-04-03  6:13 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, Network Development, David Miller
In-Reply-To: <20180330113729.18335-1-pablo@netfilter.org>

Hi Pablo,

 > The following patchset contains Netfilter/IPVS updates for your net-next
 > tree. This batch comes with more input sanitization for xtables to
 > address bug reports from fuzzers, preparation works to the flowtable
 > infrastructure and assorted updates. In no particular order, they are:
 >
 > 1) Make sure userspace provides a valid standard target verdict, from
 >    Florian Westphal.
 >
 > 2) Sanitize error target size, also from Florian.
 >
 > 3) Validate that last rule in basechain matches underflow/policy since
 >    userspace assumes this when decoding the ruleset blob that comes
 >    from the kernel, from Florian.
 >
 > 4) Consolidate hook entry checks through xt_check_table_hooks(),
 >    patch from Florian.
 >
 > 5) Cap ruleset allocations at 512 mbytes, 134217728 rules and reject
 >    very large compat offset arrays, so we have a reasonable upper limit
 >    and fuzzers don't exercise the oom-killer. Patches from Florian.
 >
 > 6) Several WARN_ON checks on xtables mutex helper, from Florian.
 >
 > 7) xt_rateest now has a hashtable per net, from Cong Wang.
 >
 > 8) Consolidate counter allocation in xt_counters_alloc(), from Florian.
 >
 > 9) Earlier xt_table_unlock() call in {ip,ip6,arp,eb}tables, patch
 >    from Xin Long.
 >
 > 10) Set FLOW_OFFLOAD_DIR_* to IP_CT_DIR_* definitions, patch from
 >     Felix Fietkau.
 >
 > 11) Consolidate code through flow_offload_fill_dir(), also from Felix.
 >
 > 12) Inline ip6_dst_mtu_forward() just like ip_dst_mtu_maybe_forward()
 >     to remove a dependency with flowtable and ipv6.ko, from Felix.
 >
 > 13) Cache mtu size in flow_offload_tuple object, this is safe for
 >     forwarding as f87c10a8aa1e describes, from Felix.
 >
 > 14) Rename nf_flow_table.c to nf_flow_table_core.o, to simplify too
 >     modular infrastructure, from Felix.

I see you mentioned changes from Felix in the pull request but:
1) I don't see any commits from Felix listed below
2) I don't think you sent any of these patches

Can you take a look at what has happened to them, please?

^ permalink raw reply

* Re: [PATCH net-next v2 1/2] fs/crashdd: add API to collect hardware dump in second kernel
From: Alex Vesker @ 2018-04-03  5:43 UTC (permalink / raw)
  To: Jiri Pirko, Andrew Lunn
  Cc: Rahul Lakkireddy, netdev, linux-fsdevel, kexec, linux-kernel,
	davem, viro, ebiederm, stephen, akpm, torvalds, ganeshgr,
	nirranjan, indranil
In-Reply-To: <20180402091232.GE3313@nanopsycho>



On 4/2/2018 12:12 PM, Jiri Pirko wrote:
> Fri, Mar 30, 2018 at 05:11:29PM CEST, andrew@lunn.ch wrote:
>>> Please see:
>>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>>>
>>> I bevieve that the solution in the patchset could be used for
>>> your usecase too.
>> Hi Jiri
>>
>> https://lkml.org/lkml/2018/3/20/436
>>
>> How well does this API work for a 2Gbyte snapshot?
> Ccing Alex who did the tests.

I didn't check the performance for such a large snapshot.
 From my measurement it takes 0.09s for 1 MB of data this means
about ~3m.
This can be tuned and improved since this is a socket application.

>>     Andrew

^ permalink raw reply

* linux-next: build failure after merge of the tip tree
From: Stephen Rothwell @ 2018-04-03  5:41 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Peter Zijlstra,
	David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, David Howells

[-- Attachment #1: Type: text/plain, Size: 4619 bytes --]

Hi all,

After merging the tip tree, today's linux-next build (x86_64 allmodconfig)
failed like this:

net/rxrpc/call_object.c: In function 'rxrpc_rcu_destroy_call':
net/rxrpc/call_object.c:661:3: error: implicit declaration of function 'wake_up_atomic_t'; did you mean 'wake_up_bit'? [-Werror=implicit-function-declaration]
   wake_up_atomic_t(&rxnet->nr_calls);
   ^~~~~~~~~~~~~~~~
   wake_up_bit
net/rxrpc/call_object.c: In function 'rxrpc_destroy_all_calls':
net/rxrpc/call_object.c:728:2: error: implicit declaration of function 'wait_on_atomic_t'; did you mean 'wait_on_bit'? [-Werror=implicit-function-declaration]
  wait_on_atomic_t(&rxnet->nr_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE);
  ^~~~~~~~~~~~~~~~
  wait_on_bit
net/rxrpc/call_object.c:728:37: error: 'atomic_t_wait' undeclared (first use in this function); did you mean 'atomic_long_t'?
  wait_on_atomic_t(&rxnet->nr_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE);
                                     ^~~~~~~~~~~~~
                                     atomic_long_t
net/rxrpc/call_object.c:728:37: note: each undeclared identifier is reported only once for each function it appears in
net/rxrpc/call_accept.c: In function 'rxrpc_discard_prealloc':
net/rxrpc/call_accept.c:223:4: error: implicit declaration of function 'wake_up_atomic_t'; did you mean 'wake_up_bit'? [-Werror=implicit-function-declaration]
    wake_up_atomic_t(&rxnet->nr_conns);
    ^~~~~~~~~~~~~~~~
    wake_up_bit

Caused by commit

  9b8cce52c4b5 ("sched/wait: Remove the wait_on_atomic_t() API")

interacting with commits

  d3be4d244330 ("xrpc: Fix potential call vs socket/net destruction race")
  31f5f9a1691e ("rxrpc: Fix apparent leak of rxrpc_local objects")

from the net-next tree.

Haven't we figured out how to remove/change APIs yet? :-(

That tip tree commit is now in Linus' tree (merged since I started this
morning) so the net-next tree will need the below patch (or something
similar when it is merged.

I have applied the following merge fix patch (this may need more work):

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 3 Apr 2018 15:34:48 +1000
Subject: [PATCH] sched/wait: merge fix up for wait_on_atomic() API removal

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 net/rxrpc/call_accept.c | 2 +-
 net/rxrpc/call_object.c | 4 ++--
 net/rxrpc/conn_object.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index f67017dcb25e..a9a9be5519b9 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -220,7 +220,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 		write_unlock(&rxnet->conn_lock);
 		kfree(conn);
 		if (atomic_dec_and_test(&rxnet->nr_conns))
-			wake_up_atomic_t(&rxnet->nr_conns);
+			wake_up_var(&rxnet->nr_conns);
 		tail = (tail + 1) & (size - 1);
 	}
 
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f721c2b7e234..f6734d8cb01a 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -658,7 +658,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
 	kfree(call->rxtx_annotations);
 	kmem_cache_free(rxrpc_call_jar, call);
 	if (atomic_dec_and_test(&rxnet->nr_calls))
-		wake_up_atomic_t(&rxnet->nr_calls);
+		wake_up_var(&rxnet->nr_calls);
 }
 
 /*
@@ -725,5 +725,5 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
 	write_unlock(&rxnet->call_lock);
 
 	atomic_dec(&rxnet->nr_calls);
-	wait_on_atomic_t(&rxnet->nr_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE);
+	wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls));
 }
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 0950ee3d26f5..4c77a78a252a 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -367,7 +367,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
 	rxrpc_put_peer(conn->params.peer);
 
 	if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
-		wake_up_atomic_t(&conn->params.local->rxnet->nr_conns);
+		wake_up_var(&conn->params.local->rxnet->nr_conns);
 	rxrpc_put_local(conn->params.local);
 
 	kfree(conn);
@@ -482,6 +482,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 	/* We need to wait for the connections to be destroyed by RCU as they
 	 * pin things that we still need to get rid of.
 	 */
-	wait_on_atomic_t(&rxnet->nr_conns, atomic_t_wait, TASK_UNINTERRUPTIBLE);
+	wait_var_event(&rxnet->nr_conns, !atomic_read(&rxnet->nr_conns));
 	_leave("");
 }
-- 
2.16.1

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply related

* Re: WARNING: refcount bug in should_fail
From: Al Viro @ 2018-04-03  5:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Tetsuo Handa, syzbot+, syzkaller-bugs, dvyukov, linux-fsdevel,
	linux-mm, netdev
In-Reply-To: <20180402215934.GG30522@ZenIV.linux.org.uk>

On Mon, Apr 02, 2018 at 10:59:34PM +0100, Al Viro wrote:

> FWIW, I'm going through the ->kill_sb() instances, fixing that sort
> of bugs (most of them preexisting, but I should've checked instead
> of assuming that everything's fine).  Will push out later tonight.

OK, see vfs.git#for-linus.  Caught: 4 old bugs (allocation failure
in fill_super oopses ->kill_sb() in hypfs, jffs2 and orangefs resp.
and double-dput in late failure exit in rpc_fill_super())
and 5 regressions from register_shrinker() failure recovery.  

^ permalink raw reply

* Re: [GIT PULL] remove in-kernel calls to syscalls
From: Linus Torvalds @ 2018-04-03  4:33 UTC (permalink / raw)
  To: Dominik Brodowski
  Cc: Linux Kernel Mailing List, Al Viro, Arnd Bergmann, linux-arch,
	hmclauchlan, tautschn, Amir Goldstein, Andi Kleen, Andrew Morton,
	Christoph Hellwig, Darren Hart, David S. Miller,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jaswinder Singh,
	Jeff Dike, Jiri Slaby, Kexec Mailing List, linux-fsdevel,
	linux-mm
In-Reply-To: <20180402190454.GB29890@light.dominikbrodowski.net>

On Mon, Apr 2, 2018 at 12:04 PM, Dominik Brodowski
<linux@dominikbrodowski.net> wrote:
>
> This patchset removes all in-kernel calls to syscall functions in the
> kernel with the exception of arch/.

Ok, this finished off my arch updates for today, I'll probably move on
to driver pulls tomorrow.

Anyway, it's in my tree, will push out once my test build finishes.

                Linus

^ permalink raw reply

* [PATCH iproute2-next v1] rdma: Print net device name and index for RDMA device
From: Leon Romanovsky @ 2018-04-03  4:29 UTC (permalink / raw)
  To: David Ahern
  Cc: Leon Romanovsky, netdev, RDMA mailing list, Stephen Hemminger,
	Steve Wise

From: Leon Romanovsky <leonro@mellanox.com>

The RDMA devices are operated in RoCE and iWARP modes have net device
underneath. Present their names in regular output and their net index
in detailed mode.

[root@nps ~]# rdma link show mlx5_3/1
4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7
[root@nps ~]# rdma link show mlx5_3/1 -d
4/1: mlx5_3/1: state ACTIVE physical_state LINK_UP netdev ens7 netdev_index 7
    caps: <CM, IP_BASED_GIDS>

Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
---
 Changes v0->v1:
  * Resend after commit 29122c1aae35 ("rdma: update rdma_netlink.h")
    which updated relevant netlink attributes.
  * Added Steve's ROB
---
 rdma/include/uapi/rdma/rdma_netlink.h |  4 ++++
 rdma/link.c                           | 21 +++++++++++++++++++++
 rdma/utils.c                          |  2 ++
 3 files changed, 27 insertions(+)

diff --git a/rdma/include/uapi/rdma/rdma_netlink.h b/rdma/include/uapi/rdma/rdma_netlink.h
index 9446a721..45474f13 100644
--- a/rdma/include/uapi/rdma/rdma_netlink.h
+++ b/rdma/include/uapi/rdma/rdma_netlink.h
@@ -388,6 +388,10 @@ enum rdma_nldev_attr {
 	RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,	/* u32 */
 	RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,	/* u32 */

+	/* Netdev information for relevant protocols, like RoCE and iWARP */
+	RDMA_NLDEV_ATTR_NDEV_INDEX,		/* u32 */
+	RDMA_NLDEV_ATTR_NDEV_NAME,		/* string */
+
 	RDMA_NLDEV_ATTR_MAX
 };
 #endif /* _RDMA_NETLINK_H */
diff --git a/rdma/link.c b/rdma/link.c
index 66bcd50e..7e914c87 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -205,6 +205,26 @@ static void link_print_phys_state(struct rd *rd, struct nlattr **tb)
 		pr_out("physical_state %s ", phys_state_to_str(phys_state));
 }

+static void link_print_netdev(struct rd *rd, struct nlattr **tb)
+{
+	const char *netdev_name;
+	uint32_t idx;
+
+	if (!tb[RDMA_NLDEV_ATTR_NDEV_NAME] || !tb[RDMA_NLDEV_ATTR_NDEV_INDEX])
+		return;
+
+	netdev_name = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_NDEV_NAME]);
+	idx = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_NDEV_INDEX]);
+	if (rd->json_output) {
+		jsonw_string_field(rd->jw, "netdev", netdev_name);
+		jsonw_uint_field(rd->jw, "netdev_index", idx);
+	} else {
+		pr_out("netdev %s ", netdev_name);
+		if (rd->show_details)
+			pr_out("netdev_index %u ", idx);
+	}
+}
+
 static int link_parse_cb(const struct nlmsghdr *nlh, void *data)
 {
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX] = {};
@@ -241,6 +261,7 @@ static int link_parse_cb(const struct nlmsghdr *nlh, void *data)
 	link_print_lmc(rd, tb);
 	link_print_state(rd, tb);
 	link_print_phys_state(rd, tb);
+	link_print_netdev(rd, tb);
 	if (rd->show_details)
 		link_print_caps(rd, tb);

diff --git a/rdma/utils.c b/rdma/utils.c
index a2e08e91..f38f0459 100644
--- a/rdma/utils.c
+++ b/rdma/utils.c
@@ -391,6 +391,8 @@ static const enum mnl_attr_data_type nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_LKEY] = MNL_TYPE_U32,
 	[RDMA_NLDEV_ATTR_RES_IOVA] = MNL_TYPE_U64,
 	[RDMA_NLDEV_ATTR_RES_MRLEN] = MNL_TYPE_U64,
+	[RDMA_NLDEV_ATTR_NDEV_INDEX]		= MNL_TYPE_U32,
+	[RDMA_NLDEV_ATTR_NDEV_NAME]		= MNL_TYPE_NUL_STRING,
 };

 int rd_attr_cb(const struct nlattr *attr, void *data)

^ permalink raw reply related

* Re: [PATCH net-next RFC V1 5/5] net: mdio: Add a driver for InES time stamping IP core.
From: Richard Cochran @ 2018-04-03  4:27 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev, devicetree, David Miller, Florian Fainelli, Mark Rutland,
	Miroslav Lichvar, Rob Herring, Willem de Bruijn
In-Reply-To: <20180325230158.GB19365@lunn.ch>

On Mon, Mar 26, 2018 at 01:01:58AM +0200, Andrew Lunn wrote:
> The phylib core code will take the phydev lock before calling into the
> driver. By violating the layering, we are missing on this lock.

That lock protects the fields within the struct phy_device, like the
state field.  None of the time stamping methods need to read or write
any part of that data structure.

Actually it is not true that the core always takes the lock before
calling the driver methods.  See .ack_interrupt for example.

> Maybe the one driver which currently implements these calls does not
> need locking.

It has locking.  If a specific device needs locking to protect the
integrity of its registers or other internal data structures, then it
can and should implement those locks.

Thanks,
Richard

^ permalink raw reply

* Re: WARNING in refcount_dec
From: DaeRyong Jeong @ 2018-04-03  4:12 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Cong Wang, Byoungyoung Lee, LKML, Kyungtae Kim,
	Linux Kernel Network Developers, Willem de Bruijn
In-Reply-To: <CAF=yD-Lyu8YYiquNNat_RZ53Ydniivz59C8dek6RsWuy0YE8Cw@mail.gmail.com>

No. Only the first crash (WARNING in refcount_dec) is reproduced by
the attached reproducer.

The second crash (kernel bug at af_packet.c:3107) is reproduced by
another reproducer.
We reported it here.
http://lkml.iu.edu/hypermail/linux/kernel/1803.3/05324.html

On Sun, Apr 1, 2018 at 4:38 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Thu, Mar 29, 2018 at 1:16 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> (Cc'ing netdev and Willem)
>>
>> On Wed, Mar 28, 2018 at 12:03 PM, Byoungyoung Lee
>> <byoungyoung@purdue.edu> wrote:
>>> Another crash patterns observed: race between (setsockopt$packet_int)
>>> and (bind$packet).
>>>
>>> ------------------------------
>>> [  357.731597] kernel BUG at
>>> /home/blee/project/race-fuzzer/kernels/kernel_v4.16-rc3/net/packet/af_packet.c:3107!
>>> [  357.733382] invalid opcode: 0000 [#1] SMP KASAN
>>> [  357.734017] Modules linked in:
>>> [  357.734662] CPU: 1 PID: 3871 Comm: repro.exe Not tainted 4.16.0-rc3 #1
>>> [  357.735791] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>> BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>>> [  357.737434] RIP: 0010:packet_do_bind+0x88d/0x950
>>> [  357.738121] RSP: 0018:ffff8800b2787b08 EFLAGS: 00010293
>>> [  357.738906] RAX: ffff8800b2fdc780 RBX: ffff880234358cc0 RCX: ffffffff838b244c
>>> [  357.739905] RDX: 0000000000000000 RSI: ffffffff838b257d RDI: 0000000000000001
>>> [  357.741315] RBP: ffff8800b2787c10 R08: ffff8800b2fdc780 R09: 0000000000000000
>>> [  357.743055] R10: 0000000000000001 R11: 0000000000000000 R12: ffff88023352ecc0
>>> [  357.744744] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000001d00
>>> [  357.746377] FS:  00007f4b43733700(0000) GS:ffff8800b8b00000(0000)
>>> knlGS:0000000000000000
>>> [  357.749599] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [  357.752096] CR2: 0000000020058000 CR3: 00000002334b8000 CR4: 00000000000006e0
>>> [  357.755045] Call Trace:
>>> [  357.755822]  ? compat_packet_setsockopt+0x100/0x100
>>> [  357.757324]  ? __sanitizer_cov_trace_const_cmp8+0x18/0x20
>>> [  357.758810]  packet_bind+0xa2/0xe0
>>> [  357.759640]  SYSC_bind+0x279/0x2f0
>>> [  357.760364]  ? move_addr_to_kernel.part.19+0xc0/0xc0
>>> [  357.761491]  ? __handle_mm_fault+0x25d0/0x25d0
>>> [  357.762449]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>> [  357.763663]  ? __do_page_fault+0x417/0xba0
>>> [  357.764569]  ? vmalloc_fault+0x910/0x910
>>> [  357.765405]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>> [  357.766525]  ? mark_held_locks+0x25/0xb0
>>> [  357.767336]  ? SyS_socketpair+0x4a0/0x4a0
>>> [  357.768182]  SyS_bind+0x24/0x30
>>> [  357.768851]  do_syscall_64+0x209/0x5d0
>>> [  357.769650]  ? syscall_return_slowpath+0x3e0/0x3e0
>>> [  357.770665]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>> [  357.771779]  ? syscall_return_slowpath+0x260/0x3e0
>>> [  357.772748]  ? mark_held_locks+0x25/0xb0
>>> [  357.773581]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>> [  357.774720]  ? retint_user+0x18/0x18
>>> [  357.775493]  ? trace_hardirqs_off_caller+0xb5/0x120
>>> [  357.776567]  ? trace_hardirqs_off_thunk+0x1a/0x1c
>>> [  357.777512]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
>>> [  357.778508] RIP: 0033:0x4503a9
>>> [  357.779156] RSP: 002b:00007f4b43732ce8 EFLAGS: 00000246 ORIG_RAX:
>>> 0000000000000031
>>> [  357.780737] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004503a9
>>> [  357.782169] RDX: 0000000000000014 RSI: 0000000020058000 RDI: 0000000000000003
>>> [  357.783710] RBP: 00007f4b43732d10 R08: 0000000000000000 R09: 0000000000000000
>>> [  357.785202] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
>>> [  357.786664] R13: 0000000000000000 R14: 00007f4b437339c0 R15: 00007f4b43733700
>>> [  357.788210] Code: c0 fd 48 c7 c2 00 c8 d9 84 be ab 02 00 00 48 c7
>>> c7 60 c8 d9 84 c6 05 e7 a2 48 02 01 e8 3f 17 af fd e9 60 fb ff ff e8
>>> 43 b3 c0 fd <0f> 0b e8 3c b3 c0 fd 48 8b bd 20 ff ff ff e8 60 1e e7 fd
>>> 4c 89
>>> [  357.792260] RIP: packet_do_bind+0x88d/0x950 RSP: ffff8800b2787b08
>>> [  357.793698] ---[ end trace 0c5a2539f0247369 ]---
>>> [  357.794696] Kernel panic - not syncing: Fatal exception
>>> [  357.795918] Kernel Offset: disabled
>>> [  357.796614] Rebooting in 86400 seconds..
>>>
>>> On Wed, Mar 28, 2018 at 1:19 AM, Byoungyoung Lee <byoungyoung@purdue.edu> wrote:
>>>> We report the crash: WARNING in refcount_dec
>>>>
>>>> This crash has been found in v4.16-rc3 using RaceFuzzer (a modified
>>>> version of Syzkaller), which we describe more at the end of this
>>>> report. Our analysis shows that the race occurs when invoking two
>>>> syscalls concurrently, (setsockopt$packet_int) and
>>>> (setsockopt$packet_rx_ring).
>>>>
>>>> C repro code : https://kiwi.cs.purdue.edu/static/race-fuzzer/repro-refcount_dec.c
>>>> kernel config: https://kiwi.cs.purdue.edu/static/race-fuzzer/kernel-config-v4.16-rc3
>>
>>
>> I tried your reproducer, no luck here.
>>
>
> Are both crashes with the same reproducer?
>
> It races setsockopt PACKET_RX_RING with PACKET_VNET_HDR.
>
> There have been previous bug fixes for other setsockopts racing with
> ring creation. The change would be
>
> @@ -3763,14 +3763,19 @@ packet_setsockopt(struct socket *sock, int
> level, int optname, char __user *optv
>
>                 if (sock->type != SOCK_RAW)
>                         return -EINVAL;
> -               if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
> -                       return -EBUSY;
>                 if (optlen < sizeof(val))
>                         return -EINVAL;
>                 if (copy_from_user(&val, optval, sizeof(val)))
>                         return -EFAULT;
>
> -               po->has_vnet_hdr = !!val;
> +               lock_sock(sk);
> +               if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
> +                       ret = -EBUSY;
> +               } else {
> +                       po->has_vnet_hdr = !!val;
> +                       ret = 0;
> +               }
> +               release_sock(sk);
>                 return 0;
>         }
>
> But I do not immediately see why these concurrent operations
> would be unsafe.
>
> The program races a lot more complex operations, like bind and
> close. So the specific setsockopt may be a red herring.
>
> I'm traveling; haven't been able to setup your fuzzer and run the
> repro locally yet.
>
>>
>>
>>>>
>>>> ---------------------------------------
>>>> [  305.838560] refcount_t: decrement hit 0; leaking memory.
>>>> [  305.839669] WARNING: CPU: 0 PID: 3867 at
>>>> /home/blee/project/race-fuzzer/kernels/kernel_v4.16-rc3/lib/refcount.c:228
>>>> refcount_dec+0x62/0x70
>>>> [  305.841441] Modules linked in:
>>>> [  305.841883] CPU: 0 PID: 3867 Comm: repro.exe Not tainted 4.16.0-rc3 #1
>>>> [  305.842803] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>>> BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>>>> [  305.844345] RIP: 0010:refcount_dec+0x62/0x70
>>>> [  305.845005] RSP: 0018:ffff880224d374f8 EFLAGS: 00010286
>>>> [  305.845802] RAX: 000000000000002c RBX: 0000000000000000 RCX: ffffffff81538991
>>>> [  305.846768] RDX: 0000000000000000 RSI: ffffffff813cd761 RDI: 0000000000000005
>>>> [  305.847748] RBP: ffff880224d37500 R08: ffff88023169a440 R09: 0000000000000000
>>>> [  305.848748] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88023473ad40
>>>> [  305.849738] R13: ffff88023473b368 R14: 0000000000000001 R15: 0000000000000000
>>>> [  305.850733] FS:  0000000000c6e940(0000) GS:ffff8800b8a00000(0000)
>>>> knlGS:0000000000000000
>>>> [  305.851837] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  305.852652] CR2: 00007fb120571db8 CR3: 0000000005422000 CR4: 00000000000006f0
>>>> [  305.853619] Call Trace:
>>>> [  305.854086]  __unregister_prot_hook+0x15f/0x1d0
>>>> [  305.854722]  packet_release+0x77a/0x7a0
>>>> [  305.855335]  ? mark_held_locks+0x25/0xb0
>>>> [  305.855883]  ? packet_lookup_frame+0x110/0x110
>>>> [  305.856576]  ? __lock_is_held+0x39/0xc0
>>>> [  305.857109]  ? note_gp_changes+0x300/0x300
>>>> [  305.857745]  ? __sanitizer_cov_trace_const_cmp8+0x18/0x20
>>>> [  305.858548]  ? locks_remove_file+0x31b/0x420
>>>> [  305.859138]  ? fcntl_setlk+0xad0/0xad0
>>>> [  305.859743]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.860534]  ? fsnotify_first_mark+0x2c0/0x2c0
>>>> [  305.861234]  sock_release+0x53/0xe0
>>>> [  305.861711]  ? sock_alloc_file+0x2c0/0x2c0
>>>> [  305.862334]  sock_close+0x16/0x20
>>>> [  305.862801]  __fput+0x246/0x4e0
>>>> [  305.863310]  ? fput+0x130/0x130
>>>> [  305.863743]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.864604]  ____fput+0x15/0x20
>>>> [  305.865046]  task_work_run+0x1a5/0x200
>>>> [  305.865636]  ? kmem_cache_free+0x25c/0x2d0
>>>> [  305.866194]  ? task_work_cancel+0x1a0/0x1a0
>>>> [  305.866829]  ? switch_task_namespaces+0x9e/0xb0
>>>> [  305.867458]  do_exit+0xacf/0x10d0
>>>> [  305.868023]  ? mm_update_next_owner+0x650/0x650
>>>> [  305.868642]  ? __pv_queued_spin_lock_slowpath+0xbf0/0xbf0
>>>> [  305.869427]  ? trace_hardirqs_on_caller+0x136/0x2a0
>>>> [  305.870102]  ? trace_hardirqs_on+0xd/0x10
>>>> [  305.870701]  ? wake_up_new_task+0x41c/0x650
>>>> [  305.871324]  ? to_ratio+0x20/0x20
>>>> [  305.871816]  ? lock_release+0x530/0x530
>>>> [  305.872341]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.873161]  ? match_held_lock+0x7e/0x360
>>>> [  305.873777]  ? trace_hardirqs_off+0x10/0x10
>>>> [  305.874359]  ? put_pid+0x111/0x140
>>>> [  305.874897]  ? task_active_pid_ns+0x70/0x70
>>>> [  305.875500]  ? find_held_lock+0xca/0xf0
>>>> [  305.876118]  ? do_group_exit+0x1f9/0x260
>>>> [  305.876650]  ? lock_downgrade+0x380/0x380
>>>> [  305.877297]  ? task_clear_jobctl_pending+0xb5/0xd0
>>>> [  305.877951]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>>> [  305.878725]  ? do_raw_spin_unlock+0x112/0x1a0
>>>> [  305.879309]  ? do_raw_spin_trylock+0x100/0x100
>>>> [  305.879969]  ? mark_held_locks+0x25/0xb0
>>>> [  305.880505]  ? force_sig+0x30/0x30
>>>> [  305.881054]  ? _raw_spin_unlock_irq+0x27/0x50
>>>> [  305.881671]  ? trace_hardirqs_on_caller+0x136/0x2a0
>>>> [  305.882412]  do_group_exit+0xfb/0x260
>>>> [  305.882945]  ? SyS_exit+0x30/0x30
>>>> [  305.883442]  ? find_mergeable_anon_vma+0x90/0x90
>>>> [  305.884103]  ? SyS_read+0x1c0/0x1c0
>>>> [  305.884785]  ? mark_held_locks+0x25/0xb0
>>>> [  305.885503]  ? do_syscall_64+0xb2/0x5d0
>>>> [  305.886093]  ? do_group_exit+0x260/0x260
>>>> [  305.886741]  SyS_exit_group+0x1d/0x20
>>>> [  305.887255]  do_syscall_64+0x209/0x5d0
>>>> [  305.887888]  ? syscall_return_slowpath+0x3e0/0x3e0
>>>> [  305.888611]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>>> [  305.889420]  ? syscall_return_slowpath+0x260/0x3e0
>>>> [  305.890188]  ? mark_held_locks+0x25/0xb0
>>>> [  305.890724]  ? entry_SYSCALL_64_after_hwframe+0x52/0xb7
>>>> [  305.891556]  ? trace_hardirqs_off_caller+0xb5/0x120
>>>> [  305.892265]  ? trace_hardirqs_off_thunk+0x1a/0x1c
>>>> [  305.892939]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
>>>> [  305.893676] RIP: 0033:0x44d989
>>>> [  305.894100] RSP: 002b:00000000007fff38 EFLAGS: 00000202 ORIG_RAX:
>>>> 00000000000000e7
>>>> [  305.895158] RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000044d989
>>>> [  305.896174] RDX: 00007fb120d739c0 RSI: 00007fb120572700 RDI: 0000000000000001
>>>> [  305.897161] RBP: 00000000007fff60 R08: 00007fb120572700 R09: 0000000000000000
>>>> [  305.898128] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
>>>> [  305.899464] R13: 000000000040d270 R14: 000000000040d300 R15: 0000000000000000
>>>> [  305.900823] Code: b6 1d 81 9a ef 03 31 ff 89 de e8 ca a3 67 ff 84
>>>> db 75 df e8 f1 a2 67 ff 48 c7 c7 60 8f 83 84 c6 05 61 9a ef 03 01 e8
>>>> ee 5f 49 ff <0f> 0b eb c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41
>>>> 57 49
>>>> [  305.904324] ---[ end trace 360c084b02d93021 ]---
>>>> [  305.919117] ------------[ cut here ]------------
>>>> [  305.920120] refcount_t: underflow; use-after-free.
>>>> [  305.921335] WARNING: CPU: 0 PID: 3867 at
>>>> /home/blee/project/race-fuzzer/kernels/kernel_v4.16-rc3/lib/refcount.c:187
>>>> refcount_sub_and_test+0x1ec/0x200
>>>> [  305.923927] Modules linked in:
>>>> [  305.924611] CPU: 0 PID: 3867 Comm: repro.exe Tainted: G        W
>>>>     4.16.0-rc3 #1
>>>> [  305.925987] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
>>>> BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>>>> [  305.928119] RIP: 0010:refcount_sub_and_test+0x1ec/0x200
>>>> [  305.929124] RSP: 0018:ffff880224d374a0 EFLAGS: 00010282
>>>> [  305.930161] RAX: 0000000000000026 RBX: 0000000000000000 RCX: ffffffff813c9644
>>>> [  305.931504] RDX: 0000000000000000 RSI: ffffffff813cd761 RDI: ffff880224d37018
>>>> [  305.932942] RBP: ffff880224d37538 R08: ffff88023169a440 R09: 0000000000000000
>>>> [  305.934365] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffff
>>>> [  305.935734] R13: ffff88023473adc0 R14: 0000000000000001 R15: 1ffff100449a6e96
>>>> [  305.937114] FS:  0000000000c6e940(0000) GS:ffff8800b8a00000(0000)
>>>> knlGS:0000000000000000
>>>> [  305.938668] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  305.939768] CR2: 00007fb120571db8 CR3: 0000000005422000 CR4: 00000000000006f0
>>>> [  305.941212] Call Trace:
>>>> [  305.941689]  ? refcount_inc+0x70/0x70
>>>> [  305.942216]  ? skb_dequeue+0xa5/0xc0
>>>> [  305.942713]  refcount_dec_and_test+0x1a/0x20
>>>> [  305.943295]  packet_release+0x702/0x7a0
>>>> [  305.943816]  ? mark_held_locks+0x25/0xb0
>>>> [  305.944378]  ? packet_lookup_frame+0x110/0x110
>>>> [  305.945021]  ? __lock_is_held+0x39/0xc0
>>>> [  305.945561]  ? note_gp_changes+0x300/0x300
>>>> [  305.946132]  ? __sanitizer_cov_trace_const_cmp8+0x18/0x20
>>>> [  305.946866]  ? locks_remove_file+0x31b/0x420
>>>> [  305.947464]  ? fcntl_setlk+0xad0/0xad0
>>>> [  305.948000]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.948781]  ? fsnotify_first_mark+0x2c0/0x2c0
>>>> [  305.949386]  sock_release+0x53/0xe0
>>>> [  305.949866]  ? sock_alloc_file+0x2c0/0x2c0
>>>> [  305.950437]  sock_close+0x16/0x20
>>>> [  305.950906]  __fput+0x246/0x4e0
>>>> [  305.951360]  ? fput+0x130/0x130
>>>> [  305.951807]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.952620]  ____fput+0x15/0x20
>>>> [  305.953071]  task_work_run+0x1a5/0x200
>>>> [  305.953585]  ? kmem_cache_free+0x25c/0x2d0
>>>> [  305.954143]  ? task_work_cancel+0x1a0/0x1a0
>>>> [  305.954714]  ? switch_task_namespaces+0x9e/0xb0
>>>> [  305.955334]  do_exit+0xacf/0x10d0
>>>> [  305.955801]  ? mm_update_next_owner+0x650/0x650
>>>> [  305.956431]  ? __pv_queued_spin_lock_slowpath+0xbf0/0xbf0
>>>> [  305.957157]  ? trace_hardirqs_on_caller+0x136/0x2a0
>>>> [  305.957811]  ? trace_hardirqs_on+0xd/0x10
>>>> [  305.958360]  ? wake_up_new_task+0x41c/0x650
>>>> [  305.958937]  ? to_ratio+0x20/0x20
>>>> [  305.959391]  ? lock_release+0x530/0x530
>>>> [  305.959924]  ? trace_event_raw_event_sched_switch+0x320/0x320
>>>> [  305.960693]  ? match_held_lock+0x7e/0x360
>>>> [  305.961244]  ? trace_hardirqs_off+0x10/0x10
>>>> [  305.961810]  ? put_pid+0x111/0x140
>>>> [  305.962277]  ? task_active_pid_ns+0x70/0x70
>>>> [  305.962862]  ? find_held_lock+0xca/0xf0
>>>> [  305.963396]  ? do_group_exit+0x1f9/0x260
>>>> [  305.963933]  ? lock_downgrade+0x380/0x380
>>>> [  305.964508]  ? task_clear_jobctl_pending+0xb5/0xd0
>>>> [  305.965147]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>>> [  305.965871]  ? do_raw_spin_unlock+0x112/0x1a0
>>>> [  305.966459]  ? do_raw_spin_trylock+0x100/0x100
>>>> [  305.967060]  ? mark_held_locks+0x25/0xb0
>>>> [  305.967592]  ? force_sig+0x30/0x30
>>>> [  305.968135]  ? _raw_spin_unlock_irq+0x27/0x50
>>>> [  305.968741]  ? trace_hardirqs_on_caller+0x136/0x2a0
>>>> [  305.969470]  do_group_exit+0xfb/0x260
>>>> [  305.969987]  ? SyS_exit+0x30/0x30
>>>> [  305.970505]  ? find_mergeable_anon_vma+0x90/0x90
>>>> [  305.971126]  ? SyS_read+0x1c0/0x1c0
>>>> [  305.971718]  ? mark_held_locks+0x25/0xb0
>>>> [  305.972259]  ? do_syscall_64+0xb2/0x5d0
>>>> [  305.972843]  ? do_group_exit+0x260/0x260
>>>> [  305.973374]  SyS_exit_group+0x1d/0x20
>>>> [  305.973932]  do_syscall_64+0x209/0x5d0
>>>> [  305.974452]  ? syscall_return_slowpath+0x3e0/0x3e0
>>>> [  305.975149]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
>>>> [  305.975941]  ? syscall_return_slowpath+0x260/0x3e0
>>>> [  305.976669]  ? mark_held_locks+0x25/0xb0
>>>> [  305.977206]  ? entry_SYSCALL_64_after_hwframe+0x52/0xb7
>>>> [  305.977978]  ? trace_hardirqs_off_caller+0xb5/0x120
>>>> [  305.978690]  ? trace_hardirqs_off_thunk+0x1a/0x1c
>>>> [  305.979381]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
>>>> [  305.980114] RIP: 0033:0x44d989
>>>> [  305.980531] RSP: 002b:00000000007fff38 EFLAGS: 00000202 ORIG_RAX:
>>>> 00000000000000e7
>>>> [  305.981664] RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000044d989
>>>> [  305.982655] RDX: 00007fb120d739c0 RSI: 00007fb120572700 RDI: 0000000000000001
>>>> [  305.983654] RBP: 00000000007fff60 R08: 00007fb120572700 R09: 0000000000000000
>>>> [  305.984656] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000
>>>> [  305.985707] R13: 000000000040d270 R14: 000000000040d300 R15: 0000000000000000
>>>> [  305.986724] Code: b6 1d 18 9b ef 03 31 ff 89 de e8 60 a4 67 ff 84
>>>> db 75 1a e8 87 a3 67 ff 48 c7 c7 00 8f 83 84 c6 05 f8 9a ef 03 01 e8
>>>> 84 60 49 ff <0f> 0b 31 db e9 2b ff ff ff 66 66 2e 0f 1f 84 00 00 00 00
>>>> 00 55
>>>> [  305.990106] ---[ end trace 360c084b02d93022 ]---
>>>> [  305.998636] IPVS: ftp: loaded support on port[0] = 21
>>>> ---------------------------------------
>>>>
>>>> = About RaceFuzzer
>>>>
>>>> RaceFuzzer is a customized version of Syzkaller, specifically tailored
>>>> to find race condition bugs in the Linux kernel. While we leverage
>>>> many different technique, the notable feature of RaceFuzzer is in
>>>> leveraging a custom hypervisor (QEMU/KVM) to interleave the
>>>> scheduling. In particular, we modified the hypervisor to intentionally
>>>> stall a per-core execution, which is similar to supporting per-core
>>>> breakpoint functionality. This allows RaceFuzzer to force the kernel
>>>> to deterministically trigger racy condition (which may rarely happen
>>>> in practice due to randomness in scheduling).
>>>>
>>>> RaceFuzzer's C repro always pinpoints two racy syscalls. Since C
>>>> repro's scheduling synchronization should be performed at the user
>>>> space, its reproducibility is limited (reproduction may take from 1
>>>> second to 10 minutes (or even more), depending on a bug). This is
>>>> because, while RaceFuzzer precisely interleaves the scheduling at the
>>>> kernel's instruction level when finding this bug, C repro cannot fully
>>>> utilize such a feature. Please disregard all code related to
>>>> "should_hypercall" in the C repro, as this is only for our debugging
>>>> purposes using our own hypervisor.

^ permalink raw reply

* Re: [PATCH v2 0/2] gfs2: Stop using rhashtable_walk_peek
From: Herbert Xu @ 2018-04-03  4:03 UTC (permalink / raw)
  To: NeilBrown
  Cc: Andreas Gruenbacher, cluster-devel, netdev, LKML, Thomas Graf,
	Tom Herbert
In-Reply-To: <871sfx0xeh.fsf@notabene.neil.brown.name>

On Tue, Apr 03, 2018 at 01:41:26PM +1000, NeilBrown wrote:
>
> Do we really need a rhashtable_walk_peek() interface?
> I imagine that a seqfile ->start function can do:
> 
>   if (*ppos == 0 && last_pos != 0) {
>   	rhashtable_walk_exit(&iter);
>         rhashtable_walk_enter(&table, &iter);
>         last_pos = 0;
>   }
>   rhashtable_walk_start(&iter);
>   if (*ppos == last_pos && iter.p)
>   	return iter.p;
>   last_pos = *ppos;

We don't want users poking into the internals of iter.  If you're
suggesting we could simplify rhashtable_walk_peek to just this after
your patch then yes possibly.  You also need to ensure that not only
seqfs users continue to work but also netlink dump users which are
slightly different.

> It might be OK to have a function call instead of expecting people to
> use iter.p directly.

Yes that would definitely be the preferred option.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH net-next RFC V1 5/5] net: mdio: Add a driver for InES time stamping IP core.
From: Richard Cochran @ 2018-04-03  3:55 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Andrew Lunn, netdev, devicetree, David Miller, Mark Rutland,
	Miroslav Lichvar, Rob Herring, Willem de Bruijn
In-Reply-To: <4c3c939f-4cbe-51db-c141-950b85a5b4de@gmail.com>

On Sun, Mar 25, 2018 at 04:01:49PM -0700, Florian Fainelli wrote:
> The best that I can think about and it still is a hack in some way, is
> to you have your time stamping driver create a proxy mii_bus whose
> purpose is just to hook to mdio/phy_device events (such as link changes)
> in order to do what is necessary, or at least, this would indicate its
> transparent nature towards the MDIO/MDC lines...

That won't work at all, AFAICT.  There is only one mii_bus per netdev,
that is one that is attached to the phydev.
 
> Tangential: the existing PHY time stamping logic should probably be
> generalized to a mdio_device (which the phy_device is a specialized
> superset of) instead of to the phy_device. This would still allow
> existing use cases but it would also allow us to support possible "pure
> MDIO" devices would that become some thing in the future.

So this is exactly what I did.  The time stamping methods were pushed
down into the mdio_device.  The active device (mdiots pointer) points
either to a non-PHY mdio_device or to the mdio_device embedded in the
phydev.

Thanks,
Richard

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox