Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH RFC v2 3/7] net: phy: resume PHY only if needed in mdio_bus_phy_suspend
From: Heiner Kallweit @ 2018-03-16 21:15 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

Currently the PHY is unconditionally resumed in mdio_bus_phy_suspend().
In cases where the PHY was sleepinh before suspending or if somebody else
takes care of resuming later, this is not needed and wastes energy.

Also start the state machine only if it's used by the driver (indicated
by the adjust_link callback being defined).

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- rename variable in mdio_bus_phy_needs_start
---
 drivers/net/phy/phy_device.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 85ebb969..c934725b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -124,6 +124,18 @@ static bool phy_may_suspend(struct phy_device *phydev)
 }
 
 #ifdef CONFIG_PM
+
+static bool mdio_bus_phy_needs_start(struct phy_device *phydev)
+{
+	bool needs_start;
+
+	mutex_lock(&phydev->lock);
+	needs_start = phydev->state == PHY_UP && phydev->adjust_link;
+	mutex_unlock(&phydev->lock);
+
+	return needs_start;
+}
+
 static int mdio_bus_phy_suspend(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
@@ -142,16 +154,17 @@ static int mdio_bus_phy_suspend(struct device *dev)
 static int mdio_bus_phy_resume(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
-	int ret;
+	int ret = 0;
 
-	ret = phy_resume(phydev);
-	if (ret < 0)
-		return ret;
+	if (!phydev->attached_dev)
+		return 0;
 
-	if (phydev->attached_dev && phydev->adjust_link)
-		phy_start_machine(phydev);
+	if (mdio_bus_phy_needs_start(phydev))
+		phy_start(phydev);
+	else if (!phydev->adjust_link)
+		ret = phy_resume(phydev);
 
-	return 0;
+	return ret;
 }
 
 static int mdio_bus_phy_restore(struct device *dev)
@@ -171,7 +184,8 @@ static int mdio_bus_phy_restore(struct device *dev)
 	phydev->link = 0;
 	phydev->state = PHY_UP;
 
-	phy_start_machine(phydev);
+	if (mdio_bus_phy_needs_start(phydev))
+		phy_start(phydev);
 
 	return 0;
 }
-- 
2.16.2

^ permalink raw reply related

* [PATCH RFC v2 2/7] net: phy: improve checking for when PHY is allowed to, suspend
From: Heiner Kallweit @ 2018-03-16 21:14 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

This patch improves and unifies checking for when PHY is allowed to
suspend. New is a check for the parent of the MDIO bus being
runtime-suspended. In this case the MDIO bus may not be accessible and
therefore we don't try to suspend the PHY. Instead we rely on the
parent to suspend all devices on the MDIO bus when it's suspended.

In addition change the behavior to return 0 instead of -EBUSY from
phy_suspend() if we detect a situation where the PHY shouldn't be
suspended. Returning -EBUSY from a suspend callback is supported
for runtime pm, however in case of system suspend it prevents the
system from suspending.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- no changes
---
 drivers/net/phy/phy_device.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index b2853233..85ebb969 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -35,6 +35,7 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <linux/of.h>
+#include <linux/pm_runtime.h>
 
 #include <asm/irq.h>
 
@@ -75,14 +76,27 @@ extern struct phy_driver genphy_10g_driver;
 static LIST_HEAD(phy_fixup_list);
 static DEFINE_MUTEX(phy_fixup_lock);
 
-#ifdef CONFIG_PM
-static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
+static bool phy_may_suspend(struct phy_device *phydev)
 {
 	struct device_driver *drv = phydev->mdio.dev.driver;
 	struct phy_driver *phydrv = to_phy_driver(drv);
 	struct net_device *netdev = phydev->attached_dev;
+	struct device *mdio_parent = phydev->mdio.bus->parent;
+	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+
+	if (!drv || !phydrv->suspend || phydev->suspended)
+		return false;
+
+	/* If the device has WOL enabled, we cannot suspend the PHY */
+	phy_ethtool_get_wol(phydev, &wol);
+	if (wol.wolopts)
+		return false;
 
-	if (!drv || !phydrv->suspend)
+	/* If the parent of the MDIO bus is runtime-suspended, the MDIO bus may
+	 * not be accessible and we expect the parent to suspend all devices
+	 * on the MDIO bus when it suspends.
+	 */
+	if (mdio_parent && pm_runtime_suspended(mdio_parent))
 		return false;
 
 	/* PHY not attached? May suspend if the PHY has not already been
@@ -91,7 +105,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
 	 * MDIO bus driver and clock gated at this point.
 	 */
 	if (!netdev)
-		return !phydev->suspended;
+		return true;
 
 	/* Don't suspend PHY if the attached netdev parent may wakeup.
 	 * The parent may point to a PCI device, as in tg3 driver.
@@ -109,6 +123,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
 	return true;
 }
 
+#ifdef CONFIG_PM
 static int mdio_bus_phy_suspend(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
@@ -121,9 +136,6 @@ static int mdio_bus_phy_suspend(struct device *dev)
 	if (phydev->attached_dev && phydev->adjust_link)
 		phy_stop_machine(phydev);
 
-	if (!mdio_bus_phy_may_suspend(phydev))
-		return 0;
-
 	return phy_suspend(phydev);
 }
 
@@ -132,14 +144,10 @@ static int mdio_bus_phy_resume(struct device *dev)
 	struct phy_device *phydev = to_phy_device(dev);
 	int ret;
 
-	if (!mdio_bus_phy_may_suspend(phydev))
-		goto no_resume;
-
 	ret = phy_resume(phydev);
 	if (ret < 0)
 		return ret;
 
-no_resume:
 	if (phydev->attached_dev && phydev->adjust_link)
 		phy_start_machine(phydev);
 
@@ -1148,13 +1156,10 @@ EXPORT_SYMBOL(phy_detach);
 int phy_suspend(struct phy_device *phydev)
 {
 	struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
-	struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
 	int ret = 0;
 
-	/* If the device has WOL enabled, we cannot suspend the PHY */
-	phy_ethtool_get_wol(phydev, &wol);
-	if (wol.wolopts)
-		return -EBUSY;
+	if (!phy_may_suspend(phydev))
+		return 0;
 
 	if (phydev->drv && phydrv->suspend)
 		ret = phydrv->suspend(phydev);
-- 
2.16.2

^ permalink raw reply related

* [PATCH RFC v2 1/7] net: phy: unconditionally resume and re-enable interrupts, in phy_start
From: Heiner Kallweit @ 2018-03-16 21:14 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org
In-Reply-To: <6618f53c-778a-cb12-deb4-c618a728b43e@gmail.com>

In subsequent patches of this series interrupts will be disabled during
system suspend, also we will have to resume the PHY in states other than
PHY_HALTED. To prepare for this unconditionally resume and re-enable
interrupts in phy_start().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
v2:
- no changes
---
 drivers/net/phy/phy.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef..0aef35ef 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -791,10 +791,16 @@ EXPORT_SYMBOL(phy_stop);
  */
 void phy_start(struct phy_device *phydev)
 {
-	int err = 0;
+	/* if phy was suspended, bring the physical link up again */
+	phy_resume(phydev);
 
-	mutex_lock(&phydev->lock);
+	/* make sure interrupts are re-enabled for the PHY */
+	if (phy_interrupt_is_valid(phydev) && phy_enable_interrupts(phydev)) {
+		phy_error(phydev);
+		return;
+	}
 
+	mutex_lock(&phydev->lock);
 	switch (phydev->state) {
 	case PHY_STARTING:
 		phydev->state = PHY_PENDING;
@@ -803,16 +809,6 @@ void phy_start(struct phy_device *phydev)
 		phydev->state = PHY_UP;
 		break;
 	case PHY_HALTED:
-		/* if phy was suspended, bring the physical link up again */
-		__phy_resume(phydev);
-
-		/* make sure interrupts are re-enabled for the PHY */
-		if (phy_interrupt_is_valid(phydev)) {
-			err = phy_enable_interrupts(phydev);
-			if (err < 0)
-				break;
-		}
-
 		phydev->state = PHY_RESUMING;
 		break;
 	default:
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH v2 00/21] Allow compile-testing NO_DMA (drivers)
From: Wolfram Sang @ 2018-03-16 21:23 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Ulf Hansson, linux-iio-u79uwXL29TY76Z2rM5mHXA,
	linux-fpga-u79uwXL29TY76Z2rM5mHXA,
	linux-remoteproc-u79uwXL29TY76Z2rM5mHXA,
	alsa-devel-K7yf7f+aM1XWsZ/bQMPhNw, Bjorn Andersson, Eric Anholt,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-mtd-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA,
	linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	Christoph Hellwig, Stefan Wahren, Boris Brezillon,
	James E . J . Bottomley, Herbert Xu,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA, Richard Weinberger, Jassi Brar,
	Marek Vasut, linux-serial-u79uwXL29TY76Z2rM5mHXA, Matias Bjorling,
	David Woodhouse, linux-media-u79uwXL29TY76Z2rM5mHXA,
	Ohad Ben-Cohen
In-Reply-To: <1521208314-4783-1-git-send-email-geert-Td1EMuHUCqxL1ZNQvxDV9g@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 387 bytes --]


> To avoid allmodconfig/allyesconfig regressions on NO_DMA=y platforms,
> this (drivers) series should be applied after the previous (core)
> series (but not many people may notice/care ;-)

I still don't get if there is a dependency on the core patches. I.e.
shall I apply the subsystem patch now by myself or do you want to push
the series after the core patch and need my ack here?


[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Andrew Lunn @ 2018-03-16 21:14 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Grygorii Strashko, David S. Miller, netdev, Greg Kroah-Hartman,
	Sekhar Nori, linux-kernel, linux-omap
In-Reply-To: <c790757c-0b4f-575b-1f6a-5f870f840e20@gmail.com>

> I agree, let's not have you run into circles, let's just use your
> patches as they are since they fix the problem and are not intrusive in
> any way.

Agreed, this is too complex, for little gain.

	Andrew

^ permalink raw reply

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Florian Fainelli @ 2018-03-16 21:09 UTC (permalink / raw)
  To: Grygorii Strashko, Andrew Lunn
  Cc: David S. Miller, netdev, Greg Kroah-Hartman, Sekhar Nori,
	linux-kernel, linux-omap
In-Reply-To: <0daaae50-ddb9-8a94-d4a7-02861f0a6eb7@ti.com>

On 03/16/2018 01:13 PM, Grygorii Strashko wrote:
> 
> 
> On 03/16/2018 02:54 PM, Andrew Lunn wrote:
>>> The phydrv->mdiodrv.flags can be accessible only after call to of_phy_connect()/phy_connect(),
>>
>> You need to use a function like of_phy_find_device() to get the
>> phydev, set the flag, and then call phy_connect_direct().
> 
> 
> So, do you propose me to replace direct calls of of_phy_connect()/phy_connect() in
> CPSW driver with buddies of the same functions? Right?
> 
> cpsw_slave_open()
> {
> ....
> 	if (slave->data->phy_node) {
> 		phy = of_phy_connect(priv->ndev, slave->data->phy_node,
> 				 &cpsw_adjust_link, 0, slave->data->phy_if);
> ----- replace ^^^^ with below
> {
> 	struct phy_device *phy = of_phy_find_device(phy_np);
> 	int ret;
> 
> 	if (!phy)
> 		return NULL;
> 
> 	phy->dev_flags = flags;
> 
> -----	[set flag in phydrv->mdiodrv.flags]
> 
> 	ret = phy_connect_direct(dev, phy, hndlr, iface);
> 
> 	/* refcount is held by phy_connect_direct() on success */
> 	put_device(&phy->mdio.dev);
> 
> 	return ret ? NULL : phy;
> }
> -----
> 		if (!phy) {
> 			dev_err(priv->dev, "phy \"%pOF\" not found on slave %d\n",
> 				slave->data->phy_node,
> 				slave->slave_num);
> 			return;
> 		}
> 	} else {
> 		phy = phy_connect(priv->ndev, slave->data->phy_id,
> 				 &cpsw_adjust_link, slave->data->phy_if);
> ----- replace ^^^^ with below
> {
> 	struct phy_device *phydev;
> 	struct device *d;
> 	int rc;
> 
> 	/* Search the list of PHY devices on the mdio bus for the
> 	 * PHY with the requested name
> 	 */
> 	d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id);
> 	if (!d) {
> 		pr_err("PHY %s not found\n", bus_id);
> 		return ERR_PTR(-ENODEV);
> 	}
> 	phydev = to_phy_device(d);
> 
> -----	[set flag in phydrv->mdiodrv.flags]
> 
> 	rc = phy_connect_direct(dev, phydev, handler, interface);
> 	put_device(d);
> 	if (rc)
> 		return ERR_PTR(rc);
> 
> 	return phydev;
> }
> -----
> 		if (IS_ERR(phy)) {
> 			dev_err(priv->dev,
> 				"phy \"%s\" not found on slave %d, err %ld\n",
> 				slave->data->phy_id, slave->slave_num,
> 				PTR_ERR(phy));
> 			return;
> 		}
> 	}
> }
> 
> and all above just to set a flag which will be used by just one driver as of now.
> 
> Hm. Is this some sort of punishment ;) Sry. I'll probably will take a pause.

I agree, let's not have you run into circles, let's just use your
patches as they are since they fix the problem and are not intrusive in
any way.
-- 
Florian

^ permalink raw reply

* RE: [PATCH v3 18/18] infiniband: cxgb4: Eliminate duplicate barriers on weakly-ordered archs
From: Steve Wise @ 2018-03-16 21:05 UTC (permalink / raw)
  To: 'Sinan Kaya', netdev, timur, sulrich
  Cc: linux-arm-msm, linux-arm-kernel, 'Steve Wise',
	'Doug Ledford', 'Jason Gunthorpe', linux-rdma,
	linux-kernel, 'Michael Werner', 'Casey Leedom'
In-Reply-To: <1521216991-28706-19-git-send-email-okaya@codeaurora.org>

> Code includes wmb() followed by writel(). writel() already has a barrier
on
> some architectures like arm64.
> 
> This ends up CPU observing two barriers back to back before executing the
> register write.
> 
> Since code already has an explicit barrier call, changing writel() to
> writel_relaxed().
> 
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>

NAK - This isn't correct for PowerPC.  For PowerPC, writeX_relaxed() is just
writeX().  

I was just looking at this with Chelsio developers, and they said the
writeX() should be replaced with __raw_writeX(), not writeX_relaxed(), to
get rid of the extra barrier for all architectures.

Also, t4.h:pio_copy() needs to  use __raw_writeq() to enable the write
combining fastpath for ARM and PowerPC.  The code as it stands doesn't
achieve any write combining on PowerPC at least.  

And the writel()s at the end of the ring functions (the non bar2 udb path)
needs a mmiowb() afterwards if you're going to use __raw_writeX() there.
However that path is only used for very old hardware (T4), so I wouldn't
worry about them. 

Steve.


> ---
>  drivers/infiniband/hw/cxgb4/t4.h | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/t4.h
> b/drivers/infiniband/hw/cxgb4/t4.h
> index 8369c7c..7a48c9e 100644
> --- a/drivers/infiniband/hw/cxgb4/t4.h
> +++ b/drivers/infiniband/hw/cxgb4/t4.h
> @@ -477,15 +477,15 @@ static inline void t4_ring_sq_db(struct t4_wq *wq,
> u16 inc, union t4_wr *wqe)
>  				 (u64 *)wqe);
>  		} else {
>  			pr_debug("DB wq->sq.pidx = %d\n", wq->sq.pidx);
> -			writel(PIDX_T5_V(inc) | QID_V(wq->sq.bar2_qid),
> -			       wq->sq.bar2_va + SGE_UDB_KDOORBELL);
> +			writel_relaxed(PIDX_T5_V(inc) | QID_V(wq-
> >sq.bar2_qid),
> +				       wq->sq.bar2_va +
> SGE_UDB_KDOORBELL);
>  		}
> 
>  		/* Flush user doorbell area writes. */
>  		wmb();
>  		return;
>  	}
> -	writel(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db);
> +	writel_relaxed(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db);
>  }
> 
>  static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc,
> @@ -502,15 +502,15 @@ static inline void t4_ring_rq_db(struct t4_wq *wq,
> u16 inc,
>  				 (void *)wqe);
>  		} else {
>  			pr_debug("DB wq->rq.pidx = %d\n", wq->rq.pidx);
> -			writel(PIDX_T5_V(inc) | QID_V(wq->rq.bar2_qid),
> -			       wq->rq.bar2_va + SGE_UDB_KDOORBELL);
> +			writel_relaxed(PIDX_T5_V(inc) | QID_V(wq-
> >rq.bar2_qid),
> +				       wq->rq.bar2_va +
> SGE_UDB_KDOORBELL);
>  		}
> 
>  		/* Flush user doorbell area writes. */
>  		wmb();
>  		return;
>  	}
> -	writel(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db);
> +	writel_relaxed(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db);
>  }
> 
>  static inline int t4_wq_in_error(struct t4_wq *wq)
> --
> 2.7.4

^ permalink raw reply

* Re: [RFC 2/2] page_frag_cache: Store metadata in struct page
From: Matthew Wilcox @ 2018-03-16 21:05 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Alexander Duyck, linux-mm, Netdev, Matthew Wilcox
In-Reply-To: <CAKgT0Uf0qWbo7T8j00Owt_hEj6vZSY-MOsqUeZCek1x62DKz4Q@mail.gmail.com>

On Thu, Mar 15, 2018 at 02:26:28PM -0700, Alexander Duyck wrote:
> On Thu, Mar 15, 2018 at 12:53 PM, Matthew Wilcox <willy@infradead.org> wrote:
> > From: Matthew Wilcox <mawilcox@microsoft.com>
> >
> > Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
> > currently-in-use struct page) by using the page's refcount directly
> > (instead of maintaining a bias) and storing our current progress through
> > the page in the same bits currently used for page->index.  We no longer
> > need to reflect the page pfmemalloc state if we're storing the page
> > directly.
> >
> > On the downside, we now call page_address() on every allocation, and we
> > do an atomic_inc() rather than a non-atomic decrement, but we should
> > touch the same number of cachelines and there is far less code (and
> > the code is less complex).
> >
> > Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
> 
> So my concern with this patch is that it is essentially trading off
> CPU performance for reduced size. One of the reasons for getting away
> from using the page pointer is because it is expensive to access the
> page when the ref_count is bouncing on multiple cache lines.

I'm not really interested in trading off SMP scalability for reduced
memory usage; if we see more than a trivial penalty then I'll withdraw
this RFC.  I read & understand the commits that you made to get us to
the current codebase, but there's no history of this particular variation
in the tree.

0e39250845c0f91acc64264709b25f7f9b85c2c3
9451980a6646ed487efce04a9df28f450935683e
540eb7bf0bbedb65277d68ab89ae43cdec3fd6ba

I think that by moving the 'offset' into the struct page, we end up updating
only one cacheline instead of two, which should be a performance win.
I understand your concern about the cacheline bouncing between the freeing
and allocating CPUs.  Is cross-CPU freeing a frequent occurrence?  From
looking at its current usage, it seemed like the allocation and freeing
were usually on the same CPU.

> In
> addition converting from a page to a virtual address is actually more
> expensive then you would think it should be. I went through and
> optimized that the best I could, but there is still a pretty
> significant cost to the call.

Were you able to break down where that cost occurred?  It looks
pretty cheap on x86-32 nohighmem:

    343e:       a1 00 00 00 00          mov    0x0,%eax
                        343f: R_386_32  mem_map
    3443:       29 f3                   sub    %esi,%ebx
    3445:       89 5a 08                mov    %ebx,0x8(%edx)
    3448:       29 c2                   sub    %eax,%edx
    344a:       c1 fa 05                sar    $0x5,%edx
    344d:       c1 e2 0c                shl    $0xc,%edx
    3450:       8d 84 13 00 00 00 c0    lea    -0x40000000(%ebx,%edx,1),%eax
    3457:       8b 5d f8                mov    -0x8(%ebp),%ebx
    345a:       8b 75 fc                mov    -0x4(%ebp),%esi
    345d:       89 ec                   mov    %ebp,%esp
    345f:       5d                      pop    %ebp
    3460:       c3                      ret    

I did code up a variant which stored the virtual address of the offset
in page->index, but I threw that away after seeing how much more code
it turned into.  Fortunately I had a better idea of how to implement that
early this morning.  I haven't even tested it yet, but it looks like better
generated code:

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 28d9a4c9c5fd..ec38204eb903 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4357,8 +4357,8 @@ struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 		unsigned int offset = PAGE_SIZE << compound_order(old);
 
 		if (offset > size) {
-			old->offset = offset;
-			return old;
+			page = old;
+			goto reset;
 		}
 	}
 
@@ -4379,7 +4379,10 @@ struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
 	if (old)
 		put_page(old);
 	nc->page = page;
-	page->offset = PAGE_SIZE << compound_order(page);
+	page->private = (PAGE_SIZE << compound_order(page)) - 1;
+reset:
+	page->index = (unsigned long)page_to_virt(page) +
+					(PAGE_SIZE << compound_order(page));
 	return page;
 }
 
@@ -4402,20 +4405,26 @@ void *page_frag_alloc(struct page_frag_cache *nc,
 		      unsigned int size, gfp_t gfp_mask)
 {
 	struct page *page = nc->page;
-	unsigned int offset = page->offset;
+	unsigned long addr = addr;
+	unsigned int offset = 0;
 
-	if (unlikely(!page || offset < size)) {
+	if (page) {
+		addr = page->index;
+		offset = addr & page->private;
+	}
+
+	if (unlikely(offset < size)) {
 		page = __page_frag_cache_refill(nc, size, gfp_mask);
 		if (!page)
 			return NULL;
-		offset = page->offset;
+		addr = page->index;
 	}
 
 	page_ref_inc(page);
-	offset -= size;
-	page->offset = offset;
+	addr -= size;
+	page->index = addr;
 
-	return page_address(page) + offset;
+	return (void *)addr;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 

Obviously if the problem turns out to be the cacheline thrashing rather
than the call to page_to_virt, then this is pointless to test.

> I won't be able to test the patches until next week, but I expect I
> will probably see a noticeable regression when performing a small
> packet routing test.

I really appreciate you being willing to try this for me.  I need to
get myself a dual-socket machine to test things like this.

^ permalink raw reply related

* [PATCH RFC v2 0/7] net: phy: patch series aiming to improve few aspects of phylib
From: Heiner Kallweit @ 2018-03-16 21:04 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven; +Cc: netdev@vger.kernel.org

This patch series aims to tackle few issues with phylib:
 
- address issues with patch series [1] (smsc911x + phylib changes)
- make phy_stop synchronous
- get rid of phy_start/stop_machine and handle it in phy_start/phy_stop
- in mdio_suspend consider runtime pm state of mdio bus parent
- consider more WOL conditions when deciding whether PHY is allowed to
  suspend
- only resume phy after system suspend if needed

[1] https://www.mail-archive.com/netdev@vger.kernel.org/msg196061.html

It works fine here but other NIC drivers may use phylib differently. 
Therefore I'd appreciate feedback and more testing.

I could think of some subsequent patches, e.g. phy_error() could be
reduced to calling phy_stop() and printing an error message
(today it silently sets the PHY state to PHY_HALTED).

Changes in v2:
- Incorporate review comments
- Address error reported by Geert by changing phy_stop() to not trigger
  a linkwatch event.

Heiner Kallweit (7):
  net: phy: unconditionally resume and re-enable interrupts in phy_start
  net: phy: improve checking for when PHY is allowed to suspend
  net: phy: resume PHY only if needed in mdio_bus_phy_suspend
  net: phy: remove phy_start_machine
  net: phy: make phy_stop synchronous
  net: phy: use new function phy_stop_suspending in mdio_bus_phy_suspend
  net: phy: remove phy_stop_machine

 drivers/net/phy/phy.c        | 127 +++++++++++++++++++++----------------------
 drivers/net/phy/phy_device.c |  77 ++++++++++++++++----------
 drivers/net/phy/phylink.c    |   1 -
 include/linux/phy.h          |   3 +-
 4 files changed, 112 insertions(+), 96 deletions(-)

-- 
2.16.2

^ permalink raw reply

* Re: HW question: i210 vs. BCM5461S over SGMII: no response from PHY to MDIO requests?
From: Andrew Lunn @ 2018-03-16 21:02 UTC (permalink / raw)
  To: Frantisek Rysanek; +Cc: netdev
In-Reply-To: <5AAC2BA1.5854.31008546@Frantisek.Rysanek.post.cz>

> The DeLock board is this beauty:
> http://www.delock.de/produkte/G_89481/merkmale.html?setLanguage=en
> DeLock techsupport were so kind as to provide me with a schematic 
> snippet, showing the wiring of the i210 fiber SKU's dedicated 
> I2C/MDIO pins to the SFP socket's standard I2C pins. There are 
> pull-up resistors on the board for SDA/SCL. Nothing outright custom 
> here - more likely close to some Intel reference design. The board 
> works just fine.

I would suggest you ignore MDIO and try looking on the I2C bus.

Does ethtool -m show anything useful?

     Andrew

^ permalink raw reply

* Re: [PATCH iproute2] treat "default" and "all"/"any" parameters differenty
From: Alexander Zubkov @ 2018-03-16 20:59 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Luca Boccassi, netdev@vger.kernel.org
In-Reply-To: <20180316133733.5be5be82@xeon-e3>

Hi Stephen,

This patch is against v4.15.0 only. It will not fit the current master, because there were changes it that areas of code since then. I have made some patch for master too and trying to discuss it with Serhey, who made other changes there. He has not answered yet. If there are no answer for some more time, I'll consider that fix as suitable and send it as a proper patch to the list. That will not require reverse patch too.

16.03.2018, 21:37, "Stephen Hemminger" <stephen@networkplumber.org>:
> On Tue, 13 Mar 2018 21:19:45 +0100
> Alexander Zubkov <green@msu.ru> wrote:
>
>>  Debian maintainer found that basic command:
>>          # ip route flush all
>>  No longer worked as expected which breaks user scripts and
>>  expectations. It no longer flushed all IPv4 routes.
>>
>>  Recently behaviour of "default" prefix parameter was corrected. But at
>>  the same time behaviour of "all"/"any" was altered too, because they
>>  were the same branch of the code. As those parameters mean different,
>>  they need to be treated differently in code too. This patch reflects
>>  the difference.
>>
>>  Reported-by: Luca Boccassi <bluca@debian.org>
>>  Signed-off-by: Alexander Zubkov <green@msu.ru>
>>  ---
>>    lib/utils.c | 3 ++-
>>    1 file changed, 2 insertions(+), 1 deletion(-)
>>
>>  diff --git a/lib/utils.c b/lib/utils.c
>>  index 9fa5220..b289d9c 100644
>>  --- a/lib/utils.c
>>  +++ b/lib/utils.c
>>  @@ -658,7 +658,8 @@ int get_prefix_1(inet_prefix *dst, char *arg, int
>>  family)
>>                    dst->family = family;
>>                    dst->bytelen = 0;
>>                    dst->bitlen = 0;
>>  - dst->flags |= PREFIXLEN_SPECIFIED;
>>  + if (strcmp(arg, "default") == 0)
>>  + dst->flags |= PREFIXLEN_SPECIFIED;
>>                    return 0;
>>            }
>>
>>  --
>>  1.9.1
>
> Which code is this a patch against? the current master or followup to my revert
> of the original patch?

^ permalink raw reply

* Re: [PATCH] net: ethernet: arc: Fix a potential memory leak if an optional regulator is deferred
From: Christophe JAILLET @ 2018-03-16 20:52 UTC (permalink / raw)
  To: David Miller
  Cc: heiko, branislav, netdev, linux-arm-kernel, linux-rockchip,
	linux-kernel, kernel-janitors
In-Reply-To: <20180316.152712.1584177628223195375.davem@davemloft.net>

Le 16/03/2018 à 20:27, David Miller a écrit :
> From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
> Date: Wed, 14 Mar 2018 22:09:34 +0100
>
>> diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c
>> index 16f9bee992fe..8ee9dfd0e363 100644
>> --- a/drivers/net/ethernet/arc/emac_rockchip.c
>> +++ b/drivers/net/ethernet/arc/emac_rockchip.c
>> @@ -169,8 +169,10 @@ static int emac_rockchip_probe(struct platform_device *pdev)
>>   	/* Optional regulator for PHY */
>>   	priv->regulator = devm_regulator_get_optional(dev, "phy");
>>   	if (IS_ERR(priv->regulator)) {
>> -		if (PTR_ERR(priv->regulator) == -EPROBE_DEFER)
>> -			return -EPROBE_DEFER;
>> +		if (PTR_ERR(priv->regulator) == -EPROBE_DEFER) {
>> +			ret = -EPROBE_DEFER;
>> +			goto out_clk_disable;
>> +		}
> Please build test your changes.
>
> There is no 'ret' variable in this function, perhaps you meant 'err'.
>
Yes, obviously, this is 'err'.

I apologize. I usually build-test all my patches before submission.
This one seems to have gone out of my normal process.

Can you fix it yourself or do you want me to re-submit?

CJ

^ permalink raw reply

* Re: [PATCH] net: ethernet: arc: Fix a potential memory leak if an optional regulator is deferred
From: Christophe JAILLET @ 2018-03-16 20:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, linux-arm-kernel, linux-rockchip, kernel-janitors
In-Reply-To: <20180316.152712.1584177628223195375.davem@davemloft.net>

Le 16/03/2018 à 20:27, David Miller a écrit :
> From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
> Date: Wed, 14 Mar 2018 22:09:34 +0100
>
>> diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c
>> index 16f9bee992fe..8ee9dfd0e363 100644
>> --- a/drivers/net/ethernet/arc/emac_rockchip.c
>> +++ b/drivers/net/ethernet/arc/emac_rockchip.c
>> @@ -169,8 +169,10 @@ static int emac_rockchip_probe(struct platform_device *pdev)
>>   	/* Optional regulator for PHY */
>>   	priv->regulator = devm_regulator_get_optional(dev, "phy");
>>   	if (IS_ERR(priv->regulator)) {
>> -		if (PTR_ERR(priv->regulator) == -EPROBE_DEFER)
>> -			return -EPROBE_DEFER;
>> +		if (PTR_ERR(priv->regulator) == -EPROBE_DEFER) {
>> +			ret = -EPROBE_DEFER;
>> +			goto out_clk_disable;
>> +		}
> Please build test your changes.
>
> There is no 'ret' variable in this function, perhaps you meant 'err'.
>
Yes, obviously, this is 'err'.

I apologize. I usually build-test all my patches before submission.
This one seems to have gone out of my normal process.

Can you fix it yourself or do you want me to re-submit?

CJ

^ permalink raw reply

* Re: HW question: i210 vs. BCM5461S over SGMII: no response from PHY to MDIO requests?
From: Frantisek Rysanek @ 2018-03-16 20:40 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20180316184208.GC4212@lunn.ch>

On 16 Mar 2018 at 19:42, Andrew Lunn wrote:
> Hi Frantisek
> 
> This seems a bit odd. The SFP cage only has i2c, not MDIO. It is
> possible to carry MDIO over i2c, which is what is done when the SFP
> module is copper, not fibre. But you are doing 100Base-FX, so fibre.
> 
> The BCM5461 is a copper PHY. There are some PHYs which are designed to
> act as translators. You can feed SERDES in one side, and get SGMII out
> the other side, for example. Such PHYs can be used between a MAC and
> an SFP socket. But from what i can find online about the BCM5461, it
> does not have this capability.
> 
> Can you tell us the exact make/model of the DeLock card and the SFP
> modules.
> 
> 	Andrew
>

Hello Andrew,

thanks for your polite response.

The DeLock board is this beauty:
http://www.delock.de/produkte/G_89481/merkmale.html?setLanguage=en
DeLock techsupport were so kind as to provide me with a schematic 
snippet, showing the wiring of the i210 fiber SKU's dedicated 
I2C/MDIO pins to the SFP socket's standard I2C pins. There are 
pull-up resistors on the board for SDA/SCL. Nothing outright custom 
here - more likely close to some Intel reference design. The board 
works just fine.

Interesting - I've noticed before that the sparse Broadcom data brief
for the BCM5461S doesn't cotain a word about 100Base-FX.
This might be a good question to the SFP vendor's techsupport :-)
The 54616S datasheet does mention 100Base-FX, but I have no reason to 
believe that my SFP contains that chip.

It was mostly my naive assumption that the BCM5461S would speak 
MDIO/MDC via the SFP slot's I2C lines :-) The i210 pinout allows for 
that and the BCM5461S doesn't seem to have I2C for management. It 
does have MDIO and MDC. I've tried the "SGMII with i2c management" 
option too - as I said the SFP seems to be responding, but the 
response is some junk. Not corresponding to what the Intel driver 
expects.

As for the SFP's... 

The one that I actually bought as shiny new and speaking SGMII, and 
that contains a chip labeled 5461SA, is sold as Huwei-compatible. 
I do not know what that means at pinout level and protocol level :-)  
"got it at e-bay" says it all. Yet, I have to say that I've been 
politely and firmly warned by the selling party that it's not gonna 
work in my Intel board. I bought it anyway, out of curiosity. So I'm 
not going to provide a link, unless I find something to praise on the 
product :-)
I cannot exclude the possibility that the SFP is a fake, despite 
being mechanically/visually a masterpiece.

The other SFP I got for free, with a comment that "it's some Chinese 
junk, supposedly SGMII compatible, but it hasn't ever worked for us 
in any hardware. Here you go, have it." And the chips inside are 
obscured between two PCB's = no idea what they are, unless I take
a hot air pen to it.

I'll probably take some macro photoes of the SFP internals
(on monday) and post them. Perhaps the set of chips may ring a bell 
with someone here :-)

For the moment, thanks for your attention.

Frank Rysanek

Customer support
FCC prumyslove systemy s.r.o.
Usti nad Labem
Czech Republic
+420 47 2774 173 (office)
+420 736 630 449 (mobile)
rysanek@fccps.cz

^ permalink raw reply

* Re: [PATCH iproute2 1/1] tc: use get_u32() in psample action to match types
From: Stephen Hemminger @ 2018-03-16 20:40 UTC (permalink / raw)
  To: Roman Mashak; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <1520975783-8593-1-git-send-email-mrv@mojatatu.com>

On Tue, 13 Mar 2018 17:16:23 -0400
Roman Mashak <mrv@mojatatu.com> wrote:

> Signed-off-by: Roman Mashak <mrv@mojatatu.com>


Applied

^ permalink raw reply

* Re: [PATCH iproute2 1/1] tc: print actual action for sample action
From: Stephen Hemminger @ 2018-03-16 20:39 UTC (permalink / raw)
  To: Roman Mashak; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <1520949430-14491-1-git-send-email-mrv@mojatatu.com>

On Tue, 13 Mar 2018 09:57:10 -0400
Roman Mashak <mrv@mojatatu.com> wrote:

> Signed-off-by: Roman Mashak <mrv@mojatatu.com>

Thanks for catching this, applied.

^ permalink raw reply

* Re: [PATCH iproute2] treat "default" and "all"/"any" parameters differenty
From: Stephen Hemminger @ 2018-03-16 20:37 UTC (permalink / raw)
  To: Alexander Zubkov; +Cc: Luca Boccassi, netdev@vger.kernel.org
In-Reply-To: <3ded09a4-54f9-9d82-d952-33527a6a3e58@msu.ru>

On Tue, 13 Mar 2018 21:19:45 +0100
Alexander Zubkov <green@msu.ru> wrote:

> Debian maintainer found that basic command:
> 	# ip route flush all
> No longer worked as expected which breaks user scripts and
> expectations. It no longer flushed all IPv4 routes.
> 
> Recently behaviour of "default" prefix parameter was corrected. But at
> the same time behaviour of "all"/"any" was altered too, because they
> were the same branch of the code. As those parameters mean different,
> they need to be treated differently in code too. This patch reflects
> the difference.
> 
> Reported-by: Luca Boccassi <bluca@debian.org>
> Signed-off-by: Alexander Zubkov <green@msu.ru>
> ---
>   lib/utils.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/utils.c b/lib/utils.c
> index 9fa5220..b289d9c 100644
> --- a/lib/utils.c
> +++ b/lib/utils.c
> @@ -658,7 +658,8 @@ int get_prefix_1(inet_prefix *dst, char *arg, int 
> family)
>   		dst->family = family;
>   		dst->bytelen = 0;
>   		dst->bitlen = 0;
> -		dst->flags |= PREFIXLEN_SPECIFIED;
> +		if (strcmp(arg, "default") == 0)
> +			dst->flags |= PREFIXLEN_SPECIFIED;
>   		return 0;
>   	}
> 
> --
> 1.9.1
> 

Which code is this a patch against? the current master or followup to my revert
of the original patch?

^ permalink raw reply

* Re: [PATCH] json_print: fix print_uint with helper type extensions
From: Stephen Hemminger @ 2018-03-16 20:34 UTC (permalink / raw)
  To: Kevin Darbyshire-Bryant; +Cc: netdev
In-Reply-To: <20180315200720.829-1-ldir@darbyshire-bryant.me.uk>

On Thu, 15 Mar 2018 20:07:20 +0000
Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk> wrote:

> Introduce print helper functions for int, uint, explicit int32, uint32,
> int64 & uint64.
> 
> print_int used 'int' type internally, whereas print_uint used 'uint64_t'
> 
> These helper functions eventually call vfprintf(fp, fmt, args) which is
> a variable argument list function and is dependent upon 'fmt' containing
> correct information about the length of the passed arguments.
> 
> Unfortunately print_int v print_uint offered no clue to the programmer
> that internally passed ints to print_uint were being promoted to 64bits,
> thus the format passed in 'fmt' string vs the actual passed integer
> could be different lengths.  This is even more interesting on big endian
> architectures where 'vfprintf' would be looking in the middle of an
> int64 type and hence produced wildly incorrect values in tc qdisc
> output.
> 
> print_u/int now stick with native int size.  print_u/int32 & print
> u/int64 functions offer explicit integer sizes.
> 
> To portably use these formats you should use the relevant PRIdN or PRIuN
> formats as defined in inttypes.h
> 
> e.g.
> 
> print_uint64(PRINT_ANY, "refcnt", "refcnt %" PRIu64 " ", t->tcm_info)
> 
> Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>

I am fine with this. But since there is no code using it yet, it should
go net-next branch.

Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Linus Torvalds @ 2018-03-16 20:25 UTC (permalink / raw)
  To: David Laight
  Cc: Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Miguel Ojeda, Ingo Molnar,
	Ian Abbott, linux-input, linux-btrfs, Network Development,
	Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <a37f7b0da04f4ede9fac2c88f7079a8a@AcuMS.aculab.com>

On Fri, Mar 16, 2018 at 10:44 AM, David Laight <David.Laight@aculab.com> wrote:
>
> I looked at the generated code for one of the constant sized VLA that
> the compiler barfed at.
> It seemed to subtract constants from %sp separately for the VLA.
> So it looks like the compiler treats them as VLA even though it
> knows the size.
> That is probably missing optimisation.

Looking at the code is definitely an option.

In fact, instead of depending on -Wvla, we could just make 'objtool'
warn about real variable-sized stack frames.

That said, if that "sizeof()" trick of Al's actually works with older
gcc versions too (it *should*, but it's not like
__builtin_choose_expr() and __builtin_constant_p() have well-defined
rules in the standard), that may just be the solution.

And if gcc ends up generating bad code for those "constant sized vlas"
anyway, then -Wvla would effectively warn about that code generation
problem.

              Linus

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Linus Torvalds @ 2018-03-16 20:19 UTC (permalink / raw)
  To: Miguel Ojeda
  Cc: Al Viro, Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Ingo Molnar, David Laight,
	Ian Abbott, linux-input, linux-btrfs, Network Development,
	Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <CA+55aFxLpdfyfTqhkHQoedQuqTcLRw3bYOgyz1s0eRW6cBmFTA@mail.gmail.com>

On Fri, Mar 16, 2018 at 1:14 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> It does not work with gcc-4.1.x, but works with gcc-4.4.x.
>
> I can't seem to see the errors any way, I wonder if
> __builtin_choose_expr() simply didn't exist back then.

No, that goes further back.

It seems to be -Wvla itself that doesn't exist in 4.1, so the test
build failed simply because I used that flag ;)

               Linus

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Al Viro @ 2018-03-16 20:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Miguel Ojeda, Ingo Molnar,
	David Laight, Ian Abbott, linux-input, linux-btrfs,
	Network Development, Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <CA+55aFwxWEWya44pVAxSLM8oH8NyrOVf8QfLowSnX4WUXvC0xg@mail.gmail.com>

On Fri, Mar 16, 2018 at 01:15:27PM -0700, Linus Torvalds wrote:
> On Fri, Mar 16, 2018 at 1:12 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > That's C99, straight from N1256.pdf (C99-TC3)...
> 
> I checked C90, since the error is
> 
>    ISO C90 forbids variable length array
> 
> and I didn't see anything there.

Well, yes - VLA are new in C99; C90 never had those...

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Linus Torvalds @ 2018-03-16 20:15 UTC (permalink / raw)
  To: Al Viro
  Cc: Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Miguel Ojeda, Ingo Molnar,
	David Laight, Ian Abbott, linux-input, linux-btrfs,
	Network Development, Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <20180316201215.GG30522@ZenIV.linux.org.uk>

On Fri, Mar 16, 2018 at 1:12 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> That's C99, straight from N1256.pdf (C99-TC3)...

I checked C90, since the error is

   ISO C90 forbids variable length array

and I didn't see anything there.

Admittedly I only found a draft copy.

                Linus

^ permalink raw reply

* Re: [PATCH v2] netns: send uevent messages
From: Kirill Tkhai @ 2018-03-16 20:14 UTC (permalink / raw)
  To: Christian Brauner, ebiederm, gregkh, netdev, linux-kernel; +Cc: serge, avagin
In-Reply-To: <20180316125030.23624-1-christian.brauner@ubuntu.com>

On 16.03.2018 15:50, Christian Brauner wrote:
> This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets
> to allow sending uevent messages into the network namespace the socket
> belongs to.
> 
> Currently non-initial network namespaces are already isolated and don't
> receive uevents. There are a number of cases where it is beneficial for a
> sufficiently privileged userspace process to send a uevent into a network
> namespace.
> 
> One such use case would be debugging and fuzzing of a piece of software
> which listens and reacts to uevents. By running a copy of that software
> inside a network namespace, specific uevents could then be presented to it.
> More concretely, this would allow for easy testing of udevd/ueventd.
> 
> This will also allow some piece of software to run components inside a
> separate network namespace and then effectively filter what that software
> can receive. Some examples of software that do directly listen to uevents
> and that we have in the past attempted to run inside a network namespace
> are rbd (CEPH client) or the X server.
> 
> Implementation:
> The implementation has been kept as simple as possible from the kernel's
> perspective. Specifically, a simple input method uevent_net_rcv() is added
> to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing
> af_netlink infrastructure and does neither add an additional netlink family
> nor requires any user-visible changes.
> 
> For example, by using netlink_rcv_skb() we can make use of existing netlink
> infrastructure to report back informative error messages to userspace.
> 
> Furthermore, this implementation does not introduce any overhead for
> existing uevent generating codepaths. The struct netns gets a new uevent
> socket member that records the uevent socket associated with that network
> namespace including its position in the uevent socket list. Since we record
> the uevent socket for each network namespace in struct net we don't have to
> walk the whole uevent socket list. Instead we can directly retrieve the
> relevant uevent socket and send the message. At exit time we can now also
> trivially remove the uevent socket from the uevent socket list. This keeps
> the codepath very performant without introducing needless overhead and even
> makes older codepaths faster.
> 
> Uevent sequence numbers are kept global. When a uevent message is sent to
> another network namespace the implementation will simply increment the
> global uevent sequence number and append it to the received uevent. This
> has the advantage that the kernel will never need to parse the received
> uevent message to replace any existing uevent sequence numbers. Instead it
> is up to the userspace process to remove any existing uevent sequence
> numbers in case the uevent message to be sent contains any.
> 
> Security:
> In order for a caller to send uevent messages to a target network namespace
> the caller must have CAP_SYS_ADMIN in the owning user namespace of the
> target network namespace. Additionally, any received uevent message is
> verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space
> needed to append the uevent sequence number.
> 
> Testing:
> This patch has been tested and verified to work with the following udev
> implementations:
> 1. CentOS 6 with udevd version 147
> 2. Debian Sid with systemd-udevd version 237
> 3. Android 7.1.1 with ueventd
> 
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> ---
> Changelog v1->v2:
> * Add the whole struct uevent_sock to struct net not just the socket
>   member. Since struct uevent_sock records the position of the uevent
>   socket in the uevent socket list we can trivially remove it from the
>   uevent socket list during cleanup. This speeds up the old removal
>   codepath. list_del() will hitl __list_del_entry_valid() in its call chain
>   which will validate that the element is a member of the list. If it isn't
>   it will take care that the list is not modified.
> Changelog v0->v1:
> * Hold mutex_lock() until uevent is sent to preserve uevent message
>   ordering. See udev and commit for reference:
> 
>   commit 7b60a18da393ed70db043a777fd9e6d5363077c4
>   Author: Andrew Vagin <avagin@openvz.org>
>   Date:   Wed Mar 7 14:49:56 2012 +0400
> 
>       uevent: send events in correct order according to seqnum (v3)
> 
>       The queue handling in the udev daemon assumes that the events are
>       ordered.
> ---
>  include/linux/kobject.h     |  6 +++
>  include/net/net_namespace.h |  4 +-
>  lib/kobject_uevent.c        | 98 ++++++++++++++++++++++++++++++++++++++-------
>  3 files changed, 93 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/kobject.h b/include/linux/kobject.h
> index 7f6f93c3df9c..c572c7abc609 100644
> --- a/include/linux/kobject.h
> +++ b/include/linux/kobject.h
> @@ -39,6 +39,12 @@ extern char uevent_helper[];
>  /* counter to tag the uevent, read only except for the kobject core */
>  extern u64 uevent_seqnum;
>  
> +/* uevent socket */
> +struct uevent_sock {
> +	struct list_head list;
> +	struct sock *sk;
> +};

I missed, why we do this external?

> +
>  /*
>   * The actions here must match the index to the string array
>   * in lib/kobject_uevent.c
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index f306b2aa15a4..abd7d91bffac 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -40,7 +40,7 @@ struct net_device;
>  struct sock;
>  struct ctl_table_header;
>  struct net_generic;
> -struct sock;
> +struct uevent_sock;
>  struct netns_ipvs;
>  
>  
> @@ -79,6 +79,8 @@ struct net {
>  	struct sock 		*rtnl;			/* rtnetlink socket */
>  	struct sock		*genl_sock;
>  
> +	struct uevent_sock	*uevent_sock;		/* uevent socket */

Since there will be one more version, could you please to move all preparation
related to the above change to a separate patch? Then we have series of two patches
with two logical changes.

> +
>  	struct list_head 	dev_base_head;
>  	struct hlist_head 	*dev_name_head;
>  	struct hlist_head	*dev_index_head;
> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
> index 9fe6ec8fda28..53e9123474c0 100644
> --- a/lib/kobject_uevent.c
> +++ b/lib/kobject_uevent.c
> @@ -25,6 +25,7 @@
>  #include <linux/uuid.h>
>  #include <linux/ctype.h>
>  #include <net/sock.h>
> +#include <net/netlink.h>
>  #include <net/net_namespace.h>
>  
>  
> @@ -33,10 +34,6 @@ u64 uevent_seqnum;
>  char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
>  #endif
>  #ifdef CONFIG_NET
> -struct uevent_sock {
> -	struct list_head list;
> -	struct sock *sk;
> -};
>  static LIST_HEAD(uevent_sock_list);
>  #endif
>  
> @@ -602,12 +599,88 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
>  EXPORT_SYMBOL_GPL(add_uevent_var);
>  
>  #if defined(CONFIG_NET)
> +static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb,
> +				struct netlink_ext_ack *extack)
> +{
> +	int ret;
> +	/* u64 to chars: 2^64 - 1 = 21 chars */
> +	char buf[sizeof("SEQNUM=") + 21];
> +	struct sk_buff *skbc;
> +
> +	/* bump and prepare sequence number */
> +	ret = snprintf(buf, sizeof(buf), "SEQNUM=%llu", ++uevent_seqnum);
> +	if (ret < 0 || (size_t)ret >= sizeof(buf))
> +		return -ENOMEM;
> +	ret++;
> +
> +	/* verify message does not overflow */
> +	if ((skb->len + ret) > UEVENT_BUFFER_SIZE) {
> +		NL_SET_ERR_MSG(extack, "uevent message too big");
> +		return -EINVAL;
> +	}
> +
> +	/* copy skb and extend to accommodate sequence number */
> +	skbc = skb_copy_expand(skb, 0, ret, GFP_KERNEL);
> +	if (!skbc)
> +		return -ENOMEM;
> +
> +	/* append sequence number */
> +	skb_put_data(skbc, buf, ret);
> +
> +	/* remove msg header */
> +	skb_pull(skbc, NLMSG_HDRLEN);
> +
> +	/* set portid 0 to inform userspace message comes from kernel */
> +	NETLINK_CB(skbc).portid = 0;
> +	NETLINK_CB(skbc).dst_group = 1;
> +
> +	ret = netlink_broadcast(usk, skbc, 0, 1, GFP_KERNEL);
> +	/* ENOBUFS should be handled in userspace */
> +	if (ret == -ENOBUFS || ret == -ESRCH)
> +		ret = 0;
> +
> +	return ret;
> +}
> +
> +static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh,
> +			      struct netlink_ext_ack *extack)
> +{
> +	int ret;
> +	struct net *net;
> +
> +	if (!nlmsg_data(nlh))
> +		return -EINVAL;
> +
> +	/*
> +	 * Verify that we are allowed to send messages to the target
> +	 * network namespace. The caller must have CAP_SYS_ADMIN in the
> +	 * owning user namespace of the target network namespace.
> +	 */
> +	net = sock_net(NETLINK_CB(skb).sk);
> +	if (!netlink_ns_capable(skb, net->user_ns, CAP_SYS_ADMIN)) {
> +		NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability");
> +		return -EPERM;
> +	}
> +
> +	mutex_lock(&uevent_sock_mutex);
> +	ret = uevent_net_broadcast(net->uevent_sock->sk, skb, extack);
> +	mutex_unlock(&uevent_sock_mutex);
> +
> +	return ret;
> +}
> +
> +static void uevent_net_rcv(struct sk_buff *skb)
> +{
> +	netlink_rcv_skb(skb, &uevent_net_rcv_skb);
> +}
> +
>  static int uevent_net_init(struct net *net)
>  {
>  	struct uevent_sock *ue_sk;
>  	struct netlink_kernel_cfg cfg = {
>  		.groups	= 1,
> -		.flags	= NL_CFG_F_NONROOT_RECV,
> +		.input = uevent_net_rcv,
> +		.flags	= NL_CFG_F_NONROOT_RECV
>  	};
>  
>  	ue_sk = kzalloc(sizeof(*ue_sk), GFP_KERNEL);
> @@ -621,6 +694,9 @@ static int uevent_net_init(struct net *net)
>  		kfree(ue_sk);
>  		return -ENODEV;
>  	}
> +
> +	net->uevent_sock = ue_sk;
> +
>  	mutex_lock(&uevent_sock_mutex);
>  	list_add_tail(&ue_sk->list, &uevent_sock_list);
>  	mutex_unlock(&uevent_sock_mutex);
> @@ -629,22 +705,16 @@ static int uevent_net_init(struct net *net)
>  
>  static void uevent_net_exit(struct net *net)
>  {
> -	struct uevent_sock *ue_sk;
> +	struct uevent_sock *ue_sk = net->uevent_sock;
>  
>  	mutex_lock(&uevent_sock_mutex);
> -	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
> -		if (sock_net(ue_sk->sk) == net)
> -			goto found;
> -	}
> -	mutex_unlock(&uevent_sock_mutex);
> -	return;
> -
> -found:
>  	list_del(&ue_sk->list);
>  	mutex_unlock(&uevent_sock_mutex);
>  
>  	netlink_kernel_release(ue_sk->sk);
>  	kfree(ue_sk);
> +
> +	return;
>  }
>  
>  static struct pernet_operations uevent_net_ops = {
 
Thanks,
Kirill

^ permalink raw reply

* Re: [PATCH v5 0/2] Remove false-positive VLAs when using max()
From: Linus Torvalds @ 2018-03-16 20:14 UTC (permalink / raw)
  To: Miguel Ojeda
  Cc: Al Viro, Florian Weimer, Kees Cook, Andrew Morton, Josh Poimboeuf,
	Rasmus Villemoes, Randy Dunlap, Ingo Molnar, David Laight,
	Ian Abbott, linux-input, linux-btrfs, Network Development,
	Linux Kernel Mailing List, Kernel Hardening
In-Reply-To: <CANiq72mjDQWsPG60JqfuLSY-6SDzB4goP4n35XZxc8bfFx1NDg@mail.gmail.com>

On Fri, Mar 16, 2018 at 1:03 PM, Miguel Ojeda
<miguel.ojeda.sandonis@gmail.com> wrote:
>>
>> Kees - is there some online "gcc-4.4 checker" somewhere? This does
>> seem to work with my gcc. I actually tested some of those files you
>> pointed at now.
>
> I use this one:
>
>   https://godbolt.org/

Well, my *test* code works on that one and -Wvla -Werror.

It does not work with gcc-4.1.x, but works with gcc-4.4.x.

I can't seem to see the errors any way, I wonder if
__builtin_choose_expr() simply didn't exist back then.

Odd that you can't view warnings/errors with it.

But it's possible that it fails on more complex stuff in the kernel.

I've done a "allmodconfig" build with that patch, and the only issue
it found was that (real) type issue in tpm_tis_core.h.

                Linus

^ permalink raw reply

* Re: [PATCH 0/2] net: phy: relax error checking when creating sysfs link netdev->phydev
From: Grygorii Strashko @ 2018-03-16 20:13 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Florian Fainelli, David S. Miller, netdev, Greg Kroah-Hartman,
	Sekhar Nori, linux-kernel, linux-omap
In-Reply-To: <20180316195403.GA8735@lunn.ch>



On 03/16/2018 02:54 PM, Andrew Lunn wrote:
>> The phydrv->mdiodrv.flags can be accessible only after call to of_phy_connect()/phy_connect(),
> 
> You need to use a function like of_phy_find_device() to get the
> phydev, set the flag, and then call phy_connect_direct().


So, do you propose me to replace direct calls of of_phy_connect()/phy_connect() in
CPSW driver with buddies of the same functions? Right?

cpsw_slave_open()
{
....
	if (slave->data->phy_node) {
		phy = of_phy_connect(priv->ndev, slave->data->phy_node,
				 &cpsw_adjust_link, 0, slave->data->phy_if);
----- replace ^^^^ with below
{
	struct phy_device *phy = of_phy_find_device(phy_np);
	int ret;

	if (!phy)
		return NULL;

	phy->dev_flags = flags;

-----	[set flag in phydrv->mdiodrv.flags]

	ret = phy_connect_direct(dev, phy, hndlr, iface);

	/* refcount is held by phy_connect_direct() on success */
	put_device(&phy->mdio.dev);

	return ret ? NULL : phy;
}
-----
		if (!phy) {
			dev_err(priv->dev, "phy \"%pOF\" not found on slave %d\n",
				slave->data->phy_node,
				slave->slave_num);
			return;
		}
	} else {
		phy = phy_connect(priv->ndev, slave->data->phy_id,
				 &cpsw_adjust_link, slave->data->phy_if);
----- replace ^^^^ with below
{
	struct phy_device *phydev;
	struct device *d;
	int rc;

	/* Search the list of PHY devices on the mdio bus for the
	 * PHY with the requested name
	 */
	d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id);
	if (!d) {
		pr_err("PHY %s not found\n", bus_id);
		return ERR_PTR(-ENODEV);
	}
	phydev = to_phy_device(d);

-----	[set flag in phydrv->mdiodrv.flags]

	rc = phy_connect_direct(dev, phydev, handler, interface);
	put_device(d);
	if (rc)
		return ERR_PTR(rc);

	return phydev;
}
-----
		if (IS_ERR(phy)) {
			dev_err(priv->dev,
				"phy \"%s\" not found on slave %d, err %ld\n",
				slave->data->phy_id, slave->slave_num,
				PTR_ERR(phy));
			return;
		}
	}
}

and all above just to set a flag which will be used by just one driver as of now.

Hm. Is this some sort of punishment ;) Sry. I'll probably will take a pause.

-- 
regards,
-grygorii

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox