Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next] sunvnet: fix rx packet length check to allow for TSO
From: David L Stevens @ 2015-01-13 17:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Sowmini Varadhan

This patch fixes the rx packet length check in the sunvnet driver to allow
for a TSO max packet length greater than the LDC channel negotiated MTU.
These are negotiated separately and there is no requirement that
port->tsolen be less than port->rmtu, but if it isn't, it'll drop packets
with rx length errors.

Signed-off-by: David L Stevens <david.stevens@oracle.com>
---
 drivers/net/ethernet/sun/sunvnet.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index d2835bf..b5a1d3d 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -351,10 +351,15 @@ static int vnet_rx_one(struct vnet_port *port, struct vio_net_desc *desc)
 	unsigned int len = desc->size;
 	unsigned int copy_len;
 	struct sk_buff *skb;
+	int maxlen;
 	int err;
 
 	err = -EMSGSIZE;
-	if (unlikely(len < ETH_ZLEN || len > port->rmtu)) {
+	if (port->tso && port->tsolen > port->rmtu)
+		maxlen = port->tsolen;
+	else
+		maxlen = port->rmtu;
+	if (unlikely(len < ETH_ZLEN || len > maxlen)) {
 		dev->stats.rx_length_errors++;
 		goto out_dropped;
 	}
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v5] can: Convert to runtime_pm
From: Marc Kleine-Budde @ 2015-01-13 17:44 UTC (permalink / raw)
  To: Sören Brinkmann
  Cc: Kedareswara rao Appana, wg-5Yr1BZd7O62+XT7JhA+gdA,
	michal.simek-gjFFaj9aHVfQT0dZR+AlfA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A, linux-can-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Kedareswara rao Appana
In-Reply-To: <1ef3e0f060f54c888061514bd2926762-neA4ZlFjCT3DAA6W7k9C4mYJ4DzVTqeXkX/xN29GLwg@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 5556 bytes --]

On 01/13/2015 06:24 PM, Sören Brinkmann wrote:
> On Tue, 2015-01-13 at 06:17PM +0100, Marc Kleine-Budde wrote:
>> On 01/13/2015 06:08 PM, Sören Brinkmann wrote:
>>> On Tue, 2015-01-13 at 12:08PM +0100, Marc Kleine-Budde wrote:
>>>> On 01/12/2015 07:45 PM, Sören Brinkmann wrote:
>>>>> On Mon, 2015-01-12 at 08:34PM +0530, Kedareswara rao Appana wrote:
>>>>>> Instead of enabling/disabling clocks at several locations in the driver,
>>>>>> Use the runtime_pm framework. This consolidates the actions for runtime PM
>>>>>> In the appropriate callbacks and makes the driver more readable and mantainable.
>>>>>>
>>>>>> Signed-off-by: Soren Brinkmann <soren.brinkmann-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>
>>>>>> Signed-off-by: Kedareswara rao Appana <appanad-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>
>>>>>> ---
>>>>>> Changes for v5:
>>>>>>  - Updated with the review comments.
>>>>>>    Updated the remove fuction to use runtime_pm.
>>>>>> Chnages for v4:
>>>>>>  - Updated with the review comments.
>>>>>> Changes for v3:
>>>>>>   - Converted the driver to use runtime_pm.
>>>>>> Changes for v2:
>>>>>>   - Removed the struct platform_device* from suspend/resume
>>>>>>     as suggest by Lothar.
>>>>>>
>>>>>>  drivers/net/can/xilinx_can.c |  157 ++++++++++++++++++++++++++++-------------
>>>>>>  1 files changed, 107 insertions(+), 50 deletions(-)
>>>>> [..]
>>>>>> +static int __maybe_unused xcan_runtime_resume(struct device *dev)
>>>>>>  {
>>>>>> -	struct platform_device *pdev = dev_get_drvdata(dev);
>>>>>> -	struct net_device *ndev = platform_get_drvdata(pdev);
>>>>>> +	struct net_device *ndev = dev_get_drvdata(dev);
>>>>>>  	struct xcan_priv *priv = netdev_priv(ndev);
>>>>>>  	int ret;
>>>>>> +	u32 isr, status;
>>>>>>  
>>>>>>  	ret = clk_enable(priv->bus_clk);
>>>>>>  	if (ret) {
>>>>>> @@ -1014,15 +1030,28 @@ static int __maybe_unused xcan_resume(struct device *dev)
>>>>>>  	ret = clk_enable(priv->can_clk);
>>>>>>  	if (ret) {
>>>>>>  		dev_err(dev, "Cannot enable clock.\n");
>>>>>> -		clk_disable_unprepare(priv->bus_clk);
>>>>>> +		clk_disable(priv->bus_clk);
>>>>> [...]
>>>>>> @@ -1173,12 +1219,23 @@ static int xcan_remove(struct platform_device *pdev)
>>>>>>  {
>>>>>>  	struct net_device *ndev = platform_get_drvdata(pdev);
>>>>>>  	struct xcan_priv *priv = netdev_priv(ndev);
>>>>>> +	int ret;
>>>>>> +
>>>>>> +	ret = pm_runtime_get_sync(&pdev->dev);
>>>>>> +	if (ret < 0) {
>>>>>> +		netdev_err(ndev, "%s: pm_runtime_get failed(%d)\n",
>>>>>> +				__func__, ret);
>>>>>> +		return ret;
>>>>>> +	}
>>>>>>  
>>>>>>  	if (set_reset_mode(ndev) < 0)
>>>>>>  		netdev_err(ndev, "mode resetting failed!\n");
>>>>>>  
>>>>>>  	unregister_candev(ndev);
>>>>>> +	pm_runtime_disable(&pdev->dev);
>>>>>>  	netif_napi_del(&priv->napi);
>>>>>> +	clk_disable_unprepare(priv->bus_clk);
>>>>>> +	clk_disable_unprepare(priv->can_clk);
>>>>>
>>>>> Shouldn't pretty much all these occurrences of clk_disable/enable
>>>>> disappear? This should all be handled by the runtime_pm framework now.
>>>>
>>>> We have:
>>>> - clk_prepare_enable() in probe
>>>
>>> This should become something like pm_runtime_get_sync(), shouldn't it?
>>>
>>>> - clk_disable_unprepare() in remove
>>>
>>> pm_runtime_put()
>>>
>>>> - clk_enable() in runtime_resume
>>>> - clk_disable() in runtime_suspend
>>>
>>> These are the ones needed.
>>>
>>> The above makes me suspect that the clocks are always on, regardless of
>>
>> Define "on" :)
>> The clocks are prepared after probe() exists, but not enabled. The first
>> pm_runtime_get_sync() will enable the clocks.
>>
>>> the runtime suspend state since they are enabled in probe and disabled
>>> in remove, is that right? Ideally, the usage in probe and remove should
>>> be migrated to runtime_pm and clocks should really only be running when
>>> needed and not throughout the whole lifetime of the driver.
>>
>> The clocks are not en/disabled via pm_runtime, because
>> pm_runtime_get_sync() is called from atomic contect. We can have another
>> look into the driver and try to change this.

> Wasn't that why the call to pm_runtime_irq_safe() was added?

Good question. That should be investigated.

> Also, clk_enable/disable should be okay to be run from atomic context.
> And if the clock are already prepared after the exit of probe that
> should be enough. Then remove() should just have to do the unprepare.
> But I don't see why runtime_pm shouldn't be able to do the
> enable/disable.

runtime_pm does call the clk_{enable,disable} function. But you mean
clk_prepare() + pm_runtime_get_sync() should be used in probe() instead
of calling clk_prepare_enable(). Good idea! I think the
"pm_runtime_set_active(&pdev->dev);" has to be removed from the patch.

Coming back whether blocking calls are allowed or not.
If you make a call to pm_runtime_irq_safe(), you state that it's okay to
call pm_runtime_get_sync() from atomic context. But it's only called in
open, probe, remove and in xcan_get_berr_counter, which is not called
from atomic either. So let's try to remove the pm_runtime_irq_safe() and
use clk_prepare_enable() clk_disable_unprepare() in the runtime_resume()
runtime_suspend() functions.

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: why are IPv6 addresses removed on link down
From: Hannes Frederic Sowa @ 2015-01-13 17:34 UTC (permalink / raw)
  To: David Miller; +Cc: dsahern, hideaki.yoshifuji, stephen, netdev
In-Reply-To: <20150113.122542.815831933030545121.davem@davemloft.net>

On Di, 2015-01-13 at 12:25 -0500, David Miller wrote:
> From: David Ahern <dsahern@gmail.com>
> Date: Tue, 13 Jan 2015 07:53:59 -0700
> 
> > Bottom line is there a harm in removing the flush? If there is no harm
> > will mainline kernel take a patch to do that or is your backward
> > compatibility concern enough to block it?
> 
> Backward compatibility trumps all other concerns here, and I say is
> enough to block changing the behavior.  One which we've had for more
> than a decade.

Totally agreed, a new sysctl will definitely be needed and the default
should be the old behavior. Otherwise we cannot do it.

^ permalink raw reply

* Re: [RFC PATCH v2 1/2] net: af_packet support for direct ring access in user space
From: David Miller @ 2015-01-13 17:27 UTC (permalink / raw)
  To: David.Laight
  Cc: john.fastabend, dborkman, hannes, netdev, danny.zhou, nhorman,
	john.ronciak, brouer
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6D1CAC6BA5@AcuExch.aculab.com>

From: David Laight <David.Laight@ACULAB.COM>
Date: Tue, 13 Jan 2015 17:15:30 +0000

> How about something like:
> 
> struct tpacket_dma_mem_region {
>     __u64 addr;        /* userspace virtual address */
>     __u64 phys_addr;    /* physical address */
>     __u64 iova;        /* IO virtual address used for DMA */
>     __u64 size;    /* size of region */
>     int direction;        /* dma data direction */
> } aligned(8);
> 
> So that it is independant of 32/64 bits.
> It is a shame that gcc has no way of defining a 64bit 'void *' on 32bit systems.
> You can use a union, but you still need to zero extend the value on LE (worse on BE).

We have an __aligned_u64, please use that.

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] vxlan: Remote checksum offload
From: Tom Herbert @ 2015-01-13 17:26 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David Miller, Linux Netdev List
In-Reply-To: <20150113164011.GO20387@casper.infradead.org>

On Tue, Jan 13, 2015 at 8:40 AM, Thomas Graf <tgraf@suug.ch> wrote:
> [Moving this discussion to the thread of the respective patch]
>
> On 01/13/15 at 08:16am, Tom Herbert wrote:
>> On 01/13/15 at 11:44am, Thomas Graf wrote:
>> > The major difference here is that we have to consider backwards
>> > compatibility specifically for VXLAN. Your initial feedback on GPE
>> > actually led me to how I implemented GBP.
>> >
>> > I think the axioms we want to establish are as follows:
>> >  1. Extensions need to be explicitly enabled by the user. A previously
>> >     dropped frame should only be processed if the user explitly asks
>> >     for it.
>> >  2. As a consequence: only share a VLXAN UDP port if the enabled
>> >     extensions match (vxlan_sock_add), e.g. user A might want RCO
>> >     but user B might be unaware. They cannot share the same UDP port.
>> >
>> > The 2nd lead me to introduce the 'exts' member to vxlan_sock so we can
>> > compare it in vxlan_find_sock() and only share a UDP port if the
>> > enabled extensions match.
>> >
>> RCO is represented in the socket in VXLAN flags (VLXAN_F_*). My patch
>> also adds a flags to vxlan_sock which contains the VLXAN flags. For
>> shared port, I suspect all the receive features must match, including
>> receive checksum settings for instance, but we don't care about
>> transmit side. To facilitate this, I would suggest splitting flags
>> into o_flags and i_flags like ip_tunnel does, and then compare i_flags
>> in vxlan_find_sock.
>
> Not sure I understand why you want to omit the transmit side. If a
> VXLAN socket with RCO TX enabled is found and shared with a user
> who does not want RCO enabled, it will get RCO enabled frames which
> will get dropped by non RCO VXLAN receivers.

Yes, if the transmit parameters are shared between different users
then that is an issue and transmit features should be matched. Since
VLXAN is using the same same socket for TX and RX that would appear to
be the case. However, in other encaps like GRE, FOU, & GUE the receive
and transmit paths are separate where each tunnel can have their own
transmit parameters even if they share a common receive port or path.
This allows for useful asymmetric configurations (like defining two
tunnels that receive packets on same port, but one transmits non-zero
UDP checksums and one with zero checksums).

Tom

^ permalink raw reply

* Re: why are IPv6 addresses removed on link down
From: David Miller @ 2015-01-13 17:25 UTC (permalink / raw)
  To: dsahern; +Cc: hannes, hideaki.yoshifuji, stephen, netdev
In-Reply-To: <54B53187.7080306@gmail.com>

From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Jan 2015 07:53:59 -0700

> Bottom line is there a harm in removing the flush? If there is no harm
> will mainline kernel take a patch to do that or is your backward
> compatibility concern enough to block it?

Backward compatibility trumps all other concerns here, and I say is
enough to block changing the behavior.  One which we've had for more
than a decade.

^ permalink raw reply

* Re: [3.19-rc3] tg3: BUG: sleeping function called from invalid context
From: Michael Chan @ 2015-01-13 17:25 UTC (permalink / raw)
  To: Peter Hurley; +Cc: Prashant Sreedharan, netdev, Linux kernel
In-Reply-To: <54B513F4.5020403@hurleysoftware.com>

On Tue, 2015-01-13 at 07:47 -0500, Peter Hurley wrote: 
> > tp->lock is held in this code path.  If synchronize_irq() sleeps in
> > wait_event(desc->wait_for_threads, ...), we'll get the warning.
> > 
> > The synchronize_irq() call is to wait for any tg3 irq handler to finish
> > so that it is guaranteed that next time it will see the CHIP_RESETTING
> > flag and do nothing.
> > 
> > Not sure if we can drop the tp->lock before we call synchronize_irq()
> > and then take it again after synchronize_irq().
> 
> Well, this device [1] is using MSI (INTx disabled) so if the synchronize_irq()
> is _only_ for the CHIP_RESETTING logic then it would seem ok to skip it (the
> synchronize_irq()). 

It is only for INTx.  But any device can operate in INTx mode if
MSI/MSIX is not available, so the fix needs to work in all cases.

Let me review the code some more.  If we can guarantee that another
reset, the timer code, etc, cannot come in even if we drop the tp->lock,
the simplest fix will be to drop it before calling synchronize_irq().

Thanks. 

^ permalink raw reply

* Re: [PATCH v5] can: Convert to runtime_pm
From: Sören Brinkmann @ 2015-01-13 17:24 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: Kedareswara rao Appana, wg, michal.simek, grant.likely, robh+dt,
	linux-can, netdev, linux-arm-kernel, linux-kernel, devicetree,
	Kedareswara rao Appana
In-Reply-To: <54B55326.1060606@pengutronix.de>

On Tue, 2015-01-13 at 06:17PM +0100, Marc Kleine-Budde wrote:
> On 01/13/2015 06:08 PM, Sören Brinkmann wrote:
> > On Tue, 2015-01-13 at 12:08PM +0100, Marc Kleine-Budde wrote:
> >> On 01/12/2015 07:45 PM, Sören Brinkmann wrote:
> >>> On Mon, 2015-01-12 at 08:34PM +0530, Kedareswara rao Appana wrote:
> >>>> Instead of enabling/disabling clocks at several locations in the driver,
> >>>> Use the runtime_pm framework. This consolidates the actions for runtime PM
> >>>> In the appropriate callbacks and makes the driver more readable and mantainable.
> >>>>
> >>>> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
> >>>> Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
> >>>> ---
> >>>> Changes for v5:
> >>>>  - Updated with the review comments.
> >>>>    Updated the remove fuction to use runtime_pm.
> >>>> Chnages for v4:
> >>>>  - Updated with the review comments.
> >>>> Changes for v3:
> >>>>   - Converted the driver to use runtime_pm.
> >>>> Changes for v2:
> >>>>   - Removed the struct platform_device* from suspend/resume
> >>>>     as suggest by Lothar.
> >>>>
> >>>>  drivers/net/can/xilinx_can.c |  157 ++++++++++++++++++++++++++++-------------
> >>>>  1 files changed, 107 insertions(+), 50 deletions(-)
> >>> [..]
> >>>> +static int __maybe_unused xcan_runtime_resume(struct device *dev)
> >>>>  {
> >>>> -	struct platform_device *pdev = dev_get_drvdata(dev);
> >>>> -	struct net_device *ndev = platform_get_drvdata(pdev);
> >>>> +	struct net_device *ndev = dev_get_drvdata(dev);
> >>>>  	struct xcan_priv *priv = netdev_priv(ndev);
> >>>>  	int ret;
> >>>> +	u32 isr, status;
> >>>>  
> >>>>  	ret = clk_enable(priv->bus_clk);
> >>>>  	if (ret) {
> >>>> @@ -1014,15 +1030,28 @@ static int __maybe_unused xcan_resume(struct device *dev)
> >>>>  	ret = clk_enable(priv->can_clk);
> >>>>  	if (ret) {
> >>>>  		dev_err(dev, "Cannot enable clock.\n");
> >>>> -		clk_disable_unprepare(priv->bus_clk);
> >>>> +		clk_disable(priv->bus_clk);
> >>> [...]
> >>>> @@ -1173,12 +1219,23 @@ static int xcan_remove(struct platform_device *pdev)
> >>>>  {
> >>>>  	struct net_device *ndev = platform_get_drvdata(pdev);
> >>>>  	struct xcan_priv *priv = netdev_priv(ndev);
> >>>> +	int ret;
> >>>> +
> >>>> +	ret = pm_runtime_get_sync(&pdev->dev);
> >>>> +	if (ret < 0) {
> >>>> +		netdev_err(ndev, "%s: pm_runtime_get failed(%d)\n",
> >>>> +				__func__, ret);
> >>>> +		return ret;
> >>>> +	}
> >>>>  
> >>>>  	if (set_reset_mode(ndev) < 0)
> >>>>  		netdev_err(ndev, "mode resetting failed!\n");
> >>>>  
> >>>>  	unregister_candev(ndev);
> >>>> +	pm_runtime_disable(&pdev->dev);
> >>>>  	netif_napi_del(&priv->napi);
> >>>> +	clk_disable_unprepare(priv->bus_clk);
> >>>> +	clk_disable_unprepare(priv->can_clk);
> >>>
> >>> Shouldn't pretty much all these occurrences of clk_disable/enable
> >>> disappear? This should all be handled by the runtime_pm framework now.
> >>
> >> We have:
> >> - clk_prepare_enable() in probe
> > 
> > This should become something like pm_runtime_get_sync(), shouldn't it?
> > 
> >> - clk_disable_unprepare() in remove
> > 
> > pm_runtime_put()
> > 
> >> - clk_enable() in runtime_resume
> >> - clk_disable() in runtime_suspend
> > 
> > These are the ones needed.
> > 
> > The above makes me suspect that the clocks are always on, regardless of
> 
> Define "on" :)
> The clocks are prepared after probe() exists, but not enabled. The first
> pm_runtime_get_sync() will enable the clocks.
> 
> > the runtime suspend state since they are enabled in probe and disabled
> > in remove, is that right? Ideally, the usage in probe and remove should
> > be migrated to runtime_pm and clocks should really only be running when
> > needed and not throughout the whole lifetime of the driver.
> 
> The clocks are not en/disabled via pm_runtime, because
> pm_runtime_get_sync() is called from atomic contect. We can have another
> look into the driver and try to change this.

Wasn't that why the call to pm_runtime_irq_safe() was added?
Also, clk_enable/disable should be okay to be run from atomic context.
And if the clock are already prepared after the exit of probe that
should be enough. Then remove() should just have to do the unprepare.
But I don't see why runtime_pm shouldn't be able to do the
enable/disable.

	Sören

^ permalink raw reply

* [PATCH 3/3] xen-netfront: refactor making Tx requests
From: David Vrabel @ 2015-01-13 17:16 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky
In-Reply-To: <1421169404-27461-1-git-send-email-david.vrabel@citrix.com>

Eliminate all the duplicate code for making Tx requests by
consolidating them into a single xennet_make_one_txreq() function.

xennet_make_one_txreq() and xennet_make_txreqs() work with pages and
offsets so it will be easier to make netfront handle highmem frags in
the future.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 drivers/net/xen-netfront.c |  181 ++++++++++++++++----------------------------
 1 file changed, 67 insertions(+), 114 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 6b29b3a..68e0e8f 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -425,99 +425,56 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
 	xennet_maybe_wake_tx(queue);
 }
 
-static void xennet_make_frags(struct sk_buff *skb, struct netfront_queue *queue,
-			      struct xen_netif_tx_request *tx)
-{
-	char *data = skb->data;
-	unsigned long mfn;
-	RING_IDX prod = queue->tx.req_prod_pvt;
-	int frags = skb_shinfo(skb)->nr_frags;
-	unsigned int offset = offset_in_page(data);
-	unsigned int len = skb_headlen(skb);
+static struct xen_netif_tx_request *xennet_make_one_txreq(
+	struct netfront_queue *queue, struct sk_buff *skb,
+	struct page *page, unsigned int offset, unsigned int len)
+{
 	unsigned int id;
+	struct xen_netif_tx_request *tx;
 	grant_ref_t ref;
-	int i;
 
-	/* While the header overlaps a page boundary (including being
-	   larger than a page), split it it into page-sized chunks. */
-	while (len > PAGE_SIZE - offset) {
-		tx->size = PAGE_SIZE - offset;
-		tx->flags |= XEN_NETTXF_more_data;
-		len -= tx->size;
-		data += tx->size;
-		offset = 0;
+	len = min_t(unsigned int, PAGE_SIZE - offset, len);
 
-		id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs);
-		queue->tx_skbs[id].skb = skb_get(skb);
-		tx = RING_GET_REQUEST(&queue->tx, prod++);
-		tx->id = id;
-		ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
-		BUG_ON((signed short)ref < 0);
+	id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs);
+	tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++);
+	ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
+	BUG_ON((signed short)ref < 0);
 
-		mfn = virt_to_mfn(data);
-		gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id,
-						mfn, GNTMAP_readonly);
+	gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id,
+					page_to_mfn(page), GNTMAP_readonly);
 
-		queue->grant_tx_page[id] = virt_to_page(data);
-		tx->gref = queue->grant_tx_ref[id] = ref;
-		tx->offset = offset;
-		tx->size = len;
-		tx->flags = 0;
-	}
+	queue->tx_skbs[id].skb = skb;
+	queue->grant_tx_page[id] = page;
+	queue->grant_tx_ref[id] = ref;
 
-	/* Grant backend access to each skb fragment page. */
-	for (i = 0; i < frags; i++) {
-		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
-		struct page *page = skb_frag_page(frag);
+	tx->id = id;
+	tx->gref = ref;
+	tx->offset = offset;
+	tx->size = len;
+	tx->flags = 0;
 
-		len = skb_frag_size(frag);
-		offset = frag->page_offset;
+	return tx;
+}
 
-		/* Skip unused frames from start of page */
-		page += offset >> PAGE_SHIFT;
-		offset &= ~PAGE_MASK;
+static struct xen_netif_tx_request *xennet_make_txreqs(
+	struct netfront_queue *queue, struct xen_netif_tx_request *tx,
+	struct sk_buff *skb, struct page *page,
+	unsigned int offset, unsigned int len)
+{
+	/* Skip unused frames from start of page */
+	page += offset >> PAGE_SHIFT;
+	offset &= ~PAGE_MASK;
 
-		while (len > 0) {
-			unsigned long bytes;
-
-			bytes = PAGE_SIZE - offset;
-			if (bytes > len)
-				bytes = len;
-
-			tx->flags |= XEN_NETTXF_more_data;
-
-			id = get_id_from_freelist(&queue->tx_skb_freelist,
-						  queue->tx_skbs);
-			queue->tx_skbs[id].skb = skb_get(skb);
-			tx = RING_GET_REQUEST(&queue->tx, prod++);
-			tx->id = id;
-			ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
-			BUG_ON((signed short)ref < 0);
-
-			mfn = pfn_to_mfn(page_to_pfn(page));
-			gnttab_grant_foreign_access_ref(ref,
-							queue->info->xbdev->otherend_id,
-							mfn, GNTMAP_readonly);
-
-			queue->grant_tx_page[id] = page;
-			tx->gref = queue->grant_tx_ref[id] = ref;
-			tx->offset = offset;
-			tx->size = bytes;
-			tx->flags = 0;
-
-			offset += bytes;
-			len -= bytes;
-
-			/* Next frame */
-			if (offset == PAGE_SIZE && len) {
-				BUG_ON(!PageCompound(page));
-				page++;
-				offset = 0;
-			}
-		}
+	while (len) {
+		tx->flags |= XEN_NETTXF_more_data;
+		tx = xennet_make_one_txreq(queue, skb_get(skb),
+					   page, offset, len);
+		page++;
+		offset = 0;
+		len -= tx->size;
 	}
 
-	queue->tx.req_prod_pvt = prod;
+	return tx;
 }
 
 /*
@@ -565,18 +522,15 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb,
 
 static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	unsigned short id;
 	struct netfront_info *np = netdev_priv(dev);
 	struct netfront_stats *stats = this_cpu_ptr(np->stats);
-	struct xen_netif_tx_request *tx;
-	char *data = skb->data;
-	RING_IDX i;
-	grant_ref_t ref;
-	unsigned long mfn;
+	struct xen_netif_tx_request *tx, *first_tx;
+	unsigned int i;
 	int notify;
 	int slots;
-	unsigned int offset = offset_in_page(data);
-	unsigned int len = skb_headlen(skb);
+	struct page *page;
+	unsigned int offset;
+	unsigned int len;
 	unsigned long flags;
 	struct netfront_queue *queue = NULL;
 	unsigned int num_queues = dev->real_num_tx_queues;
@@ -605,11 +559,12 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 				    slots, skb->len);
 		if (skb_linearize(skb))
 			goto drop;
-		data = skb->data;
-		offset = offset_in_page(data);
-		len = skb_headlen(skb);
 	}
 
+	page = virt_to_page(skb->data);
+	offset = offset_in_page(skb->data);
+	len = skb_headlen(skb);
+
 	spin_lock_irqsave(&queue->tx_lock, flags);
 
 	if (unlikely(!netif_carrier_ok(dev) ||
@@ -619,25 +574,13 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	i = queue->tx.req_prod_pvt;
-
-	id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs);
-	queue->tx_skbs[id].skb = skb;
-
-	tx = RING_GET_REQUEST(&queue->tx, i);
+	/* First request for the linear area. */
+	first_tx = tx = xennet_make_one_txreq(queue, skb,
+					      page, offset, len);
+	page++;
+	offset = 0;
+	len -= tx->size;
 
-	tx->id   = id;
-	ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
-	BUG_ON((signed short)ref < 0);
-	mfn = virt_to_mfn(data);
-	gnttab_grant_foreign_access_ref(
-		ref, queue->info->xbdev->otherend_id, mfn, GNTMAP_readonly);
-	queue->grant_tx_page[id] = virt_to_page(data);
-	tx->gref = queue->grant_tx_ref[id] = ref;
-	tx->offset = offset;
-	tx->size = len;
-
-	tx->flags = 0;
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		/* local packet? */
 		tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
@@ -645,11 +588,12 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		/* remote but checksummed. */
 		tx->flags |= XEN_NETTXF_data_validated;
 
+	/* Optional extra info after the first request. */
 	if (skb_shinfo(skb)->gso_size) {
 		struct xen_netif_extra_info *gso;
 
 		gso = (struct xen_netif_extra_info *)
-			RING_GET_REQUEST(&queue->tx, ++i);
+			RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++);
 
 		tx->flags |= XEN_NETTXF_extra_info;
 
@@ -664,10 +608,19 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		gso->flags = 0;
 	}
 
-	queue->tx.req_prod_pvt = i + 1;
+	/* Requests for the rest of the linear area. */
+	tx = xennet_make_txreqs(queue, tx, skb, page, offset, len);
+
+	/* Requests for all the frags. */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		tx = xennet_make_txreqs(queue, tx, skb,
+					skb_frag_page(frag), frag->page_offset,
+					skb_frag_size(frag));
+	}
 
-	xennet_make_frags(skb, queue, tx);
-	tx->size = skb->len;
+	/* First request has the packet length. */
+	first_tx->size = skb->len;
 
 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&queue->tx, notify);
 	if (notify)
-- 
1.7.10.4

^ permalink raw reply related

* [PATCHv1 0/3 net-next] xen-netfront: refactor making Tx requests
From: David Vrabel @ 2015-01-13 17:16 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky

As netfront as evolved to handle different sorts of skbs the code to
fill a Tx requests has been copy and pasted several times.  The series
refactors this and a few other areas.

The first patch is to a Xen header but this can be merged via
net-next.

David

^ permalink raw reply

* [PATCH 1/3] xen: add page_to_mfn()
From: David Vrabel @ 2015-01-13 17:16 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky
In-Reply-To: <1421169404-27461-1-git-send-email-david.vrabel@citrix.com>

pfn_to_mfn(page_to_pfn(p)) is a common use case so add a generic
helper for it.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 include/xen/page.h |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/xen/page.h b/include/xen/page.h
index 12765b6..c5ed20b 100644
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -3,6 +3,11 @@
 
 #include <asm/xen/page.h>
 
+static inline unsigned long page_to_mfn(struct page *page)
+{
+	return pfn_to_mfn(page_to_pfn(page));
+}
+
 struct xen_memory_region {
 	phys_addr_t start;
 	phys_addr_t size;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/3] xen-netfront: refactor skb slot counting
From: David Vrabel @ 2015-01-13 17:16 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky
In-Reply-To: <1421169404-27461-1-git-send-email-david.vrabel@citrix.com>

A function to count the number of slots an skb needs is more useful
than one that counts the slots needed for only the frags.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 drivers/net/xen-netfront.c |   13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 22bcb4e..6b29b3a 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -521,13 +521,15 @@ static void xennet_make_frags(struct sk_buff *skb, struct netfront_queue *queue,
 }
 
 /*
- * Count how many ring slots are required to send the frags of this
- * skb. Each frag might be a compound page.
+ * Count how many ring slots are required to send this skb. Each frag
+ * might be a compound page.
  */
-static int xennet_count_skb_frag_slots(struct sk_buff *skb)
+static int xennet_count_skb_slots(struct sk_buff *skb)
 {
 	int i, frags = skb_shinfo(skb)->nr_frags;
-	int pages = 0;
+	int pages;
+
+	pages = PFN_UP(offset_in_page(skb->data) + skb_headlen(skb));
 
 	for (i = 0; i < frags; i++) {
 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
@@ -597,8 +599,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
-	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
-		xennet_count_skb_frag_slots(skb);
+	slots = xennet_count_skb_slots(skb);
 	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
 		net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n",
 				    slots, skb->len);
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] rtlwifi/rtl8192de: remove redundant else if check
From: Larry Finger @ 2015-01-13 17:19 UTC (permalink / raw)
  To: Colin King, Kalle Valo, linux-wireless, netdev; +Cc: linux-kernel
In-Reply-To: <1421158054-22566-1-git-send-email-colin.king@canonical.com>

On 01/13/2015 08:07 AM, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> The else if check condition checks for the opposite of the
> if check, hence the else if check is redundant and can be
> replaced with a simple else:
>
> if (rtlpriv->rtlhal.macphymode == SINGLEMAC_SINGLEPHY) {
> 	..
> } else if (rtlpriv->rtlhal.macphymode != SINGLEMAC_SINGLEPHY) {
> 	..
> }
>
> replaced with:
>
> if (rtlpriv->rtlhal.macphymode == SINGLEMAC_SINGLEPHY) {
> 	..
> } else {
> 	..
> }
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Acked-by: Larry Finger <Larry.Finger@lwfinger.net>

Thanks,

Larry

> ---
>   drivers/net/wireless/rtlwifi/rtl8192de/hw.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/wireless/rtlwifi/rtl8192de/hw.c b/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
> index 280c3da..01bcc2d 100644
> --- a/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
> +++ b/drivers/net/wireless/rtlwifi/rtl8192de/hw.c
> @@ -546,7 +546,7 @@ static bool _rtl92de_llt_table_init(struct ieee80211_hw *hw)
>   		txpktbuf_bndy = 246;
>   		value8 = 0;
>   		value32 = 0x80bf0d29;
> -	} else if (rtlpriv->rtlhal.macphymode != SINGLEMAC_SINGLEPHY) {
> +	} else {
>   		maxPage = 127;
>   		txpktbuf_bndy = 123;
>   		value8 = 0;
>

^ permalink raw reply

* Re: [PATCH] mISDN: avoid arch specific __builtin_return_address call
From: Joe Perches @ 2015-01-13 17:17 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: isdn4linux, netdev, Karsten Keil, davem, linux-arm-kernel,
	linux-kernel
In-Reply-To: <1816278.9Bv8N8K5IB@wuerfel>

On Tue, 2015-01-13 at 17:10 +0100, Arnd Bergmann wrote:
> Not all architectures are able to call __builtin_return_address().
> On ARM, the mISDN code produces this warning:
> 
> hardware/mISDN/w6692.c: In function 'w6692_dctrl':
> hardware/mISDN/w6692.c:1181:75: warning: unsupported argument to '__builtin_return_address'
>   pr_debug("%s: %s dev(%d) open from %p\n", card->name, __func__,

I didn't know that tidbit. Thanks.

^ permalink raw reply

* Re: [PATCH v5] can: Convert to runtime_pm
From: Marc Kleine-Budde @ 2015-01-13 17:17 UTC (permalink / raw)
  To: Sören Brinkmann
  Cc: Kedareswara rao Appana, wg, michal.simek, grant.likely, robh+dt,
	linux-can, netdev, linux-arm-kernel, linux-kernel, devicetree,
	Kedareswara rao Appana
In-Reply-To: <6ea3c0ea0f5c4deb9a6e4738a8d94a36@BN1AFFO11FD047.protection.gbl>

[-- Attachment #1: Type: text/plain, Size: 4050 bytes --]

On 01/13/2015 06:08 PM, Sören Brinkmann wrote:
> On Tue, 2015-01-13 at 12:08PM +0100, Marc Kleine-Budde wrote:
>> On 01/12/2015 07:45 PM, Sören Brinkmann wrote:
>>> On Mon, 2015-01-12 at 08:34PM +0530, Kedareswara rao Appana wrote:
>>>> Instead of enabling/disabling clocks at several locations in the driver,
>>>> Use the runtime_pm framework. This consolidates the actions for runtime PM
>>>> In the appropriate callbacks and makes the driver more readable and mantainable.
>>>>
>>>> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
>>>> Signed-off-by: Kedareswara rao Appana <appanad@xilinx.com>
>>>> ---
>>>> Changes for v5:
>>>>  - Updated with the review comments.
>>>>    Updated the remove fuction to use runtime_pm.
>>>> Chnages for v4:
>>>>  - Updated with the review comments.
>>>> Changes for v3:
>>>>   - Converted the driver to use runtime_pm.
>>>> Changes for v2:
>>>>   - Removed the struct platform_device* from suspend/resume
>>>>     as suggest by Lothar.
>>>>
>>>>  drivers/net/can/xilinx_can.c |  157 ++++++++++++++++++++++++++++-------------
>>>>  1 files changed, 107 insertions(+), 50 deletions(-)
>>> [..]
>>>> +static int __maybe_unused xcan_runtime_resume(struct device *dev)
>>>>  {
>>>> -	struct platform_device *pdev = dev_get_drvdata(dev);
>>>> -	struct net_device *ndev = platform_get_drvdata(pdev);
>>>> +	struct net_device *ndev = dev_get_drvdata(dev);
>>>>  	struct xcan_priv *priv = netdev_priv(ndev);
>>>>  	int ret;
>>>> +	u32 isr, status;
>>>>  
>>>>  	ret = clk_enable(priv->bus_clk);
>>>>  	if (ret) {
>>>> @@ -1014,15 +1030,28 @@ static int __maybe_unused xcan_resume(struct device *dev)
>>>>  	ret = clk_enable(priv->can_clk);
>>>>  	if (ret) {
>>>>  		dev_err(dev, "Cannot enable clock.\n");
>>>> -		clk_disable_unprepare(priv->bus_clk);
>>>> +		clk_disable(priv->bus_clk);
>>> [...]
>>>> @@ -1173,12 +1219,23 @@ static int xcan_remove(struct platform_device *pdev)
>>>>  {
>>>>  	struct net_device *ndev = platform_get_drvdata(pdev);
>>>>  	struct xcan_priv *priv = netdev_priv(ndev);
>>>> +	int ret;
>>>> +
>>>> +	ret = pm_runtime_get_sync(&pdev->dev);
>>>> +	if (ret < 0) {
>>>> +		netdev_err(ndev, "%s: pm_runtime_get failed(%d)\n",
>>>> +				__func__, ret);
>>>> +		return ret;
>>>> +	}
>>>>  
>>>>  	if (set_reset_mode(ndev) < 0)
>>>>  		netdev_err(ndev, "mode resetting failed!\n");
>>>>  
>>>>  	unregister_candev(ndev);
>>>> +	pm_runtime_disable(&pdev->dev);
>>>>  	netif_napi_del(&priv->napi);
>>>> +	clk_disable_unprepare(priv->bus_clk);
>>>> +	clk_disable_unprepare(priv->can_clk);
>>>
>>> Shouldn't pretty much all these occurrences of clk_disable/enable
>>> disappear? This should all be handled by the runtime_pm framework now.
>>
>> We have:
>> - clk_prepare_enable() in probe
> 
> This should become something like pm_runtime_get_sync(), shouldn't it?
> 
>> - clk_disable_unprepare() in remove
> 
> pm_runtime_put()
> 
>> - clk_enable() in runtime_resume
>> - clk_disable() in runtime_suspend
> 
> These are the ones needed.
> 
> The above makes me suspect that the clocks are always on, regardless of

Define "on" :)
The clocks are prepared after probe() exists, but not enabled. The first
pm_runtime_get_sync() will enable the clocks.

> the runtime suspend state since they are enabled in probe and disabled
> in remove, is that right? Ideally, the usage in probe and remove should
> be migrated to runtime_pm and clocks should really only be running when
> needed and not throughout the whole lifetime of the driver.

The clocks are not en/disabled via pm_runtime, because
pm_runtime_get_sync() is called from atomic contect. We can have another
look into the driver and try to change this.

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* RE: [RFC PATCH v2 1/2] net: af_packet support for direct ring access in user space
From: David Laight @ 2015-01-13 17:15 UTC (permalink / raw)
  To: 'John Fastabend', Daniel Borkmann
  Cc: Hannes Frederic Sowa, netdev@vger.kernel.org,
	danny.zhou@intel.com, nhorman@tuxdriver.com,
	john.ronciak@intel.com, brouer@redhat.com
In-Reply-To: <54B538C1.6000500@gmail.com>

From: John Fastabend
> On 01/13/2015 05:21 AM, Daniel Borkmann wrote:
> > On 01/13/2015 01:35 PM, Hannes Frederic Sowa wrote:
> >> On Mo, 2015-01-12 at 20:35 -0800, John Fastabend wrote:
> > ...
> >>> +/* setsockopt takes addr, size ,direction parametner, getsockopt takes
> >>> + * iova, size, direction.
> >>> + * */
> >>> +struct tpacket_dma_mem_region {
> >>> +    void *addr;        /* userspace virtual address */
> >>> +    __u64 phys_addr;    /* physical address */
> >>> +    __u64 iova;        /* IO virtual address used for DMA */
> >>> +    unsigned long size;    /* size of region */
> >>> +    int direction;        /* dma data direction */
> >>> +};
> >>
> >> Have you tested this with with 32 bit user space and 32 bit kernel, too?
> >> I don't have any problem with only supporting 64 bit kernels for this
> >> feature, but looking through the code I wonder if we handle the __u64
> >> addresses correctly in all situations.
> 
> We still need to test/implement this I'm going to guess there is some
> more work needed for this to work correctly.

How about something like:

struct tpacket_dma_mem_region {
    __u64 addr;        /* userspace virtual address */
    __u64 phys_addr;    /* physical address */
    __u64 iova;        /* IO virtual address used for DMA */
    __u64 size;    /* size of region */
    int direction;        /* dma data direction */
} aligned(8);

So that it is independant of 32/64 bits.
It is a shame that gcc has no way of defining a 64bit 'void *' on 32bit systems.
You can use a union, but you still need to zero extend the value on LE (worse on BE).

	David



^ permalink raw reply

* Re: [PATCH v5] can: Convert to runtime_pm
From: Sören Brinkmann @ 2015-01-13 17:08 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: Kedareswara rao Appana, wg-5Yr1BZd7O62+XT7JhA+gdA,
	michal.simek-gjFFaj9aHVfQT0dZR+AlfA,
	grant.likely-QSEj5FYQhm4dnm+yROfE0A,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A, linux-can-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	devicetree-u79uwXL29TY76Z2rM5mHXA, Kedareswara rao Appana
In-Reply-To: <54B4FCB5.6040904-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

On Tue, 2015-01-13 at 12:08PM +0100, Marc Kleine-Budde wrote:
> On 01/12/2015 07:45 PM, Sören Brinkmann wrote:
> > On Mon, 2015-01-12 at 08:34PM +0530, Kedareswara rao Appana wrote:
> >> Instead of enabling/disabling clocks at several locations in the driver,
> >> Use the runtime_pm framework. This consolidates the actions for runtime PM
> >> In the appropriate callbacks and makes the driver more readable and mantainable.
> >>
> >> Signed-off-by: Soren Brinkmann <soren.brinkmann-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>
> >> Signed-off-by: Kedareswara rao Appana <appanad-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>
> >> ---
> >> Changes for v5:
> >>  - Updated with the review comments.
> >>    Updated the remove fuction to use runtime_pm.
> >> Chnages for v4:
> >>  - Updated with the review comments.
> >> Changes for v3:
> >>   - Converted the driver to use runtime_pm.
> >> Changes for v2:
> >>   - Removed the struct platform_device* from suspend/resume
> >>     as suggest by Lothar.
> >>
> >>  drivers/net/can/xilinx_can.c |  157 ++++++++++++++++++++++++++++-------------
> >>  1 files changed, 107 insertions(+), 50 deletions(-)
> > [..]
> >> +static int __maybe_unused xcan_runtime_resume(struct device *dev)
> >>  {
> >> -	struct platform_device *pdev = dev_get_drvdata(dev);
> >> -	struct net_device *ndev = platform_get_drvdata(pdev);
> >> +	struct net_device *ndev = dev_get_drvdata(dev);
> >>  	struct xcan_priv *priv = netdev_priv(ndev);
> >>  	int ret;
> >> +	u32 isr, status;
> >>  
> >>  	ret = clk_enable(priv->bus_clk);
> >>  	if (ret) {
> >> @@ -1014,15 +1030,28 @@ static int __maybe_unused xcan_resume(struct device *dev)
> >>  	ret = clk_enable(priv->can_clk);
> >>  	if (ret) {
> >>  		dev_err(dev, "Cannot enable clock.\n");
> >> -		clk_disable_unprepare(priv->bus_clk);
> >> +		clk_disable(priv->bus_clk);
> > [...]
> >> @@ -1173,12 +1219,23 @@ static int xcan_remove(struct platform_device *pdev)
> >>  {
> >>  	struct net_device *ndev = platform_get_drvdata(pdev);
> >>  	struct xcan_priv *priv = netdev_priv(ndev);
> >> +	int ret;
> >> +
> >> +	ret = pm_runtime_get_sync(&pdev->dev);
> >> +	if (ret < 0) {
> >> +		netdev_err(ndev, "%s: pm_runtime_get failed(%d)\n",
> >> +				__func__, ret);
> >> +		return ret;
> >> +	}
> >>  
> >>  	if (set_reset_mode(ndev) < 0)
> >>  		netdev_err(ndev, "mode resetting failed!\n");
> >>  
> >>  	unregister_candev(ndev);
> >> +	pm_runtime_disable(&pdev->dev);
> >>  	netif_napi_del(&priv->napi);
> >> +	clk_disable_unprepare(priv->bus_clk);
> >> +	clk_disable_unprepare(priv->can_clk);
> > 
> > Shouldn't pretty much all these occurrences of clk_disable/enable
> > disappear? This should all be handled by the runtime_pm framework now.
> 
> We have:
> - clk_prepare_enable() in probe

This should become something like pm_runtime_get_sync(), shouldn't it?

> - clk_disable_unprepare() in remove

pm_runtime_put()

> - clk_enable() in runtime_resume
> - clk_disable() in runtime_suspend

These are the ones needed.

The above makes me suspect that the clocks are always on, regardless of
the runtime suspend state since they are enabled in probe and disabled
in remove, is that right? Ideally, the usage in probe and remove should
be migrated to runtime_pm and clocks should really only be running when
needed and not throughout the whole lifetime of the driver.

	Sören
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: why are IPv6 addresses removed on link down
From: Nicolas Dichtel @ 2015-01-13 17:05 UTC (permalink / raw)
  To: Hannes Frederic Sowa, David Ahern
  Cc: YOSHIFUJI Hideaki, Stephen Hemminger, netdev@vger.kernel.org
In-Reply-To: <1421161230.13626.30.camel@stressinduktion.org>

Le 13/01/2015 16:00, Hannes Frederic Sowa a écrit :
[snip]
>
> This was already discussed several times here, e.g. one patch I just
> found:
>
> http://lists.openwall.net/netdev/2011/01/24/8
> and
> http://patchwork.ozlabs.org/patch/17558/
>
> Albeit I hate sysctls for things like this, it might I tend to find it
> acceptable because it solves a problem which happened to lots of people.
> And I don't like the current behavior neither.
>
> I think this can work, but we should follow up all the old discussions
> to not introduce any kind of new undesired behavior this time.
If I remember well, one of the problem was that it was not possible anymore to
unload the IPv6 module after the patches. But since commit 8ce440610357 ("ipv6:
do not allow ipv6 module to be removed"), it's not possible anymore.

^ permalink raw reply

* Re: [PATCH net] gso: do GSO for local skb with size bigger than MTU
From: Thomas Graf @ 2015-01-13 16:58 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Fan Du, Michael S. Tsirkin, Du, Fan, davem@davemloft.net,
	Jason Wang, netdev@vger.kernel.org, fw@strlen.de,
	dev@openvswitch.org, pshelar@nicira.com, fuscof
In-Reply-To: <CAEP_g=_+sPkuib6CNsSEHB0txW+rEVzOVRXKfkYO=FLqR=8uFQ@mail.gmail.com>

On 01/12/15 at 10:55am, Jesse Gross wrote:
> There are at least two parts to this:
>  * Calculating the right MTU for the guest device.
>  * Transferring the MTU from the host to the guest.
> 
> The first would presumably involve exposing some kind of API that the
> component that does know the right value could program. In this case,
> that component could be OVS using the same type of information that
> you just described in the earlier post about L3. The API could simply
> to just set the MTU of the device in the host and this gets mirrored
> to the guest.
> 
> The second part I guess is probably a fairly straightforward extension
> to virtio but I don't know the details.

Francesco Fusco wrote code to do exactly this. Maybe he still has
it somewhere.

^ permalink raw reply

* BW regression after "tcp: refine TSO autosizing"
From: Eyal Perry @ 2015-01-13 16:48 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, Amir Vadai, yevgenyp, saeedm, idos, amira, eyalpe

Hello Eric,
Lately we've observed performance degradation in BW of about 30-40% (depends on
the setup we use).
I've bisected the issue down to the this commit: 605ad7f1 ("tcp: refine TSO
autosizing")

For instance, I was running the following test:
1. Bounding net device' irqs to core 0 for both client and server side
2. Running netperf with 64K massage size (used the following command)
$ netperf -H remote -T 1,1 -l 100 -t TCP_STREAM -- -k THROUGHPUT -M 65536 -m 65536

I ran the test on upstream net-next including your patch and than reverted it
and these are the results I got was improvement from 14.6Gbps to 22.1Gbps.

an additional difference I've noticed when inspecting the ethtool statics,
number of xmit_more packets increased from 4 to 160 with the reverted kernel.

We are investigating this issue, do you have a hint?

Best regards,
Eyal.

^ permalink raw reply

* [PATCHv2 net] xen-netfront: use different locks for Rx and Tx stats
From: David Vrabel @ 2015-01-13 16:42 UTC (permalink / raw)
  To: netdev; +Cc: David Vrabel, xen-devel, Konrad Rzeszutek Wilk, Boris Ostrovsky

In netfront the Rx and Tx path are independent and use different
locks.  The Tx lock is held with hard irqs disabled, but Rx lock is
held with only BH disabled.  Since both sides use the same stats lock,
a deadlock may occur.

  [ INFO: possible irq lock inversion dependency detected ]
  3.16.2 #16 Not tainted
  ---------------------------------------------------------
  swapper/0/0 just changed the state of lock:
   (&(&queue->tx_lock)->rlock){-.....}, at: [<c03adec8>]
  xennet_tx_interrupt+0x14/0x34
  but this lock took another, HARDIRQ-unsafe lock in the past:
   (&stat->syncp.seq#2){+.-...}
  and interrupts could create inverse lock ordering between them.
  other info that might help us debug this:
   Possible interrupt unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&stat->syncp.seq#2);
                                 local_irq_disable();
                                 lock(&(&queue->tx_lock)->rlock);
                                 lock(&stat->syncp.seq#2);
    <Interrupt>
      lock(&(&queue->tx_lock)->rlock);

Using separate locks for the Rx and Tx stats fixes this deadlock.

Reported-by: Dmitry Piotrovsky <piotrovskydmitry@gmail.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 drivers/net/xen-netfront.c |   71 ++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 22bcb4e..d8c1076 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -88,10 +88,8 @@ struct netfront_cb {
 #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3)
 
 struct netfront_stats {
-	u64			rx_packets;
-	u64			tx_packets;
-	u64			rx_bytes;
-	u64			tx_bytes;
+	u64			packets;
+	u64			bytes;
 	struct u64_stats_sync	syncp;
 };
 
@@ -160,7 +158,8 @@ struct netfront_info {
 	struct netfront_queue *queues;
 
 	/* Statistics */
-	struct netfront_stats __percpu *stats;
+	struct netfront_stats __percpu *rx_stats;
+	struct netfront_stats __percpu *tx_stats;
 
 	atomic_t rx_gso_checksum_fixup;
 };
@@ -565,7 +564,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned short id;
 	struct netfront_info *np = netdev_priv(dev);
-	struct netfront_stats *stats = this_cpu_ptr(np->stats);
+	struct netfront_stats *tx_stats = this_cpu_ptr(np->tx_stats);
 	struct xen_netif_tx_request *tx;
 	char *data = skb->data;
 	RING_IDX i;
@@ -672,10 +671,10 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (notify)
 		notify_remote_via_irq(queue->tx_irq);
 
-	u64_stats_update_begin(&stats->syncp);
-	stats->tx_bytes += skb->len;
-	stats->tx_packets++;
-	u64_stats_update_end(&stats->syncp);
+	u64_stats_update_begin(&tx_stats->syncp);
+	tx_stats->bytes += skb->len;
+	tx_stats->packets++;
+	u64_stats_update_end(&tx_stats->syncp);
 
 	/* Note: It is not safe to access skb after xennet_tx_buf_gc()! */
 	xennet_tx_buf_gc(queue);
@@ -931,7 +930,7 @@ static int checksum_setup(struct net_device *dev, struct sk_buff *skb)
 static int handle_incoming_queue(struct netfront_queue *queue,
 				 struct sk_buff_head *rxq)
 {
-	struct netfront_stats *stats = this_cpu_ptr(queue->info->stats);
+	struct netfront_stats *rx_stats = this_cpu_ptr(queue->info->rx_stats);
 	int packets_dropped = 0;
 	struct sk_buff *skb;
 
@@ -952,10 +951,10 @@ static int handle_incoming_queue(struct netfront_queue *queue,
 			continue;
 		}
 
-		u64_stats_update_begin(&stats->syncp);
-		stats->rx_packets++;
-		stats->rx_bytes += skb->len;
-		u64_stats_update_end(&stats->syncp);
+		u64_stats_update_begin(&rx_stats->syncp);
+		rx_stats->packets++;
+		rx_stats->bytes += skb->len;
+		u64_stats_update_end(&rx_stats->syncp);
 
 		/* Pass it up. */
 		napi_gro_receive(&queue->napi, skb);
@@ -1079,18 +1078,22 @@ static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct netfront_stats *stats = per_cpu_ptr(np->stats, cpu);
+		struct netfront_stats *rx_stats = per_cpu_ptr(np->rx_stats, cpu);
+		struct netfront_stats *tx_stats = per_cpu_ptr(np->tx_stats, cpu);
 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 		unsigned int start;
 
 		do {
-			start = u64_stats_fetch_begin_irq(&stats->syncp);
+			start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+			tx_packets = tx_stats->packets;
+			tx_bytes = tx_stats->bytes;
+		} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
 
-			rx_packets = stats->rx_packets;
-			tx_packets = stats->tx_packets;
-			rx_bytes = stats->rx_bytes;
-			tx_bytes = stats->tx_bytes;
-		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+		do {
+			start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+			rx_packets = rx_stats->packets;
+			rx_bytes = rx_stats->bytes;
+		} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
 
 		tot->rx_packets += rx_packets;
 		tot->tx_packets += tx_packets;
@@ -1275,6 +1278,15 @@ static const struct net_device_ops xennet_netdev_ops = {
 #endif
 };
 
+static void xennet_free_netdev(struct net_device *netdev)
+{
+	struct netfront_info *np = netdev_priv(netdev);
+
+	free_percpu(np->rx_stats);
+	free_percpu(np->tx_stats);
+	free_netdev(netdev);
+}
+
 static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 {
 	int err;
@@ -1295,8 +1307,11 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 	np->queues = NULL;
 
 	err = -ENOMEM;
-	np->stats = netdev_alloc_pcpu_stats(struct netfront_stats);
-	if (np->stats == NULL)
+	np->rx_stats = netdev_alloc_pcpu_stats(struct netfront_stats);
+	if (np->rx_stats == NULL)
+		goto exit;
+	np->tx_stats = netdev_alloc_pcpu_stats(struct netfront_stats);
+	if (np->tx_stats == NULL)
 		goto exit;
 
 	netdev->netdev_ops	= &xennet_netdev_ops;
@@ -1327,7 +1342,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev)
 	return netdev;
 
  exit:
-	free_netdev(netdev);
+	xennet_free_netdev(netdev);
 	return ERR_PTR(err);
 }
 
@@ -1369,7 +1384,7 @@ static int netfront_probe(struct xenbus_device *dev,
 	return 0;
 
  fail:
-	free_netdev(netdev);
+	xennet_free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 	return err;
 }
@@ -2189,9 +2204,7 @@ static int xennet_remove(struct xenbus_device *dev)
 		info->queues = NULL;
 	}
 
-	free_percpu(info->stats);
-
-	free_netdev(info->netdev);
+	xennet_free_netdev(info->netdev);
 
 	return 0;
 }
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH net-next v2 2/2] vxlan: Remote checksum offload
From: Thomas Graf @ 2015-01-13 16:40 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <20150113114443.GK20387@casper.infradead.org>

[Moving this discussion to the thread of the respective patch]

On 01/13/15 at 08:16am, Tom Herbert wrote:
> On 01/13/15 at 11:44am, Thomas Graf wrote:
> > The major difference here is that we have to consider backwards
> > compatibility specifically for VXLAN. Your initial feedback on GPE
> > actually led me to how I implemented GBP.
> >
> > I think the axioms we want to establish are as follows:
> >  1. Extensions need to be explicitly enabled by the user. A previously
> >     dropped frame should only be processed if the user explitly asks
> >     for it.
> >  2. As a consequence: only share a VLXAN UDP port if the enabled
> >     extensions match (vxlan_sock_add), e.g. user A might want RCO
> >     but user B might be unaware. They cannot share the same UDP port.
> >
> > The 2nd lead me to introduce the 'exts' member to vxlan_sock so we can
> > compare it in vxlan_find_sock() and only share a UDP port if the
> > enabled extensions match.
> >
> RCO is represented in the socket in VXLAN flags (VLXAN_F_*). My patch
> also adds a flags to vxlan_sock which contains the VLXAN flags. For
> shared port, I suspect all the receive features must match, including
> receive checksum settings for instance, but we don't care about
> transmit side. To facilitate this, I would suggest splitting flags
> into o_flags and i_flags like ip_tunnel does, and then compare i_flags
> in vxlan_find_sock.

Not sure I understand why you want to omit the transmit side. If a
VXLAN socket with RCO TX enabled is found and shared with a user
who does not want RCO enabled, it will get RCO enabled frames which
will get dropped by non RCO VXLAN receivers.

^ permalink raw reply

* Re: [PATCH] infiniband: mlx5: avoid a compile-time warning
From: Eli Cohen @ 2015-01-13 16:30 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Hal Rosenstock,
	Sean Hefty, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <2267004.49D0qFBpL1@wuerfel>

Acked-by: Eli Cohen <eli-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

On Tue, Jan 13, 2015 at 05:09:43PM +0100, Arnd Bergmann wrote:
> The return type of find_first_bit() is architecture specific,
> on ARM it is 'unsigned int', while the asm-generic code used
> on x86 and a lot of other architectures returns 'unsigned long'.
> 
> When building the mlx5 driver on ARM, we get a warning about
> this:
> 
> infiniband/hw/mlx5/mem.c: In function 'mlx5_ib_cont_pages':
> infiniband/hw/mlx5/mem.c:84:143: warning: comparison of distinct pointer types lacks a cast
>      m = min(m, find_first_bit(&tmp, sizeof(tmp)));
> 
> This patch changes the driver to use min_t to make it behave
> the same way on all architectures.
> 
> Signed-off-by: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
> 
> diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
> index b56e4c5593ee..611a9fdf2f38 100644
> --- a/drivers/infiniband/hw/mlx5/mem.c
> +++ b/drivers/infiniband/hw/mlx5/mem.c
> @@ -81,7 +81,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
>  		for (k = 0; k < len; k++) {
>  			if (!(i & mask)) {
>  				tmp = (unsigned long)pfn;
> -				m = min(m, find_first_bit(&tmp, sizeof(tmp)));
> +				m = min_t(unsigned long, m, find_first_bit(&tmp, sizeof(tmp)));
>  				skip = 1 << m;
>  				mask = skip - 1;
>  				base = pfn;
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] mlx5: avoid build warnings on 32-bit
From: Eli Cohen @ 2015-01-13 16:28 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4105686.Jz5UR0277i@wuerfel>

On Tue, Jan 13, 2015 at 05:08:06PM +0100, Arnd Bergmann wrote:

Hi Arnd,
wouldn't it work by casting to uintptr_t instead of unsigned long?

> The mlx5 driver passes a string pointer in through a 'u64' variable,
> which on 32-bit machines causes a build warning:
> 
> drivers/net/ethernet/mellanox/mlx5/core/debugfs.c: In function 'qp_read_field':
> drivers/net/ethernet/mellanox/mlx5/core/debugfs.c:303:11: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
> 
> The code is in fact safe, so we can shut up the warning by adding
> extra type casts.
> 
> Signed-off-by: Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
> index 10e1f1a18255..4878025e231c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
> @@ -300,11 +300,11 @@ static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
>  		param = qp->pid;
>  		break;
>  	case QP_STATE:
> -		param = (u64)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28);
> +		param = (unsigned long)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28);
>  		*is_str = 1;
>  		break;
>  	case QP_XPORT:
> -		param = (u64)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff);
> +		param = (unsigned long)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff);
>  		*is_str = 1;
>  		break;
>  	case QP_MTU:
> @@ -464,7 +464,7 @@ static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count,
>  
>  
>  	if (is_str)
> -		ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)field);
> +		ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)(unsigned long)field);
>  	else
>  		ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field);
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 5/5] openvswitch: Support VXLAN Group Policy extension
From: Thomas Graf @ 2015-01-13 16:20 UTC (permalink / raw)
  To: davem, jesse, stephen, pshelar, therbert, alexei.starovoitov,
	nicolas.dichtel
  Cc: netdev, dev
In-Reply-To: <cover.1421165078.git.tgraf@suug.ch>

Introduces support for the group policy extension to the VXLAN virtual
port. The extension is disabled by default and only enabled if the user
has provided the respective configuration.

  ovs-vsctl add-port br0 vxlan0 -- \
     set Interface vxlan0 type=vxlan options:exts=gbp

The configuration interface to enable the extension is based on a new
attribute OVS_VXLAN_EXT_GBP nested inside OVS_TUNNEL_ATTR_EXTENSION
which can carry additional extensions as needed in the future.

The group policy metadata is stored as binary blob (struct ovs_vxlan_opts)
internally just like Geneve options but transported as nested Netlink
attributes to user space.

Renames the existing TUNNEL_OPTIONS_PRESENT to TUNNEL_GENEVE_OPT with the
binary value kept intact, a new flag TUNNEL_VXLAN_OPT is introduced.

The attributes OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and existing
OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS are implemented mutually exclusive.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
v3->v4:
 - Fixed OVS_VXLAN_EXT_MAX->OVS_VXLAN_EXT_GBP typo as spotted by Jesse
 - Only applied tunnel options if they are of the right type as
   suggested by Jesse
v2->v3:
 - No change
v1->v2:
 - Addressed Jesse's request to transport VXLAN options as Netlink
   attributes instead of a binary blob. Allows a partial transport of
   VXLAN extensions. Internally, the datapath continues to use a binary
   blob (defined in vport-vxlan.h) for performance reasons.
 - Added new TUNNEL_GENEVE_OPT and TUNNEL_VXLAN_OPT flags to mark
   tunnel option flavour
 - Correctly report VXLAN options to user space

 include/net/ip_tunnels.h         |   5 +-
 include/uapi/linux/openvswitch.h |  11 ++++
 net/openvswitch/flow_netlink.c   | 114 ++++++++++++++++++++++++++++++++++-----
 net/openvswitch/vport-geneve.c   |  15 ++++--
 net/openvswitch/vport-vxlan.c    |  82 +++++++++++++++++++++++++++-
 net/openvswitch/vport-vxlan.h    |  11 ++++
 6 files changed, 218 insertions(+), 20 deletions(-)
 create mode 100644 net/openvswitch/vport-vxlan.h

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 25a59eb..ce4db3c 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -97,7 +97,10 @@ struct ip_tunnel {
 #define TUNNEL_DONT_FRAGMENT    __cpu_to_be16(0x0100)
 #define TUNNEL_OAM		__cpu_to_be16(0x0200)
 #define TUNNEL_CRIT_OPT		__cpu_to_be16(0x0400)
-#define TUNNEL_OPTIONS_PRESENT	__cpu_to_be16(0x0800)
+#define TUNNEL_GENEVE_OPT	__cpu_to_be16(0x0800)
+#define TUNNEL_VXLAN_OPT	__cpu_to_be16(0x1000)
+
+#define TUNNEL_OPTIONS_PRESENT	(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
 
 struct tnl_ptk_info {
 	__be16 flags;
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 3a6dcaa..e474c95 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -248,11 +248,21 @@ enum ovs_vport_attr {
 
 #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
 
+enum {
+	OVS_VXLAN_EXT_UNSPEC,
+	OVS_VXLAN_EXT_GBP,	/* Flag or __u32 */
+	__OVS_VXLAN_EXT_MAX,
+};
+
+#define OVS_VXLAN_EXT_MAX (__OVS_VXLAN_EXT_MAX - 1)
+
+
 /* OVS_VPORT_ATTR_OPTIONS attributes for tunnels.
  */
 enum {
 	OVS_TUNNEL_ATTR_UNSPEC,
 	OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */
+	OVS_TUNNEL_ATTR_EXTENSION,
 	__OVS_TUNNEL_ATTR_MAX
 };
 
@@ -324,6 +334,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,        /* Array of Geneve options. */
 	OVS_TUNNEL_KEY_ATTR_TP_SRC,		/* be16 src Transport Port. */
 	OVS_TUNNEL_KEY_ATTR_TP_DST,		/* be16 dst Transport Port. */
+	OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS,		/* Nested OVS_VXLAN_EXT_* */
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 518941c..d210d1b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -49,6 +49,7 @@
 #include <net/mpls.h>
 
 #include "flow_netlink.h"
+#include "vport-vxlan.h"
 
 struct ovs_len_tbl {
 	int len;
@@ -268,6 +269,9 @@ size_t ovs_tun_key_attr_size(void)
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_CSUM */
 		+ nla_total_size(0)    /* OVS_TUNNEL_KEY_ATTR_OAM */
 		+ nla_total_size(256)  /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */
+		/* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with
+		 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
+		 */
 		+ nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
 		+ nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
 }
@@ -308,6 +312,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
 	[OVS_TUNNEL_KEY_ATTR_TP_DST]	    = { .len = sizeof(u16) },
 	[OVS_TUNNEL_KEY_ATTR_OAM]	    = { .len = 0 },
 	[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS]   = { .len = OVS_ATTR_NESTED },
+	[OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS]    = { .len = OVS_ATTR_NESTED },
 };
 
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
@@ -460,6 +465,41 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a,
 	return 0;
 }
 
+static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = {
+	[OVS_VXLAN_EXT_GBP]	= { .type = NLA_U32 },
+};
+
+static int vxlan_tun_opt_from_nlattr(const struct nlattr *a,
+				     struct sw_flow_match *match, bool is_mask,
+				     bool log)
+{
+	struct nlattr *tb[OVS_VXLAN_EXT_MAX+1];
+	unsigned long opt_key_offset;
+	struct ovs_vxlan_opts opts;
+	int err;
+
+	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+	err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy);
+	if (err < 0)
+		return err;
+
+	memset(&opts, 0, sizeof(opts));
+
+	if (tb[OVS_VXLAN_EXT_GBP])
+		opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]);
+
+	if (!is_mask)
+		SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false);
+	else
+		SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true);
+
+	opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+	SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+				  is_mask);
+	return 0;
+}
+
 static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 				struct sw_flow_match *match, bool is_mask,
 				bool log)
@@ -468,6 +508,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 	int rem;
 	bool ttl = false;
 	__be16 tun_flags = 0;
+	int opts_type = 0;
 
 	nla_for_each_nested(a, attr, rem) {
 		int type = nla_type(a);
@@ -527,11 +568,30 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 			tun_flags |= TUNNEL_OAM;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+			if (opts_type) {
+				OVS_NLERR(log, "Multiple metadata blocks provided");
+				return -EINVAL;
+			}
+
 			err = genev_tun_opt_from_nlattr(a, match, is_mask, log);
 			if (err)
 				return err;
 
-			tun_flags |= TUNNEL_OPTIONS_PRESENT;
+			tun_flags |= TUNNEL_GENEVE_OPT;
+			opts_type = type;
+			break;
+		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+			if (opts_type) {
+				OVS_NLERR(log, "Multiple metadata blocks provided");
+				return -EINVAL;
+			}
+
+			err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log);
+			if (err)
+				return err;
+
+			tun_flags |= TUNNEL_VXLAN_OPT;
+			opts_type = type;
 			break;
 		default:
 			OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d",
@@ -560,6 +620,23 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
 		}
 	}
 
+	return opts_type;
+}
+
+static int vxlan_opt_to_nlattr(struct sk_buff *skb,
+			       const void *tun_opts, int swkey_tun_opts_len)
+{
+	const struct ovs_vxlan_opts *opts = tun_opts;
+	struct nlattr *nla;
+
+	nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
+	if (!nla)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0)
+		return -EMSGSIZE;
+
+	nla_nest_end(skb, nla);
 	return 0;
 }
 
@@ -596,10 +673,15 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if ((output->tun_flags & TUNNEL_OAM) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
-	if (tun_opts &&
-	    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
-		    swkey_tun_opts_len, tun_opts))
-		return -EMSGSIZE;
+	if (tun_opts) {
+		if (output->tun_flags & TUNNEL_GENEVE_OPT &&
+		    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
+			    swkey_tun_opts_len, tun_opts))
+			return -EMSGSIZE;
+		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
+			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
+			return -EMSGSIZE;
+	}
 
 	return 0;
 }
@@ -680,7 +762,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
 	}
 	if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
 		if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
-					 is_mask, log))
+					 is_mask, log) < 0)
 			return -EINVAL;
 		*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
 	}
@@ -1578,17 +1660,23 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	struct sw_flow_key key;
 	struct ovs_tunnel_info *tun_info;
 	struct nlattr *a;
-	int err, start;
+	int err, start, opts_type;
 
 	ovs_match_init(&match, &key, NULL);
-	err = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
-	if (err)
-		return err;
+	opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
+	if (opts_type < 0)
+		return opts_type;
 
 	if (key.tun_opts_len) {
-		err = validate_geneve_opts(&key);
-		if (err < 0)
-			return err;
+		switch (opts_type) {
+		case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
+			err = validate_geneve_opts(&key);
+			if (err < 0)
+				return err;
+			break;
+		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+			break;
+		}
 	};
 
 	start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log);
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 484864d..f01f3f8 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -90,7 +90,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
 
 	opts_len = geneveh->opt_len * 4;
 
-	flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT |
+	flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
 		(udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
 		(geneveh->oam ? TUNNEL_OAM : 0) |
 		(geneveh->critical ? TUNNEL_CRIT_OPT : 0);
@@ -180,7 +180,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	__be16 sport;
 	struct rtable *rt;
 	struct flowi4 fl;
-	u8 vni[3];
+	u8 vni[3], opts_len, *opts;
 	__be16 df;
 	int err;
 
@@ -211,11 +211,18 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	tunnel_id_to_vni(tun_key->tun_id, vni);
 	skb->ignore_df = 1;
 
+	if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) {
+		opts = (u8 *)tun_info->options;
+		opts_len = tun_info->options_len;
+	} else {
+		opts = NULL;
+		opts_len = 0;
+	}
+
 	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
 			      tun_key->ipv4_dst, tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl, df, sport, dport,
-			      tun_key->tun_flags, vni,
-			      tun_info->options_len, (u8 *)tun_info->options,
+			      tun_key->tun_flags, vni, opts_len, opts,
 			      false);
 	if (err < 0)
 		ip_rt_put(rt);
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 40a16fb..9f47c23 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -40,6 +40,7 @@
 
 #include "datapath.h"
 #include "vport.h"
+#include "vport-vxlan.h"
 
 /**
  * struct vxlan_port - Keeps track of open UDP ports
@@ -49,6 +50,7 @@
 struct vxlan_port {
 	struct vxlan_sock *vs;
 	char name[IFNAMSIZ];
+	u32 exts; /* VXLAN_EXT_* in <net/vxlan.h> */
 };
 
 static struct vport_ops ovs_vxlan_vport_ops;
@@ -63,16 +65,26 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 		      struct vxlan_metadata *md)
 {
 	struct ovs_tunnel_info tun_info;
+	struct vxlan_port *vxlan_port;
 	struct vport *vport = vs->data;
 	struct iphdr *iph;
+	struct ovs_vxlan_opts opts = {
+		.gbp = md->gbp,
+	};
 	__be64 key;
+	__be16 flags;
+
+	flags = TUNNEL_KEY;
+	vxlan_port = vxlan_vport(vport);
+	if (vxlan_port->exts & VXLAN_EXT_GBP)
+		flags |= TUNNEL_VXLAN_OPT;
 
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
 	key = cpu_to_be64(ntohl(md->vni) >> 8);
 	ovs_flow_tun_info_init(&tun_info, iph,
 			       udp_hdr(skb)->source, udp_hdr(skb)->dest,
-			       key, TUNNEL_KEY, NULL, 0);
+			       key, flags, &opts, sizeof(opts));
 
 	ovs_vport_receive(vport, skb, &tun_info);
 }
@@ -84,6 +96,21 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
 
 	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
 		return -EMSGSIZE;
+
+	if (vxlan_port->exts) {
+		struct nlattr *exts;
+
+		exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
+		if (!exts)
+			return -EMSGSIZE;
+
+		if (vxlan_port->exts & VXLAN_EXT_GBP &&
+		    nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
+			return -EMSGSIZE;
+
+		nla_nest_end(skb, exts);
+	}
+
 	return 0;
 }
 
@@ -96,6 +123,31 @@ static void vxlan_tnl_destroy(struct vport *vport)
 	ovs_vport_deferred_free(vport);
 }
 
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+	[OVS_VXLAN_EXT_GBP]	= { .type = NLA_FLAG, },
+};
+
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+{
+	struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
+	struct vxlan_port *vxlan_port;
+	int err;
+
+	if (nla_len(attr) < sizeof(struct nlattr))
+		return -EINVAL;
+
+	err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+	if (err < 0)
+		return err;
+
+	vxlan_port = vxlan_vport(vport);
+
+	if (exts[OVS_VXLAN_EXT_GBP])
+		vxlan_port->exts |= VXLAN_EXT_GBP;
+
+	return 0;
+}
+
 static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
 {
 	struct net *net = ovs_dp_get_net(parms->dp);
@@ -128,7 +180,17 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
 	vxlan_port = vxlan_vport(vport);
 	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
 
-	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0, 0);
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
+	if (a) {
+		err = vxlan_configure_exts(vport, a);
+		if (err) {
+			ovs_vport_free(vport);
+			goto error;
+		}
+	}
+
+	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, 0,
+			    vxlan_port->exts);
 	if (IS_ERR(vs)) {
 		ovs_vport_free(vport);
 		return (void *)vs;
@@ -141,6 +203,21 @@ error:
 	return ERR_PTR(err);
 }
 
+static int vxlan_ext_gbp(struct sk_buff *skb)
+{
+	const struct ovs_tunnel_info *tun_info;
+	const struct ovs_vxlan_opts *opts;
+
+	tun_info = OVS_CB(skb)->egress_tun_info;
+	opts = tun_info->options;
+
+	if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
+	    tun_info->options_len >= sizeof(*opts))
+		return opts->gbp;
+	else
+		return 0;
+}
+
 static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 {
 	struct net *net = ovs_dp_get_net(vport->dp);
@@ -181,6 +258,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	src_port = udp_flow_src_port(net, skb, 0, 0, true);
 	md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
+	md.gbp = vxlan_ext_gbp(skb);
 
 	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
 			     fl.saddr, tun_key->ipv4_dst,
diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h
new file mode 100644
index 0000000..4b08233e
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.h
@@ -0,0 +1,11 @@
+#ifndef VPORT_VXLAN_H
+#define VPORT_VXLAN_H 1
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+struct ovs_vxlan_opts {
+	__u32 gbp;
+};
+
+#endif
-- 
1.9.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox