Netdev List
 help / color / mirror / Atom feed
* Re: Suspend/resume - slow resume
From: Ciprian Docan @ 2011-04-21  2:07 UTC (permalink / raw)
  To: Francois Romieu
  Cc: Linus Torvalds, netdev, Linux Kernel Mailing List, Len Brown,
	Pavel Machek, Rafael, J. Wysocki, Greg KH, nic_swsd
In-Reply-To: <20110420195330.GB18897@electric-eye.fr.zoreil.com>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 316 bytes --]


Hello Francois,

>> I tried to unload the module, but it crashed. I re-booted the
>> machine and tried to unload the module again. I got the output from
>> dmesg; please see attached.
>
> Hmmm...

I adapted your patch and fixed the module unload problem; please let me 
know what you think. Thank you.

--
 	Ciprian

[-- Attachment #2: Type: TEXT/PLAIN, Size: 5545 bytes --]

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 493b0de..397c368 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -170,6 +170,16 @@ static const struct {
 };
 #undef _R
 
+static const struct rtl_firmware_info {
+	int mac_version;
+	const char *fw_name;
+} rtl_firmware_infos[] = {
+	{ .mac_version = RTL_GIGA_MAC_VER_25, .fw_name = FIRMWARE_8168D_1 },
+	{ .mac_version = RTL_GIGA_MAC_VER_26, .fw_name = FIRMWARE_8168D_2 },
+	{ .mac_version = RTL_GIGA_MAC_VER_29, .fw_name = FIRMWARE_8105E_1 },
+	{ .mac_version = RTL_GIGA_MAC_VER_30, .fw_name = FIRMWARE_8105E_1 }
+};
+
 enum cfg_version {
 	RTL_CFG_0 = 0x00,
 	RTL_CFG_1,
@@ -565,6 +575,7 @@ struct rtl8169_private {
 	u32 saved_wolopts;
 
 	const struct firmware *fw;
+#define RTL_FIRMWARE_UNKNOWN	ERR_PTR(-EAGAIN);
 };
 
 MODULE_AUTHOR("Realtek and the Linux r8169 crew <netdev@vger.kernel.org>");
@@ -1789,25 +1800,26 @@ rtl_phy_write_fw(struct rtl8169_private *tp, const struct firmware *fw)
 
 static void rtl_release_firmware(struct rtl8169_private *tp)
 {
-	release_firmware(tp->fw);
-	tp->fw = NULL;
+	if (!IS_ERR_OR_NULL(tp->fw))
+		release_firmware(tp->fw);
+	tp->fw = RTL_FIRMWARE_UNKNOWN;
 }
 
-static int rtl_apply_firmware(struct rtl8169_private *tp, const char *fw_name)
+static void rtl_apply_firmware(struct rtl8169_private *tp)
 {
-	const struct firmware **fw = &tp->fw;
-	int rc = !*fw;
-
-	if (rc) {
-		rc = request_firmware(fw, fw_name, &tp->pci_dev->dev);
-		if (rc < 0)
-			goto out;
-	}
+	const struct firmware *fw = tp->fw;
 
 	/* TODO: release firmware once rtl_phy_write_fw signals failures. */
-	rtl_phy_write_fw(tp, *fw);
-out:
-	return rc;
+	if (!IS_ERR_OR_NULL(fw))
+		rtl_phy_write_fw(tp, fw);
+}
+
+static void rtl_apply_firmware_cond(struct rtl8169_private *tp, u8 reg, u16 val)
+{
+	if (rtl_readphy(tp, reg) != val)
+		netif_warn(tp, hw, tp->dev, "chipset not ready for firmware\n");
+	else
+		rtl_apply_firmware(tp);
 }
 
 static void rtl8169s_hw_phy_config(struct rtl8169_private *tp)
@@ -2246,10 +2258,8 @@ static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp)
 
 	rtl_writephy(tp, 0x1f, 0x0005);
 	rtl_writephy(tp, 0x05, 0x001b);
-	if ((rtl_readphy(tp, 0x06) != 0xbf00) ||
-	    (rtl_apply_firmware(tp, FIRMWARE_8168D_1) < 0)) {
-		netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
-	}
+
+	rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xbf00);
 
 	rtl_writephy(tp, 0x1f, 0x0000);
 }
@@ -2351,10 +2361,8 @@ static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp)
 
 	rtl_writephy(tp, 0x1f, 0x0005);
 	rtl_writephy(tp, 0x05, 0x001b);
-	if ((rtl_readphy(tp, 0x06) != 0xb300) ||
-	    (rtl_apply_firmware(tp, FIRMWARE_8168D_2) < 0)) {
-		netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
-	}
+
+	rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xb300);
 
 	rtl_writephy(tp, 0x1f, 0x0000);
 }
@@ -2474,8 +2482,7 @@ static void rtl8105e_hw_phy_config(struct rtl8169_private *tp)
 	rtl_writephy(tp, 0x18, 0x0310);
 	msleep(100);
 
-	if (rtl_apply_firmware(tp, FIRMWARE_8105E_1) < 0)
-		netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
+	rtl_apply_firmware(tp);
 
 	rtl_writephy_batch(tp, phy_reg_init, ARRAY_SIZE(phy_reg_init));
 }
@@ -3237,6 +3244,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	tp->timer.data = (unsigned long) dev;
 	tp->timer.function = rtl8169_phy_timer;
 
+	tp->fw = RTL_FIRMWARE_UNKNOWN;
+
 	rc = register_netdev(dev);
 	if (rc < 0)
 		goto err_out_msi_4;
@@ -3288,10 +3297,10 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
 
 	cancel_delayed_work_sync(&tp->task);
 
-	rtl_release_firmware(tp);
-
 	unregister_netdev(dev);
 
+	rtl_release_firmware(tp);
+
 	if (pci_dev_run_wake(pdev))
 		pm_runtime_get_noresume(&pdev->dev);
 
@@ -3303,6 +3312,37 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
 	pci_set_drvdata(pdev, NULL);
 }
 
+static void rtl_request_firmware(struct rtl8169_private *tp)
+{
+	int i;
+
+	/* Return early if the firmware is already loaded / cached. */
+	if (!IS_ERR(tp->fw))
+		goto out;
+
+	for (i = 0; i < ARRAY_SIZE(rtl_firmware_infos); i++) {
+		const struct rtl_firmware_info *info = rtl_firmware_infos + i;
+
+		if (info->mac_version == tp->mac_version) {
+			const char *name = info->fw_name;
+			int rc;
+
+			rc = request_firmware(&tp->fw, name, &tp->pci_dev->dev);
+			if (rc < 0) {
+				netif_warn(tp, ifup, tp->dev, "unable to load "
+					"firmware patch %s (%d)\n", name, rc);
+				goto out_disable_request_firmware;
+			}
+			goto out;
+		}
+	}
+
+out_disable_request_firmware:
+	tp->fw = NULL;
+out:
+	return;
+}
+
 static int rtl8169_open(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
@@ -3334,11 +3374,13 @@ static int rtl8169_open(struct net_device *dev)
 
 	smp_mb();
 
+	rtl_request_firmware(tp);
+
 	retval = request_irq(dev->irq, rtl8169_interrupt,
 			     (tp->features & RTL_FEATURE_MSI) ? 0 : IRQF_SHARED,
 			     dev->name, dev);
 	if (retval < 0)
-		goto err_release_ring_2;
+		goto err_release_fw_2;
 
 	napi_enable(&tp->napi);
 
@@ -3359,7 +3401,8 @@ static int rtl8169_open(struct net_device *dev)
 out:
 	return retval;
 
-err_release_ring_2:
+err_release_fw_2:
+	rtl_release_firmware(tp);
 	rtl8169_rx_clear(tp);
 err_free_rx_1:
 	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,

^ permalink raw reply related

* [PATCH net-next] net: Allow ethtool to set interface in loopback mode.
From: Mahesh Bandewar @ 2011-04-21  0:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Mahesh Bandewar

This patch enables ethtool to set the loopback mode on a given interface.
By configuring the interface in loopback mode in conjunction with a policy
route / rule, a userland application can stress the egress / ingress path
exposing the flows of the change in progress and potentially help developer(s)
understand the impact of those changes without even sending a packet out
on the network.

Following set of commands illustrates one such example -
    a) ip -4 addr add 192.168.1.1/24 dev eth1
    b) ip -4 rule add from all iif eth1 lookup 250
    c) ip -4 route add local 0/0 dev lo proto kernel scope host table 250
    d) arp -Ds 192.168.1.100 eth1
    e) arp -Ds 192.168.1.200 eth1
    f) sysctl -w net.ipv4.ip_nonlocal_bind=1
    g) sysctl -w net.ipv4.conf.all.accept_local=1
    # Assuming that the machine has 8 cores
    h) taskset 000f netserver -L 192.168.1.200
    i) taskset 00f0 netperf -t TCP_CRR -L 192.168.1.100 -H 192.168.1.200 -l 30

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 include/linux/netdevice.h |    3 ++-
 net/core/ethtool.c        |    2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb8178a..c4d9dd7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1067,6 +1067,7 @@ struct net_device {
 #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
 #define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
 #define NETIF_F_NOCACHE_COPY	(1 << 30) /* Use no-cache copyfromuser */
+#define NETIF_F_LOOPBACK	(1 << 31) /* Enable loopback */
 
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -1082,7 +1083,7 @@ struct net_device {
 	/* = all defined minus driver/device-class-related */
 #define NETIF_F_NEVER_CHANGE	(NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
 				  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
-#define NETIF_F_ETHTOOL_BITS	(0x7f3fffff & ~NETIF_F_NEVER_CHANGE)
+#define NETIF_F_ETHTOOL_BITS	(0xff3fffff & ~NETIF_F_NEVER_CHANGE)
 
 	/* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 13d79f5..d177ce9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -362,7 +362,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
 	/* NETIF_F_RXHASH */          "rx-hashing",
 	/* NETIF_F_RXCSUM */          "rx-checksum",
 	/* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy"
-	"",
+	/* NETIF_F_LOOPBACK */        "loopback",
 };
 
 static int __ethtool_get_sset_count(struct net_device *dev, int sset)
-- 
1.7.3.1


^ permalink raw reply related

* Re: [PATCH] net: tun: convert to hw_features
From: Michał Mirosław @ 2011-04-20 23:15 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev
In-Reply-To: <871v0xip3j.fsf@rustcorp.com.au>

On Wed, Apr 20, 2011 at 12:52:24PM +0930, Rusty Russell wrote:
> On Tue, 19 Apr 2011 18:13:10 +0200 (CEST), Michał Mirosław <mirq-linux@rere.qmqm.pl> wrote:
> > This changes offload setting behaviour to what I think is correct:
> >  - offloads set via ethtool mean what admin wants to use (by default
> >    he wants 'em all)
> >  - offloads set via ioctl() mean what userspace is expecting to get
> >    (this limits which admin wishes are granted)
> >  - TUN_NOCHECKSUM is ignored, as it might cause broken packets when
> >    forwarded (ip_summed == CHECKSUM_UNNECESSARY means that checksum
> >    was verified, not that it can be ignored)
> > 
> > If TUN_NOCHECKSUM is implemented, it should set skb->csum_* and
> > skb->ip_summed (= CHECKSUM_PARTIAL) for known protocols and let others
> > be verified by kernel when necessary.
> > 
> > TUN_NOCHECKSUM handling was introduced by commit
> > f43798c27684ab925adde7d8acc34c78c6e50df8:
> > 
> >     tun: Allow GSO using virtio_net_hdr
> Err, not in my git tree!  It predates git in fact.

Ah! Sorry! Your patch just moved the buggy line around.

> Since tap requires privs, I wouldn't worry about invalid packets too
> much.

I prefer correctness over uncertainty. ;)

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH] net: tun: convert to hw_features
From: Rusty Russell @ 2011-04-20  3:22 UTC (permalink / raw)
  To: Michał Mirosław, netdev
In-Reply-To: <20110419161310.7508513909@rere.qmqm.pl>

On Tue, 19 Apr 2011 18:13:10 +0200 (CEST), Michał Mirosław <mirq-linux@rere.qmqm.pl> wrote:
> This changes offload setting behaviour to what I think is correct:
>  - offloads set via ethtool mean what admin wants to use (by default
>    he wants 'em all)
>  - offloads set via ioctl() mean what userspace is expecting to get
>    (this limits which admin wishes are granted)
>  - TUN_NOCHECKSUM is ignored, as it might cause broken packets when
>    forwarded (ip_summed == CHECKSUM_UNNECESSARY means that checksum
>    was verified, not that it can be ignored)
> 
> If TUN_NOCHECKSUM is implemented, it should set skb->csum_* and
> skb->ip_summed (= CHECKSUM_PARTIAL) for known protocols and let others
> be verified by kernel when necessary.
> 
> TUN_NOCHECKSUM handling was introduced by commit
> f43798c27684ab925adde7d8acc34c78c6e50df8:
> 
>     tun: Allow GSO using virtio_net_hdr

Err, not in my git tree!  It predates git in fact.

Since tap requires privs, I wouldn't worry about invalid packets too
much.

Thanks,
Rusty.

^ permalink raw reply

* Re: r8169 doesn't report link state correctly.
From: Ben Greear @ 2011-04-20 21:56 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev
In-Reply-To: <20110420191403.GC18805@electric-eye.fr.zoreil.com>

On 04/20/2011 12:14 PM, Francois Romieu wrote:
> François Romieu<romieu@fr.zoreil.com>  :
>> On Mon, Apr 11, 2011 at 01:09:19PM -0700, Ben Greear wrote:
>>> I notice that in kernel 2.6.38-wl, the realtek 8169 NIC doesn't
>>> report link down when in fact there is no cable connected.  Instead,
> [...]
>> Thanks for the report. I'll try it tomorrow or friday.
>
> I have not been able to notice it with a current kernel.
>
> I'd welcome the XID of the 8169 NIC (see dmesg) and a short explanation
> (no cable from boot ? cable removed after ifconfig up ? brand / ability
> of the switch / hub ?).

Well, as luck would have it, my system will boot today's upstream
kernel (39-rc4+).  And, I no longer see the problem in that release,
so it seems it is fixed (or harder to reproduce that I thought).

Basically, I was seeing it claim to have link in 'ethtool' output
when there was no cable connected.  It did go to 10Mbps/half duplex
link speed when un-plugged.  It showed full 1Gbps link when plugged
in.

I'll let you know if I see this again.

Thanks,
Ben

>
> Thanks.
>


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply

* Re: [v2 PATCH 0/6] IPVS: init and cleanup.
From: Julian Anastasov @ 2011-04-20 21:36 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: horms, ebiederm, lvs-devel, netdev, netfilter-devel,
	hans.schillstrom
In-Reply-To: <1303324434-14024-1-git-send-email-hans@schillstrom.com>


	Hello,

On Wed, 20 Apr 2011, Hans Schillstrom wrote:

> This patch series handles exit from a network name space.
> 
> REVISION
> 
> This is version 2
> 
> OVERVIEW
> Basically there was three faults in the netns implementation.
> - Kernel threads hold devices and preventing an exit.
> - dst cache holds references to devices.
> - Services was not always released.
> 
> Patch 1 & 3 contains the functionality
>       4 renames funcctions
>       5 removes empty functions
>       6 Debuging.
> 
> IMPLEMENTATION
> - Avoid to increment the usage counter for kernel threads.
>   this is done in the first patch.
> - Patch 3 tries to restore the cleanup order.
>   Add NETDEV_UNREGISTER notification for dst_reset
> 
> >From Julian -
> "For __ip_vs_service_cleanup: it still has to use mutex.
> Or we can avoid it by introducing ip_vs_unlink_service_nolock:
> ip_vs_flush will look like your __ip_vs_service_cleanup and
> will call ip_vs_unlink_service_nolock. ip_vs_unlink_service_nolock
> will be called by ip_vs_flush and by ip_vs_unlink_service."
> 
> I will give above another try later on see if I can get it to work.
> Right now ip_vs_service_net_cleanup() seems to work.
> 
> 
> An netns exit could look like this
> IPVS: Enter: __ip_vs_dev_cleanup, net/netfilter/ipvs/ip_vs_core.c
> IPVS: stopping master sync thread 1286 ...
> IPVS: stopping backup sync thread 1294 ...
> IPVS: Leave: __ip_vs_dev_cleanup, net/netfilter/ipvs/ip_vs_core.c
> IPVS: Enter: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Leave: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> ...
> IPVS: Enter: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Leave: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Enter: ip_vs_service_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: __ip_vs_del_service: enter
> IPVS: Moving dest 192.168.1.6:0 into trash, dest->refcnt=43450
> ...
> IPVS: Moving dest 192.168.1.3:0 into trash, dest->refcnt=43449
> IPVS: __ip_vs_del_service: enter
> IPVS: Removing destination 0/[2003:0000:0000:0000:0000:0002:0000:0006]:80
> ...
> IPVS: Removing destination 0/[2003:0000:0000:0000:0000:0002:0000:0003]:80
> IPVS: Removing service 0/[2003:0000:0000:0000:0000:0002:0004:0100]:80 usecnt=0
> IPVS: Leave: ip_vs_service_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: Enter: ip_vs_control_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: Removing service 80/0.0.0.0:0 usecnt=0
> IPVS: Leave: ip_vs_control_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: ipvs netns 8 released

	Notes only about PATCH 3/6:

- I was worried that throttle was used during cleanup
with the idea to stop traffic. But as now it is only an
optimization to avoid traffic to spend cycles in IPVS
code I'll suggest to use it without atomic operations.
It is not fatal if some packet does not see the disabling
immediately. So, may be using 'if (net_ipvs(net)->enable)'
is enough.

As result, there is no need to disable traffic in
__ip_vs_dev_cleanup, it is going to stop during
_device cleanup anyways.

- Please move check for 'enable' in ip_vs_in() before
ip_vs_fill_iphdr, this is the preferred place:

+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

Note that only ip_vs_in, ip_vs_out and ip_vs_forward_icmp*
need check for 'enable'. Then we can remove them from
ip_vs_in_icmp*

- As device can change its name space it was mandatory we
to add support for NETDEV_UNREGISTER. As we are using
_subsys pernet operations I expect we to see NETDEV_UNREGISTER
while the _device pernet methods are called, so before
our _subsys cleanup.

We can see in log that ip_vs_dst_event() is called before
ip_vs_service_net_cleanup, i.e. before our subsys cleanup.
net/core/dev.c uses register_pernet_device(&default_device_ops)
to call NETDEV_UNREGISTER* during _device cleanup and
to move devices to init_net. So, it should be safe
to rely on NETDEV_UNREGISTER event while we are subsys.

- __ip_vs_service_cleanup needs to call ip_vs_flush after
converting to ip_vs_unlink_service_nolock.

- For ip_vs_dst_event: I prefer to put everything in this
function, the ip_vs_svc_reset is not needed (name is not
good too). For example:

	struct ip_vs_dest *dest;
	unsigned int hash;

	mutex_lock(&__ip_vs_mutex);
	/* No need to use rs_lock, the mutex protects the list */
	for (hash = 0; hash < IP_VS_RTAB_SIZE; hash++) {
		list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
			__ip_vs_dev_reset(dest, dev);
		}
	}

	/* The mutex protects the trash list */
	list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
		__ip_vs_dev_reset(dest, dev);
	}

	mutex_unlock(&__ip_vs_mutex);

	No need to use __ip_vs_svc_lock or rs_lock because we
do not change the lists, __ip_vs_dev_reset has the needed
dst_cache locking (dst_lock). I assume we can safely use our
__ip_vs_mutex from netdevice notifier.

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCHv4] usbnet: Resubmit interrupt URB once if halted
From: Paul Stewart @ 2011-04-20 21:17 UTC (permalink / raw)
  To: Alan Stern
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, greg-U8xfFu+wG4EAvxtiuMwx3w
In-Reply-To: <Pine.LNX.4.44L0.1104201658280.1686-100000-IYeN2dnnYyZXsRXLowluHWD2FQJk+8+b@public.gmane.org>

On Wed, Apr 20, 2011 at 2:08 PM, Alan Stern <stern-nwvwT67g6+6dFdvTe/nMLpVzexx5G7lz@public.gmane.org> wrote:
> On Tue, 19 Apr 2011, Paul Stewart wrote:
>
>> Set a flag if the interrupt URB completes with ENOENT as this
>> occurs legitimately during system suspend.  When the usbnet_bh
>> is called after resume, test this flag and try once to resubmit
>> the interrupt URB.
>
> No doubt there's a good reason for doing things this way, but it isn't
> clear.  Why wait until usbnet_bh() is called after resume?  Why not
> resubmit the interrupt URB _during_ usbnet_resume()?

Actually, I was doing this in the bh because of feedback I had gained
early in this process about not doing submit_urb in the resume().  If
that issue doesn't exist, that makes my work a lot easier.  In testing
I found that just setting this to happen in the bh might be problematic
due to firing too early, so this is good news.

> This would seem
> to be the logical approach, seeing as how usbnet_suspend() kills the
> interrupt URB.

Aha!  But you'll see from the current version of my patch that we don't
actually ever kill the interrupt URB.  It gets killed all on its own (by the
hcd?) and handed back to us in intr_complete().  This last bit about the
complete function being called was lost on me for a while which is why
in a previous iteration of the patch I was trying to kill the urb in suspend().

> For that matter, where does the interrupt URB get deallocated?  It
> obviously gets allocated in init_status(), but it doesn't seem to be
> freed anywhere.

That's a very interesting question and one that I noticed as well.
I was going to tackle that as a separate issue.

>
> Alan Stern
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Shirley Ma @ 2011-04-20 21:15 UTC (permalink / raw)
  To: Dimitris Michailidis
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <1303333134.19336.70.camel@localhost.localdomain>

On Wed, 2011-04-20 at 13:58 -0700, Shirley Ma wrote:
> This flag can be ON when HIGHDMA and scatter/gather support. I will
> modify the patch to make it conditionally.

Double checked, it only needs HIGHDMA condition, not scatter/gather.

thanks
Shirley


^ permalink raw reply

* Re: [PATCHv4] usbnet: Resubmit interrupt URB once if halted
From: Alan Stern @ 2011-04-20 21:08 UTC (permalink / raw)
  To: Paul Stewart; +Cc: netdev, linux-usb, davem, greg
In-Reply-To: <20110420195015.20A96201B8@glenhelen.mtv.corp.google.com>

On Tue, 19 Apr 2011, Paul Stewart wrote:

> Set a flag if the interrupt URB completes with ENOENT as this
> occurs legitimately during system suspend.  When the usbnet_bh
> is called after resume, test this flag and try once to resubmit
> the interrupt URB.

No doubt there's a good reason for doing things this way, but it isn't
clear.  Why wait until usbnet_bh() is called after resume?  Why not
resubmit the interrupt URB _during_ usbnet_resume()?  This would seem
to be the logical approach, seeing as how usbnet_suspend() kills the
interrupt URB.

For that matter, where does the interrupt URB get deallocated?  It 
obviously gets allocated in init_status(), but it doesn't seem to be 
freed anywhere.

Alan Stern


^ permalink raw reply

* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Shirley Ma @ 2011-04-20 20:58 UTC (permalink / raw)
  To: Dimitris Michailidis
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <4DAF479D.7090309@chelsio.com>

On Wed, 2011-04-20 at 13:52 -0700, Dimitris Michailidis wrote:
> The features handling has been reworked in net-next and patches like
> this 
> won't apply as the code you're patching has changed.  Also core code
> now 
> does a lot of the related work and you'll need to tell it what to do
> with 
> any new flags.
Ok, will do.

> What properties does a device or driver need to meet to set this flag?
> It 
> seems to be set a bit too unconditionally.  For example, does it work
> if one 
> disables scatter/gather? 

This flag can be ON when HIGHDMA and scatter/gather support. I will
modify the patch to make it conditionally.

thanks
Shirley


^ permalink raw reply

* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Dimitris Michailidis @ 2011-04-20 20:52 UTC (permalink / raw)
  To: Shirley Ma
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <1303330438.19336.57.camel@localhost.localdomain>

On 04/20/2011 01:13 PM, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
> 
>  drivers/net/cxgb3/cxgb3_main.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
> index 9108931..93a1101 100644
> --- a/drivers/net/cxgb3/cxgb3_main.c
> +++ b/drivers/net/cxgb3/cxgb3_main.c
> @@ -3313,7 +3313,7 @@ static int __devinit init_one(struct pci_dev *pdev,
>  		netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
>  		netdev->features |= NETIF_F_GRO;
>  		if (pci_using_dac)
> -			netdev->features |= NETIF_F_HIGHDMA;
> +			netdev->features |= NETIF_F_HIGHDMA | NETIF_F_ZEROCOPY;
>  
>  		netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
>  		netdev->netdev_ops = &cxgb_netdev_ops;

The features handling has been reworked in net-next and patches like this 
won't apply as the code you're patching has changed.  Also core code now 
does a lot of the related work and you'll need to tell it what to do with 
any new flags.

What properties does a device or driver need to meet to set this flag?  It 
seems to be set a bit too unconditionally.  For example, does it work if one 
disables scatter/gather?

^ permalink raw reply

* [PATCH net-next-2.6 v5 5/5] sctp: Add ADD/DEL ASCONF handling at the receiver
From: Michio Honda @ 2011-04-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: lksctp-developers

This patch fixes the problem that the original code cannot delete the remote address where the corresponding transport is currently directed, even when the ASCONF is sent from the other address (this situation happens when the single-homed sender transmits  ASCONF with ADD and DEL.)  

Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index de98665..a9f25d7 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2990,7 +2990,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
 		 * an Error Cause TLV set to the new error code 'Request to
 		 * Delete Source IP Address'
 		 */
-		if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
+		if (sctp_cmp_addr_exact(&asconf->source, &addr))
 			return SCTP_ERROR_DEL_SRC_IP;
 
 		/* Section 4.2.2


^ permalink raw reply related

* Re: [Bugme-new] [Bug 33502] New: Caught 64-bit read from uninitialized memory in __alloc_skb
From: Eric Dumazet @ 2011-04-20 20:32 UTC (permalink / raw)
  To: Christian Casteyde
  Cc: Christoph Lameter, Pekka Enberg, Andrew Morton, netdev,
	bugzilla-daemon, bugme-daemon, Vegard Nossum, Changli Gao
In-Reply-To: <1303329300.2690.25.camel@edumazet-laptop>

Le mercredi 20 avril 2011 à 21:55 +0200, Eric Dumazet a écrit :
> Le mercredi 20 avril 2011 à 21:36 +0200, Christian Casteyde a écrit :
> 
> > I'm not sure it's the same problem anyway as I've said previously.
> > I append my config file used to build this kernel.
> 
> This is not the same problem, and is a known false positive from skb
> reallocation. (skb_reserve() doesnt mark memory as initialized) 
> 
> 

Please try following patch. It's a bit tricky, because network stack has
different functions to fill bytes in skb and change pointers/offsets in
it.

Alternative would be to change pskb_expand_head() to not copy zone
between skb->head and skb->data (might contain unitialized data)

Thanks


diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 79aafbb..3f2cba4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1252,6 +1252,7 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  */
 static inline void skb_reserve(struct sk_buff *skb, int len)
 {
+	kmemcheck_mark_initialized(skb->data, len);
 	skb->data += len;
 	skb->tail += len;
 }



^ permalink raw reply related

* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:30 UTC (permalink / raw)
  To: Dimitris Michailidis
  Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
	Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <4DAF40EB.20303@chelsio.com>

Resubmit it with 31 bit.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 include/linux/netdevice.h |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0249fe7..0998d3d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1067,6 +1067,9 @@ struct net_device {
 #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
 #define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
 
+/* Bit 31 is for device to map userspace buffers -- zerocopy */
+#define NETIF_F_ZEROCOPY	(1 << 31)
+
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
 #define NETIF_F_GSO_MASK	0x00ff0000

^ permalink raw reply related

* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:28 UTC (permalink / raw)
  To: Dimitris Michailidis
  Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
	Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <4DAF40EB.20303@chelsio.com>

On Wed, 2011-04-20 at 13:24 -0700, Dimitris Michailidis wrote:
> Bit 30 is also taken in net-next.

How about 31?

Thanks
Shirley

^ permalink raw reply

* [PATCH V3 6/8] macvtap/vhost TX zero copy support
From: Shirley Ma @ 2011-04-20 20:27 UTC (permalink / raw)
  To: David Miller, Arnd Bergmann, mst
  Cc: Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

Only when buffer size is greater than GOODCOPY_LEN (128), macvtap
enables zero-copy.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/macvtap.c |  124
++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56..b4e6656 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
  */
 static dev_t macvtap_major;
 #define MACVTAP_NUM_DEVS 65536
+#define GOODCOPY_LEN  (L1_CACHE_BYTES < 128 ? 128 : L1_CACHE_BYTES)
 static struct class *macvtap_class;
 static struct cdev macvtap_cdev;
 
@@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct
file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
+	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct
file *file)
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
+	/*
+	 * so far only VM uses macvtap, enable zero copy between guest
+	 * kernel and host kernel when lower device supports high memory
+	 * DMA
+	 */
+	if (vlan) {
+		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
+			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+	}
+
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
 		sock_put(&q->sk);
@@ -433,6 +445,80 @@ static inline struct sk_buff
*macvtap_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
+/* set skb frags from iovec, this can move to core network code for
reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct
iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+	skb_frag_t *f;
+
+	/* Skip over from offset */
+	while (offset >= from->iov_len) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (copy > 0) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+		}
+		copy -= size;
+		offset1 += size;
+		offset = 0;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+
+		len = from->iov_len - offset1;
+		if (!len) {
+			offset1 = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset1;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if ((num_pages != size) ||
+		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+			/* put_page is in skb free */
+			return -EFAULT;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += len;
+		while (len) {
+			f = &skb_shinfo(skb)->frags[i];
+			f->page = page[i];
+			f->page_offset = base & ~PAGE_MASK;
+			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			atomic_add(f->size, &skb->sk->sk_wmem_alloc);
+			base += f->size;
+			len -= f->size;
+			i++;
+		}
+		offset1 = 0;
+		++from;
+	}
+	return 0;
+}
+
 /*
  * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
  * be shared with the tun/tap driver.
@@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const struct
sk_buff *skb,
 
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
-				const struct iovec *iv, size_t count,
-				int noblock)
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr
*m,
+				const struct iovec *iv, unsigned long total_len,
+				size_t count, int noblock)
 {
 	struct sk_buff *skb;
 	struct macvlan_dev *vlan;
-	size_t len = count;
+	unsigned long len = total_len;
 	int err;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int vnet_hdr_len = 0;
+	int copylen, zerocopy;
 
+	zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
 	if (q->flags & IFF_VNET_HDR) {
 		vnet_hdr_len = q->vnet_hdr_sz;
 
@@ -552,12 +640,24 @@ static ssize_t macvtap_get_user(struct
macvtap_queue *q,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
-				noblock, &err);
+	if (zerocopy)
+		copylen = vnet_hdr.hdr_len;
+	else
+		copylen = len;
+
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
+				vnet_hdr.hdr_len, noblock, &err);
 	if (!skb)
 		goto err;
-
-	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
+
+	if (zerocopy)
+		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
+	else
+		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
+						   len);
+	if (sock_flag(&q->sk, SOCK_ZEROCOPY))
+		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
+			sizeof(struct skb_ubuf_info));
 	if (err)
 		goto err_kfree;
 
@@ -579,7 +679,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue
*q,
 		kfree_skb(skb);
 	rcu_read_unlock_bh();
 
-	return count;
+	return total_len;
 
 err_kfree:
 	kfree_skb(skb);
@@ -601,8 +701,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb,
const struct iovec *iv,
 	ssize_t result = -ENOLINK;
 	struct macvtap_queue *q = file->private_data;
 
-	result = macvtap_get_user(q, iv, iov_length(iv, count),
-			      file->f_flags & O_NONBLOCK);
+	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
+				  file->f_flags & O_NONBLOCK);
 	return result;
 }
 
@@ -815,7 +915,7 @@ static int macvtap_sendmsg(struct kiocb *iocb,
struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
sock);
-	return macvtap_get_user(q, m->msg_iov, total_len,
+	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
 			    m->msg_flags & MSG_DONTWAIT);
 }
 



^ permalink raw reply related

* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:05 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <1303328906.2823.24.camel@bwh-desktop>

Thanks. I need to update it to 30 bit.

Shirley


^ permalink raw reply

* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Dimitris Michailidis @ 2011-04-20 20:24 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
	Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <1303330173.19336.52.camel@localhost.localdomain>

On 04/20/2011 01:09 PM, Shirley Ma wrote:
> Resubmit this patch with the new bit.

Bit 30 is also taken in net-next.

> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
> 
>  include/linux/netdevice.h |    3 +++
>  1 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0249fe7..0998d3d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1067,6 +1067,9 @@ struct net_device {
>  #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
>  #define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
>  
> +/* Bit 30 is for device to map userspace buffers -- zerocopy */
> +#define NETIF_F_ZEROCOPY	(1 << 30)
> +
>  	/* Segmentation offload features */
>  #define NETIF_F_GSO_SHIFT	16
>  #define NETIF_F_GSO_MASK	0x00ff0000
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* [PATCH V3 8/8] Enable benet to support zerocopy
From: Shirley Ma @ 2011-04-20 20:17 UTC (permalink / raw)
  To: David Miller
  Cc: mst, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev, kvm,
	linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/benet/be_main.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index 7cb5a11..d7b7254 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -2982,6 +2982,7 @@ static int __devinit be_probe(struct pci_dev *pdev,
 	status = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
 	if (!status) {
 		netdev->features |= NETIF_F_HIGHDMA;
+		netdev->features |= NETIF_F_ZEROCOPY;
 	} else {
 		status = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
 		if (status) {



^ permalink raw reply related

* [PATCH V3 7/8] Enable ixgbe to support zerocopy
From: Shirley Ma @ 2011-04-20 20:15 UTC (permalink / raw)
  To: David Miller
  Cc: mst, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev, kvm,
	linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/ixgbe/ixgbe_main.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 6f8adc7..68f1e93 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -7395,6 +7395,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 #endif /* IXGBE_FCOE */
 	if (pci_using_dac) {
 		netdev->features |= NETIF_F_HIGHDMA;
+		netdev->features |= NETIF_F_ZEROCOPY;
 		netdev->vlan_features |= NETIF_F_HIGHDMA;
 	}
 

^ permalink raw reply related

* [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Shirley Ma @ 2011-04-20 20:13 UTC (permalink / raw)
  To: David Miller
  Cc: mst, Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev, kvm,
	linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/cxgb3/cxgb3_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 9108931..93a1101 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -3313,7 +3313,7 @@ static int __devinit init_one(struct pci_dev *pdev,
 		netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 		netdev->features |= NETIF_F_GRO;
 		if (pci_using_dac)
-			netdev->features |= NETIF_F_HIGHDMA;
+			netdev->features |= NETIF_F_HIGHDMA | NETIF_F_ZEROCOPY;
 
 		netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
 		netdev->netdev_ops = &cxgb_netdev_ops;

^ permalink raw reply related

* Re: [PATCH V3 0/8] macvtap/vhost TX zero copy support
From: Shirley Ma @ 2011-04-20 20:12 UTC (permalink / raw)
  To: David Miller, Arnd Bergmann
  Cc: mst, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

Only when buffer size is greater than GOODCOPY_LEN (128), macvtap
enables zero-copy.

Signed-off-by: Shirley MA <xma@us.ibm.com>
---

 drivers/net/macvtap.c |  124 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56..b4e6656 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
  */
 static dev_t macvtap_major;
 #define MACVTAP_NUM_DEVS 65536
+#define GOODCOPY_LEN  (L1_CACHE_BYTES < 128 ? 128 : L1_CACHE_BYTES)
 static struct class *macvtap_class;
 static struct cdev macvtap_cdev;
 
@@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
+	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
+	/*
+	 * so far only VM uses macvtap, enable zero copy between guest
+	 * kernel and host kernel when lower device supports high memory
+	 * DMA
+	 */
+	if (vlan) {
+		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
+			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+	}
+
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
 		sock_put(&q->sk);
@@ -433,6 +445,80 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+	skb_frag_t *f;
+
+	/* Skip over from offset */
+	while (offset >= from->iov_len) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (copy > 0) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+		}
+		copy -= size;
+		offset1 += size;
+		offset = 0;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+
+		len = from->iov_len - offset1;
+		if (!len) {
+			offset1 = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset1;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if ((num_pages != size) ||
+		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+			/* put_page is in skb free */
+			return -EFAULT;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += len;
+		while (len) {
+			f = &skb_shinfo(skb)->frags[i];
+			f->page = page[i];
+			f->page_offset = base & ~PAGE_MASK;
+			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			atomic_add(f->size, &skb->sk->sk_wmem_alloc);
+			base += f->size;
+			len -= f->size;
+			i++;
+		}
+		offset1 = 0;
+		++from;
+	}
+	return 0;
+}
+
 /*
  * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
  * be shared with the tun/tap driver.
@@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
-				const struct iovec *iv, size_t count,
-				int noblock)
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
+				const struct iovec *iv, unsigned long total_len,
+				size_t count, int noblock)
 {
 	struct sk_buff *skb;
 	struct macvlan_dev *vlan;
-	size_t len = count;
+	unsigned long len = total_len;
 	int err;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int vnet_hdr_len = 0;
+	int copylen, zerocopy;
 
+	zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
 	if (q->flags & IFF_VNET_HDR) {
 		vnet_hdr_len = q->vnet_hdr_sz;
 
@@ -552,12 +640,24 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
-				noblock, &err);
+	if (zerocopy)
+		copylen = vnet_hdr.hdr_len;
+	else
+		copylen = len;
+
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
+				vnet_hdr.hdr_len, noblock, &err);
 	if (!skb)
 		goto err;
-
-	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
+
+	if (zerocopy)
+		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
+	else
+		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
+						   len);
+	if (sock_flag(&q->sk, SOCK_ZEROCOPY))
+		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
+			sizeof(struct skb_ubuf_info));
 	if (err)
 		goto err_kfree;
 
@@ -579,7 +679,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
 		kfree_skb(skb);
 	rcu_read_unlock_bh();
 
-	return count;
+	return total_len;
 
 err_kfree:
 	kfree_skb(skb);
@@ -601,8 +701,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 	ssize_t result = -ENOLINK;
 	struct macvtap_queue *q = file->private_data;
 
-	result = macvtap_get_user(q, iv, iov_length(iv, count),
-			      file->f_flags & O_NONBLOCK);
+	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
+				  file->f_flags & O_NONBLOCK);
 	return result;
 }
 
@@ -815,7 +915,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m->msg_iov, total_len,
+	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
 			    m->msg_flags & MSG_DONTWAIT);
 }
 

^ permalink raw reply related

* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:09 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
	netdev, kvm, linux-kernel
In-Reply-To: <1303328906.2823.24.camel@bwh-desktop>

Resubmit this patch with the new bit.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 include/linux/netdevice.h |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0249fe7..0998d3d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1067,6 +1067,9 @@ struct net_device {
 #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
 #define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
 
+/* Bit 30 is for device to map userspace buffers -- zerocopy */
+#define NETIF_F_ZEROCOPY	(1 << 30)
+
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
 #define NETIF_F_GSO_MASK	0x00ff0000

^ permalink raw reply related

* [PATCH V3 4/8] vhost TX zero copy support
From: Shirley Ma @ 2011-04-20 20:07 UTC (permalink / raw)
  To: mst, David Miller
  Cc: Eric Dumazet, Avi Kivity, Arnd Bergmann, netdev, kvm,
	linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>

This patch maintains the outstanding userspace buffers in the 
sequence it is delivered to vhost. The outstanding userspace buffers 
will be marked as done once the lower device buffers DMA has finished. 
This is monitored through last reference of kfree_skb callback. Two
buffer index are used for this purpose.

The vhost passes the userspace buffers info to lower device skb 
through message control. Since there will be some done DMAs when
entering vhost handle_tx. The worse case is all buffers in the vq are
in pending/done status, so we need to notify guest to release DMA done 
buffers first before get any new buffers from the vq.

Signed-off-by: Shirley <xma@us.ibm.com>
---

 drivers/vhost/net.c   |   30 +++++++++++++++++++++++++++-
 drivers/vhost/vhost.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/vhost/vhost.h |   10 +++++++++
 3 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2f7c76a..1bc4536 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,8 @@
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+#define MAX_ZEROCOPY_PEND 64
+
 enum {
 	VHOST_NET_VQ_RX = 0,
 	VHOST_NET_VQ_TX = 1,
@@ -129,6 +131,7 @@ static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	struct skb_ubuf_info pend;
 
 	/* TODO: check that we are running from vhost_worker? */
 	sock = rcu_dereference_check(vq->private_data, 1);
@@ -151,6 +154,10 @@ static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
+		/* Release DMAs done buffers first */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_zerocopy_signal_used(vq);
+
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -166,6 +173,12 @@ static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			/* If more outstanding DMAs, queue the work */
+			if (sock_flag(sock->sk, SOCK_ZEROCOPY) &&
+			    (atomic_read(&vq->refcnt) > MAX_ZEROCOPY_PEND)) {
+				vhost_poll_queue(&vq->poll);
+				break;
+			}
 			if (unlikely(vhost_enable_notify(vq))) {
 				vhost_disable_notify(vq);
 				continue;
@@ -188,17 +201,30 @@ static void handle_tx(struct vhost_net *net)
 			       iov_length(vq->hdr, s), hdr_size);
 			break;
 		}
+		/* use msg_control to pass vhost zerocopy ubuf info to skb */
+		if (sock_flag(sock->sk, SOCK_ZEROCOPY)) {
+			pend.callback = vhost_zerocopy_callback;
+			pend.arg = vq;
+			pend.desc = vq->upend_idx;
+			msg.msg_control = &pend;
+			msg.msg_controllen = sizeof(pend);
+			vq->heads[vq->upend_idx].id = head;
+			vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+			atomic_inc(&vq->refcnt);
+		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq, 1);
+			if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+				vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		if (!sock_flag(sock->sk, SOCK_ZEROCOPY))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2ab2912..09bcb1d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -174,6 +174,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	atomic_set(&vq->refcnt, 0);
 }
 
 static int vhost_worker(void *data)
@@ -230,7 +233,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 					       UIO_MAXIOV, GFP_KERNEL);
 		dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
 					  GFP_KERNEL);
-		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+		dev->vqs[i].heads = kzalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
 
 		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
@@ -385,10 +388,41 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+	int i, j = 0;
+
+	i = vq->done_idx;
+	while (i != vq->upend_idx) {
+		/* len = 1 means DMA done */
+		if (vq->heads[i].len == 1) {
+			/* reset len = 0 */
+			vq->heads[i].len = 0;
+			i = (i + 1) % UIO_MAXIOV;
+			++j;
+		} else
+			break;
+	}
+	if (j) {
+		if (i > vq->done_idx)
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx], j);
+		else {
+			vhost_add_used_n(vq, &vq->heads[vq->done_idx],
+					 UIO_MAXIOV - vq->done_idx);
+			vhost_add_used_n(vq, vq->heads, i);
+		}
+		vq->done_idx = i;
+		vhost_signal(vq->dev, vq);
+		atomic_sub(j, &vq->refcnt);
+	}
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
+	unsigned long begin = jiffies;
+
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
@@ -405,6 +439,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			eventfd_ctx_put(dev->vqs[i].call_ctx);
 		if (dev->vqs[i].call)
 			fput(dev->vqs[i].call);
+		/* wait for all lower device DMAs done, then notify guest */
+		while (atomic_read(&dev->vqs[i].refcnt)) {
+			if (time_after(jiffies, begin + 5 * HZ))
+				vhost_zerocopy_signal_used(&dev->vqs[i]);
+		}
 		vhost_vq_reset(dev, dev->vqs + i);
 	}
 	vhost_dev_free_iovecs(dev);
@@ -1416,3 +1455,12 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
+
+void vhost_zerocopy_callback(struct sk_buff *skb)
+{
+	int idx = skb_shinfo(skb)->ubuf.desc;
+	struct vhost_virtqueue *vq = skb_shinfo(skb)->ubuf.arg;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[idx].len = 1;
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b3363ae..cd2febb 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,14 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* vhost zerocopy support */
+	atomic_t refcnt; /* num of outstanding zerocopy DMAs */
+	/* index of zerocopy pending DMA buffers */
+	int upend_idx;
+	/* index of zerocopy done DMA buffers, but not notify guest yet */
+	int done_idx;
+	/* notify vhost zerocopy DMA buffers has done in lower device */
+	void (*callback)(struct sk_buff *);
 };
 
 struct vhost_dev {
@@ -154,6 +162,8 @@ bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(struct sk_buff *skb);
+void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \

^ permalink raw reply related

* Re: [Bugme-new] [Bug 33502] New: Caught 64-bit read from uninitialized memory in __alloc_skb
From: Eric Dumazet @ 2011-04-20 19:55 UTC (permalink / raw)
  To: Christian Casteyde
  Cc: Christoph Lameter, Pekka Enberg, Andrew Morton, netdev,
	bugzilla-daemon, bugme-daemon, Vegard Nossum
In-Reply-To: <201104202136.52568.casteyde.christian@free.fr>

Le mercredi 20 avril 2011 à 21:36 +0200, Christian Casteyde a écrit :

> I'm not sure it's the same problem anyway as I've said previously.
> I append my config file used to build this kernel.

This is not the same problem, and is a known false positive from skb
reallocation. (skb_reserve() doesnt mark memory as initialized) 




^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox